def print_results_for_field(dataset, field, prefix):
    """Evaluate XGBoost and logistic-regression classifiers on `field`.

    Over 100 random balanced splits of `dataset` (rows with missing `field`
    dropped first), fits each model twice -- once on the full feature set and
    once with treatment columns removed -- then prints one mean+/-std summary
    line per model, tagged with `field` and `prefix`.

    Parameters
    ----------
    dataset : pandas.DataFrame -- input samples; must contain column `field`.
    field : str -- target column to predict.
    prefix : str -- label included in every printed summary line.
    """
    dataset = drop_na(dataset, field)
    # Same rows with treatment features removed (local rename for clarity).
    notrea_dataset = drop_trea(dataset)

    print_order = ["full_xgboost", "full_logi", "notrea_xgboost", "notrea_logi"]
    max_len_order = max(map(len, print_order))

    rez = defaultdict(list)
    # Unused loop index replaced with `_`; dead commented-out per-round
    # printing removed.
    for _ in range(100):
        ds = get_balanced_split(dataset, field)
        rez["full_xgboost"].append(calc_results_simple(*ds, XGBClassifier()))
        rez["full_logi"].append(
            calc_results_simple(*ds, LogisticRegression(max_iter=1000)))

        ds = get_balanced_split(notrea_dataset, field)
        rez["notrea_xgboost"].append(calc_results_simple(*ds, XGBClassifier()))
        rez["notrea_logi"].append(
            calc_results_simple(*ds, LogisticRegression(max_iter=1000)))

    for order in print_order:
        print("==> ", field, prefix, order,
              " " * (max_len_order - len(order)), ": ",
              list2d_to_4g_str_pm(rez[order]))
# NOTE(review): this chunk begins mid-function. "XGBClassifier()))" below is
# the tail of a truncated `rez[...].append(calc_results_simple(*ds,
# XGBClassifier()))`-style call whose start precedes the visible text; the
# fragment is kept with reconstructed indentation and is not runnable alone.
                XGBClassifier()))
    # Per-round result row for each model, then a blank line, then the
    # "==> " mean +/- std summary per model.
    for order in print_order:
        print(order, " " * (max_len_order - len(order)), ": ",
              list_to_4g_str(rez[order][-1]))
    print("")
    for order in print_order:
        print("==> ", order, " " * (max_len_order - len(order)), ": ",
              list2d_to_4g_str_pm(rez[order]))

# Top-level script: compare treatment protocols within a single study on the
# PAM-type categorical dataset (treatment columns dropped).
atreat_dataset = read_alltreat_dataset()
pam_types_cat_dataset = read_pam_types_cat_dataset()
notrea_dataset = drop_trea(pam_types_cat_dataset)

#Variant 1 study_20194_GPL96_all-bmc15 protocol 1 vs protocol 5
# Patient IDs treated under protocol '1' vs protocol '5' of this study.
set1 = atreat_dataset.loc[
    (atreat_dataset['study'] == "study_20194_GPL96_all-bmc15") &
    (atreat_dataset['treatment_protocol_number'] == '1')]['patient_ID']
set2 = atreat_dataset.loc[
    (atreat_dataset['study'] == "study_20194_GPL96_all-bmc15") &
    (atreat_dataset['treatment_protocol_number'] == '5')]['patient_ID']

print("==> study_20194_GPL96_all-bmc15 protocol 1 vs protocol 5")
print_results(notrea_dataset, set1, set2)
print("==>")

#Variant 2 study_9893_GPL5049_all-bmc15 protocol 1 vs protocol 2
print("stack") print_mean_fold_importance(np.concatenate([X_set1, X_set2]), np.concatenate([y_set1, y_set2]), genes_list) X_set1_wf = add_one_features_tail(X_set1, 0) X_set2_wf = add_one_features_tail(X_set2, 1) print("stack_with_set") print_mean_fold_importance(np.concatenate([X_set1_wf, X_set2_wf]), np.concatenate([y_set1, y_set2]), genes_list) atreat_dataset = read_alltreat_dataset() combat_dataset = read_combat_dataset() notrea_dataset = drop_trea(combat_dataset) #dataset = read_pam_types_num_dataset() #notrea_dataset = drop_trea(dataset) #Variant 1 study_20194_GPL96_all-bmc15 protocol 1 vs protocol 5 set1 = atreat_dataset.loc[ (atreat_dataset['study'] == "study_20194_GPL96_all-bmc15") & (atreat_dataset['treatment_protocol_number'] == '1')]['patient_ID'] set2 = atreat_dataset.loc[ (atreat_dataset['study'] == "study_20194_GPL96_all-bmc15") & (atreat_dataset['treatment_protocol_number'] == '5')]['patient_ID'] print("==> study_20194_GPL96_all-bmc15 protocol 1 vs protocol 5") print_results(notrea_dataset, set1, set2) print("==>")
# NOTE(review): chunk begins mid-function -- the assignments and `return`
# below are the tail of a metrics helper whose `def` line precedes the
# visible text; indentation is reconstructed and the fragment is not
# runnable on its own.
    acc = np.mean(y_test == y_pred)                       # overall accuracy
    recall_0 = recall_score(y_test, y_pred, pos_label=0)  # recall, class 0
    recall_1 = recall_score(y_test, y_pred, pos_label=1)  # recall, class 1
    return acc, recall_0, recall_1


def count_to_str(y):
    """Render the class balance of label vector `y` as "count_01=<n0>/<n1>"."""
    c = Counter(y)
    return "count_01=%i/%i" % (c[0], c[1])


# Top-level script: load the dataset variants and evaluate classifier
# configurations per study.
full_dataset = read_full_dataset()
mike_dataset = read_mike_dataset()
treat_dataset = read_treat_dataset()

full_notrea_dataset = drop_trea(full_dataset)
mike_notrea_dataset = drop_trea(mike_dataset)

all_studies = list(set(full_dataset['study']))

# Display order for the result rows printed further down (past this chunk).
print_order = ["full", "full_notrea", "full_pam50",
               "mike", "mike_svm", "mike_logi",
               "mike_notrea", "mike_notrea_svm", "mike_notrea_logi",
               "mike_pam50", "mike_pam50_svm", "mike_pam50_logi",
               "trea", "trea_svm", "trea_logi"]
max_len_order = max(map(len, print_order))

for study in ['study_20194_GPL96_all-bmc15']:
#for study in ['study_17705_GPL96_MDACC_Tissue_BC_Tamoxifen-bmc15']:
    X_full, y_full = prepare_dataset(full_dataset, study)
    # NOTE(review): chunk ends mid-loop -- the body of this `for` continues
    # past the visible text.
import pandas as pd
import os
import numpy as np
import random
import math
from funs_common import read_mike_dataset, drop_trea, prepare_full_dataset
from sklearn.manifold import TSNE
import pickle

# Experiment: 2-D t-SNE embedding of the Mike dataset with treatment
# columns dropped, labelled by 'study'; the embedding is pickled for
# later plotting.
dataset = read_mike_dataset()
notrea_dataset = drop_trea(dataset)

X, y = prepare_full_dataset(notrea_dataset, y_field='study')

X_embedded = TSNE(n_components=2).fit_transform(X)

# Fix: the original passed an anonymous `open(...)` handle to pickle.dump
# and never closed it; a context manager guarantees flush + close even if
# dump raises.  (Filename spelling kept -- downstream readers expect it.)
with open("experement24_studytest_tsne.p", "wb") as f:
    pickle.dump(X_embedded, f)
# NOTE(review): chunk begins mid-function -- the cross-validation below and
# its `return` are the tail of a routine (presumably
# print_results_get_mean_acc, called further down) whose `def` line precedes
# the visible text; indentation is reconstructed.
    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=1)
    rez = []
    for i, (train_index, test_index) in enumerate(kf.split(X_full, y_full)):
        X_train, X_test = X_full[train_index], X_full[test_index]
        y_train, y_test = y_full[train_index], y_full[test_index]
        rez.append(calc_results_simple(X_train, X_test, y_train, y_test,
                                       XGBClassifier()))
    print(list2d_to_4g_str_pm(rez))
    # Mean of the first metric column across folds (accuracy, judging by the
    # rez < 0.99 accuracy-style threshold below -- confirm against
    # calc_results_simple).
    return np.mean(np.array(rez)[:,0])


# Top-level script: pairwise study-vs-study separability on the ComBat
# dataset with treatment columns dropped.
dataset = read_combat_dataset()
dataset = drop_trea(dataset)

all_studies = list(set(dataset['study']))

all_rez = []
for s1, s2 in itertools.combinations(sorted(all_studies), 2):
    ds1 = dataset.loc[dataset['study'] == s1]
    ds2 = dataset.loc[dataset['study'] == s2]
    print("studies: ", s1, s2, len(ds1), len(ds2))
    rez = print_results_get_mean_acc(ds1, ds2)
    all_rez.append((rez, s1, s2))

print("\n--------------------------------------\n")
# Report pairs in ascending score order.
for rez, s1, s2 in sorted(all_rez):
    if (rez < 0.99):
        # NOTE(review): chunk ends here -- the body of this `if` continues
        # past the visible text.
print("==> DFS", prefix) print_results_for_field(dataset, Xt_full_dict, "DFS", prefix) print("") print("") print("==> posOutcome", prefix) print_results_for_field(dataset, Xt_full_dict, "posOutcome", prefix) print("") print("") treat_dataset = read_treat_dataset() combat_dataset = read_combat_dataset() pam_types_cat_dataset = read_pam_types_cat_dataset() X_full, _ = prepare_full_dataset(drop_trea(combat_dataset)) assert all(pam_types_cat_dataset['patient_ID'] == combat_dataset['patient_ID']) #print(list(pam_types_cat_dataset)) #print_results(pam_types_cat_dataset, "old") for n_cluster in [1, 5, 10, 20, 100, 200]: lda = LatentDirichletAllocation(n_components=n_cluster) Xt_full = lda.fit_transform(X_full - np.min(X_full)) Xt_full_dict = { i: x for i, x in zip(combat_dataset['patient_ID'], Xt_full) } print_results(pam_types_cat_dataset, Xt_full_dict, "nc" + str(n_cluster))