def score_value(self, Y_test, y_pred, model_type, score_type):
    """Compute a single performance score for a set of predictions.

    Parameters
    ----------
    Y_test : pandas object with true labels; converted to a float ndarray so
        it matches the dtype of y_pred before scoring.
    y_pred : predicted labels, or decision thresholds when score_type='auc'
        and model_type='absv' (fed to the AdaBoost-specific ROC builder).
    model_type : str; 'absv' selects du.roc_curve_adaboost for the AUC,
        anything else uses sklearn's roc_auc_score.
    score_type : str; one of 'auc', 'acc', 'prec', 'f1', 'rec', 'gmean'.

    Returns
    -------
    float : the requested score.

    Raises
    ------
    ValueError : if score_type is not one of the supported options.
        (Previously an unknown score_type fell through every branch and
        crashed with an opaque UnboundLocalError on the return statement.)
    """
    Y_test = Y_test.astype(float).values  # make Y_test and y_pred same type
    if score_type == 'auc':
        if model_type == 'absv':
            # AdaBoost decision thresholds need the custom ROC construction
            TPR, FPR = du.roc_curve_adaboost(y_pred, Y_test)
            return auc(FPR, TPR)
        return roc_auc_score(Y_test, y_pred)
    elif score_type == 'acc':
        return accuracy_score(Y_test, y_pred)
    elif score_type == 'prec':
        return precision_score(Y_test, y_pred)
    elif score_type == 'f1':
        return f1_score(Y_test, y_pred)
    elif score_type == 'rec':
        return recall_score(Y_test, y_pred)
    elif score_type == 'gmean':
        # geometric mean of precision and recall
        return np.sqrt(precision_score(Y_test, y_pred) *
                       recall_score(Y_test, y_pred))
    raise ValueError('Unknown score_type: %s' % score_type)
my_gamma_end=100, myKernel='rbf', myDegree=1, myCoef0=+1) # "trad-rbf-NOTdiv" start = datetime.datetime.now() model.fit(X_train, Y_train) end = datetime.datetime.now() elapsed_time = pd.DataFrame({"Elapsed time": [end - start]}) elapsed_time.to_csv("output/" + name + "/" + "AdaBoostSVM_time.csv", index=False) y_preda = model.predict(X_test) print("Final test accuracy: ", accuracy_score(Y_test, y_preda)) y_thresholds = model.decision_thresholds(X_test, glob_dec=True) TPR, FPR = du.roc_curve_adaboost(y_thresholds, Y_test) prec = precision_score(Y_test, y_preda) print("Final test precision: ", prec) area = auc(FPR, TPR) print("Final test AUC: ", area) nWeaks = len(model.alphas) # print on plot no. classifiers dv.plot_roc_curve(TPR, FPR, name, "sorted", glob_local=True, name="nom", kernel=myKernel,
def bootstrap(sample_name, model, roc_area, selection, GA_mut=0.3, GA_score='',
              GA_selec='', GA_coef=0.5, n_cycles=1, split_frac=0.6, path='.'):
    """Run n_cycles bootstrap train/evaluate rounds and collect per-cycle scores.

    Parameters
    ----------
    sample_name : str, sample identifier understood by data_preparation.
    model : classifier exposing fit/predict (and, depending on roc_area,
        decision_thresholds/predict_proba/decision_function and n_classifiers).
    roc_area : str, AUC method: 'absv', 'prob' or 'deci'.
    selection : str, 'trad' (complement test set) or 'gene' (genetic selection).
    GA_mut, GA_score, GA_selec, GA_coef : genetic-selection hyperparameters.
    n_cycles : int, number of bootstrap resamples to evaluate.
    split_frac : float, fraction of the data drawn per bootstrap sample
        (only used when the sample is not pre-split).
    path : str, data location passed to data_preparation.

    Returns
    -------
    9 numpy arrays, one entry per cycle: area, precision, f1, recall,
    accuracy, gmean, wall time, number of base classifiers, training size.
    (BUGFIX: the return previously listed n_class_scores twice, yielding a
    10-tuple inconsistent with the parallel cross_validation function.)
    """
    # fetch data_frame without preparation
    data = data_preparation(path)
    sample_df_temp = data.fetch_data(sample_name)
    # fetch_data returns a tuple when the sample ships as separate train/test sets
    train_test = type(sample_df_temp) is tuple
    if not train_test:
        sample_df = sample_df_temp
    else:
        sample_train_df, sample_test_df = sample_df_temp
    # per-cycle score accumulators
    area_scores, prec_scores, f1_scores, recall_scores, acc_scores, gmean_scores, time_scores = (
        []), ([]), ([]), ([]), ([]), ([]), ([])
    n_class_scores, n_train_scores = ([]), ([])
    # BUGFIX: sample_df is only bound when the data are NOT pre-split; computing
    # data_size unconditionally raised NameError for pre-split samples. The
    # pre-split branch below uses fixed resample sizes and never needs n_samples.
    if not train_test:
        data_size = sample_df.shape[0]
        n_samples = int(split_frac * data_size)
    # bootstrap score calculations
    i_sample = 0
    for _ in range(n_cycles):  # arbitrary number of bootstrap samples to produce
        i_sample += 1
        start = time.time()
        if not train_test:  # train_test == True means we're given separate files for train and test
            sampled_data_train = resample(sample_df, replace=True,
                                          n_samples=n_samples,
                                          random_state=i_sample)
            if selection == 'trad':
                # test data are the complement of full input data that is not considered for training
                sampled_train_no_dup = sampled_data_train.drop_duplicates(keep=False)
                sampled_data_test = pd.concat(
                    [sample_df, sampled_train_no_dup]).drop_duplicates(keep=False)
                X_train, Y_train = data.dataset(sample_name=sample_name,
                                                data_set=sampled_data_train,
                                                sampling=True, split_sample=0.0)
                X_test, Y_test = data.dataset(sample_name=sample_name,
                                              data_set=sampled_data_test,
                                              sampling=True, split_sample=0.0)
            elif selection == 'gene':  # genetic selection
                train_indexes = sampled_data_train.index
                X, Y = data.dataset(sample_name=sample_name,
                                    data_set=sample_df, sampling=True)
                X_train, X_test, Y_train, Y_test = data.indexes_split(
                    X, Y, split_indexes=train_indexes, train_test=train_test)
                print(len(X_train.index), len(Y_train.index), 'X_train, Y_train sizes')
                GA_selection = genetic_selection(
                    model, roc_area, X_train, Y_train, X_test, Y_test,
                    pop_size=10, chrom_len=int(len(Y_train.index) * 0.20),
                    n_gen=50, coef=GA_coef, mut_rate=GA_mut,
                    score_type=GA_score, selec_type=GA_selec)
                GA_selection.execute()
                GA_train_indexes = GA_selection.best_population()
                X_train, Y_train, X_test, Y_test = data.dataset(
                    sample_name=sample_name, indexes=GA_train_indexes)
        else:
            # pre-split sample: bootstrap train and test sets independently
            sampled_data_train = resample(sample_train_df, replace=True,
                                          n_samples=5000, random_state=None)
            sampled_data_test = resample(sample_test_df, replace=True,
                                         n_samples=10000, random_state=None)
            X_train, Y_train, X_test, Y_test = data.dataset(
                sample_name=sample_name, data_set='',
                data_train=sampled_data_train, data_test=sampled_data_test,
                sampling=True, split_sample=0.4)
        model.fit(X_train, Y_train)
        n_base_class = 0
        # n_classifiers == 0 means the boosted fit produced no usable learners
        if model.n_classifiers != 0:
            y_pred = model.predict(X_test)
            prec = precision_score(Y_test, y_pred)
            f1 = f1_score(Y_test, y_pred)
            recall = recall_score(Y_test, y_pred)
            acc = accuracy_score(Y_test, y_pred)
            gmean = np.sqrt(prec * recall)
            n_base_class = model.n_classifiers
            # calculate roc-auc depending on the classifier
            if roc_area == "absv":
                y_thresholds = model.decision_thresholds(X_test, glob_dec=True)
                TPR, FPR = du.roc_curve_adaboost(y_thresholds, Y_test)
                area = auc(FPR, TPR)
                model.clean()
            elif roc_area == "prob":
                Y_pred_prob = model.predict_proba(X_test)[:, 1]
                area = roc_auc_score(Y_test, Y_pred_prob)
            elif roc_area == "deci":
                Y_pred_dec = model.decision_function(X_test)
                area = roc_auc_score(Y_test, Y_pred_dec)
            end = time.time()
            time_scores = np.append(time_scores, end - start)
            area_scores = np.append(area_scores, area)
            prec_scores = np.append(prec_scores, prec)
            f1_scores = np.append(f1_scores, f1)
            recall_scores = np.append(recall_scores, recall)
            acc_scores = np.append(acc_scores, acc)
            gmean_scores = np.append(gmean_scores, gmean)
            n_class_scores = np.append(n_class_scores, n_base_class)
            n_train_scores = np.append(n_train_scores, len(X_train))
        else:
            # degenerate fit: record zeros so array lengths stay aligned
            # (this needs to be re-checked carefully)
            end = time.time()
            time_scores = np.append(time_scores, end - start)
            area_scores = np.append(area_scores, 0)
            prec_scores = np.append(prec_scores, 0)
            f1_scores = np.append(f1_scores, 0)
            recall_scores = np.append(recall_scores, 0)
            acc_scores = np.append(acc_scores, 0)
            gmean_scores = np.append(gmean_scores, 0)
            n_class_scores = np.append(n_class_scores, 0)
            n_train_scores = np.append(n_train_scores, len(X_train))
    # BUGFIX: n_class_scores was returned twice here; now a 9-tuple matching
    # cross_validation's return shape.
    return area_scores, prec_scores, f1_scores, recall_scores, acc_scores, \
        gmean_scores, time_scores, n_class_scores, n_train_scores
def mcnemar_test(sample_name, selection='gene', model='no_div',
                 train_test=False, GA_score='', GA_selec=''):
    """Compare a reference AdaBoost-SVM against a battery of models via McNemar's test.

    Trains the reference model (diverse or non-diverse AdaBoost-SVM), then each
    competitor from mm.model_loader, computes their scores and the McNemar
    statistic on their test-set predictions, and writes a LaTeX summary table
    to ./tables/mcnemar_<sample_name>_<model>.tex.

    Parameters
    ----------
    sample_name : str, sample identifier understood by data_preparation.
    selection : str, 'gene' enables genetic training-set selection.
    model : str, 'diverse' or 'no_div' — which reference AdaBoost-SVM to build.
    train_test : bool, currently unused here (kept for interface compatibility).
    GA_score, GA_selec : genetic-selection score/selection options.
    """
    if model == 'diverse':
        model1 = mm.adaboost_svm(True)
    elif model == 'no_div':
        model1 = mm.adaboost_svm(False)
    # fetch data
    data = data_preparation()
    X_train, Y_train, X_test, Y_test = \
        data.dataset(sample_name=sample_name, sampling=False, split_sample=0.4)
    if selection == 'gene':
        GA_selection = genetic_selection(model1, "absv", X_train, Y_train,
                                         X_test, Y_test, pop_size=10,
                                         chrom_len=50, n_gen=50, coef=0.5,
                                         mut_rate=0.3, score_type=GA_score,
                                         selec_type=GA_selec)
        GA_selection.execute()
        GA_train_indexes = GA_selection.best_population()
        X_train, Y_train, X_test, Y_test = data.dataset(
            sample_name=sample_name, indexes=GA_train_indexes)
    # train the model we are analyzing
    model1.fit(X_train, Y_train)
    y_pred1 = model1.predict(X_test)
    prec1 = precision_score(Y_test, y_pred1)
    f1_1 = f1_score(Y_test, y_pred1)
    recall1 = recall_score(Y_test, y_pred1)
    acc1 = accuracy_score(Y_test, y_pred1)
    gmean1 = np.sqrt(prec1 * recall1)
    y_thresholds = model1.decision_thresholds(X_test, glob_dec=True)
    TPR, FPR = du.roc_curve_adaboost(y_thresholds, Y_test)
    area1 = auc(FPR, TPR)
    model1.clean()
    # per-competitor accumulators
    p_values, stats, rejects, areas2, precs2, f1_s2, recalls2, accs2, gmeans2 = (
        []), ([]), ([]), ([]), ([]), ([]), ([]), ([]), ([])
    names = []
    # call and train the models to compare, including the AUC calculation method
    model_auc2 = mm.model_loader(model, sample_name)
    for i in range(len(model_auc2)):
        model_auc2[i][0].fit(X_train, Y_train)
        y_pred2 = model_auc2[i][0].predict(X_test)
        prec2 = precision_score(Y_test, y_pred2)
        f1_2 = f1_score(Y_test, y_pred2)
        recall2 = recall_score(Y_test, y_pred2)
        acc2 = accuracy_score(Y_test, y_pred2)
        gmean2 = np.sqrt(prec2 * recall2)
        if model_auc2[i][1] == "absv":
            y_thresholds_2 = model_auc2[i][0].decision_thresholds(
                X_test, glob_dec=True)
            # BUGFIX: previously passed model1's y_thresholds here, so every
            # "absv" competitor was scored with the reference model's
            # thresholds instead of its own.
            TPR_2, FPR_2 = du.roc_curve_adaboost(y_thresholds_2, Y_test)
            area2 = auc(FPR_2, TPR_2)
        elif model_auc2[i][1] == "prob":
            Y_pred_prob = model_auc2[i][0].predict_proba(X_test)[:, 1]
            area2 = roc_auc_score(Y_test, Y_pred_prob)
        elif model_auc2[i][1] == "deci":
            Y_pred_dec = model_auc2[i][0].decision_function(X_test)
            area2 = roc_auc_score(Y_test, Y_pred_dec)
        # McNemar contingency table; `corrected` selects the chi-square
        # approximation with continuity correction vs. the exact binomial test
        contingency, corrected = mcnemar_table(y_pred1, y_pred2, Y_test)
        if corrected:
            result = mcnemar(contingency, exact=False, correction=True)
        else:
            result = mcnemar(contingency, exact=True)
        print('statistic=%.3f, p-value=%.3f' % (result.statistic, result.pvalue))
        alpha = 0.05
        if result.pvalue > alpha:
            reject_null = False
            print('Same proportions of errors, fail to reject H0')
        else:
            reject_null = True
            print('Different proportions of errors, reject H0')
        p_values = np.append(p_values, result.pvalue)
        stats = np.append(stats, result.statistic)
        rejects = np.append(rejects, reject_null)
        areas2 = np.append(areas2, area2)
        precs2 = np.append(precs2, prec2)
        f1_s2 = np.append(f1_s2, f1_2)
        recalls2 = np.append(recalls2, recall2)
        accs2 = np.append(accs2, acc2)
        gmeans2 = np.append(gmeans2, gmean2)
        names.append(model_auc2[i][2])
    # write the LaTeX table; `with` guarantees the handle is closed
    # (previously the file was opened and never closed)
    with open('./tables/mcnemar_' + sample_name + '_' + model + '.tex', "w") as f_mcnemar:
        dv.latex_table_mcnemar(names, p_values, stats, rejects, areas2, precs2,
                               f1_s2, recalls2, accs2, gmeans2, area1, prec1,
                               f1_1, recall1, acc1, gmean1, f_mcnemar)
def cross_validation(sample_name, model, roc_area, selection, GA_mut=0.25,
                     GA_score='', GA_selec='', GA_coef=0.5, kfolds=1, n_reps=1,
                     path='.'):
    """Train and evaluate *model* on *sample_name*, collecting per-cycle scores.

    Despite the name, the repeated k-fold loop is currently disabled (see the
    NOTE below): a single 90/10 train_test_split is evaluated instead.
    Returns 9 numpy arrays (one entry per cycle): area, precision, f1, recall,
    accuracy, gmean, wall time, number of base classifiers, training size.
    When the fit yields zero base classifiers (roc_area == 'absv' only),
    zeros are appended so all arrays stay aligned.
    """
    # fetch data_frame without preparation
    data = data_preparation(path)
    sample_df_temp = data.fetch_data(sample_name)
    # fetch_data returns a tuple when the sample ships as separate train/test sets
    train_test = type(sample_df_temp) is tuple  # are the data already splitted?
    if not train_test:
        sample_df = sample_df_temp
    else:
        sample_train_df, sample_test_df = sample_df_temp
    # per-cycle score accumulators
    area_scores, prec_scores, f1_scores, recall_scores, acc_scores, gmean_scores, time_scores = (
        []), ([]), ([]), ([]), ([]), ([]), ([])
    n_class_scores, n_train_scores = ([]), ([])
    # NOTE(review): sample_df is only bound when train_test is False; this call
    # raises NameError for pre-split samples — confirm callers never pass one.
    X, Y = data.dataset(sample_name=sample_name, data_set=sample_df,
                        sampling=True, split_sample=0.0)
    from sklearn.model_selection import train_test_split
    # n-k fold cross validation, n_cycles = n_splits * n_repeats
    # NOTE(review): rkf is currently unused (the loop below is disabled) and
    # the default kfolds=1 would be rejected by RepeatedKFold if re-enabled.
    rkf = RepeatedKFold(n_splits=kfolds, n_repeats=n_reps,
                        random_state=1)  # set random state=1 for reproducibility
    for i in range(1):  # train_index, test_index in rkf.split(X):
        # X_train, X_test = X.loc[train_index], X.loc[test_index]
        # Y_train, Y_test = Y.loc[train_index], Y.loc[test_index]
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                            test_size=0.1,
                                                            random_state=1)
        start = time.time()
        # keep the chromosome size under the limit [100,1000]
        sample_chromn_len = int(len(Y_train) * 0.25)
        if sample_chromn_len > 1000:
            sample_chromn_len = 1000
        elif sample_chromn_len < 100:
            sample_chromn_len = 100
        if selection == 'gene':  # genetic selection
            GA_selection = genetic_selection(model, roc_area, X_train, Y_train,
                                             X_test, Y_test, pop_size=10,
                                             chrom_len=sample_chromn_len,
                                             n_gen=50, coef=GA_coef,
                                             mut_rate=GA_mut,
                                             score_type=GA_score,
                                             selec_type=GA_selec)
            GA_selection.execute()
            GA_train_indexes = GA_selection.best_population()
            # replace the split with the GA-selected training indexes
            X_train, Y_train, X_test, Y_test = data.dataset(
                sample_name=sample_name, indexes=GA_train_indexes)
            print(len(X_train), len(Y_test), len(GA_train_indexes),
                  'important check for GA outcome')
            print(len(Y_train[Y_train == 1]), 'important check for GA outcome')
        model.fit(X_train, Y_train)
        n_base_class = 0
        no_zero_classifiers = True
        # the AdaBoost variant can finish fitting with zero weak learners
        if roc_area == "absv":
            n_base_class = model.n_classifiers
            if n_base_class == 0:
                no_zero_classifiers = False
        if no_zero_classifiers:
            y_pred = model.predict(X_test)
            prec = precision_score(Y_test, y_pred)
            f1 = f1_score(Y_test, y_pred)
            recall = recall_score(Y_test, y_pred)
            acc = accuracy_score(Y_test, y_pred)
            gmean = np.sqrt(prec * recall)
            # calculate roc-auc depending on the classifier
            if roc_area == "absv":
                y_thresholds = model.decision_thresholds(X_test, glob_dec=True)
                TPR, FPR = du.roc_curve_adaboost(y_thresholds, Y_test)
                area = auc(FPR, TPR)
                model.clean()
            elif roc_area == "prob":
                Y_pred_prob = model.predict_proba(X_test)[:, 1]
                area = roc_auc_score(Y_test, Y_pred_prob)
            elif roc_area == "deci":
                Y_pred_dec = model.decision_function(X_test)
                area = roc_auc_score(Y_test, Y_pred_dec)
            end = time.time()
            time_scores = np.append(time_scores, end - start)
            area_scores = np.append(area_scores, area)
            prec_scores = np.append(prec_scores, prec)
            f1_scores = np.append(f1_scores, f1)
            recall_scores = np.append(recall_scores, recall)
            acc_scores = np.append(acc_scores, acc)
            gmean_scores = np.append(gmean_scores, gmean)
            n_class_scores = np.append(n_class_scores, n_base_class)
            n_train_scores = np.append(n_train_scores, len(X_train))
        else:
            # degenerate fit: append zeros to keep arrays aligned
            # this needs to be re-checked carefully
            end = time.time()
            time_scores = np.append(time_scores, end - start)
            area_scores = np.append(area_scores, 0)
            prec_scores = np.append(prec_scores, 0)
            f1_scores = np.append(f1_scores, 0)
            recall_scores = np.append(recall_scores, 0)
            acc_scores = np.append(acc_scores, 0)
            gmean_scores = np.append(gmean_scores, 0)
            n_class_scores = np.append(n_class_scores, 0)
            n_train_scores = np.append(n_train_scores, len(X_train))
    return area_scores, prec_scores, f1_scores, recall_scores, acc_scores, \
        gmean_scores, time_scores, n_class_scores, n_train_scores