# Shared imports for the snippets below. Each function originally lives in its own module;
# project-internal helpers such as calc_auc_on_flat_results, print_auc_for_iter,
# roc_auc, multi_class_roc_auc, edit_confusion_matrix and print_confusion_matrix are
# imported from elsewhere in the repository and are not redefined here.
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import (accuracy_score, auc, classification_report, confusion_matrix,
                             precision_score, recall_score, roc_curve)
from xgboost import XGBClassifier


def learn(X_trains, X_tests, y_trains, y_tests, k_fold, clf_params):
    # create classifier
    clf = XGBClassifier(max_depth=int(clf_params['max_depth']),
                        learning_rate=clf_params['lr'],
                        n_estimators=int(clf_params['estimators']),
                        objective='binary:logistic',
                        gamma=clf_params['gamma'],
                        min_child_weight=int(clf_params['min_child_weight']),
                        reg_lambda=clf_params['lambda'],
                        booster='gbtree',
                        alpha=clf_params['alpha'])

    y_train_scores, y_test_scores, y_train_preds, y_test_preds = [], [], [], []
    for i in range(k_fold):
        print('------------------------------\niteration number ' + str(i))
        X_train, X_test, y_train, y_test = X_trains[i], X_tests[i], y_trains[i], y_tests[i]
        clf.fit(X_train, y_train)
        y_score = clf.predict_proba(X_test)
        y_pred = clf.predict(X_test)
        y_test_preds.append(y_pred)
        y_test_scores.append(y_score[:, 1])      # positive-class probability
        train_pred = clf.predict(X_train)
        train_score = clf.predict_proba(X_train)
        y_train_preds.append(train_pred)
        y_train_scores.append(train_score[:, 1])
        print_auc_for_iter(np.array(y_tests[i]['Tag'].values), np.array(y_score).T[1])

    # flatten the true tags of all folds
    all_y_train = []
    for i in range(k_fold):
        all_y_train.append(y_trains[i]['Tag'].values)
    all_y_train = np.array(all_y_train).flatten()

    all_y_test = []
    for i in range(k_fold):
        all_y_test.append(y_tests[i]['Tag'].values)
    all_y_test = np.array(all_y_test).flatten()

    y_train_scores = np.array(y_train_scores).flatten()
    y_test_scores = np.array(y_test_scores).flatten()

    # calc AUC on validation set
    _, test_auc, _, _ = calc_auc_on_flat_results(all_y_train, y_train_scores,
                                                 all_y_test, y_test_scores)
    return test_auc
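# Hypothetical usage sketch (not part of the original module): how learn() might be called.
# The keys of clf_params mirror the ones read inside learn(); the per-fold splits are assumed
# to be produced elsewhere, shown here with sklearn's StratifiedKFold purely for illustration.
# y is assumed to be a one-column DataFrame with a 'Tag' label column, matching how learn()
# indexes it.
from sklearn.model_selection import StratifiedKFold

def example_run(X, y, k_fold=5):
    clf_params = {'max_depth': 4, 'lr': 0.05, 'estimators': 150, 'gamma': 0.5,
                  'min_child_weight': 3, 'lambda': 1.0, 'alpha': 0.0}
    X_trains, X_tests, y_trains, y_tests = [], [], [], []
    skf = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=0)
    for train_idx, test_idx in skf.split(X, y['Tag']):
        X_trains.append(X.iloc[train_idx])
        X_tests.append(X.iloc[test_idx])
        y_trains.append(y.iloc[train_idx])
        y_tests.append(y.iloc[test_idx])
    return learn(X_trains, X_tests, y_trains, y_tests, k_fold, clf_params)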
def fit(self, X, y, X_train_ids, X_test_ids, y_train_ids, y_test_ids, params,
        bacteria, task_name_title, relative_path_to_save_results, pca_obj=None):
    if not os.path.exists(os.path.join(relative_path_to_save_results, "XGBOOST")):
        os.makedirs(os.path.join(relative_path_to_save_results, "XGBOOST"))
    os.chdir(os.path.join(os.path.abspath(os.path.curdir),
                          relative_path_to_save_results, "XGBOOST"))
    print("XGBOOST...")

    # update each classifier's results in a mutual file
    xgb_results_file = Path("all_xgb_results.csv")
    if not xgb_results_file.exists():
        all_xgb_results = pd.DataFrame(columns=[
            'LR', 'MAX-DEPTH', 'N-ESTIMATORS', 'OBJECTIVE', 'GAMMA',
            'MIN-CHILD-WEIGHT', 'BOOSTER', 'TRAIN-AUC', 'TRAIN-ACC',
            'TEST-AUC', 'TEST-ACC', 'PRECISION', 'RECALL'
        ])
        all_xgb_results.to_csv(xgb_results_file, index=False)

    num_of_classes = len(set(y))
    BINARY = num_of_classes == 2
    optional_classifiers = self.create_classifiers(params)

    for clf in optional_classifiers:
        all_xgb_results = pd.read_csv(xgb_results_file)
        clf_folder_name = ("d=" + str(clf.max_depth) + "_lr=" + str(clf.learning_rate) +
                           "_e=" + str(clf.n_estimators) + "_o=" + clf.objective +
                           "_g=" + str(clf.gamma) + "_m=" + str(clf.min_child_weight) +
                           "_b=" + clf.booster)
        if not os.path.exists(clf_folder_name):
            os.makedirs(clf_folder_name)

        # Split the data set
        X_trains, X_tests, y_trains, y_tests, xgb_coefs = [], [], [], [], []
        xgb_y_test_from_all_iter, xgb_y_score_from_all_iter = np.array([]), np.array([])
        xgb_y_pred_from_all_iter, xgb_class_report_from_all_iter = np.array([]), np.array([])
        xgb_coefs, bacteria_coeff_average, y_train_scores, y_test_scores = [], [], [], []
        train_accuracies, test_accuracies, confusion_matrixes, y_train_preds, y_test_preds = [], [], [], [], []

        for i in range(params["K_FOLD"]):
            print('------------------------------\niteration number ' + str(i))
            X_train, X_test, y_train, y_test = (np.array(X.loc[X_train_ids[i]]),
                                                np.array(X.loc[X_test_ids[i]]),
                                                np.array(y[y_train_ids[i]]),
                                                np.array(y[y_test_ids[i]]))
            X_trains.append(X_train)
            X_tests.append(X_test)
            y_trains.append(y_train)
            y_tests.append(y_test)

            clf.fit(X_train, y_train)
            y_score = clf.predict_proba(X_test)
            y_pred = clf.predict(X_test)
            y_test_preds.append(y_pred)
            y_test_scores.append(y_score[:, 0])      # probability of clf.classes_[0]
            xgb_class_report = classification_report(y_test, y_pred)
            train_pred = clf.predict(X_train)
            train_score = clf.predict_proba(X_train)
            y_train_preds.append(train_pred)
            y_train_scores.append(train_score[:, 0])

            train_accuracies.append(accuracy_score(y_train, train_pred))
            test_accuracies.append(accuracy_score(y_test, y_pred))  # same as clf.score(X_test, y_test)
            confusion_matrixes.append(confusion_matrix(y_test, y_pred))

            if BINARY:
                self.print_auc_for_iter(np.array(y_test), np.array(y_score).T[0])

            self.save_y_test_and_score(y_test, y_pred, y_score, xgb_class_report)

            # --------------------------------------------! COEFF PLOTS -----------------------------------------
            if params["create_coeff_plots"]:
                xgb_coefs, coefficients, bacteria_coeff_average = \
                    self.calc_bacteria_coeff_average(num_of_classes, pca_obj, bacteria, clf,
                                                     xgb_coefs, bacteria_coeff_average)
        # --------------------------------------------! AUC -----------------------------------------
        all_y_train = np.array(y_trains).flatten()
        all_predictions_train = np.array(y_train_preds).flatten()
        y_train_scores = np.array(y_train_scores).flatten()
        all_test_real_tags = np.array(y_tests).flatten()
        all_test_pred_tags = np.array(y_test_preds).flatten()
        y_test_scores = np.array(y_test_scores).flatten()

        train_auc, test_auc, train_rho, test_rho = \
            calc_auc_on_flat_results(all_y_train, y_train_scores,
                                     all_test_real_tags, y_test_scores)

        # ----------------------------------------! CONFUSION MATRIX -------------------------------------
        print("------------------------------")
        names = params["CLASSES_NAMES"]
        confusion_matrix_average, confusion_matrix_acc = edit_confusion_matrix(
            confusion_matrixes, "XGB", names, BINARY=BINARY)

        if BINARY:
            _, _, _, xgb_roc_auc = roc_auc(all_test_real_tags.astype(int), y_test_scores,
                                           visualize=True,
                                           graph_title='XGB\n' + task_name_title.capitalize() + " AUC on all iterations",
                                           save=True, folder=clf_folder_name)
            res_path = os.path.join(clf_folder_name, str(round(xgb_roc_auc, 5)))
        else:
            xgb_roc_auc = 0
            res_path = clf_folder_name

        if not os.path.exists(res_path):
            os.mkdir(res_path)

        if params["create_coeff_plots"]:
            self.plot_bacteria_coeff_average(bacteria_coeff_average, len(set(y)),
                                             params["TASK_TITLE"], clf_folder_name, bacteria,
                                             params["K_FOLD"], "XGB", res_path, BINARY, names)

        print_confusion_matrix(confusion_matrix_average, names, confusion_matrix_acc,
                               "XGB", task_name_title, res_path)

        if BINARY:
            _, _, _, xgb_train_roc_auc = roc_auc(all_y_train, y_train_scores, visualize=False,
                                                 graph_title="train auc", save=False,
                                                 folder=res_path)
        else:
            xgb_train_roc_auc = 0
            multi_class_roc_auc(all_y_train.astype(int), y_train_scores, names,
                                graph_title='XGB\n' + task_name_title.capitalize() + " AUC on all iterations",
                                save=True, folder=res_path)

        # ----------------------------------------! SAVE RESULTS -------------------------------------
        self.save_results(task_name_title, train_auc, test_auc, train_rho, test_rho,
                          confusion_matrix_average, confusion_matrix_acc,
                          train_accuracies, test_accuracies,
                          xgb_y_score_from_all_iter, xgb_y_pred_from_all_iter,
                          xgb_y_test_from_all_iter, "XGB", res_path)

        all_xgb_results.loc[len(all_xgb_results)] = [
            clf.learning_rate, clf.max_depth, clf.n_estimators, clf.objective,
            clf.gamma, clf.min_child_weight, clf.booster,
            xgb_train_roc_auc, np.mean(train_accuracies),
            xgb_roc_auc, np.mean(test_accuracies),
            precision_score(all_test_real_tags.astype(int), all_test_pred_tags, average='micro'),
            recall_score(all_test_real_tags.astype(int), all_test_pred_tags, average='micro')
        ]

        if BINARY:
            all_xgb_results = all_xgb_results.sort_values(by=['TEST-AUC'], ascending=False)
        else:
            all_xgb_results = all_xgb_results.sort_values(by=['TEST-ACC'], ascending=False)
        all_xgb_results.to_csv(xgb_results_file, index=False)
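# Illustrative sketch (assumption): the params dict consumed by the XGBoost fit() above.
# Only keys that fit() actually reads are shown; create_classifiers(params) is defined
# elsewhere and may expect additional hyperparameter-grid keys not listed here, and the
# class names and task title are hypothetical placeholders.
example_params = {
    "K_FOLD": 5,                            # number of cross-validation folds
    "CLASSES_NAMES": ["healthy", "sick"],   # hypothetical class names used in plots
    "create_coeff_plots": True,             # whether to plot per-bacteria coefficients
    "TASK_TITLE": "example task",           # hypothetical title for the coefficient plots
}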
# --- fragment: per-fold scoring and aggregate AUC (the enclosing function and k-fold loop
# --- headers are not shown here; the first block runs inside the loop, the rest after it)
y_score = clf_score.predict_proba(X_test)
y_test_scores.append(y_score[:, 1])
train_score = clf_score.predict_proba(X_train)
y_train_scores.append(train_score[:, 1])

# calc AUC per iteration (inside the k-fold loop)
fpr, tpr, thresholds = roc_curve(np.array(y_test), np.array(y_score).T[1])
roc_auc = auc(fpr, tpr)
print('ROC AUC = ' + str(round(roc_auc, 4)))

# calc AUC on all iterations (after the loop)
all_y_train = []
for i in range(k_fold):
    all_y_train.append(y_trains[i].values)
all_y_train = np.array(all_y_train).flatten()

all_y_test = []
for i in range(k_fold):
    all_y_test.append(y_tests[i].values)
all_y_test = np.array(all_y_test).flatten()

y_train_scores = np.array(y_train_scores).flatten()
y_test_scores = np.array(y_test_scores).flatten()

train_auc, test_auc, train_rho, test_rho = calc_auc_on_flat_results(
    all_y_train, y_train_scores, all_y_test, y_test_scores)

'''
test_auc, acc = nn_main(X, y, params, 'GDM_extra_features', Net, plot=True, k_fold=5)
print('Final auc: ' + str(test_auc))
nni.report_final_result(test_auc)
'''
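# calc_auc_on_flat_results() is imported from elsewhere in the project. A minimal sketch
# consistent with how it is called here (flattened true tags and scores for train and test,
# returning train AUC, test AUC and two correlation values) might look like the following.
# The Spearman part is an assumption based only on the "rho" return names.
from scipy.stats import spearmanr

def calc_auc_on_flat_results_sketch(all_y_train, y_train_scores, all_y_test, y_test_scores):
    fpr, tpr, _ = roc_curve(all_y_train, y_train_scores)
    train_auc = auc(fpr, tpr)
    fpr, tpr, _ = roc_curve(all_y_test, y_test_scores)
    test_auc = auc(fpr, tpr)
    train_rho, _ = spearmanr(all_y_train, y_train_scores)
    test_rho, _ = spearmanr(all_y_test, y_test_scores)
    print('Train AUC = ' + str(round(train_auc, 4)) + ', Test AUC = ' + str(round(test_auc, 4)))
    return train_auc, test_auc, train_rho, test_rho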
def fit(self, X, y, X_train_ids, X_test_ids, y_train_ids, y_test_ids, params, weights,
        bacteria, task_name_title, relative_path_to_save_results, pca_obj=None):
    if not os.path.exists(os.path.join(relative_path_to_save_results, "SVM")):
        os.makedirs(os.path.join(relative_path_to_save_results, "SVM"))
    os.chdir(os.path.join(os.path.abspath(os.path.curdir),
                          relative_path_to_save_results, "SVM"))
    print("SVM...")

    # update each classifier's results in a mutual file
    svm_results_file = Path("all_svm_results.csv")
    if not svm_results_file.exists():
        all_svm_results = pd.DataFrame(columns=[
            'KERNEL', 'GAMMA', 'C', 'TRAIN-AUC', 'TRAIN-ACC',
            'TEST-AUC', 'TEST-ACC', 'PRECISION', 'RECALL'
        ])
        all_svm_results.to_csv(svm_results_file, index=False)

    num_of_classes = len(set(y))
    BINARY = num_of_classes == 2
    optional_classifiers = self.create_classifiers(params, weights)

    for clf in optional_classifiers:
        all_svm_results = pd.read_csv(svm_results_file)
        clf_folder_name = "k=" + clf.kernel + "_c=" + str(clf.C) + "_g=" + clf.gamma
        if not os.path.exists(clf_folder_name):
            os.makedirs(clf_folder_name)

        X_trains, X_tests, y_trains, y_tests, svm_coefs = [], [], [], [], []
        svm_y_test_from_all_iter, svm_y_score_from_all_iter = np.array([]), np.array([])
        svm_y_pred_from_all_iter, svm_class_report_from_all_iter = np.array([]), np.array([])
        train_accuracies, test_accuracies, confusion_matrixes, y_train_preds, y_train_scores, \
            y_test_preds, y_test_scores = [], [], [], [], [], [], []
        bacteria_coeff_average = []

        for i in range(params["K_FOLD"]):
            print('------------------------------\niteration number ' + str(i))
            X_train, X_test, y_train, y_test = (X.loc[X_train_ids[i]], X.loc[X_test_ids[i]],
                                                y[y_train_ids[i]], y[y_test_ids[i]])
            X_trains.append(X_train)
            X_tests.append(X_test)
            y_trains.append(y_train)
            y_tests.append(y_test)

            # FIT
            clf.fit(X_train, y_train)

            # GET RESULTS
            y_score = clf.decision_function(X_test)
            y_pred = clf.predict(X_test)
            y_test_preds.append(y_pred)
            svm_class_report = classification_report(y_test, y_pred).split("\n")
            train_pred = clf.predict(X_train)
            train_score = clf.decision_function(X_train)
            y_train_preds.append(train_pred)
            y_train_scores.append(train_score)
            y_test_scores.append(y_score)

            # SAVE RESULTS
            train_accuracies.append(accuracy_score(y_train, train_pred))
            test_accuracies.append(accuracy_score(y_test, y_pred))
            confusion_matrixes.append(confusion_matrix(y_test, y_pred))
            if BINARY:
                self.print_auc_for_iter(np.array(y_test), np.array(y_score))
            self.save_y_test_and_score(y_test, y_pred, y_score, svm_class_report)

            # --------------------------------------------! COEFF PLOTS -----------------------------------------
            if params["create_coeff_plots"]:
                svm_coefs, coefficients, bacteria_coeff_average = \
                    self.calc_bacteria_coeff_average(num_of_classes, pca_obj, bacteria, clf,
                                                     svm_coefs, bacteria_coeff_average)

        # --------------------------------------------! AUC -----------------------------------------
        all_y_train = np.array(y_trains).flatten()
        all_predictions_train = np.array(y_train_preds).flatten()
        y_train_scores = np.array(y_train_scores).flatten()
        all_test_real_tags = np.array(y_tests).flatten()
        all_test_pred_tags = np.array(y_test_preds).flatten()
        y_test_scores = np.array(y_test_scores).flatten()

        train_auc, test_auc, train_rho, test_rho = \
            calc_auc_on_flat_results(all_y_train, y_train_scores,
                                     all_test_real_tags, y_test_scores)
        # ----------------------------------------! CONFUSION MATRIX -------------------------------------
        print("------------------------------")
        names = params["CLASSES_NAMES"]
        confusion_matrix_average, confusion_matrix_acc = edit_confusion_matrix(
            confusion_matrixes, "SVM", names, BINARY=BINARY)

        if BINARY:
            _, _, _, svm_roc_auc = roc_auc(all_test_real_tags.astype(int), y_test_scores,
                                           visualize=True,
                                           graph_title='SVM\n' + task_name_title.capitalize() + " AUC on all iterations",
                                           save=True, folder=clf_folder_name)
            res_path = os.path.join(clf_folder_name, str(round(svm_roc_auc, 5)))
        else:
            svm_roc_auc = 0
            res_path = clf_folder_name

        if not os.path.exists(res_path):
            os.mkdir(res_path)

        if params["create_coeff_plots"]:
            self.plot_bacteria_coeff_average(bacteria_coeff_average, len(set(y)),
                                             params["TASK_TITLE"], clf_folder_name, bacteria,
                                             params["K_FOLD"], "SVM", res_path, BINARY, names)

        print_confusion_matrix(confusion_matrix_average, names, confusion_matrix_acc,
                               "SVM", task_name_title, res_path)

        if BINARY:
            _, _, _, svm_train_roc_auc = roc_auc(all_y_train, y_train_scores, visualize=False,
                                                 graph_title="train auc", save=False,
                                                 folder=res_path)
        else:
            svm_train_roc_auc = 0
            multi_class_roc_auc(all_y_train.astype(int), y_train_scores, names,
                                graph_title='SVM\n' + task_name_title.capitalize() + " AUC on all iterations",
                                save=True, folder=res_path)

        # ----------------------------------------! SAVE RESULTS -------------------------------------
        self.save_results(task_name_title, train_auc, test_auc, train_rho, test_rho,
                          confusion_matrix_average, confusion_matrix_acc,
                          train_accuracies, test_accuracies,
                          svm_y_score_from_all_iter, svm_y_pred_from_all_iter,
                          svm_y_test_from_all_iter, "SVM", res_path)

        all_svm_results.loc[len(all_svm_results)] = [
            clf.kernel, clf.C, clf.gamma,
            svm_train_roc_auc, np.mean(train_accuracies),
            svm_roc_auc, np.mean(test_accuracies),
            precision_score(all_test_real_tags.astype(int), all_test_pred_tags, average='micro'),
            recall_score(all_test_real_tags.astype(int), all_test_pred_tags, average='micro')
        ]

        if BINARY:
            all_svm_results = all_svm_results.sort_values(by=['TEST-AUC'], ascending=False)
        else:
            all_svm_results = all_svm_results.sort_values(by=['TEST-ACC'], ascending=False)
        all_svm_results.to_csv(svm_results_file, index=False)
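# Editorial note on the PRECISION/RECALL columns written by both fit() methods above:
# with average='micro' on a single-label task, precision_score and recall_score both reduce
# to overall accuracy, so these two columns will track TEST-ACC. A quick illustrative check:
from sklearn.metrics import precision_score, recall_score, accuracy_score

y_true_demo = [0, 1, 2, 1, 0, 2]
y_pred_demo = [0, 2, 2, 1, 0, 1]
assert precision_score(y_true_demo, y_pred_demo, average='micro') == \
       recall_score(y_true_demo, y_pred_demo, average='micro') == \
       accuracy_score(y_true_demo, y_pred_demo)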
def learn(X_trains, X_tests, y_trains, y_tests, k_fold, task):
    # flatten the true tags of all folds
    all_y_train = []
    for i in range(k_fold):
        all_y_train.append(y_trains[i]['Tag'].values)
    all_y_train = np.array(all_y_train).flatten()

    all_y_test = []
    for i in range(k_fold):
        all_y_test.append(y_tests[i]['Tag'].values)
    all_y_test = np.array(all_y_test).flatten()

    # SVM
    clf = svm.SVC(kernel='linear', C=0.1, gamma='scale', class_weight='balanced')
    y_test_scores, y_train_scores = [], []
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    fig, ax = plt.subplots()
    for i in range(k_fold):
        print('------------------------------\niteration number ' + str(i))
        X_train, X_test, y_train, y_test = X_trains[i], X_tests[i], y_trains[i], y_tests[i]
        # FIT
        clf.fit(X_train, y_train)
        # GET RESULTS
        y_score = clf.decision_function(X_test)
        train_score = clf.decision_function(X_train)
        y_train_scores.append(train_score)
        y_test_scores.append(y_score)
        '''
        viz = plot_roc_curve(clf, X_test, y_test, name='ROC fold {}'.format(i),
                             alpha=0.3, lw=1, ax=ax)
        interp_tpr = interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)
        '''
        print_auc_for_iter(np.array(y_tests[i]['Tag'].values), np.array(y_score).T)

    '''
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs) / np.sqrt(k_fold)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8)
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')
    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
           title="Receiver operating characteristic " + task)
    ax.legend(loc="lower right")
    plt.savefig(task + ".svg")
    '''

    y_train_scores = np.array(y_train_scores).flatten()
    y_test_scores = np.array(y_test_scores).flatten()
    SVM_train_auc, SVM_test_auc, _, _ = calc_auc_on_flat_results(
        all_y_train, y_train_scores, all_y_test, y_test_scores)

    # XGBOOST
    clf = XGBClassifier(max_depth=5, learning_rate=0.01, n_estimators=100,
                        objective='binary:logistic', gamma=0.5, min_child_weight=3,
                        booster='gbtree')
    y_test_scores, y_train_scores = [], []
    for i in range(k_fold):
        print('------------------------------\niteration number ' + str(i))
        X_train, X_test, y_train, y_test = X_trains[i], X_tests[i], y_trains[i], y_tests[i]
        # FIT
        clf.fit(X_train, y_train)
        # GET RESULTS
        y_score = clf.predict_proba(X_test)
        train_score = clf.predict_proba(X_train)
        y_train_scores.append(train_score[:, 1])
        y_test_scores.append(y_score[:, 1])
        print_auc_for_iter(np.array(y_tests[i]['Tag'].values), np.array(y_score).T[1])

    y_train_scores = np.array(y_train_scores).flatten()
    y_test_scores = np.array(y_test_scores).flatten()
    XGB_train_auc, XGB_test_auc, _, _ = calc_auc_on_flat_results(
        all_y_train, y_train_scores, all_y_test, y_test_scores)

    # NN
    NN_test_auc = 0
    NN_train_auc = 0
    for i in range(k_fold):
        Net = models_nn['relu_b']
        print('------------------------------\niteration number ' + str(i))
        X_train, X_test, y_train, y_test = X_trains[i], X_tests[i], y_trains[i], y_tests[i]
        train_auc, test_auc = nn_model(X_train, X_test, y_train, y_test, Net)
        NN_train_auc += train_auc
        NN_test_auc += test_auc
    NN_train_auc /= k_fold
    NN_test_auc /= k_fold

    return SVM_train_auc, SVM_test_auc, XGB_train_auc, XGB_test_auc, NN_train_auc, NN_test_auc
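# Hypothetical usage sketch (not in the original code): the comparison learn() above returns
# the train/test AUC of each of the three models, which can be printed side by side. The
# pre-split folds are assumed to come from elsewhere (e.g. example_run-style splitting),
# and the 'GDM' task name and print_model_comparison helper are illustrative only.
def print_model_comparison(X_trains, X_tests, y_trains, y_tests, k_fold=5, task='GDM'):
    results = learn(X_trains, X_tests, y_trains, y_tests, k_fold, task)
    for name, (tr, te) in zip(['SVM', 'XGB', 'NN'], zip(results[0::2], results[1::2])):
        print(name + ': train AUC = ' + str(round(tr, 4)) + ', test AUC = ' + str(round(te, 4)))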
def learn_XGBOOST(X_trains, X_tests, y_trains, y_tests, k_fold, clf_params,
                  clf_ens_params, df_concat):
    # create classifiers: an inner model trained on the raw features, and an ensemble model
    # trained on the inner model's scores concatenated with df_concat
    clf_score = XGBClassifier(max_depth=int(clf_params['max_depth']),
                              learning_rate=clf_params['lr'],
                              n_estimators=int(clf_params['estimators']),
                              objective='binary:logistic',
                              gamma=clf_params['gamma'],
                              min_child_weight=int(clf_params['min_child_weight']),
                              reg_lambda=clf_params['lambda'],
                              booster='dart',
                              alpha=clf_params['alpha'])
    ens_clf = XGBClassifier(max_depth=int(clf_ens_params['max_depth']),
                            learning_rate=clf_ens_params['lr'],
                            n_estimators=int(clf_ens_params['estimators']),
                            objective='binary:logistic',
                            gamma=clf_ens_params['gamma'],
                            min_child_weight=int(clf_ens_params['min_child_weight']),
                            reg_lambda=clf_ens_params['lambda'],
                            booster='dart',
                            alpha=clf_ens_params['alpha'])

    y_train_scores, y_test_scores, y_train_scores_ens, y_test_scores_ens = [], [], [], []
    all_y_train_ens, all_y_test_ens = [], []
    for i in range(k_fold):
        print('------------------------------\niteration number ' + str(i))
        X_train, X_test, y_train, y_test = X_trains[i], X_tests[i], y_trains[i], y_tests[i]

        # train the inner XGBOOST model
        clf_score.fit(X_train, y_train)
        y_score = clf_score.predict_proba(X_test)
        y_test_scores.append(y_score[:, 1])
        train_score = clf_score.predict_proba(X_train)
        y_train_scores.append(train_score[:, 1])

        # build a new data frame for learning the full (ensemble) model
        score_train_df = build_score_df(X_train, train_score[:, 1])
        X_train_ens, y_train_ens = create_concate_df_to_learn(score_train_df, df_concat)
        all_y_train_ens.append(y_train_ens.values)
        score_test_df = build_score_df(X_test, y_score[:, 1])
        X_test_ens, y_test_ens = create_concate_df_to_learn(score_test_df, df_concat)
        all_y_test_ens.append(y_test_ens.values)

        # train the full model on the inner model's score predictions
        ens_clf.fit(X_train_ens, y_train_ens)
        y_score_ens = ens_clf.predict_proba(X_test_ens)
        y_test_scores_ens.append(y_score_ens[:, 1])
        train_score_ens = ens_clf.predict_proba(X_train_ens)
        y_train_scores_ens.append(train_score_ens[:, 1])

        # print the AUC of each fold
        print('inner model')
        print_auc_for_iter(np.array(y_test), np.array(y_score).T[1])
        print('ensemble model')
        print_auc_for_iter(np.array(y_test_ens), np.array(y_score_ens).T[1])

    # calc AUC on the validation set - inner model
    all_y_train = []
    for i in range(k_fold):
        all_y_train.append(y_trains[i]['Tag'].values)
    all_y_train = np.array(all_y_train).flatten()

    all_y_test = []
    for i in range(k_fold):
        all_y_test.append(y_tests[i]['Tag'].values)
    all_y_test = np.array(all_y_test).flatten()

    y_train_scores = np.array(y_train_scores).flatten()
    y_test_scores = np.array(y_test_scores).flatten()
    print('Inner Model')
    _, test_auc, _, _ = calc_auc_on_flat_results(all_y_train, y_train_scores,
                                                 all_y_test, y_test_scores)

    # calc AUC on the validation set - ensemble model
    all_y_train_ens = np.array(all_y_train_ens).flatten()
    all_y_test_ens = np.array(all_y_test_ens).flatten()
    y_train_scores = np.array(y_train_scores_ens).flatten()
    y_test_scores = np.array(y_test_scores_ens).flatten()
    print('Ensemble Model')
    _, test_auc, _, _ = calc_auc_on_flat_results(all_y_train_ens, y_train_scores,
                                                 all_y_test_ens, y_test_scores)
    return test_auc
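# build_score_df() and create_concate_df_to_learn() are project helpers defined elsewhere.
# A minimal sketch of what they might do, under the assumption that X_train/X_test are
# DataFrames indexed by sample id and that df_concat carries the extra per-sample features
# plus a 'Tag' label column (the column name follows the rest of this module). The names
# below are hypothetical and for illustration only.
def build_score_df_sketch(X, scores):
    # one row per sample: the inner model's positive-class score, keyed by the sample index
    return pd.DataFrame({'xgb_score': scores}, index=X.index)

def create_concate_df_to_learn_sketch(score_df, df_concat):
    # join the scores with the extra features and split off the label column
    merged = score_df.join(df_concat, how='inner')
    y = merged['Tag']
    X = merged.drop(columns=['Tag'])
    return X, y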