def predict_fake_input(model, task, title):
    generated = data_handler.load_fake_input(task)
    print('Number of generated conditions : ', generated.shape)
    if task == 0:
        # classification: keep the probability of the positive class
        pred = model.predict_proba(generated)
        final_state = pd.Series(pred[:, 1], name='Pred_Result')
    elif task == 1:
        # regression: keep the predicted value directly
        pred = model.predict(generated)
        final_state = pd.Series(pred, name='Pred_Result')
    else:
        # guard: final_state would otherwise be undefined below
        raise ValueError('Error: invalid task spec!')
    result = pd.concat([generated, final_state], axis=1)
    data_handler.save_csv(result, title + 'pred_fake_input')
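# Usage sketch (hedged: `clf` is a hypothetical fitted classifier; task=0
# selects the predict_proba branch above):
#
#   clf = XGBClassifier(objective='binary:logistic').fit(X_train, Y_train)
#   predict_fake_input(clf, task=0, title='mos2_')
#
# The saved CSV is the generated conditions with a 'Pred_Result' column
# appended (positive-class probability for task 0, raw prediction for task 1).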
def extract_feature_importance(model, X, title):
    print('Feature importance...')
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    shap.summary_plot(shap_values, feature_names=X.columns, plot_type="bar")
    # normalize importance values so they sum to 1
    sum_col = np.abs(shap_values).sum(axis=0)
    imp = np.array(sum_col / sum_col.sum())
    # sort features by descending importance
    ind = np.argsort(imp)[::-1]
    sorted_imp = imp[ind]
    sorted_feature = X.columns[ind]
    feature_imp_sorted = pd.DataFrame([sorted_imp], columns=sorted_feature)
    print(feature_imp_sorted)
    data_handler.save_csv(feature_imp_sorted,
                          title=title + 'feature_imp_sorted')
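# Usage sketch (assumption: `gb_clf` is a fitted GridSearchCV over a tree
# model, as elsewhere in this script). shap.TreeExplainer expects the bare
# estimator rather than the GridSearchCV wrapper, hence best_estimator_:
#
#   extract_feature_importance(gb_clf.best_estimator_, X_df, title='mos2_')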
def plot_learning_curve_versus_tr_set_size(title='', save_csv=True,
                                           scoring='roc_auc'):
    # 5-fold outer cross-validation: each split holds out 20% of the data
    # as a validation set; the inner CV tunes the XGBoost hyperparameters.
    X, Y = data_handler.load_XY()
    _ylabel = 'Mean AUROC'

    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=6)
    # note: sklearn forbids passing random_state when shuffle=False
    inner_cv = StratifiedKFold(n_splits=5, shuffle=False)
    xgb_clf = XGBClassifier(objective="binary:logistic", min_child_weight=1,
                            **{'tree_method': 'exact'}, silent=True,
                            n_jobs=1, random_state=3, seed=3)
    tuned_parameters = dict(learning_rate=[0.01, 0.1],
                            n_estimators=[100, 300, 500],
                            colsample_bylevel=[0.5, 0.7, 0.9],
                            gamma=[0, 0.2, 0.4],
                            max_depth=[3, 5, 7],
                            reg_lambda=[0.1, 1, 10],
                            subsample=[0.4, 0.7, 1])
    xgb_cv = GridSearchCV(xgb_clf, tuned_parameters, cv=inner_cv,
                          scoring='roc_auc', verbose=0, n_jobs=1)

    # create learning curve values
    train_sizes = np.linspace(.2, 1.0, 5)
    train_sizes, train_scores, test_scores = learning_curve(
        xgb_cv, X, Y, cv=outer_cv, n_jobs=4, train_sizes=train_sizes,
        scoring=scoring)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    tr_size_df = pd.Series(train_sizes, name='training_set_size')
    tr_sc_m_df = pd.Series(train_scores_mean, name='training_score_mean')
    cv_sc_m_df = pd.Series(test_scores_mean, name='cv_score_mean')
    tr_sc_std_df = pd.Series(train_scores_std, name='training_score_std')
    cv_sc_std_df = pd.Series(test_scores_std, name='cv_score_std')
    if save_csv:
        res = pd.concat(
            [tr_size_df, tr_sc_m_df, cv_sc_m_df, tr_sc_std_df, cv_sc_std_df],
            axis=1)
        data_handler.save_csv(data=res, title=title + '_learning_curve')

    # plotting
    _ylim = (0.5, 1.01)
    fig = plt.figure(figsize=(12, 12 / 1.618))
    ax1 = fig.add_subplot(111)
    ax1.set_ylim(_ylim)
    ax1.set_xlabel("Number of Training Samples")
    ax1.set_ylabel(_ylabel)
    plt.grid(False)
    ax1.plot(tr_size_df, tr_sc_m_df, 'o-', color="r", label="Training")
    ax1.plot(tr_size_df, cv_sc_m_df, '^--', color="b",
             label="Cross-Validation")
    plt.setp(ax1.spines.values(), color='black')
    plt.legend(loc="lower right")
    plt.show()
    to_path = data_handler.format_title(to_dir, title + '_learning_curve',
                                        '.png')
    fig.savefig(to_path, dpi=1000, bbox_inches="tight", pad_inches=0)
    return to_path
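# Usage sketch: a single call runs the nested CV, writes the CSV (when
# save_csv=True), and saves the PNG under the module-level `to_dir`:
#
#   plot_learning_curve_versus_tr_set_size(title='mos2', scoring='roc_auc')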
def plot_learning_curve_versus_tr_epoch(title='', ntrials=1, nfolds=10,
                                        save_csv=False, verbose=True,
                                        save_fig=False):
    X_df, Y_df = data_handler.load_XY()
    X = X_df.values
    Y = Y_df.values
    _ylabel = 'Mean AUROC'

    # cross-validation setup
    Ntrials = ntrials
    outer_nsplit = nfolds
    tot_count = Ntrials * outer_nsplit
    n_epochs = 500  # must equal n_estimators below

    # results store: one row of per-epoch AUC per outer fold
    train_mat = np.zeros((tot_count, n_epochs))
    test_mat = np.zeros((tot_count, n_epochs))

    for i in range(Ntrials):
        init_time = time.time()
        print("trial = ", i)
        train_index = []
        test_index = []
        outer_cv = StratifiedKFold(n_splits=outer_nsplit, shuffle=True,
                                   random_state=i)
        for train_ind, test_ind in outer_cv.split(X, Y):
            train_index.append(train_ind.tolist())
            test_index.append(test_ind.tolist())

        for j in range(outer_nsplit):
            count = i * outer_nsplit + j
            print(str(count), " / ", str(tot_count))
            X_train = X[train_index[j]]
            Y_train = Y[train_index[j]]
            X_test = X[test_index[j]]
            Y_test = Y[test_index[j]]
            eval_sets = [(X_train, Y_train), (X_test, Y_test)]
            clf = XGBClassifier(objective="binary:logistic",
                                min_child_weight=1,
                                **{'tree_method': 'exact'}, silent=True,
                                n_jobs=4, random_state=3, seed=3,
                                learning_rate=0.01, colsample_bylevel=0.9,
                                colsample_bytree=0.9, n_estimators=n_epochs,
                                gamma=0.8, max_depth=11, reg_lambda=0.8,
                                subsample=0.4)
            clf.fit(X_train, Y_train, eval_metric=['auc'],
                    eval_set=eval_sets, verbose=False)
            results = clf.evals_result()
            epochs = len(results['validation_0']['auc'])
            # record per-epoch AUC on the training and held-out folds
            train_mat[count] = results['validation_0']['auc']
            test_mat[count] = results['validation_1']['auc']
            if verbose:
                print('Iter: %d, epochs: %d' % (count, epochs))
                print('training result: %.4f, testing result: %.4f' %
                      (train_mat[count][-1], test_mat[count][-1]))
        print('total time: %.4f mins' % ((time.time() - init_time) / 60))

    # results store
    epoch_lists = list(range(1, n_epochs + 1))
    train_results = pd.DataFrame(
        data=train_mat, columns=['epoch_' + str(i) for i in epoch_lists])
    test_results = pd.DataFrame(
        data=test_mat, columns=['epoch_' + str(i) for i in epoch_lists])
    if save_csv:
        data_handler.save_csv(train_results,
                              title='mos2_learning_curve_train_raw')
        data_handler.save_csv(test_results,
                              title='mos2_learning_curve_test_raw')
    print('end')

    # create learning curve values
    train_scores_mean = np.mean(train_mat, axis=0)
    train_scores_std = np.std(train_mat, axis=0)
    test_scores_mean = np.mean(test_mat, axis=0)
    test_scores_std = np.std(test_mat, axis=0)

    tr_size_df = pd.Series(epoch_lists, name='training_epoch')
    tr_sc_m_df = pd.Series(train_scores_mean, name='training_score_mean')
    val_sc_m_df = pd.Series(test_scores_mean, name='val_score_mean')
    tr_sc_std_df = pd.Series(train_scores_std, name='training_score_std')
    val_sc_std_df = pd.Series(test_scores_std, name='val_score_std')
    if save_csv:
        res = pd.concat(
            [tr_size_df, tr_sc_m_df, val_sc_m_df, tr_sc_std_df,
             val_sc_std_df], axis=1)
        data_handler.save_csv(data=res, title=title + '_learning_curve')

    # plotting
    _ylim = (0.5, 1.01)
    fig = plt.figure(figsize=(12, 12 / 1.618))
    ax1 = fig.add_subplot(111)
    ax1.set_ylim(_ylim)
    ax1.set_xlabel("Number of Training Epochs")
    ax1.set_ylabel(_ylabel)
    plt.grid(False)
    ax1.plot(tr_size_df, tr_sc_m_df, color="r", label="Training")
    ax1.plot(tr_size_df, val_sc_m_df, color="b", label="Validation")
    # optional error bars:
    # ax1.errorbar(tr_size_df, tr_sc_m_df, yerr=tr_sc_std_df, color="r")
    # ax1.errorbar(tr_size_df, val_sc_m_df, yerr=val_sc_std_df)
    plt.setp(ax1.spines.values(), color='black')
    plt.legend(loc="lower right")
    plt.show()

    to_path = None
    if save_fig:
        to_path = data_handler.format_title(to_dir,
                                            title + '_learning_curve', '.png')
        fig.savefig(to_path, dpi=1000, bbox_inches="tight", pad_inches=0.1)
    return to_path
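# Usage sketch: one trial of 10-fold CV fits tot_count = ntrials * nfolds
# boosters of 500 trees each, so expect a long runtime:
#
#   plot_learning_curve_versus_tr_epoch(title='mos2', ntrials=1, nfolds=10,
#                                       save_csv=True, save_fig=True)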
def plot_ROC_curve(pipe, tuned_parameters, title='roc_curve', save_csv=True,
                   task=0):
    # cross-validation setup
    Ntrials = 1
    outer_nsplit = 10
    inner_nsplit = 10

    # results store
    Y_true = pd.Series(name='Y_true', dtype=float)
    pred_results = pd.Series(name='pred_prob', dtype=float)

    # load data
    assert task in (0, 2), 'Error: invalid task spec!'
    X_df, Y_df = data_handler.load_XY(task)
    X = X_df.values
    Y = Y_df.values

    for i in range(Ntrials):
        train_index = []
        test_index = []
        outer_cv = StratifiedKFold(n_splits=outer_nsplit, shuffle=True,
                                   random_state=i)
        for train_ind, test_ind in outer_cv.split(X, Y):
            train_index.append(train_ind.tolist())
            test_index.append(test_ind.tolist())

        for j in range(outer_nsplit):
            print("progress >> ", j, ' / ', outer_nsplit)
            X_train = X[train_index[j]]
            Y_train = Y[train_index[j]]
            X_test = X[test_index[j]]
            Y_test = Y[test_index[j]]
            # shuffle=True so that the per-fold random_state takes effect
            # (sklearn forbids random_state when shuffle=False)
            inner_cv = StratifiedKFold(n_splits=inner_nsplit, shuffle=True,
                                       random_state=j)
            clf = GridSearchCV(pipe, tuned_parameters, cv=inner_cv,
                               scoring='roc_auc')
            clf.fit(X_train, Y_train)
            pred = pd.Series(clf.predict_proba(X_test)[:, 1])
            pred_results = pd.concat([pred_results, pred], axis=0,
                                     ignore_index=True)
            Y_test_df = pd.Series(Y_test, name='Y_test')
            Y_true = pd.concat([Y_true, Y_test_df], axis=0,
                               ignore_index=True)

    # plotting
    fpr, tpr, thresholds = metrics.roc_curve(Y_true, pred_results)
    auc_value = metrics.roc_auc_score(Y_true, pred_results)
    fig = plt.figure(figsize=(12, 12 / 1.618))
    ax1 = fig.add_subplot(111)
    labl = np.linspace(0, 1, 6)
    labels = [float("{0:.2f}".format(x)) for x in labl]
    ax1.set_xticks(labels)
    ax1.set_xticklabels(labels)
    labels[0] = ''  # suppress the origin label on the y-axis
    ax1.set_yticklabels(labels)
    plt.grid(False)
    ax1.plot(fpr, tpr, lw=2,
             label='ROC curve (area = {:.2f})'.format(auc_value),
             marker='.', linestyle='-', color='b')
    ax1.plot([0, 1], [0, 1], linestyle='--', color='k')
    ax1.set_xlabel('False Positive Rate')
    ax1.set_ylabel('True Positive Rate')
    ax1.set_xlim(0, 1)
    ax1.set_ylim(0, 1)
    ax1.legend(loc='lower right')
    plt.setp(ax1.spines.values(), color='black')
    ax1.yaxis.set_visible(True)
    ax1.xaxis.set_visible(True)
    ax1.yaxis.set_ticks_position('left')
    ax1.xaxis.set_ticks_position('bottom')
    ax1.get_yaxis().set_tick_params(direction='out', width=2)
    plt.show()
    fig.savefig(data_handler.format_title(to_dir, title + '_ROC_curve',
                                          '.png'),
                dpi=1000, bbox_inches="tight", pad_inches=0)

    # save results to csv if requested
    if save_csv:
        data_mat = np.array([fpr, tpr]).T
        ret = pd.DataFrame(data_mat, columns=['fpr', 'tpr'])
        data_handler.save_csv(ret, title + '_ROC_curve')
    return True
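# Usage sketch (hedged: the estimator and grid below are illustrative; any
# estimator exposing predict_proba works as `pipe`):
#
#   pipe = XGBClassifier(objective='binary:logistic', n_jobs=4,
#                        random_state=3)
#   tuned_parameters = dict(learning_rate=[0.01, 0.1],
#                           n_estimators=[100, 300],
#                           max_depth=[3, 5, 7])
#   plot_ROC_curve(pipe, tuned_parameters, title='mos2', task=0)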
def PAM_regression(save_csv=False, verbose=False, to_break=True,
                   title='cqd_PAM_', batch=1):
    ## start PAM-guided synthesis...
    init_time = time.time()
    Nc = 0  # critical point

    # construct the initial training set
    results_mat = np.zeros((totalSamp - init_train_size, 12))
    train_ind = random.sample(all_ind_wo_max, init_train_size)
    test_ind = [x for x in all_ind if x not in train_ind]
    if verbose:
        print('initial training set indexes', train_ind)

    # set up result storage to compute eval metrics, in the order of PAM;
    # the initial training set is ignored, as it is not chosen by PAM
    pred_results = np.zeros(totalSamp - init_train_size)
    true_results = np.zeros(totalSamp - init_train_size)

    # hyperparameter ranges for tuning
    tuned_parameters = dict(learning_rate=[0.01],
                            n_estimators=[300, 500, 700],
                            colsample_bylevel=[0.5, 0.7, 0.9],
                            gamma=[0, 0.2],
                            max_depth=[3, 7, 11],
                            reg_lambda=[0.1, 1, 10],
                            subsample=[0.4, 0.7, 1])

    j = 0
    loop_count = 0
    mean_y_only_init = np.mean(Y[train_ind])
    std_y_only_init = np.std(Y[train_ind])
    while j < totalSamp - init_train_size:
        inner_cv = KFold(n_splits=inner_nsplits, shuffle=True,
                         random_state=j)
        X_train = X[train_ind]
        Y_train = Y[train_ind]
        X_test = X[test_ind]
        Y_test = Y[test_ind]
        last_max = np.max(Y_train)

        # gradient-boosted regression with inner-CV hyperparameter tuning
        # ("reg:linear" is a deprecated alias of "reg:squarederror")
        reg = xgb.XGBRegressor(objective="reg:squarederror",
                               min_child_weight=1,
                               **{'tree_method': 'exact'}, silent=True,
                               n_jobs=4, random_state=3, seed=3)
        gb_clf = GridSearchCV(reg, tuned_parameters, cv=inner_cv,
                              scoring='r2', verbose=0, n_jobs=4)
        gb_clf.fit(X_train, Y_train)
        y_pred = gb_clf.predict(X_test)

        # choose the batch of conditions with the best predicted yield
        best_pos_ind = np.argsort(-y_pred)[:batch]
        best_prob = y_pred[best_pos_ind]
        next_ind = np.array(test_ind)[best_pos_ind]

        # update results storage
        train_size = len(Y_train)
        temp = list(range(0, len(y_pred)))
        ind_notbest = [x for x in temp if x not in best_pos_ind]
        start_ptr = j
        end_ptr = np.min([start_ptr + batch, totalSamp - init_train_size])
        pred_results[start_ptr:end_ptr] = best_prob
        pred_results[end_ptr:totalSamp - init_train_size] = \
            y_pred[ind_notbest]
        true_results[start_ptr:end_ptr] = Y_test[best_pos_ind]
        true_results[end_ptr:totalSamp - init_train_size] = \
            Y_test[ind_notbest]
        pred_metrics = test(pred_results, true_results, end_ptr - 1)

        # calculate results
        next_best_true_ind = next_ind[np.argmax(Y_test[best_pos_ind])]
        next_best_y_true = np.max(Y_test[best_pos_ind])
        result_list = [train_size, next_best_true_ind, next_best_y_true,
                       best_prob[0]] + pred_metrics
        results_mat[loop_count, :] = np.array(result_list)
        loop_count = loop_count + 1
        j = j + batch
        if verbose:
            print(loop_count, '->', j,
                  ', best_next_ind=', next_best_true_ind,
                  ' best_Y_true=', "{0:.6f}".format(next_best_y_true),
                  ' train_max=', "{0:.6f}".format(last_max),
                  ' r2=', pred_metrics[0])

        train_ind = [*train_ind, *next_ind]
        test_ind = [x for x in test_ind if x not in next_ind]

        ## critical point: the global optimum has been found
        if next_best_y_true == Y_global_max and Nc == 0:
            Nc = j + init_train_size
            if to_break:
                break

    saved_title = '-'
    if save_csv:
        # one row per PAM iteration (0:loop_count, not 0:j; the two differ
        # when batch > 1)
        results = pd.DataFrame(data=results_mat[0:loop_count, :], columns=[
            'sample_size', 'pred_ind', 'best_pred_result', 'y_true', 'r2',
            'pearson', 'p_value', 'mse', 'r2_s', 'pearson_s', 'p_value_s',
            'mse_s'
        ])
        saved_title = data_handler.save_csv(results, title=title)

    # compute stats
    mean_y_wo_init = np.mean(true_results[0:j])
    std_y_wo_init = np.std(true_results[0:j])
    mean_y_w_init = np.mean(Y[train_ind])
    std_y_w_init = np.std(Y[train_ind])
    run_time = (time.time() - init_time) / 60
    return [saved_title, Nc, mean_y_wo_init, std_y_wo_init, mean_y_w_init,
            std_y_w_init, mean_y_only_init, std_y_only_init, run_time]
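# PAM_regression relies on module-level state prepared elsewhere in this
# script. A minimal sketch of those assumptions (names as used above; the
# exact setup in the original script may differ):
#
#   X, Y = ...                            # full design matrix and yields
#   totalSamp = X.shape[0]
#   init_train_size = 10                  # seed-set size (illustrative)
#   all_ind = list(range(totalSamp))
#   Y_global_max = np.max(Y)
#   all_ind_wo_max = [i for i in all_ind if Y[i] != Y_global_max]
#   inner_nsplits = 10                    # inner-CV folds
#
# Excluding the global optimum from the candidate seed set (all_ind_wo_max)
# presumably ensures that PAM itself, not the random seed, has to find the
# maximum-yield condition.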
# run several repetitions of PAM and save the results for backup
for j in range(0, outer_loop):
    init_time = time.time()
    res_arr = []
    for i in range(0, inner_loop):
        loop_count = j * inner_loop + i
        result = PAM_regression(save_csv=False, verbose=False,
                                to_break=True,
                                title='cqd_PAM_' + str(loop_count) +
                                'th_loop_')
        res_arr.append(result)
        print(str(loop_count), ' -> ', str(result[0]),
              ' time=', result[-1])
    PAM_df = pd.DataFrame(data=res_arr, columns=[
        'file-name', 'num_experiments', 'mean_y_wo_init', 'std_y_wo_init',
        'mean_y_w_init', 'std_y_w_init', 'mean_y_only_init',
        'std_y_only_init', 'run_time'
    ])
    saved_path = data_handler.save_csv(PAM_df,
                                       title='cqd_PAM_' + str(inner_loop) +
                                       'times_')
    print('total = ', str((time.time() - init_time) / 3600),
          ' hrs >>-------saved')
def PAM_classfication(verbose=False, save_csv=False, to_break=True,
                      title='mos2_PAM_'):
    '''
    PAM for the classification problem.

    Arguments:
        verbose : Bool.
        save_csv: Bool. Whether to save detailed PAM results to a csv file.
        to_break: Bool. Whether to enforce the additional stopping condition
                  once the critical point is found.

    Return:
        [saved_title, Nc] + results_mat[j] + [run_time], where Nc is the
        critical point: the training-set size at which the best predicted
        probability first drops below 0.5.
    '''
    # critical point
    Nc = 0
    init_time = time.time()

    # set up initial sets
    init_sets = generate_init_sets()
    train_ind = init_sets['train_ind']
    test_ind = init_sets['test_ind']
    if verbose:
        print('initial training set indexes', train_ind)

    # results store
    init_train_size = len(train_ind)
    init_cnot_count = list(Y[train_ind]).count(0)
    init_can_count = init_train_size - init_cnot_count
    results_mat = np.zeros((totalSamp - init_train_size, 8))

    # hyperparameter ranges to tune
    tuned_parameters = dict(learning_rate=[0.01],
                            n_estimators=[100, 300, 500],
                            gamma=[0, 0.2, 0.4],
                            max_depth=[5, 7, 9, 11],
                            reg_lambda=[0.1, 1, 10],
                            colsample_bylevel=[0.9],
                            subsample=[0.4, 0.7, 1])

    # start PAM-guided synthesis...
    for j in range(totalSamp):
        inner_cv = StratifiedKFold(n_splits=inner_nsplits, shuffle=True,
                                   random_state=j)
        X_train = X[train_ind]
        Y_train = Y[train_ind]
        X_test = X[test_ind]
        Y_test = Y[test_ind]

        # count pos/neg of the training set
        tr_zero_count = list(Y_train).count(0)
        tr_total_count = len(train_ind)
        pos_tr = tr_total_count - tr_zero_count

        # gradient-boosted classifier with inner-CV hyperparameter tuning
        pipe = xgb.XGBClassifier(objective='binary:logistic',
                                 min_child_weight=1,
                                 **{'tree_method': 'exact'}, silent=True,
                                 n_jobs=4, random_state=3, seed=3,
                                 scale_pos_weight=1)
        gb_clf = GridSearchCV(pipe, tuned_parameters, cv=inner_cv,
                              scoring='roc_auc', verbose=0, n_jobs=4)
        gb_clf.fit(X_train, Y_train)
        result_list, next_ind, best_prob, fp_ts, fn_ts = test(
            gb_clf, X_test, Y_test)

        # calculate results
        type1_err = (fp_ts + tr_zero_count - init_cnot_count) / \
            (tot_cnot_count - init_cnot_count)
        type2_err = fn_ts / (tot_can_count - init_can_count)
        results_mat[j, :] = np.array([tr_total_count] + result_list +
                                     [best_prob, pos_tr, type1_err,
                                      type2_err])
        next_ind = test_ind[next_ind]
        if verbose:
            print(j, 'loop, next_ind=', next_ind,
                  ' #tr=', tr_total_count, ' pos_tr=', pos_tr,
                  ' best_prob=', "{0:.6f}".format(best_prob),
                  ' type1=', "{0:.6f}".format(type1_err),
                  ' type2=', "{0:.6f}".format(type2_err))

        # critical point: the model is no longer confident in any candidate
        if best_prob < 0.5 and Nc == 0:
            Nc = tr_total_count
            if to_break:
                break

        # stopping condition: all positive samples have been found
        if pos_tr == tot_can_count:
            break

        # update train/test sets
        train_ind = train_ind + [next_ind]
        test_ind.remove(next_ind)

    saved_title = '-'
    if save_csv:
        results_df = pd.DataFrame(data=results_mat[0:j + 1], columns=[
            'sample_size', 'acc_ts', 'tpr_ts', 'tnr_ts', 'best_prob',
            'pos_tr', 'type1_err', 'type2_err'
        ])
        saved_title = data_handler.save_csv(results_df, title=title)

    run_time = (time.time() - init_time) / 60
    return [saved_title, Nc] + results_mat[j].tolist() + [run_time]
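# Usage sketch (assumes the same module-level state as the regression PAM,
# plus generate_init_sets(), tot_cnot_count / tot_can_count, and the
# classification test() helper defined elsewhere in this script):
#
#   res = PAM_classfication(verbose=True, save_csv=True,
#                           title='mos2_PAM_demo_')
#   print('critical point Nc =', res[1])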