def run_classification_configuration(X_train_10_fold, X_test_10_fold,
                                     y_train_10_fold, y_test_10_fold,
                                     test_idx_10_fold, train_idx_10_fold,
                                     trees, max_depth, min_child_weight,
                                     layer, cw=0.001):
    folds_AUC_testing, folds_AUPR_testing = [], []
    folds_AUC_training, folds_AUPR_training = [], []
    test_true_predict_compare, train_true_predict_compare = [], []
    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(
            X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
            test_idx_10_fold, train_idx_10_fold):
        # X_train, X_test = X_train[:, 12:], X_test[:, 12:]
        # X_train, X_test = X_train[:, :12], X_test[:, :12]
        config = get_toy_config(trees, max_depth, min_child_weight, cw, layer)
        gc = GCForest(config)
        X_train_enc = gc.fit_transform(X_train, y_train, X_test, y_test)
        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)
        test_true_predict_compare.append([test_idx_fold, y_pred_test, y_test,
                                          y_predprob_test[:, 0], y_predprob_test[:, 1]])  # 10-cv
        train_true_predict_compare.append([train_idx_fold, y_pred_train, y_train,
                                           y_predprob_train[:, 0], y_predprob_train[:, 1]])  # 10-cv
        precision_training, recall_training, _ = precision_recall_curve(
            y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(
            y_test, y_predprob_test[:, 1], pos_label=1)
        AUPR_training = auc(recall_training, precision_training)
        AUPR_testing = auc(recall_testing, precision_testing)
        AUC_training = roc_auc_score(y_train, y_predprob_train[:, 1])
        AUC_testing = roc_auc_score(y_test, y_predprob_test[:, 1])
        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)
    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)
    return [Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,
            Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,
            folds_AUPR_training, folds_AUC_training], \
           [test_true_predict_compare, train_true_predict_compare]
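# The get_toy_config helper called above is not shown in this snippet. Below is
# a minimal sketch, assuming the cascade-config layout used by the official
# gcforest demos; the mapping of (trees, max_depth, min_child_weight, cw, layer)
# onto estimator settings is a hypothetical illustration, not the authors' code.
def get_toy_config(trees, max_depth, min_child_weight, cw, layer):
    ca_config = {
        "random_state": 0,
        "max_layers": layer,             # cap on cascade depth
        "early_stopping_rounds": 3,      # stop adding layers when accuracy stalls
        "n_classes": 2,
        "estimators": [
            {"n_folds": 5, "type": "XGBClassifier", "n_estimators": trees,
             "max_depth": max_depth, "min_child_weight": min_child_weight,
             "objective": "binary:logistic", "nthread": -1, "learning_rate": 0.1},
            {"n_folds": 5, "type": "RandomForestClassifier", "n_estimators": trees,
             "max_depth": max_depth, "n_jobs": -1,
             "class_weight": {0: cw, 1: 1.0}},  # cw guessed to be a class weight
        ],
    }
    return {"cascade": ca_config}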
def run_model(features, adhd_labels, rand_params, verbose=True, test_size=0.2):
    """
    Run the gcForest using parameters from the Optimizer. Use random portions of
    the original dataset for testing and training (default 20%-80%).

    :param features: (list) A matrix containing phenotypic and functional connectivity features
    :param adhd_labels: (list) The correct labels from the dataset
    :param rand_params: (dict) The generated random params from the Optimizer
    :param verbose: (bool) Whether to print the classification report
    :param test_size: (float) How much of the dataset to use for testing
    :return: (float) accuracy, (dict) positive metrics, (dict) negative metrics, (dict) confusion cells
    """
    classifier = GCForest(  # Instantiate gcForest with the generated random parameters
        config=generate_gcforest_config(rand_params['mlp_layers'],
                                        rand_params['mlp_solver'],
                                        rand_params['logistic_regressions'],
                                        rand_params['svc_kernel'],
                                        rand_params['xgb_estimators'],
                                        rand_params['rf_estimators'],
                                        rand_params['early_stopping_iterations'],
                                        rand_params['positions']),
    )
    # Split the data into random subsets (20% test, 80% train by default)
    X_train, X_test, y_train, y_test = train_test_split(features, adhd_labels, test_size=test_size)
    classifier.fit_transform(np.array(X_train), np.array(y_train))  # Train the gcForest model
    y_pred = classifier.predict(np.array(X_test))  # Predict on the test dataset
    y_test = np.array(y_test)
    if verbose:  # Print out some useful run information
        print("Classification Report\n", classification_report(y_test, y_pred))
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Confusion Matrix\n", confusion_matrix(y_test, y_pred))
    positive_metrics = {
        'f1': f1_score(y_test, y_pred),                # f1 for class "1"
        'precision': precision_score(y_test, y_pred),  # precision for class "1"
        'recall': recall_score(y_test, y_pred),        # recall for class "1"
    }
    negative_metrics = {
        'f1': f1_score(y_test, y_pred, pos_label=0),                # f1 for class "0"
        'precision': precision_score(y_test, y_pred, pos_label=0),  # precision for class "0"
        'recall': recall_score(y_test, y_pred, pos_label=0),        # recall for class "0"
    }
    matrix = confusion_matrix(y_test, y_pred)
    confusion = {  # The cells of the confusion matrix
        'true_negative': matrix[0][0],   # predicted negative, truly negative
        'false_positive': matrix[0][1],  # predicted positive, truly negative
        'false_negative': matrix[1][0],  # predicted negative, truly positive
        'true_positive': matrix[1][1],   # predicted positive, truly positive
    }
    # Return the accuracy, per-class metrics, and confusion-matrix cells
    return accuracy_score(y_test, y_pred), positive_metrics, negative_metrics, confusion
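# Hypothetical usage sketch for run_model: the rand_params keys mirror the ones
# the function reads, but every value below is an illustrative assumption, not
# output of the actual Optimizer.
rand_params = {
    'mlp_layers': (64, 32),
    'mlp_solver': 'adam',
    'logistic_regressions': 2,
    'svc_kernel': 'rbf',
    'xgb_estimators': 100,
    'rf_estimators': 200,
    'early_stopping_iterations': 3,
    'positions': None,
}
accuracy, pos_metrics, neg_metrics, confusion = run_model(features, adhd_labels, rand_params)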
def GCForest_prediction(feature_data, result_data):
    random_state = 2019
    n_splits = 5
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True,
                            random_state=random_state).split(feature_data, result_data)
    test_pred = np.zeros(feature_data.shape[0])
    test_proba = np.zeros(feature_data.shape[0])
    acc_scores = np.zeros(n_splits)
    recall_scores = np.zeros(n_splits)
    mcc_scores = np.zeros(n_splits)
    f1_scores = np.zeros(n_splits)
    for j, (train_idx, test_idx) in enumerate(folds):
        X_train, Y_train = feature_data[train_idx], result_data[train_idx]
        X_test, Y_test = feature_data[test_idx], result_data[test_idx]
        config = get_toy_config()
        gc = GCForest(config)  # config should be a dict
        X_train_enc = gc.fit_transform(X_train, Y_train)
        y_pred = gc.predict(X_test)
        X_test_enc = gc.transform(X_test)
        y_proba = gc.predict_proba(X_test)[:, 1]
        acc = accuracy_score(Y_test, y_pred)
        print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))
        confmat = confusion_matrix(Y_test, y_pred)
        sn = confmat[1, 1] / (confmat[1, 0] + confmat[1, 1])  # sensitivity: recall on class 1
        sp = confmat[0, 0] / (confmat[0, 0] + confmat[0, 1])  # specificity: recall on class 0
        print('1. The acc score of the model {}\n'.format(acc))
        print('2. The sp score of the model {}\n'.format(sp))
        print('3. The sn score of the model {}\n'.format(sn))
        print('4. The mcc score of the model {}\n'.format(matthews_corrcoef(Y_test, y_pred)))
        print('5. The auc score of the model {}\n'.format(
            roc_auc_score(Y_test, y_proba, average='macro')))
        print('6. The recall score of the model {}\n'.format(
            recall_score(Y_test, y_pred, average='macro')))
        print('7. The F-1 score of the model {}\n'.format(
            f1_score(Y_test, y_pred, average='macro')))
        print('8. Classification report \n {} \n'.format(classification_report(Y_test, y_pred)))
        print('9. Confusion matrix \n {} \n'.format(confmat))
        recall = recall_score(Y_test, y_pred, average='macro')
        f1 = f1_score(Y_test, y_pred, average='macro')
        mcc = matthews_corrcoef(Y_test, y_pred)
        recall_scores[j], f1_scores[j], acc_scores[j], mcc_scores[j] = recall, f1, acc, mcc
        test_pred[test_idx] = y_pred
        test_proba[test_idx] = y_proba
        print("CV-{} recall: {}, acc_score: {}, mcc_score: {}, f1_score: {}".format(
            j, recall, acc, mcc, f1))
    # Aggregate results over the full out-of-fold predictions
    confmat = confusion_matrix(result_data, test_pred)
    sn = confmat[1, 1] / (confmat[1, 0] + confmat[1, 1])
    sp = confmat[0, 0] / (confmat[0, 0] + confmat[0, 1])
    print("-------------------------------- Deep Forest --------------------------------")
    print('1. The acc score of the model {}\n'.format(accuracy_score(result_data, test_pred)))
    print('2. The sp score of the model {}\n'.format(sp))
    print('3. The sn score of the model {}\n'.format(sn))
    print('4. The mcc score of the model {}\n'.format(matthews_corrcoef(result_data, test_pred)))
    print('5. The auc score of the model {}\n'.format(
        roc_auc_score(result_data, test_proba, average='macro')))
    print('6. The recall score of the model {}\n'.format(
        recall_score(result_data, test_pred, average='macro')))
    print('7. The F-1 score of the model {}\n'.format(
        f1_score(result_data, test_pred, average='macro')))
    print('8. Classification report \n {} \n'.format(classification_report(result_data, test_pred)))
    print('9. Confusion matrix \n {} \n'.format(confusion_matrix(result_data, test_pred)))
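# An equivalent, slightly more robust way to get the sn/sp values computed
# above: unpack sklearn's confusion_matrix with an explicit label order. This
# is a sketch mirroring the function's variables, not part of the original.
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix(Y_test, y_pred, labels=[0, 1]).ravel()
sn = tp / (tp + fn)  # sensitivity: recall on the positive class
sp = tn / (tn + fp)  # specificity: recall on the negative class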
def GAGCForest_prediction(feature_data, result_data):
    random_state = 2019  # note: the original relied on a module-level random_state
    n_splits = 5
    acc_scores = np.zeros(n_splits)
    recall_scores = np.zeros(n_splits)
    mcc_scores = np.zeros(n_splits)
    f1_scores = np.zeros(n_splits)
    skfolds = StratifiedKFold(n_splits=n_splits, shuffle=True,
                              random_state=random_state).split(feature_data, result_data)
    new_test_pred = np.zeros(feature_data.shape[0])
    new_test_proba = np.zeros(feature_data.shape[0])
    for j, (train_idx, test_idx) in enumerate(skfolds):
        X_train, Y_train = feature_data[train_idx], result_data[train_idx]
        X_test, Y_test = feature_data[test_idx], result_data[test_idx]
        config = get_toy_config()
        gc = GCForest(config)  # config should be a dict
        X_train_enc = gc.fit_transform(X_train, Y_train)
        X_test_enc = gc.transform(X_test)
        # Fetch the module and function name of the GA objective
        AIM_M = __import__('aimfuc')
        AIM_F = 'gcforestCM'
        # ---------------------------- variable setup ----------------------------
        w1 = [0, 1]
        w2 = [0, 1]
        w3 = [0, 1]
        b1 = [1, 1]
        b2 = [1, 1]
        b3 = [1, 1]
        ranges = np.vstack([w1, w2, w3]).T   # range matrix of the decision variables
        borders = np.vstack([b1, b2, b3]).T  # boundary matrix of the decision variables
        precisions = [6] * 3                 # encoding precision of each variable
        scales = [0] * 3                     # arithmetic scale
        codes = [1] * 3                      # Gray encoding for every variable
        # ------------------------- GA parameter settings -------------------------
        # NIND = 50               # population size
        # MAXGEN = 100            # maximum number of generations
        # GGAP = 0.8              # generation gap: probability offspring differ from parents
        # selectStyle = 'sus'     # selection: stochastic universal sampling
        # recombinStyle = 'xovdp' # recombination: two-point crossover
        # recopt = 0.9            # crossover probability
        # pm = 0.1                # mutation probability
        # SUBPOP = 1              # number of subpopulations
        # maxormin = 1            # 1 minimizes the objective, -1 maximizes it
        FieldD = ga.crtfld(ranges, borders, precisions, codes, scales)
        # Invoke the programming template
        [weightarray, pop_trace, var_trace, times] = new_code_templet(
            AIM_M, AIM_F, None, None, FieldD, problem='R', maxormin=-1,
            MAXGEN=10, NIND=50, SUBPOP=1, GGAP=0.8, selectStyle='sus',
            recombinStyle='xovsp', recopt=0.9, pm=0.7, distribute=True,
            proba=X_train_enc, result=Y_train, drawing=0)
        print('Elapsed time:', times, 'seconds')
        for element in weightarray:
            print(element)
        test_probaF = X_test_enc[:, ::2].T   # per-estimator probabilities for class 0
        test_probaT = X_test_enc[:, 1::2].T  # per-estimator probabilities for class 1
        test_predT = np.dot(weightarray, test_probaT)
        test_predF = np.dot(weightarray, test_probaF)
        test_pred = np.zeros(len(test_predT))
        test_proba = np.zeros(len(test_predT))
        for i in range(len(test_predT)):
            temper = test_predT[i] + test_predF[i]
            # Normalize elementwise; the original overwrote the whole array here
            test_proba[i] = test_predT[i] / temper
            test_pred[i] = 1 if test_predT[i] > test_predF[i] else 0
        confmat = confusion_matrix(Y_test, test_pred)
        sn = confmat[1, 1] / (confmat[1, 0] + confmat[1, 1])
        sp = confmat[0, 0] / (confmat[0, 0] + confmat[0, 1])
        print('1. The acc score of the model {}\n'.format(accuracy_score(Y_test, test_pred)))
        print('2. The sp score of the model {}\n'.format(sp))
        print('3. The sn score of the model {}\n'.format(sn))
        print('4. The mcc score of the model {}\n'.format(matthews_corrcoef(Y_test, test_pred)))
        print('5. The auc score of the model {}\n'.format(
            roc_auc_score(Y_test, test_proba, average='macro')))
        print('6. The recall score of the model {}\n'.format(
            recall_score(Y_test, test_pred, average='macro')))
        print('7. The F-1 score of the model {}\n'.format(
            f1_score(Y_test, test_pred, average='macro')))
        print('8. Classification report \n {} \n'.format(classification_report(Y_test, test_pred)))
        print('9. Confusion matrix \n {} \n'.format(confusion_matrix(Y_test, test_pred)))
        recall = recall_score(Y_test, test_pred, average='macro')
        f1 = f1_score(Y_test, test_pred, average='macro')
        acc = accuracy_score(Y_test, test_pred)
        mcc = matthews_corrcoef(Y_test, test_pred)
        recall_scores[j], f1_scores[j], acc_scores[j], mcc_scores[j] = recall, f1, acc, mcc
        new_test_pred[test_idx] = test_pred
        new_test_proba[test_idx] = test_proba
        print("CV-{} recall: {}, acc_score: {}, mcc_score: {}, f1_score: {}".format(
            j, recall, acc, mcc, f1))
    new_confmat = confusion_matrix(result_data, new_test_pred)
    sn = new_confmat[1, 1] / (new_confmat[1, 0] + new_confmat[1, 1])
    sp = new_confmat[0, 0] / (new_confmat[0, 0] + new_confmat[0, 1])
    print("------------------------------ Genetic Algorithm ------------------------------")
    print('1. The acc score of the model {}\n'.format(accuracy_score(result_data, new_test_pred)))
    print('2. The sp score of the model {}\n'.format(sp))
    print('3. The sn score of the model {}\n'.format(sn))
    print('4. The mcc score of the model {}\n'.format(matthews_corrcoef(result_data, new_test_pred)))
    print('5. The auc score of the model {}\n'.format(
        roc_auc_score(result_data, new_test_proba, average='macro')))
    print('6. The recall score of the model {}\n'.format(
        recall_score(result_data, new_test_pred, average='macro')))
    print('7. The F-1 score of the model {}\n'.format(
        f1_score(result_data, new_test_pred, average='macro')))
    print('8. Classification report \n {} \n'.format(classification_report(result_data, new_test_pred)))
    print('9. Confusion matrix \n {} \n'.format(confusion_matrix(result_data, new_test_pred)))
def run_classification_configuration(
        X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
        test_idx_10_fold, train_idx_10_fold, rf_tree, rf_max_depth, rf_tree_2,
        rf_max_depth_2, xgb_tree, xgb_max_depth, min_child_weight, lr,
        xgb_tree_2, xgb_max_depth_2, min_child_weight_2, lr_2, layer):
    folds_AUC_testing, folds_AUPR_testing = [], []
    folds_AUC_training, folds_AUPR_training = [], []
    folds_metrics3_training, folds_metrics3_testing = [], []
    test_true_predict_compare, train_true_predict_compare = [], []
    folds_recall_50, folds_recall_100 = [], []
    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(
            X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
            test_idx_10_fold, train_idx_10_fold):
        config = get_toy_config(rf_tree, rf_max_depth, rf_tree_2, rf_max_depth_2,
                                xgb_tree, xgb_max_depth, min_child_weight, lr,
                                xgb_tree_2, xgb_max_depth_2, min_child_weight_2,
                                lr_2, layer)
        gc = GCForest(config)
        print(config)
        X_train_enc = gc.fit_transform(X_train, y_train)
        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)
        # Rank test samples by predicted positive probability, then measure
        # recall within the top-k predictions
        temp = pd.DataFrame([y_test, y_predprob_test[:, 1],
                             y_pred_test]).T.sort_values(by=1, ascending=False)
        recall_50 = precision_recall_fscore_support(
            temp.iloc[:50, :][0], temp.iloc[:50, :][2],
            pos_label=1, average='binary')[1]
        recall_100 = precision_recall_fscore_support(
            temp.iloc[:100, :][0], temp.iloc[:100, :][2],  # the original sliced :25 here, inconsistent with the name
            pos_label=1, average='binary')[1]
        test_true_predict_compare.append([test_idx_fold, y_pred_test, y_test,
                                          y_predprob_test[:, 0], y_predprob_test[:, 1]])  # 10-cv
        train_true_predict_compare.append([train_idx_fold, y_pred_train, y_train,
                                           y_predprob_train[:, 0], y_predprob_train[:, 1]])  # 10-cv
        precision_training, recall_training, _ = precision_recall_curve(
            y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(
            y_test, y_predprob_test[:, 1], pos_label=1)
        AUPR_training = auc(recall_training, precision_training)
        AUPR_testing = auc(recall_testing, precision_testing)
        AUC_training = roc_auc_score(y_train, y_predprob_train[:, 1])
        AUC_testing = roc_auc_score(y_test, y_predprob_test[:, 1])
        metrics3_testing = precision_recall_fscore_support(
            y_test, y_pred_test, pos_label=1, average='binary')[:3]
        metrics3_training = precision_recall_fscore_support(
            y_train, y_pred_train, pos_label=1, average='binary')[:3]
        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_metrics3_testing.append(metrics3_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)
        folds_metrics3_training.append(metrics3_training)
        folds_recall_50.append(recall_50)
        folds_recall_100.append(recall_100)
    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)
    Avg_metrics3_training = np.mean(folds_metrics3_training, axis=0)
    Avg_metrics3_testing = np.mean(folds_metrics3_testing, axis=0)
    return [Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,
            Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,
            folds_AUPR_training, folds_AUC_training,
            Avg_metrics3_testing, Avg_metrics3_training,
            folds_recall_50, folds_recall_100], \
           [test_true_predict_compare, train_true_predict_compare]
while t < top:
    index = topNIndex[t]
    x_n = x_u[index]
    X_n = np.vstack((X_n, x_n))
    t += 1
X_n = X_n[1:, :]              # drop the placeholder first row
X_n = np.unique(X_n, axis=0)
Y_n = np.zeros(X_n.shape[0])
X = np.concatenate((x_p, X_n), axis=0)
Y = np.concatenate((y_p, Y_n), axis=0)
# Retrain the classifier on the positive samples plus the reliable negatives
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
config = get_toy_config()
gc = GCForest(config)
gc.fit_transform(x_train, y_train)
y_pred = gc.predict(x_test)
# acc = accuracy_score(y_test, y_pred)
# print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))

nfolds = 5
eRecalls = np.zeros(nfolds)
ePrecisions = np.zeros(nfolds)
ePRAUCs = np.zeros(nfolds)
for i in range(nfolds):
    x_p_train, x_p_test, y_p_train, y_p_test = train_test_split(x_p, y_p, test_size=0.2)
    x_u_train, x_u_test, y_u_train, y_u_test = train_test_split(x_u, y_u, test_size=0.2)
    X_test = np.concatenate((x_p_test, x_u_test), axis=0)
    Y_test = np.concatenate((y_p_test, y_u_test), axis=0)
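# How topNIndex is built is not shown above. A plausible sketch, assuming the
# usual PU-learning recipe of treating the unlabeled samples a scorer ranks as
# least positive as reliable negatives (the helper name is an assumption):
import numpy as np

def reliable_negative_indices(clf, x_u, top):
    """Indices of the `top` unlabeled samples with the lowest
    positive-class probability under a fitted classifier."""
    proba_u = clf.predict_proba(x_u)[:, 1]
    return np.argsort(proba_u)[:top]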
rs = np.random.randint(0, 1000, 1)[0]
# The original used the pre-0.18 sklearn API (StratifiedKFold(labels, n_folds=...));
# the modern equivalent is:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=rs).split(feature, label[:, 0])
test_auc_fold = []
test_aupr_fold = []
for train_index, test_index in kf:
    Xtrain, Xtest = feature[train_index], feature[test_index]
    Ytrain, Ytest = label[train_index], label[test_index]
    config = get_toy_config()
    rf = GCForest(config)
    Ytrain = Ytrain.flatten()
    Ytest = Ytest.flatten()
    rf.fit_transform(Xtrain, Ytrain)  # deep forest
    predict_y = rf.predict(Xtest)
    acc = accuracy_score(Ytest, predict_y)
    print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))
    prob_predict_y = rf.predict_proba(Xtest)  # per-class probabilities; each row sums to 1
    predictions_validation = prob_predict_y[:, 1]
    fpr, tpr, _ = roc_curve(Ytest, predictions_validation)
    roc_auc = auc(fpr, tpr)
    aupr = average_precision_score(Ytest, predictions_validation)
    print(roc_auc)
    print(aupr)
    test_auc_fold.append(roc_auc)
    test_aupr_fold.append(aupr)
    plt.figure()
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
def run_classification_configuration(X_train_10_fold, X_test_10_fold,
                                     y_train_10_fold, y_test_10_fold,
                                     test_idx_10_fold, train_idx_10_fold,
                                     tree, max_depth, layer):
    folds_AUC_testing, folds_AUPR_testing = [], []
    folds_AUC_training, folds_AUPR_training = [], []
    folds_metrics3_training, folds_metrics3_testing = [], []
    test_true_predict_compare, train_true_predict_compare = [], []
    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(
            X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
            test_idx_10_fold, train_idx_10_fold):
        config = get_toy_config(tree, max_depth, layer)
        gc = GCForest(config)
        X_train_enc = gc.fit_transform(X_train, y_train, X_test, y_test)
        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)
        test_true_predict_compare.append([test_idx_fold, y_pred_test, y_test,
                                          y_predprob_test[:, 0], y_predprob_test[:, 1]])  # 10-cv
        train_true_predict_compare.append([train_idx_fold, y_pred_train, y_train,
                                           y_predprob_train[:, 0], y_predprob_train[:, 1]])  # 10-cv
        precision_training, recall_training, _ = precision_recall_curve(
            y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(
            y_test, y_predprob_test[:, 1], pos_label=1)
        AUPR_training = auc(recall_training, precision_training)
        AUPR_testing = auc(recall_testing, precision_testing)
        AUC_training = roc_auc_score(y_train, y_predprob_train[:, 1])
        AUC_testing = roc_auc_score(y_test, y_predprob_test[:, 1])
        # Note: y_true must come first; the original passed (y_pred, y_true),
        # which swaps precision and recall
        metrics3_testing = precision_recall_fscore_support(
            y_test, y_pred_test, pos_label=1, average='binary')[:3]
        metrics3_training = precision_recall_fscore_support(
            y_train, y_pred_train, pos_label=1, average='binary')[:3]
        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_metrics3_testing.append(metrics3_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)
        folds_metrics3_training.append(metrics3_training)
    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)
    Avg_metrics3_training = np.mean(folds_metrics3_training, axis=0)
    Avg_metrics3_testing = np.mean(folds_metrics3_testing, axis=0)
    return [Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,
            Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,
            folds_AUPR_training, folds_AUC_training,
            Avg_metrics3_testing, Avg_metrics3_training], \
           [test_true_predict_compare, train_true_predict_compare]
for i in range(len(val_labels)):
    if val_labels[i] == 0:
        val_fea_0.append(val_fea[i])
    else:
        val_fea_1.append(val_fea[i])

# First half of each class goes to the test split, the rest to training
test_fea = val_fea_1[:len(val_fea_1) // 2] + val_fea_0[:len(val_fea_0) // 2]
test_labels = [1] * (len(val_fea_1) // 2) + [0] * (len(val_fea_0) // 2)
train_fea = val_fea_1[len(val_fea_1) // 2:] + val_fea_0[len(val_fea_0) // 2:]
train_labels = ([1] * (len(val_fea_1) - len(val_fea_1) // 2)
                + [0] * (len(val_fea_0) - len(val_fea_0) // 2))

train_data = [[t, l] for t, l in zip(train_fea, train_labels)]
test_data = [[d, l] for d, l in zip(test_fea, test_labels)]
random.shuffle(train_data)
random.shuffle(test_data)
test_fea = [d[0] for d in test_data]
test_labels = [d[1] for d in test_data]
train_fea = [d[0] for d in train_data]
train_labels = [d[1] for d in train_data]

gc = GCForest(get_toy_config())  # the config should be a dict
X_train_enc = gc.fit_transform(np.array(train_fea), np.array(train_labels))
i = 0
while os.path.exists('./gcForest_model/' + str(i)):
    i += 1
os.makedirs('./gcForest_model/' + str(i))
# pickle.dump(gc, open('./gcForest_model/' + str(i) + '/model.pkl', 'wb+'), protocol=True)
y_pred = gc.predict(np.array(test_fea))
print(classification_report(test_labels, y_pred))
config["cascade"] = ca_config return config config = get_toy_config(all_estimators=all_estimators) gc = GCForest(config) # If the model you use cost too much memory for you. # You can use these methods to force gcforest not keeping model in memory # gc.set_keep_model_in_mem(False), default is TRUE. n_test = 500 # (X_train, y_train), (X_test, y_test) = mnist.load_data() X_train, y_train = train_dataset_x[:-n_test], train_dataset_y[:-n_test] X_test_cv, y_test_cv = train_dataset_x[-n_test:], train_dataset_y[-n_test:] X_train = X_train[:, np.newaxis, :, :] X_test_cv = X_test_cv[:, np.newaxis, :, :] X_train_enc = gc.fit_transform(X_train, y_train) y_pred_cv = gc.predict(X_test_cv) acc = accuracy_score(y_test_cv, y_pred_cv) print("Test Accuracy CV of GcForest = {:.2f} %".format(acc * 100)) y_pred = gc.predict(X_test_preprocessed) acc = accuracy_score(y_test, y_pred) print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100)) # save the model to disk with open(pickle_name, "wb") as f: pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)
x_train = X.iloc[train]
y_train = Y[train]
x_test = X.iloc[test]
y_test = Y[test]
x_train = x_train.values.reshape(-1, 1, len(x_train.columns))
x_test = x_test.values.reshape(-1, 1, len(x_test.columns))
X_train = x_train[:, np.newaxis, :, :]
X_test = x_test[:, np.newaxis, :, :]

X_train_enc = clf_gc.fit_transform(X_train, y_train)
y_pred = clf_gc.predict(X_test)
acc = accuracy_score(y_test, y_pred)
gc_pred_acc.append(acc)

# Stack an XGBoost classifier on the cascade's encoded features,
# concatenated with the original (flattened) features
X_test_enc = clf_gc.transform(X_test)
X_train_enc = X_train_enc.reshape((X_train_enc.shape[0], -1))
X_test_enc = X_test_enc.reshape((X_test_enc.shape[0], -1))
X_train_origin = X_train.reshape((X_train.shape[0], -1))
X_test_origin = X_test.reshape((X_test.shape[0], -1))
X_train_enc = np.hstack((X_train_enc, X_train_origin))
X_test_enc = np.hstack((X_test_enc, X_test_origin))
clf = XGBClassifier(n_estimators=100, n_jobs=-1)
clf.fit(X_train_enc, y_train)
y_pred = clf.predict(X_test_enc)
acc = accuracy_score(y_test, y_pred)
# The block below was commented out in the original with a triple-quoted
# string; it implemented a manual sliding-window CV loop:
'''
    y_test = Y_train[int(i):int(j)]
    train = np.append(X_train[0:int(i)], X_train[int(j):], axis=0)
    y_train = np.append(Y_train[0:int(i)], Y_train[int(j):], axis=0)
    y_test_all.extend(list(y_test))
    gc.fit_transform(train, y_train)
    y_predict = gc.predict(test)
    y_predict_all.extend(list(y_predict))
    y_predict_prob = gc.predict_proba(test)[:, 1]
    y_predict_prob_all.extend(list(y_predict_prob))
    i += length
    j += length
'''
gc.fit_transform(X_train, Y_train)
y_predict = gc.predict(X_test)
y_predict_prob = gc.predict_proba(X_test)[:, 1]
acc = accuracy_score(Y_test, y_predict)
print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))
ROC_AUC_area = metrics.roc_auc_score(Y_test, y_predict_prob)
print("ROC = " + str(ROC_AUC_area))
ACC = metrics.accuracy_score(Y_test, y_predict)
print("ACC: " + str(ACC))
precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(Y_test, y_predict)
F1_Score = metrics.f1_score(Y_test, y_predict)
F_measure = F1_Score
MCC = metrics.matthews_corrcoef(Y_test, y_predict)
pos = TP + FN
neg = FP + TN
savedata = [[['gcforest', ACC, precision, recall, SN, SP, GM, F_measure,
              F1_Score, MCC, ROC_AUC_area, TP, FN, FP, TN, pos, neg]]]
easy_excel.save(classifier + "_crossvalidation", [str(X_train.shape[1])], savedata,
                'cross_validation_' + classifier + "_" + outputname + '.xls')
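# The performance() helper called above is not shown. A plausible sketch,
# assuming it derives the listed statistics from the binary confusion matrix
# (the exact definitions in the original may differ):
import numpy as np
from sklearn.metrics import confusion_matrix

def performance(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0  # same as sensitivity (SN)
    sn = recall
    sp = tn / (tn + fp) if tn + fp else 0.0      # specificity
    gm = np.sqrt(sn * sp)                        # geometric mean of SN and SP
    return precision, recall, sn, sp, gm, tp, tn, fp, fn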
if __name__ == "__main__": name = 'vor3' pickle_in = open('../../datasets/train_set_'+name+'.p',"rb") train_set = pickle.load(pickle_in) pickle_in = open('../../datasets/train_label_'+name+'.p',"rb") train_label = pickle.load(pickle_in) pickle_in = open('../../datasets/test_set_'+name+'.p', "rb") test_set = pickle.load(pickle_in) pickle_in = open('../../datasets/test_label_'+name+'.p', "rb") test_label = pickle.load(pickle_in) train_set = train_set.to_numpy() train_label = train_label.to_numpy() test_set = test_set.to_numpy() test_label = test_label.to_numpy() print(len(np.unique(train_label)), len(np.unique(train_set))) print(train_label.shape, train_set.shape) config = get_config() gc = GCForest(config) print(config) X_train_enc = gc.fit_transform(train_set, train_label) title_model = '../results/trained_deep_forest_'+name+'.p' pickle.dump(gc, open(title_model, 'wb')) y_pred = gc.predict(test_set) acc = accuracy_score(test_label, y_pred) print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100)) print(acc) report = classification_report(test_label, y_pred) title = "report_deep_forest_"+name+".txt" write_report = open(title,"w") write_report.write(report)
config1 = load_json("/home/qiang/repo/python/experiment-gcForest/cascade_clf/examples/demo_ca.json")
# If the model uses too much memory, you can force gcforest not to keep the
# fitted models in memory: gc.set_keep_model_in_mem(False) (default True).
config2 = get_toy_config()
acc_st = []
acc_gc = []
acc_rf = []
for i in range(10):
    (X_train, y_train), (X_test, y_test) = uci_yeast.load_data()
    gc1 = GCForest(config1)
    gc1.fit_transform(X_train, y_train)
    y_pred = gc1.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    acc_st.append(acc)
    print("Test Accuracy of stacking GcForest = {:.2f} %".format(acc * 100))
    # X_train, y_train = X_train[:2000], y_train[:2000]
    # X_train = X_train[:, np.newaxis, :]
    # X_test = X_test[:, np.newaxis, :]
    gc2 = GCForest(config2)
    gc2.fit_transform(X_train, y_train)
    # X_enc is the concatenated predict_proba output of every estimator in the
    # last layer of the GCForest model:
    #   (n_datas, n_estimators * n_classes)              if a cascade is provided
    #   (n_datas, n_estimators * n_classes, dimX, dimY)  if only the fine-grained part is provided
    # You can also pass X_test, y_test to fit_transform; the accuracy on the
    # test data will then be logged during training.
gc = GCForest(config)
X_train_enc = gc.fit_transform(X_train_oversampled, y_train_oversampled)

# Dump the fitted model
with open("../pkl/2018_test.pkl", "wb") as f:
    pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)

# Load it back
with open("../pkl/2018_test.pkl", "rb") as f:
    gc = pickle.load(f)

# Test GcForest on the validation set
y_valid_pred = gc.predict(X_valid)
print("============= 2018 datasets' results on valid =============")
gc_f1, gc_accuracy, gc_precision, gc_recall = evaluate(y_valid, y_valid_pred)

# Load the 2018 test set
lines = open("../data/water/txt/2018waterDataTesting.txt").readlines()
num_lines = len(lines) - 1
X_test = np.ones((num_lines, 9))
y_test = np.ones((num_lines, 1))
flag = 0
lines = lines[1:]  # drop the header row (the original used np.delete on the list)
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# X_train, y_train = X_train[:2000], y_train[:2000]
X_train = X_train[:, np.newaxis, :, :]
X_test = X_test[:, np.newaxis, :, :]

X_train_enc = gc.fit_transform(X_train, y_train)
# X_enc is the concatenated predict_proba output of every estimator in the
# last layer of the GCForest model:
#   (n_datas, n_estimators * n_classes)              if a cascade is provided
#   (n_datas, n_estimators * n_classes, dimX, dimY)  if only the fine-grained part is provided
# You can also pass X_test, y_test to fit_transform; the accuracy on the test
# data will then be logged during training:
# X_train_enc, X_test_enc = gc.fit_transform(X_train, y_train, X_test=X_test, y_test=y_test)
# WARNING: if you set gc.set_keep_model_in_mem(True), you must use
# gc.fit_transform(X_train, y_train, X_test=X_test, y_test=y_test) to evaluate the model.

y_pred = gc.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))

# You can pass X_enc to another classifier on top of gcForest, e.g. XGBoost or RF.
X_test_enc = gc.transform(X_test)
X_train_enc = X_train_enc.reshape((X_train_enc.shape[0], -1))
X_test_enc = X_test_enc.reshape((X_test_enc.shape[0], -1))
X_train_origin = X_train.reshape((X_train.shape[0], -1))
X_test_origin = X_test.reshape((X_test.shape[0], -1))
X_train_enc = np.hstack((X_train_origin, X_train_enc))
X_test_enc = np.hstack((X_test_origin, X_test_enc))
print("X_train_enc.shape={}, X_test_enc.shape={}".format(X_train_enc.shape, X_test_enc.shape))
clf = RandomForestClassifier(n_estimators=1000, max_depth=None, n_jobs=-1)
clf.fit(X_train_enc, y_train)
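# A short continuation sketch (not in the original): score the stacked random
# forest on the concatenated original + encoded test features, mirroring the
# accuracy check done for gcForest above.
y_pred_stacked = clf.predict(X_test_enc)
acc = accuracy_score(y_test, y_pred_stacked)
print("Test Accuracy of RF stacked on gcForest = {:.2f} %".format(acc * 100))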
import json

def load_json(path):
    """Load a JSON config file, skipping //-style comment lines."""
    lines = []
    with open(path) as f:
        for row in f.readlines():
            if row.strip().startswith("//"):
                continue
            lines.append(row)
    return json.loads("\n".join(lines))


X, Y = load.obesity_data()
x_tr, x_te, y_tr, y_te = train_test_split(X, Y, random_state=42, stratify=Y)

clf_rf = RandomForestClassifier(n_estimators=200, random_state=0)
clf_rf.fit(x_tr, y_tr)
y_pred = clf_rf.predict(x_te)
print(accuracy_score(y_te, y_pred))

config = load_json("/home/qiang/repo/python/cascade_clf/examples/demo_ca.json")
clf_gc = GCForest(config)
clf_gc.fit_transform(x_tr.values, y_tr)
y_pred = clf_gc.predict(x_te.values)
print(accuracy_score(y_te, y_pred))
y_train2 = y_train2.values
y_train = y_train.values
y_valid = y_valid.values
y_test = y_test.values

config = get_toy_config()
model = GCForest(config)
model.fit_transform(X_train2, y_train2, X_test, y_test)
gc_valid_proba = model.predict_proba(X_valid)
gc_pred = model.predict(X_valid)

models = [
    LogisticRegression(),
    LinearDiscriminantAnalysis(),
    SVC(probability=True),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    GaussianNB(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=random_seed),
    ExtraTreesClassifier(random_state=random_seed),
def run_classification_configuration(
        X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
        test_idx_10_fold, train_idx_10_fold, rf_tree, rf_max_depth, rf_tree_2,
        rf_max_depth_2, xgb_tree, xgb_max_depth, min_child_weight, lr,
        xgb_tree_2, xgb_max_depth_2, min_child_weight_2, lr_2, layer,
        mode, seed):
    folds_AUC_testing, folds_AUPR_testing = [], []
    folds_AUC_training, folds_AUPR_training = [], []
    folds_metrics3_training, folds_metrics3_testing = [], []
    test_true_predict_compare, train_true_predict_compare = [], []
    # The original also declared folds_recall_100/200/400, but never filled them
    folds_recall_25, folds_recall_50 = [], []
    folds_G_mean = []
    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(
            X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
            test_idx_10_fold, train_idx_10_fold):
        config = get_toy_config(rf_tree, rf_max_depth, rf_tree_2, rf_max_depth_2,
                                xgb_tree, xgb_max_depth, min_child_weight, lr,
                                xgb_tree_2, xgb_max_depth_2, min_child_weight_2,
                                lr_2, layer)
        gc = GCForest(config)
        X_train_enc = gc.fit_transform(X_train, y_train)
        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)
        # Rank test samples by predicted positive probability, then measure
        # recall within the top-k predictions
        temp = pd.DataFrame([y_test, y_predprob_test[:, 1],
                             y_pred_test]).T.sort_values(by=1, ascending=False)
        recall_25 = precision_recall_fscore_support(
            temp.iloc[:25, :][0], temp.iloc[:25, :][2],
            pos_label=1, average='binary')[1]
        recall_50 = precision_recall_fscore_support(
            temp.iloc[:50, :][0], temp.iloc[:50, :][2],
            pos_label=1, average='binary')[1]
        test_true_predict_compare.append([test_idx_fold, y_pred_test, y_test,
                                          y_predprob_test[:, 0], y_predprob_test[:, 1]])  # 10-cv
        train_true_predict_compare.append([train_idx_fold, y_pred_train, y_train,
                                           y_predprob_train[:, 0], y_predprob_train[:, 1]])  # 10-cv
        precision_training, recall_training, _ = precision_recall_curve(
            y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(
            y_test, y_predprob_test[:, 1], pos_label=1)
        AUPR_training = auc(recall_training, precision_training)
        AUPR_testing = auc(recall_testing, precision_testing)
        AUC_training = roc_auc_score(y_train, y_predprob_train[:, 1])
        AUC_testing = roc_auc_score(y_test, y_predprob_test[:, 1])
        metrics3_testing = precision_recall_fscore_support(
            y_test, y_pred_test, pos_label=1, average='binary')[:3]
        metrics3_training = precision_recall_fscore_support(
            y_train, y_pred_train, pos_label=1, average='binary')[:3]
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred_test, labels=[0, 1]).ravel()
        specificity = float(tn) / float(tn + fp)
        recall = metrics3_testing[1]
        G_mean = np.sqrt(recall * specificity)
        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_metrics3_testing.append(metrics3_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)
        folds_metrics3_training.append(metrics3_training)
        folds_G_mean.append(G_mean)
        folds_recall_25.append(recall_25)
        folds_recall_50.append(recall_50)
    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)
    Avg_metrics3_training = np.mean(folds_metrics3_training, axis=0)
    Avg_metrics3_testing = np.mean(folds_metrics3_testing, axis=0)
    Avg_G_mean = np.mean(folds_G_mean)
    return [Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,  # 0-2
            Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,     # 3-5
            folds_AUPR_training, folds_AUC_training,                  # 6-7
            Avg_metrics3_testing, Avg_metrics3_training,              # 8-9
            folds_recall_25, folds_recall_50, folds_G_mean], \
           [test_true_predict_compare, train_true_predict_compare]
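# The fold lists consumed by run_classification_configuration are built
# elsewhere; below is a hedged sketch of one plausible construction (the
# helper name and default seed are assumptions):
import numpy as np
from sklearn.model_selection import StratifiedKFold

def make_folds(X, y, n_splits=10, seed=0):
    """Split X, y into stratified folds and return the six parallel lists
    expected by run_classification_configuration."""
    X_tr, X_te, y_tr, y_te, te_idx, tr_idx = [], [], [], [], [], []
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for train_idx, test_idx in skf.split(X, y):
        X_tr.append(X[train_idx]); X_te.append(X[test_idx])
        y_tr.append(y[train_idx]); y_te.append(y[test_idx])
        te_idx.append(test_idx); tr_idx.append(train_idx)
    return X_tr, X_te, y_tr, y_te, te_idx, tr_idx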