def df(x_train, y_train, n_features):
    # Rank features by the importance scores returned by this GCForest
    # variant's fit_transform and return the names of the top n_features.
    config = load_json("demo_ca.json")
    gc = GCForest(config)
    X_train = x_train.values.reshape(-1, 1, len(x_train.columns))
    _, _features = gc.fit_transform(X_train, y_train)
    _features = _features.sort_values(ascending=False)
    return _features.index.values.tolist()[:n_features]
def RUN_2(best_th):
    # Main routine: given the optimal classification threshold, evaluate the
    # model's predictions on the held-out test set.
    comm_s_TPR = []
    comm_s_TNR = []
    comm_s_BER = []
    comm_s_ACC = []
    comm_s_MCC = []
    comm_s_F1score = []
    comm_s_AUC = []
    comm_s_time = []
    # Split the raw data into a training set and a test set.
    # (train_test_split from sklearn.model_selection; the old
    # sklearn.cross_validation module has been removed.)
    tiaocan_train, ceshi_train, tiaocan_train_test, ceshi_true = train_test_split(
        comtest.iloc[0:len(comtest), 1:comtest.shape[1] - 1],
        comtest.iloc[0:len(comtest), -1],
        test_size=0.2, random_state=0)
    x_train = tiaocan_train
    y_train = tiaocan_train_test
    x_test = ceshi_train
    y_true = ceshi_true
    x_train = np.array(x_train, dtype=np.float16)
    y_train = np.array(y_train, dtype=np.float16)
    x_test = np.array(x_test, dtype=np.float16)
    y_true = np.array(y_true, dtype=np.float16)
    # x_train, y_train = RandomUnderSampler().fit_sample(x_train, y_train)  # balance the classes by undersampling the training set
    # Set up the machine-learning model.
    ##########################################################################
    ############################## --XGB-- ###################################
    time_start = time.time()
    comm = GCForest(config)
    comm.fit_transform(x_train, y_train)
    pro_comm_Pre = comm.predict_proba(x_test)
    comm_s_time.append(time.time() - time_start)  # record elapsed time so the "time" entry below is defined
    # Turn predicted probabilities into survival predictions using the optimal threshold.
    blo_comm_Pre = blo(pro_comm_Pre, best_th)
    eva_comm = evaluating_indicator(y_true=y_true, y_test=blo_comm_Pre, y_test_value=pro_comm_Pre)
    comm_s_TPR.append(eva_comm['TPR'])
    comm_s_TNR.append(eva_comm['TNR'])
    comm_s_BER.append(eva_comm['BER'])
    comm_s_ACC.append(eva_comm['ACC'])
    comm_s_MCC.append(eva_comm['MCC'])
    comm_s_F1score.append(eva_comm['F1_score'])
    comm_s_AUC.append(eva_comm['AUC'])
    eva_comm = {
        "TPR": np.mean(comm_s_TPR),
        "TNR": np.mean(comm_s_TNR),
        "BER": np.mean(comm_s_BER),
        "ACC": np.mean(comm_s_ACC),
        "MCC": np.mean(comm_s_MCC),
        "F1_score": np.mean(comm_s_F1score),
        "AUC": np.mean(comm_s_AUC),
        "time": np.mean(comm_s_time)
    }
    return eva_comm
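# The helper blo() used above is not defined in this file. A minimal sketch,
# assuming pro_comm_Pre is a predict_proba output of shape (n, 2) and the
# threshold is expressed as a percentage in [0, 100] (as the 0-99 sweep in
# RUN() below suggests):
import numpy as np

def blo(proba, threshold):
    # Label a sample positive when its positive-class probability
    # exceeds threshold / 100.
    return (np.asarray(proba)[:, 1] > threshold / 100.0).astype(int)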
def get_m_gcForest(mtype="ca"):
    """
    @param mtype: "ca" for a cascade-only config, "gc" for a fine-grained config
    """
    if mtype == "ca":
        config = get_ca_config()
        gc = GCForest(config)
        return gc
    if mtype == "gc":
        config = get_gc_config()
        gc = GCForest(config)
        return gc
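# get_ca_config() / get_gc_config() are defined elsewhere in this repo. For
# reference, a minimal cascade config in the dict format GCForest expects;
# the field values here are illustrative, not the project's actual settings:
def get_ca_config():
    ca_config = {
        "random_state": 0,
        "max_layers": 100,            # cap on cascade depth
        "early_stopping_rounds": 3,   # stop adding layers when accuracy stalls
        "n_classes": 2,
        "estimators": [
            {"n_folds": 5, "type": "RandomForestClassifier",
             "n_estimators": 10, "max_depth": None, "n_jobs": -1},
            {"n_folds": 5, "type": "LogisticRegression"},
        ],
    }
    return {"cascade": ca_config}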
def run_model(features, adhd_labels, rand_params, verbose=True, test_size=0.2):
    """
    Run the gcForest using parameters from the Optimizer. Use random portions of the
    original dataset for testing and training (default 20%-80%)

    :param features: (list) A matrix containing phenotypic and functional connectivity data
    :param adhd_labels: (list) The correct labels from the dataset
    :param rand_params: (dict) The generated random params from the Optimizer
    :param verbose: (bool) Whether to print classification report
    :param test_size: (float) How much of the dataset to use for testing
    :return: (float) accuracy, (dict) positive metrics, (dict) negative metrics, (dict) confusion cells
    """
    # Instantiate the gcForest algorithm using the random parameters we generated
    classifier = GCForest(
        config=generate_gcforest_config(rand_params['mlp_layers'], rand_params['mlp_solver'],
                                        rand_params['logistic_regressions'], rand_params['svc_kernel'],
                                        rand_params['xgb_estimators'], rand_params['rf_estimators'],
                                        rand_params['early_stopping_iterations'], rand_params['positions']),
    )

    # Split the data into random subsets (20% test, 80% train by default)
    X_train, X_test, y_train, y_test = train_test_split(features, adhd_labels, test_size=test_size)
    classifier.fit_transform(np.array(X_train), np.array(y_train))  # Train the gcForest model
    y_pred = classifier.predict(np.array(X_test))  # Predict off of the test dataset
    y_test = np.array(y_test)

    if verbose:  # Print out some useful run information
        print("Classification Report\n", classification_report(y_test, y_pred))
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Confusion Matrix\n", confusion_matrix(y_test, y_pred))

    positive_metrics = {
        'f1': f1_score(y_test, y_pred),                # f1 for class "1"
        'precision': precision_score(y_test, y_pred),  # precision for class "1"
        'recall': recall_score(y_test, y_pred),        # recall for class "1"
    }
    negative_metrics = {
        'f1': f1_score(y_test, y_pred, pos_label=0),                # f1 for class "0"
        'precision': precision_score(y_test, y_pred, pos_label=0),  # precision for class "0"
        'recall': recall_score(y_test, y_pred, pos_label=0),        # recall for class "0"
    }

    matrix = confusion_matrix(y_test, y_pred)
    confusion = {  # The cells of the confusion matrix
        'true_negative': matrix[0][0],   # predicted negative, actually negative
        'false_positive': matrix[0][1],  # predicted positive, actually negative
        'false_negative': matrix[1][0],  # predicted negative, actually positive
        'true_positive': matrix[1][1]    # predicted positive, actually positive
    }

    # Get the accuracy, per-class metrics and confusion-matrix cells of the model
    scores = accuracy_score(y_test, y_pred), positive_metrics, negative_metrics, confusion
    return scores
def RUN():
    # Find the optimal classification threshold from training/validation folds.
    tiaocan_train, ceshi_train, tiaocan_train_test, ceshi_true = train_test_split(
        comtest.iloc[0:len(comtest), 1:comtest.shape[1] - 1],
        comtest.iloc[0:len(comtest), -1],
        test_size=0.2, random_state=0)
    skf = StratifiedKFold(n_splits=10)  # 10-fold cross-validation
    tiaocan_train = np.array(tiaocan_train, dtype=np.float16)
    tiaocan_train_test = np.array(tiaocan_train_test, dtype=np.float16)
    times = 0
    positions = []  # one candidate threshold per fold
    for train, test in skf.split(tiaocan_train, tiaocan_train_test):
        alltime_start = time.time()
        times = times + 1
        x_train = tiaocan_train[train]
        y_train = tiaocan_train_test[train]
        x_test = tiaocan_train[test]
        y_true = tiaocan_train_test[test]
        # x_train, y_train = RandomUnderSampler().fit_sample(x_train, y_train)  # balance the classes by undersampling
        # Set up the machine-learning model.
        # ############################## --XGB-- #############################
        comm = GCForest(config)
        comm.fit_transform(x_train, y_train)  # train the model
        pro_comm_Pre = comm.predict_proba(x_test)
        ############### threshold where sensitivity and specificity are closest ###############
        RightIndex = []
        for jj in range(100):  # evaluate the model at each candidate threshold
            blo_comm_Pre = blo(pro_comm_Pre, jj)
            eva_comm = evaluating_indicator(y_true=y_true, y_test=blo_comm_Pre, y_test_value=pro_comm_Pre)
            RightIndex.append(abs(eva_comm['TPR'] - eva_comm['TNR']))
        RightIndex = np.array(RightIndex, dtype=np.float16)
        # Keep the threshold that minimizes the sensitivity/specificity gap.
        # (The original overwrote a single `position` each fold; collecting
        # per-fold values is needed for the averaging below.)
        positions.append(np.argmin(RightIndex))
        alltime_end = time.time()
        print('done_0, validation round %s, time: %s s' % (times, alltime_end - alltime_start))
    ######################################################################################
    # Average the per-fold thresholds to obtain the final classification threshold.
    return np.mean(positions)
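# evaluating_indicator() is used above but not shown. A sketch of a compatible
# implementation; the key names are taken from how the dict is consumed, and
# the exact metric definitions (notably BER and the AUC input) are assumptions:
import numpy as np
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                             matthews_corrcoef, roc_auc_score)

def evaluating_indicator(y_true, y_test, y_test_value):
    tn, fp, fn, tp = confusion_matrix(y_true, y_test).ravel()
    tpr = tp / (tp + fn)  # sensitivity
    tnr = tn / (tn + fp)  # specificity
    return {
        "TPR": tpr,
        "TNR": tnr,
        "BER": 1 - 0.5 * (tpr + tnr),  # balanced error rate
        "ACC": accuracy_score(y_true, y_test),
        "MCC": matthews_corrcoef(y_true, y_test),
        "F1_score": f1_score(y_true, y_test),
        "AUC": roc_auc_score(y_true, np.asarray(y_test_value)[:, 1]),
    }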
def cross_validation(X, y, k, cpu):
    config = get_toy_config(cpu=cpu)
    classifier = GCForest(config)
    cv = StratifiedKFold(n_splits=k)
    res = {}
    i = 1
    for train, test in cv.split(X, y):
        classifier.fit_transform(X[train], y[train])
        yscore = classifier.predict_proba(X[test])
        tmpID = "fold_" + str(i)
        curDic = {}
        curDic["yscore"] = yscore
        curDic["ytest"] = y[test]
        res[tmpID] = curDic
        i = i + 1
    return res
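# Example use of the per-fold results above (a sketch; assumes X and y are
# already loaded, labels are binary, and yscore column 1 is the
# positive-class probability):
import numpy as np
from sklearn.metrics import roc_auc_score

res = cross_validation(X, y, k=5, cpu=4)
fold_aucs = [roc_auc_score(fold["ytest"], fold["yscore"][:, 1]) for fold in res.values()]
print("mean AUC over folds:", np.mean(fold_aucs))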
def run_gcforest(train_X, test_X, train_y, test_y, rounds=3, layers=100, seed=0):
    config = get_toy_config(rounds, layers, seed)
    gc = GCForest(config)  # config should be a dict
    X_train_enc = gc.fit_transform(train_X, train_y)
    ypred = np.array([i[1] for i in gc.predict_proba(test_X)])  # positive-class probabilities
    metrics = gen_eval_metrics(test_y, ypred)
    accuracy = metrics[0]
    # cor = sum([int(ypred[i] + 0.5) == test_y[i] for i in range(len(ypred))])
    # accuracy = cor / len(test_y)
    print('Fold accuracy: ' + str(accuracy))
    return metrics
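# gen_eval_metrics() is not defined in this snippet. A minimal sketch that is
# consistent with ypred being positive-class probabilities and metrics[0]
# being accuracy; the second entry and the 0.5 threshold are assumptions:
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score

def gen_eval_metrics(y_true, y_score, threshold=0.5):
    y_hat = (np.asarray(y_score) >= threshold).astype(int)
    return [accuracy_score(y_true, y_hat), roc_auc_score(y_true, y_score)]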
def run_classification_configuration(X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
                                     test_idx_10_fold, train_idx_10_fold,
                                     trees, max_depth, min_child_weight, layer, cw=0.001):
    folds_AUC_testing = []
    folds_AUPR_testing = []
    folds_AUC_training = []
    folds_AUPR_training = []
    test_true_predict_compare = []
    train_true_predict_compare = []
    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(
            X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
            test_idx_10_fold, train_idx_10_fold):
        # X_train, X_test = X_train[:, 12:], X_test[:, 12:]
        # X_train, X_test = X_train[:, :12], X_test[:, :12]
        config = get_toy_config(trees, max_depth, min_child_weight, cw, layer)
        gc = GCForest(config)
        # print(config)
        X_train_enc = gc.fit_transform(X_train, y_train, X_test, y_test)
        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)
        y_predprob_test_df = pd.DataFrame(y_predprob_test)
        y_predprob_train_df = pd.DataFrame(y_predprob_train)
        test_true_predict_compare.append([test_idx_fold, y_pred_test, y_test,
                                          y_predprob_test[:, 0], y_predprob_test[:, 1]])  # 10-cv
        train_true_predict_compare.append([train_idx_fold, y_pred_train, y_train,
                                           y_predprob_train[:, 0], y_predprob_train[:, 1]])  # 10-cv
        precision_training, recall_training, _ = precision_recall_curve(y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(y_test, y_predprob_test[:, 1], pos_label=1)
        AUPR_training = auc(recall_training, precision_training)
        AUPR_testing = auc(recall_testing, precision_testing)
        AUC_training = roc_auc_score(y_train, y_predprob_train[:, 1])
        AUC_testing = roc_auc_score(y_test, y_predprob_test[:, 1])
        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)
    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)
    return [Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,
            Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,
            folds_AUPR_training, folds_AUC_training], \
           [test_true_predict_compare, train_true_predict_compare]
def fit(self, xtrain: pd.DataFrame, ytrain: pd.Series):
    """
    Fit model

    :param xtrain: training features
    :param ytrain: training labels
    """
    clf = GCForest(self.config)
    if self.scaler is None:
        clf.fit_transform(xtrain, ytrain)
    else:
        xtrain_norm = self.scaler.fit_transform(xtrain)
        clf.fit_transform(xtrain_norm, ytrain)
    return DeepRandomForestModel(clf, self.scaler)
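# DeepRandomForestModel is returned above but defined elsewhere in the repo.
# A minimal sketch consistent with how fit() constructs it; the method body
# here is an assumption, mirroring the scaler handling above:
class DeepRandomForestModel:
    def __init__(self, clf, scaler):
        self.clf = clf        # fitted GCForest
        self.scaler = scaler  # fitted scaler, or None

    def predict(self, xtest):
        if self.scaler is not None:
            xtest = self.scaler.transform(xtest)
        return self.clf.predict(xtest)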
# -*- coding:utf-8 -*-
import pandas as pd
from utils import avg_importance
from sklearn.model_selection import StratifiedKFold
import gcforest.data_load as load
from gcforest.gcforest import GCForest
import utils

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
config = utils.load_json("demo_ca.json")
gc = GCForest(config)

datasets = ['cirrhosis', 'obesity', 't2d']
for dataset_idx, name in enumerate(datasets):
    thre_features = {}
    X = None
    Y = None
    if name == 'cirrhosis':
        X, Y = load.cirrhosis_data()
    elif name == 't2d':
        X, Y = load.t2d_data()
    elif name == 'obesity':
        X, Y = load.obesity_data()
    else:
        raise Exception('the dataset is not defined!!!')
loocv = LeaveOneOut()
clf_rf = RandomForestClassifier(n_estimators=50, random_state=0)
clf_svm = SVC(kernel='linear', C=1, gamma=0.001, random_state=0, probability=True)
# xgb_crf = XGBClassifier(n_estimators=50)

# config = gcforest_config()
config = load_json("gc.json")
clf_gc = GCForest(config)
gc_pred_acc = []

# ==============================================
f, ax = plt.subplots(1, 1)
params = [(clf_rf, 'green', "Random Forest"),
          (clf_svm, 'black', "SVM"),
          (clf_gc, 'red', "Deep Forest")]
#          (xgb_crf, 'purple', "XGBoost")]
# params = [(clf_gc, 'red', "Deep Forest")]
for x in params:
    mean_fpr = np.linspace(0, 1, 100)
    tprs = []
    aucs = []
    i = 1
    for train, test in loocv.split(X, Y):
        probas_ = None
# In[12]:

config = get_toy_config()
models = [
    LogisticRegression(),
    LinearDiscriminantAnalysis(),
    SVC(probability=True),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    GaussianNB(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=random_seed),
    ExtraTreesClassifier(random_state=random_seed),
    GCForest(config)
]

# In[16]:

test_entries = []
train_entries = []
for model in models:
    model_name = model.__class__.__name__
    if model_name == 'GCForest':
        model.fit_transform(X_train, y_train, X_test, y_test)
    else:
        model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
X_train_oversampled_batch, y_train_oversampled_batch = Batch(
    X_train_oversampled, y_train_oversampled, batch_size)
X_train_batch, y_train_batch = Batch(X_train, y_train, batch_size)
X_valid_batch, y_valid_batch = Batch(X_valid, y_valid, batch_size)

# # GcForest
# ## train gc
# #### 1. train GcForest on oversampled datasets

# In[21]:

config = get_toy_config()
gc = GCForest(config)
X_train_enc = gc.fit_transform(X_train_oversampled, y_train_oversampled)

# dump
with open("../pkl/2018_test.pkl", "wb") as f:
    pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)
# load
with open("../pkl/2018_test.pkl", "rb") as f:
    gc = pickle.load(f)

# #### test GcForest on valid datasets

# In[22]:

y_valid_pred = gc.predict(X_valid)
"type": "LogisticRegression" }) else: ca_config["estimators"].append({ "n_folds": 2, "type": "ExtraTreesClassifier", "n_estimators": 10, "max_depth": None, "n_jobs": -1 }) config["cascade"] = ca_config return config config = get_toy_config(all_estimators=all_estimators) gc = GCForest(config) # If the model you use cost too much memory for you. # You can use these methods to force gcforest not keeping model in memory # gc.set_keep_model_in_mem(False), default is TRUE. n_test = 500 # (X_train, y_train), (X_test, y_test) = mnist.load_data() X_train, y_train = train_dataset_x[:-n_test], train_dataset_y[:-n_test] X_test_cv, y_test_cv = train_dataset_x[-n_test:], train_dataset_y[-n_test:] X_train = X_train[:, np.newaxis, :, :] X_test_cv = X_test_cv[:, np.newaxis, :, :] X_train_enc = gc.fit_transform(X_train, y_train) y_pred_cv = gc.predict(X_test_cv) acc = accuracy_score(y_test_cv, y_pred_cv)
feature = np.vstack((positive_feature, negative_sample_feature))
label1 = np.ones((len(positive_feature), 1))
label0 = np.zeros((len(negative_sample_feature), 1))
label = np.vstack((label1, label0))

rs = np.random.randint(0, 1000, 1)[0]
# StratifiedKFold now lives in sklearn.model_selection; the old
# StratifiedKFold(labels, n_folds=...) call style no longer works.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=rs)
test_auc_fold = []
test_aupr_fold = []
for train_index, test_index in kf.split(feature, label[:, 0]):
    Xtrain, Xtest = feature[train_index], feature[test_index]
    Ytrain, Ytest = label[train_index], label[test_index]
    config = get_toy_config()
    rf = GCForest(config)
    Ytrain = Ytrain.flatten()
    rf.fit_transform(Xtrain, Ytrain)  # deep forest
    predict_y = rf.predict(Xtest)
    acc = accuracy_score(Ytest, predict_y)
    print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))
    # predict_proba gives one probability per class; each row sums to 1
    prob_predict_y = rf.predict_proba(Xtest)
    predictions_validation = prob_predict_y[:, 1]
    fpr, tpr, _ = roc_curve(Ytest, predictions_validation)
    roc_auc = auc(fpr, tpr)
    aupr = average_precision_score(Ytest, predictions_validation)
    print(roc_auc)
    # collect the per-fold scores the two lists above were created for
    test_auc_fold.append(roc_auc)
    test_aupr_fold.append(aupr)
ca_config["estimators"].append({ "n_folds": 5, "type": "LogisticRegression" }) config["cascade"] = ca_config return config if __name__ == "__main__": args = parse_args() if args.model is None: config = get_toy_config() else: config = load_json(args.model) gc = GCForest(config) # If the model you use cost too much memory for you. # You can use these methods to force gcforest not keeping model in memory gc.set_keep_model_in_mem(False) # default is TRUE. (X_train, y_train), (X_test, y_test) = mnist.load_data() # X_train, y_train = X_train[:2000], y_train[:2000] X_train = X_train[:, np.newaxis, :, :] X_test = X_test[:, np.newaxis, :, :] X_train_enc = gc.fit_transform(X_train, y_train) # X_enc is the concatenated predict_proba result of each estimators of the last layer of the GCForest model # X_enc.shape = # (n_datas, n_estimators * n_classes): If cascade is provided # (n_datas, n_estimators * n_classes, dimX, dimY): If only finegrained part is provided # You can also pass X_test, y_test to fit_transform method, then the accracy on test data will be logged when training.
def GAGCForest_prediction(feature_data, result_data):
    n_splits = 5
    acc_scores = np.zeros(n_splits)
    recall_scores = np.zeros(n_splits)
    mcc_scores = np.zeros(n_splits)
    f1_scores = np.zeros(n_splits)
    skfolds = StratifiedKFold(n_splits=n_splits, shuffle=True,
                              random_state=random_state).split(feature_data, result_data)
    new_test_pred = np.zeros(feature_data.shape[0])
    new_test_proba = np.zeros(feature_data.shape[0])
    for j, (train_idx, test_idx) in enumerate(skfolds):
        X_train = feature_data[train_idx]
        Y_train = result_data[train_idx]
        X_test = feature_data[test_idx]
        Y_test = result_data[test_idx]
        config = get_toy_config()
        gc = GCForest(config)  # config should be a dict
        X_train_enc = gc.fit_transform(X_train, Y_train)
        y_pred = gc.predict(X_test)
        X_test_enc = gc.transform(X_test)

        # Get a handle to the objective function
        AIM_M = __import__('aimfuc')
        AIM_F = 'gcforestCM'

        """============================ variable setup ============================"""
        w1 = [0, 1]
        w2 = [0, 1]
        w3 = [0, 1]
        b1 = [1, 1]
        b2 = [1, 1]
        b3 = [1, 1]
        ranges = np.vstack([w1, w2, w3]).T   # range matrix of the decision variables
        borders = np.vstack([b1, b2, b3]).T  # boundary matrix of the decision variables
        # ranges = np.vstack([np.zeros((1, 3)), np.ones((1, 3))])  # range matrix of the decision variables
        # print(shape(ranges))
        # borders = np.vstack([np.ones((1, 3)), np.ones((1, 3))])  # boundary matrix of the decision variables
        precisions = [6] * 3  # encoding precision of each variable
        scales = [0] * 3      # arithmetic scale
        codes = [1] * 3       # encoding scheme: Gray code
        # print(np.ones((1, 300)))
        # scales = list(np.zeros((1, 300)))  # use an arithmetic scale
        # codes = np.vstack([np.ones((1, 300)), np.ones((1, 300))])  # Gray-encode both variables
        # print(shape(codes))
        """======================== GA parameter settings ========================="""
        # NIND = 50                # population size
        # MAXGEN = 100             # maximum number of generations
        # GGAP = 0.8               # generation gap: probability 0.8 that offspring differ from parents
        # selectStyle = 'sus'      # selection: stochastic universal sampling
        # recombinStyle = 'xovdp'  # recombination: two-point crossover
        # recopt = 0.9             # crossover probability
        # pm = 0.1                 # mutation probability
        # SUBPOP = 1               # number of subpopulations
        # maxormin = 1             # 1 minimizes the objective, -1 maximizes it
        FieldD = ga.crtfld(ranges, borders, precisions, codes, scales)
        # Call the GA template
        [weightarray, pop_trace, var_trace, times] = new_code_templet(
            AIM_M, AIM_F, None, None, FieldD, problem='R', maxormin=-1,
            MAXGEN=10, NIND=50, SUBPOP=1, GGAP=0.8, selectStyle='sus',
            recombinStyle='xovsp', recopt=0.9, pm=0.7, distribute=True,
            proba=X_train_enc, result=Y_train, drawing=0)
        print('elapsed:', times, 'seconds')
        # w3 = 1 - weight[0] - weight[1]
        # print(weight)
        # weightarray = np.concatenate((weight, [w3]), axis=0)
        for element in weightarray:
            print(element)

        test_probaF = X_test_enc[:, ::2].T   # negative-class probabilities, one row per estimator
        test_probaT = X_test_enc[:, 1::2].T  # positive-class probabilities, one row per estimator
        test_predT = np.dot(weightarray, test_probaT)
        test_predF = np.dot(weightarray, test_probaF)
        test_pred = np.zeros(len(test_predT))
        test_proba = np.zeros(len(test_predT))
        for i in range(len(test_predT)):
            temper = test_predT[i] + test_predF[i]
            # normalize per sample (the original assigned the whole array here)
            test_proba[i] = test_predT[i] / temper
            if test_predT[i] > test_predF[i]:
                test_pred[i] = 1
            else:
                test_pred[i] = 0

        confmat = confusion_matrix(Y_test, test_pred)
        sn = confmat[1, 1] / (confmat[1, 0] + confmat[1, 1])
        sp = confmat[0, 0] / (confmat[0, 0] + confmat[0, 1])
        print('1. The acc score of the model {}\n'.format(accuracy_score(Y_test, test_pred)))
        print('2. The sp score of the model {}\n'.format(sp))
        print('3. The sn score of the model {}\n'.format(sn))
        print('4. The mcc score of the model {}\n'.format(matthews_corrcoef(Y_test, test_pred)))
        print('9. The auc score of the model {}\n'.format(roc_auc_score(Y_test, test_proba, average='macro')))
        print('6. The recall score of the model {}\n'.format(recall_score(Y_test, test_pred, average='macro')))
        print('5. The F-1 score of the model {}\n'.format(f1_score(Y_test, test_pred, average='macro')))
        print('7. Classification report \n {} \n'.format(classification_report(Y_test, test_pred)))
        print('8. Confusion matrix \n {} \n'.format(confusion_matrix(Y_test, test_pred)))

        recall = recall_score(Y_test, test_pred, average='macro')
        f1 = f1_score(Y_test, test_pred, average='macro')
        acc = accuracy_score(Y_test, test_pred)
        mcc = matthews_corrcoef(Y_test, test_pred)
        recall_scores[j] = recall
        f1_scores[j] = f1
        acc_scores[j] = acc
        mcc_scores[j] = mcc
        new_test_pred[test_idx] = test_pred
        new_test_proba[test_idx] = test_proba
        print("CV- {} recall: {}, acc_score: {} , mcc_score: {}, f1_score: {}".format(j, recall, acc, mcc, f1))

    new_confmat = confusion_matrix(result_data, new_test_pred)
    sn = new_confmat[1, 1] / (new_confmat[1, 0] + new_confmat[1, 1])
    sp = new_confmat[0, 0] / (new_confmat[0, 0] + new_confmat[0, 1])
    print("--------------------------------- genetic algorithm -----------------------------------------")
    print('1. The acc score of the model {}\n'.format(accuracy_score(result_data, new_test_pred)))
    print('2. The sp score of the model {}\n'.format(sp))
    print('3. The sn score of the model {}\n'.format(sn))
    print('4. The mcc score of the model {}\n'.format(matthews_corrcoef(result_data, new_test_pred)))
    print('9. The auc score of the model {}\n'.format(roc_auc_score(result_data, new_test_proba, average='macro')))
    print('6. The recall score of the model {}\n'.format(recall_score(result_data, new_test_pred, average='macro')))
    print('5. The F-1 score of the model {}\n'.format(f1_score(result_data, new_test_pred, average='macro')))
    print('7. Classification report \n {} \n'.format(classification_report(result_data, new_test_pred)))
    print('8. Confusion matrix \n {} \n'.format(confusion_matrix(result_data, new_test_pred)))
def load_json(path):
    """Load a JSON config; lines starting with // are treated as comments."""
    import json
    lines = []
    with open(path) as f:
        for row in f.readlines():
            if row.strip().startswith("//"):
                continue
            lines.append(row)
    return json.loads("\n".join(lines))


X, Y = load.obesity_data()
x_tr, x_te, y_tr, y_te = train_test_split(X, Y, random_state=42, stratify=Y)

clf_rf = RandomForestClassifier(n_estimators=200, random_state=0)
clf_rf.fit(x_tr, y_tr)
y_pred = clf_rf.predict(x_te)
print(accuracy_score(y_te, y_pred))

config = load_json("/home/qiang/repo/python/cascade_clf/examples/demo_ca.json")
clf_gc = GCForest(config)
clf_gc.fit_transform(x_tr.values, y_tr)
y_pred = clf_gc.predict(x_te.values)
print(accuracy_score(y_te, y_pred))
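# Quick illustration of the comment-stripping loader above. The file content
# here is illustrative (demo_ca.json-style cascade fields), not the project's
# actual config:
demo_text = """
// cascade-only configuration
{
    "cascade": {
        "random_state": 0,
        "max_layers": 100,
        "early_stopping_rounds": 3,
        "n_classes": 2,
        "estimators": [{"n_folds": 5, "type": "LogisticRegression"}]
    }
}
"""
with open("demo_commented.json", "w") as f:
    f.write(demo_text)
print(load_json("demo_commented.json"))  # the // line is skipped before json.loads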
# load gc config

# In[12]:

y_train2 = y_train2.values
y_train = y_train.values
y_valid = y_valid.values
y_test = y_test.values

# In[90]:

config = get_toy_config()
model = GCForest(config)
model.fit_transform(X_train2, y_train2, X_test, y_test)
gc_valid_proba = model.predict_proba(X_valid)
gc_pred = model.predict(X_valid)

# In[14]:

models = [
    LogisticRegression(),
    LinearDiscriminantAnalysis(),
    SVC(probability=True),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import scale, StandardScaler
from sklearn.metrics import roc_curve, auc
from dimension_reduction import KPCA, LLE, pca
import utils.tools as utils
from gcforest.gcforest import GCForest
from gcforest.utils.config_utils import load_json

start = time.time()
path1 = 'gcforest4.json'
config = load_json(path1)
gc = GCForest(config)

extraction = sio.loadmat('yeast_feature_end.mat')
proteinA = extraction.get('feature_A')
protein_A = np.array(proteinA)
proteinB = extraction.get('feature_B')
protein_B = np.array(proteinB)
X_ = np.concatenate((protein_A, protein_B), axis=1)
X_ = np.array(X_)
[row, column] = np.shape(X_)
label_P = np.ones(int(row / 2))
label_N = np.zeros(int(row / 2))
label_ = np.hstack((label_P, label_N))
y_raw = np.mat(label_)
y_raw = np.transpose(y_raw)
y_ = np.array(y_raw)
# ca_config["estimators"].append( # {"n_folds": 3, "type": "RandomForestClassifier", "n_estimators": 10, "n_jobs": -1,"random_state":0}) # ca_config["estimators"].append( # {"n_folds": 3, "type": "XGBClassifier", "n_estimators": 10, # "silent": True, "nthread": -1, "learning_rate": 0.1} ) # ca_config["estimators"].append({"n_folds": 3, "type": "ExtraTreesClassifier","max_depth": None, "n_jobs": -1}) # ca_config["estimators"].append({"n_folds": 3, "type": "LogisticRegression"}) config["cascade"] = ca_config return config if __name__ == "__main__": X, Y = load2.cirrhosis_data() config = gcforest_config() gc = GCForest(config) AUCs = [] for i in range(10): cv = StratifiedKFold(n_splits=10, shuffle=True) # # ============================================== mean_fpr = np.linspace(0, 1, 100) tprs = [] aucs = [] for train, test in cv.split(X, Y): x_train = X.iloc[train] y_train = Y[train] x_test = X.iloc[test] y_test = Y[test]
# test = pd.read_csv("../data/water/csv/test2018.csv")
# X_test = test.values[:, 0:-1]
# y_test = test.values[:, -1]
# X_test = clean_pipeline.fit_transform(X_test)

# In[13]:

# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size, stratify=y, random_state=random_seed)
# X_train_oversampled, y_train_oversampled = Smoter(X_train, y_train, is_random=True)

config = get_toy_config()
gc = GCForest(config)
gc.fit_transform(X_train_oversampled, y_train_oversampled, X_valid, y_valid)
# y_valid_pred = gc.predict(X_valid)

# In[13]:

# dump
with open("../pkl/2019_gc.pkl", "wb") as f:
    pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)

# # load
# with open("../pkl/2018_gc.pkl", "rb") as f:
#     gc = pickle.load(f)
x_u_s = np.concatenate((x_p_test, x_u), axis=0)
y_u_s = np.concatenate((y_p_test, y_u), axis=0)
y_u_s = np.zeros(y_u_s.shape[0])
x = np.concatenate((x_p_s, x_u_s), axis=0)
y = np.concatenate((y_p_s, y_u_s), axis=0)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
# scaler = StandardScaler().fit(X_train)
# X_train_transformed = scaler.transform(X_train)
# X_test_transformed = scaler.transform(X_test)
config = get_toy_config()
gc = GCForest(config)
gc.fit_transform(x_train, y_train)
scores = gc.predict_proba(x_u_test)[:, 0]  # probability of belonging to class 0
orderScores = np.argsort(-scores)          # indices sorted by descending score
orderList = [str(item) for item in orderScores]
orderStr = ','.join(orderList)
top = int(y_u_test.shape[0] * 0.25)        # keep the top 25% as reliable negatives
topNIndex = orderScores[:top]
t = 0
while t < top:
    index = topNIndex[t]
    x_n = x_u[index]
    # X_n is assumed to be initialized earlier with a placeholder first row,
    # which the slice after the loop removes.
    X_n = np.vstack((X_n, x_n))
    t += 1
X_n = X_n[1:, :]  # drop the placeholder row
ca_config["estimators"].append({ "n_folds": 5, "type": "LogisticRegression" }) config["cascade"] = ca_config return config if __name__ == "__main__": args = parse_args() if args.model is None: config = get_toy_config() else: config = load_json(args.model) gc = GCForest(config) # If the model you use cost too much memory for you. # You can use these methods to force gcforest not keeping model in memory # gc.set_keep_model_in_mem(False), default is TRUE. (X_train, y_train), (X_test, y_test) = mnist.load_data() # X_train, y_train = X_train[:2000], y_train[:2000] X_train = X_train[:, np.newaxis, :, :] X_test = X_test[:, np.newaxis, :, :] X_train_enc = gc.fit_transform(X_train, y_train) # X_enc is the concatenated predict_proba result of each estimators of the last layer of the GCForest model # X_enc.shape = # (n_datas, n_estimators * n_classes): If cascade is provided # (n_datas, n_estimators * n_classes, dimX, dimY): If only finegrained part is provided # You can also pass X_test, y_test to fit_transform method, then the accracy on test data will be logged when training.
def GCForest_prediction(feature_data, result_data):
    random_state = 2019
    n_splits = 5
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True,
                            random_state=random_state).split(feature_data, result_data)
    test_pred = np.zeros(feature_data.shape[0])
    test_proba = np.zeros(feature_data.shape[0])
    acc_scores = np.zeros(n_splits)
    recall_scores = np.zeros(n_splits)
    mcc_scores = np.zeros(n_splits)
    f1_scores = np.zeros(n_splits)
    for j, (train_idx, test_idx) in enumerate(folds):
        X_train = feature_data[train_idx]
        Y_train = result_data[train_idx]
        X_test = feature_data[test_idx]
        Y_test = result_data[test_idx]
        config = get_toy_config()
        gc = GCForest(config)  # config should be a dict
        X_train_enc = gc.fit_transform(X_train, Y_train)
        part_X_train_enc = X_train_enc[:, ::2]
        y_pred = gc.predict(X_test)
        X_test_enc = gc.transform(X_test)
        part_X_test_enc = X_test_enc[:, ::2]
        y_proba = gc.predict_proba(X_test)[:, 1]
        acc = accuracy_score(Y_test, y_pred)
        print("Test Accuracy of GcForest (save and load) = {:.2f} %".format(acc * 100))
        confmat = confusion_matrix(Y_test, y_pred)
        sn = confmat[1, 1] / (confmat[1, 0] + confmat[1, 1])
        sp = confmat[0, 0] / (confmat[0, 0] + confmat[0, 1])
        print('1. The acc score of the model {}\n'.format(accuracy_score(Y_test, y_pred)))
        print('2. The sp score of the model {}\n'.format(sp))
        print('3. The sn score of the model {}\n'.format(sn))
        print('4. The mcc score of the model {}\n'.format(matthews_corrcoef(Y_test, y_pred)))
        print('9. The auc score of the model {}\n'.format(roc_auc_score(Y_test, y_proba, average='macro')))
        print('6. The recall score of the model {}\n'.format(recall_score(Y_test, y_pred, average='macro')))
        print('5. The F-1 score of the model {}\n'.format(f1_score(Y_test, y_pred, average='macro')))
        print('7. Classification report \n {} \n'.format(classification_report(Y_test, y_pred)))
        print('8. Confusion matrix \n {} \n'.format(confusion_matrix(Y_test, y_pred)))
        recall = recall_score(Y_test, y_pred, average='macro')
        f1 = f1_score(Y_test, y_pred, average='macro')
        acc = accuracy_score(Y_test, y_pred)
        mcc = matthews_corrcoef(Y_test, y_pred)
        recall_scores[j] = recall
        f1_scores[j] = f1
        acc_scores[j] = acc
        mcc_scores[j] = mcc
        test_pred[test_idx] = y_pred
        test_proba[test_idx] = y_proba
        print("CV- {} recall: {}, acc_score: {} , mcc_score: {}, f1_score: {}".format(j, recall, acc, mcc, f1))

    confmat = confusion_matrix(result_data, test_pred)
    sn = confmat[1, 1] / (confmat[1, 0] + confmat[1, 1])
    sp = confmat[0, 0] / (confmat[0, 0] + confmat[0, 1])
    print("-------------------------------------- deep forest ------------------------------------")
    print('1. The acc score of the model {}\n'.format(accuracy_score(result_data, test_pred)))
    print('2. The sp score of the model {}\n'.format(sp))
    print('3. The sn score of the model {}\n'.format(sn))
    print('4. The mcc score of the model {}\n'.format(matthews_corrcoef(result_data, test_pred)))
    print('9. The auc score of the model {}\n'.format(roc_auc_score(result_data, test_proba, average='macro')))
    print('6. The recall score of the model {}\n'.format(recall_score(result_data, test_pred, average='macro')))
    print('5. The F-1 score of the model {}\n'.format(f1_score(result_data, test_pred, average='macro')))
    print('7. Classification report \n {} \n'.format(classification_report(result_data, test_pred)))
    print('8. Confusion matrix \n {} \n'.format(confusion_matrix(result_data, test_pred)))
for i in range(0, len(val_labels)):
    if val_labels[i] == 0:
        val_fea_0.append(val_fea[i])
    else:
        val_fea_1.append(val_fea[i])

# Use half of each class for testing and the other half for training.
test_fea = val_fea_1[:int(len(val_fea_1) / 2)] + val_fea_0[:int(len(val_fea_0) / 2)]
test_labels = [1] * int(len(val_fea_1) / 2) + [0] * int(len(val_fea_0) / 2)
train_fea = val_fea_1[int(len(val_fea_1) / 2):] * 1 + val_fea_0[int(len(val_fea_0) / 2):] * 1
train_labels = [1] * (len(val_fea_1) - int(len(val_fea_1) / 2)) * 1 + \
               [0] * (len(val_fea_0) - int(len(val_fea_0) / 2)) * 1

train_data = [[t, l] for t, l in zip(train_fea, train_labels)]
test_data = [[d, l] for d, l in zip(test_fea, test_labels)]
random.shuffle(train_data)
random.shuffle(test_data)
test_fea = [d[0] for d in test_data]
test_labels = [d[1] for d in test_data]
train_fea = [d[0] for d in train_data]
train_labels = [d[1] for d in train_data]

gc = GCForest(get_toy_config())  # the argument should be a dict
X_train_enc = gc.fit_transform(np.array(train_fea), np.array(train_labels))

# Create a fresh numbered directory for this model run.
i = 0
while os.path.exists('./gcForest_model/' + str(i)):
    i += 1
os.makedirs('./gcForest_model/' + str(i))
# pickle.dump(gc, open('./gcForest_model/' + str(i) + '/model.pkl', 'wb+'), protocol=True)

y_pred = gc.predict(np.array(test_fea))
print(classification_report(test_labels, y_pred))
def run_classification_configuration(
        X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
        test_idx_10_fold, train_idx_10_fold,
        rf_tree, rf_max_depth, rf_tree_2, rf_max_depth_2,
        xgb_tree, xgb_max_depth, min_child_weight, lr,
        xgb_tree_2, xgb_max_depth_2, min_child_weight_2, lr_2, layer):
    folds_AUC_testing, folds_AUPR_testing = [], []
    folds_AUC_training, folds_AUPR_training = [], []
    folds_metrics3_training, folds_metrics3_testing = [], []
    test_true_predict_compare, train_true_predict_compare = [], []
    folds_recall_50, folds_recall_100 = [], []
    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(
            X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
            test_idx_10_fold, train_idx_10_fold):
        config = get_toy_config(rf_tree, rf_max_depth, rf_tree_2, rf_max_depth_2,
                                xgb_tree, xgb_max_depth, min_child_weight, lr,
                                xgb_tree_2, xgb_max_depth_2, min_child_weight_2,
                                lr_2, layer)
        gc = GCForest(config)
        print(config)
        X_train_enc = gc.fit_transform(X_train, y_train)
        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)
        temp = pd.DataFrame([y_test, y_predprob_test[:, 1],
                             y_pred_test]).T.sort_values(by=1, ascending=False)
        recall_50 = precision_recall_fscore_support(temp.iloc[:50, :][0], temp.iloc[:50, :][2],
                                                    pos_label=1, average='binary')[1]
        recall_100 = precision_recall_fscore_support(temp.iloc[:25, :][0], temp.iloc[:25, :][2],
                                                     pos_label=1, average='binary')[1]
        test_true_predict_compare.append([test_idx_fold, y_pred_test, y_test,
                                          y_predprob_test[:, 0], y_predprob_test[:, 1]])  # 10-cv
        train_true_predict_compare.append([train_idx_fold, y_pred_train, y_train,
                                           y_predprob_train[:, 0], y_predprob_train[:, 1]])  # 10-cv
        precision_training, recall_training, _ = precision_recall_curve(
            y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(
            y_test, y_predprob_test[:, 1], pos_label=1)
        AUPR_training, AUPR_testing = auc(recall_training, precision_training), auc(
            recall_testing, precision_testing)
        AUC_training, AUC_testing = roc_auc_score(
            y_train, y_predprob_train[:, 1]), roc_auc_score(y_test, y_predprob_test[:, 1])
        metrics3_testing = precision_recall_fscore_support(
            y_test, y_pred_test, pos_label=1, average='binary')[:3]
        metrics3_training = precision_recall_fscore_support(
            y_train, y_pred_train, pos_label=1, average='binary')[:3]
        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_metrics3_testing.append(metrics3_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)
        folds_metrics3_training.append(metrics3_training)
        folds_recall_50.append(recall_50)
        folds_recall_100.append(recall_100)
    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)
    Avg_metrics3_training = np.mean(folds_metrics3_training, axis=0)
    Avg_metrics3_testing = np.mean(folds_metrics3_testing, axis=0)
    return [Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,
            Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,
            folds_AUPR_training, folds_AUC_training,
            Avg_metrics3_testing, Avg_metrics3_training,
            folds_recall_50, folds_recall_100], \
           [test_true_predict_compare, train_true_predict_compare]
def run_classification_configuration(
        X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
        test_idx_10_fold, train_idx_10_fold,
        rf_tree, rf_max_depth, rf_tree_2, rf_max_depth_2,
        xgb_tree, xgb_max_depth, min_child_weight, lr,
        xgb_tree_2, xgb_max_depth_2, min_child_weight_2, lr_2,
        layer, mode, seed):
    folds_AUC_testing, folds_AUPR_testing = [], []
    folds_AUC_training, folds_AUPR_training = [], []
    folds_metrics3_training, folds_metrics3_testing = [], []
    test_true_predict_compare, train_true_predict_compare = [], []
    folds_recall_25, folds_recall_50, folds_recall_100, folds_recall_200, folds_recall_400 = [], [], [], [], []
    folds_G_mean = []
    i = 0
    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(
            X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
            test_idx_10_fold, train_idx_10_fold):
        config = get_toy_config(rf_tree, rf_max_depth, rf_tree_2, rf_max_depth_2,
                                xgb_tree, xgb_max_depth, min_child_weight, lr,
                                xgb_tree_2, xgb_max_depth_2, min_child_weight_2,
                                lr_2, layer)
        gc = GCForest(config)
        X_train_enc = gc.fit_transform(X_train, y_train)
        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)
        temp = pd.DataFrame([y_test, y_predprob_test[:, 1],
                             y_pred_test]).T.sort_values(by=1, ascending=False)
        recall_25 = precision_recall_fscore_support(temp.iloc[:25, :][0], temp.iloc[:25, :][2],
                                                    pos_label=1, average='binary')[1]
        recall_50 = precision_recall_fscore_support(temp.iloc[:50, :][0], temp.iloc[:50, :][2],
                                                    pos_label=1, average='binary')[1]
        test_true_predict_compare.append([test_idx_fold, y_pred_test, y_test,
                                          y_predprob_test[:, 0], y_predprob_test[:, 1]])  # 10-cv
        train_true_predict_compare.append([train_idx_fold, y_pred_train, y_train,
                                           y_predprob_train[:, 0], y_predprob_train[:, 1]])  # 10-cv
        precision_training, recall_training, _ = precision_recall_curve(
            y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(
            y_test, y_predprob_test[:, 1], pos_label=1)
        AUPR_training, AUPR_testing = auc(recall_training, precision_training), auc(
            recall_testing, precision_testing)
        AUC_training, AUC_testing = roc_auc_score(
            y_train, y_predprob_train[:, 1]), roc_auc_score(y_test, y_predprob_test[:, 1])
        metrics3_testing = precision_recall_fscore_support(
            y_test, y_pred_test, pos_label=1, average='binary')[:3]
        metrics3_training = precision_recall_fscore_support(
            y_train, y_pred_train, pos_label=1, average='binary')[:3]
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred_test, labels=[0, 1]).ravel()
        specificity = float(tn) / float(tn + fp)
        recall = metrics3_testing[1]
        G_mean = np.sqrt(recall * specificity)
        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_metrics3_testing.append(metrics3_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)
        folds_metrics3_training.append(metrics3_training)
        folds_G_mean.append(G_mean)
        folds_recall_25.append(recall_25)
        folds_recall_50.append(recall_50)
        i += 1
    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)
    Avg_metrics3_training = np.mean(folds_metrics3_training, axis=0)
    Avg_metrics3_testing = np.mean(folds_metrics3_testing, axis=0)
    Avg_G_mean = np.mean(folds_G_mean)
    # folds_recall_100, folds_recall_200, folds_recall_400 are initialized but not returned
    return [Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,  # 0-2
            Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,     # 3-5
            folds_AUPR_training, folds_AUC_training,                  # 6-7
            Avg_metrics3_testing, Avg_metrics3_training,              # 8-9
            folds_recall_25, folds_recall_50, folds_G_mean], \
           [test_true_predict_compare, train_true_predict_compare]
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from dimension_reduction import elasticNet
import utils.tools as utils
from gcforest.gcforest import GCForest
from gcforest.utils.config_utils import load_json

start = time.time()
path1 = 'gcforest4.json'
config = load_json(path1)
gc = GCForest(config)

mask_data = sio.loadmat('yeast_elastic_mask_scale_0.03_0.1.mat')
mask = mask_data.get('yeast_elastic_mask')
extraction = sio.loadmat('yeast_feature_end.mat')
proteinA = extraction.get('feature_A')
protein_A = np.array(proteinA)
proteinB = extraction.get('feature_B')
protein_B = np.array(proteinB)
X_ = np.concatenate((protein_A, protein_B), axis=1)
X_ = np.array(X_)
[row, column] = np.shape(X_)
label_P = np.ones(int(row / 2))
label_N = np.zeros(int(row / 2))
label_ = np.hstack((label_P, label_N))
y_raw = np.mat(label_)
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    # config
    args = parse_args()
    if args.model == 'ca':
        config = load_json('./mnist-ca.json')
    elif args.model == 'gc':
        config = load_json('./mnist-gc.json')
    else:
        config = load_json('./mnist-gc.json')
    gc = GCForest(config)
    # gc.set_keep_model_in_mem(False)
    gc.set_keep_model_in_mem(True)

    # data
    data_num_train = 60000  # number of training figures
    data_num_test = 10000   # number of test figures
    fig_w = 45              # width of each figure
    X_train = np.fromfile("./data/mnist_train/mnist_train_data", dtype=np.uint8)
    y_train = np.fromfile("./data/mnist_train/mnist_train_label", dtype=np.uint8)
    X_test = np.fromfile("./data/mnist_test/mnist_test_data", dtype=np.uint8)
    y_test = np.fromfile("./data/mnist_test/mnist_test_label", dtype=np.uint8)
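    # The buffers above are flat uint8 arrays. A sketch of the presumed next
    # step: reshape into the (N, 1, H, W) layout used by the other MNIST
    # demos in this collection (the float cast is an assumption):
    X_train = X_train.reshape(data_num_train, 1, fig_w, fig_w).astype(np.float32)
    X_test = X_test.reshape(data_num_test, 1, fig_w, fig_w).astype(np.float32)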