def gcf(X_train, X_test, y_train, y_test, cnames):
    """Train a gcForest on multi-grained image data and report test metrics.

    Parameters follow sklearn conventions: train/test splits plus ``cnames``,
    the class names used for the classification report and the
    confusion-matrix plot. Prints timings, accuracy, Cohen's kappa, a
    per-class report and the confusion matrix; returns None.
    """
    clf = gcForest(shape_1X=[20, 20, 3], n_mgsRFtree=80, window=[18], stride=1,
                   cascade_test_size=0.2, n_cascadeRF=2, n_cascadeRFtree=101,
                   min_samples_mgs=0.1, min_samples_cascade=0.05,
                   tolerance=0.0, n_jobs=3)
    # time.clock() was removed in Python 3.8; time.perf_counter() is the
    # documented replacement for interval timing.
    train_start = time.perf_counter()
    clf.fit(X_train, y_train)  # model training
    train_end = time.perf_counter()
    print('模型训练时间:', train_end - train_start)
    y_pred = clf.predict(X_test)  # model testing
    pre_end = time.perf_counter()
    print('测试集结果:')
    print('测试运行时间 %.4f s' % (pre_end - train_end))
    print("accuracy:", metrics.accuracy_score(y_test, y_pred))
    print("kappa:", metrics.cohen_kappa_score(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred, target_names=cnames))
    cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
    print(cnf_matrix)
    # Title fixed: the matrix is plotted with normalize=False, so labelling
    # it "Normalized confusion matrix" was misleading.
    plot_confusion_matrix(cnf_matrix, classes=cnames, normalize=False,
                          title="Confusion matrix")
def gcf(X_train, X_test, y_train, y_test, cnames):
    """Fit a gcForest on 1-D feature vectors and print its test predictions.

    ``cnames`` is accepted for signature compatibility with the other demo
    functions but is not used here.
    """
    model = gcForest(shape_1X=(1, 18988), window=[1000, 2000], stride=10)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(predictions)
# Train a gcForest on the drug-expression split, persist per-drug prediction
# probabilities/labels to CSV, and append the validation accuracy to a
# per-drug report file.
train_cellline = X_train.index
print("train_cellline:", train_cellline)
test_cellline = X_test.index
print("test_cellline:", test_cellline)
levels = np.unique(np.array(y_train))
print("levels:", levels)
# Context manager replaces the bare open()/close() pair so the report file is
# closed even if training or prediction raises; the handle stays open for the
# duration because gcForest receives it as its f= log target.
with open(str(drug) + "expr.txt", "w") as File:
    File.write("levels:" + str(levels) + "\n")
    clf = gcForest(shape_1X=(1, 400), window=[100, 200], stride=2,
                   levels=levels, f=File)
    clf.fit(np.array(X_train), np.array(y_train))
    pred_proba = clf.predict_proba(X=np.array(X_test))
    pd.DataFrame(pred_proba).to_csv("test" + str(drug) + "_predict_proba.csv")
    # Map argmax column indices back to the original class labels.
    predictions = clf.levels[np.argmax(pred_proba, axis=1)]
    pd.DataFrame(predictions).to_csv(str(drug) + "_predictions.csv")
    prediction_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
    print('Layer validation accuracy = {}'.format(prediction_accuracy))
    File.write(
        'Layer validation accuracy = {}'.format(prediction_accuracy) + "\n")
# Demo: gcForest on the iris dataset with an accuracy check.
from GCForest import gcForest
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# loading the iris data
iris = load_iris()
X = iris.data
Y = iris.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)
gcf = gcForest(shape_1X=4, window=2, tolerance=0.0)
gcf.fit(X_train, Y_train)
# predict() returns the predicted class label of every sample; pred_X is an
# array like [0, 1, 2, ...]
pred_X = gcf.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=pred_X)
# accuracy computed from the test set's true vs. predicted labels
print('gcForest accuracy:{}'.format(accuracy))
# predict_proba() returns, for each sample, the probability of class 0, 1, ...
# shaped like [[p0, p1, ...], [p0, p1, ...], ...] (a DataFrame-like result);
# [:, 1] selects column index 1, i.e. the class-1 probability, as an array
Y_predict_prod_test = gcf.predict_proba(X_test)[:, 1]
Y_predict_prod_train = gcf.predict_proba(X_train)[:, 1]
df.to_csv('../datasets/dataset_iloc/test_int.csv', header=False, index=False) # Split into labels, names and data y_tr = train['class'] names_train = train['name'] X_tr = train.drop(['class', 'name', 'sequence'], axis=1) y_te = test['class'] names_test = test['name'] X_te = test.drop(['class', 'name', 'sequence'], axis=1) # In[3]: gcf = gcForest(n_cascadeRFtree=1000, n_mgsRFtree=1000, shape_1X=72, window=[5, 9, 18], min_samples_mgs=10, min_samples_cascade=7) joblib.dump(gcf, 'gcf_model.sav') #X_tr = X_tr.as_matrix() #X_te = X_te.as_matrix() gcf = joblib.load('gcf_model.sav') std_scaler = StandardScaler().fit(X_tr) # transform train and test set using standardization X_tr = std_scaler.transform(X_tr) X_te = std_scaler.transform(X_te) y_tr = np.asarray(y_tr, dtype='int') y_te = np.asarray(y_te, dtype='int') gcf.fit(X_tr, y_tr) pred_X = gcf.predict(X_te)
def gerar_gcForest(qtd_lags, window=2, tolerance=0.0):
    """Build an unfitted gcForest sized for ``qtd_lags`` input features.

    ``window`` and ``tolerance`` are forwarded unchanged to the gcForest
    constructor; the configured (untrained) model is returned.
    """
    from GCForest import gcForest

    modelo = gcForest(
        shape_1X=qtd_lags,
        window=window,
        tolerance=tolerance,
    )
    return modelo
# Pipeline: load clinical data, impute/standardize, train gcForest on the
# Sepsis label, and report accuracy plus ROC AUC.
from sklearn.model_selection import train_test_split

input_data = pd.read_csv('inputs/data_last.csv')
input_data = pd.read_csv('inputs/data_last.csv') if False else input_data
'''
######################### load packages #######################
import numpy as np
from keras.datasets import mnist
from sklearn.metrics import accuracy_score
from GCForest import gcForest

######################### load datasets #######################
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# keep a 2000-sample training subset so the demo trains in reasonable time
X_train, y_train = X_train[:2000], y_train[:2000]

######################### reshape #######################
# flatten the 28x28 images into 784-dimensional vectors
X_train = X_train.reshape((2000, 784))
X_test = X_test.reshape((10000, 784))

######################### build model and train #######################
# shape_1X restores the 2-D 28x28 layout for multi-grained scanning
gcf = gcForest(shape_1X=[28, 28], window=[7, 10, 14], tolerance=0.0,
               min_samples_mgs=10, min_samples_cascade=7)
gcf.fit(X_train, y_train)

######################### predict #######################
y_pred = gcf.predict(X_test)

######################### evaluating accuracy #######################
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('gcForest accuracy : {}'.format(accuracy))
from sklearn.feature_selection import SelectKBest, SelectFromModel, SelectFdr, SelectFpr, SelectFwe, RFECV
from sklearn.base import clone

# Feature selection around a clone of m1 (positional args: threshold='mean',
# prefit=False), then predict with m1 itself.
m10 = Pipeline([('select', SelectFromModel(clone(m1), 'mean', False)), ('predict', m1)])

from rgf.sklearn import RGFClassifier
m11 = RGFClassifier(max_leaf=1000, algorithm="RGF_Sib", test_interval=100, learning_rate=0.1, verbose=True)  #0.902

from GCForest import gcForest
# NOTE(review): m12 is assigned twice — this gcForest model is immediately
# overwritten by the MLP pipeline below, so it is never usable. Confirm
# whether one of the two was meant to be m13.
m12 = gcForest(shape_1X=[1, 483], window=[483], tolerance=0.0)  #0.866

from sklearn.neural_network import MLPClassifier
m12 = Pipeline([('a', StandardScaler()), ('MLP', MLPClassifier())])  #0.8205

from sklearn.semi_supervised import LabelPropagation


class Semi(BaseEstimator):
    # Minimal semi-supervised scaffold: fit() just memorizes the training
    # data and returns self (sklearn estimator convention).
    def fit(self, train_x, train_y):
        self.train_x = train_x
        self.train_y = train_y
        return self