def gcf(X_train, X_test, y_train, y_test, cnames):
    """Train a gcForest on 20x20x3 image samples and report test metrics.

    Parameters
    ----------
    X_train, X_test : array-like
        Training / test feature matrices (flattened 20x20x3 samples,
        per ``shape_1X`` below).
    y_train, y_test : array-like
        Class labels for the corresponding feature matrices.
    cnames : sequence of str
        Class names for the classification report and confusion-matrix plot.
    """
    clf = gcForest(shape_1X=[20, 20, 3],  # each sample is a 20x20 3-channel image
                   n_mgsRFtree=80,
                   window=[18],
                   stride=1,
                   cascade_test_size=0.2,
                   n_cascadeRF=2,
                   n_cascadeRFtree=101,
                   min_samples_mgs=0.1,
                   min_samples_cascade=0.05,
                   tolerance=0.0,
                   n_jobs=3)

    # BUGFIX: time.clock() was deprecated in 3.3 and removed in Python 3.8;
    # time.perf_counter() is the documented replacement for interval timing.
    train_start = time.perf_counter()
    clf.fit(X_train, y_train)  # model training
    train_end = time.perf_counter()
    print('模型训练时间:', train_end - train_start)

    y_pred = clf.predict(X_test)  # model testing
    pre_end = time.perf_counter()
    print('测试集结果:')
    print('测试运行时间 %.4f s' % (pre_end - train_end))
    print("accuracy:", metrics.accuracy_score(y_test, y_pred))
    print("kappa:", metrics.cohen_kappa_score(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred, target_names=cnames))

    cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
    print(cnf_matrix)
    # NOTE(review): title says "Normalized" but normalize=False — confirm
    # which one is intended.
    plot_confusion_matrix(cnf_matrix,
                          classes=cnames,
                          normalize=False,
                          title="Normalized confusion matrix")
# Beispiel #2
def gcf(X_train, X_test, y_train, y_test, cnames):
    """Fit a gcForest on flat 1x18988 samples and print test predictions.

    ``cnames`` is accepted for signature parity with the other example but
    is not used here.
    """
    model = gcForest(shape_1X=(1, 18988), window=[1000, 2000], stride=10)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(predictions)
# Beispiel #3
 # NOTE(review): fragment of a larger function — the enclosing `def` header
 # (which binds X_train, X_test, y_train, y_test, drug, ...) is not visible
 # in this chunk.
 train_cellline = X_train.index  # row labels of the training DataFrame (cell-line IDs, presumably)
 print("train_cellline:", train_cellline)
 test_cellline = X_test.index  # row labels of the test DataFrame
 print("test_cellline:", test_cellline)
 # X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state=5)
 # X_train.to_csv("X_train.csv")
 # y_train.to_csv("y_train.csv")
 # #
 # #
 # Distinct class labels present in the training targets.
 levels = np.unique(np.array(y_train))
 print("levels:", levels)
 # Per-drug log file; closed at the end of this fragment.
 File = open(str(drug) + "expr.txt", "w")
 File.write("levels:" + str(levels) + "\n")
 # NOTE(review): `levels=` and `f=` are not standard gcForest arguments —
 # presumably extensions in this project's fork; confirm against GCForest.py.
 clf = gcForest(shape_1X=(1, 400),
                window=[100, 200],
                stride=2,
                levels=levels,
                f=File)
 clf.fit(np.array(X_train), np.array(y_train))
 # Per-class probabilities for the test set, persisted to CSV.
 pred_proba = clf.predict_proba(X=np.array(X_test))
 pd.DataFrame(pred_proba).to_csv("test" + str(drug) +
                                 "_predict_proba.csv")
 # pd.DataFrame(pred_proba).to_csv(str(drug)+"_predict_proba.csv")
 # Map each arg-max probability column index back to its class label.
 predictions = clf.levels[np.argmax(pred_proba, axis=1)]
 pd.DataFrame(predictions).to_csv(str(drug) + "_predictions.csv")
 prediction_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
 print('Layer validation accuracy = {}'.format(prediction_accuracy))
 File.write(
     'Layer validation accuracy = {}'.format(prediction_accuracy) +
     "\n")
 File.close()
from GCForest import gcForest
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# loading the iris data
iris = load_iris()
X = iris.data
Y = iris.target

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

# shape_1X=4: each iris sample is a flat vector of 4 features.
gcf = gcForest(shape_1X=4, window=2, tolerance=0.0)
gcf.fit(X_train, Y_train)

# predict() returns the predicted class for each sample: an array like [0, 1, 2, ...]
pred_X = gcf.predict(X_test)
accuracy = accuracy_score(y_true=Y_test,
                          y_pred=pred_X)  # accuracy from true vs. predicted test labels
print('gcForest accuracy:{}'.format(accuracy))

# predict_proba() returns, per sample, the probability of each class:
# [[p0, p1, ...], [p0, p1, ...], ...]
# [:, 1] selects column index 1, i.e. the probability of class 1, as an array.
Y_predict_prod_test = gcf.predict_proba(X_test)[:, 1]
Y_predict_prod_train = gcf.predict_proba(X_train)[:, 1]
# NOTE(review): `df` is not defined in this fragment — it presumably comes
# from code above this chunk; verify before running in isolation.
df.to_csv('../datasets/dataset_iloc/test_int.csv', header=False, index=False)

# Split into labels, names and data
y_tr = train['class']
names_train = train['name']
X_tr = train.drop(['class', 'name', 'sequence'], axis=1)

y_te = test['class']
names_test = test['name']
X_te = test.drop(['class', 'name', 'sequence'], axis=1)

# In[3]:

# gcForest with large forests (1000 trees) over 72 flat features.
gcf = gcForest(n_cascadeRFtree=1000,
               n_mgsRFtree=1000,
               shape_1X=72,
               window=[5, 9, 18],
               min_samples_mgs=10,
               min_samples_cascade=7)
# NOTE(review): the model is dumped BEFORE fitting and immediately reloaded,
# so 'gcf_model.sav' holds an untrained model — confirm this is intended.
joblib.dump(gcf, 'gcf_model.sav')
#X_tr = X_tr.as_matrix()
#X_te = X_te.as_matrix()
gcf = joblib.load('gcf_model.sav')
# Standardize using statistics fitted on the training set only.
std_scaler = StandardScaler().fit(X_tr)
# transform train and test set using standardization
X_tr = std_scaler.transform(X_tr)
X_te = std_scaler.transform(X_te)
y_tr = np.asarray(y_tr, dtype='int')
y_te = np.asarray(y_te, dtype='int')
gcf.fit(X_tr, y_tr)

pred_X = gcf.predict(X_te)
# Beispiel #6
def gerar_gcForest(qtd_lags, window=2, tolerance=0.0):
    """Build and return a gcForest treating each sample as a flat vector of
    ``qtd_lags`` features, with the given scanning window and cascade
    tolerance."""
    from GCForest import gcForest
    return gcForest(shape_1X=qtd_lags, window=window, tolerance=tolerance)
# Beispiel #7
from sklearn.model_selection import train_test_split

input_data = pd.read_csv('inputs/data_last.csv')
data = input_data.drop('hospital_expire_flag', axis=1)

# nt("222")
X_df = data.iloc[:, data.columns != 'Sepsis']
y_df = data.iloc[:, data.columns == 'Sepsis']
X_df1 = X_df.fillna(X_df.mean())
X = np.array(X_df1)
Y = np.array(y_df)

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
gcf = gcForest(shape_1X=5, window=3, tolerance=0.001)
gcf.fit(X_train, Y_train)

#predict 方法预测的是每一条样本的分类类别,结果 pred_X 是一个 [0,1,2...]的 array
pred_X = gcf.predict(X_test)
accuracy = accuracy_score(y_true=Y_test,
                          y_pred=pred_X)  #用 test 数据的真实类别和预测类别算准确率
print('gcForest accuracy:{}'.format(accuracy))

#  predict_proba 方法预测的是每一条样本为 0,1,...类别的概率,结果是这样的:
# [[ 概率 1,概率 2,...],[ 概率 1,概率 2,...],...]的 DataFrame
# [:,1]表示取出序号为 1 的列,也就是预测类别为 1 的概率值,结果是一个数组
Y_predict_prod_test = gcf.predict_proba(X_test)[:, 1]
test_auc = metrics.roc_auc_score(Y_test, Y_predict_prod_test)  #验证集上的auc值

print("预测的AUC是: %s" % test_auc)
# Beispiel #8
######################### load packages #######################
import numpy as np
from keras.datasets import mnist
from sklearn.metrics import accuracy_score
from GCForest import gcForest

######################### load datasets #######################
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# Keep only the first 2000 training samples (presumably to limit gcForest
# training time — TODO confirm).
X_train, y_train = X_train[:2000], y_train[:2000]

######################### reshape #######################
# Flatten each 28x28 image to a 784-vector; the hard-coded 10000 assumes
# the full MNIST test split.
X_train = X_train.reshape((2000, 784))
X_test = X_test.reshape((10000, 784))

######################### build model and train #######################
# shape_1X=[28, 28] tells gcForest to treat each flat vector as a 28x28
# image during multi-grain scanning with 7/10/14-pixel windows.
gcf = gcForest(shape_1X=[28, 28],
               window=[7, 10, 14],
               tolerance=0.0,
               min_samples_mgs=10,
               min_samples_cascade=7)
gcf.fit(X_train, y_train)

######################### predict #######################
y_pred = gcf.predict(X_test)

######################### evaluating accuracy #######################
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('gcForest accuracy : {}'.format(accuracy))
from sklearn.feature_selection import SelectKBest, SelectFromModel, SelectFdr, SelectFpr, SelectFwe, RFECV
from sklearn.base import clone
# Feature selection (mean threshold, not prefit) feeding the same estimator.
# NOTE(review): `m1` and `Pipeline` are defined outside this chunk; recent
# scikit-learn requires `threshold=`/`prefit=` as keyword args — confirm.
m10 = Pipeline([('select', SelectFromModel(clone(m1), 'mean', False)),
                ('predict', m1)])

from rgf.sklearn import RGFClassifier
m11 = RGFClassifier(max_leaf=1000,
                    algorithm="RGF_Sib",
                    test_interval=100,
                    learning_rate=0.1,
                    verbose=True)
#0.902

from GCForest import gcForest
m12 = gcForest(shape_1X=[1, 483], window=[483], tolerance=0.0)
#0.866

from sklearn.neural_network import MLPClassifier
# NOTE(review): this rebinds m12, discarding the gcForest model above —
# confirm whether one of the two should have been m13.
m12 = Pipeline([('a', StandardScaler()), ('MLP', MLPClassifier())])
#0.8205

from sklearn.semi_supervised import LabelPropagation


class Semi(BaseEstimator):
    def fit(self, train_x, train_y):
        """Memorize the training data for later use; no model is fitted
        here. Returns ``self`` per the scikit-learn estimator convention."""
        self.train_x, self.train_y = train_x, train_y
        return self