Example 1
def df(x_train, y_train, n_features):
    config = load_json("demo_ca.json")
    gc = GCForest(config)
    X_train = x_train.values.reshape(-1, 1, len(x_train.columns))

    _, _features = gc.fit_transform(X_train, y_train)
    _features = _features.sort_values(ascending=False)
    return _features.index.values.tolist()[:n_features]
Example 2
def RUN_2(best_th):  # main routine: given the optimal classification threshold, evaluate the model on the test set
    comm_s_TPR = []
    comm_s_TNR = []
    comm_s_BER = []
    comm_s_ACC = []
    comm_s_MCC = []
    comm_s_F1score = []
    comm_s_AUC = []
    comm_s_time = []
    # split the raw data into training and test sets
    tiaocan_train, ceshi_train, tiaocan_train_test, ceshi_true = train_test_split(
        comtest.iloc[0:len(comtest), 1:comtest.shape[1] - 1],
        comtest.iloc[0:len(comtest), -1],
        test_size=0.2,
        random_state=0)

    x_train = tiaocan_train
    y_train = tiaocan_train_test
    x_test = ceshi_train
    y_true = ceshi_true

    x_train = np.array(x_train, dtype=np.float16)
    y_train = np.array(y_train, dtype=np.float16)
    x_test = np.array(x_test, dtype=np.float16)
    y_true = np.array(y_true, dtype=np.float16)
    #    x_train, y_train = RandomUnderSampler().fit_sample(x_train, y_train)  # balance classes by under-sampling the training set

    # set up the machine-learning model
    ##########################################################################################################################

    ############################## --gcForest-- #############################
    comm = GCForest(config)
    start = time.time()
    comm.fit_transform(x_train, y_train)
    pro_comm_Pre = comm.predict_proba(x_test)
    comm_s_time.append(time.time() - start)  # record train + predict time
    blo_comm_Pre = blo(pro_comm_Pre, best_th)  # turn probabilities into class labels (patient outcome) using the optimal threshold
    eva_comm = evaluating_indicator(y_true=y_true,
                                    y_test=blo_comm_Pre,
                                    y_test_value=pro_comm_Pre)

    comm_s_TPR.append(eva_comm['TPR'])
    comm_s_TNR.append(eva_comm['TNR'])
    comm_s_BER.append(eva_comm['BER'])
    comm_s_ACC.append(eva_comm['ACC'])
    comm_s_MCC.append(eva_comm['MCC'])
    comm_s_F1score.append(eva_comm['F1_score'])
    comm_s_AUC.append(eva_comm['AUC'])
    eva_comm = {
        "TPR": np.mean(comm_s_TPR),
        "TNR": np.mean(comm_s_TNR),
        "BER": np.mean(comm_s_BER),
        "ACC": np.mean(comm_s_ACC),
        "MCC": np.mean(comm_s_MCC),
        "F1_score": np.mean(comm_s_F1score),
        "AUC": np.mean(comm_s_AUC),
        "time": np.mean(comm_s_time)
    }

    return eva_comm
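
Both RUN_2() above and RUN() in Example 5 call a blo() helper that is not shown. A minimal sketch consistent with how it is called (predict_proba output plus a threshold on a 0-100 scale, assuming column 1 holds the positive class):

def blo(proba, threshold):
    # proba: (n_samples, 2) array from predict_proba
    # threshold: percent cut-off in [0, 100), as swept by RUN() in Example 5
    return (proba[:, 1] * 100 >= threshold).astype(int)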
Example 3
def get_m_gcForest(mtype="ca"):
    """
    @param mtype: "ca" or "gc"
    @param n_esti: n_estimators param in get_ca_config
    """
    if mtype == "ca":
        config = get_ca_config()
        gc = GCForest(config)
        return gc
    if mtype == "gc":
        config = get_gc_config()
        gc = GCForest(config)
        return gc
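
get_ca_config() and get_gc_config() are not included in this example. Below is a minimal cascade-only config in the dict format that GCForest expects; the particular estimator mix is an assumption, modelled on the get_toy_config snippets in the later examples:

def get_ca_config():
    ca_config = {
        "random_state": 0,
        "max_layers": 100,           # hard cap on cascade depth
        "early_stopping_rounds": 3,  # stop growing when 3 layers bring no gain
        "n_classes": 2,
        "estimators": [
            {"n_folds": 5, "type": "RandomForestClassifier",
             "n_estimators": 10, "max_depth": None, "n_jobs": -1},
            {"n_folds": 5, "type": "ExtraTreesClassifier",
             "n_estimators": 10, "max_depth": None, "n_jobs": -1},
            {"n_folds": 5, "type": "LogisticRegression"},
        ],
    }
    return {"cascade": ca_config}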
Example 4
def run_model(features, adhd_labels, rand_params, verbose=True, test_size=0.2):
    """
    Run the gcForest using parameters from the Optimizer. Use random portions of the original dataset
    for testing and training (default 20%-80%)

    :param features: (list) A matrix of phenotypic and functional connectivity features
    :param adhd_labels: (list) The correct labels from the dataset
    :param rand_params: (dict) The generated random params from the Optimizer
    :param verbose: (bool) Whether to print classification report
    :param test_size: (float) How much of the dataset to use for testing
    :return: (float) accuracy, (dict) positive-class metrics, (dict) negative-class metrics, (dict) confusion-matrix counts
    """
    classifier = GCForest(  # Instantiate the gcForest algorithm using the random parameters we generated
        config=generate_gcforest_config(rand_params['mlp_layers'], rand_params['mlp_solver'],
                                        rand_params['logistic_regressions'],
                                        rand_params['svc_kernel'], rand_params['xgb_estimators'],
                                        rand_params['rf_estimators'],
                                        rand_params['early_stopping_iterations'], rand_params['positions']),
    )

    X_train, X_test, y_train, y_test = train_test_split(features, adhd_labels, test_size=test_size)
    # Split the data into random subsets (20% test, 80% train by default)
    classifier.fit_transform(np.array(X_train), np.array(y_train))  # Train the gcForest model
    y_pred = classifier.predict(np.array(X_test))  # Predict off of the test dataset
    y_test = np.array(y_test)
    if verbose:
        print "Classification Report\n", classification_report(y_test, y_pred)  # Print out some useful run information
        print "Accuracy:", accuracy_score(y_test, y_pred)
        print "Confusion Matrix\n", confusion_matrix(y_test, y_pred)
    positive_metrics = {
        'f1': f1_score(y_test, y_pred),  # Calculate the f1 for class "1"
        'precision': precision_score(y_test, y_pred),  # Calculate the precision for class "1"
        'recall': recall_score(y_test, y_pred),  # Calculate the recall for class "1"
    }
    negative_metrics = {
        'f1': f1_score(y_test, y_pred, pos_label=0),  # Calculate the f1 for class "0"
        'precision': precision_score(y_test, y_pred, pos_label=0),  # Calculate the precision for class "0"
        'recall': recall_score(y_test, y_pred, pos_label=0),  # Calculate the recall for class "0"
    }
    matrix = confusion_matrix(y_test, y_pred)
    confusion = {  # Unpack the confusion matrix
        'true_negative': matrix[0][0],   # predicted negative, actually negative
        'false_positive': matrix[0][1],  # predicted positive, actually negative
        'false_negative': matrix[1][0],  # predicted negative, actually positive
        'true_positive': matrix[1][1]    # predicted positive, actually positive
    }
    scores = accuracy_score(y_test, y_pred), positive_metrics, negative_metrics, confusion
    # Bundle the accuracy, per-class metrics and confusion-matrix counts

    return scores
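
run_model() expects rand_params keys matching the generate_gcforest_config() call above. A hypothetical set of values for illustration (the key names come from this example; the values are invented, not from the original source):

rand_params = {
    'mlp_layers': (64, 32),
    'mlp_solver': 'adam',
    'logistic_regressions': 2,
    'svc_kernel': 'rbf',
    'xgb_estimators': 50,
    'rf_estimators': 100,
    'early_stopping_iterations': 3,
    'positions': None,
}
accuracy, pos_metrics, neg_metrics, confusion = run_model(features, adhd_labels, rand_params)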
Example 5
def RUN():  # find the optimal classification threshold from the training and validation folds
    tiaocan_train, ceshi_train, tiaocan_train_test, ceshi_true = train_test_split(
        comtest.iloc[0:len(comtest), 1:comtest.shape[1] - 1],
        comtest.iloc[0:len(comtest), -1],
        test_size=0.2,
        random_state=0)
    skf = StratifiedKFold(n_splits=10)  # 10-fold cross-validation
    tiaocan_train = np.array(tiaocan_train, dtype=np.float16)
    tiaocan_train_test = np.array(tiaocan_train_test, dtype=np.float16)
    times = 0
    position = []
    for train, test in skf.split(tiaocan_train, tiaocan_train_test):
        alltime_start = time.time()

        times = times + 1

        x_train = tiaocan_train[train]
        y_train = tiaocan_train_test[train]
        x_test = tiaocan_train[test]
        y_true = tiaocan_train_test[test]
        #        x_train, y_train = RandomUnderSampler().fit_sample(x_train, y_train)   # balance classes by under-sampling

        # set up the machine-learning model

        ############################## --gcForest-- #############################

        comm = GCForest(config)
        comm.fit_transform(x_train, y_train)  # train the model
        pro_comm_Pre = comm.predict_proba(x_test)

        ############################### threshold where sensitivity and specificity are closest ########################################
        RightIndex = []
        for jj in range(100):  # evaluate the model at each candidate classification threshold
            blo_comm_Pre = blo(pro_comm_Pre, jj)
            eva_comm = evaluating_indicator(y_true=y_true,
                                            y_test=blo_comm_Pre,
                                            y_test_value=pro_comm_Pre)
            RightIndex.append(abs(eva_comm['TPR'] - eva_comm['TNR']))
        RightIndex = np.array(RightIndex, dtype=np.float16)
        position.append(np.argmin(RightIndex))  # keep the threshold that minimises |TPR - TNR| for this fold
        alltime_end = time.time()
        print('done_0, fold %s, time: %s s' %
              (times, alltime_end - alltime_start))


######################################################################################
    return np.mean(position)  # average the per-fold thresholds into the final optimal classification threshold
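
evaluating_indicator() is another helper these two examples assume. A sketch that returns the keys the callers read, built on sklearn.metrics (the BER line uses the usual balanced-error-rate definition; this is a reconstruction, not the original code):

from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                             matthews_corrcoef, roc_auc_score)

def evaluating_indicator(y_true, y_test, y_test_value):
    tn, fp, fn, tp = confusion_matrix(y_true, y_test).ravel()
    tpr = tp / (tp + fn)  # sensitivity
    tnr = tn / (tn + fp)  # specificity
    return {
        "TPR": tpr,
        "TNR": tnr,
        "BER": 1 - 0.5 * (tpr + tnr),  # balanced error rate
        "ACC": accuracy_score(y_true, y_test),
        "MCC": matthews_corrcoef(y_true, y_test),
        "F1_score": f1_score(y_true, y_test),
        "AUC": roc_auc_score(y_true, y_test_value[:, 1]),
    }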
Example 6
def cross_validation(X, y, k, cpu):
    config = get_toy_config(cpu=cpu)
    classifier = GCForest(config)
    cv = StratifiedKFold(n_splits=k)
    res = {}
    i = 1
    for train, test in cv.split(X, y):
        classifier.fit_transform(X[train], y[train])
        yscore = classifier.predict_proba(X[test])
        res["fold_" + str(i)] = {"yscore": yscore, "ytest": y[test]}
        i += 1
    return res
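
The dict returned by cross_validation() maps fold names to score/label pairs, so downstream code can compute a per-fold metric. For example (assuming binary labels with the positive class in column 1):

from sklearn.metrics import roc_auc_score

res = cross_validation(X, y, k=5, cpu=4)
for fold, d in sorted(res.items()):
    print(fold, "AUC =", roc_auc_score(d["ytest"], d["yscore"][:, 1]))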
Example 7
def run_gcforest(train_X,
                 test_X,
                 train_y,
                 test_y,
                 rounds=3,
                 layers=100,
                 seed=0):
    config = get_toy_config(rounds, layers, seed)
    gc = GCForest(config)  # should be a dict
    X_train_enc = gc.fit_transform(train_X, train_y)
    ypred = np.array([i[1] for i in gc.predict_proba(test_X)])
    metrics = gen_eval_metrics(test_y, ypred)
    accuracy = metrics[0]

    #cor = sum([int(ypred[i] + 0.5) == test_y[i] for i in range(len(ypred))])
    #accuracy = cor / len(test_y)
    print('Fold accuracy: ' + str(accuracy))
    return metrics
Example 8
def run_classification_configuration(X_train_10_fold, X_test_10_fold,
                                     y_train_10_fold, y_test_10_fold,
                                     test_idx_10_fold, train_idx_10_fold,
                                     trees, max_depth, min_child_weight,
                                     layer, cw=0.001):
    i = 0
    folds_AUC_testing = []
    folds_AUPR_testing = []
    folds_AUC_training = []
    folds_AUPR_training = []
    test_true_predict_compare = []
    train_true_predict_compare = []
    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold, test_idx_10_fold, train_idx_10_fold):
#         X_train, X_test = X_train[:,12:], X_test[:,12:]
#         X_train, X_test = X_train[:,:12], X_test[:,:12]
        
        config = get_toy_config(trees,max_depth, min_child_weight,cw,layer)
        gc = GCForest(config)
        #print(config)
        X_train_enc = gc.fit_transform(X_train, y_train, X_test, y_test)

        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)
        y_predprob_test_df = pd.DataFrame(y_predprob_test)
        y_predprob_train_df = pd.DataFrame(y_predprob_train)
        test_true_predict_compare.append([test_idx_fold, y_pred_test, y_test, y_predprob_test[:,0], y_predprob_test[:,1]]) #10-cv
        train_true_predict_compare.append([train_idx_fold, y_pred_train, y_train, y_predprob_train[:,0], y_predprob_train[:,1]]) #10-cv
        
        precision_training, recall_training, _ = precision_recall_curve(
            y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(
            y_test, y_predprob_test[:, 1], pos_label=1)
        AUPR_training = auc(recall_training, precision_training)
        AUPR_testing = auc(recall_testing, precision_testing)
        AUC_training = roc_auc_score(y_train, y_predprob_train[:, 1])
        AUC_testing = roc_auc_score(y_test, y_predprob_test[:, 1])

        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)

    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing) 
    
    return [Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,
            Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,
            folds_AUPR_training, folds_AUC_training], \
           [test_true_predict_compare, train_true_predict_compare]
Example 9

    def fit(self, xtrain: pd.DataFrame, ytrain: pd.Series):
        """
        Fit model

        :param xtrain: training features
        :param ytrain: training labels
        """
        clf = GCForest(self.config)

        if self.scaler is None:
            clf.fit_transform(xtrain, ytrain)
        else:
            xtrain_norm = self.scaler.fit_transform(xtrain)
            clf.fit_transform(xtrain_norm, ytrain)

        return DeepRandomForestModel(clf, self.scaler)
Example 10

# -*- coding:utf-8 -*-
import pandas as pd
from utils import avg_importance

from sklearn.model_selection import StratifiedKFold

import gcforest.data_load as load
from gcforest.gcforest import GCForest

import utils

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

config = utils.load_json("demo_ca.json")
gc = GCForest(config)

datasets = ['cirrhosis', 'obesity', 't2d']

for dataset_idx, name in enumerate(datasets):
    thre_features = {}
    X = None
    Y = None
    if name == 'cirrhosis':
        X, Y = load.cirrhosis_data()
    elif name == 't2d':
        X, Y = load.t2d_data()
    elif name == 'obesity':
        X, Y = load.obesity_data()
    else:
        raise Exception('the dataset is not defined!!!')
Example 11
    loocv = LeaveOneOut()

    clf_rf = RandomForestClassifier(n_estimators=50, random_state=0)

    clf_svm = SVC(kernel='linear',
                  C=1,
                  gamma=0.001,
                  random_state=0,
                  probability=True)

    # xgb_crf =  XGBClassifier(n_estimators=50)

    # config = gcforest_config()
    config = load_json("gc.json")
    clf_gc = GCForest(config)
    gc_pred_acc = []

    # # ==============================================
    f, ax = plt.subplots(1, 1)
    params = [(clf_rf, 'green', "Random Forest"), (clf_svm, 'black', "SVM"),
              (clf_gc, 'red', "Deep Forest")]
    # (xgb_crf,'purple', "XGBoosing")]
    # params = [(clf_gc,'red',"Deep Forest")]
    for x in params:
        mean_fpr = np.linspace(0, 1, 100)
        tprs = []
        aucs = []
        i = 1
        for train, test in loocv.split(X, Y):
            probas_ = None
Example 12
# In[12]:

config = get_toy_config()

models = [
    LogisticRegression(),
    LinearDiscriminantAnalysis(),
    SVC(probability=True),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    GaussianNB(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=random_seed),
    ExtraTreesClassifier(random_state=random_seed),
    GCForest(config)
]

# In[16]:

test_entries = []
train_entries = []

for model in models:
    model_name = model.__class__.__name__
    if model_name == 'GCForest':
        model.fit_transform(X_train, y_train, X_test, y_test)
    else:
        model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
Example 13

X_train_oversampled_batch, y_train_oversampled_batch = Batch(
    X_train_oversampled, y_train_oversampled, batch_size)
X_train_batch, y_train_batch = Batch(X_train, y_train, batch_size)
X_valid_batch, y_valid_batch = Batch(X_valid, y_valid, batch_size)

# # GcForest
#
# ## train gc

# #### 1.train GcForest on oversampled datasets

# In[21]:

config = get_toy_config()
gc = GCForest(config)

X_train_enc = gc.fit_transform(X_train_oversampled, y_train_oversampled)

# dump
with open("../pkl/2018_test.pkl", "wb") as f:
    pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)
# load
with open("../pkl/2018_test.pkl", "rb") as f:
    gc = pickle.load(f)

# #### test GcForest on valid datasets

# In[22]:

y_valid_pred = gc.predict(X_valid)
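
A quick sanity check on the reloaded model; this works because keep_model_in_mem defaults to True, so the pickled object still carries its per-layer estimators (accuracy_score from sklearn, added here for illustration):

from sklearn.metrics import accuracy_score

acc = accuracy_score(y_valid, y_valid_pred)
print("Valid accuracy of reloaded gcForest: {:.2f}%".format(acc * 100))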
Example 14
            "type": "LogisticRegression"
        })
    else:
        ca_config["estimators"].append({
            "n_folds": 2,
            "type": "ExtraTreesClassifier",
            "n_estimators": 10,
            "max_depth": None,
            "n_jobs": -1
        })
    config["cascade"] = ca_config
    return config


config = get_toy_config(all_estimators=all_estimators)
gc = GCForest(config)
# If the model uses too much memory, you can tell gcforest not to keep
# the per-layer models in memory:
# gc.set_keep_model_in_mem(False)  # default is True
n_test = 500
# (X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train, y_train = train_dataset_x[:-n_test], train_dataset_y[:-n_test]
X_test_cv, y_test_cv = train_dataset_x[-n_test:], train_dataset_y[-n_test:]

X_train = X_train[:, np.newaxis, :, :]
X_test_cv = X_test_cv[:, np.newaxis, :, :]

X_train_enc = gc.fit_transform(X_train, y_train)

y_pred_cv = gc.predict(X_test_cv)
acc = accuracy_score(y_test_cv, y_pred_cv)
Example 15
feature = np.vstack((positive_feature, negative_sample_feature))
label1 = np.ones((len(positive_feature), 1))
label0 = np.zeros((len(negative_sample_feature), 1))
label = np.vstack((label1, label0))

rs = np.random.randint(0, 1000, 1)[0]
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=rs)

test_auc_fold = []
test_aupr_fold = []
for train_index, test_index in kf.split(feature, label[:, 0]):
    Xtrain, Xtest = feature[train_index], feature[test_index]
    Ytrain, Ytest = label[train_index], label[test_index]

    config = get_toy_config()
    rf = GCForest(config)
    Ytrain = Ytrain.flatten()
    rf.fit_transform(Xtrain, Ytrain)

    # deep forest
    predict_y = rf.predict(Xtest)
    acc = accuracy_score(Ytest, predict_y)
    print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))
    prob_predict_y = rf.predict_proba(
        Xtest
    )  # class-probability estimates; each row sums to 1
    predictions_validation = prob_predict_y[:, 1]
    fpr, tpr, _ = roc_curve(Ytest, predictions_validation)
    roc_auc = auc(fpr, tpr)
    aupr = average_precision_score(Ytest, predictions_validation)
    print(roc_auc)
    ca_config["estimators"].append({
        "n_folds": 5,
        "type": "LogisticRegression"
    })
    config["cascade"] = ca_config
    return config


if __name__ == "__main__":
    args = parse_args()
    if args.model is None:
        config = get_toy_config()
    else:
        config = load_json(args.model)

    gc = GCForest(config)
    # If the model uses too much memory, you can tell gcforest not to keep
    # the per-layer models in memory:
    gc.set_keep_model_in_mem(False)  # default is True

    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    # X_train, y_train = X_train[:2000], y_train[:2000]
    X_train = X_train[:, np.newaxis, :, :]
    X_test = X_test[:, np.newaxis, :, :]

    X_train_enc = gc.fit_transform(X_train, y_train)
    # X_train_enc is the concatenated predict_proba output of each estimator in the last layer of the GCForest model
    # X_train_enc.shape =
    #   (n_datas, n_estimators * n_classes): if a cascade is provided
    #   (n_datas, n_estimators * n_classes, dimX, dimY): if only the fine-grained part is provided
    # You can also pass X_test, y_test to fit_transform; the accuracy on the test data is then logged during training.
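
To encode new data with the already-trained cascade, the later examples use gc.transform(). A short sketch of how the shapes line up in the cascade-only case (the shape arithmetic follows the comment above; the print is illustrative):

    X_test_enc = gc.transform(X_test)  # same last-layer encoding for unseen data
    # with k estimators in the last layer and n_classes classes:
    # X_train_enc.shape == (n_train, k * n_classes)
    print(X_train_enc.shape, X_test_enc.shape)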
Example 17
def GAGCForest_prediction(feature_data, result_data):
    n_splits = 5
    acc_scores = np.zeros(n_splits)
    recall_scores = np.zeros(n_splits)
    mcc_scores = np.zeros(n_splits)
    f1_scores = np.zeros(n_splits)
    skfolds = StratifiedKFold(n_splits=n_splits,
                              shuffle=True,
                              random_state=random_state).split(
                                  feature_data, result_data)
    new_test_pred = np.zeros(feature_data.shape[0])
    new_test_proba = np.zeros(feature_data.shape[0])
    for j, (train_idx, test_idx) in enumerate(skfolds):
        X_train = feature_data[train_idx]
        Y_train = result_data[train_idx]
        X_test = feature_data[test_idx]
        Y_test = result_data[test_idx]
        config = get_toy_config()
        gc = GCForest(config)  # should be a dict
        X_train_enc = gc.fit_transform(X_train, Y_train)
        y_pred = gc.predict(X_test)
        X_test_enc = gc.transform(X_test)
        # resolve the objective-function entry point
        AIM_M = __import__('aimfuc')
        AIM_F = 'gcforestCM'
        """============================ variable setup ============================"""
        w1 = [0, 1]
        w2 = [0, 1]
        w3 = [0, 1]
        b1 = [1, 1]
        b2 = [1, 1]
        b3 = [1, 1]
        ranges = np.vstack([w1, w2, w3]).T  # range matrix of the decision variables
        borders = np.vstack([b1, b2, b3]).T  # boundary matrix of the decision variables
        # ranges = np.vstack([np.zeros((1, 3)), np.ones((1, 3))])  # range matrix of the decision variables
        # print(shape(ranges))
        # borders = np.vstack([np.ones((1, 3)), np.ones((1, 3))])  # boundary matrix of the decision variables
        precisions = [6] * 3  # encoding precision of each variable
        scales = [0] * 3  # arithmetic scale for each variable
        codes = [1] * 3  # Gray encoding for each variable
        # print(np.ones((1, 300)))
        # scales = list(np.zeros((1, 300)))  # arithmetic scale
        # codes = np.vstack([np.ones((1, 300)), np.ones((1, 300))])  # encoding scheme: Gray code for both variables
        # print(shape(codes))
        """========================遗传算法参数设置========================="""
        # NIND = 50  # 种群规模
        # MAXGEN = 100  # 最大遗传代数
        # GGAP = 0.8  # 代沟:子代与父代个体不相同的概率为0.8
        # selectStyle = 'sus';  # 遗传算法的选择方式设为"sus"——随机抽样选择
        # recombinStyle = 'xovdp'  # 遗传算法的重组方式,设为两点交叉
        # recopt = 0.9  # 交叉概率
        # pm = 0.1  # 变异概率
        # SUBPOP = 1  # 设置种群数为1
        # maxormin = 1  #
        # 设置最大最小化目标标记为1,表示是最小化目标,-1则表示最大化目标

        FieldD = ga.crtfld(ranges, borders, precisions, codes, scales)  # build the GA field descriptor

        # call the GA template
        [weightarray, pop_trace, var_trace,
         times] = new_code_templet(AIM_M,
                                   AIM_F,
                                   None,
                                   None,
                                   FieldD,
                                   problem='R',
                                   maxormin=-1,
                                   MAXGEN=10,
                                   NIND=50,
                                   SUBPOP=1,
                                   GGAP=0.8,
                                   selectStyle='sus',
                                   recombinStyle='xovsp',
                                   recopt=0.9,
                                   pm=0.7,
                                   distribute=True,
                                   proba=X_train_enc,
                                   result=Y_train,
                                   drawing=0)
        print('Elapsed:', times, 's')
        # w3 = 1 - weight[0] - weight[1]
        # print(weight)

        # weightarray = np.concatenate((weight, [w3]), axis=0)
        for element in weightarray:
            print(element)
        test_probaF = X_test_enc[:, ::2].T
        test_probaT = X_test_enc[:, 1::2].T
        test_predT = np.dot(weightarray, test_probaT)
        test_predF = np.dot(weightarray, test_probaF)
        test_pred = np.zeros(len(test_predT))
        test_proba = np.zeros(len(test_predT))
        for i in range(len(test_predT)):
            temper = test_predT[i] + test_predF[i]
            test_proba[i] = test_predT[i] / temper  # normalised positive-class probability
            if test_predT[i] > test_predF[i]:
                test_pred[i] = 1
            else:
                test_pred[i] = 0
        confmat = confusion_matrix(Y_test, test_pred)
        sn = confmat[1, 1] / (confmat[1, 0] + confmat[1, 1])
        sp = confmat[0, 0] / (confmat[0, 0] + confmat[0, 1])
        print('1. The acc score of the model {}\n'.format(
            accuracy_score(Y_test, test_pred)))
        print('2. The sp score of the model {}\n'.format(sp))
        print('3. The sn score of the model {}\n'.format(sn))
        print('4. The mcc score of the model {}\n'.format(
            matthews_corrcoef(Y_test, test_pred)))

        print('9. The auc score of the model {}\n'.format(
            roc_auc_score(Y_test, test_proba, average='macro')))
        print('6. The recall score of the model {}\n'.format(
            recall_score(Y_test, test_pred, average='macro')))
        print('5. The F-1 score of the model {}\n'.format(
            f1_score(Y_test, test_pred, average='macro')))
        print('7. Classification report \n {} \n'.format(
            classification_report(Y_test, test_pred)))
        print('8. Confusion matrix \n {} \n'.format(
            confusion_matrix(Y_test, test_pred)))

        recall = recall_score(Y_test, test_pred, average='macro')
        f1 = f1_score(Y_test, test_pred, average='macro')
        acc = accuracy_score(Y_test, test_pred)
        mcc = matthews_corrcoef(Y_test, test_pred)
        recall_scores[j] = recall
        f1_scores[j] = f1
        acc_scores[j] = acc
        mcc_scores[j] = mcc
        new_test_pred[test_idx] = test_pred
        new_test_proba[test_idx] = test_proba
        print("CV- {} recall: {}, acc_score: {} , mcc_score: {}, f1_score: {}".
              format(j, recall, acc, mcc, f1))
    new_confmat = confusion_matrix(result_data, new_test_pred)
    sn = new_confmat[1, 1] / (new_confmat[1, 0] + new_confmat[1, 1])
    sp = new_confmat[0, 0] / (new_confmat[0, 0] + new_confmat[0, 1])
    print(
        "--------------------------------- Genetic Algorithm -----------------------------------------"
    )
    print('1. The acc score of the model {}\n'.format(
        accuracy_score(result_data, new_test_pred)))
    print('2. The sp score of the model {}\n'.format(sp))
    print('3. The sn score of the model {}\n'.format(sn))
    print('4. The mcc score of the model {}\n'.format(
        matthews_corrcoef(result_data, new_test_pred)))
    print('9. The auc score of the model {}\n'.format(
        roc_auc_score(result_data, new_test_proba, average='macro')))
    print('6. The recall score of the model {}\n'.format(
        recall_score(result_data, new_test_pred, average='macro')))
    print('5. The F-1 score of the model {}\n'.format(
        f1_score(result_data, new_test_pred, average='macro')))
    print('7. Classification report \n {} \n'.format(
        classification_report(result_data, new_test_pred)))
    print('8. Confusion matrix \n {} \n'.format(
        confusion_matrix(result_data, new_test_pred)))
Example 18
def load_json(path):
    """
    Load a JSON config file; lines starting with // are treated as comments.
    """
    import json
    lines = []
    with open(path) as f:
        for row in f.readlines():
            if row.strip().startswith("//"):
                continue
            lines.append(row)
    return json.loads("\n".join(lines))

X, Y = load.obesity_data()

x_tr,x_te,y_tr,y_te = train_test_split(X,Y,random_state=42,stratify=Y)

clf_rf = RandomForestClassifier(n_estimators=200, random_state=0)
clf_rf.fit(x_tr,y_tr)
y_pred = clf_rf.predict(x_te)
print(accuracy_score(y_te,y_pred))


config = load_json("/home/qiang/repo/python/cascade_clf/examples/demo_ca.json")
clf_gc = GCForest(config)

clf_gc.fit_transform(x_tr.values, y_tr)
y_pred = clf_gc.predict(x_te.values)
print(accuracy_score(y_te, y_pred))
Example 19

# load gc config

# In[12]:


y_train2 = y_train2.values
y_train = y_train.values
y_valid = y_valid.values
y_test = y_test.values


# In[90]:


config = get_toy_config()
model = GCForest(config)

model.fit_transform(X_train2, y_train2, X_test, y_test)
gc_valid_proba = model.predict_proba(X_valid)
gc_pred = model.predict(X_valid)


# In[14]:


models = [
    LogisticRegression(),
    LinearDiscriminantAnalysis(),
    SVC(probability=True),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
Example 20
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import scale, StandardScaler
from sklearn.metrics import roc_curve, auc
from dimension_reduction import KPCA, LLE, pca
import utils.tools as utils

from gcforest.gcforest import GCForest
from gcforest.utils.config_utils import load_json

start = time.time()
path1 = 'gcforest4.json'

config = load_json(path1)
gc = GCForest(config)
extraction = sio.loadmat('yeast_feature_end.mat')
proteinA = extraction.get('feature_A')
protein_A = np.array(proteinA)
proteinB = extraction.get('feature_B')
protein_B = np.array(proteinB)
X_ = np.concatenate((protein_A, protein_B), axis=1)
X_ = np.array(X_)
[row, column] = np.shape(X_)
label_P = np.ones(int(row / 2))
label_N = np.zeros(int(row / 2))
label_ = np.hstack((label_P, label_N))
y_raw = np.mat(label_)
y_raw = np.transpose(y_raw)
y_ = np.array(y_raw)
Example 21
    # ca_config["estimators"].append(
    #     {"n_folds": 3, "type": "RandomForestClassifier", "n_estimators": 10, "n_jobs": -1,"random_state":0})
    # ca_config["estimators"].append(
    #         {"n_folds": 3, "type": "XGBClassifier", "n_estimators": 10,
    #           "silent": True, "nthread": -1, "learning_rate": 0.1} )
    # ca_config["estimators"].append({"n_folds": 3, "type": "ExtraTreesClassifier","max_depth": None, "n_jobs": -1})
    # ca_config["estimators"].append({"n_folds": 3, "type": "LogisticRegression"})
    config["cascade"] = ca_config
    return config


if __name__ == "__main__":
    X, Y = load2.cirrhosis_data()

    config = gcforest_config()
    gc = GCForest(config)

    AUCs = []
    for i in range(10):
        cv = StratifiedKFold(n_splits=10, shuffle=True)
        # # ==============================================
        mean_fpr = np.linspace(0, 1, 100)
        tprs = []
        aucs = []
        for train, test in cv.split(X, Y):
            x_train = X.iloc[train]
            y_train = Y[train]

            x_test = X.iloc[test]
            y_test = Y[test]
Example 22

# test = pd.read_csv("../data/water/csv/test2018.csv")

# X_test = test.values[:, 0:-1]
# y_test = test.values[:, -1]

# X_test = clean_pipeline.fit_transform(X_test)


# In[13]:

# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size, stratify = y, random_state = random_seed)

# X_train_oversampled, y_train_oversampled = Smoter(X_train, y_train, is_random=True)
config = get_toy_config()
gc = GCForest(config)

gc.fit_transform(X_train_oversampled, y_train_oversampled, X_valid, y_valid)
# y_valid_pred = gc.predict(X_valid)


# In[13]:

# dump
with open("../pkl/2019_gc.pkl", "wb") as f:
    pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)
    
# # load
# with open("../pkl/2018_gc.pkl", "rb") as f:
#     gc = pickle.load(f)
Example 23
            x_u_s = np.concatenate((x_p_test, x_u), axis=0)
            y_u_s = np.concatenate((y_p_test, y_u), axis=0)
            y_u_s = np.zeros(y_u_s.shape[0])

            x = np.concatenate((x_p_s, x_u_s), axis=0)
            y = np.concatenate((y_p_s, y_u_s), axis=0)

            x_train, x_test, y_train, y_test = train_test_split(x,
                                                                y,
                                                                test_size=0.2,
                                                                random_state=1)
            # scaler = StandardScaler().fit(X_train)
            # X_train_transformed = scaler.transform(X_train)
            # X_test_transformed = scaler.transform(X_test)
            config = get_toy_config()
            gc = GCForest(config)
            gc.fit_transform(x_train, y_train)

            scores = gc.predict_proba(x_u_test)[:, 0]
            orderScores = np.argsort(-scores)
            orderList = [str(item) for item in orderScores]
            orderStr = ','.join(orderList)
            top = int(y_u_test.shape[0] * 0.25)
            topNIndex = orderScores[:top]
            t = 0
            while t < top:
                index = topNIndex[t]
                x_n = x_u[index]
                X_n = np.vstack((X_n, x_n))
                t += 1
        X_n = X_n[1:, :]
Example 24
    ca_config["estimators"].append({
        "n_folds": 5,
        "type": "LogisticRegression"
    })
    config["cascade"] = ca_config
    return config


if __name__ == "__main__":
    args = parse_args()
    if args.model is None:
        config = get_toy_config()
    else:
        config = load_json(args.model)

    gc = GCForest(config)
    # If the model uses too much memory, you can tell gcforest not to keep
    # the per-layer models in memory:
    # gc.set_keep_model_in_mem(False)  # default is True

    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    # X_train, y_train = X_train[:2000], y_train[:2000]
    X_train = X_train[:, np.newaxis, :, :]
    X_test = X_test[:, np.newaxis, :, :]

    X_train_enc = gc.fit_transform(X_train, y_train)
    # X_train_enc is the concatenated predict_proba output of each estimator in the last layer of the GCForest model
    # X_train_enc.shape =
    #   (n_datas, n_estimators * n_classes): if a cascade is provided
    #   (n_datas, n_estimators * n_classes, dimX, dimY): if only the fine-grained part is provided
    # You can also pass X_test, y_test to fit_transform; the accuracy on the test data is then logged during training.
Example 25
def GCForest_prediction(feature_data, result_data):
    random_state = 2019
    n_splits = 5
    folds = StratifiedKFold(n_splits=n_splits,
                            shuffle=True,
                            random_state=random_state).split(
                                feature_data, result_data)
    test_pred = np.zeros(feature_data.shape[0])
    test_proba = np.zeros(feature_data.shape[0])
    acc_scores = np.zeros(n_splits)
    recall_scores = np.zeros(n_splits)
    mcc_scores = np.zeros(n_splits)
    f1_scores = np.zeros(n_splits)
    for j, (train_idx, test_idx) in enumerate(folds):
        X_train = feature_data[train_idx]
        Y_train = result_data[train_idx]
        X_test = feature_data[test_idx]
        Y_test = result_data[test_idx]
        config = get_toy_config()
        gc = GCForest(config)  # should be a dict
        X_train_enc = gc.fit_transform(X_train, Y_train)
        part_X_train_enc = X_train_enc[:, ::2]
        y_pred = gc.predict(X_test)
        X_test_enc = gc.transform(X_test)
        part_X_test_enc = X_test_enc[:, ::2]
        y_proba = gc.predict_proba(X_test)[:, 1]
        acc = accuracy_score(Y_test, y_pred)
        print("Test Accuracy of GcForest (save and load) = {:.2f} %".format(
            acc * 100))
        confmat = confusion_matrix(Y_test, y_pred)
        sn = confmat[1, 1] / (confmat[1, 0] + confmat[1, 1])
        sp = confmat[0, 0] / (confmat[0, 0] + confmat[0, 1])
        print('1. The acc score of the model {}\n'.format(
            accuracy_score(Y_test, y_pred)))
        print('2. The sp score of the model {}\n'.format(sp))
        print('3. The sn score of the model {}\n'.format(sn))
        print('4. The mcc score of the model {}\n'.format(
            matthews_corrcoef(Y_test, y_pred)))
        print('9. The auc score of the model {}\n'.format(
            roc_auc_score(Y_test, y_proba, average='macro')))
        print('6. The recall score of the model {}\n'.format(
            recall_score(Y_test, y_pred, average='macro')))
        print('5. The F-1 score of the model {}\n'.format(
            f1_score(Y_test, y_pred, average='macro')))
        print('7. Classification report \n {} \n'.format(
            classification_report(Y_test, y_pred)))
        print('8. Confusion matrix \n {} \n'.format(
            confusion_matrix(Y_test, y_pred)))

        recall = recall_score(Y_test, y_pred, average='macro')
        f1 = f1_score(Y_test, y_pred, average='macro')
        acc = accuracy_score(Y_test, y_pred)
        mcc = matthews_corrcoef(Y_test, y_pred)

        recall_scores[j] = recall
        f1_scores[j] = f1
        acc_scores[j] = acc
        mcc_scores[j] = mcc

        test_pred[test_idx] = y_pred
        test_proba[test_idx] = y_proba
        print("CV- {} recall: {}, acc_score: {} , mcc_score: {}, f1_score: {}".
              format(j, recall, acc, mcc, f1))
    confmat = confusion_matrix(result_data, test_pred)
    sn = confmat[1, 1] / (confmat[1, 0] + confmat[1, 1])
    sp = confmat[0, 0] / (confmat[0, 0] + confmat[0, 1])
    print(
        "-------------------------------------- Deep Forest ------------------------------------"
    )
    print('1. The acc score of the model {}\n'.format(
        accuracy_score(result_data, test_pred)))
    print('2. The sp score of the model {}\n'.format(sp))
    print('3. The sn score of the model {}\n'.format(sn))
    print('4. The mcc score of the model {}\n'.format(
        matthews_corrcoef(result_data, test_pred)))
    print('9. The auc score of the model {}\n'.format(
        roc_auc_score(result_data, test_proba, average='macro')))
    print('6. The recall score of the model {}\n'.format(
        recall_score(result_data, test_pred, average='macro')))
    print('5. The F-1 score of the model {}\n'.format(
        f1_score(result_data, test_pred, average='macro')))
    print('7. Classification report \n {} \n'.format(
        classification_report(result_data, test_pred)))
    print('8. Confusion matrix \n {} \n'.format(
        confusion_matrix(result_data, test_pred)))
Example 26
    for i in range(0, len(val_labels)):
        if val_labels[i] == 0:
            val_fea_0.append(val_fea[i])
        else:
            val_fea_1.append(val_fea[i])
    test_fea = val_fea_1[:int(len(val_fea_1) / 2)] + val_fea_0[:int(len(val_fea_0) / 2)]
    test_labels = [1] * int(len(val_fea_1) / 2) + [0] * int(len(val_fea_0) / 2)
    train_fea = val_fea_1[int(len(val_fea_1) / 2):] + val_fea_0[int(len(val_fea_0) / 2):]
    train_labels = [1] * (len(val_fea_1) - int(len(val_fea_1) / 2)) + [0] * (len(val_fea_0) - int(len(val_fea_0) / 2))
    train_data = [[t, l] for t, l in zip(train_fea, train_labels)]
    test_data = [[d, l] for d, l in zip(test_fea, test_labels)]
    random.shuffle(train_data)
    random.shuffle(test_data)
    test_fea = [d[0] for d in test_data]
    test_labels = [d[1] for d in test_data]
    train_fea = [d[0] for d in train_data]
    train_labels = [d[1] for d in train_data]
    gc = GCForest(get_toy_config())  # should be a dict
    X_train_enc = gc.fit_transform(np.array(train_fea), np.array(train_labels))
    i = 0
    while os.path.exists('./gcForest_model/' + str(i)):
        i += 1
    os.makedirs('./gcForest_model/' + str(i))
    #pickle.dump(gc,open('./gcForest_model/'+ str(i)+'/model.pkl','wb+'),protocol=True)
    y_pred = gc.predict(np.array(test_fea))
    print(classification_report(test_labels, y_pred))
Example 27
def run_classification_configuration(
        X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
        test_idx_10_fold, train_idx_10_fold, rf_tree, rf_max_depth, rf_tree_2,
        rf_max_depth_2, xgb_tree, xgb_max_depth, min_child_weight, lr,
        xgb_tree_2, xgb_max_depth_2, min_child_weight_2, lr_2, layer):

    folds_AUC_testing, folds_AUPR_testing = [], []
    folds_AUC_training, folds_AUPR_training = [], []
    folds_metrics3_training, folds_metrics3_testing = [], []
    test_true_predict_compare, train_true_predict_compare = [], []
    folds_recall_50, folds_recall_100 = [], []
    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(
            X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
            test_idx_10_fold, train_idx_10_fold):

        config = get_toy_config(rf_tree, rf_max_depth, rf_tree_2,
                                rf_max_depth_2, xgb_tree, xgb_max_depth,
                                min_child_weight, lr, xgb_tree_2,
                                xgb_max_depth_2, min_child_weight_2, lr_2,
                                layer)
        gc = GCForest(config)
        print(config)
        X_train_enc = gc.fit_transform(X_train, y_train)

        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)

        temp = pd.DataFrame([y_test, y_predprob_test[:, 1],
                             y_pred_test]).T.sort_values(by=1, ascending=False)
        recall_50 = precision_recall_fscore_support(temp.iloc[:50, :][0],
                                                    temp.iloc[:50, :][2],
                                                    pos_label=1,
                                                    average='binary')[1]
        recall_100 = precision_recall_fscore_support(temp.iloc[:100, :][0],
                                                     temp.iloc[:100, :][2],
                                                     pos_label=1,
                                                     average='binary')[1]

        test_true_predict_compare.append([
            test_idx_fold, y_pred_test, y_test, y_predprob_test[:, 0],
            y_predprob_test[:, 1]
        ])  #10-cv
        train_true_predict_compare.append([
            train_idx_fold, y_pred_train, y_train, y_predprob_train[:, 0],
            y_predprob_train[:, 1]
        ])  #10-cv

        precision_training, recall_training, _ = precision_recall_curve(
            y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(
            y_test, y_predprob_test[:, 1], pos_label=1)

        AUPR_training, AUPR_testing = auc(recall_training,
                                          precision_training), auc(
                                              recall_testing,
                                              precision_testing)
        AUC_training, AUC_testing = roc_auc_score(
            y_train,
            y_predprob_train[:, 1]), roc_auc_score(y_test, y_predprob_test[:,
                                                                           1])

        metrics3_testing = precision_recall_fscore_support(
            y_test, y_pred_test, pos_label=1, average='binary')[:3]
        metrics3_training = precision_recall_fscore_support(
            y_train, y_pred_train, pos_label=1, average='binary')[:3]

        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_metrics3_testing.append(metrics3_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)
        folds_metrics3_training.append(metrics3_training)
        folds_recall_50.append(recall_50)
        folds_recall_100.append(recall_100)
    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)
    Avg_metrics3_training = np.mean(folds_metrics3_training, axis=0)
    Avg_metrics3_testing = np.mean(folds_metrics3_testing, axis=0)

    return [
        Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,
        Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,
        folds_AUPR_training, folds_AUC_training, Avg_metrics3_testing,
        Avg_metrics3_training, folds_recall_50, folds_recall_100
    ], [test_true_predict_compare, train_true_predict_compare]
Example 28
def run_classification_configuration(
        X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
        test_idx_10_fold, train_idx_10_fold, rf_tree, rf_max_depth, rf_tree_2,
        rf_max_depth_2, xgb_tree, xgb_max_depth, min_child_weight, lr,
        xgb_tree_2, xgb_max_depth_2, min_child_weight_2, lr_2, layer, mode,
        seed):

    folds_AUC_testing, folds_AUPR_testing = [], []
    folds_AUC_training, folds_AUPR_training = [], []
    folds_metrics3_training, folds_metrics3_testing = [], []
    test_true_predict_compare, train_true_predict_compare = [], []
    folds_recall_25, folds_recall_50, folds_recall_100, folds_recall_200, folds_recall_400 = [], [], [], [], []
    folds_G_mean = []
    i = 0
    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(
            X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
            test_idx_10_fold, train_idx_10_fold):

        config = get_toy_config(rf_tree, rf_max_depth, rf_tree_2,
                                rf_max_depth_2, xgb_tree, xgb_max_depth,
                                min_child_weight, lr, xgb_tree_2,
                                xgb_max_depth_2, min_child_weight_2, lr_2,
                                layer)
        gc = GCForest(config)
        X_train_enc = gc.fit_transform(X_train, y_train)

        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)

        temp = pd.DataFrame([y_test, y_predprob_test[:, 1],
                             y_pred_test]).T.sort_values(by=1, ascending=False)
        recall_25 = precision_recall_fscore_support(temp.iloc[:25, :][0],
                                                    temp.iloc[:25, :][2],
                                                    pos_label=1,
                                                    average='binary')[1]
        recall_50 = precision_recall_fscore_support(temp.iloc[:50, :][0],
                                                    temp.iloc[:50, :][2],
                                                    pos_label=1,
                                                    average='binary')[1]

        test_true_predict_compare.append([
            test_idx_fold, y_pred_test, y_test, y_predprob_test[:, 0],
            y_predprob_test[:, 1]
        ])  #10-cv
        train_true_predict_compare.append([
            train_idx_fold, y_pred_train, y_train, y_predprob_train[:, 0],
            y_predprob_train[:, 1]
        ])  #10-cv

        precision_training, recall_training, _ = precision_recall_curve(
            y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(
            y_test, y_predprob_test[:, 1], pos_label=1)

        AUPR_training, AUPR_testing = auc(recall_training,
                                          precision_training), auc(
                                              recall_testing,
                                              precision_testing)
        AUC_training, AUC_testing = roc_auc_score(
            y_train,
            y_predprob_train[:, 1]), roc_auc_score(y_test, y_predprob_test[:,
                                                                           1])

        metrics3_testing = precision_recall_fscore_support(
            y_test, y_pred_test, pos_label=1, average='binary')[:3]
        metrics3_training = precision_recall_fscore_support(
            y_train, y_pred_train, pos_label=1, average='binary')[:3]

        tn, fp, fn, tp = confusion_matrix(y_test, y_pred_test,
                                          labels=[0, 1]).ravel()
        specificity = float(tn) / float(tn + fp)
        recall = metrics3_testing[1]
        G_mean = np.sqrt(recall * specificity)

        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_metrics3_testing.append(metrics3_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)
        folds_metrics3_training.append(metrics3_training)
        folds_G_mean.append(G_mean)
        folds_recall_25.append(recall_25)
        folds_recall_50.append(recall_50)
        i += 1
    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)
    Avg_metrics3_training = np.mean(folds_metrics3_training, axis=0)
    Avg_metrics3_testing = np.mean(folds_metrics3_testing, axis=0)
    Avg_G_mean = np.mean(folds_G_mean)

    return [
        Avg_AUPR_training,
        Avg_AUPR_testing,
        folds_AUPR_testing,  #012
        Avg_AUC_training,
        Avg_AUC_testing,
        folds_AUC_testing,  #345
        folds_AUPR_training,
        folds_AUC_training,  #67
        Avg_metrics3_testing,
        Avg_metrics3_training,  #89
        folds_recall_25,
        folds_recall_50,
        folds_G_mean
    ], [test_true_predict_compare, train_true_predict_compare
        ]  #folds_recall_100, folds_recall_200, folds_recall_400,
Example 29
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from dimension_reduction import elasticNet
import utils.tools as utils

from gcforest.gcforest import GCForest
from gcforest.utils.config_utils import load_json

start = time.time()
path1 = 'gcforest4.json'
config = load_json(path1)
gc = GCForest(config)

mask_data = sio.loadmat('yeast_elastic_mask_scale_0.03_0.1.mat')
mask = mask_data.get('yeast_elastic_mask')
extraction = sio.loadmat('yeast_feature_end.mat')
proteinA = extraction.get('feature_A')
protein_A = np.array(proteinA)
proteinB = extraction.get('feature_B')
protein_B = np.array(proteinB)
X_ = np.concatenate((protein_A, protein_B), axis=1)
X_ = np.array(X_)
[row, column] = np.shape(X_)
label_P = np.ones(int(row / 2))
label_N = np.zeros(int(row / 2))
label_ = np.hstack((label_P, label_N))
y_raw = np.mat(label_)
Example 30
    args = parser.parse_args()
    return args


if __name__ == "__main__":

    # config
    args = parse_args()
    if args.model == 'ca':
        config = load_json('./mnist-ca.json')
    elif args.model == 'gc':
        config = load_json('./mnist-gc.json')
    else:
        config = load_json('./mnist-gc.json')

    gc = GCForest(config)

    # gc.set_keep_model_in_mem(False)
    gc.set_keep_model_in_mem(True)

    # data
    data_num_train = 60000  # number of training images
    data_num_test = 10000  # number of test images
    fig_w = 45  # width of each image

    X_train = np.fromfile("./data/mnist_train/mnist_train_data",
                          dtype=np.uint8)
    y_train = np.fromfile("./data/mnist_train/mnist_train_label",
                          dtype=np.uint8)
    X_test = np.fromfile("./data/mnist_test/mnist_test_data", dtype=np.uint8)
    y_test = np.fromfile("./data/mnist_test/mnist_test_label", dtype=np.uint8)