Example #1
0
def simple_statistic(comb):
    """Evaluate the statistic-feature combination ``comb`` across CV folds.

    For every train/test split produced by the module-level ``sfolder``,
    this selects the feature columns whose name prefix (text before '_')
    matches a statistic in ``comb``, rebalances the inner training portion
    with SMOTE, fits an mlens SuperLearner stack, and records fold metrics.

    Parameters
    ----------
    comb : iterable of str
        Statistic names (e.g. 'min', 'max', 'mean') whose derived columns
        are added to the fixed clinical covariates.

    Returns
    -------
    list
        One entry per fold:
        [comb, auc, auprc, acc, precision, recall, f1, fpr, tpr, thresholds].
    """
    resres = []
    for train, test in tqdm(list(sfolder.split(data_x, data_y))):
        # Fixed clinical covariates that are always kept in the model.
        cofff = ['age_interval', 'admission_type_EMERGENCY',
                 'admission_type_ELECTIVE', 'admission_type_URGENT',
                 'aids', 'hem', 'mets']
        X_train, X_test = data_x.iloc[train, :], data_x.iloc[test, :]
        Y_train, Y_test = data_y[train], data_y[test]
        x_train, x_val, y_train, y_val = train_test_split(
            X_train, Y_train, test_size=0.25, random_state=42)

        # NOTE(review): `ratio=` and `fit_sample` are the pre-0.4 imblearn
        # API; newer releases use `sampling_strategy=` / `fit_resample` —
        # confirm the installed imblearn version before changing.
        smo = SMOTE(random_state=42, ratio={1: 2000})
        x_train_s, y_train_s = smo.fit_sample(x_train, y_train)

        # Re-attach the untouched validation slice so the ensemble trains on
        # oversampled-train rows plus the original validation rows.
        x_train_s = pd.DataFrame(x_train_s, columns=x_val.columns)
        X_train_s = pd.concat([x_train_s, x_val], axis=0)
        Y_train_s = np.concatenate([np.asarray(y_train_s), np.asarray(y_val)])

        best_combination_nowfold = comb
        # Keep every column whose statistic prefix appears in the combination.
        for sts in best_combination_nowfold:
            for column in x_train.columns:
                if sts == column.split('_')[0]:
                    cofff.append(column)

        x_train_train = X_train_s[cofff]
        y_train_train = Y_train_s
        x_test = X_test[cofff]
        y_test = Y_test

        ensemble = SuperLearner(scorer=roc_auc_score, random_state=42,
                                folds=10, backend="multiprocessing")
        ensemble.add([GaussianNB(), SVC(C=100, probability=True),
                      neighbors.KNeighborsClassifier(n_neighbors=3),
                      LogisticRegression(), MLPClassifier(),
                      GradientBoostingClassifier(n_estimators=100),
                      RandomForestClassifier(random_state=42, n_estimators=100),
                      BaggingClassifier(), tree.DecisionTreeClassifier()],
                     proba=True)
        ensemble.add_meta(LogisticRegression(), proba=True)
        print('now is here -4\n')
        ensemble.fit(x_train_train, y_train_train)
        print('now is here -5\n')
        preds_prob = ensemble.predict_proba(x_test)
        print('now is here -6\n')
        prob = preds_prob[:, 1]
        # Hard labels at the conventional 0.5 threshold (comprehension
        # replaces the original append loop with stray semicolons).
        preds = [1 if p >= 0.5 else 0 for p in prob]

        auc_sl = roc_auc_score(y_test, prob)
        auprc_sl = average_precision_score(y_test, prob)
        recall_sl = recall_score(y_test, preds)
        acc_sl = accuracy_score(y_test, preds)
        p_sl = precision_score(y_test, preds)
        f1_sl = f1_score(y_test, preds)
        fpr_sl, tpr_sl, thr_sl = roc_curve(y_test, prob)
        print('now is here -7')
        resres.append([best_combination_nowfold, auc_sl, auprc_sl, acc_sl,
                       p_sl, recall_sl, f1_sl, fpr_sl, tpr_sl, thr_sl])
    return resres
Example #2
0
def stacking_training (X,y,X_pred,layer_list,meta_learner):
    """Assemble a multi-layer SuperLearner stack, fit it on (X, y), and
    return the predicted probabilities for X_pred together with the
    fitted ensemble object."""
    ensemble = SuperLearner(folds=5, backend='multiprocessing',
                            model_selection=False)
    # One stacking layer per entry in layer_list.
    for learners in layer_list:
        ensemble.add(learners, proba=True)
        print('基学习器添加成功')
    # Meta learner sits on top of all base layers.
    ensemble.add_meta(meta_learner, proba=True)
    print('元学习器添加成功')
    print('拟合中')
    ensemble.fit(X, y)
    return ensemble.predict_proba(X_pred), ensemble
Example #3
0
def use_pack():
    """Fit a SuperLearner over the module-level base/meta learners and
    print its ROC-AUC on the held-out test set."""
    learner = SuperLearner(
        folds=10,
        random_state=SEED,
        verbose=2,
        # backend="multiprocessing"
    )
    # Base layer plus meta estimator.
    learner.add(list(base_learners.values()), proba=True)
    learner.add_meta(meta_learner, proba=True)
    # Fit the whole stack, then score the positive-class probabilities.
    learner.fit(xtrain, ytrain)
    positive_scores = learner.predict_proba(xtest)[:, 1]
    print("\nSuper Learner ROC-AUC score: %.3f" %
          roc_auc_score(ytest, positive_scores))
Example #4
0
# Manual CV stacking: fit base learners and meta learner by hand over 2 folds.
cv_base_learners, cv_meta_learner = stacking(get_models(), clone(meta_learner),
                                             xtrain.values, ytrain.values,
                                             KFold(2))

P_pred, p = ensemble_predict(cv_base_learners,
                             cv_meta_learner,
                             xtest,
                             verbose=False)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))  # 0.881

## Now, what is the drawback of this approach? It is slow! The parallel
## method below is recommended — it is a big speed-up.
# Instantiate the ensemble with 10 folds
sl = SuperLearner(folds=10,
                  random_state=SEED,
                  verbose=2,
                  backend="multiprocessing")

# Add the base learners and the meta learner
sl.add(list(base_learners.values()), proba=True)
sl.add_meta(meta_learner, proba=True)

# Train the ensemble
sl.fit(xtrain, ytrain)

# Predict the test set
p_sl = sl.predict_proba(xtest)

print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest, p_sl[:, 1]))
# NOTE(review): `P` is not defined anywhere in this snippet — presumably a
# matrix of base-learner probabilities built earlier; confirm before running.
plot_roc_curve(ytest, p.reshape(-1, 1), P.mean(axis=1), ["Simple average"],
               "Super Learner", 'ROC_curve_with_super_learning')  # 0.890
Example #5
0
# Build a 10-fold SuperLearner over the module-level base/meta learners.
sl = SuperLearner(
    folds=10,
    random_state=SEED,
    verbose=2,
    backend="multiprocessing"
)

# Add the base learners and the meta learner
sl.add(list(base_learners.values()), proba=True)
sl.add_meta(meta_learner, proba=True)

# Train the ensemble
sl.fit(X_train_sc, y_train_sc)

# Predict the test set (column 1 holds the positive-class probabilities)
p_sl = sl.predict_proba(X_test_sc)

# print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(y_test_sc, p_sl[:, 1]))


# In[119]:


# Hard labels at a strict 0.5 threshold. Idiomatic comprehension replaces
# the original append loop; note the original used `>` (not `>=`), which
# is preserved here.
pp = [1. if p > 0.5 else 0. for p in p_sl[:, 1]]
Example #6
0
def main():
    """End-to-end pipeline on the cleaned aviation dataset.

    Loads the CSV, balances the Fatal/Non-Fatal classes by downsampling,
    one-hot encodes predictors, trains four base classifiers (MLP, XGBoost,
    LightGBM, Random Forest) plus an mlens SuperLearner ensemble, prints
    validation reports, and plots ROC curves for the test split.
    """
    # Open and read in train x, train y, and scaled test data
    with open('AviationData_cleaned_V3.csv', 'r') as input_all:
        df_raw = pd.read_csv(input_all, encoding = 'utf-8')
    
    # Final check on NA values from selected columns
    print('Check number of NA values from selected columns:\n',
          df_raw.isnull().sum())
    
    # Drop rows containing NA values and reset index
    df_raw.dropna(axis=0, inplace = True)
    df_raw.reset_index(drop = True, inplace = True)
    
    # Prepare response label: fold 'Incident' into the 'Non-Fatal' class
    df_raw['Injury Severity']= df_raw['Injury Severity'].replace('Incident', 'Non-Fatal') 

    # Separate the two classes in the original dataset
    df_none = df_raw.loc[df_raw['Injury Severity'] == 'Non-Fatal']
    df_fatl = df_raw.loc[df_raw['Injury Severity'] == 'Fatal']
    
    # Balance the dataset by downsampling the majority class to n_fatl rows
    n_fatl = len(df_fatl)
    df_none = df_none.sample(n = n_fatl, replace = False, random_state = 117)
    
    # Re-construct dataset
    df_sampled = pd.concat([df_none,df_fatl], ignore_index=True)
    df_sampled.reset_index(drop = True, inplace = True)

    # Separate predictors and response
    df_X = df_sampled.drop(['Injury Severity', 'Airport Code'], axis = 1)
    df_y = df_sampled.loc[: ,  'Injury Severity' ]
    
    # Convert string response to numerical response for convenience
    df_y.replace('Non-Fatal', '0', inplace = True)
    df_y.replace('Fatal', '1', inplace = True)
    
    # Define and apply one-hot encoder to encode predictors
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(df_X)
    df_X = pd.DataFrame(enc.transform(df_X).toarray(), columns = enc.get_feature_names(list(df_X.columns)))
    
    # Separate train and test dataset
    X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.5, random_state=1378)
    
    # Reduce dataset dimension
    #X_train, X_test = dimension_reduction(X_train, y_train, X_test, 80 , method = 'PCA')
  
    # Define MLP classifier
    clf_mlp = MLPClassifier(hidden_layer_sizes=(100), activation='relu', solver='adam', 
                            alpha=0.0001, batch_size='auto', learning_rate='constant', 
                            learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, 
                            random_state=117, tol=0.0001, verbose=False, warm_start=False, 
                            momentum=0.9, nesterovs_momentum=True, early_stopping=False, 
                            validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08,
                            n_iter_no_change=10)
    
    # Define XGBoost classifier
    clf_xgb = xgb.XGBClassifier(booster='gbtree',
                               objective= 'binary:logistic',
                               eval_metric='logloss',
                               tree_method= 'auto',
                               max_depth= 6,
                               min_child_weight= 1,
                               gamma = 0,
                               subsample= 1,
                               colsample_bytree = 1,
                               reg_alpha = 0,
                               reg_lambda = 1,
                               learning_rate = 0.1,
                               seed=27)
    
    # Define LGB Classifier
    clf_lgb = lgb.LGBMClassifier(objective = 'binary',
                                    boosting = 'gbdt',
                                    metric = 'binary_logloss',
                                    num_leaves = 15,
                                    min_data_in_leaf = 10,
                                    max_depth = 5,
                                    bagging_fraction = 0.85,
                                    bagging_freq = 11,
                                    feature_fraction = 0.5,
                                    lambda_l1 = 0.01,
                                    lambda_l2 = 0.3,
                                    num_iterations = 100,
                                    learning_rate = 0.08,
                                    random_state = 117)
    
    # Define random forest classifier
    clf_rf = RandomForestClassifier(n_estimators=300, criterion='gini', 
                                    max_depth=None, min_samples_split=2, 
                                    min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                                    max_features='auto', random_state = 117)
    
    
    # Fit base learners using whole train dataset
    clf_mlp.fit(X_train,y_train)
    clf_xgb.fit(X_train,y_train)
    clf_lgb.fit(X_train,y_train)
    clf_rf.fit(X_train,y_train)
    
    # Generate predicted probability using base learners
    mlp_proba = clf_mlp.predict_proba(X_test)[:, 1]
    xgb_proba = clf_xgb.predict_proba(X_test)[:, 1]
    lgb_proba = clf_lgb.predict_proba(X_test)[:, 1]
    # BUG FIX: the original called clf_lgb here, so the "random forest"
    # probabilities silently duplicated LightGBM's. Use the fitted RF model.
    rf_proba = clf_rf.predict_proba(X_test)[:, 1]
    
    # Initialize prediction using base learners' results
    pred_mlp = pd.Series(np.full(len(y_test), 0)) 
    pred_xgb = pd.Series(np.full(len(y_test), 0)) 
    pred_lgb = pd.Series(np.full(len(y_test), 0)) 
    pred_rf = pd.Series(np.full(len(y_test), 0)) 
    
    # Set threshold
    thres_mlp = 0.5
    thres_xgb = 0.5
    thres_lgb = 0.5
    thres_rf = 0.5
    
    # Make final prediction
    pred_mlp[mlp_proba >= thres_mlp] = 1
    pred_xgb[xgb_proba >= thres_xgb] = 1
    pred_lgb[lgb_proba >= thres_lgb] = 1
    pred_rf[rf_proba >= thres_rf] = 1
    
    # Map test data response into integers
    y_test = list(map(int, y_test))
    
    # Generate prediction report using base learners
    print('\n\nMLP:')
    print_validate(y_test, pred_mlp)
    
    print('\n\nXGB:')
    print_validate(y_test, pred_xgb)
    
    print('\n\nLGB:')
    print_validate(y_test, pred_lgb)
    
    print('\n\nRF:')
    print_validate(y_test, pred_rf)
    
    # Set base learner dictionary
    base_learners = {'mlp': clf_mlp,
                    'xgb': clf_xgb,
                    'lgb' : clf_lgb,
                    'rf': clf_rf
                    }
    
    # Define super learner
    sup_learner = SuperLearner(
                random_state=117
                )
    
    # Add the base learners and the meta learner
    sup_learner.add(list(base_learners.values()), proba = True)
    sup_learner.add_meta(linear_model.BayesianRidge(alpha_1 = 1e-3))
    
    # Train the ensemble
    sup_learner.fit(X_train,y_train)
    
    # Make prediction using super learner
    sl_proba = sup_learner.predict_proba(X_test)
    # NOTE(review): with a BayesianRidge (regressor) meta learner added
    # without proba=True, the shape of `sl_proba` is not obvious from this
    # file; if it is 2-D the boolean mask below will not align with the
    # 1-D `pred_sl` Series — verify against the mlens output.
    pred_sl = pd.Series(np.full(len(y_test), 0)) 
    thres_sl = 0.5
    pred_sl[sl_proba >= thres_sl] = 1
    
    print('\n\nSL:')
    print_validate(y_test, pred_sl)
    
    # ROC Curves for test dataset
    plt.figure(figsize=(8,7))
    draw_roc(y_test, sl_proba, 'Super Learner', 'tab:cyan', '-')
    draw_roc(y_test, mlp_proba, 'MLP NN', 'royalblue', '-')
    draw_roc(y_test, xgb_proba, 'XGBoost', 'lightcoral', '--')
    draw_roc(y_test, lgb_proba, 'LightGBM', 'seagreen', '-.')
    draw_roc(y_test, rf_proba, 'Random Forest', 'darkorange', '-')
    
    plt.plot([0, 1], [0, 1], 'k--', lw = 4)
    plt.xlim([-0.02, 1.0])
    plt.ylim([0.0, 1.02])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves for Test Result')
    plt.legend(loc="lower right", fontsize = 14, handlelength=4)
    plt.show()
Example #7
0
# Load iris and shuffle the sample order with a random permutation.
data = load_iris()
idx = np.random.permutation(150)
X, y = data.data[idx], data.target[idx]


from mlens.ensemble import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# --- Build ---
# A scorer passed to the constructor produces CV scores during fitting.
# It must be a plain function taking two vectors and returning a scalar.
ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)

# First layer: two heterogeneous base learners.
base_layer = [RandomForestClassifier(random_state=seed), SVC()]
ensemble.add(base_layer)

# Final meta estimator sits on top of the base layer.
ensemble.add_meta(LogisticRegression())

# --- Use ---
# Fit on the first 75 shuffled samples, predict on the remainder.
ensemble.fit(X[:75], y[:75])
preds = ensemble.predict_proba(X[75:])
print(preds)
Example #8
0
                                 xtest,
                                 verbose=False)
    # (the lines above are the tail of a call whose opening is outside this
    # snippet — presumably an ensemble prediction helper; do not edit alone)
    print("\nEnsemble (Stacking) ROC-AUC score: %.3f" %
          roc_auc_score(ytest, p))

    # Instantiate the ensemble with 10 folds
    ensemble = SuperLearner(folds=10,
                            random_state=SEED,
                            verbose=2,
                            backend="multiprocessing")

    # Add the base learners and the meta learner
    ensemble.add(list(base_learners.values()), proba=True)
    ensemble.add_meta(meta_learner, proba=True)

    # Train the ensemble
    ensemble.fit(xtrain, ytrain)

    # Predict the test set
    p_sl = ensemble.predict_proba(xtest)

    print("\nSuper Learner ROC-AUC score: %.3f" %
          roc_auc_score(ytest, p_sl[:, 1]))
    # NOTE(review): `P` is not defined in this snippet — presumably a matrix
    # of base-learner probabilities from earlier code; confirm before running.
    plot_roc_curve(ytest, p.reshape(-1, 1), P.mean(axis=1), ["Simple average"],
                   "Super Learner")

    print('-------------------------------------')
    print(test.head())
    # Skip the first column of `test` (likely an ID column — verify).
    y_pred = ensemble.predict(test.iloc[:, 1:].values)
    print(y_pred)
Example #9
0
# Decode the GA individual `indiv` into statistic names, keep the matching
# derived columns plus the fixed covariates, and score a SuperLearner stack
# on a single 70/30 split.
stats_list = ['min', 'max', 'minmax', 'mean', 'std', 'stdmean', 'median',
              'qua25', 'qua75', 'qua2575', 'mode', 'skew', 'kurt', 'first']
cof = ['age', 'gender_F', 'gender_M', 'admission_type_EMERGENCY',
       'admission_type_ELECTIVE', 'admission_type_URGENT',
       'AIDS', 'HEM', 'METS']
# Indices of the statistics switched on (bit == 1) in the GA individual.
id1 = [i for i, x in enumerate(indiv) if x == 1]
stats = [stats_list[i] for i in id1]
# Keep every column whose prefix (text before '_') is a selected statistic.
for sts in stats:
    for column in dataset.columns:
        if sts == column.split('_')[0]:
            cof.append(column)
data_x = dataset[cof]
data_y = data_all['hosp_flag']
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.3)
ensemble = SuperLearner(scorer=roc_auc_score, random_state=42, folds=10,
                        backend="multiprocessing")
ensemble.add([GaussianNB(), SVC(C=100, probability=True),
              neighbors.KNeighborsClassifier(n_neighbors=3),
              LogisticRegression(), MLPClassifier(),
              GradientBoostingClassifier(n_estimators=100),
              RandomForestClassifier(random_state=42, n_estimators=100),
              BaggingClassifier(), tree.DecisionTreeClassifier()],
             proba=True)
ensemble.add_meta(LogisticRegression(), proba=True)
ensemble.fit(x_train, y_train)
preds_prob = ensemble.predict_proba(x_test)
prob = preds_prob[:, 1]
# Hard labels at the 0.5 threshold.
preds = [1 if p >= 0.5 else 0 for p in prob]

auc_sl = roc_auc_score(y_test, prob)
auprc_sl = average_precision_score(y_test, prob)
# BUG FIX: recall_score expects (y_true, y_pred); the original passed the
# arguments swapped, which computes recall with the predictions treated as
# truth (i.e. the precision of the classifier), not its recall.
recall_sl = recall_score(y_test, preds)

# Super learner sensitivity/specificity (ROC curve arrays).
fpr_sl, tpr_sl, thr_sl = roc_curve(y_test, prob)
Example #10
0
    }
    return models

# Gradient-boosting meta learner stacked on top of the base models.
meta_learner = GradientBoostingClassifier(
    n_estimators = 200,
    loss = 'exponential',
    max_features = 4,
    max_depth = 3,
    subsample = 0.5,
    random_state = SEED,
)

s2 = SuperLearner(
    folds = 10,
    random_state = SEED,
    verbose = 2
)

base_learners2 = get_models()

s2.add(list(base_learners2.values()), proba=True)#!!
s2.add_meta(meta_learner, proba=True)
s2.fit(xstd, ytrain.values)

# predict_proba returns (n_samples, 2); keep only the positive-class column.
p_mlens2 = s2.predict_proba(xvstd)[:, 1]
# BUG FIX: `p_mlens2` is already 1-D after the [:, 1] slice above, so the
# original `p_mlens2[:,1]` would raise IndexError (too many indices for a
# 1-D array). Use the vector directly here and below.
roc_auc_score(yvalid, p_mlens2)
result12 = pd.DataFrame(p_mlens2, index=test.PERSONID)
result12.to_csv('result12_637f.csv', sep='\t', header=False)