Example #1
def get_stacked_model(X, y, is_processing=True):
    ensemble = SuperLearner(scorer=accuracy_score, random_state=seed)
    preprocessors = [StandardScaler()] if is_processing else []
    ensemble.add([MyClassifier(5.0)], preprocessing=preprocessors)
    ensemble.add_meta(MyClassifier(0.5))
    ensemble.fit(X, y)
    return ensemble
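Example #1 leans on names defined elsewhere in its module (`MyClassifier`, `seed`, the imports). A minimal self-contained sketch of the same pattern, with scikit-learn estimators standing in for the custom classifier:

from mlens.ensemble import SuperLearner
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

seed = 2017
X, y = make_classification(random_state=seed)

ensemble = SuperLearner(scorer=accuracy_score, random_state=seed)
ensemble.add([SVC()], preprocessing=[StandardScaler()])  # scaled first layer
ensemble.add_meta(LogisticRegression())
ensemble.fit(X, y)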
Example #2
def get_stacked_model(X, y):
    ensemble = SuperLearner(scorer=f1, random_state=seed)
    ensemble.add([RandomForestClassifier(random_state=seed), SVC()])
    ensemble.add_meta(LogisticRegression())
    ensemble.fit(X, y)
    print('f1-score in training')
    print('-m: mean. -s: std')
    print(pd.DataFrame(ensemble.data))
    return ensemble
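The `f1` scorer passed above has to be a plain callable of the form `scorer(y_true, y_pred)`, not a string; presumably it is an alias along these lines (an assumption about the surrounding imports):

from sklearn.metrics import f1_score as f1  # mlens calls scorer(y, p) and expects a scalar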
Example #3
def simple_statistic(comb):
    resres=[]
    for train, test in tqdm(list(sfolder.split(data_x,data_y))):
#        break
        cofff=['age_interval','admission_type_EMERGENCY','admission_type_ELECTIVE','admission_type_URGENT','aids','hem','mets']
#        stats_list=['min','max','minmax','mean','std','stdmean','median','qua25','qua75','qua2575','mode','skew','kurt','first']
        X_train, X_test = data_x.iloc[train,:], data_x.iloc[test,:]
        Y_train, Y_test = data_y[train], data_y[test]
        x_train,x_val,y_train,y_val=train_test_split(X_train,Y_train,test_size=0.25,random_state=42)
        
        smo=SMOTE(random_state=42,ratio={1:2000})
        x_train_s,y_train_s=smo.fit_sample(x_train,y_train)
        
        ### Resample the genetic-algorithm training set to obtain the new GA training set x_train_s
        x_train_s=pd.DataFrame(x_train_s,columns=x_val.columns)
        X_train_s=pd.concat([x_train_s,x_val],axis=0)
        Y_train_s=list(y_train_s)
        Y_train_s.extend(list(y_val))
        Y_train_s=np.array(Y_train_s)
        
        best_combination_nowfold=comb
        for sts in best_combination_nowfold:
            for column in x_train.columns:
                if(sts == column.split('_')[0]):
                    cofff.append(column)
        
        x_train_train=X_train_s[cofff]
        y_train_train=Y_train_s
        x_test=X_test[cofff]
        y_test=Y_test
    
        ensemble = SuperLearner(scorer=roc_auc_score,random_state=42,folds=10,backend="multiprocessing")
        ensemble.add([GaussianNB(),SVC(C=100, probability=True), neighbors.KNeighborsClassifier(n_neighbors=3), LogisticRegression(), MLPClassifier(), GradientBoostingClassifier(n_estimators=100), RandomForestClassifier(random_state=42,n_estimators=100), BaggingClassifier(), tree.DecisionTreeClassifier()],proba=True)
        ensemble.add_meta(LogisticRegression(),proba=True)
        print('now is here -4\n')
        ensemble.fit(x_train_train,y_train_train)
        print('now is here -5\n')
        preds_prob=ensemble.predict_proba(x_test)
        print('now is here -6\n')
        prob=preds_prob[:, 1]
        # Binarize the positive-class probabilities at the 0.5 threshold
        preds = [1 if p >= 0.5 else 0 for p in prob]
                
        auc_sl=roc_auc_score(y_test,preds_prob[:,1])
        auprc_sl=average_precision_score(y_test,preds_prob[:,1])
        recall_sl=recall_score(y_test,preds)
        acc_sl=accuracy_score(y_test,preds)
        p_sl=precision_score(y_test,preds)
        f1_sl=f1_score(y_test,preds)
        fpr_sl,tpr_sl,thr_sl=roc_curve(y_test,prob)
        print('now is here -7')
        resres.append([best_combination_nowfold,auc_sl,auprc_sl,acc_sl,p_sl,recall_sl,f1_sl,fpr_sl,tpr_sl,thr_sl])
    return resres
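A side note on the resampling step: recent imbalanced-learn releases renamed `ratio` to `sampling_strategy` and `fit_sample` to `fit_resample`, so under a current version the SMOTE call would read:

from imblearn.over_sampling import SMOTE

smo = SMOTE(random_state=42, sampling_strategy={1: 2000})
x_train_s, y_train_s = smo.fit_resample(x_train, y_train)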
Example #4
def stacking_training(X, y, X_pred, layer_list, meta_learner):
    stacking_in_layer = SuperLearner(folds=5, backend='multiprocessing', model_selection=False)
    for each in layer_list:
        stacking_in_layer.add(each, proba=True)
        print('base learner layer added')
    stacking_in_layer.add_meta(meta_learner, proba=True)
    print('meta learner added')
    print('fitting')
    stacking_in_layer.fit(X, y)
    pred_proba = stacking_in_layer.predict_proba(X_pred)
    return pred_proba, stacking_in_layer
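A hypothetical invocation of `stacking_training`, assuming two layers of scikit-learn classifiers and a logistic-regression meta learner (the data arrays are whatever the caller has prepared):

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

layer_list = [
    [RandomForestClassifier(random_state=0), SVC(probability=True)],  # first layer
    [LogisticRegression()],                                           # second layer
]
pred_proba, model = stacking_training(X, y, X_pred, layer_list, LogisticRegression())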
Example #5
def get_stacked_model(X, y):
    ensemble = SuperLearner(scorer=accuracy, random_state=seed)
    # call predict_proba instead of predict
    ensemble.add(
        [SVC(probability=True),
         RandomForestClassifier(random_state=seed)],
        proba=True)
    ensemble.add_meta(LogisticRegression())
    ensemble.fit(X, y)
    print('accuracy score in training')
    print('-m: mean. -s: std')
    print(pd.DataFrame(ensemble.data))
    return ensemble
Example #6
def run_ensemble(data, data2, data5, during):
    ensemble = SuperLearner(scorer=accuracy_score, random_state=45, verbose=2)
    ensemble.add(linear_model.LinearRegression())
    ensemble.add_meta([GaussianProcessRegressor()])
    y = data2['prmom'+during+'_f']
    x = data2.drop(['prmom1d_f','prmom1w_f','prmom2w_f','prmom3w_f','uniqcode','date'], axis=1)
    x = x.fillna(0)
    y = np.array(y)
    x = np.array(x)
    ensemble.fit(x, y)
    X = data5.drop(['prmom1d_f','prmom1w_f','prmom2w_f','prmom3w_f','uniqcode','date','pred'], axis=1)
    X = X.fillna(0)
    X = np.array(X)
    preds = ensemble.predict(X)
    data['pred_ensemble'] = preds
    return data
Example #7
def use_pack():
    sl = SuperLearner(
        folds=10,
        random_state=SEED,
        verbose=2,
        # backend="multiprocessing"
    )
    # Add the base learners and the meta learner
    sl.add(list(base_learners.values()), proba=True)
    sl.add_meta(meta_learner, proba=True)
    # Train the ensemble
    sl.fit(xtrain, ytrain)
    # Predict the test set
    p_sl = sl.predict_proba(xtest)

    print("\nSuper Learner ROC-AUC score: %.3f" %
          roc_auc_score(ytest, p_sl[:, 1]))
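`use_pack` presumes that `base_learners`, `meta_learner`, `SEED`, and the data splits already exist; one plausible setup (all names here are assumptions, not part of the original):

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

SEED = 42
base_learners = {
    'rf': RandomForestClassifier(random_state=SEED),
    'gb': GradientBoostingClassifier(random_state=SEED),
}
meta_learner = LogisticRegression()
X, y = make_classification(n_samples=500, random_state=SEED)
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=SEED)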
Example #8
def perform_ensemble_adaboost(X_train, y_train, X_test, y_test):

    all_objects = [
        "Vase", "Teapot", "Bottle", "Spoon", "Plate", "Mug", "Knife", "Fork",
        "Flask", "Bowl"
    ]

    ensemble = SuperLearner(folds=10,
                            random_state=seed,
                            verbose=2,
                            backend="multiprocessing",
                            scorer=accuracy_score)

    layer_1 = [SVC(kernel='linear', C=8)]
    ensemble.add(layer_1)

    # 95.50
    # Make plots of learning curve

    ensemble.add_meta(
        AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=8,
                                   min_samples_split=5,
                                   min_samples_leaf=8)))

    ensemble.fit(X_train, y_train)

    import time

    start = time.time()

    yhat = ensemble.predict(X_test)

    accuracies = cross_val_score(ensemble,
                                 X_test,
                                 y_test,
                                 cv=10,
                                 scoring="accuracy")

    print("Accuracy of Adaboost: {:.2f} %".format(accuracies.mean() * 100))
    print("Standard Deviation of Adaboost: {:.2f} %".format(accuracies.std() *
                                                            100))
Example #9
def test_equivalence_super_learner():
    """[SequentialEnsemble] Test ensemble equivalence with SuperLearner."""
    ens = SuperLearner()
    seq = SequentialEnsemble()

    ens.add(ECM, dtype=np.float64)
    seq.add('stack', ECM, dtype=np.float64)

    F = ens.fit(X, y).predict(X)
    P = seq.fit(X, y).predict(X)

    np.testing.assert_array_equal(P, F)
Example #10
def test_subset_equiv():
    """[Subsemble] Test equivalence with SuperLearner for J=1."""

    sub = Subsemble(partitions=1)
    sl = SuperLearner()

    sub.add(ECM, dtype=np.float64)
    sl.add(ECM, dtype=np.float64)

    F = sub.fit(X, y).predict(X)
    P = sl.fit(X, y).predict(X)

    np.testing.assert_array_equal(P, F)
Example #11
def train_model(ensemble, X, y):
    seed = 2017
    np.random.seed(seed)


    # --- Build ---
    # Passing a scoring function will create cv scores during fitting
    # the scorer should be a simple function accepting two vectors and returning a scalar
    ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)

    # Build the first layer
    # ensemble.add([RandomForestClassifier(random_state=seed), SVC()])

    ensemble.add([IsolationForest(), LOF(novelty=True)])


    # Attach the final meta estimator
    # ensemble.add_meta(LogisticRegression())

    ensemble.add_meta(OCSVM())

    # Fit ensemble
    ensemble.fit(X, y)
Example #12
def add_superlearner(name, models, X_train, Y_train, X_test, Y_test):
    # Establish and reset variables
    acc_score_cv = None
    acc_score = None
    time_ = None
    ensemble = SuperLearner(scorer=accuracy_score, random_state=seed)

    ensemble.add(models)
    # Attach the final meta estimator
    ensemble.add_meta(SVC())

    start = time.time()
    ensemble.fit(X_train, Y_train)
    preds = ensemble.predict(X_test)
    acc_score = accuracy_score(preds, Y_test)
    end = time.time()
    time_ = end - start

    return {
        "Ensemble": name,
        "Meta_Classifier": "SVC",
        "Accuracy_Score": acc_score,
        "Runtime": time_
    }
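A hypothetical call to `add_superlearner`, assuming `seed` and the usual train/test split are already in scope:

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

result = add_superlearner('nb+lr',
                          [GaussianNB(), LogisticRegression()],
                          X_train, Y_train, X_test, Y_test)
print(result)  # {'Ensemble': 'nb+lr', 'Meta_Classifier': 'SVC', ...}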
Example #13
from mlens.ensemble import SuperLearner

# Instantiate the ensemble with 10 folds
sl = SuperLearner(
    folds=10,
    random_state=SEED,
    verbose=2,
    backend="multiprocessing"
)

# Add the base learners and the meta learner
sl.add(list(base_learners.values()), proba=True) 
sl.add_meta(meta_learner, proba=True)

# Train the ensemble
sl.fit(X_train_sc, y_train_sc)

# Predict the test set
p_sl = sl.predict_proba(X_test_sc)

# print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(y_test_sc, p_sl[:, 1]))




pp = []
for p in p_sl[:, 1]:
    if p>0.5:
        pp.append(1.)
    else:
        pp.append(0.)
Example #14
# Initial layer, propagate as before
ensemble.add(estimators, propagate_features=[0, 1])

# Intermediate layer, keep propagating, but add a preprocessing
# pipeline that selects a subset of the input
ensemble.add(estimators,
             preprocessing=[Subset([2, 3])],
             propagate_features=[0, 1])

##############################################################################
# In the above example, the two first features of the original input data
# will be propagated through both layers, but the second layer will not be
# trained on it. Instead, it will only see the predictions made by the base
# learners in the first layer.

ensemble.fit(X, y)
n = list(ensemble.layer_2.learners[0].learner
         )[0].estimator.feature_importances_.shape[0]
m = ensemble.predict(X).shape[1]
print("Num features seen by estimators in intermediate layer: %i" % n)
print("Num features in the output array of the intermediate layer: %i" % m)

##############################################################################
# .. _proba-tutorial:
#
# Probabilistic ensemble learning
# -------------------------------
#
# When the target to predict is a class label, it can often be beneficial to
# let higher-order layers or the meta learner learn from *class probabilities*,
# as opposed to the predicted class. Scikit-learn classifiers can return a
# matrix of predicted class probabilities via ``predict_proba``.
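# A minimal sketch of that pattern (the estimators and data below are
# stand-ins, not part of the original tutorial): passing ``proba=True``
# to ``add`` makes the layer emit class probabilities for the next layer.

from mlens.ensemble import SuperLearner
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(random_state=0)
ens = SuperLearner(random_state=0)
ens.add([RandomForestClassifier(random_state=0)], proba=True)
ens.add_meta(LogisticRegression())
ens.fit(X_demo, y_demo)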
Example #15
seed = 2017
np.random.seed(seed)

data = load_iris()
idx = np.random.permutation(150)
X = data.data[idx]
y = data.target[idx]

# Building an ensemble
from mlens.ensemble import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# --- Multi-layer ensembles ---

ensemble = SuperLearner(scorer=accuracy_score, random_state=seed)

# Build first layer
ensemble.add([RandomForestClassifier(random_state=seed), LogisticRegression()])

# Build the second layer
ensemble.add([LogisticRegression(), SVC()])

# Attach final meta estimator
ensemble.add_meta(SVC())

ensemble.fit(X[:75], y[:75])
preds = ensemble.predict(X[75:])
print("Fit data:\n%r" % ensemble.data)
Example #16
# Passing a scoring function will create cv scores during fitting
# the scorer should be a simple function accepting two vectors and returning a scalar
ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)

# Build the first layer
ensemble.add([RandomForestClassifier(random_state=seed), SVC()])

# Attach the final meta estimator
ensemble.add_meta(LogisticRegression())

## Use the model for training and testing
# start counting time for training
time_train_start = time.perf_counter()  # time.clock() was removed in Python 3.8

# Fit ensemble
ensemble.fit(training_data, training_labels)

# print training time
time_train_end = time.perf_counter()
print("Training finished, training time: %g seconds \n" %
      (time_train_end - time_train_start))

# start counting time for testing
time_test_start = time.perf_counter()

# Predict
preds = ensemble.predict(test_data)

# print testing time
time_test_end = time.perf_counter()
print("Testing finished, testing time: %g seconds  \n" %
Example #17
def main():
    # Open and read in train x, train y, and scaled test data
    with open('AviationData_cleaned_V3.csv', 'r') as input_all:
        df_raw = pd.read_csv(input_all, encoding = 'utf-8')
    
    # Final check on NA values from the selected columns
    print('Check number of NA values from selected columns:\n',
          df_raw.isnull().sum())
    
    # Drop rows containing NA values and reset index
    df_raw.dropna(axis=0, inplace = True)
    df_raw.reset_index(drop = True, inplace = True)
    
    # Prepare response label
    df_raw['Injury Severity']= df_raw['Injury Severity'].replace('Incident', 'Non-Fatal') 

    # Separate the two classes in the original dataset
    df_none = df_raw.loc[df_raw['Injury Severity'] == 'Non-Fatal']
    df_fatl = df_raw.loc[df_raw['Injury Severity'] == 'Fatal']
    
    # Balance Dataset
    n_fatl = len(df_fatl)
    df_none = df_none.sample(n = n_fatl, replace = False, random_state = 117)
    
    # Re-construct dataset
    df_sampled = pd.concat([df_none,df_fatl], ignore_index=True)
    df_sampled.reset_index(drop = True, inplace = True)

    # Separate predictors and response
    df_X = df_sampled.drop(['Injury Severity', 'Airport Code'], axis = 1)
    df_y = df_sampled.loc[: ,  'Injury Severity' ]
    
    # Convert string response to numerical response for convenience
    df_y.replace('Non-Fatal', '0', inplace = True)
    df_y.replace('Fatal', '1', inplace = True)
    
    # Define and apply one-hot encoder to encode predictors
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(df_X)
    df_X = pd.DataFrame(enc.transform(df_X).toarray(), columns = enc.get_feature_names(list(df_X.columns)))
    
    # Separate train and test dataset
    X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.5, random_state=1378)
    
    # Reduce dataset dimension
    #X_train, X_test = dimension_reduction(X_train, y_train, X_test, 80 , method = 'PCA')
  
    # Define MLP classifier
    clf_mlp = MLPClassifier(hidden_layer_sizes=(100), activation='relu', solver='adam', 
                            alpha=0.0001, batch_size='auto', learning_rate='constant', 
                            learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, 
                            random_state=117, tol=0.0001, verbose=False, warm_start=False, 
                            momentum=0.9, nesterovs_momentum=True, early_stopping=False, 
                            validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08,
                            n_iter_no_change=10)
    
    # Define XGBoost classifier
    clf_xgb = xgb.XGBClassifier(booster='gbtree',
                               objective= 'binary:logistic',
                               eval_metric='logloss',
                               tree_method= 'auto',
                               max_depth= 6,
                               min_child_weight= 1,
                               gamma = 0,
                               subsample= 1,
                               colsample_bytree = 1,
                               reg_alpha = 0,
                               reg_lambda = 1,
                               learning_rate = 0.1,
                               seed=27)
    
    # Define LGB Classifier
    clf_lgb = lgb.LGBMClassifier(objective = 'binary',
                                    boosting = 'gbdt',
                                    metric = 'binary_logloss',
                                    num_leaves = 15,
                                    min_data_in_leaf = 10,
                                    max_depth = 5,
                                    bagging_fraction = 0.85,
                                    bagging_freq = 11,
                                    feature_fraction = 0.5,
                                    lambda_l1 = 0.01,
                                    lambda_l2 = 0.3,
                                    num_iterations = 100,
                                    learning_rate = 0.08,
                                    random_state = 117)
    
    # Define random forest classifier
    clf_rf = RandomForestClassifier(n_estimators=300, criterion='gini', 
                                    max_depth=None, min_samples_split=2, 
                                    min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                                    max_features='auto', random_state = 117)
    
    
    # Fit base learners using whole train dataset
    clf_mlp.fit(X_train,y_train)
    clf_xgb.fit(X_train,y_train)
    clf_lgb.fit(X_train,y_train)
    clf_rf.fit(X_train,y_train)
    
    # Generate predicted probability using base learners
    mlp_proba = clf_mlp.predict_proba(X_test)[:, 1]
    xgb_proba = clf_xgb.predict_proba(X_test)[:, 1]
    lgb_proba = clf_lgb.predict_proba(X_test)[:, 1]
    rf_proba = clf_rf.predict_proba(X_test)[:, 1]
    
    # Initialize prediction using base learners' results
    pred_mlp = pd.Series(np.full(len(y_test), 0)) 
    pred_xgb = pd.Series(np.full(len(y_test), 0)) 
    pred_lgb = pd.Series(np.full(len(y_test), 0)) 
    pred_rf = pd.Series(np.full(len(y_test), 0)) 
    
    # Set threshold
    thres_mlp = 0.5
    thres_xgb = 0.5
    thres_lgb = 0.5
    thres_rf = 0.5
    
    # Make final prediction
    pred_mlp[mlp_proba >= thres_mlp] = 1
    pred_xgb[xgb_proba >= thres_xgb] = 1
    pred_lgb[lgb_proba >= thres_lgb] = 1
    pred_rf[rf_proba >= thres_rf] = 1
    
    # Map test data response into integers
    y_test = list(map(int, y_test))
    
    # Generate prediction report using base learners
    print('\n\nMLP:')
    print_validate(y_test, pred_mlp)
    
    print('\n\nXGB:')
    print_validate(y_test, pred_xgb)
    
    print('\n\nLGB:')
    print_validate(y_test, pred_lgb)
    
    print('\n\nRF:')
    print_validate(y_test, pred_rf)
    
    # Set base learner dictionary
    base_learners = {'mlp': clf_mlp,
                    'xgb': clf_xgb,
                    'lgb' : clf_lgb,
                    'rf': clf_rf
                    }
    
    # Define super learner
    sup_learner = SuperLearner(
                random_state=117
                )
    
    # Add the base learners and the meta learner
    sup_learner.add(list(base_learners.values()), proba = True)
    sup_learner.add_meta(linear_model.BayesianRidge(alpha_1 = 1e-3))
    
    # Train the ensemble
    sup_learner.fit(X_train,y_train)
    
    # Make prediction using super learner
    sl_proba = sup_learner.predict_proba(X_test)
    pred_sl = pd.Series(np.full(len(y_test), 0)) 
    thres_sl = 0.5
    pred_sl[sl_proba >= thres_sl] = 1
    
    print('\n\nSL:')
    print_validate(y_test, pred_sl)
    
    # ROC Curves for test dataset
    plt.figure(figsize=(8,7))
    draw_roc(y_test, sl_proba, 'Super Learner', 'tab:cyan', '-')
    draw_roc(y_test, mlp_proba, 'MLP NN', 'royalblue', '-')
    draw_roc(y_test, xgb_proba, 'XGBoost', 'lightcoral', '--')
    draw_roc(y_test, lgb_proba, 'LightGBM', 'seagreen', '-.')
    draw_roc(y_test, rf_proba, 'Random Forest', 'darkorange', '-')
    
    plt.plot([0, 1], [0, 1], 'k--', lw = 4)
    plt.xlim([-0.02, 1.0])
    plt.ylim([0.0, 1.02])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves for Test Result')
    plt.legend(loc="lower right", fontsize = 14, handlelength=4)
    plt.show()
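One caution about the thresholding step above: if `predict_proba` returns a two-column matrix (as it does for the base learners earlier in this function), the positive-class column has to be selected before comparing against the threshold. A defensive sketch, reusing the names from this example:

import numpy as np

sl_scores = np.asarray(sl_proba)
if sl_scores.ndim == 2:               # (n_samples, n_classes): keep P(class 1)
    sl_scores = sl_scores[:, 1]
pred_sl = pd.Series((sl_scores >= thres_sl).astype(int))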
Example #18
    # E.train_base_learners(models,xtrain_base,ytrain_base,True)
    # P_base = E.predict_base_learners(models, xpred_base)
    # meta_learner.fit(P_base, ypred_base)
    # P_pred, p = E.ensemble_predict(models,meta_learner,X_test)
    #
    # print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(y_test, p))

    base_learners, meta_learner1 = E.stacking(models, clone(meta_learner),
                                              xtrain_base, ytrain_base,
                                              KFold(2))
    P_pred, p = E.ensemble_predict(base_learners, meta_learner1, X_test)
    print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(y_test, p))

    sl = SuperLearner(folds=10,
                      random_state=SEED,
                      verbose=2,
                      backend="multiprocessing")

    # Add the base learners and the meta learner
    sl.add(list(models.values()), proba=True)
    sl.add_meta(meta_learner, proba=True)

    # Train the ensemble
    sl.fit(X_train, y_train)

    # Predict the test set
    p_sl = sl.predict_proba(X_test)

    print("\nSuper Learner ROC-AUC score: %.3f" %
          roc_auc_score(y_test, p_sl[:, 1]))
Example #19
    }
    return models

meta_learner = GradientBoostingClassifier(
    n_estimators = 200,
    loss = 'exponential',
    max_features = 4,
    max_depth = 3,
    subsample = 0.5,
    random_state = SEED,
)

s2 = SuperLearner(
    folds = 10,
    random_state = SEED,
    verbose = 2
)

base_learners2 = get_models()

s2.add(list(base_learners2.values()), proba=True)#!!
s2.add_meta(meta_learner, proba=True)
s2.fit(xstd, ytrain.values)

p_mlens2 = s2.predict_proba(xvstd)[:, 1]
roc_auc_score(yvalid, p_mlens2)
result12 = pd.DataFrame(p_mlens2, index=test.PERSONID)
result12.to_csv('result12_637f.csv', sep='\t', header=False)
    

Example #20
from mlens.ensemble import SuperLearner

val_train, val_test = train_test_split(train,test_size=0.3,random_state=SEED,stratify=train['Survived'])
val_Xtrain=val_train[val_train.columns[1:]]
val_ytrain=val_train[val_train.columns[:1]]
val_Xtest=val_test[val_test.columns[1:]]
val_ytest=val_test[val_test.columns[:1]]
# Instantiate the ensemble with 10 folds
super_learner = SuperLearner(folds=10,random_state=SEED,verbose=2,backend='multiprocessing')
# Add the base learners and the meta learner
super_learner.add(list(base_learners().values()),proba=True)
super_learner.add_meta(LogisticRegression(), proba=True)

# Train the ensemble
super_learner.fit(val_Xtrain,val_ytrain)
# predict the test set
p_ens = super_learner.predict_proba(val_Xtest)[:,1]
p_ens_label = 1*(p_ens>=0.5)
print('The accuracy of super learner:',metrics.accuracy_score(p_ens_label, val_ytest))


# ### Producing the Submission file
# 
# Finally, having trained the base and meta learners, we can output the predictions in the proper format for submission to the Titanic competition as follows:



# Generate Submission File 
Submission = pd.DataFrame({ 'PassengerId': PassengerId_test,
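The excerpt cuts off mid-statement; a typical completion for the Titanic submission format (assumed, not from the original) would be:

Submission = pd.DataFrame({'PassengerId': PassengerId_test,
                           'Survived': p_ens_label})
Submission.to_csv('titanic_submission.csv', index=False)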
Example #21
print('Recall:', '%.6f' % recall_score(y_test, ans))
fpr, tpr, thresholds = roc_curve(y_test, ans)
print('AUC:', '%.6f' % auc(fpr, tpr))

#-------------------------------------------------------------------------------------------------#
'''ensemble SL1'''
seed = 2018
np.random.seed(seed)
ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)
ensemble.add([
    ExtraTreesClassifier(n_estimators=25, random_state=seed),
    KNeighborsClassifier(n_neighbors=2),
    AdaBoostClassifier(n_estimators=100)
])
ensemble.add_meta(SVC())
ensemble.fit(X_train, y_train)
ans = ensemble.predict(X_test)
FP, FN, TP, TN = conf_matrix(y_test, ans)
print('--------------------Super Learner--------------------')  #test 78.85%
print('Precision:', '%.6f' % precision_score(y_test, ans))
print('Recall:', '%.6f' % recall_score(y_test, ans))
fpr, tpr, thresholds = roc_curve(y_test, ans)
print('AUC:', '%.6f' % auc(fpr, tpr))
'''ensemble SL2'''
#seed = 2018
#np.random.seed(seed)
#ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)
#ensemble.add([ExtraTreesClassifier(n_estimators=30,random_state=seed),AdaBoostClassifier(n_estimators=100)])
#ensemble.add_meta(SVC())
#ensemble.fit(X_train,y_train)
#ans = ensemble.predict(X_test)
Example #22
print(Xv)
print(yt)
print(yv)

# fillna returns a copy, so the result must be reassigned
Xt = Xt.fillna(-1)
Xv = Xv.fillna(-1)
yt = yt.fillna(-1)
yv = yv.fillna(-1)

print(Xt)
'''
for clf in stacked_clf_list:
    ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, folds=10)
    ensemble.add(clf[0])
    ensemble.add_meta(lr)
    ensemble.fit(Xt, yt)
    preds = ensemble.predict(Xv)
    accuracy = accuracy_score(preds, yv)

    if accuracy > best_combination[0]:
        best_combination[0] = accuracy
        best_combination[1] = clf[1]
        preds = ensemble.predict(X_test)
        best_preds = preds

    print(f"Accuracy score: {accuracy} {clf[1]}")
    print(
        f"\nBest stacking model is {best_combination[1]} with accuracy of: {best_combination[0]}"
    )  # Output

print(best_preds)
Example #23
                continue

            num_in_layer = int(layer_weights[j] / weights_total * num_to_slot)

            layerlist = []

            for k in range(num_in_layer):
                layerlist.append(eval_ind.pop())

            ens.add(layerlist)

        # then add the meta model
        ens.add_meta(lgbm(n_estimators=1000, verbose=-1))

        try:
            ens.fit(X_train, y_train)
            train_score = f1_score(ens.predict(X_train), y_train)
            test_score = f1_score(ens.predict(X_test), y_test)
            real_score = train_score * test_score
            print(' Training score is {}'.format(train_score))
            print(' Testing score is {}'.format(test_score))
            print(' Real score is {}'.format(real_score))
        except Exception:
            print(' There was an error with this one. Throwing it out')
            continue

        if real_score > highest_score:
            print(' New highest score found!')
            highest_score = real_score
            winning_model = ens
'''
Example #24
cv_base_learners, cv_meta_learner = stacking(get_models(), clone(meta_learner),
                                             xtrain.values, ytrain.values,
                                             KFold(2))

P_pred, p = ensemble_predict(cv_base_learners,
                             cv_meta_learner,
                             xtest,
                             verbose=False)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))  # 0.881

## Now think about it: what is the drawback of this approach? It is slow! The parallel method below is recommended; it is much faster.
# Instantiate the ensemble with 10 folds
sl = SuperLearner(folds=10,
                  random_state=SEED,
                  verbose=2,
                  backend="multiprocessing")

# Add the base learners and the meta learner
sl.add(list(base_learners.values()), proba=True)
sl.add_meta(meta_learner, proba=True)

# Train the ensemble
sl.fit(xtrain, ytrain)

# Predict the test set
p_sl = sl.predict_proba(xtest)

print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest, p_sl[:, 1]))
plot_roc_curve(ytest, p.reshape(-1, 1), P.mean(axis=1), ["Simple average"],
               "Super Learner", 'ROC_curve_with_super_learning')  # 0.890
Example #25
                                 xtest,
                                 verbose=False)
    print("\nEnsemble (Stacking) ROC-AUC score: %.3f" %
          roc_auc_score(ytest, p))

    # Instantiate the ensemble with 10 folds
    ensemble = SuperLearner(folds=10,
                            random_state=SEED,
                            verbose=2,
                            backend="multiprocessing")

    # Add the base learners and the meta learner
    ensemble.add(list(base_learners.values()), proba=True)
    ensemble.add_meta(meta_learner, proba=True)

    # Train the ensemble
    ensemble.fit(xtrain, ytrain)

    # Predict the test set
    p_sl = ensemble.predict_proba(xtest)

    print("\nSuper Learner ROC-AUC score: %.3f" %
          roc_auc_score(ytest, p_sl[:, 1]))
    plot_roc_curve(ytest, p.reshape(-1, 1), P.mean(axis=1), ["Simple average"],
                   "Super Learner")

    print('-------------------------------------')
    print(test.head())
    y_pred = ensemble.predict(test.iloc[:, 1:].values)
    print(y_pred)
Example #26
    RandomForestClassifier(random_state=seed, n_estimators=250),
    SVC(),
    LassoLarsIC(criterion='bic'),
    ElasticNet(random_state=0),
    BayesianRidge(),
    MLPClassifier(),
    BaggingClassifier(),
    neighbors.KNeighborsClassifier(),
    tree.DecisionTreeClassifier(),
    GradientBoostingClassifier(n_estimators=200)
])

# Attach the final meta estimator
ensemble.add_meta(LogisticRegression())

ensemble.fit(x_train, y_train)
preds = ensemble.predict(x_test)

ensemble_data = pd.DataFrame(ensemble.data)
auroc = roc_auc_score(y_test, preds)
acc = accuracy_score(y_test, preds)

p = precision_score(y_test, preds)
r = recall_score(y_test, preds)

fpr, tpr, thresholds = roc_curve(y_test, preds)

fig = plt.figure()
plt.plot(fpr, tpr)
plt.show()
Example #27
from mlens.ensemble import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# %% Preparing the dataset and the output label
dataset = np.loadtxt('../dataset/train.csv', dtype=str, delimiter=",")
dataset, outcome = prgm1.pre_processing(dataset)
partition = np.round(0.8 * dataset.shape[0]).__int__()
train_set = dataset[0:partition, :]
test_set = dataset[partition:, :]

# %% Training

test_outcome = np.array(outcome[partition:]).astype(int)
train_outcome = np.array(outcome[0:partition]).astype(int)

seed = 2017
np.random.seed(seed)
ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)
# Build the first layer
ensemble.add([RandomForestClassifier(random_state=seed), SVC(gamma="auto")])
# Attach the final meta estimator
ensemble.add_meta(LogisticRegression())
# Fit ensemble (gamma is an SVC hyperparameter, not a fit() argument)
ensemble.fit(train_set, train_outcome)
# Predict
preds = ensemble.predict(test_set)
print("Fit data:\n%r" % ensemble.data)
print("Prediction score: %.3f" % accuracy_score(preds, test_outcome))