Example #1
def build_ensemble(incl_meta, proba, propagate_features=[0, 1]):
    """Return an ensemble (xgb, lgb and lr are assumed to be defined globally)."""
    if propagate_features:
        n = len(propagate_features)
        propagate_features_1 = propagate_features
        propagate_features_2 = [i for i in range(n)]
    else:
        propagate_features_1 = propagate_features_2 = None

    estimators_layer1 = [xgb]
    estimators_layer2 = [lgb]
    # Further layers (rf, et, ...) can be appended the same way.

    ensemble = SuperLearner()

    ensemble.add(estimators_layer1,
                 proba=proba,
                 propagate_features=propagate_features_1)
    ensemble.add(estimators_layer2,
                 proba=proba,
                 propagate_features=propagate_features_2)

    if incl_meta:
        ensemble.add_meta(lr)

    return ensemble
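# Usage sketch for the function above. The names xgb, lgb and lr are not
# defined in the original snippet; the stand-in estimators and toy data
# below are assumptions, not part of the source.
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

xgb = GradientBoostingClassifier()   # stand-in for an XGBoost classifier
lgb = RandomForestClassifier()       # stand-in for a LightGBM classifier
lr = LogisticRegression()

X, y = make_classification(n_samples=200, n_features=8, random_state=0)
ensemble = build_ensemble(incl_meta=True, proba=True)
ensemble.fit(X, y)
print(ensemble.predict(X)[:5])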
Example #2
def get_super_learner():
    base_learners = [elastic_net, xgboost, light_gbm]
    meta_learner = LinearRegression(fit_intercept=False)
    ensemble = SuperLearner(folds=2, shuffle=False)
    ensemble.add(base_learners)
    ensemble.add_meta(meta_learner)
    return ensemble
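# The base learners above are assumed to be defined elsewhere. A minimal,
# hypothetical setup (the estimator choices and the availability of the
# xgboost/lightgbm packages are assumptions):
from sklearn.linear_model import ElasticNet, LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

elastic_net = ElasticNet()
xgboost = XGBRegressor(n_estimators=100)
light_gbm = LGBMRegressor(n_estimators=100)

model = get_super_learner()
# model.fit(X_train, y_train); preds = model.predict(X_test)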
Example #3
def get_stacked_model(X, y, is_processing=True):
    ensemble = SuperLearner(scorer=accuracy_score, random_state=seed)
    preprocessors = [StandardScaler()] if is_processing else []
    ensemble.add([MyClassifier(5.0)], preprocessing=preprocessors)
    ensemble.add_meta(MyClassifier(0.5))
    ensemble.fit(X, y)
    return ensemble
Example #4
def build_ensemble(incl_meta,
                   meta_type='log',
                   preprocessors=None,
                   estimators=None,
                   propagate_features=None):
    if propagate_features:
        n = len(propagate_features)
        propagate_features_1 = propagate_features
        propagate_features_2 = [i for i in range(n)]
    else:
        propagate_features_1 = propagate_features_2 = None

    if not estimators:
        estimators = [('rfr', RandomForestRegressor(random_state=seed)),
                      ('svr', SVR()), ('rdg', Ridge())]

    ensemble = SuperLearner()
    ensemble.add(estimators, propagate_features=propagate_features_1)
    ensemble.add(estimators, propagate_features=propagate_features_2)

    if incl_meta and meta_type == 'log':
        ensemble.add_meta(LogisticRegression())
    elif incl_meta and meta_type == 'lin':
        ensemble.add_meta(LinearRegression())

    return ensemble
Example #5
def get_ensemble():
    sl = SuperLearner(folds=10,
                      random_state=seed,
                      verbose=2,
                      backend='multiprocessing')
    sl.add(list(get_models().values()), proba=True)
    sl.add_meta(get_meta(), proba=True)
    return sl
Example #6
def get_super_learner(X):    
    ensemble = SuperLearner(scorer=rmse, folds=2, shuffle=True, sample_size=len(X))
    # Add base models
    models = get_models()    
    ensemble.add(models)
    # Add the meta model    
    ensemble.add_meta(LinearRegression())    
    return ensemble
Example #7
class GatedEnsembleClassifier(_MLensAdapter):
    """Ensemble of classifiers, whose predictions are joined by using
    a further meta-learner, which decides the final output based on the
    prediction of the base classifiers.

    This classifier uses :class:`mlens.ensemble.SuperLearner`
    to implement the *gating* functionality.

    The parameters, and their default values, are:

    - **meta_layer**: Name of the classifier to use as a *meta layer*. By
        default this is `single_layer_perceptron`
    - **folds**: The number of folds to use for cross validation when
        generating the training set for the **meta_layer**. The default
        value for this is `2`.

        For a better explanation of this parameter, see:

        *Polley, Eric C.
        and van der Laan, Mark J., “Super Learner In Prediction” (May 2010).
        U.C. Berkeley Division of Biostatistics Working Paper Series.
        Working Paper 266*
        `<https://biostats.bepress.com/ucbbiostat/paper266/>`_
    """
    def __init__(self, num_features, **kwargs):
        super(GatedEnsembleClassifier, self).__init__()

        kwargs = {**constants.GATED_ENSEMBLE_PARAMS, **kwargs}

        self.num_features = num_features
        self.num_folds = kwargs.pop('folds', 2)
        self.meta_layer = kwargs.pop('meta_layer')

        estimators = []
        for clf in constants.CLASSIFIERS_FOR_ENSEMBLE:
            model = utils.init_model(clf,
                                     num_features=self.num_features,
                                     **kwargs)

            estimators.append((clf, model.kernel))

        self.kernel = SuperLearner(verbose=2, n_jobs=1, folds=self.num_folds)

        # use as output the probability of a given class (not just
        # the class itself)
        self.kernel.add(estimators, proba=True)

        self.kernel.add_meta(
            utils.init_model(self.meta_layer,
                             len(estimators) * self.num_folds,
                             **kwargs).kernel,
            proba=True,
        )

    def __repr__(self):
        return (f'{self.__class__.__name__}('
                f'num_folds={self.num_folds}, '
                f'meta_layer={self.meta_layer}) ')
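# A self-contained sketch of the same gating pattern outside soweego
# (the estimator choices and toy data are illustrative assumptions):
from mlens.ensemble import SuperLearner
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

X, y = make_classification(n_samples=300, random_state=0)

gate = SuperLearner(folds=2, n_jobs=1)
# With proba=True the base layer emits class probabilities, so the meta
# layer sees n_estimators * n_classes input columns (2 * 2 = 4 here).
gate.add([GaussianNB(), LogisticRegression()], proba=True)
gate.add_meta(MLPClassifier(max_iter=500), proba=True)
gate.fit(X, y)
print(gate.predict_proba(X)[:3])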
Example #8
def simple_statistic(comb):
    resres=[]
    for train, test in tqdm(list(sfolder.split(data_x,data_y))):
#        break
        cofff=['age_interval','admission_type_EMERGENCY','admission_type_ELECTIVE','admission_type_URGENT','aids','hem','mets']
#        stats_list=['min','max','minmax','mean','std','stdmean','median','qua25','qua75','qua2575','mode','skew','kurt','first']
        X_train, X_test = data_x.iloc[train,:], data_x.iloc[test,:]
        Y_train, Y_test = data_y[train], data_y[test]
        x_train,x_val,y_train,y_val=train_test_split(X_train,Y_train,test_size=0.25,random_state=42)
        
        # Note: ratio= and fit_sample() are the old imbalanced-learn API
        # (renamed sampling_strategy= and fit_resample() in later versions).
        smo = SMOTE(random_state=42, ratio={1: 2000})
        x_train_s, y_train_s = smo.fit_sample(x_train, y_train)
        
        ### Resample the genetic algorithm's training set to obtain the new GA training set x_train_s
        x_train_s=pd.DataFrame(x_train_s,columns=x_val.columns)
        X_train_s=pd.concat([x_train_s,x_val],axis=0)
        Y_train_s=list(y_train_s)
        Y_train_s.extend(list(y_val))
        Y_train_s=np.array(Y_train_s)
        
        best_combination_nowfold=comb
        for sts in best_combination_nowfold:
            for column in x_train.columns:
                if(sts == column.split('_')[0]):
                    cofff.append(column)
        
        x_train_train=X_train_s[cofff]
        y_train_train=Y_train_s
        x_test=X_test[cofff]
        y_test=Y_test
    
        ensemble = SuperLearner(scorer=roc_auc_score,random_state=42,folds=10,backend="multiprocessing")
        ensemble.add([GaussianNB(),
                      SVC(C=100, probability=True),
                      neighbors.KNeighborsClassifier(n_neighbors=3),
                      LogisticRegression(),
                      MLPClassifier(),
                      GradientBoostingClassifier(n_estimators=100),
                      RandomForestClassifier(random_state=42, n_estimators=100),
                      BaggingClassifier(),
                      tree.DecisionTreeClassifier()],
                     proba=True)
        ensemble.add_meta(LogisticRegression(),proba=True)
        print('now is here -4\n')
        ensemble.fit(x_train_train,y_train_train)
        print('now is here -5\n')
        preds_prob=ensemble.predict_proba(x_test)
        print('now is here -6\n')
        prob=preds_prob[:, 1]
        preds = [1 if p >= 0.5 else 0 for p in prob]
                
        auc_sl=roc_auc_score(y_test,preds_prob[:,1])
        auprc_sl=average_precision_score(y_test,preds_prob[:,1])
        recall_sl=recall_score(y_test,preds)
        acc_sl=accuracy_score(y_test,preds)
        p_sl=precision_score(y_test,preds)
        f1_sl=f1_score(y_test,preds)
        fpr_sl,tpr_sl,thr_sl=roc_curve(y_test,prob)
        print('now is here -7')
        resres.append([best_combination_nowfold,auc_sl,auprc_sl,acc_sl,p_sl,recall_sl,f1_sl,fpr_sl,tpr_sl,thr_sl])
    return resres
Example #9
def get_stacked_model(X, y):
    ensemble = SuperLearner(scorer=f1, random_state=seed)
    ensemble.add([RandomForestClassifier(random_state=seed), SVC()])
    ensemble.add_meta(LogisticRegression())
    ensemble.fit(X, y)
    print('f1-score in training')
    print('-m: mean. -s: std')
    print(pd.DataFrame(ensemble.data))
    return ensemble
Example #10
def get_super_learner(X):
    ensemble = SuperLearner(scorer=accuracy_score,
                            folds=10,
                            shuffle=True,
                            sample_size=len(X))
    # add base models
    models = get_models()
    ensemble.add(models)
    # add the meta model
    ensemble.add_meta(LogisticRegression(solver='lbfgs'))
    return ensemble
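# get_models() is assumed to return the list of base classifiers; one
# plausible definition (the choice of models is an assumption):
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

def get_models():
    return [LogisticRegression(solver='lbfgs'), DecisionTreeClassifier(),
            KNeighborsClassifier(), GaussianNB()]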
Example #11
def stacking_training(X, y, X_pred, layer_list, meta_learner):
    stacking_in_layer = SuperLearner(folds=5, backend='multiprocessing', model_selection=False)
    for each in layer_list:
        stacking_in_layer.add(each, proba=True)
        print('Base learner layer added')
    stacking_in_layer.add_meta(meta_learner, proba=True)
    print('Meta learner added')
    print('Fitting...')
    stacking_in_layer.fit(X, y)
    pred_proba = stacking_in_layer.predict_proba(X_pred)
    return pred_proba, stacking_in_layer
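# Usage sketch: layer_list is a list of layers, each itself a list of
# classifiers that implement predict_proba (the setup below is an
# assumption, not from the original source):
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

layer_list = [[RandomForestClassifier(), GaussianNB()],
              [LogisticRegression()]]
# pred_proba, model = stacking_training(X, y, X_pred, layer_list,
#                                       LogisticRegression())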
Example #12
def get_stacked_model(X, y):
    ensemble = SuperLearner(scorer=accuracy, random_state=seed)
    # call predict_proba instead of predict
    ensemble.add(
        [SVC(probability=True),
         RandomForestClassifier(random_state=seed)],
        proba=True)
    ensemble.add_meta(LogisticRegression())
    ensemble.fit(X, y)
    print('accuracy score in training')
    print('-m: mean. -s: std')
    print(pd.DataFrame(ensemble.data))
    return ensemble
Example #13
def run_ensemble(data, data2, data5, during):
    ensemble = SuperLearner(scorer=accuracy_score, random_state=45, verbose=2)
    ensemble.add(linear_model.LinearRegression())
    ensemble.add_meta([GaussianProcessRegressor()])
    y = data2['prmom' + during + '_f']
    x = data2.drop(['prmom1d_f', 'prmom1w_f', 'prmom2w_f', 'prmom3w_f', 'uniqcode', 'date'], axis=1)
    x = x.fillna(0)
    y = np.array(y)
    x = np.array(x)
    ensemble.fit(x, y)
    X = data5.drop(['prmom1d_f', 'prmom1w_f', 'prmom2w_f', 'prmom3w_f', 'uniqcode', 'date', 'pred'], axis=1)
    X = X.fillna(0)
    X = np.array(X)
    preds = ensemble.predict(X)
    data['pred_ensemble'] = preds
    return data
Example #14
def use_pack():
    sl = SuperLearner(
        folds=10,
        random_state=SEED,
        verbose=2,
        # backend="multiprocessing"
    )
    # Add the base learners and the meta learner
    sl.add(list(base_learners.values()), proba=True)
    sl.add_meta(meta_learner, proba=True)
    # Train the ensemble
    sl.fit(xtrain, ytrain)
    # Predict the test set
    p_sl = sl.predict_proba(xtest)

    print("\nSuper Learner ROC-AUC score: %.3f" %
          roc_auc_score(ytest, p_sl[:, 1]))
Example #15
def build_ensemble(incl_meta, propagate_features=None):
    if propagate_features:
        n = len(propagate_features)
        propagate_features_1 = propagate_features
        propagate_features_2 = [i for i in range(n)]
    else:
        propagate_features_1 = propagate_features_2 = None

    estimators = [RandomForestRegressor(random_state=seed), SVR()]

    ensemble = SuperLearner()
    ensemble.add(estimators, propagate_features=propagate_features_1)
    ensemble.add(estimators, propagate_features=propagate_features_2)

    if incl_meta:
        ensemble.add_meta(LogisticRegression())

    return ensemble
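# Sketch of what feature propagation does, on toy data (the dataset and
# seed are assumptions). Propagated columns are placed first in each
# layer's output, ahead of the base-learner predictions, which is why
# layer 2 can re-select them with indices [0, ..., n-1].
import numpy as np
from sklearn.datasets import make_classification

seed = 2017
X, y = make_classification(n_samples=100, n_features=4, random_state=seed)

ens = build_ensemble(incl_meta=False, propagate_features=[0, 1])
ens.fit(X, y)
P = ens.predict(X)
print(np.allclose(P[:, :2], X[:, :2]))  # expected True: raw columns ride along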
Example #16
def get_model(param: dict) -> BaseEstimator:
    model_name = param.pop('name')
    if model_name == 'xgb':
        return XGBRegressor(**param[model_name])
    elif model_name == 'lgb':
        return LGBMRegressor(**param[model_name])
    elif model_name == 'cb':
        return CatBoostRegressor(**param[model_name])
    elif model_name == 'rf':
        return RandomForestRegressor(**param[model_name])
    elif model_name == 'svm':
        return make_pipeline(StandardScaler(), SVR(**param[model_name]))
    elif model_name == 'knn':
        return make_pipeline(StandardScaler(), KNeighborsRegressor(**param[model_name]))
    elif model_name == 'mlp':
        return make_pipeline(StandardScaler(), MLPRegressor(**param[model_name]))
    elif model_name == 'vote':
        return VotingRegressor(estimators=[
            ('svm', get_model(dict(param, name='svm'))),
            ('rf', get_model(dict(param, name='rf'))),
            ('lgb', get_model(dict(param, name='lgb'))),
            ('knn', get_model(dict(param, name='knn'))),
        ])
    elif model_name == 'stack':
        model = SuperLearner(scorer=mean_squared_error, random_state=132)
        model.add([
            get_model(dict(param, name='svm')),
            get_model(dict(param, name='rf')),
            get_model(dict(param, name='lgb')),
            get_model(dict(param, name='knn')),
        ])
        model.add_meta(GradientBoostingRegressor(random_state=22))
        return model
    elif model_name == 'sk_stack':
        return StackingRegressor(
            estimators=[
                ('svm', get_model(dict(param, name='svm'))),
                ('rf', get_model(dict(param, name='rf'))),
                ('lgb', get_model(dict(param, name='lgb'))),
                ('knn', get_model(dict(param, name='knn'))),
            ],
            final_estimator=GradientBoostingRegressor(random_state=42)
        )
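# Usage sketch: the param dict keys off 'name' and nests per-model kwargs
# under the model's own key (the values below are illustrative
# assumptions):
param = {
    'name': 'stack',
    'svm': {'C': 1.0},
    'rf': {'n_estimators': 200},
    'lgb': {'num_leaves': 31},
    'knn': {'n_neighbors': 5},
}
model = get_model(param)
# model.fit(X_train, y_train)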
Example #17
def perform_ensemble_adaboost(X_train, y_train, X_test, y_test):

    all_objects = [
        "Vase", "Teapot", "Bottle", "Spoon", "Plate", "Mug", "Knife", "Fork",
        "Flask", "Bowl"
    ]

    ensemble = SuperLearner(folds=10,
                            random_state=seed,
                            verbose=2,
                            backend="multiprocessing",
                            scorer=accuracy_score)

    layer_1 = [SVC(kernel='linear', C=8)]
    ensemble.add(layer_1)

    # 95.50
    # TODO: make plots of the learning curve

    ensemble.add_meta(
        AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=8,
                                   min_samples_split=5,
                                   min_samples_leaf=8)))

    ensemble.fit(X_train, y_train)

    import time

    start = time.time()

    yhat = ensemble.predict(X_test)
    print("Prediction time: {:.2f} s".format(time.time() - start))

    accuracies = cross_val_score(ensemble,
                                 X_test,
                                 y_test,
                                 cv=10,
                                 scoring="accuracy")

    print("Accuracy of Adaboost: {:.2f} %".format(accuracies.mean() * 100))
    print("Standard Deviation of Adaboost: {:.2f} %".format(accuracies.std() *
                                                            100))
Example #18
def do_stacking_simple_models(regressors, X, y, w, meta):
    """
    Do stacking with the mlens library.

    :param regressors: a dict of regressors to feed into the ensemble pipeline
    :param X: training dataset
    :param y: outcome variable y
    :param w: assignment variable
    :param meta: regressor (found in regressors dict for ensemble)

    :return: CATE predictions from the ensemble estimator
    """

    ensemble = SuperLearner(scorer=mean_squared_error, random_state=42)
    ensemble.add(list(regressors.values()))
    ensemble.add_meta(regressors[meta])

    e_preds, tau_test = simple_model.create_simple_ml_model(X, y, w, ensemble)

    return e_preds
Example #19
def train_model(X, y):
    seed = 2017
    np.random.seed(seed)


    # --- Build ---
    # Passing a scoring function will create cv scores during fitting
    # the scorer should be a simple function accepting two vectors and returning a scalar
    ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)

    # Build the first layer
    # ensemble.add([RandomForestClassifier(random_state=seed), SVC()])

    ensemble.add([IsolationForest(), LOF(novelty=True)])


    # Attach the final meta estimator
    # ensemble.add_meta(LogisticRegression())

    ensemble.add_meta(OCSVM())

    # Fit ensemble
    ensemble.fit(X, y)

    return ensemble
Example #20
def add_superlearner(name, models, X_train, Y_train, X_test, Y_test):
    # Establish and reset variables
    acc_score_cv = None
    acc_score = None
    time_ = None
    ensemble = SuperLearner(scorer=accuracy_score, random_state=seed)

    ensemble.add(models)
    # Attach the final meta estimator
    ensemble.add_meta(SVC())

    start = time.time()
    ensemble.fit(X_train, Y_train)
    preds = ensemble.predict(X_test)
    acc_score = accuracy_score(preds, Y_test)
    end = time.time()
    time_ = end - start

    return {
        "Ensemble": name,
        "Meta_Classifier": "SVC",
        "Accuracy_Score": acc_score,
        "Runtime": time_
    }
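# Usage sketch (seed, the data splits and the model list are assumptions):
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# result = add_superlearner('rf+nb',
#                           [RandomForestClassifier(), GaussianNB()],
#                           X_train, Y_train, X_test, Y_test)
# print(result)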
Example #21
cv_base_learners, cv_meta_learner = stacking(get_models(), clone(meta_learner),
                                             xtrain.values, ytrain.values,
                                             KFold(2))

P_pred, p = ensemble_predict(cv_base_learners,
                             cv_meta_learner,
                             xtest,
                             verbose=False)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))  # 0.881

## Now, what is the problem with this approach? It is slow! The parallel version below is recommended and gives a big speedup.
# Instantiate the ensemble with 10 folds
sl = SuperLearner(folds=10,
                  random_state=SEED,
                  verbose=2,
                  backend="multiprocessing")

# Add the base learners and the meta learner
sl.add(list(base_learners.values()), proba=True)
sl.add_meta(meta_learner, proba=True)

# Train the ensemble
sl.fit(xtrain, ytrain)

# Predict the test set
p_sl = sl.predict_proba(xtest)

print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest, p_sl[:, 1]))
plot_roc_curve(ytest, p.reshape(-1, 1), P_pred.mean(axis=1), ["Simple average"],
               "Super Learner", 'ROC_curve_with_super_learning')  # 0.890
Example #22
print('Precision:', '%.6f' % precision_score(y_test, ans))
print('Recall:', '%.6f' % recall_score(y_test, ans))
fpr, tpr, thresholds = roc_curve(y_test, ans)
print('AUC:', '%.6f' % auc(fpr, tpr))

#-------------------------------------------------------------------------------------------------#
'''ensemble SL1'''
seed = 2018
np.random.seed(seed)
ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)
ensemble.add([
    ExtraTreesClassifier(n_estimators=25, random_state=seed),
    KNeighborsClassifier(n_neighbors=2),
    AdaBoostClassifier(n_estimators=100)
])
ensemble.add_meta(SVC())
ensemble.fit(X_train, y_train)
ans = ensemble.predict(X_test)
FP, FN, TP, TN = conf_matrix(y_test, ans)
print('--------------------Super Learner--------------------')  #test 78.85%
print('Precision:', '%.6f' % precision_score(y_test, ans))
print('Recall:', '%.6f' % recall_score(y_test, ans))
fpr, tpr, thresholds = roc_curve(y_test, ans)
print('AUC:', '%.6f' % auc(fpr, tpr))
'''ensemble SL2'''
#seed = 2018
#np.random.seed(seed)
#ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)
#ensemble.add([ExtraTreesClassifier(n_estimators=30,random_state=seed),AdaBoostClassifier(n_estimators=100)])
#ensemble.add_meta(SVC())
#ensemble.fit(X_train,y_train)
Example #23
# evaluateSecondLayer(base_learners, x_train, y_train, meta_learners, param_dicts)

########################################## Create and Train Ensembler ##################################################

ensemble = SuperLearner(folds=4)

print("adding baseline models to ensembler")

ensemble.add([
    XGBRegressor(**getXGBParams(y_train)),
    LGBMRegressor(**params_1),
    MultiCatBoost(catboost_params, cat_feature_inds)
])

ensemble.add_meta(XGBRegressor(**xgb_params_2))

print("training ensembler")
ensemble.fit(x_train, y_train)

######################################### PREDICTING ON ENSEMBLE #######################################################

print("predicting on ensembler")
preds = ensemble.predict(X_test)
""""#Validation prediction:

preds = ensemble.predict(x_val)
accuracy = mean_absolute_error(y_val, preds)
print('validation accuracy: ')
print(accuracy) """
Example #24
preprocessing = {
    'sc': [StandardScaler()],
    'sub': [Subset([0, 1])],
    # 'pca': [PCA()]
}

rpars = {
    'alpha': np.linspace(-2, 2, 10),
    'normalize': [True, False],
}

params = {
    # 'las': pars,
    'rdg': rpars,
}

ensemble = SuperLearner(scorer=r2_score)
ensemble.add(estimators=ests1)
ensemble.add_meta(LinearRegression())

# ensemble.fit(X_train, y_train)
#
# ensemble.add(ests2)
#
# ensemble.fit(X=X_train, y=y_train)
#
# y_pred = np.array(ensemble.predict(X_test))
# y_test = np.array(y_test)

# score = r2_score(y_test, y_pred)
# print(score)
Example #25
def main():
    # Open and read in train x, train y, and scaled test data
    with open('AviationData_cleaned_V3.csv', 'r') as input_all:
        df_raw = pd.read_csv(input_all, encoding = 'utf-8')
    
    # Final check on NA values from selected columns
    print('Check number of NA values from selected columns:\n',
          df_raw.isnull().sum())
    
    # Drop rows containing NA values and reset index
    df_raw.dropna(axis=0, inplace = True)
    df_raw.reset_index(drop = True, inplace = True)
    
    # Prepare response label
    df_raw['Injury Severity']= df_raw['Injury Severity'].replace('Incident', 'Non-Fatal') 

    # Separate the two classes in the original dataset
    df_none = df_raw.loc[df_raw['Injury Severity'] == 'Non-Fatal']
    df_fatl = df_raw.loc[df_raw['Injury Severity'] == 'Fatal']
    
    # Balance Dataset
    n_fatl = len(df_fatl)
    df_none = df_none.sample(n = n_fatl, replace = False, random_state = 117)
    
    # Re-construct dataset
    df_sampled = pd.concat([df_none,df_fatl], ignore_index=True)
    df_sampled.reset_index(drop = True, inplace = True)

    # Separate predictors and response
    df_X = df_sampled.drop(['Injury Severity', 'Airport Code'], axis = 1)
    df_y = df_sampled.loc[: ,  'Injury Severity' ]
    
    # Convert string response to numerical response for convenience
    df_y.replace('Non-Fatal', '0', inplace = True)
    df_y.replace('Fatal', '1', inplace = True)
    
    # Define and apply one-hot encoder to encode predictors
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(df_X)
    df_X = pd.DataFrame(enc.transform(df_X).toarray(), columns = enc.get_feature_names(list(df_X.columns)))
    
    # Separate train and test dataset
    X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.5, random_state=1378)
    
    # Reduce dataset dimension
    #X_train, X_test = dimension_reduction(X_train, y_train, X_test, 80 , method = 'PCA')
  
    # Define MLP classifier
    clf_mlp = MLPClassifier(hidden_layer_sizes=(100), activation='relu', solver='adam', 
                            alpha=0.0001, batch_size='auto', learning_rate='constant', 
                            learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, 
                            random_state=117, tol=0.0001, verbose=False, warm_start=False, 
                            momentum=0.9, nesterovs_momentum=True, early_stopping=False, 
                            validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08,
                            n_iter_no_change=10)
    
    # Define XGBoost classifier
    clf_xgb = xgb.XGBClassifier(booster='gbtree',
                               objective= 'binary:logistic',
                               eval_metric='logloss',
                               tree_method= 'auto',
                               max_depth= 6,
                               min_child_weight= 1,
                               gamma = 0,
                               subsample= 1,
                               colsample_bytree = 1,
                               reg_alpha = 0,
                               reg_lambda = 1,
                               learning_rate = 0.1,
                               seed=27)
    
    # Define LGB Classifier
    clf_lgb = lgb.LGBMClassifier(objective = 'binary',
                                    boosting = 'gbdt',
                                    metric = 'binary_logloss',
                                    num_leaves = 15,
                                    min_data_in_leaf = 10,
                                    max_depth = 5,
                                    bagging_fraction = 0.85,
                                    bagging_freq = 11,
                                    feature_fraction = 0.5,
                                    lambda_l1 = 0.01,
                                    lambda_l2 = 0.3,
                                    num_iterations = 100,
                                    learning_rate = 0.08,
                                    random_state = 117)
    
    # Define random forest classifier
    clf_rf = RandomForestClassifier(n_estimators=300, criterion='gini', 
                                    max_depth=None, min_samples_split=2, 
                                    min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                                    max_features='auto', random_state = 117)
    
    
    # Fit base learners using whole train dataset
    clf_mlp.fit(X_train,y_train)
    clf_xgb.fit(X_train,y_train)
    clf_lgb.fit(X_train,y_train)
    clf_rf.fit(X_train,y_train)
    
    # Generate predicted probability using base learners
    mlp_proba = clf_mlp.predict_proba(X_test)[:, 1]
    xgb_proba = clf_xgb.predict_proba(X_test)[:, 1]
    lgb_proba = clf_lgb.predict_proba(X_test)[:, 1]
    rf_proba = clf_rf.predict_proba(X_test)[:, 1]
    
    # Initialize prediction using base learners' results
    pred_mlp = pd.Series(np.full(len(y_test), 0)) 
    pred_xgb = pd.Series(np.full(len(y_test), 0)) 
    pred_lgb = pd.Series(np.full(len(y_test), 0)) 
    pred_rf = pd.Series(np.full(len(y_test), 0)) 
    
    # Set threshold
    thres_mlp = 0.5
    thres_xgb = 0.5
    thres_lgb = 0.5
    thres_rf = 0.5
    
    # Make final prediction
    pred_mlp[mlp_proba >= thres_mlp] = 1
    pred_xgb[xgb_proba >= thres_xgb] = 1
    pred_lgb[lgb_proba >= thres_lgb] = 1
    pred_rf[rf_proba >= thres_rf] = 1
    
    # Map test data response into integers
    y_test = list(map(int, y_test))
    
    # Generate prediction report using base learners
    print('\n\nMLP:')
    print_validate(y_test, pred_mlp)
    
    print('\n\nXGB:')
    print_validate(y_test, pred_xgb)
    
    print('\n\nLGB:')
    print_validate(y_test, pred_lgb)
    
    print('\n\nRF:')
    print_validate(y_test, pred_rf)
    
    # Set base learner dictionary
    base_learners = {'mlp': clf_mlp,
                    'xgb': clf_xgb,
                    'lgb' : clf_lgb,
                    'rf': clf_rf
                    }
    
    # Define super learner
    sup_learner = SuperLearner(
                random_state=117
                )
    
    # Add the base learners and the meta learner
    sup_learner.add(list(base_learners.values()), proba = True)
    sup_learner.add_meta(linear_model.BayesianRidge(alpha_1 = 1e-3))
    
    # Train the ensemble
    sup_learner.fit(X_train,y_train)
    
    # Make prediction using super learner. The meta learner is a regressor,
    # so its continuous output is used directly as a probability-like score.
    sl_proba = sup_learner.predict(X_test)
    pred_sl = pd.Series(np.full(len(y_test), 0)) 
    thres_sl = 0.5
    pred_sl[sl_proba >= thres_sl] = 1
    
    print('\n\nSL:')
    print_validate(y_test, pred_sl)
    
    # ROC Curves for test dataset
    plt.figure(figsize=(8,7))
    draw_roc(y_test, sl_proba, 'Super Learner', 'tab:cyan', '-')
    draw_roc(y_test, mlp_proba, 'MLP NN', 'royalblue', '-')
    draw_roc(y_test, xgb_proba, 'XGBoost', 'lightcoral', '--')
    draw_roc(y_test, lgb_proba, 'LightGBM', 'seagreen', '-.')
    draw_roc(y_test, rf_proba, 'Random Forest', 'darkorange', '-')
    
    plt.plot([0, 1], [0, 1], 'k--', lw = 4)
    plt.xlim([-0.02, 1.0])
    plt.ylim([0.0, 1.02])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves for Test Result')
    plt.legend(loc="lower right", fontsize = 14, handlelength=4)
    plt.show()
Example #26
WIDTH = 2
MOD = 2

data1 = Data('stack', False, True, FOLDS)
X1, y1 = data1.get_data((LEN, WIDTH), MOD)
(F1, wf1), (P1, wp1) = data1.ground_truth(X1, y1, 1, False)
G1 = OLS().fit(F1, y1).predict(P1)

data2 = Data('stack', False, False, FOLDS)
X2, y2 = data2.get_data((LEN, WIDTH), MOD)
(F2, wf2), (P2, wp2) = data2.ground_truth(X2, y2, 1, False)
G2 = OLS().fit(F2, y2).predict(P2)

ens1 = SuperLearner(folds=FOLDS, scorer=rmse, verbose=100)
ens1.add(ESTIMATORS, PREPROCESSING, dtype=np.float64)
ens1.add_meta(OLS(), dtype=np.float64)

ens1_b = SuperLearner(folds=FOLDS, scorer=in_script_func)
ens1_b.add(ESTIMATORS, PREPROCESSING, dtype=np.float64)
ens1_b.add_meta(OLS(), dtype=np.float64)

ens2 = SuperLearner(folds=FOLDS, scorer=rmse, verbose=100)
ens2.add(ECM, dtype=np.float64)
ens2.add_meta(OLS(), dtype=np.float64)

ens2_b = SuperLearner(folds=FOLDS, scorer=in_script_func)
ens2_b.add(ECM, dtype=np.float64)
ens2_b.add_meta(OLS(), dtype=np.float64)

ens_f = SuperLearner(folds=FOLDS, scorer=fail_func)
ens_f.add(ECM, dtype=np.float64)
Example #27
class StackedEnsembleClassifier(_MLensAdapter):
    """Ensemble of stacked classifiers, meaning that classifiers are arranged in layers
    with the next layer getting as input the output of the last layer.
    The predictions of the final layer are merged with a meta-learner (the same happens for
    ~:class:`soweego.linker.GatedEnsembleClassifier`), which decides the final
    output based on the prediction of the base classifiers.

    This classifier uses :class:`mlens.ensemble.SuperLearner`
    to implement the *stacking* functionality.

    The parameters, and their default values, are:

    - **meta_layer**: Name of the classifier to use as a *meta layer*. By
        default this is `single_layer_perceptron`
    - **folds**: The number of folds to use for cross validation when
        generating the training set for the **meta_layer**. The default
        value for this is `2`.

        For a better explanation of this parameter, see:

        *Polley, Eric C.
        and van der Laan, Mark J., “Super Learner In Prediction” (May 2010).
        U.C. Berkeley Division of Biostatistics Working Paper Series.
        Working Paper 266*
        `<https://biostats.bepress.com/ucbbiostat/paper266/>`_

    """

    def __init__(self, num_features, **kwargs):
        super(StackedEnsembleClassifier, self).__init__()

        kwargs = {**constants.STACKED_ENSEMBLE_PARAMS, **kwargs}

        self.num_features = num_features
        self.num_folds = kwargs.pop('folds', 2)
        self.meta_layer = kwargs.pop('meta_layer')

        def init_estimators(num_features):
            estimators = []
            for clf in constants.CLASSIFIERS_FOR_ENSEMBLE:
                model = utils.init_model(
                    clf, num_features=num_features, **kwargs
                )

                estimators.append((clf, model.kernel))
            return estimators

        self.kernel = SuperLearner(verbose=2, n_jobs=1, folds=self.num_folds)

        l1_estimators = init_estimators(self.num_features)
        self.kernel.add(l1_estimators, proba=True)

        l2_estimators = init_estimators(len(l1_estimators) * self.num_folds)
        self.kernel.add(l2_estimators, proba=True)

        self.kernel.add_meta(
            utils.init_model(
                self.meta_layer, len(l2_estimators) * self.num_folds, **kwargs
            ).kernel,
            proba=True,
        )

    def __repr__(self):
        return (
            f'{self.__class__.__name__}('
            f'num_folds={self.num_folds}, '
            f'meta_layer={self.meta_layer}) '
        )
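# A standalone sketch of the same stacked pattern (the estimators are
# illustrative assumptions). Each proba layer outputs
# n_estimators * n_classes columns, which becomes the input width the
# next layer trains on.
from mlens.ensemble import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

stack = SuperLearner(folds=2, n_jobs=1)
stack.add([GaussianNB(), LogisticRegression()], proba=True)   # layer 1
stack.add([GaussianNB(), LogisticRegression()], proba=True)   # layer 2
stack.add_meta(MLPClassifier(max_iter=500), proba=True)
# stack.fit(X, y); stack.predict_proba(X_new)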
Example #28
    P_pred, p = ensemble_predict(cv_base_learners,
                                 cv_meta_learner,
                                 xtest,
                                 verbose=False)
    print("\nEnsemble (Stacking) ROC-AUC score: %.3f" %
          roc_auc_score(ytest, p))

    # Instantiate the ensemble with 10 folds
    ensemble = SuperLearner(folds=10,
                            random_state=SEED,
                            verbose=2,
                            backend="multiprocessing")

    # Add the base learners and the meta learner
    ensemble.add(list(base_learners.values()), proba=True)
    ensemble.add_meta(meta_learner, proba=True)

    # Train the ensemble
    ensemble.fit(xtrain, ytrain)

    # Predict the test set
    p_sl = ensemble.predict_proba(xtest)

    print("\nSuper Learner ROC-AUC score: %.3f" %
          roc_auc_score(ytest, p_sl[:, 1]))
    plot_roc_curve(ytest, p.reshape(-1, 1), P_pred.mean(axis=1),
                   ["Simple average"], "Super Learner")

    print('-------------------------------------')
    print(test.head())
    y_pred = ensemble.predict(test.iloc[:, 1:].values)
Example #29
ensemble = SuperLearner(backend="multiprocessing")
ensemble.add([
    RandomForestClassifier(random_state=seed, n_estimators=250),
    SVC(),
    LassoLarsIC(criterion='bic'),
    ElasticNet(random_state=0),
    BayesianRidge(),
    MLPClassifier(),
    BaggingClassifier(),
    neighbors.KNeighborsClassifier(),
    tree.DecisionTreeClassifier(),
    GradientBoostingClassifier(n_estimators=200)
])

# Attach the final meta estimator
ensemble.add_meta(LogisticRegression())

ensemble.fit(x_train, y_train)
preds = ensemble.predict(x_test)

ensemble_data = pd.DataFrame(ensemble.data)
auroc = roc_auc_score(y_test, preds)
acc = accuracy_score(y_test, preds)

p = precision_score(y_test, preds)
r = recall_score(y_test, preds)

fpr, tpr, thresholds = roc_curve(y_test, preds)

fig = plt.figure()
plt.plot(fpr, tpr)
Example #30
def get_super_learner(X):		# create the super learner
    ensemble = SuperLearner(scorer=accuracy_score, folds=3,
                            shuffle=True, verbose=True, sample_size=len(X))
    ensemble.add(cfg.sl_models)         # add base models
    ensemble.add_meta(cfg.sl_meta)      # add meta model
    return ensemble