def build_ensemble(incl_meta, proba, propagate_features=[0, 1]):
    """Return a two-layer SuperLearner ensemble.

    Parameters
    ----------
    incl_meta : bool
        If true, attach the module-level ``lr`` estimator as meta layer.
    proba : bool
        Forwarded to ``SuperLearner.add`` so layers emit probabilities.
    propagate_features : list of int
        Input-feature indices propagated through the first layer.
        NOTE(review): a mutable default is kept for interface
        compatibility; it is safe only because it is never mutated here.

    Returns
    -------
    SuperLearner
        The assembled (unfitted) ensemble.

    Fix: removed ``propagate_features_1``/``propagate_features_2``, which
    were computed but never used anywhere in the function.
    """
    estimators_layer1 = [xgb]
    estimators_layer2 = [lgb]
    # estimators_layer3 = [rf, et, ...]

    ensemble = SuperLearner()
    ensemble.add(estimators_layer1, proba=proba,
                 propagate_features=propagate_features)
    # Second layer intentionally does not propagate raw features.
    # ensemble.add(estimators_layer2, proba=proba, propagate_features=propagate_features)
    ensemble.add(estimators_layer2, proba=proba)
    if incl_meta:
        ensemble.add_meta(lr)
    return ensemble
def get_super_learner():
    """Assemble a two-fold stacking ensemble.

    Base layer: elastic net, XGBoost and LightGBM; meta layer:
    intercept-free linear regression over their predictions.
    """
    stack = SuperLearner(folds=2, shuffle=False)
    stack.add([elastic_net, xgboost, light_gbm])
    stack.add_meta(LinearRegression(fit_intercept=False))
    return stack
def get_stacked_model(X, y, is_processing=True):
    """Fit a single-base-learner SuperLearner on (X, y).

    When ``is_processing`` is true, base-layer input is standardised
    with ``StandardScaler`` first.
    """
    prep = [StandardScaler()] if is_processing else []
    model = SuperLearner(scorer=accuracy_score, random_state=seed)
    model.add([MyClassifier(5.0)], preprocessing=prep)
    model.add_meta(MyClassifier(0.5))
    model.fit(X, y)
    return model
def build_ensemble(incl_meta, meta_type='log', preprocessors=None,
                   estimators=None, propagate_features=None):
    """Build a two-layer SuperLearner with an optional meta estimator.

    Parameters
    ----------
    incl_meta : bool
        Whether to attach a meta layer at all.
    meta_type : str
        ``'log'`` for LogisticRegression, ``'lin'`` for LinearRegression.
    preprocessors : optional
        Unused here; kept for interface compatibility.
    estimators : list, optional
        ``(name, estimator)`` pairs for both layers; defaults to
        RF / SVR / Ridge regressors.
    propagate_features : list of int, optional
        Indices propagated through layer 1; layer 2 then sees them at
        positions ``0..n-1``, hence the remapped second list.

    Fix: the meta-layer conditions used bitwise ``&``, which binds
    tighter than ``==`` — ``incl_meta & meta_type == 'log'`` parsed as
    ``(incl_meta & meta_type) == 'log'`` and raised ``TypeError``
    (bool & str). Replaced with logical ``and``.
    """
    if propagate_features:
        n = len(propagate_features)
        propagate_features_1 = propagate_features
        propagate_features_2 = list(range(n))
    else:
        propagate_features_1 = propagate_features_2 = None
    if not estimators:
        estimators = [('rfr', RandomForestRegressor(random_state=seed)),
                      ('svr', SVR()),
                      ('rdg', Ridge())]
    ensemble = SuperLearner()
    ensemble.add(estimators, propagate_features=propagate_features_1)
    ensemble.add(estimators, propagate_features=propagate_features_2)
    if incl_meta and meta_type == 'log':
        ensemble.add_meta(LogisticRegression())
    elif incl_meta and meta_type == 'lin':
        ensemble.add_meta(LinearRegression())
    return ensemble
def get_ensemble():
    """Create a 10-fold probabilistic SuperLearner over get_models()."""
    learner = SuperLearner(folds=10, random_state=seed, verbose=2,
                           backend='multiprocessing')
    # Base layer and meta layer both operate on class probabilities.
    learner.add(list(get_models().values()), proba=True)
    learner.add_meta(get_meta(), proba=True)
    return learner
def get_super_learner(X):
    """Build an unfitted two-fold SuperLearner sized to ``len(X)``."""
    sl = SuperLearner(scorer=rmse, folds=2, shuffle=True,
                      sample_size=len(X))
    sl.add(get_models())               # base models
    sl.add_meta(LinearRegression())    # blending model
    return sl
class GatedEnsembleClassifier(_MLensAdapter):
    """Gating ensemble of classifiers.

    The predictions of the base classifiers are joined by a further
    meta-learner, which decides the final output based on those base
    predictions. The *gating* functionality is implemented on top of
    :class:`mlens.ensemble.SuperLearner`.

    The parameters, and their default values, are:

    - **meta_layer**: Name of the classifier to use as a *meta layer*.
      By default this is `single_layer_perceptron`.
    - **folds**: The number of folds to use for cross validation when
      generating the training set for the **meta_layer**. The default
      value for this is `2`. For a better explanation of this parameter,
      see: *Polley, Eric C. and van der Laan, Mark J., "Super Learner In
      Prediction" (May 2010). U.C. Berkeley Division of Biostatistics
      Working Paper Series. Working Paper 266*
      `<https://biostats.bepress.com/ucbbiostat/paper266/>`_
    """

    def __init__(self, num_features, **kwargs):
        super(GatedEnsembleClassifier, self).__init__()
        # Start from the configured defaults; explicit kwargs win.
        kwargs = {**constants.GATED_ENSEMBLE_PARAMS, **kwargs}

        self.num_features = num_features
        self.num_folds = kwargs.pop('folds', 2)
        self.meta_layer = kwargs.pop('meta_layer')

        # One (name, estimator-kernel) pair per configured classifier.
        base = [
            (name,
             utils.init_model(name, num_features=self.num_features,
                              **kwargs).kernel)
            for name in constants.CLASSIFIERS_FOR_ENSEMBLE
        ]

        self.kernel = SuperLearner(verbose=2, n_jobs=1,
                                   folds=self.num_folds)
        # Emit class probabilities (not hard labels) so the meta layer
        # can gate on them.
        self.kernel.add(base, proba=True)
        self.kernel.add_meta(
            utils.init_model(self.meta_layer,
                             len(base) * self.num_folds,
                             **kwargs).kernel,
            proba=True,
        )

    def __repr__(self):
        return (f'{self.__class__.__name__}('
                f'num_folds={self.num_folds}, '
                f'meta_layer={self.meta_layer}) ')
def simple_statistic(comb):
    """Evaluate the feature-statistic combination ``comb`` fold by fold.

    For each CV split of the module-level ``data_x``/``data_y`` (via
    ``sfolder``), oversample the training split with SMOTE, select the
    columns whose statistic prefix appears in ``comb``, fit a 9-learner
    SuperLearner with a logistic meta layer, and collect test metrics.

    Returns a list with one row per fold:
    [combination, auc, auprc, acc, precision, recall, f1, fpr, tpr, thr].

    NOTE(review): relies on module-level globals (``sfolder``,
    ``data_x``, ``data_y``) — confirm they are defined before calling.
    """
    resres=[]
    for train, test in tqdm(list(sfolder.split(data_x,data_y))):
        # break
        # Always-kept clinical/administrative columns.
        cofff=['age_interval','admission_type_EMERGENCY','admission_type_ELECTIVE','admission_type_URGENT','aids','hem','mets']
        # stats_list=['min','max','minmax','mean','std','stdmean','median','qua25','qua75','qua2575','mode','skew','kurt','first']
        X_train, X_test = data_x.iloc[train,:], data_x.iloc[test,:]
        Y_train, Y_test = data_y[train], data_y[test]
        x_train,x_val,y_train,y_val=train_test_split(X_train,Y_train,test_size=0.25,random_state=42)
        smo=SMOTE(random_state=42,ratio={1:2000})
        # Resample the inner training split with SMOTE to obtain the new
        # oversampled training set x_train_s.
        x_train_s,y_train_s=smo.fit_sample(x_train,y_train)
        x_train_s=pd.DataFrame(x_train_s,columns=x_val.columns)
        # Recombine the oversampled split with the validation split.
        X_train_s=pd.concat([x_train_s,x_val],axis=0)
        Y_train_s=list(y_train_s)
        Y_train_s.extend(list(y_val))
        Y_train_s=np.array(Y_train_s)
        best_combination_nowfold=comb
        # Keep every column whose prefix (before '_') names a selected statistic.
        for sts in best_combination_nowfold:
            for column in x_train.columns:
                if(sts == column.split('_')[0]):
                    cofff.append(column)
        x_train_train=X_train_s[cofff]
        y_train_train=Y_train_s
        x_test=X_test[cofff]
        y_test=Y_test
        # Nine probabilistic base learners blended by logistic regression.
        ensemble = SuperLearner(scorer=roc_auc_score,random_state=42,folds=10,backend="multiprocessing")
        ensemble.add([GaussianNB(),SVC(C=100, probability=True),
                      neighbors.KNeighborsClassifier(n_neighbors=3),
                      LogisticRegression(), MLPClassifier(),
                      GradientBoostingClassifier(n_estimators=100),
                      RandomForestClassifier(random_state=42,n_estimators=100),
                      BaggingClassifier(), tree.DecisionTreeClassifier()],proba=True)
        ensemble.add_meta(LogisticRegression(),proba=True)
        print('now is here -4\n')
        ensemble.fit(x_train_train,y_train_train)
        print('now is here -5\n')
        preds_prob=ensemble.predict_proba(x_test)
        print('now is here -6\n')
        prob=preds_prob[:, 1]
        # Hard labels from thresholding the positive-class probability at 0.5.
        preds=[]
        for i in prob:
            if i>=0.5:
                preds.append(1);
            else:
                preds.append(0)
        auc_sl=roc_auc_score(y_test,preds_prob[:,1])
        auprc_sl=average_precision_score(y_test,preds_prob[:,1])
        recall_sl=recall_score(y_test,preds)
        acc_sl=accuracy_score(y_test,preds)
        p_sl=precision_score(y_test,preds)
        f1_sl=f1_score(y_test,preds)
        fpr_sl,tpr_sl,thr_sl=roc_curve(y_test,prob)
        print('now is here -7')
        resres.append([best_combination_nowfold,auc_sl,auprc_sl,acc_sl,p_sl,recall_sl,f1_sl,fpr_sl,tpr_sl,thr_sl])
    return resres
def get_stacked_model(X, y):
    """Fit an RF+SVC stack blended by logistic regression.

    Prints the cross-validated f1 statistics collected while fitting,
    then returns the fitted ensemble.
    """
    stack = SuperLearner(scorer=f1, random_state=seed)
    stack.add([RandomForestClassifier(random_state=seed), SVC()])
    stack.add_meta(LogisticRegression())
    stack.fit(X, y)
    print('f1-score in training')
    print('-m: mean. -s: std')
    print(pd.DataFrame(stack.data))
    return stack
def get_super_learner(X):
    """Create an unfitted 10-fold shuffled SuperLearner for ``X``."""
    sl = SuperLearner(scorer=accuracy_score, folds=10, shuffle=True,
                      sample_size=len(X))
    sl.add(get_models())                               # base models
    sl.add_meta(LogisticRegression(solver='lbfgs'))    # meta model
    return sl
def stacking_training(X, y, X_pred, layer_list, meta_learner):
    """Fit a multi-layer probabilistic stack and predict on ``X_pred``.

    Each element of ``layer_list`` becomes one SuperLearner layer (all
    emitting probabilities); ``meta_learner`` blends the final layer.

    Returns
    -------
    tuple
        (predicted probabilities for ``X_pred``, the fitted ensemble).
    """
    stack = SuperLearner(folds=5, backend='multiprocessing',
                         model_selection=False)
    for layer in layer_list:
        stack.add(layer, proba=True)
    print('基学习器添加成功')
    stack.add_meta(meta_learner, proba=True)
    print('元学习器添加成功')
    print('拟合中')
    stack.fit(X, y)
    return stack.predict_proba(X_pred), stack
def get_stacked_model(X, y):
    """Fit a probability-stacked SVC+RF ensemble with a logistic meta layer.

    Prints the cross-validated accuracy table collected during fitting
    and returns the fitted ensemble.
    """
    base = [SVC(probability=True), RandomForestClassifier(random_state=seed)]
    model = SuperLearner(scorer=accuracy, random_state=seed)
    # proba=True -> the base layer feeds class probabilities (via
    # predict_proba) to the meta layer instead of hard labels.
    model.add(base, proba=True)
    model.add_meta(LogisticRegression())
    model.fit(X, y)
    print('accuracy score in training')
    print('-m: mean. -s: std')
    print(pd.DataFrame(model.data))
    return model
def esemble(data, data2, data5, during):
    """Fit a SuperLearner on ``data2`` and append predictions for ``data5``.

    The target column is ``'prmom' + during + '_f'``; momentum/id columns
    are dropped from the features, NaNs are zero-filled, and the
    resulting predictions are stored in ``data['pred_essemble']``.
    """
    feature_drops = ['prmom1d_f', 'prmom1w_f', 'prmom2w_f', 'prmom3w_f',
                     'uniqcode', 'date']

    model = SuperLearner(scorer=accuracy_score, random_state=45, verbose=2)
    model.add(linear_model.LinearRegression())
    model.add_meta([GaussianProcessRegressor()])

    y = np.array(data2['prmom' + during + '_f'])
    x = np.array(data2.drop(feature_drops, axis=1).fillna(0))
    model.fit(x, y)

    X = np.array(data5.drop(feature_drops + ['pred'], axis=1).fillna(0))
    data['pred_essemble'] = model.predict(X)
    return data
def use_pack():
    """Train the module-level super learner and report its test ROC-AUC."""
    learner = SuperLearner(
        folds=10,
        random_state=SEED,
        verbose=2,
        # backend="multiprocessing"
    )
    # Base layer emits probabilities; the meta layer consumes them.
    learner.add(list(base_learners.values()), proba=True)
    learner.add_meta(meta_learner, proba=True)
    # Fit on the training partition, score on the held-out test set.
    learner.fit(xtrain, ytrain)
    proba = learner.predict_proba(xtest)
    print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest, proba[:, 1]))
def build_ensemble(incl_meta, propagate_features=None):
    """Assemble a two-layer SuperLearner, optionally propagating features.

    Layer 1 passes through the requested input columns; layer 2 then
    sees them at positions ``0..n-1``, so it propagates those remapped
    positions instead. An optional logistic-regression meta layer tops
    the stack when ``incl_meta`` is true.
    """
    if propagate_features:
        first_pass = propagate_features
        second_pass = list(range(len(propagate_features)))
    else:
        first_pass = second_pass = None

    base = [RandomForestRegressor(random_state=seed), SVR()]
    stack = SuperLearner()
    stack.add(base, propagate_features=first_pass)
    stack.add(base, propagate_features=second_pass)
    if incl_meta:
        stack.add_meta(LogisticRegression())
    return stack
def get_model(param: dict) -> BaseEstimator:
    """Build the estimator named by ``param['name']``.

    ``param`` maps each model name to its keyword arguments, e.g.
    ``{'name': 'rf', 'rf': {...}, 'svm': {...}, ...}``. Note that
    ``'name'`` is popped from the dict passed in; the composite branches
    below deliberately pass fresh copies via ``dict(param, name=...)``.

    :raises ValueError: for an unrecognised model name (previously the
        function silently returned ``None`` despite its annotation).
    """
    model_name = param.pop('name')
    if model_name == 'xgb':
        return XGBRegressor(**param[model_name])
    elif model_name == 'lgb':
        return LGBMRegressor(**param[model_name])
    elif model_name == 'cb':
        return CatBoostRegressor(**param[model_name])
    elif model_name == 'rf':
        return RandomForestRegressor(**param[model_name])
    elif model_name == 'svm':
        # Scale-sensitive models get a standardisation step in front.
        return make_pipeline(StandardScaler(), SVR(**param[model_name]))
    elif model_name == 'knn':
        return make_pipeline(StandardScaler(),
                             KNeighborsRegressor(**param[model_name]))
    elif model_name == 'mlp':
        return make_pipeline(StandardScaler(),
                             MLPRegressor(**param[model_name]))
    elif model_name == 'vote':
        return VotingRegressor(estimators=[
            ('svm', get_model(dict(param, name='svm'))),
            ('rf', get_model(dict(param, name='rf'))),
            ('lgb', get_model(dict(param, name='lgb'))),
            ('knn', get_model(dict(param, name='knn'))),
        ])
    elif model_name == 'stack':
        # mlens stacking: same four base models, GBM meta learner.
        model = SuperLearner(scorer=mean_squared_error, random_state=132)
        model.add([
            get_model(dict(param, name='svm')),
            get_model(dict(param, name='rf')),
            get_model(dict(param, name='lgb')),
            get_model(dict(param, name='knn')),
        ])
        model.add_meta(GradientBoostingRegressor(random_state=22))
        return model
    elif model_name == 'sk_stack':
        # scikit-learn's native stacking over the same base models.
        return StackingRegressor(
            estimators=[
                ('svm', get_model(dict(param, name='svm'))),
                ('rf', get_model(dict(param, name='rf'))),
                ('lgb', get_model(dict(param, name='lgb'))),
                ('knn', get_model(dict(param, name='knn'))),
            ],
            final_estimator=GradientBoostingRegressor(random_state=42)
        )
    raise ValueError(f'unknown model name: {model_name!r}')
def perform_ensemble_adaboost(X_train, y_train, X_test, y_test):
    """Train an SVC-based SuperLearner with an AdaBoost meta layer and
    report 10-fold cross-validated accuracy on the test set.

    Fixes: removed dead code — the unused ``all_objects`` list, the
    ``time`` timer that was started but never read, the discarded
    ``yhat`` prediction, and a stray string literal.
    """
    ensemble = SuperLearner(folds=10, random_state=seed, verbose=2,
                            backend="multiprocessing",
                            scorer=accuracy_score)
    # Single base layer: linear SVC (previously measured ~95.5%).
    layer_1 = [SVC(kernel='linear', C=8)]
    ensemble.add(layer_1)  # 95.50
    ensemble.add_meta(
        AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=8, min_samples_split=5,
                                   min_samples_leaf=8)))
    ensemble.fit(X_train, y_train)
    # NOTE(review): cross_val_score refits the ensemble on folds of the
    # *test* data, so the fit above does not feed these scores — confirm
    # this evaluation scheme is intended.
    accuracies = cross_val_score(ensemble, X_test, y_test, cv=10,
                                 scoring="accuracy")
    print("Accuracy of Adaboost: {:.2f} %".format(accuracies.mean() * 100))
    print("Standard Deviation of Adaboost: {:.2f} %".format(accuracies.std() * 100))
def do_stacking_simple_models(regressors, X, y, w, meta):
    """Do stacking with the mlens library.

    :param regressors: a dict of regressors to feed into the ensemble pipeline
    :param X: training dataset
    :param y: outcome variable y
    :param w: assignment variable
    :param meta: key into ``regressors`` naming the meta estimator
    :return: CATE predictions from the ensemble estimator
    """
    ensemble = SuperLearner(scorer=mean_squared_error, random_state=42)
    # All supplied regressors form the base layer; one of them (by key)
    # doubles as the meta learner.
    ensemble.add(list(regressors.values()))
    ensemble.add_meta(regressors[meta])
    # Second tuple element (tau_test) was never used.
    e_preds, _ = simple_model.create_simple_ml_model(X, y, w, ensemble)
    return e_preds
def train_model(ensemble, X, y):
    """Build and fit an outlier-detection SuperLearner on (X, y).

    NOTE(review): the incoming ``ensemble`` argument has always been
    ignored — it is immediately rebuilt below. The parameter is kept for
    interface compatibility; confirm whether callers expect it to be used.

    Fix: the fitted ensemble is now returned (previously the function
    returned ``None``, discarding the trained model) — a
    backward-compatible addition.
    """
    seed = 2017
    np.random.seed(seed)
    # Passing a scoring function will create cv scores during fitting;
    # the scorer should be a simple function accepting two vectors and
    # returning a scalar.
    ensemble = SuperLearner(scorer=accuracy_score, random_state=seed,
                            verbose=2)
    # First layer: anomaly detectors instead of ordinary classifiers.
    # ensemble.add([RandomForestClassifier(random_state=seed), SVC()])
    ensemble.add([IsolationForest(), LOF(novelty=True)])
    # Final meta estimator: one-class SVM.
    # ensemble.add_meta(LogisticRegression())
    ensemble.add_meta(OCSVM())
    ensemble.fit(X, y)
    return ensemble
def add_superlearner(name, models, X_train, Y_train, X_test, Y_test):
    """Fit a SuperLearner over ``models`` with an SVC meta layer and score it.

    Returns a summary dict containing the ensemble name, the meta
    classifier label, the test accuracy and the wall-clock time spent
    fitting and predicting.

    Fixes: removed the dead ``acc_score_cv``/pre-initialisations (set
    but never used) and normalised the metric call to the sklearn
    ``(y_true, y_pred)`` convention (accuracy is symmetric, so the value
    is unchanged).
    """
    ensemble = SuperLearner(scorer=accuracy_score, random_state=seed)
    ensemble.add(models)
    # Attach the final meta estimator.
    ensemble.add_meta(SVC())

    start = time.time()
    ensemble.fit(X_train, Y_train)
    preds = ensemble.predict(X_test)
    acc_score = accuracy_score(Y_test, preds)
    elapsed = time.time() - start

    return {
        "Ensemble": name,
        "Meta_Classifier": "SVC",
        "Accuracy_Score": acc_score,
        "Runtime": elapsed,
    }
# Manual stacking baseline: CV-train the base learners and meta learner
# by hand, then score the hand-rolled ensemble on the held-out test set.
cv_base_learners, cv_meta_learner = stacking(
    get_models(), clone(meta_learner), xtrain.values, ytrain.values, KFold(2))
P_pred, p = ensemble_predict(cv_base_learners, cv_meta_learner, xtest,
                             verbose=False)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))  # 0.881

# Now, what is the problem with the approach above? It is slow! The
# parallel method below is recommended — it is much faster.

# Instantiate the ensemble with 10 folds
sl = SuperLearner(folds=10, random_state=SEED, verbose=2,
                  backend="multiprocessing")

# Add the base learners and the meta learner
sl.add(list(base_learners.values()), proba=True)
sl.add_meta(meta_learner, proba=True)

# Train the ensemble
sl.fit(xtrain, ytrain)

# Predict the test set
p_sl = sl.predict_proba(xtest)

print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest, p_sl[:, 1]))

# Compare the simple-average baseline against the super learner's ROC.
plot_roc_curve(ytest, p.reshape(-1, 1), P.mean(axis=1), ["Simple average"],
               "Super Learner", 'ROC_curve_with_super_learning')  # 0.890
# Report precision / recall / AUC for the preceding model's predictions.
print('Precision:', '%.6f' % precision_score(y_test, ans))
print('Recall:', '%.6f' % recall_score(y_test, ans))
fpr, tpr, thresholds = roc_curve(y_test, ans)
print('AUC:', '%.6f' % auc(fpr, tpr))
#-------------------------------------------------------------------------------------------------#
'''ensemble SL1'''
# Super learner #1: extra trees + kNN + AdaBoost base layer, SVC meta layer.
seed = 2018
np.random.seed(seed)
ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)
ensemble.add([
    ExtraTreesClassifier(n_estimators=25, random_state=seed),
    KNeighborsClassifier(n_neighbors=2),
    AdaBoostClassifier(n_estimators=100)
])
ensemble.add_meta(SVC())
ensemble.fit(X_train, y_train)
ans = ensemble.predict(X_test)
# Confusion-matrix counts for the super learner's hard predictions.
FP, FN, TP, TN = conf_matrix(y_test, ans)
print('--------------------Super Learner--------------------')
# test 78.85%
print('Precision:', '%.6f' % precision_score(y_test, ans))
print('Recall:', '%.6f' % recall_score(y_test, ans))
fpr, tpr, thresholds = roc_curve(y_test, ans)
print('AUC:', '%.6f' % auc(fpr, tpr))

'''ensemble SL2'''
# Variant kept for reference (disabled): extra trees + AdaBoost base layer.
#seed = 2018
#np.random.seed(seed)
#ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)
#ensemble.add([ExtraTreesClassifier(n_estimators=30,random_state=seed),AdaBoostClassifier(n_estimators=100)])
#ensemble.add_meta(SVC())
#ensemble.fit(X_train,y_train)
#evaluateSecondLayer(base_learners, x_train, y_train, meta_learners, param_dicts) """ ########################################## Create and Train Ensembler ################################################## ensemble = SuperLearner(folds=4) print("adding baseline models to ensembler") ensemble.add([ XGBRegressor(**getXGBParams(y_train)), LGBMRegressor(**params_1), MultiCatBoost(catboost_params, cat_feature_inds) ]) ensemble.add_meta(XGBRegressor(**xgb_params_2)) print("training ensembler") ensemble.fit(x_train, y_train) ######################################### PREDICTING ON ENSEMBLE ####################################################### print("predicting on ensembler") preds = ensemble.predict(X_test) """"#Validation prediction: preds = ensemble.predict(x_val) accuracy = mean_absolute_error(y_val, preds) print('validation accuracy: ') print(accuracy) """
    # NOTE(review): this chunk opens mid-dict — the '{' and earlier
    # entries (e.g. a 'las' preprocessing case) are outside this view.
    'sc': [StandardScaler()],
    'sub': [Subset([0, 1])],
    # 'pca': [PCA()]
}

# Ridge hyper-parameter candidates: alpha grid plus normalize toggle.
rpars = {
    'alpha': np.linspace(-2, 2, 10),
    'normalize': [True, False],
}

params = {
    # 'las': pars,
    'rdg': rpars,
}

# Single-layer SuperLearner over the ests1 estimators, scored with R^2.
ensemble = SuperLearner(scorer=r2_score)
ensemble.add(estimators=ests1)
ensemble.add_meta(LinearRegression())

# Disabled follow-up steps kept for reference:
# ensemble.fit(X_train, y_train)
#
# ensemble.add(ests2)
#
# ensemble.fit(X=X_train, y=y_train)
#
# y_pred = np.array(ensemble.predict(X_test))
# y_test = np.array(y_test)
# score = r2_score(y_test, y_pred)
# print(score)
def main():
    """End-to-end aviation injury-severity pipeline.

    Loads and balances the cleaned dataset, one-hot encodes the
    predictors, trains four base classifiers (MLP, XGBoost, LightGBM,
    random forest) plus an mlens super learner with a BayesianRidge meta
    layer, and plots comparative ROC curves on the test split.

    Bug fixed: ``rf_proba`` was computed from ``clf_lgb`` instead of
    ``clf_rf``, so the "Random Forest" results and ROC curve silently
    duplicated LightGBM's.
    """
    # Open and read in train x, train y, and scaled test data
    with open('AviationData_cleaned_V3.csv', 'r') as input_all:
        df_raw = pd.read_csv(input_all, encoding='utf-8')
    # Final check on NA values from selected columns
    print('Check number of NA values from selected columns:\n', df_raw.isnull().sum())
    # Drop rows containing NA values and reset index
    df_raw.dropna(axis=0, inplace=True)
    df_raw.reset_index(drop=True, inplace=True)
    # Prepare response label: fold 'Incident' into the 'Non-Fatal' class
    df_raw['Injury Severity'] = df_raw['Injury Severity'].replace('Incident', 'Non-Fatal')
    # Separate the two classes in the original dataset
    df_none = df_raw.loc[df_raw['Injury Severity'] == 'Non-Fatal']
    df_fatl = df_raw.loc[df_raw['Injury Severity'] == 'Fatal']
    # Balance dataset by downsampling the majority class
    n_fatl = len(df_fatl)
    df_none = df_none.sample(n=n_fatl, replace=False, random_state=117)
    # Re-construct dataset
    df_sampled = pd.concat([df_none, df_fatl], ignore_index=True)
    df_sampled.reset_index(drop=True, inplace=True)
    # Separate predictors and response
    df_X = df_sampled.drop(['Injury Severity', 'Airport Code'], axis=1)
    df_y = df_sampled.loc[:, 'Injury Severity']
    # Convert string response to numerical response for convenience
    df_y.replace('Non-Fatal', '0', inplace=True)
    df_y.replace('Fatal', '1', inplace=True)
    # Define and apply one-hot encoder to encode predictors
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(df_X)
    df_X = pd.DataFrame(enc.transform(df_X).toarray(),
                        columns=enc.get_feature_names(list(df_X.columns)))
    # Separate train and test dataset
    X_train, X_test, y_train, y_test = train_test_split(df_X, df_y,
                                                        test_size=0.5,
                                                        random_state=1378)
    # Reduce dataset dimension (disabled)
    # X_train, X_test = dimension_reduction(X_train, y_train, X_test, 80, method='PCA')

    # Define MLP classifier
    clf_mlp = MLPClassifier(hidden_layer_sizes=(100), activation='relu',
                            solver='adam', alpha=0.0001, batch_size='auto',
                            learning_rate='constant', learning_rate_init=0.001,
                            power_t=0.5, max_iter=200, shuffle=True,
                            random_state=117, tol=0.0001, verbose=False,
                            warm_start=False, momentum=0.9,
                            nesterovs_momentum=True, early_stopping=False,
                            validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                            epsilon=1e-08, n_iter_no_change=10)
    # Define XGBoost classifier
    clf_xgb = xgb.XGBClassifier(booster='gbtree', objective='binary:logistic',
                                eval_metric='logloss', tree_method='auto',
                                max_depth=6, min_child_weight=1, gamma=0,
                                subsample=1, colsample_bytree=1, reg_alpha=0,
                                reg_lambda=1, learning_rate=0.1, seed=27)
    # Define LGB classifier
    clf_lgb = lgb.LGBMClassifier(objective='binary', boosting='gbdt',
                                 metric='binary_logloss', num_leaves=15,
                                 min_data_in_leaf=10, max_depth=5,
                                 bagging_fraction=0.85, bagging_freq=11,
                                 feature_fraction=0.5, lambda_l1=0.01,
                                 lambda_l2=0.3, num_iterations=100,
                                 learning_rate=0.08, random_state=117)
    # Define random forest classifier
    clf_rf = RandomForestClassifier(n_estimators=300, criterion='gini',
                                    max_depth=None, min_samples_split=2,
                                    min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0,
                                    max_features='auto', random_state=117)

    # Fit base learners using whole train dataset
    clf_mlp.fit(X_train, y_train)
    clf_xgb.fit(X_train, y_train)
    clf_lgb.fit(X_train, y_train)
    clf_rf.fit(X_train, y_train)

    # Generate predicted positive-class probability from each base learner
    mlp_proba = clf_mlp.predict_proba(X_test)[:, 1]
    xgb_proba = clf_xgb.predict_proba(X_test)[:, 1]
    lgb_proba = clf_lgb.predict_proba(X_test)[:, 1]
    # BUG FIX: previously read clf_lgb.predict_proba(...) here.
    rf_proba = clf_rf.predict_proba(X_test)[:, 1]

    # Initialize prediction using base learners' results
    pred_mlp = pd.Series(np.full(len(y_test), 0))
    pred_xgb = pd.Series(np.full(len(y_test), 0))
    pred_lgb = pd.Series(np.full(len(y_test), 0))
    pred_rf = pd.Series(np.full(len(y_test), 0))
    # Set decision thresholds
    thres_mlp = 0.5
    thres_xgb = 0.5
    thres_lgb = 0.5
    thres_rf = 0.5
    # Make final prediction
    pred_mlp[mlp_proba >= thres_mlp] = 1
    pred_xgb[xgb_proba >= thres_xgb] = 1
    pred_lgb[lgb_proba >= thres_lgb] = 1
    pred_rf[rf_proba >= thres_rf] = 1
    # Map test data response into integers
    y_test = list(map(int, y_test))

    # Generate prediction report using base learners
    print('\n\nMLP:')
    print_validate(y_test, pred_mlp)
    print('\n\nXGB:')
    print_validate(y_test, pred_xgb)
    print('\n\nLGB:')
    print_validate(y_test, pred_lgb)
    print('\n\nRF:')
    print_validate(y_test, pred_rf)

    # Set base learner dictionary
    base_learners = {'mlp': clf_mlp, 'xgb': clf_xgb,
                     'lgb': clf_lgb, 'rf': clf_rf}
    # Define super learner
    sup_learner = SuperLearner(random_state=117)
    # Add the base learners and the meta learner
    sup_learner.add(list(base_learners.values()), proba=True)
    sup_learner.add_meta(linear_model.BayesianRidge(alpha_1=1e-3))
    # Train the ensemble
    sup_learner.fit(X_train, y_train)
    # Make prediction using super learner
    sl_proba = sup_learner.predict_proba(X_test)
    pred_sl = pd.Series(np.full(len(y_test), 0))
    thres_sl = 0.5
    # NOTE(review): if sl_proba is 2-D (n_samples, n_classes) this mask
    # does not align with the 1-D series — confirm the BayesianRidge meta
    # layer's output shape.
    pred_sl[sl_proba >= thres_sl] = 1
    print('\n\nSL:')
    print_validate(y_test, pred_sl)

    # ROC curves for the test dataset
    plt.figure(figsize=(8, 7))
    draw_roc(y_test, sl_proba, 'Super Learner', 'tab:cyan', '-')
    draw_roc(y_test, mlp_proba, 'MLP NN', 'royalblue', '-')
    draw_roc(y_test, xgb_proba, 'XGBoost', 'lightcoral', '--')
    draw_roc(y_test, lgb_proba, 'LightGBM', 'seagreen', '-.')
    draw_roc(y_test, rf_proba, 'Random Forest', 'darkorange', '-')
    plt.plot([0, 1], [0, 1], 'k--', lw=4)
    plt.xlim([-0.02, 1.0])
    plt.ylim([0.0, 1.02])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves for Test Result')
    plt.legend(loc="lower right", fontsize=14, handlelength=4)
    plt.show()
# Layout constants for the synthetic stacking benchmark below.
WIDTH = 2
MOD = 2

data1 = Data('stack', False, True, FOLDS)
X1, y1 = data1.get_data((LEN, WIDTH), MOD)
(F1, wf1), (P1, wp1) = data1.ground_truth(X1, y1, 1, False)
# Ground-truth OLS fit on the fold outputs for comparison.
G1 = OLS().fit(F1, y1).predict(P1)

data2 = Data('stack', False, False, FOLDS)
# NOTE(review): this draws from data1, not data2 — looks like a
# copy-paste slip; confirm whether data2.get_data was intended.
X2, y2 = data1.get_data((LEN, WIDTH), MOD)
(F2, wf2), (P2, wp2) = data2.ground_truth(X2, y2, 1, False)
G2 = OLS().fit(F2, y2).predict(P2)

# Ensemble variants: with/without preprocessing, rmse vs the in-script
# scorer, plus one using fail_func (presumably a deliberately failing
# scorer — name suggests; verify against its definition).
ens1 = SuperLearner(folds=FOLDS, scorer=rmse, verbose=100)
ens1.add(ESTIMATORS, PREPROCESSING, dtype=np.float64)
ens1.add_meta(OLS(), dtype=np.float64)

ens1_b = SuperLearner(folds=FOLDS, scorer=in_script_func)
ens1_b.add(ESTIMATORS, PREPROCESSING, dtype=np.float64)
ens1_b.add_meta(OLS(), dtype=np.float64)

ens2 = SuperLearner(folds=FOLDS, scorer=rmse, verbose=100)
ens2.add(ECM, dtype=np.float64)
ens2.add_meta(OLS(), dtype=np.float64)

ens2_b = SuperLearner(folds=FOLDS, scorer=in_script_func)
ens2_b.add(ECM, dtype=np.float64)
ens2_b.add_meta(OLS(), dtype=np.float64)

ens_f = SuperLearner(folds=FOLDS, scorer=fail_func)
ens_f.add(ECM, dtype=np.float64)
class StackedEnsembleClassifier(_MLensAdapter):
    """Ensemble of stacked classifiers, meaning that classifiers are
    arranged in layers with the next layer getting as input the output
    of the last layer. The predictions of the final layer are merged
    with a meta-learner (the same happens for
    ~:class:`soweego.linker.GatedEnsembleClassifier`), which decides the
    final output based on the prediction of the base classifiers.

    This classifier uses :class:`mlens.ensemble.SuperLearner` to
    implement the *stacking* functionality.

    The parameters, and their default values, are:

    - **meta_layer**: Name of the classifier to use as a *meta layer*.
      By default this is `single_layer_perceptron`
    - **folds**: The number of folds to use for cross validation when
      generating the training set for the **meta_layer**. The default
      value for this is `2`.

      For a better explanation of this parameter, see:
      *Polley, Eric C. and van der Laan, Mark J., "Super Learner In
      Prediction" (May 2010). U.C. Berkeley Division of Biostatistics
      Working Paper Series. Working Paper 266*
      `<https://biostats.bepress.com/ucbbiostat/paper266/>`_
    """

    def __init__(self, num_features, **kwargs):
        super(StackedEnsembleClassifier, self).__init__()
        # Configured defaults first; caller-supplied kwargs override them.
        kwargs = {**constants.STACKED_ENSEMBLE_PARAMS, **kwargs}
        self.num_features = num_features
        self.num_folds = kwargs.pop('folds', 2)
        self.meta_layer = kwargs.pop('meta_layer')

        def init_estimators(num_features):
            # Build one (name, estimator-kernel) pair per configured
            # classifier, all sized to the given feature count.
            estimators = []
            for clf in constants.CLASSIFIERS_FOR_ENSEMBLE:
                model = utils.init_model(
                    clf, num_features=num_features, **kwargs
                )
                estimators.append((clf, model.kernel))
            return estimators

        self.kernel = SuperLearner(verbose=2, n_jobs=1, folds=self.num_folds)
        l1_estimators = init_estimators(self.num_features)
        self.kernel.add(l1_estimators, proba=True)
        # Layer 2 consumes layer 1's per-fold probability outputs, hence
        # the len(l1) * folds feature count; likewise for the meta layer.
        l2_estimators = init_estimators(len(l1_estimators) * self.num_folds)
        self.kernel.add(l2_estimators, proba=True)
        self.kernel.add_meta(
            utils.init_model(
                self.meta_layer, len(l2_estimators) * self.num_folds, **kwargs
            ).kernel,
            proba=True,
        )

    def __repr__(self):
        return (
            f'{self.__class__.__name__}('
            f'num_folds={self.num_folds}, '
            f'meta_layer={self.meta_layer}) '
        )
# Score the manually stacked ensemble, then build the equivalent mlens
# SuperLearner and compare on the same held-out test set.
P_pred, p = ensemble_predict(cv_base_learners, cv_meta_learner, xtest,
                             verbose=False)
print("\nEnsemble (Stacking) ROC-AUC score: %.3f" % roc_auc_score(ytest, p))

# Instantiate the ensemble with 10 folds
ensemble = SuperLearner(folds=10, random_state=SEED, verbose=2,
                        backend="multiprocessing")

# Add the base learners and the meta learner
ensemble.add(list(base_learners.values()), proba=True)
ensemble.add_meta(meta_learner, proba=True)

# Train the ensemble
ensemble.fit(xtrain, ytrain)

# Predict the test set
p_sl = ensemble.predict_proba(xtest)
print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest, p_sl[:, 1]))

# Compare the simple-average baseline's ROC against the super learner's.
plot_roc_curve(ytest, p.reshape(-1, 1), P.mean(axis=1), ["Simple average"],
               "Super Learner")

print('-------------------------------------')
print(test.head())
# Final predictions on the submission frame; the first column is skipped
# (presumably an id column — verify against `test`'s schema).
y_pred = ensemble.predict(test.iloc[:, 1:].values)
                         # NOTE(review): this chunk starts mid-call — the
                         # SuperLearner(...) opening is outside this view.
                         backend="multiprocessing")
# Ten heterogeneous base learners; note LassoLarsIC/ElasticNet/
# BayesianRidge are regressors mixed in with the classifiers.
ensemble.add([
    RandomForestClassifier(random_state=seed, n_estimators=250),
    SVC(),
    LassoLarsIC(criterion='bic'),
    ElasticNet(random_state=0),
    BayesianRidge(),
    MLPClassifier(),
    BaggingClassifier(),
    neighbors.KNeighborsClassifier(),
    tree.DecisionTreeClassifier(),
    GradientBoostingClassifier(n_estimators=200)
])
# Attach the final meta estimator
ensemble.add_meta(LogisticRegression())
ensemble.fit(x_train, y_train)
preds = ensemble.predict(x_test)
ensemble_data = pd.DataFrame(ensemble.data)
# NOTE(review): sklearn metrics expect (y_true, y_pred); here the
# predictions are passed first. Accuracy is symmetric, but precision,
# recall and the ROC curve are not — confirm the intended argument order.
auroc = roc_auc_score(preds, y_test)
acc = accuracy_score(preds, y_test)
p = precision_score(preds, y_test)
r = recall_score(preds, y_test)
frp, tpr, threshholds = roc_curve(preds, y_test)
fig = plt.figure()
plt.plot(frp, tpr)
def get_super_learner(X):
    """Return an unfitted 3-fold SuperLearner built from cfg's model lists."""
    # create the super learner
    sl = SuperLearner(scorer=accuracy_score, folds=3, shuffle=True,
                      verbose=True, sample_size=len(X))
    sl.add(cfg.sl_models)     # add base models from config
    sl.add_meta(cfg.sl_meta)  # add meta model from config
    return sl