def esemble(data, data2, data5, during):
    """Fit a stacked regressor on ``data2`` and write its predictions into ``data``.

    A ``SuperLearner`` with a ``LinearRegression`` base layer and a
    ``GaussianProcessRegressor`` meta layer is trained on ``data2`` and then
    applied to ``data5``.

    Args:
        data: Frame that receives the new ``'pred_essemble'`` column
            (modified in place and returned).
        data2: Training frame. The target is the column
            ``'prmom' + during + '_f'``; features are every column except the
            four ``prmom*_f`` columns, ``uniqcode`` and ``date``.
        data5: Frame to predict on. The same columns as above, plus ``pred``,
            are removed before prediction.
        during: String inserted into the target column name.

    Returns:
        ``data`` with the ``'pred_essemble'`` column assigned.
    """
    excluded_cols = ['prmom1d_f', 'prmom1w_f', 'prmom2w_f', 'prmom3w_f', 'uniqcode', 'date']

    stack = SuperLearner(scorer=accuracy_score, random_state=45, verbose=2)
    stack.add(linear_model.LinearRegression())
    stack.add_meta([GaussianProcessRegressor()])

    # Missing feature values are replaced with 0; the target is used as-is.
    train_target = np.array(data2['prmom' + during + '_f'])
    train_features = np.array(data2.drop(excluded_cols, axis=1).fillna(0))
    stack.fit(train_features, train_target)

    predict_features = np.array(
        data5.drop(excluded_cols + ['pred'], axis=1).fillna(0)
    )
    data['pred_essemble'] = stack.predict(predict_features)
    return data
def perform_ensemble_adaboost(X_train, y_train, X_test, y_test):
    """Train an SVC -> AdaBoost stacked ensemble and print cross-validated accuracy.

    The ensemble is a 10-fold ``SuperLearner`` with one linear ``SVC`` in the
    base layer and an ``AdaBoostClassifier`` over a depth-8 decision tree as
    the meta learner. After fitting on the training data, the function prints
    the mean and standard deviation of a 10-fold ``cross_val_score`` computed
    on ``(X_test, y_test)``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features used for prediction and for the cross-validation.
        y_test: Labels used for the cross-validation.

    Returns:
        None. Results are printed.
    """
    ensemble = SuperLearner(folds=10,
                            random_state=seed,
                            verbose=2,
                            backend="multiprocessing",
                            scorer=accuracy_score)

    ensemble.add([SVC(kernel='linear', C=8)])  # author's note: 95.50

    ensemble.add_meta(
        AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=8,
                                   min_samples_split=5,
                                   min_samples_leaf=8)))

    ensemble.fit(X_train, y_train)

    # The predictions themselves are not used below; the call is kept so the
    # function's work and verbose output stay the same as before.
    ensemble.predict(X_test)

    # NOTE(review): the reported accuracy is a 10-fold CV over the *test* split,
    # not the score of the model fitted above — confirm this is intended.
    accuracies = cross_val_score(ensemble, X_test, y_test, cv=10,
                                 scoring="accuracy")
    print("Accuracy of Adaboost: {:.2f} %".format(accuracies.mean() * 100))
    print("Standard Deviation of Adaboost: {:.2f} %".format(accuracies.std() * 100))
def add_superlearner(name, models, X_train, Y_train, X_test, Y_test):
    """Fit a SuperLearner with an SVC meta estimator and report accuracy and runtime.

    Args:
        name: Label stored under ``"Ensemble"`` in the result.
        models: Estimator(s) passed to ``SuperLearner.add`` as the base layer.
        X_train: Training features.
        Y_train: Training labels.
        X_test: Test features.
        Y_test: Test labels used to score the predictions.

    Returns:
        dict with keys ``"Ensemble"``, ``"Meta_Classifier"`` (always ``"SVC"``),
        ``"Accuracy_Score"`` and ``"Runtime"`` (seconds spent fitting,
        predicting and scoring).
    """
    ensemble = SuperLearner(scorer=accuracy_score, random_state=seed)
    ensemble.add(models)

    # Attach the final meta estimator
    ensemble.add_meta(SVC())

    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments.
    start = time.perf_counter()
    ensemble.fit(X_train, Y_train)
    preds = ensemble.predict(X_test)
    # scikit-learn's signature is (y_true, y_pred).
    acc_score = accuracy_score(Y_test, preds)
    runtime = time.perf_counter() - start

    return {
        "Ensemble": name,
        "Meta_Classifier": "SVC",
        "Accuracy_Score": acc_score,
        "Runtime": runtime
    }
SVC(), LassoLarsIC(criterion='bic'), ElasticNet(random_state=0), BayesianRidge(), MLPClassifier(), BaggingClassifier(), neighbors.KNeighborsClassifier(), tree.DecisionTreeClassifier(), GradientBoostingClassifier(n_estimators=200) ]) # Attach the final meta estimator ensemble.add_meta(LogisticRegression()) ensemble.fit(x_train, y_train) preds = ensemble.predict(x_test) ensemble_data = pd.DataFrame(ensemble.data) auroc = roc_auc_score(preds, y_test) acc = accuracy_score(preds, y_test) p = precision_score(preds, y_test) r = recall_score(preds, y_test) frp, tpr, threshholds = roc_curve(preds, y_test) fig = plt.figure() plt.plot(frp, tpr) plt.show() ensemble_data.to_csv(
# ROC/AUC report for the predictions currently held in `ans`.
fpr, tpr, thresholds = roc_curve(y_test, ans)
print('AUC:', '%.6f' % auc(fpr, tpr))
#-------------------------------------------------------------------------------------------------#
'''ensemble SL1'''
# Seed both the module-level constant and NumPy's global RNG.
seed = 2018
np.random.seed(seed)
ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)
# Base layer: three classifiers.
ensemble.add([
    ExtraTreesClassifier(n_estimators=25, random_state=seed),
    KNeighborsClassifier(n_neighbors=2),
    AdaBoostClassifier(n_estimators=100)
])
# Meta learner.
ensemble.add_meta(SVC())
ensemble.fit(X_train, y_train)
ans = ensemble.predict(X_test)
# presumably confusion-matrix counts; conf_matrix is defined outside this view — confirm order
FP, FN, TP, TN = conf_matrix(y_test, ans)
print('--------------------Super Learner--------------------')
#test 78.85%
print('Precision:', '%.6f' % precision_score(y_test, ans))
print('Recall:', '%.6f' % recall_score(y_test, ans))
# The ROC curve is built from the output of ensemble.predict (`ans`).
fpr, tpr, thresholds = roc_curve(y_test, ans)
print('AUC:', '%.6f' % auc(fpr, tpr))
'''ensemble SL2'''
#seed = 2018
#np.random.seed(seed)
#ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)
#ensemble.add([ExtraTreesClassifier(n_estimators=30,random_state=seed),AdaBoostClassifier(n_estimators=100)])
#ensemble.add_meta(SVC())
#ensemble.fit(X_train,y_train)
#ans = ensemble.predict(X_test)
#FP,FN,TP,TN = conf_matrix(y_test,ans)
# Seed NumPy's global RNG so the permutation below is reproducible.
seed = 2017
np.random.seed(seed)

data = load_iris()
# Shuffle indices 0..149 and reorder features and targets with the same permutation.
idx = np.random.permutation(150)
X = data.data[idx]
y = data.target[idx]

# Building an ensemble
from mlens.ensemble import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# --- Multi-layer ensembles ---
ensemble = SuperLearner(scorer=accuracy_score, random_state=seed)

# Build first layer
ensemble.add([RandomForestClassifier(random_state=seed), LogisticRegression()])

# Build the second layer
ensemble.add([LogisticRegression(), SVC()])

# Attach final meta estimator
ensemble.add_meta(SVC())

# Fit on the first 75 shuffled rows, predict on the remaining rows.
ensemble.fit(X[:75], y[:75])
preds = ensemble.predict(X[75:])
print("Fit data:\n%r" % ensemble.data)
# NOTE: time.clock() was removed in Python 3.8; time.perf_counter() is the
# replacement for measuring elapsed intervals.

# start counting time for training
time_train_start = time.perf_counter()
# Fit ensemble
ensemble.fit(training_data, training_labels)
# print training time
time_train_end = time.perf_counter()
print("Training finished, training time: %g seconds \n" % (time_train_end - time_train_start))

# start counting time for testing
time_test_start = time.perf_counter()
# Predict
preds = ensemble.predict(test_data)
# print testing time
time_test_end = time.perf_counter()
print("Testing finished, testing time: %g seconds \n" % (time_test_end - time_test_start))

print("Fit data:\n%r" % ensemble.data)
print("Prediction score: %.6f" % accuracy_score(preds, test_labels))

# Predict on the second test set and print the ensemble's fit data again
preds_even = ensemble.predict(test_data_even)
print("Fit data:\n%r" % ensemble.data)
xtest, verbose=False) print("\nEnsemble (Stacking) ROC-AUC score: %.3f" % roc_auc_score(ytest, p)) # Instantiate the ensemble with 10 folds ensemble = SuperLearner(folds=10, random_state=SEED, verbose=2, backend="multiprocessing") # Add the base learners and the meta learner ensemble.add(list(base_learners.values()), proba=True) ensemble.add_meta(meta_learner, proba=True) # Train the ensemble ensemble.fit(xtrain, ytrain) # Predict the test set p_sl = ensemble.predict_proba(xtest) print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest, p_sl[:, 1])) plot_roc_curve(ytest, p.reshape(-1, 1), P.mean(axis=1), ["Simple average"], "Super Learner") print('-------------------------------------') print(test.head()) y_pred = ensemble.predict(test.iloc[:, 1:].values) print(y_pred)
print(yt) print(yv) Xt.fillna(-1) Xv.fillna(-1) yt.fillna(-1) yv.fillna(-1) print(Xt) ''' for clf in stacked_clf_list: ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, folds=10) ensemble.add(clf[0]) ensemble.add_meta(lr) ensemble.fit(Xt, yt) preds = ensemble.predict(Xv) accuracy = accuracy_score(preds, yv) if accuracy > best_combination[0]: best_combination[0] = accuracy best_combination[1] = clf[1] preds = ensemble.predict(X_test) best_preds = preds print(f"Accuracy score: {accuracy} {clf[1]}") print( f"\nBest stacking model is {best_combination[1]} with accuracy of: {best_combination[0]}" ) # Output print(best_preds) preds = best_preds.tolist()
num_in_layer = int(layer_weights[j] / weights_total * num_to_slot) layerlist = [] for k in range(num_in_layer): layerlist.append(eval_ind.pop()) ens.add(layerlist) # then add the meta model ens.add_meta(lgbm(n_estimators=1000, verbose=-1)) try: ens.fit(X_train, y_train) train_score = f1_score(ens.predict(X_train), y_train) test_score = f1_score(ens.predict(X_test), y_test) real_score = train_score * test_score print(' Training score is {}'.format(train_score)) print(' Testing score is {}'.format(test_score)) print(' Real score is {}'.format(real_score)) except: print(' There was an error with this one. Throwing it out') continue if real_score > highest_score: print(' New highest score found!') highest_score = real_score winning_model = ens ''' for its in range(5):
# Use the values stored under max_params['params'] as the voter weights, in the
# mapping's own iteration order.
weight_dict = max_params['params']
final_weights = list(weight_dict.values())

ens.add_meta(VotingClassifier(voters_zipped, weights=final_weights))
print()
print('Refitting the whole model with the new meta layer!')
ens.fit(X_train, y_train)
print()
print('Final predictions')
train_preds = ens.predict(X_train)
optim_preds = ens.predict(X_test)
#final_preds = ens.predict(X_holdout)
print()
print('Training score = {}'.format(error(train_preds, y_train)))
print('Test score = {}'.format(error(optim_preds, y_test)))
#print('Holdout score = {}'.format(error(final_preds, y_holdout)))

##### now we should save the model please
##### do several runs and save the best one
##### saving the features!
##### clustering as a feature???
##### randomizing the balance of the generated data for training initial pipelines
##### we shouldn't require at least 1 hidden layer
##### for training the voting models, what if we cv split the data coming in
# Hold out 30% of `train` for validation, stratified on the 'Survived' column.
val_train, val_test = train_test_split(train, test_size=0.3, random_state=SEED, stratify=train['Survived'])
# The first column is used as the label, the remaining columns as features.
val_Xtrain = val_train[val_train.columns[1:]]
val_ytrain = val_train[val_train.columns[:1]]
# Fixed: the held-out frames must be sliced from `val_test` (the split produced
# above), not from `val`, which this section never defines.
val_Xtest = val_test[val_test.columns[1:]]
val_ytest = val_test[val_test.columns[:1]]

# Instantiate the ensemble with 10 folds
super_learner = SuperLearner(folds=10, random_state=SEED, verbose=2, backend='multiprocessing')

# Add the base learners and the meta learner
super_learner.add(list(base_learners().values()), proba=True)
super_learner.add_meta(LogisticRegression(), proba=True)

# Train the ensemble
super_learner.fit(val_Xtrain, val_ytrain)

# predict the test set; column 1 of the output is thresholded at 0.5 to get labels
p_ens = super_learner.predict(val_Xtest)[:, 1]
p_ens_label = 1 * (p_ens >= 0.5)
print('The acccuracy of super learner:', metrics.accuracy_score(p_ens_label, val_ytest))


# ### Producing the Submission file
#
# Finally having trained and fit the base and meta learners, we can now output the predictions into the proper format for submission to the Titanic competition as follows:

# In[ ]:

# Generate Submission File
Submission = pd.DataFrame({
    'PassengerId': PassengerId_test,
    'Survived': P_ensemble
})
Submission.to_csv("Submission.csv", index=False)
# Intermediate layer, keep propagating, but add a preprocessing # pipeline that selects a subset of the input ensemble.add(estimators, preprocessing=[Subset([2, 3])], propagate_features=[0, 1]) ############################################################################## # In the above example, the two first features of the original input data # will be propagated through both layers, but the second layer will not be # trained on it. Instead, it will only see the predictions made by the base # learners in the first layer. ensemble.fit(X, y) n = list(ensemble.layer_2.learners[0].learner )[0].estimator.feature_importances_.shape[0] m = ensemble.predict(X).shape[1] print("Num features seen by estimators in intermediate layer: %i" % n) print("Num features in the output array of the intermediate layer: %i" % m) ############################################################################## # .. _proba-tutorial: # # Probabilistic ensemble learning # ------------------------------- # # When the target to predict is a class label, it can often be beneficial to # let higher-order layers or the meta learner learn from *class probabilities*, # as opposed to the predicted class. Scikit-learn classifiers can return a # matrix that, for each observation in the test set, gives the probability that # the observation belongs to the a given class. While we are ultimately # interested in class membership, this information is much richer that just
from mlens.ensemble import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# %% Preparing the dataset and the output label
dataset = np.loadtxt('../dataset/train.csv', dtype=str, delimiter=",")
dataset, outcome = prgm1.pre_processing(dataset)

# Ordered split: the first 80% of rows (rounded) train, the rest test.
partition = int(np.round(0.8 * dataset.shape[0]))
train_set = dataset[0:partition, :]
test_set = dataset[partition:, :]

# %% Training
test_outcome = np.array(outcome[partition:]).astype(int)
train_outcome = np.array(outcome[0:partition]).astype(int)

seed = 2017
np.random.seed(seed)

ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)

# Build the first layer
ensemble.add([RandomForestClassifier(random_state=seed), SVC()])

# Attach the final meta estimator
ensemble.add_meta(LogisticRegression())

# Fit ensemble
# NOTE(review): gamma="auto" is passed to ensemble.fit(), not to SVC() —
# confirm this is intended.
ensemble.fit(train_set, train_outcome, gamma="auto")

# Predict
preds = ensemble.predict(test_set)
print("Fit data:\n%r" % ensemble.data)
print("Prediction score: %.3f" % accuracy_score(preds, test_outcome))