def test_stacking_regressor_diabetes(cv, final_estimator, predict_params, passthrough):
    """Fit a two-estimator stack on the pre-scaled diabetes data and check
    prediction output, transform width, and passthrough columns — both
    before and after dropping one base estimator."""
    # Scaling up front avoids convergence warnings without wrapping the
    # estimators in a pipeline, which keeps the passthrough assert below
    # comparable against the raw feature matrix.
    X_tr, X_te, y_tr, _ = train_test_split(scale(X_diabetes), y_diabetes,
                                           random_state=42)
    stack = StackingRegressor(
        estimators=[('lr', LinearRegression()), ('svr', LinearSVR())],
        final_estimator=final_estimator,
        cv=cv,
        passthrough=passthrough,
    )
    stack.fit(X_tr, y_tr)

    prediction = stack.predict(X_te, **predict_params)
    if predict_params:
        # Extra predict kwargs are expected to yield a 2-element result.
        assert len(prediction) == 2

    transformed = stack.transform(X_te)
    # Two stacked predictions, plus the 10 raw features when passthrough.
    assert transformed.shape[1] == (12 if passthrough else 2)
    if passthrough:
        assert_allclose(X_te, transformed[:, -10:])

    # Drop the linear-regression base learner and re-check the widths.
    stack.set_params(lr='drop')
    stack.fit(X_tr, y_tr)
    stack.predict(X_te)
    transformed = stack.transform(X_te)
    assert transformed.shape[1] == (11 if passthrough else 1)
    if passthrough:
        assert_allclose(X_te, transformed[:, -10:])
def stacking(X, y, k_cv):
    """Evaluate a KRR/SVR/ElasticNet stacking regressor with k-fold CV.

    Prints the sqrt of the train-set R^2 and the per-fold / running-mean
    test "accuracy" (sqrt of r_2) for each fold.

    Args:
        X: feature matrix, shape (n_samples, n_features).
        y: target vector, shape (n_samples,).
        k_cv: number of cross-validation folds.
    """
    res = []
    estimators = [('krr', KernelRidge(kernel="cosine", alpha=0.001)),
                  ('svr', SVR(C=2000, gamma=0.001)),
                  ("enet", ElasticNet(alpha=0.00001, l1_ratio=0.0005, max_iter=10000))]
    reg = StackingRegressor(estimators=estimators, n_jobs=15,
                            final_estimator=LinearRegression())
    kfold = KFold(n_splits=k_cv, shuffle=True, random_state=0)
    # Iterate the splitter directly instead of calling __next__() by hand;
    # split on X (the conventional argument — only its length is used, so
    # the seeded indices are identical to splitting on y).
    for trainval_index, test_index in kfold.split(X):
        X_trainval, y_trainval = X[trainval_index, :], y[trainval_index]
        X_test, y_test = X[test_index, :], y[test_index]
        reg.fit(X_trainval, y_trainval)
        # reg.score is R^2; sqrt gives a correlation-like training value.
        print(reg.score(X_trainval, y_trainval) ** 0.5)
        test_pre = reg.predict(X_test)
        fold_acc = r_2(y_test, test_pre) ** 0.5  # compute r_2 once per fold
        print("accuracy: ", fold_acc)
        res.append(fold_acc)
        print("mean accuracy: ", np.array(res).mean())  # running mean so far
    # Final mean over all folds.
    print("mean accuracy: ", np.array(res).mean())
def reg_ensemble_1(self):
    """Regressors Ensemble.

    Stack the linear, random-forest and lasso base regressors behind a
    random-forest meta-learner and predict on the held-out test set.

    :return: ensemble prediction for ``self.x_test``
    """
    lr, _ = self.linear_regr()
    rf, _ = self.random_forest_regr()
    lasso, _ = self.lasso_regr()
    base_learners = [("lasso", lasso), ("lr", lr), ("rf", rf)]
    ensemble = StackingRegressor(estimators=base_learners,
                                 final_estimator=RandomForestRegressor(),
                                 n_jobs=-1)
    ensemble.fit(self.x_train, self.y_train)
    return ensemble.predict(self.x_test)
def main():
    """Load the dataset, clean it, and train a stacking regressor to
    predict daily new COVID cases; prints the test-set MAE."""
    data = pd.read_csv('dataset/complete.csv')
    data.drop("CountryCode", axis=1, inplace=True)
    data.drop("RegionName", axis=1, inplace=True)
    data.drop("RegionCode", axis=1, inplace=True)
    data.drop("M1_Wildcard", axis=1, inplace=True)
    # Remove flag/index columns.  Collect the names first:
    # DataFrame.iteritems() was removed in pandas 2.0 (use items()/columns),
    # and dropping columns while iterating mutates the frame mid-iteration.
    to_drop = [col for col in data.columns
               if "flag" in col.lower() or "index" in col.lower()]
    data.drop(to_drop, axis=1, inplace=True)
    # remove any rows that contain 'nan'
    data.dropna(axis=0, how='any', inplace=True)
    # change datatype of Date from int to an ordinal day number
    date_series = pd.to_datetime(data['Date'].astype(str), format='%Y-%m-%d')
    data['Date'] = date_series.map(dt.datetime.toordinal)
    # one-hot encode the country name, then drop the dummy columns again.
    # NOTE(review): this leaves no country information in the feature set —
    # confirm that is intended.
    data = pd.get_dummies(data, columns=['CountryName'], prefix=['CountryName'])
    dummy_cols = [col for col in data.columns if "countryname" in col.lower()]
    data.drop(dummy_cols, axis=1, inplace=True)
    print(data.info())
    # separate features and labels
    data_feature = data.drop(['ConfirmedCases', 'new_cases', 'ConfirmedDeaths'],
                             axis=1, inplace=False)
    data_label_total_cases = data.loc[:, 'ConfirmedCases']
    data_label_total_deaths = data.loc[:, 'ConfirmedDeaths']
    data_label_cases_perDay = data.loc[:, 'new_cases']
    scaler = RobustScaler()
    features = scaler.fit_transform(data_feature)
    X_train, X_test, y_train, y_test = train_test_split(
        features, data_label_cases_perDay, test_size=0.25, random_state=42)
    estimators = [
        ('rfr', RandomForestRegressor(random_state=42, n_estimators=50)),
        ('gbr', GradientBoostingRegressor(random_state=42)),
        ('lsvr', LinearSVR(random_state=42, max_iter=1000)),
        ('etr', ExtraTreesRegressor(random_state=42, criterion='mae', n_estimators=50))
    ]
    model = StackingRegressor(
        estimators=estimators,
        final_estimator=ExtraTreesRegressor(random_state=42, n_estimators=50)
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print("MAE: " + str(mae))
def test_stacking_regressor_drop_estimator():
    """A stack whose first estimator is 'drop' must behave exactly like a
    stack built without that estimator in the first place."""
    # Scaling up front avoids convergence warnings without a pipeline,
    # keeping the later asserts straightforward.
    X_tr, X_te, y_tr, _ = train_test_split(scale(X_diabetes), y_diabetes,
                                           random_state=42)
    meta = RandomForestRegressor(n_estimators=10, random_state=42)
    reference = StackingRegressor(
        estimators=[('svr', LinearSVR(random_state=0))],
        final_estimator=meta, cv=5)
    with_drop = StackingRegressor(
        estimators=[('lr', 'drop'), ('svr', LinearSVR(random_state=0))],
        final_estimator=meta, cv=5)
    reference.fit(X_tr, y_tr)
    with_drop.fit(X_tr, y_tr)
    # Both stacks must produce identical predictions and transforms.
    assert_allclose(reference.predict(X_te), with_drop.predict(X_te))
    assert_allclose(reference.transform(X_te), with_drop.transform(X_te))
def Stacked_Ensemble(x_train, x_test, y_train, y_test):
    """Train a stacking ensemble of five base regressors with a linear
    meta-learner, print metrics on the rounded predictions, pickle the
    fitted model, and return ``(y_test, y_pred)``.
    """
    # Path to save model
    path_to_model = os.path.join("model", "StackedEnsemble.sav")
    # define the base models
    level0 = [
        ('lr', LinearRegression()),
        ('knn', KNeighborsRegressor()),
        ('cart', DecisionTreeRegressor()),
        ('svm', SVR()),
        ('adaboost', AdaBoostRegressor()),
    ]
    # define meta learner model
    level1 = LinearRegression()
    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
    model.fit(x_train, y_train)
    # Predicting
    y_pred = model.predict(x_test)
    # Printing the training results.  NOTE(review): these are classification
    # metrics applied to rounded regression output — confirm that is intended.
    print("\n\n(Stacked Ensemble) Confusion Matrix: \n",
          confusion_matrix(y_true=y_test, y_pred=y_pred.round()))
    print("(Stacked Ensemble) Report: \n", classification_report(y_test, y_pred.round()))
    print("(Stacked Ensemble) Accuracy: \n", accuracy_score(y_test, y_pred.round()))
    # Saving the model: makedirs(exist_ok=True) is race-free, unlike the
    # original check-then-create that swallowed OSError, and the file
    # handle is properly closed via the context manager.
    os.makedirs(os.path.dirname(path_to_model), exist_ok=True)
    with open(path_to_model, 'wb') as fh:
        pickle.dump(model, fh)
    return y_test, y_pred
def stacking_qtlmas(X_trainval, y_trainval, X_test, y_test):
    """Fit a KRR/SVR/ElasticNet stack on the train+validation data and
    return predictions for the test set.

    ``y_test`` is accepted for interface compatibility but is not used.
    """
    estimators = [('krr', KernelRidge(kernel="cosine", alpha=0.005)),
                  ('svr', SVR(C=2500, gamma=0.001)),
                  ("enet", ElasticNet(alpha=0.00001, l1_ratio=0.0005, max_iter=10000))]
    reg = StackingRegressor(estimators=estimators, n_jobs=15,
                            final_estimator=LinearRegression())
    reg.fit(X_trainval, y_trainval)
    # reg.score is R^2; sqrt gives a correlation-like training score.
    print(reg.score(X_trainval, y_trainval) ** 0.5)
    # (removed the unused `res` accumulator the original never read)
    return reg.predict(X_test)
def train(prop, k_fold=5, test_size=0.2):
    """Train and evaluate a six-learner stacking regressor for property `prop`.

    Loads data via ``config.data_load_path[prop]``, fits the stack, reports
    cross-validated and held-out test scores, then saves the model to
    ``config.model_save_path[prop]`` under a per-month filename.

    Args:
        prop: key into the config path tables selecting which property to model.
        k_fold: number of CV folds; 1 selects leave-one-out CV instead.
        test_size: fraction of samples held out for the final test score.
    """
    # 0. settings
    set_seed(GLOBAL_SEED)
    cv = k_fold
    # cross-validation generator: k_fold == 1 means leave-one-out
    if cv == 1:
        cv = LeaveOneOut()
    # 1. base learner nets
    knn = KNeighborsRegressor(leaf_size=3, n_neighbors=2, p=1, weights='distance')
    # SVR hyperparameters (C, gamma) are tuned by an inner grid search
    svr = GridSearchCV(SVR(), param_grid={"C": np.logspace(0, 2, 4), "gamma": np.logspace(-2, 2, 7)}, n_jobs=-1)
    ridge = RidgeCV(alphas=(0.1, 1.0, 10.0, 100.0))
    mlp = MLPRegressor(hidden_layer_sizes=(50, 100, 50), max_iter=700)
    rf = RandomForestRegressor()
    gbdt = GradientBoostingRegressor()
    # 2. meta model net
    metal_model = RidgeCV(alphas=(0.1, 1.0, 10.0, 100.0))
    # 3. stacking model
    stacking_model = StackingRegressor(
        estimators=[('KNN', knn), ('SVR', svr), ('Ridge', ridge), ('MLP', mlp), ('RF', rf), ('GBDT', gbdt)],
        final_estimator=metal_model,
        n_jobs=-1,
        cv=cv  # cross validation
    )
    # 4. load data
    x, y = loadXY(config.data_load_path[prop])
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, shuffle=True)
    # 5. train model (the stacking model cross-validates internally)
    stacking_model.fit(x_train, y_train)
    # val-scores (negated-error scorers, so these means are <= 0)
    result = cross_validate(stacking_model, x_train, y_train, scoring=['neg_mean_absolute_error','neg_mean_squared_error','r2'], cv=cv)
    mae_val = result['test_neg_mean_absolute_error'].mean()
    mse_val = result['test_neg_mean_squared_error'].mean()
    r2_val = result['test_r2'].mean()
    # test-score on the held-out split (.mean() on a scalar is a no-op)
    pred = stacking_model.predict(x_test)
    mae_test = sklearn.metrics.mean_absolute_error(y_test, pred).mean()
    mse_test = sklearn.metrics.mean_squared_error(y_test, pred).mean()
    r2_test = sklearn.metrics.r2_score(y_test, pred).mean()
    # show (the printed labels are Chinese for "validation set" / "test set")
    print("验证集: MAE:%f, MSE:%f, R2:%f\n"
          "测试集: MAE:%f, MSE:%f, R2:%f" % (mae_val, mse_val, r2_val, mae_test, mse_test, r2_test))
    # 7. save model — one file per calendar month
    month_once_save_name = time.strftime('%Y-%m.pkl', time.localtime())
    save_path = os.path.join(config.model_save_path[prop], month_once_save_name)
    file_util.save_model(stacking_model, save_path)
def reg_ensemble_4(self):
    """Regressors Ensemble.

    Stack the linear, random-forest and lasso base regressors behind a
    random-forest meta-learner and predict on the held-out test set.

    :return: ensemble prediction for ``self.x_test``
    """
    lr, _ = self.linear_regr()
    rf, _ = self.random_forest_regr()
    lasso, _ = self.lasso_regr()
    # NOTE(review): cv=200 requests 200-fold CV inside the stack — very
    # expensive and only valid with >= 200 training samples; confirm intended.
    ensemble = StackingRegressor(
        estimators=[("lr", lr), ("rf", rf), ("lasso", lasso)],
        final_estimator=RandomForestRegressor(),
        cv=200,
        n_jobs=-1,
    )
    ensemble.fit(self.x_train, self.y_train)
    return ensemble.predict(self.x_test)
def Stacked_Ensemble(x_train, x_test, y_train, y_test):
    """Fit a five-model stacking regressor with a linear meta-learner,
    print classification-style metrics on the rounded predictions, and
    return ``(y_test, y_pred)``."""
    # Base models (commented classifier variants from the original removed).
    base_models = [
        ('lr', LinearRegression()),
        ('knn', KNeighborsRegressor()),
        ('cart', DecisionTreeRegressor()),
        ('svm', SVR()),
        ('adaboost', AdaBoostRegressor()),
    ]
    # Meta learner blending the base predictions.
    meta_model = LinearRegression()
    model = StackingRegressor(estimators=base_models, final_estimator=meta_model, cv=5)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    rounded = y_pred.round()
    print("\n\n(Stacked Ensemble) Confusion Matrix: \n",
          confusion_matrix(y_true=y_test, y_pred=rounded))
    print("(Stacked Ensemble) Report: \n", classification_report(y_test, rounded))
    print("(Stacked Ensemble) Accuracy: \n", accuracy_score(y_test, rounded))
    return y_test, y_pred
def init_stacking(train_scaled, test_scaled, target, test_id):
    """Fit (or load a cached) stacking regressor, predict on the scaled
    test set, undo the log-transform of the target, and write the
    submission CSV."""
    model_path = 'Data/pickles/models/pancake_stack'
    if not os.path.isfile(model_path):
        estimators = [
            ('rfr', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                                          max_depth=5, max_features='auto',
                                          max_leaf_nodes=None, max_samples=None,
                                          min_impurity_decrease=0.0, min_impurity_split=None,
                                          min_samples_leaf=4, min_samples_split=2,
                                          min_weight_fraction_leaf=0.0, n_estimators=700,
                                          n_jobs=None, oob_score=True, random_state=None,
                                          verbose=3, warm_start=False)),
            ('xgboost', XGBRegressor(learning_rate=0.08, max_depth=3, n_estimators=500,
                                     n_jobs=-1, reg_alpha=0.001, reg_lambda=1, verbosity=2)),
            ('svr', SVR(C=5, cache_size=200, coef0=0.0, degree=1, epsilon=0.01,
                        gamma='auto', kernel='poly', max_iter=-1, shrinking=True,
                        tol=0.001, verbose=3)),
            ('lgbm', LGBMRegressor(boosting_type='gbdt', lambda_l1=0, lambda_l2=0.1,
                                   learning_rate=0.1, max_depth=0, num_leaves=10))
        ]
        stack = StackingRegressor(estimators=estimators,
                                  final_estimator=LassoCV(cv=5), verbose=3)
        stack.fit(train_scaled, target)
        # BUG FIX: the dump must happen INSIDE the `with` block — the
        # original body was `pass` and then dumped to the already-closed
        # handle, which raises (or silently never persists the model).
        with open(model_path, 'wb') as file:
            pickle.dump(stack, file)
    else:
        with open(model_path, 'rb') as file:
            stack = pickle.load(file)
    y_pred = stack.predict(test_scaled)
    # Target was trained in log space; exp restores original units
    # (presumably — confirm against the training pipeline).
    y_pred = np.exp(y_pred)
    submission_df = pd.DataFrame(y_pred, index=test_id, columns=['SalePrice'])
    submission_df.to_csv('Data/Submission/S6.csv')
def reg_ensemble_5(self):
    """Regressors Ensemble.

    Stack the linear and random-forest regressors behind a random-forest
    meta-learner, cross-validated with a time-series split so training
    folds always precede their validation folds.

    :return: ensemble prediction for ``self.x_test``
    """
    # (removed the unused `param` grid dict — it suggested a grid search
    # that was never performed)
    lr, lr_pred = self.linear_regr()
    rf, rf_pred = self.random_forest_regr()
    estimators = [
        ("lr", lr),
        ("rf", rf)
    ]
    # gap=20 leaves 20 samples between each train fold and its test fold.
    tss = TimeSeriesSplit(gap=20, max_train_size=None, n_splits=10, test_size=None)
    reg = StackingRegressor(estimators=estimators,
                            final_estimator=RandomForestRegressor(),
                            cv=tss,
                            n_jobs=-1)
    reg.fit(self.x_train, self.y_train)
    return reg.predict(self.x_test)
def reg_ensemble_2(self):
    """Regressors Ensemble.

    Stack the lasso, linear and random-forest regressors behind a
    random-forest meta-learner with 5-fold CV.

    :return: ensemble prediction for ``self.x_test``
    """
    lr, lr_pred = self.linear_regr()
    rf, rf_pred = self.random_forest_regr()
    lasso, lasso_pred = self.lasso_regr()
    # (removed the unused `lor = LogisticRegression()` instantiation —
    # it was never added to the ensemble)
    estimators = [
        ("lasso", lasso),
        ("lr", lr),
        ("rf", rf)
    ]
    reg = StackingRegressor(estimators=estimators,
                            final_estimator=RandomForestRegressor(),
                            cv=5,
                            n_jobs=-1)
    reg.fit(self.x_train, self.y_train)
    return reg.predict(self.x_test)
n_train = int(round(X.shape[0] * train_prct)) ## Models knn = KNeighborsRegressor(n_neighbors=5) svm = SVR() rf = RandomForestRegressor(n_estimators=100, criterion='mse', random_state=0) decision_tree = DecisionTreeRegressor(max_depth=3, max_features=2) bayesian_ridge = BayesianRidge() base_models = [("KNN", knn), ("SVM", svm), ("DecisionTree", decision_tree), ("RandomForest", rf)] ## Fit stacked_learner = StackingRegressor(base_models, cv=N_FOLDS) stacked_learner = stacked_learner.fit(X[:n_train], Y[:n_train]) y_pred_test = stacked_learner.predict(X[n_train:]) residuals_stacked = Y[n_train:] - y_pred_test residuals_stacked_train = Y[:n_train] - stacked_learner.predict(X[:n_train]) adaboost = AdaBoostRegressor(n_estimators=100, loss="square", random_state=0) adaboost = adaboost.fit(X[:n_train], Y[:n_train]) y_pred_test = adaboost.predict(X[n_train:]) residuals_adaboost = Y[n_train:] - y_pred_test residuals_adaboost_train = Y[:n_train] - adaboost.predict(X[:n_train]) ## Predict on entire dataset y_pred = stacked_learner.predict(X) df = pd.DataFrame.from_dict({ "state": data.state, "population": data.population, "value": y_pred
max_features='sqrt', max_depth=5, oob_score=True)), ] stack = StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor( n_estimators=1400, min_samples_split=2, min_samples_leaf=2, max_features='sqrt', max_depth=5, oob_score=True)) stack.fit(Xtrainv, ytrainv) stack_train_pred = stack.predict(Xtrainv) stack_val_pred = stack.predict(Xtestv) stack_test_pred = stack.predict(Xtest) stack_train_mse = mean_squared_error(ytrainv, stack_train_pred) stack_val_mse = mean_squared_error(ytestv, stack_val_pred) stack_test_mse = mean_squared_error(ytest, stack_test_pred) print("RMSE using StackRegressor:\t{}\t{}\t{}\n".format( np.sqrt(stack_train_mse), np.sqrt(stack_val_mse), np.sqrt(stack_test_mse))) df_rf = pd.DataFrame({'Actual': ytest, 'Predicted': stack_test_pred}) fig1 = pp.figure(figsize=(8, 6)) df_rf.head(n=300).plot() pp.legend() pp.title("StackRegressor Actual v/s Predicted Annual Rainfall")
#stacking regressor from sklearn.linear_model import LinearRegression from sklearn.neighbors import KNeighborsRegressor from sklearn.ensemble import StackingRegressor estimator = [('RF', RandomForestRegressor(random_state=4, n_estimators=700, min_samples_leaf=3, max_features=7, min_samples_split=15, warm_start=True)), ('KNN', KNeighborsRegressor(n_neighbors=7))] ms = StackingRegressor(estimators=estimator, final_estimator=LinearRegression()).fit(TrainX, TrainY) Testpred = ms.predict(TestX) Testpred = np.exp(Testpred) STRMSE = np.sqrt(np.mean((TestY - Testpred)**2)) #out FinalTest.drop(["Upvotes_log"], axis=1, inplace=True) finalpred = ms.predict(FinalTest) finalpred = np.exp(finalpred) submission = pd.DataFrame({"ID": ids, "Upvotes": finalpred}) submission.to_csv("uppy03logtrGCVST.csv", index=False) #grid search #GridX,DX, GridY, DY=train_test_split(TrainAll[featuresnames], depdnt, train_size=0.10, random_state=4) # #from sklearn.model_selection import GridSearchCV #parameters={"n_estimators":range(100,800,100), "min_samples_leaf":range(1,20,2), "min_samples_split":range(5,20,5), "max_features":range(1,5,1)}
성능을 극으로 끌어올릴 때 활용한다. 과대적합을 유발할 수 있다.(특히 데이터셋이 적은 경우) 시간이 많이 소요된다. """ from sklearn.ensemble import StackingRegressor stack_models = [ ('elasticnet', poly_pipeline), ('randomforest', rfr), ('gbr', gbr), ('lgbm', lgbm), ] stack_reg = StackingRegressor(stack_models, final_estimator=xgb, n_jobs=-1) stack_reg.fit(x_train, y_train) stack_pred = stack_reg.predict(x_test) mse_eval('Stacking Ensemble', stack_pred, y_test) ## Weighted Blending """ 각 모델의 예측값에 대하여 weight(가중치)를 곱하여 최종 output 계산 모델에 대한 가중치를 조절하여, 최종 output을 산출한다. 가중치의 합은 1.0이 되도록 한다. """ final_outputs = { 'elasticnet': poly_pred, 'randomforest': rfr_pred, 'gbr': gbr_pred, 'xgb': xgb_pred, 'lgbm': lgbm_pred,
# save the model to disk
filename = 'ensemble_model'
# Use a context manager so the handle is flushed and closed — the original
# passed an anonymous open() handle to pickle, leaking the file object.
with open(filename, 'wb') as fh:
    pickle.dump(model, fh)

# In[78]:
# load the model from disk
with open(filename, 'rb') as fh:
    loaded_model = pickle.load(fh)
yhat = loaded_model.predict(test_X)

# In[79]:
# NOTE(review): this overwrites the loaded model's predictions with the
# in-memory model's — they should be identical; confirm which is wanted.
yhat = model.predict(test_X)

# In[80]:
# Model evaluation on the held-out test split
print('R^2:', metrics.r2_score(test_y, yhat))
#print('Adjusted R^2:',1 - (1-metrics.r2_score(train_y, y_pred))*(len(train_y)-1)/(len(train_y)-train_X.shape[1]-1))
print('MAE:', metrics.mean_absolute_error(test_y, yhat))
print('MAPE:', mean_absolute_percentage_error(test_y, yhat))
print('MSE:', metrics.mean_squared_error(test_y, yhat))
print('RMSE:', np.sqrt(metrics.mean_squared_error(test_y, yhat)))

# In[81]:
# Stack an MLP and a deep random forest; cv/final_estimator are left at
# scikit-learn defaults (5-fold CV, RidgeCV meta-learner).
rf = RandomForestRegressor(n_jobs=-1, max_depth=75, n_estimators=900, random_state=0)
mlp = MLPRegressor(hidden_layer_sizes=(100, 60, 40, 20), activation='relu', solver='lbfgs', alpha=0.0001, verbose=False, max_iter=400)
stacking = StackingRegressor(estimators=[("mlp", mlp), ("randomForest", rf)], n_jobs=-1)
stacking.fit(X_train, y_train)
y_pred = stacking.predict(X_test)
# Clamp negative predictions to zero — presumably the target is a
# non-negative count; confirm against the dataset.
for i, y in enumerate(y_pred):
    if y_pred[i] < 0:
        y_pred[i] = 0
merged_pred = []
merged_pred.append(pd.Series(y_pred, name='pred_rf' + str(1)))
# Reload train/test data and rebuild the splits for the next stage.
df_test = Reader.read_data('test.csv')
df_train = Reader.read_data('train.csv')
X, y = Reader.select_train_columns(df_train)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
df_test = Reader.select_train_columns(df_test)[0]
# Add an all-zero dummy so the test columns line up with training —
# assumes 'weather_4' never occurs in test.csv; TODO confirm.
df_test['weather_4'] = 0
y_pred_et = regr_et.predict(x_val_scaled)
rmse_et = np.sqrt(mean_squared_error(y_val_scaled, y_pred_et))
all_rmse.iloc[i - 1, 3] = rmse_et

# Gradient Boosting
regr_gbr = GradientBoostingRegressor(n_estimators=100, random_state=0)
# BUG FIX: every block below called `regr.fit(...)` (copy-paste), leaving
# its own model unfitted while predicting with it; fit the model that is
# actually used for prediction.
regr_gbr.fit(x_train_scaled, y_train_scaled)
y_pred_gbr = regr_gbr.predict(x_val_scaled)
rmse_gbr = np.sqrt(mean_squared_error(y_val_scaled, y_pred_gbr))
all_rmse.iloc[i - 1, 4] = rmse_gbr

# Stacking
estimators = [('lr', RidgeCV()), ('svr', LinearSVR(random_state=0))]
regr_sr = StackingRegressor(estimators=estimators,
                            final_estimator=RandomForestRegressor(n_estimators=100, random_state=0))
regr_sr.fit(x_train_scaled, y_train_scaled)
y_pred_sr = regr_sr.predict(x_val_scaled)
rmse_sr = np.sqrt(mean_squared_error(y_val_scaled, y_pred_sr))
all_rmse.iloc[i - 1, 5] = rmse_sr

# Voting
r1 = LinearRegression()
r2 = RandomForestRegressor(n_estimators=100, random_state=0)
regr_vr = VotingRegressor([('lr', r1), ('rf', r2)])
regr_vr.fit(x_train_scaled, y_train_scaled)
y_pred_vr = regr_vr.predict(x_val_scaled)
rmse_vr = np.sqrt(mean_squared_error(y_val_scaled, y_pred_vr))
all_rmse.iloc[i - 1, 6] = rmse_vr

# Histogram-based Gradient Boosting
regr_hgbr = HistGradientBoostingRegressor(random_state=0)
regr_hgbr.fit(x_train_scaled, y_train_scaled)
def _BuildRegrModel(self, y, X):
    """Train an ensemble regression model and assess its performance.

    Start by splitting the y and X to train and test samples. Then create
    three regressors, namely a Random Forest, a Ridge and a SVM regressor
    and tune their hyperparameters using random search with cross
    validation. After updating their hyperparameters stack the three
    regressors using an ElasticNET linear regression model and fit the
    ensemble model to the train sample. Finally, calculate its performance
    using the test sample and return both the ensemble model and the
    calculated metrics.

    Arguments:
        y {numpy.ndarray} -- The response variable (i.e. the LST data)
        X {numpy.ndarray} -- The explanatory variables (i.e. the LST predictors)

    Returns:
        sklearn.ensemble._stacking.StackingRegressor -- The ensemble regression model
        tuple -- A tuple with the regression performance metrics
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=self.regr_test_size, random_state=self.SEED)
    # Base learners; the names double as keys into hyperparam_distributions.
    regressors = [
        ("random forest", RandomForestRegressor(random_state=self.SEED, n_jobs=self.N_JOBS)),
        ("ridge", Ridge(random_state=self.SEED)),
        ("svr", SVR()),
    ]
    # Random-search distributions per base learner.
    hyperparam_distributions = {
        "random forest": {
            "max_depth": stats.randint(5, 100),
            "n_estimators": stats.randint(30, 800),
            "min_samples_leaf": stats.randint(2, 20),
            "min_samples_split": stats.randint(2, 50),
        },
        "svr": {
            "kernel": ["rbf", "poly", "sigmoid", "linear"],
            "degree": stats.randint(2, 7),
            "epsilon": stats.uniform(0.05, 5.0),
            "C": stats.uniform(0.0, 25.0),
        },
        "ridge": {
            "alpha": stats.uniform(0.0001, 1.0)
        },
    }
    for name, regressor in regressors:
        print(f"{f' Tuning the {name} hyperparameters...':<50}", end="")
        # Tune on the training split only, scored by R^2.
        hyperparam_candidates = RandomizedSearchCV(
            regressor,
            param_distributions=[hyperparam_distributions[name]],
            scoring="r2",
            random_state=self.SEED,
            n_jobs=self.N_JOBS,
            n_iter=self.N_RANDOM_SEARCHES,
            verbose=0,
        ).fit(X_train, y_train)
        print(
            f"Done [CV R2 score = {hyperparam_candidates.best_score_:0.2f}]"
        )
        # Push the winning hyperparameters back into the (still unfitted)
        # base learner; the StackingRegressor below refits them with CV.
        regressor.set_params(**hyperparam_candidates.best_params_)
    ensemble_regressor = StackingRegressor(
        regressors,
        final_estimator=ElasticNetCV(
            l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1.0],
            cv=10,
            n_jobs=self.N_JOBS,
            random_state=self.SEED,
        ),
        n_jobs=self.N_JOBS,
        passthrough=True,  # meta-learner also sees the raw predictors
    )
    try:
        ensemble_regressor.fit(X_train, y_train)
    except ValueError as err:
        raise ValueError(
            f"Error in _BuildRegrModel: Unable to fit ensemble regression model. {err}"
        )
    # Assess the model performance using the test data
    y_pred = ensemble_regressor.predict(X_test)
    #y_pred = regressors[1][1].predict(X_test)
    regr_metrics = (
        metrics.r2_score(y_test, y_pred),
        metrics.explained_variance_score(y_test, y_pred),
        metrics.max_error(y_test, y_pred),
        metrics.mean_absolute_error(y_test, y_pred),
        metrics.mean_squared_error(y_test, y_pred),
        metrics.median_absolute_error(y_test, y_pred),
    )
    return ensemble_regressor, regr_metrics
from sklearn.linear_model import Lasso

# Stack a random forest, CatBoost and XGBoost under a Lasso meta-learner.
estimators = [('forest', RandomForestRegressor(n_estimators=500, random_state=42)), ('lr', CatBoostRegressor(100)), ('xgb', xgboost.XGBRegressor(350))]
reg = StackingRegressor(estimators=estimators, final_estimator=Lasso())
reg.fit(X_train, y_train)
from sklearn.metrics import mean_squared_error
# NOTE(review): this scores `model` (defined elsewhere), not the `reg`
# stack fitted just above — possibly meant reg.predict(X_test); confirm.
mean_squared_error(model.predict(X_test), y_test)
submission = sample.copy()
# Fill the 'PE' target column of the submission frame from the stack.
submission['PE'] = reg.predict(test.drop(columns=['PE']))
submission.to_csv('sklearn_stack4.csv', index=True)
# Early stopping for the Keras model trained below.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto', restore_best_weights=True)
history2 = model2.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=1000, callbacks=[monitor])
from keras.layers import Dense, Dropout
verbose=False, max_iter=400) rf = RandomForestRegressor(n_jobs=-1, max_depth=25, n_estimators=900, random_state=0) # adaknn = AdaBoostRegressor(base_estimator=knn, random_state=0, n_estimators=9) bagdt = BaggingRegressor(base_estimator=dt, n_estimators=300, random_state=0) # rf.fit(X_train,y_train) # pred=rf.predict(X_test) # -------------------- Stacking voting ----------------------------- stacking = StackingRegressor(estimators=[('bagdt', bagdt), ("mlp", mlp), ("randomForest", rf)], n_jobs=-1) stacking.fit(X, y) y_pred_stacking = stacking.predict(df_test) print(y_pred_stacking) # ------------------ Predict the registered ones ------------------------- # knn = KNeighborsRegressor(n_jobs=-1, n_neighbors=2, weights='distance', p=1) dt = DecisionTreeRegressor(random_state=0) mlp = MLPRegressor(hidden_layer_sizes=(100, 60, 40, 20), activation='relu', solver='lbfgs', alpha=0.0001, verbose=False, max_iter=400) rf = RandomForestRegressor(n_jobs=-1, max_depth=25, n_estimators=900, random_state=0)
# Three linear/kernel base learners blended by a gradient-boosting meta-learner.
estimators = [('ridge', RidgeCV()), ('lasso', LassoCV(random_state=42)), ('svr', SVR(C=1, gamma=1e-6))]
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
reg = StackingRegressor(
    estimators=estimators,
    final_estimator=GradientBoostingRegressor(random_state=42))
# NOTE(review): load_boston was removed in scikit-learn 1.2 — this import
# only works on older scikit-learn versions.
from sklearn.datasets import load_boston
X, y = load_boston(return_X_y=True)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# Visual check: first 30 targets vs. stacked predictions.
plt.figure()
plt.plot(y_test[:30], 'gd', label='Original')
plt.plot(y_pred[:30], 'b^', label='Stacking Regressor')
plt.show()
from sklearn.metrics import r2_score
print('R2 score: {:.2f}'.format(r2_score(y_test, y_pred)))
#For multiple stacking layres
final_layer = StackingRegressor(estimators=[
    ('rf', RandomForestRegressor(random_state=42)),
    ('gbrt', GradientBoostingRegressor(random_state=42))
], final_estimator=RidgeCV())
import xgboost from catboost import CatBoostRegressor knnr_scld = (make_pipeline(StandardScaler(), estimators = [ ('forest',RandomForestRegressor(n_estimators=1000,random_state=42)), ('lr', CatBoostRegressor(120)), ('xgb', xgboost.XGBRegressor(750)) ] reg = StackingRegressor( estimators=estimators, final_estimator=) reg.fit(X_train, y_train) from sklearn.metrics import mean_squared_error mean_squared_error(reg.predict(X_test), y_test) submission = sample.copy() submission['PE'] = reg.predict(test.drop(columns = ['PE'])) submission.to_csv('sklearn_stack2.csv',index = True)
#Step 1:Loading data X, y = load_boston(return_X_y=True) #Step 2:Split data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40) #step3:Training regression = StackingRegressor(estimators=[ ('knn', KNeighborsRegressor(n_neighbors=4, weights='distance', leaf_size=1, metric='manhattan')), ('dt', GradientBoostingRegressor(max_depth=3, n_estimators=220)) ], final_estimator=Ridge(random_state=40), cv=5, n_jobs=-1) regression.fit(X_train, y_train) score_train = regression.score(X_train, y_train) score_test = regression.score(X_test, y_test) pred_train = regression.predict(X_train) pred_test = regression.predict(X_test) rmse_train = np.sqrt(metrics.mean_squared_error(pred_train, y_train)) rmse_test = np.sqrt(metrics.mean_squared_error(pred_test, y_test)) print('RMSE:{:.2f}/{:.2f}'.format(rmse_train, rmse_test)) print('R2Score:{:.2f}/{:.2f}'.format(score_train, score_test))
# Binary era flags derived from the build year.
df['IsNew'] = df.YearBuilt.apply(lambda x: 1 if x > 2000 else 0)
df['IsOld'] = df.YearBuilt.apply(lambda x: 1 if x < 1946 else 0)
df.drop('MiscFeature', axis=1, inplace=True)
# ------------------------------- #
# Composite features: age at sale, bathroom counts, total rooms.
df['Age'] = df['YrSold'] - df['YearBuilt']
df['BsmtTotalBathRooms'] = df['BsmtFullBath'] + df['BsmtHalfBath']
df['AbvGradeTotalBathRooms'] = df['FullBath'] + df['HalfBath']
df['Total Rooms'] = df['BedroomAbvGr'] + df['BsmtFullBath'] + df['BsmtHalfBath'] + df['FullBath'] + df['HalfBath'] \
    + df['TotRmsAbvGrd'] + df['KitchenAbvGr']
stack.fit(X, y)
# BUG FIX: the original scaled the test frame TWICE —
# `test = scale.fit_transform(...)` followed by
# `stack.predict(scale.fit_transform(test))` — and re-fit the scaler on
# test data (leakage). Transform once with the already-fitted scaler and
# predict on that.
test = scale.transform(df[Importances.nlargest(int(best_col)).index])
pred = stack.predict(test)
sub['SalePrice'] = pred
sub.to_csv('submission_2.csv', index=False)
# the score is around RMSE(0.3400) on Kaggle
# ------- Plot best cols ------- #
plt.figure(figsize=(20, 15))
Importances.nlargest(int(best_col)).plot(kind='barh')
plt.show()
Y = vectorizedData[:, -1] X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0) baseModels = [('ridgeRegressor', linear_model.Ridge(alpha=0.01)), ('randomForestRegressor', RandomForestRegressor(max_depth=10, random_state=0, n_estimators=15, max_features=0.5)), ('supportVectorRegressor', svm.SVR(C=10, epsilon=0.5))] stackedRegressor = StackingRegressor(estimators=baseModels) stackedRegressor.fit(X_train, Y_train) trainingError = np.mean((stackedRegressor.predict(X_train) - Y_train)**2) print("Training Error: %.6f" % trainingError) Y_predict_unscaled = stackedRegressor.predict(X_test) testingError = np.mean((Y_predict_unscaled - Y_test)**2) print("Testing Error: %.6f" % testingError) meanScore = np.mean(imdbScores) standDeviation = np.std(imdbScores) Y_predict = Y_predict_unscaled * standDeviation + meanScore errorsAllowed = [ 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0, 1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4, 1.45, 1.5 ] predictionAccuracyList = [] for errorAllowed in errorsAllowed: numRightPredicted = 0
n_estimators=1000, max_depth=7, min_child_weight=1, gamma=0, subsample=0.95, colsample_bytree=0.55, reg_alpha=0.00001, nthread=1, seed=0) reg = StackingRegressor(estimators=estims, final_estimator=f_estimator, passthrough=True) reg.fit(X_train_betta_f, y_train_betta) y_train_pred_betta = reg.predict(X_train_betta_f) y_test_pred_betta = reg.predict(X_test_betta_f) r2_train_betta = r2_score(y_train_betta, y_train_pred_betta) rmse_train_betta = mean_squared_error(y_train_betta, y_train_pred_betta, squared=False) print("R2: {0:.3f}, RMSE: {1:.5f}".format( r2_train_betta, rmse_train_betta)) # R2: 0.992, RMSE: 0.00543 # на прошлом стэкинге было R2: 0.978, RMSE: 0.00904 r2_test_betta = r2_score(y_test_betta, y_test_pred_betta) rmse_test_betta = mean_squared_error(y_test_betta, y_test_pred_betta, squared=False)
plt.title("Voting Ensemble Regression")
plt.legend()
plt.show()

# Heterogeneous Ensembles(Stacking)
models = [("LR", lr), ("DT", regr_tree), ("SVR", svr)]
# instead of choosing model weights, stacking uses a meta learner
# models training happens twice. once for base models, once for meta learner
meta_learner_reg = LinearRegression()
s_reg = StackingRegressor(estimators=models, final_estimator=meta_learner_reg)
# y_train[:, 0]: fit on the first target column only
s_reg.fit(x_train, y_train[:, 0])
y_pred = s_reg.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")
# Plot stacking regression prediction line over data:
# 100 evenly spaced points across the scaled training range.
x_domain = np.linspace(min(x_train), max(x_train), 100)
# Undo the feature/target scaling so the plot is in original units.
y_pred_rescaled = y_scaler.inverse_transform(s_reg.predict(x_domain))
x_rescaled = x_scaler.inverse_transform(x_domain)
plt.figure()
plt.scatter(X, y)
plt.plot(x_rescaled, y_pred_rescaled, color='red', label='predictions')
plt.xlabel("LotArea in m$^2$")
plt.ylabel("SalePrice in ZAR")
plt.title("Stacking Ensemble Regression")