def stacking(X, y, k_cv):
    """Cross-validate a KernelRidge + SVR + ElasticNet stacking regressor.

    A shuffled ``k_cv``-fold split (``random_state=0``) is drawn over ``y``.
    For every fold the stack is refit on the training part, the square root
    of its training score is printed, and the square root of ``r_2`` on the
    held-out part is printed and collected.

    :param X: 2-D array indexable as ``X[index_array, :]``
    :param y: 1-D array indexable by an index array
    :param k_cv: number of folds
    :return: mean of the collected per-fold values (the function used to
        return nothing, so existing callers are unaffected)
    """
    estimators = [
        ('krr', KernelRidge(kernel="cosine", alpha=0.001)),
        ('svr', SVR(C=2000, gamma=0.001)),
        ("enet", ElasticNet(alpha=0.00001, l1_ratio=0.0005, max_iter=10000)),
    ]
    reg = StackingRegressor(estimators=estimators, n_jobs=15,
                            final_estimator=LinearRegression())
    kfold = KFold(n_splits=k_cv, shuffle=True, random_state=0)

    res = []
    # Iterate the splitter directly instead of calling __next__() k_cv times.
    for trainval_index, test_index in kfold.split(y):
        X_trainval, y_trainval = X[trainval_index, :], y[trainval_index]
        X_test, y_test = X[test_index, :], y[test_index]

        reg.fit(X_trainval, y_trainval)
        print((reg.score(X_trainval, y_trainval))**0.5)

        test_pre = reg.predict(X_test)
        # r_2 is defined elsewhere (presumably an R^2 helper); call it once
        # per fold rather than once for printing and once for storing.
        fold_score = r_2(y_test, test_pre) ** 0.5
        print("accuracy: ", fold_score)
        res.append(fold_score)
        # Running mean over the folds seen so far.
        print("mean acacuracy: ", np.array(res).mean())

    mean_score = np.array(res).mean()
    print("mean acacuracy: ", mean_score)
    return mean_score
def reg_ensemble_1(self):
    """
    Regressors Ensemble: lasso, linear regression and random forest stacked
    under a random-forest final estimator.

    The base models come from ``self.linear_regr()``,
    ``self.random_forest_regr()`` and ``self.lasso_regr()``; only the model
    half of each returned pair is used.

    :return: ensemble prediction for ``self.x_test``
    """
    linear_model, _ = self.linear_regr()
    forest_model, _ = self.random_forest_regr()
    lasso_model, _ = self.lasso_regr()

    base_models = [
        ("lasso", lasso_model),
        ("lr", linear_model),
        ("rf", forest_model),
    ]
    stack = StackingRegressor(
        estimators=base_models,
        final_estimator=RandomForestRegressor(),
        n_jobs=-1,
    )
    stack.fit(self.x_train, self.y_train)
    return stack.predict(self.x_test)
def lvl2_xgb_randomsearch(rawdf, results_dir, pp_choice, param_dir, passthrough, final_pp_choice=None):
    """Randomised search over the XGBoost final estimator of a stacking model.

    Level-1 models (random forest, extra trees, XGBoost) are configured with
    the best parameters found in a previous search, wrapped in a
    preprocessing pipeline each, and stacked. ``RandomizedSearchCV`` then
    tunes only the final XGBoost estimator, and ``cv_results_`` is pickled
    to ``<results dir>/results_store.pkl`` under the key ``'lvl2_xgb'``.

    :param rawdf: DataFrame whose last column is the target and whose other
        columns are the features
    :param results_dir: passed to ``create_results_directory``; the returned
        directory receives the pickle
    :param pp_choice: argument for ``pp_selector`` (level-1 preprocessing)
    :param param_dir: path of the pickled level-1 search results; values must
        be convertible to a DataFrame with ``mean_test_score`` and ``params``
    :param passthrough: forwarded to ``StackingRegressor``; when truthy the
        final estimator is a Pipeline that preprocesses the passed-through
        features before XGBoost
    :param final_pp_choice: argument for ``pp_selector`` used by the final
        preprocessing step (only read when ``passthrough`` is truthy)
    """
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]

    model_store = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }

    # NOTE: unpickling runs arbitrary code; param_dir must be a trusted file.
    with open(param_dir, 'rb') as f:
        model_results = pickle.load(f)
    model_results = {k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False)
                     for k, v in model_results.items()}

    # After sorting, the best candidate is the first row by POSITION.
    # `.loc[0]` would select index label 0, i.e. the first candidate tried.
    # Stored keys look like '<step>__<param>'; the step prefix is stripped so
    # the parameter can be set on the bare estimator.
    model_object = {
        k: model_object[k].set_params(
            **{kk.split('__')[1]: vv for kk, vv in v['params'].iloc[0].items()})
        for k, v in model_results.items()
    }

    preprocess_pipeline = pp_selector(pp_choice)
    lvl1_pipeline = [
        (model_name,
         Pipeline([
             ('preprocess', preprocess_pipeline),
             (model_name, model_object[model_name])
         ]))
        for model_name in model_store
    ]

    if passthrough:
        final_est = Pipeline([
            ('final_preprocess', final_est_pipeline(feature_names=x_train.columns.tolist(),
                                                    preprocess_pipeline=pp_selector(final_pp_choice),
                                                    no_of_lvl1=len(lvl1_pipeline))),
            ('debugger', DebuggerTransformer(info='final')),
            ('final_est', XGBRegressor())
        ])
        # XGBoost lives in the 'final_est' step of the final Pipeline.
        prefix = 'final_estimator__final_est__'
    else:
        final_est = XGBRegressor()
        # Bare XGBRegressor: there is no 'final_est' step to address.
        prefix = 'final_estimator__'

    final_estimator_params = {
        f'{prefix}n_estimators': scipy.stats.randint(150, 1000),
        f'{prefix}learning_rate': scipy.stats.uniform(0.01, 0.59),
        f'{prefix}subsample': scipy.stats.uniform(0.3, 0.6),
        f'{prefix}max_depth': scipy.stats.randint(1, 16),
        f'{prefix}colsample_bytree': scipy.stats.uniform(0.5, 0.4),
        f'{prefix}min_child_weight': [1, 2, 3, 4],
        f'{prefix}gamma': scipy.stats.expon(scale=0.05),
    }

    est = StackingRegressor(estimators=lvl1_pipeline, final_estimator=final_est, passthrough=passthrough)
    est = RandomizedSearchCV(est,
                             param_distributions=final_estimator_params,
                             cv=5,
                             n_iter=100,
                             scoring=make_scorer(rmsle, greater_is_better=False),
                             verbose=1,
                             n_jobs=-1)
    est.fit(x_train, y_train)

    score = {'lvl2_xgb': est.cv_results_}
    results_dir = create_results_directory(results_dir)
    with open(f'{results_dir}/results_store.pkl', 'wb') as f:
        pickle.dump(score, f)
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params, passthrough):
    """Check predict/transform output shapes, before and after dropping 'lr'."""
    # Scale up front: this avoids convergence warnings without a pipeline,
    # which keeps the passthrough comparison against X_test straightforward.
    X_train, X_test, y_train, _ = train_test_split(
        scale(X_diabetes), y_diabetes, random_state=42
    )
    base_estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
    reg = StackingRegressor(
        estimators=base_estimators,
        final_estimator=final_estimator,
        cv=cv,
        passthrough=passthrough,
    )
    reg.fit(X_train, y_train)

    result = reg.predict(X_test, **predict_params)
    if predict_params:
        # With extra predict params the result is a 2-element sequence.
        assert len(result) == 2

    def check_transform(n_prediction_columns):
        # One column per active base estimator, plus the 10 original
        # features at the end when passthrough is enabled.
        X_trans = reg.transform(X_test)
        n_expected = n_prediction_columns + 10 if passthrough else n_prediction_columns
        assert X_trans.shape[1] == n_expected
        if passthrough:
            assert_allclose(X_test, X_trans[:, -10:])

    check_transform(2)

    reg.set_params(lr='drop')
    reg.fit(X_train, y_train)
    reg.predict(X_test)

    check_transform(1)
def main():
    """Train a stacking regressor on ``dataset/complete.csv`` and print its MAE.

    Steps: drop identifier/flag/index columns, drop rows with missing values,
    convert ``Date`` to an ordinal, discard the country name, robust-scale
    the features, then fit a stack of four regressors on a 75/25 split to
    predict ``new_cases`` and print the mean absolute error on the test part.
    """
    data = pd.read_csv('dataset/complete.csv')
    data.drop(["CountryCode", "RegionName", "RegionCode", "M1_Wildcard"], axis=1, inplace=True)

    # Remove flag and index columns. The names are collected first so the
    # frame is not mutated while iterating over it (the old loop used
    # DataFrame.iteritems, which pandas 2.0 removed, and tried to drop a
    # column twice when its name contained both "flag" and "index").
    unwanted = [col for col in data.columns
                if "flag" in col.lower() or "index" in col.lower()]
    data.drop(columns=unwanted, inplace=True)

    # remove any rows that contain 'nan'
    data.dropna(axis=0, how='any', inplace=True)

    # convert Date to a proleptic Gregorian ordinal (plain integer feature)
    date_series = pd.to_datetime(data['Date'].astype(str), format='%Y-%m-%d')
    data['Date'] = date_series.map(dt.datetime.toordinal)

    # The country name was previously one-hot encoded and every resulting
    # column immediately dropped again; dropping the matching columns
    # directly yields the same frame without the throw-away encoding.
    country_cols = [col for col in data.columns if "countryname" in col.lower()]
    data.drop(columns=country_cols, inplace=True)

    # info() prints by itself and returns None, so it must not be wrapped in print().
    data.info()

    # separate feature and label
    data_feature = data.drop(['ConfirmedCases', 'new_cases', 'ConfirmedDeaths'], axis=1, inplace=False)
    data_label_cases_perDay = data.loc[:, 'new_cases']

    scaler = RobustScaler()
    features = scaler.fit_transform(data_feature)

    X_train, X_test, y_train, y_test = train_test_split(features, data_label_cases_perDay,
                                                        test_size=0.25, random_state=42)

    estimators = [
        ('rfr', RandomForestRegressor(random_state=42, n_estimators=50)),
        ('gbr', GradientBoostingRegressor(random_state=42)),
        ('lsvr', LinearSVR(random_state=42, max_iter=1000)),
        # NOTE(review): recent scikit-learn releases appear to spell this
        # criterion 'absolute_error' — confirm against the installed version.
        ('etr', ExtraTreesRegressor(random_state=42, criterion='mae', n_estimators=50))
    ]
    model = StackingRegressor(
        estimators=estimators,
        final_estimator=ExtraTreesRegressor(random_state=42, n_estimators=50)
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print("MAE: " + str(mae))
def stacking_regressor(estimators, final_estimator, data, labels, args=None):
    """
    Stacking for regression: combine several models to reduce bias.

    :param estimators: list of ``(name, estimator)`` base models
    :param final_estimator: estimator fitted on the base models' predictions
    :param data: training features
    :param labels: training targets
    :param args: optional dict of extra keyword arguments forwarded to
        ``StackingRegressor``; ``None`` (the default) means no extras
    :return: the fitted ``StackingRegressor``
    """
    from sklearn.ensemble import StackingRegressor

    # A None sentinel replaces the shared mutable `{}` default.
    extra_kwargs = {} if args is None else args
    reg = StackingRegressor(estimators=estimators, final_estimator=final_estimator, **extra_kwargs)
    reg.fit(data, labels)
    return reg
def Stacked_Ensemble(x_train, x_test, y_train, y_test):
    """Fit a five-model stacking regressor, report metrics, and pickle it.

    The continuous predictions are rounded before being passed to the
    classification metrics (confusion matrix, report, accuracy). The fitted
    model is saved to ``model/StackedEnsemble.sav``.

    :return: ``(y_test, y_pred)`` with the unrounded predictions
    """
    # Path to save model
    path_to_model = os.path.join("model", "StackedEnsemble.sav")

    # define the base models
    level0 = [
        ('lr', LinearRegression()),
        ('knn', KNeighborsRegressor()),
        ('cart', DecisionTreeRegressor()),
        ('svm', SVR()),
        ('adaboost', AdaBoostRegressor()),
    ]
    # define meta learner model
    level1 = LinearRegression()

    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
    model.fit(x_train, y_train)

    # Predicting
    y_pred = model.predict(x_test)

    # Printing the training results
    print("\n\n(Stacked Ensemble) Confusion Matrix: \n", confusion_matrix(y_true=y_test, y_pred=y_pred.round()))
    print("(Stacked Ensemble) Report: \n", classification_report(y_test, y_pred.round()))
    print("(Stacked Ensemble) Accuracy: \n", accuracy_score(y_test, y_pred.round()))

    # Saving the Model. exist_ok=True covers the "created concurrently" race
    # the old try/except guarded against; any other OSError now propagates
    # instead of being printed and then failing again at open().
    os.makedirs(os.path.dirname(path_to_model), exist_ok=True)
    # The context manager guarantees the file is flushed and closed.
    with open(path_to_model, 'wb') as model_file:
        pickle.dump(model, model_file)

    return y_test, y_pred
def stacking_qtlmas(X_trainval, y_trainval, X_test, y_test):
    """Fit a KernelRidge + SVR + ElasticNet stack and predict ``X_test``.

    The square root of the training score is printed as a quick fit check.

    :param X_trainval: training features
    :param y_trainval: training targets
    :param X_test: features to predict
    :param y_test: unused; kept so the signature stays unchanged for callers
    :return: predictions for ``X_test``
    """
    estimators = [
        ('krr', KernelRidge(kernel="cosine", alpha=0.005)),
        ('svr', SVR(C=2500, gamma=0.001)),
        ("enet", ElasticNet(alpha=0.00001, l1_ratio=0.0005, max_iter=10000)),
    ]
    reg = StackingRegressor(estimators=estimators, n_jobs=15,
                            final_estimator=LinearRegression())
    reg.fit(X_trainval, y_trainval)
    print((reg.score(X_trainval, y_trainval))**0.5)
    return reg.predict(X_test)
def train(prop, k_fold=5, test_size=0.2):
    """Train, evaluate and save a six-model stacking regressor for ``prop``.

    Data is loaded with ``loadXY(config.data_load_path[prop])`` and split
    into train/test parts. The stack is fitted on the training part,
    cross-validated on it, scored on the test part, and finally saved under
    ``config.model_save_path[prop]`` with a ``YYYY-MM.pkl`` file name.

    :param prop: key into ``config.data_load_path`` / ``config.model_save_path``
    :param k_fold: number of CV folds; ``1`` selects leave-one-out
    :param test_size: fraction passed to ``train_test_split``
    """
    # 0. settings
    set_seed(GLOBAL_SEED)
    # cross-validation generator
    cv = LeaveOneOut() if k_fold == 1 else k_fold

    # 1. base learners
    knn = KNeighborsRegressor(leaf_size=3, n_neighbors=2, p=1, weights='distance')
    svr = GridSearchCV(SVR(), param_grid={"C": np.logspace(0, 2, 4), "gamma": np.logspace(-2, 2, 7)}, n_jobs=-1)
    ridge = RidgeCV(alphas=(0.1, 1.0, 10.0, 100.0))
    mlp = MLPRegressor(hidden_layer_sizes=(50, 100, 50), max_iter=700)
    rf = RandomForestRegressor()
    gbdt = GradientBoostingRegressor()

    # 2. meta model
    metal_model = RidgeCV(alphas=(0.1, 1.0, 10.0, 100.0))

    # 3. stacking model
    stacking_model = StackingRegressor(
        estimators=[('KNN', knn), ('SVR', svr), ('Ridge', ridge), ('MLP', mlp), ('RF', rf), ('GBDT', gbdt)],
        final_estimator=metal_model,
        n_jobs=-1,
        cv=cv  # cross validation
    )

    # 4. load data
    x, y = loadXY(config.data_load_path[prop])
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, shuffle=True)

    # 5. train model (the stacking model already cross-validates internally)
    stacking_model.fit(x_train, y_train)

    # validation scores
    result = cross_validate(stacking_model, x_train, y_train,
                            scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'], cv=cv)
    # The neg_* scorers return negated errors; flip the sign so the values
    # printed as MAE/MSE are positive and comparable with the test metrics.
    mae_val = -result['test_neg_mean_absolute_error'].mean()
    mse_val = -result['test_neg_mean_squared_error'].mean()
    r2_val = result['test_r2'].mean()

    # test scores (already scalars, so no .mean() is needed)
    pred = stacking_model.predict(x_test)
    mae_test = sklearn.metrics.mean_absolute_error(y_test, pred)
    mse_test = sklearn.metrics.mean_squared_error(y_test, pred)
    r2_test = sklearn.metrics.r2_score(y_test, pred)

    # show
    print("验证集: MAE:%f, MSE:%f, R2:%f\n"
          "测试集: MAE:%f, MSE:%f, R2:%f" % (mae_val, mse_val, r2_val, mae_test, mse_test, r2_test))

    # 7. save model (one file per calendar month)
    month_once_save_name = time.strftime('%Y-%m.pkl', time.localtime())
    save_path = os.path.join(config.model_save_path[prop], month_once_save_name)
    file_util.save_model(stacking_model, save_path)
def test_stacking_regressor_sparse_passthrough(fmt):
    """Passthrough on a sparse X keeps the raw features, sparsity and format."""
    X_sparse = sparse.coo_matrix(scale(X_diabetes)).asformat(fmt)
    X_train, X_test, y_train, _ = train_test_split(
        X_sparse, y_diabetes, random_state=42
    )

    base_estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
    forest = RandomForestRegressor(n_estimators=10, random_state=42)
    stack = StackingRegressor(
        estimators=base_estimators,
        final_estimator=forest,
        cv=5,
        passthrough=True,
    )
    stack.fit(X_train, y_train)

    X_trans = stack.transform(X_test)
    # The last 10 columns must be the untouched input features.
    assert_allclose_dense_sparse(X_test, X_trans[:, -10:])
    assert sparse.issparse(X_trans)
    assert X_test.format == X_trans.format
def test_stacking_regression():
    """Smoke test: an XGBoost regressor can be stacked, fitted and scored."""
    from sklearn.model_selection import train_test_split
    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import RidgeCV
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.ensemble import StackingRegressor

    X, y = load_diabetes(return_X_y=True)

    base_estimators = [
        ('gbm', xgb.sklearn.XGBRegressor(objective='reg:squarederror')),
        ('lr', RidgeCV()),
    ]
    meta_model = RandomForestRegressor(n_estimators=10, random_state=42)
    reg = StackingRegressor(estimators=base_estimators, final_estimator=meta_model)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # No assertion on the value: the test only checks the calls succeed.
    fitted = reg.fit(X_train, y_train)
    fitted.score(X_test, y_test)
def fit(self, X, y, random_state=None):
    """
    Train ENOLS on the given training set.

    For each of ``self.n_estimators`` rounds a subset of row indices is
    drawn without replacement and three models are fitted on it: a plain
    linear regression, a Theil-Sen regressor and a stacking regressor
    (linear regression stacked under a linear final estimator).

    Parameters
    ----------
    X: an input array of shape (n_sample, n_features)
    y: an array of shape (n_sample,) containing the targets for the
        input examples
    random_state: seed or RandomState, validated with ``check_random_state``

    Return
    ------
    self: the fitted model

    Notes
    -----
    Indices are drawn from a population of 50 or 100, so ``X`` and ``y``
    presumably need at least 100 rows — TODO confirm with callers.
    """
    # Every random draw below goes through this validated RandomState.
    rng = check_random_state(random_state)
    estimators = [('lr', LinearRegression())]

    # NOTE(review): an integer sample_size is replaced by the sampling
    # method name; kept as-is because callers may rely on it.
    if isinstance(self.sample_size, int):
        self.sample_size = 'reservoir_sampling'

    # add all the trained OLS models to these lists
    self.estimators_lr, self.estimators_TSR, self.estimators_enols = [], [], []

    for _ in range(self.n_estimators):
        # Pass the RandomState itself, not the raw seed: with an int seed
        # the raw value restarted the generator each round, so rounds with
        # the same population/sample size drew identical indices.
        samples = sample_without_replacement(
            n_population=rng.choice([50, 100]),
            n_samples=rng.choice([10, 20]),
            random_state=rng,
            method=self.sample_size,
        )
        X_train = np.array([X[idx] for idx in samples])
        y_train = np.array([y[idx] for idx in samples])

        reg = LinearRegression()
        reg.fit(X_train, y_train)

        tsr = TheilSenRegressor()
        tsr.fit(X_train, y_train)

        enol = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())
        enol.fit(X_train, y_train)

        self.estimators_lr.append(reg)
        self.estimators_TSR.append(tsr)
        self.estimators_enols.append(enol)

    return self
def reg_ensemble_4(self):
    """
    Regressors Ensemble: linear regression, random forest and lasso stacked
    under a random-forest final estimator, using ``cv=200``.

    :return: ensemble prediction for ``self.x_test``
    """
    linear_model, _ = self.linear_regr()
    forest_model, _ = self.random_forest_regr()
    lasso_model, _ = self.lasso_regr()

    stack = StackingRegressor(
        estimators=[
            ("lr", linear_model),
            ("rf", forest_model),
            ("lasso", lasso_model),
        ],
        final_estimator=RandomForestRegressor(),
        cv=200,
        n_jobs=-1,
    )
    stack.fit(self.x_train, self.y_train)
    return stack.predict(self.x_test)
def test_stacking_regressor_drop_estimator():
    """A 'drop' entry must behave exactly like leaving the estimator out."""
    # Scale up front to avoid convergence warnings without a pipeline.
    X_train, X_test, y_train, _ = train_test_split(
        scale(X_diabetes), y_diabetes, random_state=42
    )
    with_dropped = [('lr', 'drop'), ('svr', LinearSVR(random_state=0))]
    rf = RandomForestRegressor(n_estimators=10, random_state=42)

    reg = StackingRegressor(
        estimators=[('svr', LinearSVR(random_state=0))],
        final_estimator=rf,
        cv=5,
    )
    reg_drop = StackingRegressor(estimators=with_dropped, final_estimator=rf, cv=5)

    for model in (reg, reg_drop):
        model.fit(X_train, y_train)

    assert_allclose(reg.predict(X_test), reg_drop.predict(X_test))
    assert_allclose(reg.transform(X_test), reg_drop.transform(X_test))
def Stacked_Ensemble(x_train, x_test, y_train, y_test):
    """Fit a five-model stacking regressor and print classification metrics.

    Predictions are rounded before being handed to the confusion matrix,
    classification report and accuracy score.

    :return: ``(y_test, y_pred)`` with the unrounded predictions
    """
    # Base models (level 0) and the meta learner (level 1).
    base_models = [
        ('lr', LinearRegression()),
        ('knn', KNeighborsRegressor()),
        ('cart', DecisionTreeRegressor()),
        ('svm', SVR()),
        ('adaboost', AdaBoostRegressor()),
    ]
    meta_learner = LinearRegression()

    ensemble = StackingRegressor(estimators=base_models, final_estimator=meta_learner, cv=5)
    ensemble.fit(x_train, y_train)

    y_pred = ensemble.predict(x_test)

    # Report the results.
    print("\n\n(Stacked Ensemble) Confusion Matrix: \n",
          confusion_matrix(y_true=y_test, y_pred=y_pred.round()))
    print("(Stacked Ensemble) Report: \n",
          classification_report(y_test, y_pred.round()))
    print("(Stacked Ensemble) Accuracy: \n",
          accuracy_score(y_test, y_pred.round()))

    return y_test, y_pred
def init_stacking(train_scaled, test_scaled, target, test_id):
    """Fit (or load a cached) stacking model and write the submission CSV.

    If ``Data/pickles/models/pancake_stack`` holds a pickled model it is
    loaded; otherwise a stack of random forest, XGBoost, SVR and LightGBM
    with a ``LassoCV`` final estimator is fitted on ``train_scaled`` /
    ``target`` and pickled there. Predictions for ``test_scaled`` are
    passed through ``np.exp`` and written to ``Data/Submission/S6.csv``
    as a ``SalePrice`` column indexed by ``test_id``.
    """
    model_path = 'Data/pickles/models/pancake_stack'

    # An empty file counts as "no cache": the previous version opened the
    # file, did nothing inside the `with`, and only then tried to dump, so
    # zero-byte leftovers may exist and would fail to unpickle.
    have_cache = os.path.isfile(model_path) and os.path.getsize(model_path) > 0

    if not have_cache:
        estimators = [
            ('rfr', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                                          max_depth=5, max_features='auto', max_leaf_nodes=None,
                                          max_samples=None, min_impurity_decrease=0.0,
                                          min_impurity_split=None, min_samples_leaf=4,
                                          min_samples_split=2, min_weight_fraction_leaf=0.0,
                                          n_estimators=700, n_jobs=None, oob_score=True,
                                          random_state=None, verbose=3, warm_start=False)),
            ('xgboost', XGBRegressor(learning_rate=0.08, max_depth=3, n_estimators=500, n_jobs=-1,
                                     reg_alpha=0.001, reg_lambda=1, verbosity=2)),
            ('svr', SVR(C=5, cache_size=200, coef0=0.0, degree=1, epsilon=0.01, gamma='auto',
                        kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=3)),
            ('lgbm', LGBMRegressor(boosting_type='gbdt', lambda_l1=0, lambda_l2=0.1,
                                   learning_rate=0.1, max_depth=0, num_leaves=10))
        ]
        stack = StackingRegressor(estimators=estimators, final_estimator=LassoCV(cv=5), verbose=3)
        stack.fit(train_scaled, target)

        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        # The dump must happen while the file is still open.
        with open(model_path, 'wb') as file:
            pickle.dump(stack, file)
    else:
        with open(model_path, 'rb') as file:
            stack = pickle.load(file)

    y_pred = stack.predict(test_scaled)
    # Invert the log transform (presumably applied to the target upstream).
    y_pred = np.exp(y_pred)

    submission_df = pd.DataFrame(y_pred, index=test_id, columns=['SalePrice'])
    submission_df.to_csv('Data/Submission/S6.csv')
def reg_ensemble_5(self):
    """
    Regressors Ensemble: linear regression and random forest stacked under a
    random-forest final estimator, cross-validated with a time-series split
    (10 splits, gap of 20 samples).

    :return: ensemble prediction for ``self.x_test``
    """
    lr, _ = self.linear_regr()
    rf, _ = self.random_forest_regr()

    estimators = [
        ("lr", lr),
        ("rf", rf)
    ]
    tss = TimeSeriesSplit(gap=20, max_train_size=None, n_splits=10, test_size=None)
    reg = StackingRegressor(estimators=estimators,
                            final_estimator=RandomForestRegressor(),
                            cv=tss,
                            n_jobs=-1)
    reg.fit(self.x_train, self.y_train)
    return reg.predict(self.x_test)
def lvl2_generate_prediction(rawdf, x_test, results_dir, lvl1_results_dir, type_, pp_choice, passthrough=False,
                             final_pp_choice=None):
    """Fit the chosen level-2 stacking model and write its predictions to CSV.

    Level-1 pipelines (preprocess + rf / et / xgb) are configured with the
    best parameters stored in ``<lvl1_results_dir>/results_store.pkl``.
    The output file is
    ``<results_dir>/<type_>_pp<pp_choice>_predictions.csv`` with the columns
    ``Id`` and ``SalePrice``.

    :param rawdf: DataFrame whose last column is the target
    :param x_test: DataFrame to predict; must contain an ``Id`` column
    :param results_dir: directory with the level-2 search results; also
        receives the prediction CSV
    :param lvl1_results_dir: directory with the level-1 search results
    :param type_: ``'lvl2_ridgecv'`` or ``'lvl2_xgb'``
    :param pp_choice: argument for ``pp_selector`` (level-1 preprocessing)
    :param passthrough: forwarded to ``StackingRegressor`` for ``'lvl2_xgb'``
    :param final_pp_choice: argument for ``pp_selector`` in the final
        preprocessing step (only read when ``passthrough`` is truthy)
    :raises ValueError: if ``type_`` is not one of the supported values
        (previously this surfaced as a ``NameError`` further down)
    """
    if type_ not in ('lvl2_ridgecv', 'lvl2_xgb'):
        raise ValueError(f"unknown type_: {type_!r}")

    def load_ranked(path):
        # Load pickled search results and rank each model's candidates,
        # best mean_test_score first.
        # NOTE: unpickling runs arbitrary code; only use trusted files.
        with open(path, 'rb') as f:
            store = pickle.load(f)
        return {k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False)
                for k, v in store.items()}

    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]

    model_names = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }

    model_results = load_ranked(f'{lvl1_results_dir}/results_store.pkl')

    # `.iloc[0]` takes the top-ranked row. `.loc[0]` would take index
    # label 0, i.e. the first candidate tried, regardless of its score.
    lvl1_pipeline = [
        (model_name,
         Pipeline([
             ('preprocess', pp_selector(pp_choice)),
             (model_name, model_object[model_name])
         ]).set_params(**model_results[model_name]['params'].iloc[0]))
        for model_name in model_names
    ]

    if type_ == 'lvl2_ridgecv':
        est = StackingRegressor(estimators=lvl1_pipeline, final_estimator=RidgeCV(), passthrough=False)
    else:  # 'lvl2_xgb'
        if passthrough:
            final_est = Pipeline([
                ('final_preprocess', final_est_pipeline(feature_names=x_train.columns.tolist(),
                                                        preprocess_pipeline=pp_selector(final_pp_choice),
                                                        no_of_lvl1=len(lvl1_pipeline))),
                ('debugger', DebuggerTransformer(info='final')),
                ('final_est', XGBRegressor())
            ])
        else:
            final_est = XGBRegressor()
        est = StackingRegressor(estimators=lvl1_pipeline, final_estimator=final_est, passthrough=passthrough)

        # Apply the best final-estimator parameters from the level-2 search.
        # NOTE(review): the key 'lvl2ptvs_xgb' is taken verbatim from the
        # original; confirm it matches the key the search step stores.
        model_results = load_ranked(f'{results_dir}/results_store.pkl')
        est.set_params(**model_results['lvl2ptvs_xgb']['params'].iloc[0])

    prediction = est.fit(x_train, y_train).predict(x_test)

    sub = pd.DataFrame()
    sub['Id'] = x_test['Id']
    sub['SalePrice'] = prediction
    sub.to_csv(f'{results_dir}/{type_}_pp{pp_choice}_predictions.csv', index=False)
def reg_ensemble_2(self):
    """
    Regressors Ensemble: lasso, linear regression and random forest stacked
    under a random-forest final estimator with 5-fold cross-validation.

    :return: ensemble prediction for ``self.x_test``
    """
    lr, _ = self.linear_regr()
    rf, _ = self.random_forest_regr()
    lasso, _ = self.lasso_regr()

    estimators = [
        ("lasso", lasso),
        ("lr", lr),
        ("rf", rf)
    ]
    reg = StackingRegressor(estimators=estimators,
                            final_estimator=RandomForestRegressor(),
                            cv=5,
                            n_jobs=-1)
    reg.fit(self.x_train, self.y_train)
    return reg.predict(self.x_test)
def test_stacking_regression(self):
    """Smoke test: a RayXGBRegressor can be stacked, fitted and scored."""
    self._init_ray()

    from sklearn.model_selection import train_test_split
    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import RidgeCV
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.ensemble import StackingRegressor

    X, y = load_diabetes(return_X_y=True)

    base_estimators = [
        ("gbm", RayXGBRegressor(objective="reg:squarederror")),
        ("lr", RidgeCV()),
    ]
    meta_model = RandomForestRegressor(n_estimators=10, random_state=42)
    reg = StackingRegressor(estimators=base_estimators, final_estimator=meta_model)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # No assertion on the value: the test only checks the calls succeed.
    fitted = reg.fit(X_train, y_train)
    fitted.score(X_test, y_test)
# Step 1: load the data
X, y = load_boston(return_X_y=True)

# Step 2: hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=40
)

# Step 3: stack a distance-weighted KNN and gradient boosting under a Ridge
# final estimator, then fit on the training part
regression = StackingRegressor(
    estimators=[
        ('knn', KNeighborsRegressor(n_neighbors=4,
                                    weights='distance',
                                    leaf_size=1,
                                    metric='manhattan')),
        ('dt', GradientBoostingRegressor(max_depth=3, n_estimators=220)),
    ],
    final_estimator=Ridge(random_state=40),
    cv=5,
    n_jobs=-1,
)
regression.fit(X_train, y_train)

# Score (via the estimator's own .score) and RMSE on both parts
score_train = regression.score(X_train, y_train)
score_test = regression.score(X_test, y_test)

pred_train = regression.predict(X_train)
pred_test = regression.predict(X_test)

rmse_train = np.sqrt(metrics.mean_squared_error(pred_train, y_train))
rmse_test = np.sqrt(metrics.mean_squared_error(pred_test, y_test))

# Both lines report train/test in that order
print('RMSE:{:.2f}/{:.2f}'.format(rmse_train, rmse_test))
print('R2Score:{:.2f}/{:.2f}'.format(score_train, score_test))
# Age flags derived from the construction year.
df['IsNew'] = df['YearBuilt'].map(lambda year: 1 if year > 2000 else 0)
df['IsOld'] = df['YearBuilt'].map(lambda year: 1 if year < 1946 else 0)
df.drop(columns='MiscFeature', inplace=True)

# ------------------------------- #
# Aggregate features: building age at sale, bathroom and room counts.
df['Age'] = df['YrSold'] - df['YearBuilt']
df['BsmtTotalBathRooms'] = df['BsmtFullBath'] + df['BsmtHalfBath']
df['AbvGradeTotalBathRooms'] = df['FullBath'] + df['HalfBath']
df['Total Rooms'] = (
    df['BedroomAbvGr']
    + df['BsmtFullBath']
    + df['BsmtHalfBath']
    + df['FullBath']
    + df['HalfBath']
    + df['TotRmsAbvGrd']
    + df['KitchenAbvGr']
)

# Fit on the training data, then predict on the best_col most important
# features of df.
stack.fit(X, y)
test = scale.fit_transform(df[Importances.nlargest(int(best_col)).index])
# NOTE(review): the features are passed through scale.fit_transform a second
# time here, re-fitting the scaler on already transformed data — confirm
# this is intended.
pred = stack.predict(scale.fit_transform(test))

sub['SalePrice'] = pred
sub.to_csv('submission_2.csv', index=False)
# the score is around RMSE(0.3400) on Kaggle

# ------- Plot best cols ------- #
plt.figure(figsize=(20, 15))
Importances.nlargest(int(best_col)).plot(kind='barh')
plt.show()
# Hyper-parameters for the XGBoost regressor.
params_XGB = dict(
    reg_alpha=0.001,
    eta=0.03,
    reg_lambda=0.001,
    max_depth=4,
    n_estimators=1000,
    colsample_bytree=0.6,
    subsample=0.6,
)
XGB_reg.set_params(**params_XGB)

# Linear base models.
lr_lasso = Lasso(max_iter=10000, alpha=0.0002)
lr_ridge = Ridge(max_iter=10000, alpha=1.298710621242485)

# Stacked model: lasso + XGBoost + ridge with the default final estimator.
estimators = [
    ('lasso', lr_lasso),
    ('xgb', XGB_reg),
    ('ridge', lr_ridge),
]
reg = StackingRegressor(estimators=estimators)
reg.fit(X_train, y_train)

# Submission for the stacked model.
submission_creator(reg, '_RidgeXGBLassoStack')

# Averaged model over the same three estimators.
vot = VotingRegressor(estimators=estimators)
vot.fit(X_train, y_train)

# Submission for the averaged model.
submission_creator(vot, '_RidgeXGBLassoAverage')
alpha=0.0001, verbose=False, max_iter=400) rf = RandomForestRegressor(n_jobs=-1, max_depth=25, n_estimators=900, random_state=0) # adaknn = AdaBoostRegressor(base_estimator=knn, random_state=0, n_estimators=9) bagdt = BaggingRegressor(base_estimator=dt, n_estimators=300, random_state=0) # rf.fit(X_train,y_train) # pred=rf.predict(X_test) # -------------------- Stacking voting ----------------------------- stacking = StackingRegressor(estimators=[('bagdt', bagdt), ("mlp", mlp), ("randomForest", rf)], n_jobs=-1) stacking.fit(X, y) y_pred_stacking = stacking.predict(df_test) print(y_pred_stacking) # ------------------ Predict the registered ones ------------------------- # knn = KNeighborsRegressor(n_jobs=-1, n_neighbors=2, weights='distance', p=1) dt = DecisionTreeRegressor(random_state=0) mlp = MLPRegressor(hidden_layer_sizes=(100, 60, 40, 20), activation='relu', solver='lbfgs', alpha=0.0001, verbose=False, max_iter=400) rf = RandomForestRegressor(n_jobs=-1, max_depth=25, n_estimators=900,
train_prct = 0.8 n_train = int(round(X.shape[0] * train_prct)) ## Models knn = KNeighborsRegressor(n_neighbors=5) svm = SVR() rf = RandomForestRegressor(n_estimators=100, criterion='mse', random_state=0) decision_tree = DecisionTreeRegressor(max_depth=3, max_features=2) bayesian_ridge = BayesianRidge() base_models = [("KNN", knn), ("SVM", svm), ("DecisionTree", decision_tree), ("RandomForest", rf)] ## Fit stacked_learner = StackingRegressor(base_models, cv=N_FOLDS) stacked_learner = stacked_learner.fit(X[:n_train], Y[:n_train]) y_pred_test = stacked_learner.predict(X[n_train:]) residuals_stacked = Y[n_train:] - y_pred_test residuals_stacked_train = Y[:n_train] - stacked_learner.predict(X[:n_train]) adaboost = AdaBoostRegressor(n_estimators=100, loss="square", random_state=0) adaboost = adaboost.fit(X[:n_train], Y[:n_train]) y_pred_test = adaboost.predict(X[n_train:]) residuals_adaboost = Y[n_train:] - y_pred_test residuals_adaboost_train = Y[:n_train] - adaboost.predict(X[:n_train]) ## Predict on entire dataset y_pred = stacked_learner.predict(X) df = pd.DataFrame.from_dict({ "state": data.state, "population": data.population,
min_samples_leaf=2, max_features='sqrt', max_depth=5, oob_score=True)), ] stack = StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor( n_estimators=1400, min_samples_split=2, min_samples_leaf=2, max_features='sqrt', max_depth=5, oob_score=True)) stack.fit(Xtrainv, ytrainv) stack_train_pred = stack.predict(Xtrainv) stack_val_pred = stack.predict(Xtestv) stack_test_pred = stack.predict(Xtest) stack_train_mse = mean_squared_error(ytrainv, stack_train_pred) stack_val_mse = mean_squared_error(ytestv, stack_val_pred) stack_test_mse = mean_squared_error(ytest, stack_test_pred) print("RMSE using StackRegressor:\t{}\t{}\t{}\n".format( np.sqrt(stack_train_mse), np.sqrt(stack_val_mse), np.sqrt(stack_test_mse))) df_rf = pd.DataFrame({'Actual': ytest, 'Predicted': stack_test_pred}) fig1 = pp.figure(figsize=(8, 6)) df_rf.head(n=300).plot() pp.legend()
# 개별 모델이 예측한 데이터를 기반으로 final_estimator 종합하여 예측을 수행한다. 성능을 극으로 끌어올릴 때 활용한다. 과대적합을 유발할 수 있다.(특히 데이터셋이 적은 경우) 시간이 많이 소요된다. """ from sklearn.ensemble import StackingRegressor stack_models = [ ('elasticnet', poly_pipeline), ('randomforest', rfr), ('gbr', gbr), ('lgbm', lgbm), ] stack_reg = StackingRegressor(stack_models, final_estimator=xgb, n_jobs=-1) stack_reg.fit(x_train, y_train) stack_pred = stack_reg.predict(x_test) mse_eval('Stacking Ensemble', stack_pred, y_test) ## Weighted Blending """ 각 모델의 예측값에 대하여 weight(가중치)를 곱하여 최종 output 계산 모델에 대한 가중치를 조절하여, 최종 output을 산출한다. 가중치의 합은 1.0이 되도록 한다. """ final_outputs = { 'elasticnet': poly_pred, 'randomforest': rfr_pred, 'gbr': gbr_pred, 'xgb': xgb_pred,
class AnalyticalModel:
    """Fit one regression model on a DataFrame and derive goodness-of-fit statistics.

    The constructor splits ``data`` into train/test parts, builds the model
    selected by ``model_config`` (multiple linear regression when nothing is
    selected) and stores predictions, residuals and summary statistics as
    instance attributes.
    """

    scorer_list = ['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2']

    def __init__(self, data: pd.DataFrame, target: str, training_split=0.8, one_out=False,
                 model_config=None, random_seed=42, cv_folds=10, cv_reps=20):
        """
        Store the configuration, split the data and immediately build the model.

        :param data: frame holding the attribute columns and the target column
        :param target: name of the target column in ``data``
        :param training_split: fraction of rows used for training (ignored when ``one_out``)
        :param one_out: when True the full data set is used both for training and testing
        :param model_config: optional dict with a ``"type"`` key and an optional ``"params"`` dict
        :param random_seed: seed passed to the split and to the seeded estimators
        :param cv_folds: stored on the instance; not used inside this class
        :param cv_reps: stored on the instance; not used inside this class
        """
        self.target = target
        self.data = data
        self.attribute_data = self.data.drop(target, axis=1)
        self.target_data = self.data[[target]]
        self.attributes = self.attribute_data.columns
        self.one_out = one_out
        self.training_split = training_split
        self.random_seed = random_seed
        self.cv_folds = cv_folds
        self.cv_reps = cv_reps

        if one_out:
            self.x_train, self.x_test, self.y_train, self.y_test = (
                self.attribute_data, self.attribute_data, self.target_data, self.target_data
            )
        else:
            self.x_train, self.x_test, self.y_train, self.y_test = sk.model_selection.train_test_split(
                self.attribute_data, self.target_data,
                test_size=(1. - self.training_split), random_state=self.random_seed
            )
        self.train_n = int(self.x_train.shape[0])

        self.model = None
        self.model_configs = model_config
        self.results = None
        self.score = None
        self.confusion = None
        self.coef = None
        self.r2 = None
        self.r2_adjusted = None
        self.mse = None
        self.rmse = None
        self.anderson = None
        self.anderson_p = None
        self.residuals = None
        self.predictions = None
        self.aic = None
        self.aaic = None
        self.bic = None
        self.eval = None

        self.build_model()

    def _fit_pipeline(self, estimator):
        """Wrap ``estimator`` behind a StandardScaler, fit on the training split,
        predict the test split and store predictions and residuals."""
        self.model = make_pipeline(sk.preprocessing.StandardScaler(), estimator)
        y = self.y_train.to_numpy().flatten()
        self.results = self.model.fit(self.x_train, y)
        self.predictions = self.results.predict(self.x_test)
        self.coef = None
        # Residuals are observed minus predicted on the test split.
        self.residuals = self.y_test.to_numpy().flatten() - self.predictions

    def build_mlr(self, params):
        """
        Build, fit and predict with a multiple linear regression model.

        :param params: keyword arguments forwarded to ``LinearRegression``
        :return: None; results are stored on the instance (including ``self.score``)
        """
        self._fit_pipeline(sk.linear_model.LinearRegression(**params))
        self.score = self.model.score(self.x_test, self.y_test)

    def build_linear_svr(self, params):
        """
        Build, fit and predict with a Linear Support Vector Regressor.

        :param params: keyword arguments for ``LinearSVR``; they override the presets
        :return: None; results are stored on the instance
        """
        # Merging (instead of passing presets and **params side by side) lets
        # callers override a preset without a duplicate-keyword TypeError.
        options = {"random_state": self.random_seed, "tol": 1e-4, "max_iter": 5000, "C": 1, **params}
        self._fit_pipeline(sk.svm.LinearSVR(**options))

    def build_gbr(self, params):
        """
        Build, fit and predict with a Gradient Boost Regressor.

        :param params: keyword arguments for ``GradientBoostingRegressor``; they override the presets
        :return: None; results are stored on the instance
        """
        options = {"random_state": self.random_seed, **params}
        self._fit_pipeline(GradientBoostingRegressor(**options))

    def build_elastic_net(self, params):
        """
        Build, fit and predict with an Elastic Net CV.

        :param params: keyword arguments forwarded to ``ElasticNetCV``
        :return: None; results are stored on the instance
        """
        self._fit_pipeline(sk.linear_model.ElasticNetCV(**params))

    def build_rfr(self, params):
        """
        Build, fit and predict with a Random Forest Regressor.

        :param params: keyword arguments for ``RandomForestRegressor``; they override the presets
        :return: None; results are stored on the instance
        """
        options = {"random_state": self.random_seed, **params}
        self._fit_pipeline(RandomForestRegressor(**options))

    def build_svr(self, params):
        """
        Build, fit and predict with a Support Vector Regressor.

        :param params: keyword arguments for ``SVR``; they override the presets
        :return: None; results are stored on the instance
        """
        options = {"kernel": 'rbf', "tol": 1e-4, "max_iter": 5000, "C": 1, **params}
        self._fit_pipeline(sk.svm.SVR(**options))

    def build_stacker(self, train_x, train_y, test_x, test_y, params):
        """
        Build, fit and predict with a stacking regressor ensemble.

        :param train_x: training attributes
        :param train_y: training targets
        :param test_x: test attributes
        :param test_y: test targets
        :param params: dict; an optional ``"estimators"`` list of ``"<module>.<Class>"``
            strings resolved against the ``sk`` namespace (e.g. ``'linear_model.RidgeCV'``)
        :return: None; results are stored on the instance
        """
        if "estimators" in params:
            estimators = []
            for spec in params["estimators"]:
                # The group and type must match the scikit-learn module and class names.
                parts = spec.split(".")
                estimator_cls = getattr(getattr(sk, parts[0]), parts[1])
                # StackingRegressor expects estimator instances, so instantiate the class.
                estimators.append((parts[1], estimator_cls()))
        else:
            estimators = [
                ('lr', sk.linear_model.LinearRegression()),
                ('enet', sk.linear_model.ElasticNetCV()),
                ('ridge', sk.linear_model.RidgeCV())
            ]

        self.model = StackingRegressor(
            estimators=estimators,
            final_estimator=RandomForestRegressor(random_state=self.random_seed),
            passthrough=False,
            n_jobs=-1
        )
        self.results = self.model.fit(train_x, np.asarray(train_y).ravel())
        self.predictions = self.results.predict(test_x)
        self.coef = None
        self.residuals = np.asarray(test_y).ravel() - self.predictions

    def build_model(self, weights=None):
        """
        Build the configured model and compute its summary statistics.

        Unknown or missing model types fall back to multiple linear regression.

        :param weights: accepted for interface compatibility; currently unused
        :return: None; results are stored on the instance
        """
        model_configs = {} if self.model_configs is None else self.model_configs
        builders = {
            "MLR": self.build_mlr,
            "LinearSVR": self.build_linear_svr,
            "GBR": self.build_gbr,
            "RFR": self.build_rfr,
            "SVR": self.build_svr,
            "ElasticNetCV": self.build_elastic_net,
        }

        if "type" in model_configs:
            params = model_configs.get("params", {})
            builders.get(model_configs["type"], self.build_mlr)(params)
        else:
            model_configs["type"] = "MLR"
            self.model_configs = model_configs
            self.build_mlr({})

        self._update_fit_statistics()

    def _update_fit_statistics(self):
        """Derive R2, adjusted R2, RMSE/MSE, AIC/AICc/BIC and the Anderson-Darling
        result from the stored test-split residuals."""
        residuals = np.asarray(self.residuals, dtype=float)
        # Use the same rows the residuals were computed on; the split is shuffled,
        # so a positional slice of self.data would refer to different rows.
        observed = self.y_test.to_numpy().flatten()

        # numpy floats keep a zero denominator from raising ZeroDivisionError.
        n = np.float64(residuals.shape[0])
        p = np.float64(self.data.shape[1] - 1)

        sse = np.sum(np.power(residuals, 2))
        sst = np.sum(np.power(observed - np.mean(observed), 2))

        self.r2 = np.round((sst - sse) / sst, 4)
        self.r2_adjusted = np.round(self.r2 - (1. - self.r2) * p / (n - p - 1.), 4)
        self.rmse = np.round(np.sqrt(sse / (n - p - 1.)), 4)
        self.mse = np.round(np.power(self.rmse, 2), 4)
        self.aic = np.round(n * np.log(sse / n) + (2. * p) + n + 2., 4)
        self.aaic = np.round(self.aic + (2. * (p + 1.) * (p + 2.)) / (n - p - 2.), 4)
        self.bic = np.round((n * np.log(sse / n)) + (p * np.log(n)), 4)
        self.results.aic = self.aic
        self.results.bic = self.bic

        self.anderson = scipy.stats.anderson(self.residuals)
        self.anderson_pvalue(replicate=True)

    def anderson_pvalue(self, replicate=True):
        """
        Convert the stored Anderson-Darling statistic into a p-value.

        :param replicate: True uses the two-piece polynomial approximation; False
            uses the small-sample adjusted statistic with the four-range formula from
            https://www.spcforexcel.com/knowledge/basic-statistics/anderson-darling-test-for-normality
        :return: None; the p-value is stored in ``self.anderson_p``
        """
        ad = self.anderson.statistic
        if replicate:
            if ad < 2:
                p = 1. - np.exp(-1.2337141/ad) / np.sqrt(ad) * (2.00012+(.247105-(.0649821-(.0347962-(.011672-.00168691*ad)*ad)*ad)*ad)*ad)
            else:
                p = 1. - np.exp(-1.*np.exp(1.0776-(2.30695-(.43424-(.082433-(.008056 -.0003146*ad)*ad)*ad)*ad)*ad))
        else:
            # Small-sample adjustment uses the actual number of residuals
            # (50 is kept only as the fallback when no residuals are stored).
            n_obs = float(len(self.residuals)) if self.residuals is not None else 50.
            ad = ad * (1. + (.75/n_obs) + 2.25/(n_obs**2))
            if ad >= 0.6:
                p = np.exp(1.2937 - 5.709*ad + 0.0186*(ad**2))
            elif ad >= 0.34:
                p = np.exp(0.9177 - 4.279*ad - 1.38*(ad**2))
            elif ad > 0.2:
                p = 1.0 - np.exp(-8.318 + 42.796*ad - 59.938*(ad**2))
            else:
                p = 1.0 - np.exp(-13.436 + 101.14*ad - 223.73*(ad**2))
        self.anderson_p = p

    def evaluate_VIF(self, threshold=5.0):
        """
        Check every attribute's variance inflation factor against ``threshold``.

        :param threshold: largest acceptable VIF
        :return: False as soon as one attribute exceeds the threshold, otherwise True
        """
        if len(self.attributes) <= 1:
            return True
        subset = self.data[list(self.attributes)]
        for attribute in self.attributes:
            others = subset.drop(attribute, axis=1)
            fit = sm.OLS(subset[attribute], sm.add_constant(others)).fit()
            if 1. / (1. - fit.rsquared) > threshold:
                return False
        return True

    def evaluate(self, use="rmse", ad=True, check_VIF=False, exclude=True):
        """
        Return the model's score under the chosen criterion (lower is better).

        :param use: one of "r2", "r2a", "rmse", "press", "aic", "caic", "bic";
            anything else selects the MSE
        :param ad: penalise the model when the Anderson-Darling p-value is below 0.05
        :param check_VIF: penalise the model when ``evaluate_VIF`` fails
        :param exclude: penalty is ``inf`` when True, 10000 when False
        :return: the metric value, or the penalty value
        """
        use = use.lower()
        self.eval = use
        if use == "r2":
            metric = abs(self.r2) - 1.0
        elif use == "r2a":
            metric = abs(self.r2_adjusted) - 1.0
        elif use == "rmse":
            metric = self.rmse
        elif use == "press":
            # NOTE(review): OLSInfluence presumably needs a statsmodels results
            # object; the builders in this class store scikit-learn estimators.
            r = smo.OLSInfluence(self.results)
            metric = r.ess_press
        elif use == "aic":
            metric = self.aic
        elif use == "caic":
            k = self.data.shape[1] - 1
            n = len(self.residuals)
            metric = self.aic + ((2*(k*k) + 2*k)/(n - k - 1))
        elif use == "bic":
            metric = self.bic
        else:
            metric = self.mse

        # A finite penalty keeps the model on the list but lets better models replace it.
        penalty = float("inf") if exclude else 10000
        if ad and self.anderson_p < 0.05:
            metric = penalty
        if check_VIF and not self.evaluate_VIF():
            metric = penalty
        return metric

    def plot_results(self):
        """Plot actual vs. predicted test targets and predictions vs. residuals."""
        test_y = self.y_test.to_numpy().flatten()
        pred_y = self.predictions
        # evaluate() may not have run yet, or the criterion may have no same-named attribute.
        metric_value = getattr(self, self.eval, None) if isinstance(self.eval, str) else None

        plt.subplot(2, 1, 1)
        plt.title("{} Model Results ({}: {}) \n Attributes: {}".format(
            self.model_configs["type"], self.eval, metric_value, ", ".join(list(self.attributes)))
        )
        plot_x = np.arange(0, self.residuals.shape[0])
        plt.scatter(plot_x, test_y, color='gray', linewidth=1)
        plt.scatter(plot_x, pred_y, color='red', linewidth=1)
        plt.ylabel("Prediction/Actual")
        plt.axhline(y=np.mean(pred_y), linewidth=0.5, color='black')
        red_patch = mpatches.Patch(color='red', label='Prediction')
        gray_patch = mpatches.Patch(color='gray', label='Actual')
        plt.legend(handles=[gray_patch, red_patch])

        plt.subplot(2, 1, 2)
        plt.scatter(pred_y, self.residuals, facecolors='none', edgecolors='blue')
        plt.axhline(linewidth=0.5, color='black')
        plt.ylabel("Fitted vs Residuals")
        plt.show()

    def print_summary(self):
        """Print the model type, data sizes and error statistics for the test split."""
        test_y = self.y_test.to_numpy().flatten()
        pred_y = self.predictions

        max_error = skm.max_error(test_y, pred_y)
        mean_absolute_error = skm.mean_absolute_error(test_y, pred_y)
        median_absolute_error = skm.median_absolute_error(test_y, pred_y)

        print("\n----------------- Model Summary ----------------")
        print("Type: {}\t\tEvaluation Criteria: {}".format(self.model_configs["type"], self.eval).expandtabs(15))
        print("Response: {}\t\tAttributes: {}".format(self.target, ", ".join(list(self.attributes))).expandtabs(15))
        print("Total Data Records: {}\t\tTraining Data Split: {}".format(self.data.shape[0], self.training_split).expandtabs(15))
        print("Total Training Records: {}\t\tTotal Testing Records: {}".format(self.train_n, self.y_test.shape[0]).expandtabs(15))
        print("R Squared: {}\t\tMean Squared Error: {}\t\tRoot Mean Squared Error: {}".format(round(self.r2,4), round(self.mse,4), round(self.rmse,4)).expandtabs(15))
        print("Max Error: {}\t\tMean Absolute Error: {}\t\tMedian Absolute Error: {}".format(
            round(max_error, 4), round(mean_absolute_error, 4), round(median_absolute_error,4)).expandtabs(15))

    def print_summary2(self):
        """Print ``self.results.summary()``."""
        # NOTE(review): the builders in this class store scikit-learn estimators,
        # which presumably do not offer summary(); confirm before relying on this.
        print(self.results.summary())
def test_stacking_regressor_error(y, params, type_err, msg_err):
    """Building and fitting a StackingRegressor from ``params`` must raise
    ``type_err`` with a message matching ``msg_err``."""
    # Everything stays inside the context manager so that an error raised at
    # construction time or at fit time is checked the same way.
    with pytest.raises(type_err, match=msg_err):
        regressor = StackingRegressor(cv=3, **params)
        unit_weights = np.ones(X_diabetes.shape[0])
        regressor.fit(scale(X_diabetes), y, sample_weight=unit_weights)
X = vectorizedData[:, :-1] Y = vectorizedData[:, -1] X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0) baseModels = [('ridgeRegressor', linear_model.Ridge(alpha=0.01)), ('randomForestRegressor', RandomForestRegressor(max_depth=10, random_state=0, n_estimators=15, max_features=0.5)), ('supportVectorRegressor', svm.SVR(C=10, epsilon=0.5))] stackedRegressor = StackingRegressor(estimators=baseModels) stackedRegressor.fit(X_train, Y_train) trainingError = np.mean((stackedRegressor.predict(X_train) - Y_train)**2) print("Training Error: %.6f" % trainingError) Y_predict_unscaled = stackedRegressor.predict(X_test) testingError = np.mean((Y_predict_unscaled - Y_test)**2) print("Testing Error: %.6f" % testingError) meanScore = np.mean(imdbScores) standDeviation = np.std(imdbScores) Y_predict = Y_predict_unscaled * standDeviation + meanScore errorsAllowed = [ 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0, 1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4, 1.45, 1.5 ] predictionAccuracyList = [] for errorAllowed in errorsAllowed: