def test_stacking_regressor_diabetes(cv, final_estimator, predict_params, passthrough):
    """Check predict/transform output shapes of a StackingRegressor.

    The regressor is fitted twice on the scaled diabetes data: first with
    both base estimators ('lr' and 'svr'), then with 'lr' dropped. After each
    fit the transformed test set must have one column per remaining base
    estimator, plus the 10 original feature columns when ``passthrough`` is
    enabled.
    """
    # Scale the data up front instead of inside a pipeline: this avoids
    # convergence warnings while keeping X_test directly comparable with the
    # passthrough columns asserted on below.
    scaled_features = scale(X_diabetes)
    X_train, X_test, y_train, _ = train_test_split(
        scaled_features, y_diabetes, random_state=42
    )

    def check_transform(regressor, n_base_columns):
        # One column per active base estimator; passthrough appends the
        # 10 input features at the end of the transformed matrix.
        transformed = regressor.transform(X_test)
        n_expected = n_base_columns + 10 if passthrough else n_base_columns
        assert transformed.shape[1] == n_expected
        if passthrough:
            assert_allclose(X_test, transformed[:, -10:])

    reg = StackingRegressor(
        estimators=[('lr', LinearRegression()), ('svr', LinearSVR())],
        final_estimator=final_estimator,
        cv=cv,
        passthrough=passthrough,
    )
    reg.fit(X_train, y_train)
    result = reg.predict(X_test, **predict_params)
    if predict_params:
        # With extra predict parameters the result is expected to have
        # two components.
        assert len(result) == 2
    check_transform(reg, 2)

    # Drop the linear regression and make sure everything still works with a
    # single remaining base estimator.
    reg.set_params(lr='drop')
    reg.fit(X_train, y_train)
    reg.predict(X_test)
    check_transform(reg, 1)
def _load_ranked_results(results_path):
    """Load a pickled results store and rank each model's rows best-first.

    Returns a dict mapping each key of the store to a DataFrame sorted by
    'mean_test_score' in descending order.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # point this at result stores produced by a trusted run.
    with open(results_path, 'rb') as f:
        raw_results = pickle.load(f)
    return {
        name: pd.DataFrame(records).sort_values('mean_test_score', ascending=False)
        for name, records in raw_results.items()
    }


def _best_params(ranked_results):
    """Return the 'params' entry of the top-ranked row of a sorted results frame."""
    # Positional access on purpose: sort_values keeps the original index
    # labels, so .loc[0] would return the row labelled 0 rather than the
    # best-scoring one.
    return ranked_results['params'].iloc[0]


def lvl2_generate_prediction(rawdf, x_test, results_dir, lvl1_results_dir, type_, pp_choice, passthrough=False,
                             final_pp_choice=None):
    """Fit a stacked (level-2) regressor and write its test predictions to CSV.

    Parameters
    ----------
    rawdf : pandas.DataFrame
        Training data; every column but the last is used as features and the
        last column as the target.
    x_test : pandas.DataFrame
        Data to predict on. Must contain an 'Id' column, which is copied into
        the output file.
    results_dir : str
        Directory holding the level-2 ``results_store.pkl``; the predictions
        CSV is written here as well.
    lvl1_results_dir : str
        Directory holding the level-1 ``results_store.pkl`` with results for
        the 'rf', 'et' and 'xgb' models.
    type_ : str
        Either 'lvl2_ridgecv' (RidgeCV final estimator, passthrough always
        off) or 'lvl2_xgb' (XGBRegressor final estimator).
    pp_choice
        Forwarded to ``pp_selector`` to build the level-1 preprocessing step.
    passthrough : bool, default False
        Only honoured for 'lvl2_xgb'. When true, the final estimator is
        wrapped in a pipeline that preprocesses the passed-through features.
    final_pp_choice
        Forwarded to ``pp_selector`` for the final estimator's preprocessing;
        only used when ``type_ == 'lvl2_xgb'`` and ``passthrough`` is true.

    Raises
    ------
    ValueError
        If ``type_`` is not one of the supported values.
    """
    if type_ not in ('lvl2_ridgecv', 'lvl2_xgb'):
        # Fail early with a clear message instead of hitting an unbound
        # `est` after all the loading work has been done.
        raise ValueError(
            f"Unsupported type_ {type_!r}; expected 'lvl2_ridgecv' or 'lvl2_xgb'"
        )

    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]

    model_names = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor(),
    }

    # Level-1 estimators: preprocessing + model, configured with each
    # model's best-scoring parameters.
    lvl1_results = _load_ranked_results(f'{lvl1_results_dir}/results_store.pkl')
    lvl1_pipeline = [
        (
            model_name,
            Pipeline([
                ('preprocess', pp_selector(pp_choice)),
                (model_name, model_object[model_name]),
            ]).set_params(**_best_params(lvl1_results[model_name])),
        )
        for model_name in model_names
    ]

    if type_ == 'lvl2_ridgecv':
        est = StackingRegressor(estimators=lvl1_pipeline, final_estimator=RidgeCV(), passthrough=False)
    else:  # 'lvl2_xgb'
        if passthrough:
            # With passthrough the final estimator also receives the raw
            # features, so they are preprocessed before reaching XGBoost.
            final_est = Pipeline([
                ('final_preprocess', final_est_pipeline(feature_names=x_train.columns.tolist(),
                                                        preprocess_pipeline=pp_selector(final_pp_choice),
                                                        no_of_lvl1=len(lvl1_pipeline))),
                ('debugger', DebuggerTransformer(info='final')),
                ('final_est', XGBRegressor()),
            ])
        else:
            final_est = XGBRegressor()
        est = StackingRegressor(estimators=lvl1_pipeline, final_estimator=final_est, passthrough=passthrough)

    lvl2_results = _load_ranked_results(f'{results_dir}/results_store.pkl')
    # NOTE(review): the 'lvl2ptvs_xgb' key is hard-coded regardless of `type_`,
    # and its params are applied to the StackingRegressor as-is (presumably
    # they are already prefixed, e.g. 'final_estimator__...') -- confirm this
    # is intended for every supported `type_`.
    est.set_params(**_best_params(lvl2_results['lvl2ptvs_xgb']))

    prediction = est.fit(x_train, y_train).predict(x_test)

    sub = pd.DataFrame()
    sub['Id'] = x_test['Id']
    sub['SalePrice'] = prediction
    sub.to_csv(f'{results_dir}/{type_}_pp{pp_choice}_predictions.csv', index=False)