Example #1
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params,
                                     passthrough):
    # prescale the data to avoid convergence warnings; scaling is done outside a
    # pipeline so the passthrough asserts below can compare X_trans against X_test
    X_train, X_test, y_train, _ = train_test_split(scale(X_diabetes),
                                                   y_diabetes,
                                                   random_state=42)
    estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
    reg = StackingRegressor(estimators=estimators,
                            final_estimator=final_estimator,
                            cv=cv,
                            passthrough=passthrough)
    reg.fit(X_train, y_train)
    result = reg.predict(X_test, **predict_params)
    # predict_params such as return_std make the underlying final estimator return
    # an extra output (e.g. a (prediction, std) pair), hence the length check
    expected_result_length = 2 if predict_params else 1
    if predict_params:
        assert len(result) == expected_result_length

    X_trans = reg.transform(X_test)
    # one column per estimator, plus the 10 original diabetes features with passthrough
    expected_column_count = 12 if passthrough else 2
    assert X_trans.shape[1] == expected_column_count
    if passthrough:
        assert_allclose(X_test, X_trans[:, -10:])

    # dropping an estimator removes its prediction column from the stacked output
    reg.set_params(lr='drop')
    reg.fit(X_train, y_train)
    reg.predict(X_test)

    X_trans = reg.transform(X_test)
    expected_column_count_drop = 11 if passthrough else 1
    assert X_trans.shape[1] == expected_column_count_drop
    if passthrough:
        assert_allclose(X_test, X_trans[:, -10:])
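
This snippet is extracted from a pytest test and relies on module-level imports and data that are not shown. The following is a minimal sketch of that assumed context with one concrete invocation; the chosen cv, final estimator, and passthrough values here are illustrative assumptions, not the original suite's parametrization.

from numpy.testing import assert_allclose
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.svm import LinearSVR

# Module-level data referenced by the test (assumed setup).
X_diabetes, y_diabetes = load_diabetes(return_X_y=True)

# One concrete invocation; in the real suite cv, final_estimator, predict_params
# and passthrough would normally arrive via @pytest.mark.parametrize.
test_stacking_regressor_diabetes(
    cv=3,
    final_estimator=RandomForestRegressor(random_state=42),
    predict_params={},
    passthrough=True,
)
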
Example #2
def lvl2_generate_prediction(rawdf, x_test, results_dir, lvl1_results_dir, type_, pp_choice,
                             passthrough=False, final_pp_choice=None):
    # rawdf holds the training features with the target in the last column
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]
    model_names = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }

    # load the level-1 CV results and sort each model's runs by mean_test_score;
    # the index is reset so that .loc[0, 'params'] below picks the best-scoring run
    with open(f'{lvl1_results_dir}/results_store.pkl', 'rb') as f:
        model_results = pickle.load(f)
    model_results = {k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False).reset_index(drop=True)
                     for k, v in model_results.items()}

    # level-1 estimators: one (preprocess, model) pipeline per model, configured
    # with the best hyper-parameters found during the level-1 search
    lvl1_pipeline = [
        (model_name, Pipeline([
            ('preprocess', pp_selector(pp_choice)),
            (model_name, model_object[model_name])
        ]).set_params(**model_results[model_name].loc[0, 'params']))
        for model_name in model_names]

    if type_ == 'lvl2_ridgecv':
        est = StackingRegressor(estimators=lvl1_pipeline, final_estimator=RidgeCV(), passthrough=False)
    elif type_ == 'lvl2_xgb':
        if passthrough:
            # with passthrough the final estimator also receives the raw features,
            # so it gets its own preprocessing pipeline before XGBRegressor
            final_est = Pipeline([
                ('final_preprocess', final_est_pipeline(feature_names=x_train.columns.tolist(),
                                                        preprocess_pipeline=pp_selector(final_pp_choice),
                                                        no_of_lvl1=len(lvl1_pipeline))),
                ('debugger', DebuggerTransformer(info='final')),
                ('final_est', XGBRegressor())
            ])
        else:
            final_est = XGBRegressor()

        est = StackingRegressor(estimators=lvl1_pipeline, final_estimator=final_est, passthrough=passthrough)

        with open(f'{results_dir}/results_store.pkl', 'rb') as f:
            model_results = pickle.load(f)
        # as above, reset the index so .loc[0, 'params'] selects the best-scoring level-2 run
        model_results = {k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False).reset_index(drop=True)
                         for k, v in model_results.items()}
        #est.set_params(
        #    **{f'final_estimator__{k}': v for k, v in model_results['lvl2ptvs_xgb'].loc[0, 'params'].items()})
        est.set_params(**model_results['lvl2ptvs_xgb'].loc[0, 'params'])
    else:
        raise ValueError(f'Unknown type_: {type_}')

    prediction = est.fit(x_train, y_train).predict(x_test)
    # assemble the submission dataframe (Id, SalePrice) and write it to disk
    sub = pd.DataFrame()
    sub['Id'] = x_test['Id']
    sub['SalePrice'] = prediction
    sub.to_csv(f'{results_dir}/{type_}_pp{pp_choice}_predictions.csv', index=False)
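
The helpers used above (pp_selector, final_est_pipeline, DebuggerTransformer) and the results pickles are project-specific. Below is a rough, self-contained sketch of the same stacking structure built from stock scikit-learn pieces only; the data and estimator settings are made up for illustration and are not the author's configuration.

from sklearn.datasets import make_regression
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_regression(n_samples=200, n_features=8, random_state=0)

# Level-1 estimators: a (preprocess, model) pipeline per model, mirroring lvl1_pipeline above.
lvl1 = [
    (name, Pipeline([('preprocess', StandardScaler()), (name, model)]))
    for name, model in [('rf', RandomForestRegressor(random_state=0)),
                        ('et', ExtraTreesRegressor(random_state=0))]
]

# passthrough=True forwards the raw features to the final estimator alongside the
# level-1 predictions, which is why the code above gives the final estimator its
# own preprocessing pipeline when passthrough is enabled.
est = StackingRegressor(estimators=lvl1, final_estimator=RidgeCV(), passthrough=True)
est.fit(X, y)
print(est.predict(X[:5]))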