Example #1
def lvl2_generate_prediction(rawdf, x_test, results_dir, lvl1_results_dir, type_, pp_choice,
                             passthrough=False, final_pp_choice=None):
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]
    model_names = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }

    with open(f'{lvl1_results_dir}/results_store.pkl', 'rb') as f:
        model_results = pickle.load(f)
    model_results = {k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False).reset_index(drop=True)
                     for k, v in model_results.items()}  # reset the index so .loc[0] is the top-ranked row

    lvl1_pipeline = [
        (model_name, Pipeline([
            ('preprocess', pp_selector(pp_choice)),
            (model_name, model_object[model_name])
        ]).set_params(**model_results[model_name].loc[0, 'params']))
        for model_name in model_names]

    if type_ == 'lvl2_ridgecv':
        est = StackingRegressor(estimators=lvl1_pipeline, final_estimator=RidgeCV(), passthrough=False)
    elif type_ == 'lvl2_xgb':
        if passthrough:
            final_est = Pipeline([
                ('final_preprocess', final_est_pipeline(feature_names=x_train.columns.tolist(),
                                                        preprocess_pipeline=pp_selector(final_pp_choice),
                                                        no_of_lvl1=len(lvl1_pipeline))),
                ('debugger', DebuggerTransformer(info='final')),
                ('final_est', XGBRegressor())
            ])
        else:
            final_est = XGBRegressor()

        est = StackingRegressor(estimators=lvl1_pipeline, final_estimator=final_est, passthrough=passthrough)

        with open(f'{results_dir}/results_store.pkl', 'rb') as f:
            model_results = pickle.load(f)
        model_results = {k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False).reset_index(drop=True)
                         for k, v in model_results.items()}  # reset the index so .loc[0] is the top-ranked row
        #est.set_params(
        #    **{f'final_estimator__{k}': v for k, v in model_results['lvl2ptvs_xgb'].loc[0, 'params'].items()})
        est.set_params(**model_results['lvl2ptvs_xgb'].loc[0, 'params'])

    prediction = est.fit(x_train, y_train).predict(x_test)
    sub = pd.DataFrame()
    sub['Id'] = x_test['Id']
    sub['SalePrice'] = prediction
    sub.to_csv(f'{results_dir}/{type_}_pp{pp_choice}_predictions.csv', index=False)
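The "best params" lookup used above, as a minimal self-contained sketch; the toy cv_results dict is an assumption standing in for a pickled RandomizedSearchCV results store.

import pandas as pd

cv_results = {'mean_test_score': [-0.13, -0.11, -0.17],
              'params': [{'xgb__n_estimators': 300},
                         {'xgb__n_estimators': 600},
                         {'xgb__n_estimators': 150}]}
df = pd.DataFrame(cv_results).sort_values('mean_test_score', ascending=False).reset_index(drop=True)
print(df.loc[0, 'params'])  # {'xgb__n_estimators': 600}, the best-scoring candidate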
Example #2
    def test_compare_to_StackingRegressor(self, verbose=0, seed=42):
        """
        Determine if Ensemble with dummies correctly selects the real predictors and gives similar
        performance to scikit-learn StackingRegressor trained without dummies.
        """
        X, y = make_regression(n_samples=500, n_features=20, n_informative=10, random_state=seed)

        regressors = [LinearRegression(),
                      KNeighborsRegressor(),
                      RandomForestRegressor(random_state=seed)]
        dummy_regressors = [DummyRegressor(strategy='mean') for repeat in range(100)]
        all_regressors = regressors + dummy_regressors
        random.shuffle(all_regressors)

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(all_regressors, SVR(), internal_cv=5, score_selector=RankScoreSelector(k=3)))
        pc_score_all = np.mean(cross_val_score(mclf, [X], y, cv=5, n_processes=5))

        mclf.fit([X], y)
        selected_regressors = mclf.get_model(1, 0).get_base_models()
        self.assertTrue(len(selected_regressors) == 3,
                        'Ensemble picked {} regressors instead of 3.'.format(len(selected_regressors)))
        self.assertFalse(DummyRegressor in [c.__class__ for c in selected_regressors],
                         'Ensemble chose a dummy regressor over a real one')

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(regressors, SVR(), internal_cv=5, score_selector=RankScoreSelector(k=3)))
        pc_score_informative = np.mean(cross_val_score(mclf, [X], y, cv=5, n_processes=5))

        base_arg = [(str(i), c) for i, c in enumerate(regressors)]
        clf = StackingRegressor(base_arg, SVR(), cv=KFold(n_splits=3))
        sk_score_informative = np.mean(cross_val_score(clf, X, y, cv=5, n_processes=5))

        if verbose > 0:
            base_arg = [(str(i), c) for i, c in enumerate(all_regressors)]
            clf = StackingRegressor(base_arg, SVR(), cv=KFold(n_splits=3))
            sk_score_all = np.mean(cross_val_score(clf, X, y, cv=5, n_processes=5))
            print('\nExplained variance scores')
            print('Ensemble informative predictors: {}'.format(pc_score_informative))
            print('Ensemble all predictors: {}'.format(pc_score_all))
            print('StackingRegressor informative predictors: {}'.format(sk_score_informative))
            print('StackingRegressor all predictors: {}'.format(sk_score_all))

        self.assertTrue(np.round(pc_score_all, 2) == np.round(pc_score_informative, 2),
                        'Ensemble accuracy is not the same for all regressors and informative regressors.')
        tolerance_pct = 5
        self.assertTrue(pc_score_all >= sk_score_informative * (1 - tolerance_pct / 100.0),
                        'Ensemble with dummy regressors did not perform within the accepted tolerance of StackingRegressor with no dummy regressors.')
Example #3
 def reg_ensemble_1(self):
     """
     Regressors Ensemble
     :return: ensemble prediction
     """
     lr, lr_pred = self.linear_regr()
     rf, rf_pred = self.random_forest_regr()
     lasso, lasso_pred = self.lasso_regr()
     # el, el_pred = self.elastic_net_regr()
     # dt, dt_pred = self.decis_tree_regr()
     # knr, knr_pred = self.kneighbors_regr()
     # gbr, gbr_pred = self.gradient_boost_regr()
     estimators = [
         # ("str", dt),
         # ("eln", el),
         ("lasso", lasso),
         # ("knr", knr),
         # ("gbr", gbr),
         ("lr", lr),
         ("rf", rf)
     ]
     reg = StackingRegressor(estimators=estimators,
                             final_estimator=RandomForestRegressor(),
                             n_jobs=-1)
     reg.fit(self.x_train, self.y_train)
     return reg.predict(self.x_test)
Example #4
def lvl2_xgb_randomsearch(rawdf, results_dir, pp_choice, param_dir, passthrough, final_pp_choice=None):
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]
    model_store = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }

    with open(param_dir, 'rb') as f:
        model_results = pickle.load(f)
    model_results = {k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False).reset_index(drop=True)
                     for k, v in model_results.items()}  # reset the index so .loc[0] is the top-ranked row
    model_object = {k: model_object[k].set_params(**{kk.split('__')[1]: vv for kk, vv in v.loc[0, 'params'].items()})
                    for k, v in model_results.items()}

    preprocess_pipeline = pp_selector(pp_choice)

    lvl1_pipeline = [
        (model_name,
         Pipeline([
             ('preprocess', preprocess_pipeline),
             (model_name, model_object[model_name])
         ])
         )
        for model_name in model_store]
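    # NOTE: these parameter paths assume passthrough=True below, where the final
    # estimator is a Pipeline with a 'final_est' step; with the bare XGBRegressor
    # the prefix would be just 'final_estimator__'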
    final_estimator_params = {'final_estimator__final_est__n_estimators': scipy.stats.randint(150, 1000),
                              'final_estimator__final_est__learning_rate': scipy.stats.uniform(0.01, 0.59),
                              'final_estimator__final_est__subsample': scipy.stats.uniform(0.3, 0.6),
                              'final_estimator__final_est__max_depth': scipy.stats.randint(1, 16),
                              'final_estimator__final_est__colsample_bytree': scipy.stats.uniform(0.5, 0.4),
                              'final_estimator__final_est__min_child_weight': [1, 2, 3, 4],
                              'final_estimator__final_est__gamma': scipy.stats.expon(scale=0.05),
                              }
    if passthrough:
        final_est = Pipeline([
            ('final_preprocess', final_est_pipeline(feature_names=x_train.columns.tolist(),
                                                    preprocess_pipeline=pp_selector(final_pp_choice),
                                                    no_of_lvl1=len(lvl1_pipeline))),
            ('debugger', DebuggerTransformer(info='final')),
            ('final_est', XGBRegressor())
        ])
    else:
        final_est = XGBRegressor()

    est = StackingRegressor(estimators=lvl1_pipeline, final_estimator=final_est, passthrough=passthrough)
    est = RandomizedSearchCV(est,
                             param_distributions=final_estimator_params,
                             cv=5,
                             n_iter=100,
                             scoring=make_scorer(rmsle, greater_is_better=False),
                             verbose=1,
                             n_jobs=-1)

    est.fit(x_train, y_train)
    score = {'lvl2_xgb': est.cv_results_}
    results_dir = create_results_directory(results_dir)
    with open(f'{results_dir}/results_store.pkl', 'wb') as f:
        pickle.dump(score, f)
Example #5
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params,
                                     passthrough):
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(scale(X_diabetes),
                                                   y_diabetes,
                                                   random_state=42)
    estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
    reg = StackingRegressor(estimators=estimators,
                            final_estimator=final_estimator,
                            cv=cv,
                            passthrough=passthrough)
    reg.fit(X_train, y_train)
    result = reg.predict(X_test, **predict_params)
    expected_result_length = 2 if predict_params else 1
    if predict_params:
        assert len(result) == expected_result_length

    X_trans = reg.transform(X_test)
    expected_column_count = 12 if passthrough else 2
    assert X_trans.shape[1] == expected_column_count
    if passthrough:
        assert_allclose(X_test, X_trans[:, -10:])

    reg.set_params(lr='drop')
    reg.fit(X_train, y_train)
    reg.predict(X_test)

    X_trans = reg.transform(X_test)
    expected_column_count_drop = 11 if passthrough else 1
    assert X_trans.shape[1] == expected_column_count_drop
    if passthrough:
        assert_allclose(X_test, X_trans[:, -10:])
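A minimal sketch of the column arithmetic asserted above: transform() emits one prediction column per non-dropped base estimator, plus the ten original diabetes features when passthrough=True (2 + 10 = 12, or 1 + 10 = 11 after dropping 'lr'). Loading the data directly here is an assumption for illustration.

from sklearn.datasets import load_diabetes
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import scale
from sklearn.svm import LinearSVR

X, y = load_diabetes(return_X_y=True)
reg = StackingRegressor(estimators=[('lr', LinearRegression()), ('svr', LinearSVR())],
                        final_estimator=Ridge(), passthrough=True)
reg.fit(scale(X), y)
print(reg.transform(scale(X)).shape)  # (442, 12): 2 stacked predictions + 10 original features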
Example #6
def stacking(X, y, k_cv):
    res = []
    estimators = [('krr', KernelRidge(kernel="cosine", alpha=0.001)),
                  ('svr', SVR(C=2000, gamma=0.001)),
                  ("enet",
                   ElasticNet(alpha=0.00001, l1_ratio=0.0005, max_iter=10000))]
    reg = StackingRegressor(estimators=estimators,
                            n_jobs=15,
                            final_estimator=LinearRegression())
    kfold = KFold(n_splits=k_cv, shuffle=True, random_state=0)
    valid_split = kfold.split(y)
    for i in range(k_cv):
        split_index = next(valid_split)
        test_index = split_index[1]
        y_test = y[test_index]
        trainval_index = split_index[0]
        X_trainval = X[trainval_index, :]
        X_test = X[test_index, :]
        y_trainval = y[trainval_index]
        reg.fit(X_trainval, y_trainval)
        print((reg.score(X_trainval, y_trainval))**0.5)
        test_pre = reg.predict(X_test)
        print("accuracy: ", (r_2(y_test, test_pre))**0.5)
        res.append(r_2(y_test, test_pre)**0.5)
        print("mean acacuracy: ", np.array(res).mean())
    print("mean acacuracy: ", np.array(res).mean())
Example #7
def stackingregressor(X, y):
    poly_pipeline = make_pipeline(
        PolynomialFeatures(degree=2, include_bias=False),
        ElasticNet(alpha=0.1, l1_ratio=0.2))
    poly_pipeline.fit(X, y)

    rfr = RandomForestRegressor()
    rfr.fit(X, y)

    gbr = GradientBoostingRegressor(random_state=42)
    gbr.fit(X, y)

    lgbm = LGBMRegressor(random_state=42)
    lgbm.fit(X, y)

    xgb = XGBRegressor(random_state=42)
    xgb.fit(X, y)

    stack_models = [
        ('elasticnet', poly_pipeline),
        ('randomforest', rfr),
        ('gbr', gbr),
        ('lgbm', lgbm),
    ]
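    # note: StackingRegressor clones and re-fits the base models internally with
    # cross-validation, so the individual fit() calls above are not required for
    # the stacked model itself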

    stack_reg = StackingRegressor(stack_models, final_estimator=xgb, n_jobs=-1)

    return stack_reg
Example #8
 def build_stacker(self, train_x, train_y, test_x, test_y, params):
     """
     Build, fit and predict with a stacking regressor ensemble.
     :param train_x:
     :param train_y:
     :param test_x:
     :param test_y:
     :param params:
     :return:
     """
     # n_train_x = sk.preprocessing.scale(train_x, axis=1)
     if "estimators" in params.keys():
         estimators = []
         for e in params["estimators"]:
             # example estimator would be 'linear_model.RidgeCV', where the group and type must match the scikit-learn model
             sm = e.split(".")
             estimator = (sm[1], getattr(getattr(sk, sm[0]), sm[1])())  # instantiate: StackingRegressor expects estimator instances
             estimators.append(estimator)
     else:
         estimators = [
             ('lr', sk.linear_model.LinearRegression()),
             # ('svr', sk.svm.LinearSVR(random_state=42)),
             ('enet', sk.linear_model.ElasticNetCV()),
             ('ridge', sk.linear_model.RidgeCV())
         ]
     self.model = StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor(random_state=42),  passthrough=False, n_jobs=-1)
     self.results = self.model.fit(train_x, train_y)
     pred_y = self.results.predict(test_x)
     self.predictions = pred_y
     test_y = test_y.to_numpy().flatten()
     self.coef = None
     res = test_y - pred_y
     self.residuals = res
Example #9
def model_to_test_reg():
    estimators = [
        ('dt', DecisionTreeRegressor()),
        ('las', LinearRegression())]
    stacking_regressor = StackingRegressor(
        estimators=estimators, final_estimator=LinearRegression())
    return stacking_regressor
Example #10
def get_models(models=None):
    if models is None:  # avoid a mutable default argument
        models = {}
    # models
    max_depth = 10
    n_estimators = 50
    models['LR'] = LinearRegression()
    models['RF'] = RandomForestRegressor(n_estimators=20,
                                         max_depth=10,
                                         random_state=0)
    models['ST'] = StackingRegressor(
        estimators=[
            ('rf',
             RandomForestRegressor(n_estimators=20,
                                   max_depth=10,
                                   random_state=0)),
            #('dt', DecisionTreeRegressor(max_depth=4)),
            ('lr', Lasso()),
            #('lr1', LinearRegression()),
        ],
        final_estimator=RandomForestRegressor(n_estimators=20,
                                              max_depth=4,
                                              random_state=0),
        passthrough=True)  #DecisionTreeRegressor(max_depth=max_depth)

    #models['GB'] = GradientBoostingRegressor(random_state=0, n_estimators=20, max_depth=10, learning_rate=0.6)
    #models['AB'] = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=max_depth), random_state=0, n_estimators=n_estimators)
    #models['BG'] = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=max_depth), n_estimators=n_estimators, random_state=0)
    #models['DT'] = DecisionTreeRegressor(max_depth=max_depth, random_state=0)
    print('Defined %d models' % len(models))
    return models
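A quick usage sketch for the dictionary above, assuming the estimator imports used by get_models are in scope; each model is scored with 5-fold R^2 on synthetic data.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score

X, y = make_regression(n_samples=200, n_features=8, noise=10, random_state=0)
for name, model in get_models().items():
    scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    print('%s: %.3f' % (name, np.mean(scores)))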
Example #11
def main():
    data = pd.read_csv('dataset/complete.csv')
    data.drop("CountryCode", axis=1, inplace=True)
    data.drop("RegionName", axis=1, inplace=True)
    data.drop("RegionCode", axis=1, inplace=True)
    data.drop("M1_Wildcard", axis=1, inplace=True)

    # Remove flag and index columns (collect names first: DataFrame.iteritems()
    # was removed in pandas 2.0, and dropping columns while iterating is unsafe)
    drop_cols = [col for col in data.columns
                 if "flag" in col.lower() or "index" in col.lower()]
    data.drop(columns=drop_cols, inplace=True)

    # remove any rows that contain 'nan'
    data.dropna(axis=0, how='any', inplace=True)

    # change datatype of Date from int to DateTime64
    date_series = pd.to_datetime(data['Date'].astype(str), format='%Y-%m-%d')
    data['Date'] = date_series.map(dt.datetime.toordinal)
    # encoding country name
    data = pd.get_dummies(data, columns=['CountryName'],
                          prefix=['CountryName'])

    drop_cols = [col for col in data.columns if "countryname" in col.lower()]
    data.drop(columns=drop_cols, inplace=True)
    print(data.info())

    # separate feature and label
    data_feature = data.drop(['ConfirmedCases', 'new_cases', 'ConfirmedDeaths'], axis=1, inplace=False)
    data_label_total_cases = data.loc[:, 'ConfirmedCases']
    data_label_total_deaths = data.loc[:, 'ConfirmedDeaths']
    data_label_cases_perDay = data.loc[:, 'new_cases']

    scaler = RobustScaler()
    features = scaler.fit_transform(data_feature)

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        data_label_cases_perDay,
                                                        test_size=0.25,
                                                        random_state=42)

    estimators = [
        ('rfr', RandomForestRegressor(random_state=42, n_estimators=50)),
        ('gbr', GradientBoostingRegressor(random_state=42)),
        ('lsvr', LinearSVR(random_state=42, max_iter=1000)),
        ('etr', ExtraTreesRegressor(random_state=42, criterion='absolute_error', n_estimators=50))  # 'mae' was renamed in scikit-learn 1.0
    ]

    model = StackingRegressor(
        estimators=estimators,
        final_estimator=ExtraTreesRegressor(random_state=42, n_estimators=50)
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print("MAE: " + str(mae))
Example #12
def stacking_regressor(estimators, final_estimator, data, labels, args=None):
    """
    Stacking: reduce bias by combining multiple models (regression version).
    """
    from sklearn.ensemble import StackingRegressor
    reg = StackingRegressor(estimators=estimators, final_estimator=final_estimator, **(args or {}))
    reg.fit(data, labels)
    return reg
Example #13
def test_stacking_regressor_drop_estimator():
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(scale(X_diabetes),
                                                   y_diabetes,
                                                   random_state=42)
    estimators = [('lr', 'drop'), ('svr', LinearSVR(random_state=0))]
    rf = RandomForestRegressor(n_estimators=10, random_state=42)
    reg = StackingRegressor(estimators=[('svr', LinearSVR(random_state=0))],
                            final_estimator=rf,
                            cv=5)
    reg_drop = StackingRegressor(estimators=estimators,
                                 final_estimator=rf,
                                 cv=5)

    reg.fit(X_train, y_train)
    reg_drop.fit(X_train, y_train)
    assert_allclose(reg.predict(X_test), reg_drop.predict(X_test))
    assert_allclose(reg.transform(X_test), reg_drop.transform(X_test))
Example #14
    def stack_pipe(self, pipes, final_estimator):
        """Create a stacking ensemble pipe where individual pipes feed into
           a final stacking estimator model.

        Args:
            pipes (list): List of pipes whose predictions feed the final estimator
            final_estimator (sklearn.estimator): Estimator that will fit on model input predictions

        Returns:
            sklearn.StackingRegressor: Stacked estimator that trains on the base model predictions
        """
        ests = []
        for i, p in enumerate(pipes):
            ests.append((f'stack_p{i}', p))

        if self.model_obj == 'reg':
            return StackingRegressor(ests, final_estimator=final_estimator)

        if self.model_obj == 'class':
            # the original returned a StackingRegressor here as well; a
            # classification target presumably wants StackingClassifier
            return StackingClassifier(ests, final_estimator=final_estimator)
Example #15
def test_stacking_regressor(final_estimator):
    reg = StackingRegressor(estimators=[("svr", LinearSVR())],
                            final_estimator=final_estimator)
    html_output = estimator_html_repr(reg)

    assert str(reg.estimators[0][0]) in html_output
    assert "LinearSVR</label>" in html_output
    if final_estimator is None:
        assert "RidgeCV</label>" in html_output
    else:
        assert final_estimator.__class__.__name__ in html_output
Example #16
def Stacked_Ensemble(x_train, x_test, y_train, y_test):

    # Path to save model
    path_to_model = os.path.join("model", "StackedEnsemble.sav")

    # define the base models
    level0 = list()
    level0.append(('lr', LinearRegression()))
    level0.append(('knn', KNeighborsRegressor()))
    level0.append(('cart', DecisionTreeRegressor()))
    level0.append(('svm', SVR()))
    level0.append(('adaboost', AdaBoostRegressor()))
    # level0.append(('bayes', ))

    # Classifier
    # level0.append(('lr', LogisticRegression()))
    # level0.append(('knn', KNeighborsClassifier()))
    # level0.append(('cart', DecisionTreeClassifier()))
    # level0.append(('svm', SVC()))
    # level0.append(('bayes', GaussianNB()))

    # define meta learner model
    level1 = LinearRegression()

    # Classifier
    # level1 = LogisticRegression()

    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
    # model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

    model.fit(x_train, y_train)

    # Predicting
    y_pred = model.predict(x_test)

    # Printing the test results (classification-style metrics on the rounded regression output)
    print("\n\n(Stacked Ensemble) Confusion Matrix: \n",
          confusion_matrix(y_true=y_test, y_pred=y_pred.round()))
    print("(Stacked Ensemble) Report: \n",
          classification_report(y_test, y_pred.round()))
    print("(Stacked Ensemble) Accuracy: \n",
          accuracy_score(y_test, y_pred.round()))

    # Saving the Model
    if not os.path.exists(os.path.dirname(path_to_model)):
        try:
            os.makedirs(os.path.dirname(path_to_model))
        except OSError as exc:  # Guard against race condition
            print("Could not create model directory:", exc)

    pickle.dump(model, open(path_to_model, 'wb'))

    return y_test, y_pred
Example #17
def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('knn', KNeighborsRegressor()))
    level0.append(('cart', DecisionTreeRegressor()))
    level0.append(('svm', SVR()))
    # define meta learner model
    level1 = LinearRegression()
    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
    return model
Example #18
def get_stacking():
    """ Models used in stacking regressor"""
    level0 = list()
    level0.append(('knn', KNeighborsRegressor()))
    level0.append(('huber', HuberRegressor()))
    level0.append(('gbr', GradientBoostingRegressor()))
    # define meta learner model
    level1 = LinearRegression()
    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=3)
    return model
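A usage sketch for get_stacking() above, assuming its estimator imports are in scope and synthetic data stands in for the real dataset.

from sklearn.datasets import make_regression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=300, n_features=10, noise=5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = get_stacking()
model.fit(X_train, y_train)
print(r2_score(y_test, model.predict(X_test)))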
Example #19
def train_stack_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    verbose: int = 0,
    n_jobs: int = 1,
    order: Tuple[str, str] = ("rf", "lr"),
    lr_params: Optional[Dict]=None,
    rf_params: Optional[Dict]=None
) -> BaseEstimator:

    rf_estimator = RandomForestRegressor(
        n_estimators=1_000,
        criterion="mse",
        n_jobs=n_jobs,
        random_state=123,
        warm_start=False,
        verbose=verbose,
    )
    lr_estimator = LinearRegression()

    # Initialize the stacking regressor
    if order == ("rf", "lr"):
        stacking_regressor = StackingRegressor(
            estimators=[("Random Forest", rf_estimator)], final_estimator=lr_estimator
        )
    elif order == ("lr", "rf"):
        stacking_regressor = StackingRegressor(
            estimators=[("Linear Regression", lr_estimator)],
            final_estimator=rf_estimator,
        )
    else:
        raise ValueError(f"order must be ('rf', 'lr') or ('lr', 'rf'), got {order!r}")

    mo_regressor = MultiOutputRegressor(stacking_regressor, n_jobs=1)
    # train the stacked multi-output model
    t0 = time.time()
    mo_regressor.fit(xtrain, ytrain)
    t1 = time.time() - t0
    if verbose > 0:
        print(f"Training time: {t1:.3f} secs.")
    return mo_regressor
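A usage sketch for train_stack_model above with a synthetic multi-output target, since MultiOutputRegressor fits one stacking regressor per output column; the data here is an assumption.

from sklearn.datasets import make_regression

X, Y = make_regression(n_samples=200, n_features=5, n_targets=3, random_state=0)
model = train_stack_model(X, Y, verbose=1, n_jobs=2)
print(model.predict(X[:3]).shape)  # (3, 3): one column per target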
Example #20
    def get_pipe(self):

        try:
            pipe_n = len(self.pipelist)
            est_pipes = [(p[0], p[1]['pipe'](**p[1]['pipe_kwargs'])) for i, p in enumerate(self.pipelist)]
            final_e = self.stacker_estimator
            steps = [
                ('prep', MissingValHandler(prep_dict=self.prep_dict)),
                ('post',
                 make_pipeline(StackingRegressor(est_pipes, passthrough=False, final_estimator=final_e, n_jobs=1)))]
            return Pipeline(steps=steps)
        except Exception as e:
            print(f"Error building the pipeline: {e}")
            return None
Example #21
def test_multioutputregressor_ducktypes_fitted_estimator():
    """Test that MultiOutputRegressor checks the fitted estimator for
    predict. Non-regression test for #16549."""
    X, y = load_linnerud(return_X_y=True)
    stacker = StackingRegressor(estimators=[("sgd",
                                             SGDRegressor(random_state=1))],
                                final_estimator=Ridge(),
                                cv=2)

    reg = MultiOutputRegressor(estimator=stacker).fit(X, y)

    # Does not raise
    reg.predict(X)
Example #22
 def _get_stacker(self, mode, estimators, ensemble_config):
     if self.configs['fit']['train_mode'] == 'clf':
         stacker = StackingClassifier(
             estimators=estimators,
             final_estimator=self.get_base_estimator(
                 ensemble_config['model']),
             n_jobs=-1)
     elif self.configs['fit']['train_mode'] == 'reg':
         stacker = StackingRegressor(
             estimators=estimators,
             final_estimator=self.get_base_estimator(
                 ensemble_config['model']),
             n_jobs=-1)
     else:
         raise ValueError('unknown train_mode: %s'
                          % self.configs['fit']['train_mode'])
     return stacker
Example #23
def stacking_qtlmas(X_trainval, y_trainval, X_test, y_test):
    estimators = [('krr', KernelRidge(kernel="cosine", alpha=0.005)),
                  ('svr', SVR(C=2500, gamma=0.001)),
                  ("enet",
                   ElasticNet(alpha=0.00001, l1_ratio=0.0005, max_iter=10000))]
    reg = StackingRegressor(estimators=estimators,
                            n_jobs=15,
                            final_estimator=LinearRegression())

    reg.fit(X_trainval, y_trainval)
    print((reg.score(X_trainval, y_trainval))**0.5)
    test_pre = reg.predict(X_test)
    return test_pre
Example #24
def train(prop, k_fold=5, test_size=0.2):
    # 0.settings
    set_seed(GLOBAL_SEED)
    cv = k_fold  # cross-validation generator
    if cv == 1:
        cv = LeaveOneOut()

    # 1. base learners
    knn = KNeighborsRegressor(leaf_size=3, n_neighbors=2, p=1, weights='distance')
    svr = GridSearchCV(SVR(), param_grid={"C": np.logspace(0, 2, 4), "gamma": np.logspace(-2, 2, 7)}, n_jobs=-1)
    ridge = RidgeCV(alphas=(0.1, 1.0, 10.0, 100.0))
    mlp = MLPRegressor(hidden_layer_sizes=(50, 100, 50), max_iter=700)
    rf = RandomForestRegressor()
    gbdt = GradientBoostingRegressor()
    # 2. meta model
    meta_model = RidgeCV(alphas=(0.1, 1.0, 10.0, 100.0))
    # 3.stacking model
    stacking_model = StackingRegressor(
        estimators=[('KNN', knn), ('SVR', svr), ('Ridge', ridge), ('MLP', mlp), ('RF', rf), ('GBDT', gbdt)],
        final_estimator=meta_model,
        n_jobs=-1, cv=cv  # cross validation
    )

    # 4.load data
    x, y = loadXY(config.data_load_path[prop])
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, shuffle=True)

    # 5. train model (the stacking model performs internal cross-validation)
    stacking_model.fit(x_train, y_train)
    # val-scores
    result = cross_validate(stacking_model, x_train, y_train, scoring=['neg_mean_absolute_error','neg_mean_squared_error','r2'], cv=cv)
    mae_val = -result['test_neg_mean_absolute_error'].mean()  # negate the neg_* scores back to errors
    mse_val = -result['test_neg_mean_squared_error'].mean()
    r2_val = result['test_r2'].mean()
    # test-score
    pred = stacking_model.predict(x_test)
    mae_test = sklearn.metrics.mean_absolute_error(y_test, pred)
    mse_test = sklearn.metrics.mean_squared_error(y_test, pred)
    r2_test = sklearn.metrics.r2_score(y_test, pred)
    # show
    print("Validation set: MAE:%f, MSE:%f, R2:%f\n"
          "Test set: MAE:%f, MSE:%f, R2:%f"
          % (mae_val, mse_val, r2_val,
             mae_test, mse_test, r2_test))

    # 7.save model
    month_once_save_name = time.strftime('%Y-%m.pkl', time.localtime())
    save_path = os.path.join(config.model_save_path[prop], month_once_save_name)
    file_util.save_model(stacking_model, save_path)
Example #25
def test_stacking_regressor_sparse_passthrough(fmt):
    # Check passthrough behavior on a sparse X matrix
    X_train, X_test, y_train, _ = train_test_split(
        sparse.coo_matrix(scale(X_diabetes)).asformat(fmt),
        y_diabetes, random_state=42
    )
    estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
    rf = RandomForestRegressor(n_estimators=10, random_state=42)
    clf = StackingRegressor(
        estimators=estimators, final_estimator=rf, cv=5, passthrough=True
    )
    clf.fit(X_train, y_train)
    X_trans = clf.transform(X_test)
    assert_allclose_dense_sparse(X_test, X_trans[:, -10:])
    assert sparse.issparse(X_trans)
    assert X_test.format == X_trans.format
Example #26
def test_stacking_regression():
    from sklearn.model_selection import train_test_split
    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import RidgeCV
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.ensemble import StackingRegressor

    X, y = load_diabetes(return_X_y=True)
    estimators = [('gbm',
                   xgb.sklearn.XGBRegressor(objective='reg:squarederror')),
                  ('lr', RidgeCV())]
    reg = StackingRegressor(estimators=estimators,
                            final_estimator=RandomForestRegressor(
                                n_estimators=10, random_state=42))

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    reg.fit(X_train, y_train).score(X_test, y_test)
Example #27
def grid_search_model():
    scorer = make_scorer(mean_squared_log_error, greater_is_better=False)  # MSLE is an error metric: lower is better
    x, y = load_training_data(as_numpy=True)
    reg1 = LinearRegression()
    reg2 = KNeighborsRegressor()
    estimators = [("linear", reg1), ("neighbors", reg2)]
    reg3 = StackingRegressor(estimators=estimators,
                             passthrough=True,
                             final_estimator=RidgeCV())
    core = GradientBoostingRegressor(init=reg3)
    model = build_model_of(core)
    results = cross_val_score(model, x, y, scoring=scorer)
    t = np.average(results)
    print(t)
    model.fit(x, y)
    save_model(model, 'stacked-and-boosted')
    create_submission(model, 'stacked-and-boosted')
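The pattern above in isolation: a StackingRegressor passed as the init estimator of a GradientBoostingRegressor, so boosting starts from the stack's predictions rather than a constant. A sketch on synthetic data; the project helpers (load_training_data, build_model_of, save_model, create_submission) are not needed here.

from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.neighbors import KNeighborsRegressor

X, y = make_regression(n_samples=300, n_features=10, noise=10, random_state=0)
stack = StackingRegressor(estimators=[('linear', LinearRegression()),
                                      ('neighbors', KNeighborsRegressor())],
                          passthrough=True, final_estimator=RidgeCV())
boosted = GradientBoostingRegressor(init=stack, random_state=0)
print(boosted.fit(X, y).score(X, y))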
Example #28
def lvl2_ridgecv(rawdf, results_dir, pp_choice, param_dir, passthrough, final_pp_choice=None, ):
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]
    model_store = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }

    with open(param_dir, 'rb') as f:
        model_results = pickle.load(f)
    model_results = {k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False).reset_index(drop=True)
                     for k, v in model_results.items()}  # reset the index so .loc[0] is the top-ranked row
    model_object = {k: model_object[k].set_params(**{kk.split('__')[1]: vv for kk, vv in v.loc[0, 'params'].items()})
                    for k, v in model_results.items()}

    lvl1_pipeline = [
        (model_name,
         Pipeline([
             ('preprocess', pp_selector(pipeline_idx)),
             ('debugger', DebuggerTransformer(info='lvl1')),
             (model_name, model_object[model_name])
         ])
         )
        for model_name, pipeline_idx in zip(model_store, pp_choice)]

    if passthrough:
        final_est = Pipeline([
            ('final_preprocess', final_est_pipeline(feature_names=x_train.columns.tolist(),
                                                    preprocess_pipeline=pp_selector(final_pp_choice),
                                                    no_of_lvl1=len(lvl1_pipeline))),
            ('debugger', DebuggerTransformer(info='final')),
            ('final_est', RidgeCV())
        ])
    else:
        final_est = RidgeCV()

    est = StackingRegressor(estimators=lvl1_pipeline, final_estimator=final_est, passthrough=passthrough)
    score = cross_validate(est, x_train, y_train, cv=5, return_train_score=True,
                           scoring=make_scorer(rmsle, greater_is_better=False))

    results_dir = create_results_directory(results_dir)
    with open(f'{results_dir}/results_store.pkl', 'wb') as f:
        pickle.dump(score, f)
Example #29
    def fit(self, X, y, random_state=None):
        """
        Train ENOLS on the given training set.

        Parameters
        ----------
        X: an input array of shape (n_sample, n_features)
        y: an array of shape (n_sample,) containing the target values for the input examples

        Return
        ------
        self: the fitted model
        """

        # use random instead of np.random to sample random numbers below
        random = check_random_state(random_state)

        estimators = [('lr', LinearRegression())]

        if isinstance(self.sample_size, int):
            self.sample_size = 'reservoir_sampling'

        # add all the trained OLS models to this list
        self.estimators_lr, self.estimators_TSR, self.estimators_enols = [], [], []
        for i in range(self.n_estimators):
            samples = sample_without_replacement(n_population=random.choice([50, 100]),
                                                 n_samples=random.choice([10, 20]),
                                                 random_state=random_state, method=self.sample_size)

            X_train, y_train = [], []
            for idx in samples:  # avoid shadowing the outer loop variable i
                X_train.append(X[idx])
                y_train.append(y[idx])

            reg = LinearRegression()
            reg.fit(np.array(X_train), np.array(y_train))

            tsr = TheilSenRegressor()
            tsr.fit(np.array(X_train), np.array(y_train))

            enol = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())
            enol.fit(np.array(X_train), np.array(y_train))

            self.estimators_lr.append(reg), self.estimators_TSR.append(tsr), self.estimators_enols.append(enol)

        return self
Example #30
 def get_pipe(self):
     try:
         pipe_n = len(self.pipelist)
         est_pipes = [(p[0], p[1]['pipe'](**p[1]['pipe_kwargs']))
                      for i, p in enumerate(self.pipelist)]
         final_e = self.stacker_estimator
         steps = [('prep', MissingValHandler(prep_dict=self.prep_dict)),
                  ('post',
                   make_pipeline(StackingRegressor(est_pipes,
                                                   passthrough=False,
                                                   final_estimator=final_e,
                                                   n_jobs=-1,
                                                   verbose=5),
                                 verbose=True))]
         return Pipeline(steps=steps)
     except Exception as e:
         print(f"Error: {e}")
         return None