Example #1
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params,
                                     passthrough):
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(scale(X_diabetes),
                                                   y_diabetes,
                                                   random_state=42)
    estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
    reg = StackingRegressor(estimators=estimators,
                            final_estimator=final_estimator,
                            cv=cv,
                            passthrough=passthrough)
    reg.fit(X_train, y_train)
    result = reg.predict(X_test, **predict_params)
    expected_result_length = 2 if predict_params else 1
    if predict_params:
        assert len(result) == expected_result_length

    X_trans = reg.transform(X_test)
    # two base-estimator prediction columns, plus the 10 scaled diabetes
    # features when passthrough=True
    expected_column_count = 12 if passthrough else 2
    assert X_trans.shape[1] == expected_column_count
    if passthrough:
        assert_allclose(X_test, X_trans[:, -10:])

    reg.set_params(lr='drop')
    reg.fit(X_train, y_train)
    reg.predict(X_test)

    X_trans = reg.transform(X_test)
    expected_column_count_drop = 11 if passthrough else 1
    assert X_trans.shape[1] == expected_column_count_drop
    if passthrough:
        assert_allclose(X_test, X_trans[:, -10:])
Example #2
def stacking(X, y, k_cv):
    res = []
    estimators = [('krr', KernelRidge(kernel="cosine", alpha=0.001)),
                  ('svr', SVR(C=2000, gamma=0.001)),
                  ("enet",
                   ElasticNet(alpha=0.00001, l1_ratio=0.0005, max_iter=10000))]
    reg = StackingRegressor(estimators=estimators,
                            n_jobs=15,
                            final_estimator=LinearRegression())
    kfold = KFold(n_splits=k_cv, shuffle=True, random_state=0)
    # iterate over the folds directly instead of calling __next__ by hand
    for trainval_index, test_index in kfold.split(X):
        X_trainval = X[trainval_index, :]
        y_trainval = y[trainval_index]
        X_test = X[test_index, :]
        y_test = y[test_index]
        reg.fit(X_trainval, y_trainval)
        print(reg.score(X_trainval, y_trainval) ** 0.5)  # sqrt of train R^2
        test_pre = reg.predict(X_test)
        print("fold r: ", r_2(y_test, test_pre) ** 0.5)  # sqrt of test R^2
        res.append(r_2(y_test, test_pre) ** 0.5)
        print("running mean r: ", np.array(res).mean())
    print("mean r: ", np.array(res).mean())
Example #3
 def reg_ensemble_1(self):
     """
     Regressors Ensemble
     :return: ensemble prediction
     """
     lr, lr_pred = self.linear_regr()
     rf, rf_pred = self.random_forest_regr()
     lasso, lasso_pred = self.lasso_regr()
     # el, el_pred = self.elastic_net_regr()
     # dt, dt_pred = self.decis_tree_regr()
     # knr, knr_pred = self.kneighbors_regr()
     # gbr, gbr_pred = self.gradient_boost_regr()
     estimators = [
         # ("str", dt),
         # ("eln", el),
         ("lasso", lasso),
         # ("knr", knr),
         # ("gbr", gbr),
         ("lr", lr),
         ("rf", rf)
     ]
     reg = StackingRegressor(estimators=estimators,
                             final_estimator=RandomForestRegressor(),
                             n_jobs=-1)
     reg.fit(self.x_train, self.y_train)
     return reg.predict(self.x_test)
Example #4
def main():
    data = pd.read_csv('dataset/complete.csv')
    data.drop("CountryCode", axis=1, inplace=True)
    data.drop("RegionName", axis=1, inplace=True)
    data.drop("RegionCode", axis=1, inplace=True)
    data.drop("M1_Wildcard", axis=1, inplace=True)

    # Remove flag/index columns (collect the names first: dropping while
    # iterating is unsafe, and DataFrame.iteritems was removed in pandas 2.0)
    flag_cols = [col for col in data.columns
                 if "flag" in col.lower() or "index" in col.lower()]
    data.drop(flag_cols, axis=1, inplace=True)

    # remove any rows that contain 'nan'
    data.dropna(axis=0, how='any', inplace=True)

    # change datatype of Date from int to DateTime64
    date_series = pd.to_datetime(data['Date'].astype(str), format='%Y-%m-%d')
    data['Date'] = date_series.map(dt.datetime.toordinal)
    # encoding country name
    data = pd.get_dummies(data, columns=['CountryName'],
                          prefix=['CountryName'])

    country_cols = [col for col in data.columns
                    if "countryname" in col.lower()]
    data.drop(country_cols, axis=1, inplace=True)
    print(data.info())

    # separate feature and label
    data_feature = data.drop(['ConfirmedCases', 'new_cases', 'ConfirmedDeaths'], axis=1, inplace=False)
    data_label_total_cases = data.loc[:, 'ConfirmedCases']
    data_label_total_deaths = data.loc[:, 'ConfirmedDeaths']
    data_label_cases_perDay = data.loc[:, 'new_cases']

    scaler = RobustScaler()
    features = scaler.fit_transform(data_feature)

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        data_label_cases_perDay,
                                                        test_size=0.25,
                                                        random_state=42)

    estimators = [
        ('rfr', RandomForestRegressor(random_state=42, n_estimators=50)),
        ('gbr', GradientBoostingRegressor(random_state=42)),
        ('lsvr', LinearSVR(random_state=42, max_iter=1000)),
        # 'mae' was renamed to 'absolute_error' in scikit-learn 1.0
        ('etr', ExtraTreesRegressor(random_state=42, criterion='absolute_error', n_estimators=50))
    ]

    model = StackingRegressor(
        estimators=estimators,
        final_estimator=ExtraTreesRegressor(random_state=42, n_estimators=50)
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print("MAE: " + str(mae))
Example #5
def test_stacking_regressor_drop_estimator():
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(scale(X_diabetes),
                                                   y_diabetes,
                                                   random_state=42)
    estimators = [('lr', 'drop'), ('svr', LinearSVR(random_state=0))]
    rf = RandomForestRegressor(n_estimators=10, random_state=42)
    reg = StackingRegressor(estimators=[('svr', LinearSVR(random_state=0))],
                            final_estimator=rf,
                            cv=5)
    reg_drop = StackingRegressor(estimators=estimators,
                                 final_estimator=rf,
                                 cv=5)

    reg.fit(X_train, y_train)
    reg_drop.fit(X_train, y_train)
    assert_allclose(reg.predict(X_test), reg_drop.predict(X_test))
    assert_allclose(reg.transform(X_test), reg_drop.transform(X_test))
Example #6
def Stacked_Ensemble(x_train, x_test, y_train, y_test):

    # Path to save model
    path_to_model = os.path.join("model", "StackedEnsemble.sav")

    # define the base models
    level0 = list()
    level0.append(('lr', LinearRegression()))
    level0.append(('knn', KNeighborsRegressor()))
    level0.append(('cart', DecisionTreeRegressor()))
    level0.append(('svm', SVR()))
    level0.append(('adaboost', AdaBoostRegressor()))
    # level0.append(('bayes', ))

    # Classifier
    # level0.append(('lr', LogisticRegression()))
    # level0.append(('knn', KNeighborsClassifier()))
    # level0.append(('cart', DecisionTreeClassifier()))
    # level0.append(('svm', SVC()))
    # level0.append(('bayes', GaussianNB()))

    # define meta learner model
    level1 = LinearRegression()

    # Classifier
    # level1 = LogisticRegression()

    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
    # model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

    model.fit(x_train, y_train)

    # Predicting
    y_pred = model.predict(x_test)

    # Printing the results (classification metrics applied to rounded regression predictions)
    print("\n\n(Stacked Ensemble) Confusion Matrix: \n",
          confusion_matrix(y_true=y_test, y_pred=y_pred.round()))
    print("(Stacked Ensemble) Report: \n",
          classification_report(y_test, y_pred.round()))
    print("(Stacked Ensemble) Accuracy: \n",
          accuracy_score(y_test, y_pred.round()))

    # Saving the Model
    os.makedirs(os.path.dirname(path_to_model), exist_ok=True)

    pickle.dump(model, open(path_to_model, 'wb'))

    return y_test, y_pred
Example #7
def stacking_qtlmas(X_trainval, y_trainval, X_test, y_test):
    estimators = [('krr', KernelRidge(kernel="cosine", alpha=0.005)),
                  ('svr', SVR(C=2500, gamma=0.001)),
                  ("enet",
                   ElasticNet(alpha=0.00001, l1_ratio=0.0005, max_iter=10000))]
    reg = StackingRegressor(estimators=estimators,
                            n_jobs=15,
                            final_estimator=LinearRegression())

    reg.fit(X_trainval, y_trainval)
    print(reg.score(X_trainval, y_trainval) ** 0.5)  # sqrt of train R^2
    test_pre = reg.predict(X_test)
    return test_pre
Example #8
def train(prop, k_fold=5, test_size=0.2):
    # 0.settings
    set_seed(GLOBAL_SEED)
    cv = k_fold  # cross-validation generator
    if cv == 1:
        cv = LeaveOneOut()

    # 1.basic learner nets
    knn = KNeighborsRegressor(leaf_size=3, n_neighbors=2, p=1, weights='distance')
    svr = GridSearchCV(SVR(), param_grid={"C": np.logspace(0, 2, 4), "gamma": np.logspace(-2, 2, 7)}, n_jobs=-1)
    ridge = RidgeCV(alphas=(0.1, 1.0, 10.0, 100.0))
    mlp = MLPRegressor(hidden_layer_sizes=(50, 100, 50), max_iter=700)
    rf = RandomForestRegressor()
    gbdt = GradientBoostingRegressor()
    # 2.meta model net
    meta_model = RidgeCV(alphas=(0.1, 1.0, 10.0, 100.0))
    # 3.stacking model
    stacking_model = StackingRegressor(
        estimators=[('KNN', knn), ('SVR', svr), ('Ridge', ridge), ('MLP', mlp), ('RF', rf), ('GBDT', gbdt)],
        final_estimator=meta_model,
        n_jobs=-1, cv=cv  # cross validation
    )

    # 4.load data
    x, y = loadXY(config.data_load_path[prop])
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, shuffle=True)

    # 5.train model (the stacking model already performs internal cross-validation)
    stacking_model.fit(x_train, y_train)
    # val-scores
    result = cross_validate(stacking_model, x_train, y_train, scoring=['neg_mean_absolute_error','neg_mean_squared_error','r2'], cv=cv)
    mae_val = result['test_neg_mean_absolute_error'].mean()
    mse_val = result['test_neg_mean_squared_error'].mean()
    r2_val = result['test_r2'].mean()
    # test-score
    pred = stacking_model.predict(x_test)
    mae_test = sklearn.metrics.mean_absolute_error(y_test, pred)
    mse_test = sklearn.metrics.mean_squared_error(y_test, pred)
    r2_test = sklearn.metrics.r2_score(y_test, pred)
    # show
    print("验证集: MAE:%f, MSE:%f, R2:%f\n"
          "测试集: MAE:%f, MSE:%f, R2:%f"
          % (mae_val, mse_val, r2_val,
             mae_test, mse_test, r2_test))

    # 7.save model
    month_once_save_name = time.strftime('%Y-%m.pkl', time.localtime())
    save_path = os.path.join(config.model_save_path[prop], month_once_save_name)
    file_util.save_model(stacking_model, save_path)
Example #9
 def reg_ensemble_4(self):
     """
     Regressors Ensemble
     :return: ensemble prediction
     """
     lr, lr_pred = self.linear_regr()
     rf, rf_pred = self.random_forest_regr()
     lasso, lasso_pred = self.lasso_regr()
     estimators = [
         ("lr", lr),
         ("rf", rf),
         ("lasso", lasso)
     ]
     reg = StackingRegressor(estimators=estimators,
                             final_estimator=RandomForestRegressor(),
                             cv=200,
                             n_jobs=-1)
     reg.fit(self.x_train, self.y_train)
     return reg.predict(self.x_test)
Example #10
def Stacked_Ensemble(x_train, x_test, y_train, y_test):

    # define the base models
    level0 = list()
    level0.append(('lr', LinearRegression()))
    level0.append(('knn', KNeighborsRegressor()))
    level0.append(('cart', DecisionTreeRegressor()))
    level0.append(('svm', SVR()))
    level0.append(('adaboost', AdaBoostRegressor()))
    # level0.append(('bayes', ))

    # Classifier
    # level0.append(('lr', LogisticRegression()))
    # level0.append(('knn', KNeighborsClassifier()))
    # level0.append(('cart', DecisionTreeClassifier()))
    # level0.append(('svm', SVC()))
    # level0.append(('bayes', GaussianNB()))

    # define meta learner model
    level1 = LinearRegression()

    # Classifier
    # level1 = LogisticRegression()

    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
    # model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

    model.fit(x_train, y_train)

    # Predicting
    y_pred = model.predict(x_test)

    # Printing the results (classification metrics applied to rounded regression predictions)
    print("\n\n(Stacked Ensemble) Confusion Matrix: \n",
          confusion_matrix(y_true=y_test, y_pred=y_pred.round()))
    print("(Stacked Ensemble) Report: \n",
          classification_report(y_test, y_pred.round()))
    print("(Stacked Ensemble) Accuracy: \n",
          accuracy_score(y_test, y_pred.round()))

    return y_test, y_pred
Example #11
def init_stacking(train_scaled, test_scaled, target, test_id):
    if not os.path.isfile('Data/pickles/models/pancake_stack'):

        estimators = [
            ('rfr', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                          criterion='squared_error',  # 'mse' was renamed in scikit-learn 1.0
                                          max_depth=5, max_features='auto', max_leaf_nodes=None,
                                          max_samples=None, min_impurity_decrease=0.0,
                                          min_samples_leaf=4,
                                          min_samples_split=2, min_weight_fraction_leaf=0.0,
                                          n_estimators=700, n_jobs=None, oob_score=True,
                                          random_state=None, verbose=3, warm_start=False)),

            ('xgboost', XGBRegressor(learning_rate=0.08, max_depth=3, n_estimators=500, n_jobs=-1,
                                     reg_alpha=0.001, reg_lambda=1, verbosity=2)),

            ('svr', SVR(C=5, cache_size=200, coef0=0.0, degree=1, epsilon=0.01, gamma='auto',
                        kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=3)),

            ('lgbm', LGBMRegressor(boosting_type='gbdt', lambda_l1=0,
                                   lambda_l2=0.1, learning_rate=0.1,
                                   max_depth=0, num_leaves=10))
        ]

        stack = StackingRegressor(estimators=estimators, final_estimator=LassoCV(cv=5), verbose=3)

        stack.fit(train_scaled, target)

        with open('Data/pickles/models/pancake_stack', 'wb') as file:
            pickle.dump(stack, file)

    else:
        with open('Data/pickles/models/pancake_stack', 'rb') as file:
            stack = pickle.load(file)

    y_pred = stack.predict(test_scaled)

    y_pred = np.exp(y_pred)

    submission_df = pd.DataFrame(y_pred, index=test_id, columns=['SalePrice'])

    submission_df.to_csv('Data/Submission/S6.csv')
Example #12
 def reg_ensemble_5(self):
     """
     Regressors Ensemble
     :return: ensemble prediction
     """
     # (unused here) parameter grid for tuning the final estimator with GridSearchCV
     param = {'final_estimator__max_features': [1, 5],
              'final_estimator__n_jobs': [1, -1, 5]}
     lr, lr_pred = self.linear_regr()
     rf, rf_pred = self.random_forest_regr()
     estimators = [
         ("lr", lr),
         ("rf", rf)
     ]
     # tss = TimeSeriesSplit(n_splits=2, test_size=10)
     tss = TimeSeriesSplit(gap=20, max_train_size=None, n_splits=10, test_size=None)
     reg = StackingRegressor(estimators=estimators,
                             final_estimator=RandomForestRegressor(),
                             cv=tss,
                             n_jobs=-1)
     reg.fit(self.x_train, self.y_train)
     return reg.predict(self.x_test)
Example #13
 def reg_ensemble_2(self):
     """
     Regressors Ensemble
     :return: ensemble prediction
     """
     lr, lr_pred = self.linear_regr()
     rf, rf_pred = self.random_forest_regr()
     lasso, lasso_pred = self.lasso_regr()
     # el, el_pred = self.elastic_net_regr()
     estimators = [
         # ("eln", el),
         ("lasso", lasso),
         ("lr", lr),
         ("rf", rf)
     ]
     reg = StackingRegressor(estimators=estimators,
                             final_estimator=RandomForestRegressor(),
                             cv=5,  # alternatively 10
                             n_jobs=-1)
     reg.fit(self.x_train, self.y_train)
     return reg.predict(self.x_test)
Example #14
n_train = int(round(X.shape[0] * train_prct))

## Models
knn = KNeighborsRegressor(n_neighbors=5)
svm = SVR()
# 'mse' was renamed to 'squared_error' in scikit-learn 1.0
rf = RandomForestRegressor(n_estimators=100, criterion='squared_error', random_state=0)
decision_tree = DecisionTreeRegressor(max_depth=3, max_features=2)
bayesian_ridge = BayesianRidge()

base_models = [("KNN", knn), ("SVM", svm), ("DecisionTree", decision_tree),
               ("RandomForest", rf)]

## Fit
stacked_learner = StackingRegressor(base_models, cv=N_FOLDS)
stacked_learner = stacked_learner.fit(X[:n_train], Y[:n_train])
y_pred_test = stacked_learner.predict(X[n_train:])
residuals_stacked = Y[n_train:] - y_pred_test
residuals_stacked_train = Y[:n_train] - stacked_learner.predict(X[:n_train])

adaboost = AdaBoostRegressor(n_estimators=100, loss="square", random_state=0)
adaboost = adaboost.fit(X[:n_train], Y[:n_train])
y_pred_test = adaboost.predict(X[n_train:])
residuals_adaboost = Y[n_train:] - y_pred_test
residuals_adaboost_train = Y[:n_train] - adaboost.predict(X[:n_train])

## Predict on entire dataset
y_pred = stacked_learner.predict(X)
df = pd.DataFrame.from_dict({
    "state": data.state,
    "population": data.population,
    "value": y_pred
})
                           max_features='sqrt',
                           max_depth=5,
                           oob_score=True)),
]

stack = StackingRegressor(estimators=estimators,
                          final_estimator=RandomForestRegressor(
                              n_estimators=1400,
                              min_samples_split=2,
                              min_samples_leaf=2,
                              max_features='sqrt',
                              max_depth=5,
                              oob_score=True))

stack.fit(Xtrainv, ytrainv)
stack_train_pred = stack.predict(Xtrainv)
stack_val_pred = stack.predict(Xtestv)
stack_test_pred = stack.predict(Xtest)

stack_train_mse = mean_squared_error(ytrainv, stack_train_pred)
stack_val_mse = mean_squared_error(ytestv, stack_val_pred)
stack_test_mse = mean_squared_error(ytest, stack_test_pred)

print("RMSE using StackRegressor:\t{}\t{}\t{}\n".format(
    np.sqrt(stack_train_mse), np.sqrt(stack_val_mse), np.sqrt(stack_test_mse)))

df_rf = pd.DataFrame({'Actual': ytest, 'Predicted': stack_test_pred})
fig1 = pp.figure(figsize=(8, 6))
df_rf.head(n=300).plot()
pp.legend()
pp.title("StackRegressor Actual v/s Predicted Annual Rainfall")
Example #16
#stacking regressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import StackingRegressor

estimator = [('RF',
              RandomForestRegressor(random_state=4,
                                    n_estimators=700,
                                    min_samples_leaf=3,
                                    max_features=7,
                                    min_samples_split=15,
                                    warm_start=True)),
             ('KNN', KNeighborsRegressor(n_neighbors=7))]
ms = StackingRegressor(estimators=estimator,
                       final_estimator=LinearRegression()).fit(TrainX, TrainY)
Testpred = ms.predict(TestX)
Testpred = np.exp(Testpred)
STRMSE = np.sqrt(np.mean((TestY - Testpred)**2))

#out
FinalTest.drop(["Upvotes_log"], axis=1, inplace=True)
finalpred = ms.predict(FinalTest)
finalpred = np.exp(finalpred)
submission = pd.DataFrame({"ID": ids, "Upvotes": finalpred})
submission.to_csv("uppy03logtrGCVST.csv", index=False)

#grid search
#GridX,DX, GridY, DY=train_test_split(TrainAll[featuresnames], depdnt, train_size=0.10, random_state=4)
#
#from sklearn.model_selection import GridSearchCV
#parameters={"n_estimators":range(100,800,100), "min_samples_leaf":range(1,20,2), "min_samples_split":range(5,20,5), "max_features":range(1,5,1)}
Example #17
    Used to squeeze out the last bit of model performance.
    Can cause overfitting (especially when the dataset is small).
    Takes a long time to train.
"""
from sklearn.ensemble import StackingRegressor

stack_models = [
    ('elasticnet', poly_pipeline),
    ('randomforest', rfr),
    ('gbr', gbr),
    ('lgbm', lgbm),
]

stack_reg = StackingRegressor(stack_models, final_estimator=xgb, n_jobs=-1)
stack_reg.fit(x_train, y_train)
stack_pred = stack_reg.predict(x_test)
mse_eval('Stacking Ensemble', stack_pred, y_test)

## Weighted Blending
"""
Compute the final output as a weighted sum of each model's predictions.
    Adjust the per-model weights to shape the final output.
    The weights should sum to 1.0.
"""

final_outputs = {
    'elasticnet': poly_pred,
    'randomforest': rfr_pred,
    'gbr': gbr_pred,
    'xgb': xgb_pred,
    'lgbm': lgbm_pred,
}
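
# A minimal weighted-blending sketch following the rules above. The weights
# below are hypothetical placeholders; only the weighted-sum logic (weights
# summing to 1.0) comes from the description.
weights = {'elasticnet': 0.2, 'randomforest': 0.2, 'gbr': 0.2,
           'xgb': 0.2, 'lgbm': 0.2}

blended_pred = sum(w * final_outputs[name] for name, w in weights.items())
mse_eval('Weighted Blending', blended_pred, y_test)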
Example #18
# save the model to disk
filename = 'ensemble_model'
pickle.dump(model, open(filename, 'wb'))
 


# In[78]:

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
yhat = loaded_model.predict(test_X)


# In[79]:

yhat = model.predict(test_X)


# In[80]:

#Model Evaluation
print('R^2:',metrics.r2_score(test_y, yhat))
#print('Adjusted R^2:',1 - (1-metrics.r2_score(train_y, y_pred))*(len(train_y)-1)/(len(train_y)-train_X.shape[1]-1))
print('MAE:',metrics.mean_absolute_error(test_y, yhat))
print('MAPE:',mean_absolute_percentage_error(test_y, yhat))
print('MSE:',metrics.mean_squared_error(test_y, yhat))
print('RMSE:',np.sqrt(metrics.mean_squared_error(test_y, yhat)))


# In[81]:
Example #19
rf = RandomForestRegressor(n_jobs=-1,
                           max_depth=75,
                           n_estimators=900,
                           random_state=0)

mlp = MLPRegressor(hidden_layer_sizes=(100, 60, 40, 20),
                   activation='relu',
                   solver='lbfgs',
                   alpha=0.0001,
                   verbose=False,
                   max_iter=400)

stacking = StackingRegressor(estimators=[("mlp", mlp), ("randomForest", rf)],
                             n_jobs=-1)
stacking.fit(X_train, y_train)
y_pred = stacking.predict(X_test)
y_pred = np.clip(y_pred, 0, None)  # zero out negative predictions
merged_pred = []
merged_pred.append(pd.Series(y_pred, name='pred_rf' + str(1)))
df_test = Reader.read_data('test.csv')
df_train = Reader.read_data('train.csv')
X, y = Reader.select_train_columns(df_train)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

df_test = Reader.select_train_columns(df_test)[0]
df_test['weather_4'] = 0
Example #20
    y_pred_et = regr_et.predict(x_val_scaled)
    rmse_et = np.sqrt(mean_squared_error(y_val_scaled, y_pred_et))
    all_rmse.iloc[i - 1, 3] = rmse_et

    # Gradient Boosting
    regr_gbr = GradientBoostingRegressor(n_estimators=100, random_state=0)
    regr_gbr.fit(x_train_scaled, y_train_scaled)
    y_pred_gbr = regr_gbr.predict(x_val_scaled)
    rmse_gbr = np.sqrt(mean_squared_error(y_val_scaled, y_pred_gbr))
    all_rmse.iloc[i - 1, 4] = rmse_gbr

    # Stacking
    estimators = [('lr', RidgeCV()), ('svr', LinearSVR(random_state=0))]
    regr_sr = StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor(n_estimators=100, random_state=0))
    regr_sr.fit(x_train_scaled, y_train_scaled)
    y_pred_sr = regr_sr.predict(x_val_scaled)
    rmse_sr = np.sqrt(mean_squared_error(y_val_scaled, y_pred_sr))
    all_rmse.iloc[i - 1, 5] = rmse_sr

    # Voting
    r1 = LinearRegression()
    r2 = RandomForestRegressor(n_estimators=100, random_state=0)
    regr_vr = VotingRegressor([('lr', r1), ('rf', r2)])
    regr_vr.fit(x_train_scaled, y_train_scaled)
    y_pred_vr = regr_vr.predict(x_val_scaled)
    rmse_vr = np.sqrt(mean_squared_error(y_val_scaled, y_pred_vr))
    all_rmse.iloc[i - 1, 6] = rmse_vr

    # Histogram-based Gradient Boosting
    regr_hgbr = HistGradientBoostingRegressor(random_state=0)
    regr_hgbr.fit(x_train_scaled, y_train_scaled)
Example #21
    def _BuildRegrModel(self, y, X):
        """Train an ensemble regression model and assess its performance.

        Start by splitting y and X into train and test samples. Then create three regressors,
        namely a Random Forest, a Ridge and an SVM regressor, and tune their hyperparameters
        using random search with cross-validation. After updating their hyperparameters, stack
        the three regressors using an ElasticNet linear regression model and fit the ensemble
        model to the train sample. Finally, calculate its performance using the test sample and return
        both the ensemble model and the calculated metrics.

        Arguments:
            y {numpy.ndarray} -- The response variable (i.e. the LST data)
            X {numpy.ndarray} -- The explanatory variables (i.e. the LST predictors)

        Returns:
            sklearn.ensemble._stacking.StackingRegressor -- The ensemble regression model
            tuple -- A tuple with the regression performance metrics
        """

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.regr_test_size, random_state=self.SEED)

        regressors = [
            ("random forest",
             RandomForestRegressor(random_state=self.SEED,
                                   n_jobs=self.N_JOBS)),
            ("ridge", Ridge(random_state=self.SEED)),
            ("svr", SVR()),
        ]

        hyperparam_distributions = {
            "random forest": {
                "max_depth": stats.randint(5, 100),
                "n_estimators": stats.randint(30, 800),
                "min_samples_leaf": stats.randint(2, 20),
                "min_samples_split": stats.randint(2, 50),
            },
            "svr": {
                "kernel": ["rbf", "poly", "sigmoid", "linear"],
                "degree": stats.randint(2, 7),
                "epsilon": stats.uniform(0.05, 5.0),
                "C": stats.uniform(0.0, 25.0),
            },
            "ridge": {
                "alpha": stats.uniform(0.0001, 1.0)
            },
        }

        for name, regressor in regressors:
            print(f"{f'    Tuning the {name} hyperparameters...':<50}", end="")
            hyperparam_candidates = RandomizedSearchCV(
                regressor,
                param_distributions=[hyperparam_distributions[name]],
                scoring="r2",
                random_state=self.SEED,
                n_jobs=self.N_JOBS,
                n_iter=self.N_RANDOM_SEARCHES,
                verbose=0,
            ).fit(X_train, y_train)
            print(
                f"Done [CV R2 score = {hyperparam_candidates.best_score_:0.2f}]"
            )
            regressor.set_params(**hyperparam_candidates.best_params_)

        ensemble_regressor = StackingRegressor(
            regressors,
            final_estimator=ElasticNetCV(
                l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1.0],
                cv=10,
                n_jobs=self.N_JOBS,
                random_state=self.SEED,
            ),
            n_jobs=self.N_JOBS,
            passthrough=True,
        )

        try:
            ensemble_regressor.fit(X_train, y_train)
        except ValueError as err:
            raise ValueError(
                f"Error in _BuildRegrModel: Unable to fit ensemble regression model. {err}"
            ) from err

        # Assess the model performance using the test data
        y_pred = ensemble_regressor.predict(X_test)

        #y_pred = regressors[1][1].predict(X_test)
        regr_metrics = (
            metrics.r2_score(y_test, y_pred),
            metrics.explained_variance_score(y_test, y_pred),
            metrics.max_error(y_test, y_pred),
            metrics.mean_absolute_error(y_test, y_pred),
            metrics.mean_squared_error(y_test, y_pred),
            metrics.median_absolute_error(y_test, y_pred),
        )

        return ensemble_regressor, regr_metrics
Example #22
import xgboost

from sklearn.linear_model import Lasso

estimators = [('forest',
               RandomForestRegressor(n_estimators=500, random_state=42)),
              ('lr', CatBoostRegressor(iterations=100)),
              # keyword argument: XGBRegressor rejects positional parameters;
              # n_estimators is the assumed intent for the bare 350
              ('xgb', xgboost.XGBRegressor(n_estimators=350))]
reg = StackingRegressor(estimators=estimators, final_estimator=Lasso())

reg.fit(X_train, y_train)

from sklearn.metrics import mean_squared_error

mean_squared_error(reg.predict(X_test), y_test)

submission = sample.copy()
submission['PE'] = reg.predict(test.drop(columns=['PE']))
submission.to_csv('sklearn_stack4.csv', index=True)

monitor = EarlyStopping(monitor='val_loss',
                        min_delta=1e-3,
                        patience=5,
                        verbose=1,
                        mode='auto',
                        restore_best_weights=True)
history2 = model2.fit(X_train,
                      y_train,
                      validation_data=(X_test, y_test),
                      epochs=1000,
                      callbacks=[monitor])

from keras.layers import Dense, Dropout
Example #23
                   verbose=False,
                   max_iter=400)
rf = RandomForestRegressor(n_jobs=-1,
                           max_depth=25,
                           n_estimators=900,
                           random_state=0)
# adaknn = AdaBoostRegressor(base_estimator=knn, random_state=0, n_estimators=9)
# 'base_estimator' was renamed to 'estimator' in scikit-learn 1.2
bagdt = BaggingRegressor(estimator=dt, n_estimators=300, random_state=0)
# rf.fit(X_train,y_train)
# pred=rf.predict(X_test)
# -------------------- Stacking voting -----------------------------
stacking = StackingRegressor(estimators=[('bagdt', bagdt), ("mlp", mlp),
                                         ("randomForest", rf)],
                             n_jobs=-1)
stacking.fit(X, y)
y_pred_stacking = stacking.predict(df_test)
print(y_pred_stacking)

# ------------------ Predict the registered ones -------------------------
# knn = KNeighborsRegressor(n_jobs=-1, n_neighbors=2, weights='distance', p=1)
dt = DecisionTreeRegressor(random_state=0)
mlp = MLPRegressor(hidden_layer_sizes=(100, 60, 40, 20),
                   activation='relu',
                   solver='lbfgs',
                   alpha=0.0001,
                   verbose=False,
                   max_iter=400)
rf = RandomForestRegressor(n_jobs=-1,
                           max_depth=25,
                           n_estimators=900,
                           random_state=0)
Example #24
estimators = [('ridge', RidgeCV()), ('lasso', LassoCV(random_state=42)),
              ('svr', SVR(C=1, gamma=1e-6))]

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
reg = StackingRegressor(
    estimators=estimators,
    final_estimator=GradientBoostingRegressor(random_state=42))

from sklearn.datasets import load_boston  # removed in scikit-learn 1.2; use fetch_california_housing on newer versions
X, y = load_boston(return_X_y=True)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)

import matplotlib.pyplot as plt

plt.figure()
plt.plot(y_test[:30], 'gd', label='Original')
plt.plot(y_pred[:30], 'b^', label='Stacking Regressor')
plt.show()

from sklearn.metrics import r2_score
print('R2 score: {:.2f}'.format(r2_score(y_test, y_pred)))

# For multiple stacking layers
final_layer = StackingRegressor(estimators=[
    ('rf', RandomForestRegressor(random_state=42)),
    ('gbrt', GradientBoostingRegressor(random_state=42))
],
                                final_estimator=RidgeCV())
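
# The snippet breaks off here. A plausible completion, mirroring the
# multi-layer stacking example in the scikit-learn docs and reusing the
# estimators list defined above, would wire final_layer in as the outer
# final_estimator:
multi_layer_regressor = StackingRegressor(estimators=estimators,
                                          final_estimator=final_layer)
multi_layer_regressor.fit(X_train, y_train)
print('Multi-layer R2 score: {:.2f}'.format(
    r2_score(y_test, multi_layer_regressor.predict(X_test))))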
Example #25
import xgboost

from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Lasso

# knnr_scld = make_pipeline(StandardScaler(), ...)  # left truncated in the original

estimators = [
    ('forest', RandomForestRegressor(n_estimators=1000, random_state=42)),
    ('lr', CatBoostRegressor(iterations=120)),
    ('xgb', xgboost.XGBRegressor(n_estimators=750))
]
reg = StackingRegressor(
    estimators=estimators,
    # the final estimator is missing in the original; Lasso() is assumed,
    # matching the companion snippet in Example #22
    final_estimator=Lasso())


reg.fit(X_train, y_train)

from sklearn.metrics import mean_squared_error
mean_squared_error(reg.predict(X_test), y_test)

submission = sample.copy()
submission['PE'] = reg.predict(test.drop(columns=['PE']))
submission.to_csv('sklearn_stack2.csv', index=True)


Example #26
# Step 1: Load data
X, y = load_boston(return_X_y=True)

# Step 2: Split data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=40)

# Step 3: Training
regression = StackingRegressor(estimators=[
    ('knn',
     KNeighborsRegressor(n_neighbors=4,
                         weights='distance',
                         leaf_size=1,
                         metric='manhattan')),
    ('dt', GradientBoostingRegressor(max_depth=3, n_estimators=220))
],
                               final_estimator=Ridge(random_state=40),
                               cv=5,
                               n_jobs=-1)
regression.fit(X_train, y_train)
score_train = regression.score(X_train, y_train)
score_test = regression.score(X_test, y_test)
pred_train = regression.predict(X_train)
pred_test = regression.predict(X_test)
rmse_train = np.sqrt(metrics.mean_squared_error(pred_train, y_train))
rmse_test = np.sqrt(metrics.mean_squared_error(pred_test, y_test))
print('RMSE:{:.2f}/{:.2f}'.format(rmse_train, rmse_test))
print('R2Score:{:.2f}/{:.2f}'.format(score_train, score_test))
Example #27
df['IsNew'] = df.YearBuilt.apply(lambda x: 1 if x > 2000 else 0)

df['IsOld'] = df.YearBuilt.apply(lambda x: 1 if x < 1946 else 0)

df.drop('MiscFeature', axis=1, inplace=True)

# ------------------------------- #

df['Age'] = df['YrSold'] - df['YearBuilt']

df['BsmtTotalBathRooms'] = df['BsmtFullBath'] + df['BsmtHalfBath']

df['AbvGradeTotalBathRooms'] = df['FullBath'] + df['HalfBath']

df['Total Rooms'] = df['BedroomAbvGr'] + df['BsmtFullBath'] + df['BsmtHalfBath'] + df['FullBath'] + df['HalfBath'] \
                    + df['TotRmsAbvGrd'] + df['KitchenAbvGr']

stack.fit(X, y)
# scale the test features once, using the scaler fit on the training data
# (fitting and transforming them twice would distort the predictions)
test = scale.transform(df[Importances.nlargest(int(best_col)).index])

pred = stack.predict(test)

sub['SalePrice'] = pred
sub.to_csv('submission_2.csv', index=False)
# the score is around RMSE(0.3400) on Kaggle

# ------- Plot best cols ------- #
plt.figure(figsize=(20, 15))
Importances.nlargest(int(best_col)).plot(kind='barh')
plt.show()
Example #28
Y = vectorizedData[:, -1]
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

baseModels = [('ridgeRegressor', linear_model.Ridge(alpha=0.01)),
              ('randomForestRegressor',
               RandomForestRegressor(max_depth=10,
                                     random_state=0,
                                     n_estimators=15,
                                     max_features=0.5)),
              ('supportVectorRegressor', svm.SVR(C=10, epsilon=0.5))]
stackedRegressor = StackingRegressor(estimators=baseModels)
stackedRegressor.fit(X_train, Y_train)
trainingError = np.mean((stackedRegressor.predict(X_train) - Y_train)**2)
print("Training Error: %.6f" % trainingError)
Y_predict_unscaled = stackedRegressor.predict(X_test)
testingError = np.mean((Y_predict_unscaled - Y_test)**2)
print("Testing Error: %.6f" % testingError)
meanScore = np.mean(imdbScores)
standDeviation = np.std(imdbScores)
Y_predict = Y_predict_unscaled * standDeviation + meanScore
errorsAllowed = [
    0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7,
    0.75, 0.8, 0.85, 0.9, 0.95, 1.0, 1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35,
    1.4, 1.45, 1.5
]
predictionAccuracyList = []
for errorAllowed in errorsAllowed:
    numRightPredicted = 0
Example #29
                           n_estimators=1000,
                           max_depth=7,
                           min_child_weight=1,
                           gamma=0,
                           subsample=0.95,
                           colsample_bytree=0.55,
                           reg_alpha=0.00001,
                           nthread=1,
                           seed=0)

reg = StackingRegressor(estimators=estims,
                        final_estimator=f_estimator,
                        passthrough=True)
reg.fit(X_train_betta_f, y_train_betta)

y_train_pred_betta = reg.predict(X_train_betta_f)
y_test_pred_betta = reg.predict(X_test_betta_f)

r2_train_betta = r2_score(y_train_betta, y_train_pred_betta)
rmse_train_betta = mean_squared_error(y_train_betta,
                                      y_train_pred_betta,
                                      squared=False)

print("R2: {0:.3f}, RMSE: {1:.5f}".format(
    r2_train_betta, rmse_train_betta))  # R2: 0.992, RMSE: 0.00543
# the previous stacking run gave R2: 0.978, RMSE: 0.00904

r2_test_betta = r2_score(y_test_betta, y_test_pred_betta)
rmse_test_betta = mean_squared_error(y_test_betta,
                                     y_test_pred_betta,
                                     squared=False)
Example #30
plt.title("Voting Ensemble Regression")
plt.legend()
plt.show()

# Heterogeneous Ensembles(Stacking)
models = [("LR", lr), ("DT", regr_tree), ("SVR", svr)]

# instead of choosing model weights, stacking uses a meta learner
# models training happens twice. once for base models, once for meta learner
meta_learner_reg = LinearRegression()

s_reg = StackingRegressor(estimators=models, final_estimator=meta_learner_reg)

s_reg.fit(x_train, y_train[:, 0])

y_pred = s_reg.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# Plot stacking regression prediction line over data
x_domain = np.linspace(x_train.min(), x_train.max(), 100).reshape(-1, 1)

# predict() returns a 1-D array; reshape so the scalers get 2-D input
y_pred_rescaled = y_scaler.inverse_transform(s_reg.predict(x_domain).reshape(-1, 1))
x_rescaled = x_scaler.inverse_transform(x_domain)

plt.figure()
plt.scatter(X, y)
plt.plot(x_rescaled, y_pred_rescaled, color='red', label='predictions')
plt.xlabel("LotArea in m$^2$")
plt.ylabel("SalePrice in ZAR")
plt.title("Stacking Ensemble Regression")