print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
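# Note: `results`, `names`, and `scores` above come from an evaluation loop
# that is not shown in this excerpt. A minimal sketch of such a loop follows;
# the `models` dict and the `evaluate_model` helper are illustrative assumptions:
from numpy import mean, std
from sklearn.model_selection import RepeatedKFold, cross_val_score

def evaluate_model(model, X, y):
    # score a single model with repeated k-fold cross-validation (negative MAE)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                           cv=cv, n_jobs=-1)

results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)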

# define dataset
X, y = make_regression(n_samples=1000,
                       n_features=20,
                       n_informative=15,
                       noise=0.1,
                       random_state=1)
# define the base models
level0 = list()
level0.append(('knn', KNeighborsRegressor()))
level0.append(('cart', DecisionTreeRegressor()))
level0.append(('svm', SVR()))
# define meta learner model
level1 = LinearRegression()
# define the stacking ensemble
model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
# fit the model on all available data
model.fit(X, y)
# make a prediction for one example
data = [[
    0.59332206, -0.56637507, 1.34808718, -0.57054047, -0.72480487, 1.05648449,
    0.77744852, 0.07361796, 0.88398267, 2.02843157, 1.01902732, 0.11227799,
    0.94218853, 0.26741783, 0.91458143, -0.72759572, 1.08842814, -0.61450942,
    -0.69387293, 1.69169009
]]
yhat = model.predict(data)
print('Predicted Value: %.3f' % (yhat[0]))
Example #2
    regr_mlp: MLPRegressor(random_state=random_state),
    regr_lin_svr: LinearSVR(epsilon=1.5, random_state=random_state),
    regr_ridge: Ridge(alpha=1, solver='cholesky'),
    # regr_lasso: Lasso(alpha=0.1),
    # regr_elastic_net: ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=random_state),
    regr_adaboost: AdaBoostRegressor(DecisionTreeRegressor(random_state=random_state, max_depth=10), n_estimators=200,
                                     learning_rate=0.5, random_state=random_state),
    regr_bagging: BaggingRegressor(DecisionTreeRegressor(random_state=random_state, max_depth=10), n_estimators=100,
                                   max_samples=1.0, bootstrap=True,
                                   n_jobs=-1),
    regr_voting: VotingRegressor(
        estimators=[(regr_ridge, Ridge(alpha=1, solver='cholesky')), (regr_forest, RandomForestRegressor(n_jobs=-1)),
                    (regr_mlp, MLPRegressor())], n_jobs=-1),
    regr_grad_boost: GradientBoostingRegressor(n_estimators=150, random_state=random_state),
    regr_stacking: StackingRegressor(
        estimators=[(regr_ridge, Ridge(alpha=1, solver='cholesky')), (regr_lin_svr, LinearSVR(epsilon=1.5)),
                    (regr_mlp, MLPRegressor())],
        final_estimator=RandomForestRegressor(n_jobs=-1, n_estimators=10, random_state=11), n_jobs=-1)
}
regr_dict_regularized = {
    regr_forest: RandomForestRegressor(n_jobs=-1, random_state=random_state),
    regr_dtree: DecisionTreeRegressor(random_state=random_state, max_depth=3),
    regr_lin: LinearRegression(n_jobs=-1),
    regr_mlp: MLPRegressor(random_state=random_state),
    regr_lin_svr: LinearSVR(epsilon=1.5, random_state=random_state),
    regr_ridge: Ridge(alpha=1, solver='cholesky'),
    # regr_lasso: Lasso(alpha=0.1),
    # regr_elastic_net: ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=random_state),
    regr_adaboost: AdaBoostRegressor(DecisionTreeRegressor(random_state=random_state, max_depth=3), n_estimators=200,
                                     learning_rate=0.5, random_state=random_state),
    regr_bagging: BaggingRegressor(DecisionTreeRegressor(random_state=random_state, max_depth=3), n_estimators=100,
                                   max_samples=1.0, bootstrap=True,
def baseline(showPlot):
    np.set_printoptions(precision=3, suppress=True)

    full_df = pd.read_csv(
        '../data/COVID-19_Combined_Mobility_And_Infection_Data_Moving_Avg_updated_lin_int.csv',
        infer_datetime_format=True,
        parse_dates=True)

    #=========================FIND BEST OFFSET========================================

    by_state = full_df['sub_region_1'].unique()
    linear_scores_by_state = {}
    lin_avg = 0
    log_scores_by_state = {}
    log_avg = 0
    lin_corr_avg = 0
    log_corr_avg = 0

    for region in by_state:

        temp = full_df.loc[(full_df['sub_region_1'] == region)]

        bestLinearCorr = 0
        bestLogCorr = 0
        bestLinearOffset = -1
        bestLogOffset = -1
        bestLinearData = 0
        bestLogData = 0

        correlationScores = []
        correlationLogScores = []

        for offset in range(30):
            #Shift CDC data by offset value - this is going to create some problems because we'll have to do it for each state...
            cdc_dataframe = temp['num_cases'].shift(periods=offset,
                                                    fill_value=0)

            #Build new full data array
            mobility_dataframe = temp.drop(
                columns=['date', 'sub_region_1', 'num_cases'])
            full_dataframe = pd.concat([cdc_dataframe, mobility_dataframe],
                                       axis=1)
            #full_dataframe['originalCases'] = temp['num_cases'] #preserve original case values as additional feature
            full_dataframe = full_dataframe.loc[(
                full_dataframe['num_cases'] !=
                0)]  #remove rows with zero cases

            #Compute linear and logarithmic correlations
            linearCorr = full_dataframe.corr()
            linearCorr = linearCorr.to_numpy()[
                0,
                1:]  #Take only correlations between 'cases' and mobility data

            logData = np.log(full_dataframe + 1 -
                             np.min(full_dataframe.to_numpy()))
            logCorr = logData.corr()
            logCorr = logCorr.to_numpy()[
                0,
                1:]  #Take only correlations between 'cases' and mobility data

            #print("Offset:", offset, "Correlation:    ", linearCorr)
            #print("           Log Correlation:", logCorr)

            #Save best values
            if np.linalg.norm(linearCorr) > np.linalg.norm(bestLinearCorr):
                bestLinearCorr = linearCorr
                bestLinearOffset = offset
                bestLinearData = full_dataframe

            if np.linalg.norm(logCorr) > np.linalg.norm(bestLogCorr):
                bestLogCorr = logCorr
                bestLogOffset = offset
                bestLogData = logData

            correlationScores.append(np.linalg.norm(linearCorr))
            correlationLogScores.append(np.linalg.norm(logCorr))

        if showPlot:
            plt.plot(correlationScores)
            plt.xlabel("Cases offset (days)")
            plt.ylabel("Norm of correlation vector")
            plt.title("Linear correlation vs. data offset")
            plt.show()
            plt.plot(correlationLogScores)
            plt.xlabel("Cases offset (days)")
            plt.ylabel("Norm of correlation vector")
            plt.title("Logarithmic correlation vs. data offset")
            plt.show()

        print("Best Full Correlation:", bestLinearCorr)
        print("Best Full Correlation Norm:", np.linalg.norm(bestLinearCorr))
        print("Best Full Offset:", bestLinearOffset)

        print("Best Log Correlation:", bestLogCorr)
        print("Best Log Correlation Norm:", np.linalg.norm(bestLogCorr))
        print("Best Log Offset:", bestLogOffset)

        linear_scores_by_state[region] = bestLinearOffset
        log_scores_by_state[region] = bestLogOffset
        lin_avg += bestLinearOffset
        log_avg += bestLogOffset
        lin_corr_avg += np.linalg.norm(bestLinearCorr)
        log_corr_avg += np.linalg.norm(bestLogCorr)
    print(linear_scores_by_state)
    print(log_scores_by_state)
    print(lin_avg / len(by_state))
    print(log_avg / len(by_state))
    print(lin_corr_avg / len(by_state))
    print(log_corr_avg / len(by_state))

    bestLinearOffset = lin_avg // len(by_state)
    bestLogOffset = log_avg // len(by_state)

    linearMSE_by_state = []
    logMSEAdj_by_state = []
    linearCasesMSE_by_state = []
    logCasesMSE_by_state = []
    logisticMSE_by_state = []
    dataNoise_by_state = []
    arimaMSE_by_state = []
    gaussMSE_by_state = []
    for s in range(len(by_state)):

        #=========================BEGIN MODEL FITTING========================================

        #Get the data for that state and shift it
        bestLinearData = pd.DataFrame()
        bestLogDf = pd.DataFrame()
        temp = full_df.loc[(full_df['sub_region_1'] == by_state[s])]
        temp = temp.loc[(temp['date'] < '2020-11-30')]
        #Shift CDC data by offset value
        cdc_lin_dataframe = temp['num_cases'].shift(periods=bestLinearOffset,
                                                    fill_value=0)
        mobility_lin_dataframe = temp.drop(
            columns=['date', 'sub_region_1', 'num_cases'])
        all_lin_states = pd.concat([cdc_lin_dataframe, mobility_lin_dataframe],
                                   axis=1)
        all_lin_states = all_lin_states.loc[(all_lin_states['num_cases'] >
                                             0)]  #remove rows with zero cases
        bestLinearData = bestLinearData.append(all_lin_states)
        #Shift CDC data by offset value
        cdc_log_dataframe = temp['num_cases'].shift(periods=bestLogOffset,
                                                    fill_value=0)
        mobility_log_dataframe = temp.drop(
            columns=['date', 'sub_region_1', 'num_cases'])
        all_log_states = pd.concat([cdc_log_dataframe, mobility_log_dataframe],
                                   axis=1)
        all_log_states = all_log_states.loc[(all_log_states['num_cases'] >
                                             0)]  #remove rows with zero cases
        bestLogDf = bestLogDf.append(all_log_states)
        bestLogData = np.log(bestLogDf + 1 - np.min(bestLogDf.to_numpy()))

        linearMSE = []
        logMSEAdj = []
        linearCasesMSE = []
        logCasesMSE = []
        logisticMSE = []
        dataNoise = []
        arimaMSE = []
        gaussMSE = []

        #Convert data to numpy
        linearCasesOnly = bestLinearData['num_cases'].to_numpy()
        logCasesOnly = np.log(linearCasesOnly + 1)
        bestLinearData = bestLinearData.to_numpy()
        bestLogData = bestLogData.to_numpy()

        stride = 3  #trains a new model every {stride} days
        maxEpoch = 100

        for t in range(
            (min(bestLinearData.shape[0], bestLogData.shape[0]) - 90) //
                stride):
            print("Training model:", t)
            print("State:", by_state[s])

            #Linear Mobility Data
            linearTrainX = bestLinearData[t * stride:t * stride + 60, 1:]
            linearTrainy = bestLinearData[t * stride:t * stride + 60, :1]
            linearTestX = bestLinearData[t * stride + 60:t * stride + 90, 1:]
            linearTesty = bestLinearData[t * stride + 60:t * stride + 90, :1]

            #Logarithmic Mobility Data
            logTrainX = bestLogData[t * stride:t * stride + 60, 1:]
            logTrainy = bestLogData[t * stride:t * stride + 60, :1]
            logTestX = bestLogData[t * stride + 60:t * stride + 90, 1:]
            logTesty = bestLogData[t * stride + 60:t * stride + 90, :1]

            #Cases-only data
            linearCasesTrainX = linearCasesOnly[t * stride:t * stride + 60]
            logCasesTrainX = logCasesOnly[t * stride:t * stride + 60]
            linearCasesTestX = linearCasesOnly[t * stride + 60:t * stride + 90]
            logCasesTestX = logCasesOnly[t * stride + 60:t * stride + 90]

            timeTrain = np.arange(1, 61).reshape(-1, 1)
            timeTest = np.arange(61, 91).reshape(-1, 1)

            #Uncomment to add time data to mobility dataset
            #linearTrainX = np.hstack((linearTrainX, timeTrain))
            #logTrainX = np.hstack((logTrainX, timeTrain))
            #linearTestX = np.hstack((linearTestX, timeTest))
            #logTestX = np.hstack((logTestX, timeTest))

            #fit linear model
            linear_model = RidgeCV(cv=3).fit(linearTrainX, linearTrainy)

            predict = linear_model.predict(linearTestX)
            linearMSE.append(np.abs(predict - linearTesty) / linearTesty)

            #fit log model
            linear_model = RidgeCV(cv=3).fit(logTrainX, logTrainy)

            predict = linear_model.predict(logTestX)
            predictAdj = np.exp(predict) - 1 + np.min(
                bestLogDf.to_numpy())  #invert the earlier log transform back to raw case numbers
            logMSEAdj.append(np.abs(predictAdj - linearTesty) / linearTesty)

            #fit linear cases only model
            cases_model = RidgeCV(cv=3).fit(timeTrain, linearCasesTrainX)
            if showPlot:
                visualize_cases(cases_model, timeTrain, linearCasesTrainX,
                                timeTest, linearCasesTestX)

            predict = cases_model.predict(timeTest)
            linearCasesMSE.append(
                np.abs(predict - linearCasesTestX) / linearCasesTestX)

            #fit log cases only model
            cases_model = RidgeCV(cv=3).fit(np.log(timeTrain), logCasesTrainX)
            if showPlot:
                visualize_cases(cases_model, np.log(timeTrain), logCasesTrainX,
                                np.log(timeTest), logCasesTestX)

            predict = cases_model.predict(np.log(timeTest))
            predictAdj = np.exp(
                predict) - 1  #convert from log back to raw case number
            logCasesMSE.append(
                np.abs(predictAdj - linearCasesTestX) / linearCasesTestX)

            #fit logistic model
            logistic_model, cov = optimize.curve_fit(
                logisticDerivative,
                timeTrain.reshape(linearCasesTrainX.shape),
                linearCasesTrainX,
                p0=[4 * np.max(linearCasesTrainX), 60, 1 / 30],
                maxfev=10000,
                bounds=(np.array([1, 0, 0]), np.array([20000, np.Inf,
                                                       np.Inf])))
            if showPlot:
                visualize_logistic(logistic_model, timeTrain,
                                   linearCasesTrainX, timeTest,
                                   linearCasesTestX)

            predictLogistic = logisticDerivative(
                timeTest.reshape(linearCasesTestX.shape), logistic_model[0],
                logistic_model[1], logistic_model[2])
            logisticMSE.append(
                np.abs(predictLogistic - linearCasesTestX) / linearCasesTestX)

            predict = logisticDerivative(
                timeTrain.reshape(linearCasesTrainX.shape), logistic_model[0],
                logistic_model[1], logistic_model[2])
            dataNoise.append(
                np.mean(
                    np.abs(predict - linearCasesTrainX) / linearCasesTrainX))

            #fit stacking regressor
            estimators = [('lr', RidgeCV()),
                          ('svr', LinearSVR(random_state=42)),
                          ('rf',
                           RandomForestRegressor(n_estimators=10,
                                                 random_state=42))]  #base estimators must be regressors for StackingRegressor
            reg = StackingRegressor(estimators=estimators,
                                    final_estimator=GaussianProcessRegressor(
                                        kernel=DotProduct() + WhiteKernel(),
                                        random_state=0))
            stacking_model = reg.fit(timeTrain, linearCasesTrainX)
            if showPlot:
                visualize_cases(stacking_model, timeTrain, linearCasesTrainX,
                                timeTest, linearCasesTestX)

            predict = stacking_model.predict(timeTest)
            linearCasesMSE.append(
                np.abs(predict - linearCasesTestX) / linearCasesTestX)

            #fit ARIMA
            #Perform grid search to determine ARIMA Order
            '''stepwise_fit = auto_arima(linearCasesTrainX, start_p = 1, start_q = 1, 
                            max_p = 3, max_q = 3, m = 7, 
                            start_P = 0, seasonal = True, 
                            d = None, D = 1, trace = True, 
                            error_action ='ignore',   # we don't want to know if an order does not work 
                            suppress_warnings = True,  # we don't want convergence warnings 
                            stepwise = True)           # set to stepwise 
            stepwise_fit.summary()'''

            model = SARIMAX(linearCasesTrainX,
                            initialization='approximate_diffuse',
                            order=(2, 0, 0),
                            seasonal_order=(2, 1, 0, 7))

            result = model.fit(disp=False)
            if False:
                visualize_ARIMA(result, timeTrain, linearCasesTrainX, timeTest,
                                linearCasesTestX)

            predictArima = result.predict(61, 90, typ='levels')
            arimaMSE.append(
                np.abs(predictArima - linearCasesTestX) / linearCasesTestX)

            #Evaluate other models to use as input to gaussian process
            arima1 = SARIMAX(linearCasesTrainX,
                             initialization='approximate_diffuse',
                             order=(2, 0, 0),
                             seasonal_order=(2, 1, 0, 7)).fit(disp=False)
            arima2 = SARIMAX(linearCasesTrainX,
                             initialization='approximate_diffuse',
                             order=(2, 0, 0),
                             seasonal_order=(2, 1, 1, 7)).fit(disp=False)
            arima3 = SARIMAX(linearCasesTrainX,
                             initialization='approximate_diffuse',
                             order=(1, 1, 0),
                             seasonal_order=(1, 1, 1, 7)).fit(disp=False)
            arima4 = SARIMAX(linearCasesTrainX,
                             initialization='approximate_diffuse',
                             order=(0, 1, 1),
                             seasonal_order=(1, 1, 1, 7)).fit(disp=False)
            arima5 = SARIMAX(linearCasesTrainX,
                             initialization='approximate_diffuse',
                             order=(0, 1, 1),
                             seasonal_order=(2, 1, 0, 7)).fit(disp=False)

            predictLog = cases_model.predict(np.log(timeTrain))  #Log model
            predictAdj = np.exp(
                predictLog) - 1  #convert from log back to raw case number
            predictLogistic = logisticDerivative(
                timeTrain.reshape(linearCasesTrainX.shape), logistic_model[0],
                logistic_model[1], logistic_model[2])  #logistic model
            predictArima1 = arima1.predict(1, 60, typ='levels')
            predictArima2 = arima2.predict(1, 60, typ='levels')
            predictArima3 = arima3.predict(1, 60, typ='levels')
            predictArima4 = arima4.predict(1, 60, typ='levels')
            predictArima5 = arima5.predict(1, 60, typ='levels')

            testLog = cases_model.predict(np.log(timeTest))  #Log model
            testAdj = np.exp(
                testLog) - 1  #convert from log back to raw case number
            testLogistic = logisticDerivative(
                timeTest.reshape(linearCasesTestX.shape), logistic_model[0],
                logistic_model[1], logistic_model[2])  #logistic model
            testArima1 = arima1.predict(61, 90, typ='levels')
            testArima2 = arima2.predict(61, 90, typ='levels')
            testArima3 = arima3.predict(61, 90, typ='levels')
            testArima4 = arima4.predict(61, 90, typ='levels')
            testArima5 = arima5.predict(61, 90, typ='levels')

            #fit gaussian process meta-learner
            gaussTrain = np.array([
                predictLogistic, predictArima1, predictArima2, predictArima3,
                predictArima4, predictArima5
            ]).T
            gaussTest = np.array([
                testLogistic, testArima1, testArima2, testArima3, testArima4,
                testArima5
            ]).T
            reg = GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel(),
                                           random_state=0)
            stacking_model = reg.fit(gaussTrain, linearCasesTrainX)
            predictTrain = stacking_model.predict(gaussTrain)
            predictTest = stacking_model.predict(gaussTest)
            if showPlot:
                visualize_gauss(
                    np.hstack((predictTrain, predictTest)).T, timeTrain,
                    linearCasesTrainX, timeTest, linearCasesTestX)

            gaussMSE.append(
                np.abs(predictTest - linearCasesTestX) / linearCasesTestX)

        #Append to state totals
        linearMSE_by_state.append(
            np.reshape(np.array(linearMSE).mean(axis=0), (30)))
        logMSEAdj_by_state.append(
            np.reshape(np.array(logMSEAdj).mean(axis=0), (30)))
        linearCasesMSE_by_state.append(
            np.reshape(np.array(linearCasesMSE).mean(axis=0), (30)))
        logCasesMSE_by_state.append(
            np.reshape(np.array(logCasesMSE).mean(axis=0), (30)))
        logisticMSE_by_state.append(
            np.reshape(np.array(logisticMSE).mean(axis=0), (30)))
        dataNoise_by_state.append(np.mean(dataNoise))
        arimaMSE_by_state.append(
            np.reshape(np.array(arimaMSE).mean(axis=0), (30)))
        gaussMSE_by_state.append(
            np.reshape(np.array(gaussMSE).mean(axis=0), (30)))
        print("Average logistic Test error:", np.mean(dataNoise))

    #Plot proof-of-concept graph
    if showPlot:
        plt.plot(np.array(linearMSE_by_state).mean(axis=0),
                 label='Mobility (linear, non-temporal)')
        plt.plot(np.array(logMSEAdj_by_state).mean(axis=0),
                 label='Mobility (logarithmic, non-temporal)')
        plt.xlabel("Days in advance to predict")
        plt.ylabel("Percent deviation from true value")
        plt.legend(loc="upper left")
        plt.show()

        #Plot baseline graph
        plt.plot(np.array(linearCasesMSE_by_state).mean(axis=0),
                 label='Cases (linear, temporal)'
                 )  #Don't plot because performance is terrible
        plt.plot(np.array(logCasesMSE_by_state).mean(axis=0),
                 label='Cases (logarithmic temporal)')
        plt.plot(np.array(logisticMSE_by_state).mean(axis=0),
                 label='Cases (logistic temporal)')
        plt.plot(np.array(arimaMSE_by_state).mean(axis=0),
                 label='Cases (ARIMA)')
        plt.plot(np.array(gaussMSE_by_state).mean(axis=0),
                 label='Cases (Gaussian Process meta)')
        plt.xlabel("Days in advance to predict")
        plt.ylabel("Percent deviation from true value")
        plt.legend(loc="upper left")
        plt.show()
    print("Average logistic test error:", np.mean(dataNoise_by_state))
Example #4
def test_stacking_regressor_error(y, params, type_err, msg_err):
    with pytest.raises(type_err, match=msg_err):
        reg = StackingRegressor(**params, cv=3)
        reg.fit(scale(X_diabetes),
                y,
                sample_weight=np.ones(X_diabetes.shape[0]))
#X["LON"]=(X["LON"]-mu_lon)/std_lon

X=pd.concat([X["AGE"],X["LAT"],X["LON"],gender,marital,ethnicity,race,reasoncode],axis=1)
train_X,test_X,train_Y,test_Y=train_test_split(X,Y,test_size=0.3,random_state=123)

from sklearn.datasets import load_diabetes
from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor

#Create an SVR and a RidgeCV model and stack them up to get more accuracy
estimators = [('lr', RidgeCV()),('svr', LinearSVR(random_state=42))]

#Train the model on training data
reg = StackingRegressor(estimators=estimators,final_estimator=RandomForestRegressor(n_estimators=20,random_state=42))
reg.fit(train_X,train_Y)

#Test the model
from sklearn.metrics import mean_squared_error as MSE  #assumed alias for the metric used below
y_pred = reg.predict(test_X)
np.sqrt(MSE(test_Y, y_pred))

#Read the careplan data and clean the data

careplan=pd.read_csv("careplans.csv")
careplan=careplan.dropna(subset=["REASONCODE"])
patients.rename(columns = {'Id':'PATIENT'}, inplace = True) 
data_1=pd.merge(careplan,patients,how='left',on='PATIENT')
data_1["START"]=pd.to_datetime(data_1["START"].str[:10])
data_1["BIRTHDATE"]=pd.to_datetime(data_1["BIRTHDATE"])
data_1["AGE"]=(data_1["START"]-data_1["BIRTHDATE"]).dt.days/365
Example #6
def get_model(data, target, use_ensemble=True):

    params1 = {
        'el__alpha': np.logspace(-5, 2, 30),
        'el__l1_ratio': np.linspace(0, 1, 3),
        'pca__n_components': [2, 5, 10]
    }

    params2 = {
        'rf__n_estimators': range(10, 101, 30),
        'rf__max_depth': [2, 5, 9],
        'pca__n_components': [2, 5, 10]
    }

    params3 = {
        'lgb__learning_rate': np.logspace(-6, 0, 5),
        'lgb__n_estimators': range(10, 101, 30),
        'lgb__max_depth': [6, 9, 12],
        'pca__n_components': [2, 5, 10],
        'lgb__num_leaves': [100]
    }

    rf = Pipeline([('scale', StandardScaler()), ('pca', PCA()),
                   ('rf', RandomForestRegressor())])
    el = Pipeline([('scale', StandardScaler()), ('pca', PCA()),
                   ('el', ElasticNet(max_iter=5000))])
    lgb = Pipeline([('scale', StandardScaler()), ('pca', PCA()),
                    ('lgb', LGBMRegressor())])

    gr_lgb = GridSearchCV(lgb,
                          params3,
                          cv=TimeSeriesSplit(),
                          scoring='neg_mean_squared_error',
                          refit=True)
    gr_lgb.fit(data, target)
    logger.info('Booster params discovered')

    gr_el = GridSearchCV(el,
                         params1,
                         cv=TimeSeriesSplit(),
                         scoring='neg_mean_squared_error',
                         refit=True)
    gr_el.fit(data, target)
    logger.info('ElasticNet params discovered')

    gr_rf = GridSearchCV(rf,
                         params2,
                         cv=TimeSeriesSplit(),
                         scoring='neg_mean_squared_error',
                         refit=True)
    gr_rf.fit(data, target)
    logger.info('RandomForest params discovered')

    res_scores = {
        'elastic': gr_el.best_score_,
        'random_forest': gr_rf.best_score_,
        'lgbm': gr_lgb.best_score_
    }

    res_est = {
        'elastic': gr_el.best_estimator_,
        'random_forest': gr_rf.best_estimator_,
        'lgbm': gr_lgb.best_estimator_
    }
    if use_ensemble:
        estimators = [('elastic', gr_el.best_estimator_),
                      ('random_forest', gr_rf.best_estimator_),
                      ('lgbm', gr_lgb.best_estimator_)]

        stacked = StackingRegressor(estimators=estimators,
                                    final_estimator=RandomForestRegressor(
                                        n_estimators=100, max_depth=3),
                                    passthrough=True)
        stacked.fit(data, target)
        logger.info('Ensemble fitted')
        return stacked
    return res_est[sorted(res_scores, key=lambda x: (-res_scores[x], x))[0]]
Example #7
def test_stacking_regressor_drop_estimator():
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(scale(X_diabetes),
                                                   y_diabetes,
                                                   random_state=42)
    estimators = [('lr', 'drop'), ('svr', LinearSVR(random_state=0))]
    rf = RandomForestRegressor(n_estimators=10, random_state=42)
    reg = StackingRegressor(estimators=[('svr', LinearSVR(random_state=0))],
                            final_estimator=rf,
                            cv=5)
    reg_drop = StackingRegressor(estimators=estimators,
                                 final_estimator=rf,
                                 cv=5)

    reg.fit(X_train, y_train)
    reg_drop.fit(X_train, y_train)
    assert_allclose(reg.predict(X_test), reg_drop.predict(X_test))
    assert_allclose(reg.transform(X_test), reg_drop.transform(X_test))
Example #8
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor

import sklearn.linear_model as sl
models = { "LinearRegression": sl.LinearRegression(),
           "LassoCV" : sl.LassoCV(),
           "ElasticNetCV" : sl.ElasticNetCV(alphas=np.linspace(0.038,0.1,100)),
           "StackingRegressor" : StackingRegressor([('SGD',sl.SGDRegressor()),('GBR', GradientBoostingRegressor())],verbose=1),
           "RidgeCV" : sl.RidgeCV(),
           "SGDRegressor" : sl.SGDRegressor(),
           "Perceptron" : sl.Perceptron()
}

def read_data(train_file, test_file): 
    full_data = pd.read_csv(train_file)
    data_sub = pd.read_csv(test_file)
    full_data["User"] = full_data["User"]-1
    full_data["Movie"] = full_data["Movie"]-1
    data_sub["User"] = data_sub["User"]-1
    data_sub["Movie"] = data_sub["Movie"]-1
    return full_data, data_sub
Example #9
rf = RandomForestRegressor(
    n_estimators=400, random_state=SEED)
# lasso = Lasso(random_state=SEED)
ridge = BayesianRidge()
svr = SVR()
knn = KNeighborsRegressor(27)
dt = DecisionTreeRegressor(max_depth=32)
gbdt = GradientBoostingRegressor(n_estimators=400, random_state=SEED)

base_models = [("lr", lr), ("rf", rf),
               ("ridge", ridge), ("dt", dt),
               ("svr", svr), ("knn", knn),
               ("gbdt", gbdt)]  # , ("mlp", mlp)]
meta_model = LinearRegression()

stacked = StackingRegressor(estimators=base_models,
                            final_estimator=meta_model, n_jobs=6, verbose=2)
try:
    with open("model.pickle", "rb") as m:
        stacked = pickle.load(m)
except FileNotFoundError:
    pass

print("Stacked model baseline RMSE: ", rmse_cv(
    stacked, X_train, y_train,  cv=N_FOLD))


# %%hyper-parameter optimization
# param_grid = {
#     "rf__min_samples_split": [3, 12],
#     # "rf__n_estimators": [100, 400],
#     #   "gbdt__n_estimators": [100, 400],
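`rmse_cv` above is a project-specific helper that is not included in this snippet. A minimal sketch, assuming it returns the mean cross-validated RMSE of a model:

import numpy as np
from sklearn.model_selection import cross_val_score

def rmse_cv(model, X, y, cv=5):
    # cross-validated RMSE, averaged over the folds
    neg_mse = cross_val_score(model, X, y,
                              scoring='neg_mean_squared_error', cv=cv)
    return np.sqrt(-neg_mse).mean()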
Example #10
def model(xTrain, yTrain, xTest, yTest):
    kfold = StratifiedKFold(n_splits=3)

    random_state = 3
    classifiers = []
    classifiers.append(SVR(C=1.0, epsilon=0.2))
    classifiers.append(DecisionTreeRegressor(random_state=random_state))
    classifiers.append(
        RandomForestRegressor(random_state=random_state, n_estimators=100))
    classifiers.append(GradientBoostingRegressor(random_state=random_state))
    classifiers.append(KNeighborsRegressor())

    cv_results = []
    for classifier in classifiers:
        cv_results.append(
            cross_validate(classifier,
                           xTrain,
                           yTrain,
                           scoring='neg_mean_squared_error',
                           cv=3))

    cv_means = []
    cv_std = []
    for cv_result in cv_results:
        cv_means.append(abs(statistics.mean(cv_result['test_score'])))
        cv_std.append(statistics.stdev(cv_result['test_score']))

    cv_res = pd.DataFrame({
        "Mean_Squared_Errors":
        cv_means,
        "Algorithm": [
            "SVR", "Decision Tree", "RandomForest", "GradientBoosting",
            "KNeighboors"
        ]
    })

    #print(cv_res)

    cv_res.plot(kind='bar', x='Algorithm', y='Mean_Squared_Errors')
    plt.show()
    '''
    # Working with BEST
    
    classifiersBest = []
    
    
    bestRFR = RandomForestRegressor(random_state=random_state, max_depth = 10, max_features = 25, min_samples_leaf = 1, n_estimators=100)
    
    bestDTR = DecisionTreeRegressor(random_state=random_state, max_depth = 50, max_features = 25, min_samples_leaf = 1)
    
    bestGBR = GradientBoostingRegressor(random_state=random_state, learning_rate = 0.01, max_depth = 3, max_features = 25, min_samples_leaf = 5)
    
    
    
    # SHOULD GO HERE
    
    bestRFR.fit(xTrain,yTrain)
    bestDTR.fit(xTrain,yTrain)
    bestGBR.fit(xTrain,yTrain)
    
    # added here
    
    classifiersBest.append(bestRFR)
    classifiersBest.append(bestDTR)
    classifiersBest.append(bestGBR)
    
    
    # stacking
    
    estimators = [('RFR', bestRFR), ('DTR', bestDTR), ('GBR', bestGBR)]
    
    reg = StackingRegressor(estimators = estimators, final_estimator=RandomForestRegressor(random_state=random_state, n_estimators=10))
    
    classifiersBest.append(reg)
    
    cv_results_best = []
    for classifier in classifiersBest:
        cv_results_best.append(cross_validate(classifier, xTrain, yTrain, scoring = 'neg_mean_squared_error', cv = 3))
        
    cv_means_best = []
    cv_std_best = []
    for cv_result in cv_results_best:
        cv_means_best.append(abs(statistics.mean(cv_result['test_score'])))
        cv_std_best.append(statistics.stdev(cv_result['test_score']))

    cv_res_best = pd.DataFrame({"Mean_Squared_Errors":cv_means_best,"Algorithm":["RF","DT", "GB", "Ensemble"]})
    
    #print(cv_res)
    
    cv_res_best.plot(kind='bar',x='Algorithm',y='Mean_Squared_Errors')
    plt.show()
    
    # UNTIL HERE
    
    '''

    #Stacking using optimized models and sklearn
    '''
    bestRFR.fit(xTrain,yTrain)
    bestDTR.fit(xTrain,yTrain)
    bestGBR.fit(xTrain,yTrain)
    '''

    #print("Ensemble Score: ", reg.fit(xTrain, yTrain).score(xTest, yTest))
    '''

    #Optimize support vector machine and predict
    SVM = SVC(probability=True)
    svc_grid = {'gamma': [ 0.001, 0.01, 0.1, 1],
                      'C': [1, 10, 50, 100, 250]}

    gsSVM = GridSearchCV(SVM,param_grid = svc_grid, cv=kfold, n_jobs = -1, scoring="accuracy", verbose = 1)
    gsSVM.fit(xTrain,yTrain)
    bestSVM = gsSVM.best_estimator_
    print(bestSVM.get_params())

    yHat = bestSVM.predict(xTest)
    fpr, tpr, _ = roc_curve(yTest, yHat)
    plt.plot(fpr, tpr, label="SVM")
    print('SVM Accuracy Score: ' + str(accuracy_score(yHat, yTest)))
    
    '''

    scoreArr = []

    #Optimize random forest and predict
    RFR = RandomForestRegressor()
    rf_grid = {
        "max_depth": [1, 3, 5, 10, 20, 50],
        "max_features": [5, 10, 20, 25],
        "min_samples_leaf": [1, 5, 10],
        "n_estimators": [100, 250, 500],
    }

    gsRFR = GridSearchCV(RFR,
                         param_grid=rf_grid,
                         cv=3,
                         n_jobs=-1,
                         scoring="neg_mean_squared_error",
                         verbose=2)
    gsRFR.fit(xTrain, yTrain)
    print(gsRFR.best_params_)

    #yHat = bestRFR.predict(xTest)
    bestRFR = gsRFR.best_estimator_
    grid_accuracy = evaluate(bestRFR, xTest, yTest)

    print("Accuracy: ", grid_accuracy)
    scoreArr.append(grid_accuracy)

    #Optimize k-nearest neighbors and predict
    KNN = KNeighborsRegressor()
    knn_grid = {"n_neighbors": [1, 3, 5, 10, 15, 20]}

    gsKNN = GridSearchCV(KNN,
                         param_grid=knn_grid,
                         cv=3,
                         n_jobs=-1,
                         scoring="neg_mean_squared_error",
                         verbose=2)
    gsKNN.fit(xTrain, yTrain)
    #bestDTR = gsDTR.best_params_
    print(gsKNN.best_params_)

    #yHat = bestDTR.predict(xTest)
    bestKNN = gsKNN.best_estimator_
    grid_accuracy = evaluate(bestKNN, xTest, yTest)

    print("Accuracy: ", grid_accuracy)
    scoreArr.append(grid_accuracy)

    #Optimize gradient boosting and predict
    GBR = GradientBoostingRegressor()
    gbr_grid = {
        "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],
        "max_features": [5, 10, 20, 25],
        "min_samples_leaf": [1, 5, 10],
        "max_depth": [1, 3, 5]
    }

    gsGBR = GridSearchCV(GBR,
                         param_grid=gbr_grid,
                         cv=3,
                         n_jobs=-1,
                         scoring="neg_mean_squared_error",
                         verbose=2)
    gsGBR.fit(xTrain, yTrain)
    #bestGBR = gsGBR.best_params_
    print(gsGBR.best_params_)

    #yHat = bestDTR.predict(xTest)
    bestGBR = gsGBR.best_estimator_
    grid_accuracy = evaluate(bestGBR, xTest, yTest)

    print("Accuracy: ", grid_accuracy)
    scoreArr.append(grid_accuracy)

    # Stacking Ensemble

    estimators = [('RFR', bestRFR), ('KNN', bestKNN), ('GBR', bestGBR)]

    #Stacking using optimized models and sklearn
    reg = StackingRegressor(estimators=estimators,
                            final_estimator=RandomForestRegressor(
                                random_state=random_state, n_estimators=10))

    for x in scoreArr:
        print("Accuracy: ", x)

    print("Ensemble Score: ", reg.fit(xTrain, yTrain).score(xTest, yTest))

    return None
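The `evaluate` helper used for the grid-searched models above is not shown. A minimal sketch, assuming it reports accuracy as 100% minus the mean absolute percentage error on the test set:

import numpy as np

def evaluate(model, xTest, yTest):
    # assumed definition: accuracy as 100% minus mean absolute percentage error
    predictions = model.predict(xTest)
    mape = 100.0 * np.mean(np.abs(predictions - yTest) / np.abs(yTest))
    return 100.0 - mape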
Example #11
y_train_pred4 = KR.predict(X_train)

# The final prediction model is a stacking regressor that combines the estimates of each of the individual regressors into a single, more accurate predictive model.

# In[18]:

from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
estimators = [('ridge', RidgeCV()), ('lasso', LassoCV(random_state=42)),
              ('svr', SVR(C=1, gamma=1e-6))]
reg = StackingRegressor(
    estimators=estimators,
    final_estimator=GradientBoostingRegressor(random_state=42))
reg.fit(X_train, y_train)

y_pred6 = reg.predict(X_test)
y_train_pred6 = reg.predict(X_train)

# This is an output of the accuracy of the predictions

# In[24]:

from sklearn.metrics import r2_score
print("Lasso Train accuracy: ", r2_score(y_train, y_train_pred))
print("Test accuracy: ", r2_score(y_test, y_pred))

print("ElasticNet Train accuracy: ", r2_score(y_train, y_train_pred2))

# ## Ensemble method (stacking)

# In[21]:


X_train = X_train_c
MLP = MLPRegressor(activation = 'relu', alpha = 1, hidden_layer_sizes = (100))

estimators = [
    ('RandomForest', RandomForestRegressor(n_estimators=1000, random_state=0, n_jobs=-1)),
    ('MLP', MLP)
]

stacking_model = StackingRegressor(estimators=estimators, final_estimator=MLP)


scores = cross_val_score(stacking_model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
print(scores)
print(scores.mean())
print(scores.std())


# In[45]:


X_train = X_train_c
MLP = MLPRegressor(activation = 'relu', alpha = 1, hidden_layer_sizes = (100), random_state=0)

estimators = [
Example #13
# (1) lasso-final stacking = 0.12349
# (2) ridge-final stacking = 0.12436
# estimators = estimators_list

# 4 base models for ridge-final stacking model, = 0.12296 / 0.12264
estimators = [
    estimators_list[0], estimators_list[5], estimators_list[6],
    estimators_list[7]
]

# 4 base models for lasso-final stacking model, = 0.12284
# estimators = [estimators_list[0],estimators_list[5],
#               estimators_list[6],estimators_list[4]]

print(estimators)
stack = StackingRegressor(estimators=estimators, final_estimator=RidgeCV())
# stack = StackingRegressor(estimators=estimators,final_estimator = LassoCV())

# show CV performance of the selected stacking model
y_pred, scores = show_CV_performance(X, y, stack, nfold=nfold, title='stack')
scores
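# `show_CV_performance` is a user-defined helper not included in this snippet.
# A minimal sketch, assuming it returns out-of-fold predictions together with
# per-fold RMSE scores:
import numpy as np
from sklearn.model_selection import cross_val_predict, cross_val_score

def show_CV_performance(X, y, model, nfold=5, title=''):
    # out-of-fold predictions plus per-fold RMSE, reported under `title`
    y_pred = cross_val_predict(model, X, y, cv=nfold)
    neg_mse = cross_val_score(model, X, y, cv=nfold,
                              scoring='neg_mean_squared_error')
    scores = np.sqrt(-neg_mse)
    print('%s: RMSE %.5f (+/- %.5f)' % (title, scores.mean(), scores.std()))
    return y_pred, scores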

#%% plot single model vs stacking
estimators_all = estimators + [('Stacking model', stack)]

# plot all in one fig
fig, axs = plt.subplots(3, 2, figsize=(10, 8))
axs = np.ravel(axs)

for ax, (name, est) in zip(axs, estimators_all):
    start_time = time.time()
Example #14
def test_stacking_regressor_error(y, params, type_err, msg_err):
    with pytest.raises(type_err, match=msg_err):
        reg = StackingRegressor(**params, cv=3)
        reg.fit(scale(X_diabetes),
                y,
                sample_weight=np.ones(X_diabetes.shape[0]))


@pytest.mark.parametrize(
    "estimator, X, y",
    [
        (StackingClassifier(
            estimators=[('lr', LogisticRegression(
                random_state=0)), ('svm', LinearSVC(random_state=0))]),
         X_iris[:100], y_iris[:100]),  # keep only classes 0 and 1
        (StackingRegressor(estimators=[(
            'lr', LinearRegression()), ('svm', LinearSVR(random_state=0))]),
         X_diabetes, y_diabetes)
    ],
    ids=['StackingClassifier', 'StackingRegressor'])
def test_stacking_randomness(estimator, X, y):
    # checking that fixing the random state of the CV will lead to the same
    # results
    estimator_full = clone(estimator)
    estimator_full.set_params(
        cv=KFold(shuffle=True, random_state=np.random.RandomState(0)))

    estimator_drop = clone(estimator)
    estimator_drop.set_params(lr='drop')
    estimator_drop.set_params(
        cv=KFold(shuffle=True, random_state=np.random.RandomState(0)))
    def train_algs(self):
        """

        TRAIN WITHOUT CROSS VALIDATION

        """

        st.subheader("Results")
        self.chosen_models_names = []
        self.chosen_models = []

        if len(self.algorithms) == 0:
            st.warning('You should select at least one algorithm')
            return

        X = self.raw_data.drop(self.out_col, axis=1)
        y = self.raw_data[self.out_col]
        msk = np.random.rand(len(X)) < self.percent_train / 100
        X_train = X[msk]
        X_test = X[~msk]
        Y_train = y[msk]
        Y_test = y[~msk]

        for alg in self.algorithms:

            if alg == 'LinearSVR':
                from sklearn.svm import LinearSVR
                svc = LinearSVR()
                svc.fit(X_train, Y_train)
                st.write("LinearSVR score", svc.score(X_test, Y_test))

                self.chosen_models_names.append('LinearSVR')
                self.chosen_models.append(svc)

            elif alg == 'RidgeCV':
                from sklearn.linear_model import RidgeCV
                rid = RidgeCV()
                rid.fit(X_train, Y_train)
                st.write("RidgeCV score", rid.score(X_test, Y_test))

                self.chosen_models_names.append('RidgeCV')
                self.chosen_models.append(rid)

            elif alg == 'Random Forest Regressor':
                from sklearn.ensemble import RandomForestRegressor
                rfc = RandomForestRegressor()
                rfc.fit(X_train, Y_train)
                st.write("rfc score", rfc.score(X_test, Y_test))

                self.chosen_models_names.append('Random Forest Regressor')
                self.chosen_models.append(rfc)

            elif alg == 'Adaboost':
                from sklearn.ensemble import AdaBoostRegressor
                ada = AdaBoostRegressor()
                ada.fit(X_train, Y_train)
                st.write("ada score", ada.score(X_test, Y_test))

                self.chosen_models_names.append('Adaboost')
                self.chosen_models.append(ada)

            elif alg == 'XGBoost':
                import xgboost as xgb
                xgb = xgb.XGBRegressor(n_estimators=300)
                xgb.fit(X_train, Y_train, verbose=0)
                st.write("xgb score", xgb.score(X_test, Y_test))

                self.chosen_models_names.append('XGBoost')
                self.chosen_models.append(xgb)

        if self.meta_model_check:
            if self.meta_model_type == "voting":
                from sklearn.ensemble import VotingRegressor
                stack = VotingRegressor(estimators=list(
                    zip(self.chosen_models_names, self.chosen_models)))
                stack.fit(X_train, Y_train)
                st.write("voting score", stack.score(X_test, Y_test))

            else:
                from sklearn.ensemble import StackingRegressor

                if self.meta_model == "GradientBoostingRegressor":
                    from sklearn.ensemble import GradientBoostingRegressor
                    stack = StackingRegressor(
                        estimators=list(
                            zip(self.chosen_models_names, self.chosen_models)),
                        final_estimator=GradientBoostingRegressor())

                elif self.meta_model == "RandomForestRegressor":
                    from sklearn.ensemble import RandomForestRegressor
                    stack = StackingRegressor(
                        estimators=list(
                            zip(self.chosen_models_names, self.chosen_models)),
                        final_estimator=RandomForestRegressor())

                stack.fit(X_train, Y_train)
                st.write("stack score", stack.score(X_test, Y_test))
Example #16
def run(dataset, config):
    log.info(
        f"\n**** Stacking Ensemble [sklearn v{sklearn.__version__}] ****\n")
    save_metadata(config, version=sklearn.__version__)

    is_classification = config.type == 'classification'

    X_train, X_test = dataset.train.X_enc, dataset.test.X_enc
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config
    estimators_params = {
        e: config.framework_params.get(f'_{e}_params', {})
        for e in ['rf', 'gbm', 'linear', 'svc', 'final']
    }

    log.info(
        "Running Sklearn Stacking Ensemble with a maximum time of {}s on {} cores."
        .format(config.max_runtime_seconds, n_jobs))
    log.warning(
        "We completely ignore the requirement to stay within the time limit.")
    log.warning(
        "We completely ignore the advice to optimize towards metric: {}.".
        format(config.metric))

    if is_classification:
        estimator = StackingClassifier(
            estimators=[
                ('rf',
                 RandomForestClassifier(n_jobs=n_jobs,
                                        random_state=config.seed,
                                        **estimators_params['rf'])),
                ('gbm',
                 GradientBoostingClassifier(random_state=config.seed,
                                            **estimators_params['gbm'])),
                ('linear',
                 SGDClassifier(n_jobs=n_jobs,
                               random_state=config.seed,
                               **estimators_params['linear'])),
                # ('svc', LinearSVC(random_state=config.seed, **estimators_params['svc']))
            ],
            # final_estimator=SGDClassifier(n_jobs=n_jobs, random_state=config.seed, **estimators_params['final']),
            final_estimator=LogisticRegression(n_jobs=n_jobs,
                                               random_state=config.seed,
                                               **estimators_params['final']),
            stack_method='predict_proba',
            n_jobs=n_jobs,
            **training_params)
    else:
        estimator = StackingRegressor(
            estimators=[
                ('rf',
                 RandomForestRegressor(n_jobs=n_jobs,
                                       random_state=config.seed,
                                       **estimators_params['rf'])),
                ('gbm',
                 GradientBoostingRegressor(random_state=config.seed,
                                           **estimators_params['gbm'])),
                ('linear',
                 SGDRegressor(random_state=config.seed,
                              **estimators_params['linear'])),
                ('svc',
                 LinearSVR(random_state=config.seed,
                           **estimators_params['svc']))
            ],
            # final_estimator=SGDRegressor(random_state=config.seed, **estimators_params['final']),
            final_estimator=LinearRegression(n_jobs=n_jobs,
                                             random_state=config.seed,
                                             **estimators_params['final']),
            n_jobs=n_jobs,
            **training_params)

    with utils.Timer() as training:
        estimator.fit(X_train, y_train)

    predictions = estimator.predict(X_test)
    probabilities = estimator.predict_proba(
        X_test) if is_classification else None

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=len(estimator.estimators_) + 1,
                  training_duration=training.duration)
Example #17
from sklearn.datasets import load_diabetes
from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor

X, y = load_diabetes(return_X_y=True)
estimators = [("lr", RidgeCV()), ("svr", LinearSVR(random_state=42))]
reg = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=10, random_state=42),
)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
reg.fit(X_train, y_train).score(X_test, y_test)
# 0.3...
Example #18

@pytest.mark.parametrize(
    "estimator, X, y",
    [
        (
            StackingClassifier(estimators=[
                ("lr", LogisticRegression(random_state=0)),
                ("svm", LinearSVC(random_state=0)),
            ]),
            X_iris[:100],
            y_iris[:100],
        ),  # keep only classes 0 and 1
        (
            StackingRegressor(estimators=[
                ("lr", LinearRegression()),
                ("svm", LinearSVR(random_state=0)),
            ]),
            X_diabetes,
            y_diabetes,
        ),
    ],
    ids=["StackingClassifier", "StackingRegressor"],
)
def test_stacking_randomness(estimator, X, y):
    # checking that fixing the random state of the CV will lead to the same
    # results
    estimator_full = clone(estimator)
    estimator_full.set_params(
        cv=KFold(shuffle=True, random_state=np.random.RandomState(0)))

    estimator_drop = clone(estimator)
Example #19
         'estimators': [('lr', LinearRegression()), ('svm', LinearSVR())],
         'final_estimator': RandomForestClassifier()
     }, ValueError, 'parameter should be a regressor.')])
def test_stacking_regressor_error(y, params, type_err, msg_err):
    with pytest.raises(type_err, match=msg_err):
        reg = StackingRegressor(**params, cv=3)
        reg.fit(scale(X_diabetes),
                y,
                sample_weight=np.ones(X_diabetes.shape[0]))


@pytest.mark.parametrize("stacking_estimator", [
    StackingClassifier(
        estimators=[('lr', LogisticRegression()), ('svm', LinearSVC())]),
    StackingRegressor(
        estimators=[('lr', LinearRegression()), ('svm',
                                                 LinearSVR(max_iter=1e4))])
])
def test_stacking_named_estimators(stacking_estimator):
    stacking_estimator.fit(scale(X_iris), y_iris)
    estimators = stacking_estimator.named_estimators_
    assert len(estimators) == 2
    assert sorted(list(estimators.keys())) == sorted(['lr', 'svm'])


@pytest.mark.parametrize("stacking_estimator", [
    StackingClassifier(estimators=[(
        'lr', LogisticRegression()), (
            'rf', RandomForestClassifier()), ('svm', LinearSVC())]),
    StackingRegressor(
        estimators=[('lr', LinearRegression()), (
Example #20
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params,
                                     passthrough):
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(scale(X_diabetes),
                                                   y_diabetes,
                                                   random_state=42)
    estimators = [("lr", LinearRegression()), ("svr", LinearSVR())]
    reg = StackingRegressor(
        estimators=estimators,
        final_estimator=final_estimator,
        cv=cv,
        passthrough=passthrough,
    )
    reg.fit(X_train, y_train)
    result = reg.predict(X_test, **predict_params)
    expected_result_length = 2 if predict_params else 1
    if predict_params:
        assert len(result) == expected_result_length

    X_trans = reg.transform(X_test)
    expected_column_count = 12 if passthrough else 2
    assert X_trans.shape[1] == expected_column_count
    if passthrough:
        assert_allclose(X_test, X_trans[:, -10:])

    reg.set_params(lr="drop")
    reg.fit(X_train, y_train)
    reg.predict(X_test)

    X_trans = reg.transform(X_test)
    expected_column_count_drop = 11 if passthrough else 1
    assert X_trans.shape[1] == expected_column_count_drop
    if passthrough:
        assert_allclose(X_test, X_trans[:, -10:])
Example #21
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params):
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(scale(X_diabetes),
                                                   y_diabetes,
                                                   random_state=42)
    estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
    reg = StackingRegressor(estimators=estimators,
                            final_estimator=final_estimator,
                            cv=cv)
    reg.fit(X_train, y_train)
    result = reg.predict(X_test, **predict_params)
    expected_result_length = 2 if predict_params else 1
    if predict_params:
        assert len(result) == expected_result_length

    X_trans = reg.transform(X_test)
    assert X_trans.shape[1] == 2

    reg.set_params(lr='drop')
    reg.fit(X_train, y_train)
    reg.predict(X_test)

    X_trans = reg.transform(X_test)
    assert X_trans.shape[1] == 1
Example #22
def model_to_test_reg():
    estimators = [('dt', DecisionTreeRegressor()), ('las', LinearRegression())]
    stacking_regressor = StackingRegressor(estimators=estimators,
                                           final_estimator=LinearRegression())
    return stacking_regressor
gbdt_pipeline = make_pipeline(
    tree_preprocessor, HistGradientBoostingRegressor(random_state=0)
)
gbdt_pipeline

# %%
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV

estimators = [
    ("Random Forest", rf_pipeline),
    ("Lasso", lasso_pipeline),
    ("Gradient Boosting", gbdt_pipeline),
]

stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=RidgeCV())
stacking_regressor

# %%
# Measure and plot the results
##############################################################################
#
# Now we can use the Ames Housing dataset to make predictions. We check the
# performance of each individual predictor as well as of the stack of the
# regressors.
#
# The function ``plot_regression_results`` is used to plot the predicted and
# true targets.
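# A minimal sketch of that measurement step, assuming the `X`, `y` data and
# the `stacking_regressor` defined above (the original
# `plot_regression_results` helper is not reproduced here):
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_predict, cross_validate

for name, est in estimators + [("Stacking Regressor", stacking_regressor)]:
    scores = cross_validate(est, X, y,
                            scoring=["r2", "neg_mean_absolute_error"],
                            n_jobs=-1)
    y_pred = cross_val_predict(est, X, y, n_jobs=-1)
    print("%s: R2 %.3f, MAE %.3f" %
          (name, np.mean(scores["test_r2"]),
           -np.mean(scores["test_neg_mean_absolute_error"])))
    plt.figure()
    plt.scatter(y, y_pred, alpha=0.3)
    lims = [np.min(y), np.max(y)]
    plt.plot(lims, lims, "--r")  # perfect-prediction reference line
    plt.xlabel("True target")
    plt.ylabel("Predicted target")
    plt.title(name)
    plt.show()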


import time
fit_param = {
    'eval_set': [(X_train, y_train), (X_test, y_test)],
    'early_stopping_rounds': 200,
    'verbose': False
}
BT = xgb.XGBRegressor(**param)
SVM = svm.SVR()
RF = ensemble.RandomForestRegressor(random_state=42)
NN = neural_network.MLPRegressor(hidden_layer_sizes=(100, ),
                                 random_state=1,
                                 max_iter=100,
                                 alpha=0.001)
estimators = [('dt', DT), ('bt', BT), ('lgb', LGB), ('rf', RF), ('gb', GBoost)]

reg = StackingRegressor(estimators=estimators,
                        final_estimator=LinearRegression(),
                        n_jobs=-1)
stack = Regressor(reg)
y_pred = stack.run(X_train, y_train, X_test, y_test)

DT = tree.DecisionTreeClassifier()
import xgboost as xgb

param = {
    'n_estimators': 10000,
    'learning_rate': 0.1,
    'objective': 'reg:squarederror',
    'verbosity': 0,
    'n_jobs': -1
}
fit_param = {
Example #25
plt.scatter(X, y)
plt.plot(x_rescaled, y_pred_rescaled, color='red', label='predictions')
plt.xlabel("LotArea in m$^2$")
plt.ylabel("SalePrice in ZAR")
plt.title("Voting Ensemble Regression")
plt.legend()
plt.show()

# Heterogeneous Ensembles(Stacking)
models = [("LR", lr), ("DT", regr_tree), ("SVR", svr)]

# instead of choosing model weights, stacking uses a meta learner
# model training happens twice: once for the base models, once for the meta learner
meta_learner_reg = LinearRegression()

s_reg = StackingRegressor(estimators=models, final_estimator=meta_learner_reg)

s_reg.fit(x_train, y_train[:, 0])

y_pred = s_reg.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# Plot stacking regression prediction line over data
x_domain = np.linspace(min(x_train), max(x_train), 100)

y_pred_rescaled = y_scaler.inverse_transform(
    s_reg.predict(x_domain).reshape(-1, 1))  # scalers expect 2-D input
x_rescaled = x_scaler.inverse_transform(x_domain)

plt.figure()
plt.scatter(X, y)
Example #26
def train(seeds=[1], k=5, datafilepath='./data/HRB95.txt', test_size=5):
    seed = 2
    random.seed(seed)
    np.random.seed(seed)
    # data = np.loadtxt('./data/HRB95.txt', dtype=float, delimiter=',', skiprows=1)
    # x = data[:,1:data.shape[1]]
    # y = data[:,0]
    cv = k
    if cv == 1:
        cv = LeaveOneOut()
    models = [
        KNeighborsRegressor(leaf_size=3,
                            n_neighbors=2,
                            p=1,
                            weights='distance'),
        GridSearchCV(SVR(),
                     param_grid={
                         "C": np.logspace(0, 2, 4),
                         "gamma": np.logspace(-2, 2, 7)
                     },
                     n_jobs=-1),
        RidgeCV(alphas=(0.1, 1.0, 10.0, 100.0)),
        MLPRegressor(hidden_layer_sizes=(50, 100, 50),
                     max_iter=700,
                     random_state=seed),
        RandomForestRegressor(random_state=seed),
        GradientBoostingRegressor(random_state=seed),
        StackingRegressor(estimators=[
            ('KNN',
             KNeighborsRegressor(leaf_size=3,
                                 n_neighbors=2,
                                 p=1,
                                 weights='distance')),
            ("ridge", RidgeCV(alphas=(0.1, 1.0, 10.0, 100.0))),
            ("gbdt", GradientBoostingRegressor(random_state=seed)),
            ("RandomForest", RandomForestRegressor(random_state=seed)),
            ("mlp",
             MLPRegressor(hidden_layer_sizes=(50, 100, 50),
                          max_iter=700,
                          random_state=seed)),
            ("svr",
             GridSearchCV(SVR(),
                          n_jobs=-1,
                          param_grid={
                              "C": np.logspace(0, 2, 4),
                              "gamma": np.logspace(-2, 2, 7)
                          })),
        ],
                          final_estimator=RidgeCV(alphas=(0.1, 1.0, 10.0,
                                                          100.0)),
                          n_jobs=-1,
                          cv=cv),
    ]
    models_str = [
        'KNeighborsRegressor',
        'SVR',
        'RidgeCV',
        'MLP',
        'RF',
        'GBDT',
        'Stacking',
    ]

    # average test scores over the runs (one entry per seed)
    MAE, MSE, R2 = {}, {}, {}
    for time, seed in enumerate(seeds):
        print("----- Run %d (seed=%s) -----" % (time + 1, seed))
        print("{:20s}{:10s}{:10s}{:10s}".format("Method", "MAE", "MSE", "R2"))
        x, y = loadXY(datafilepath)
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=test_size, random_state=seed, shuffle=True)
        plt.figure(time, figsize=(10, 10))
        plt.tick_params(labelsize=18)
        # plt.xlim(0, 6)
        # plt.ylim(3, 7, 0.3)
        # plt.plot([x for x in range(1, test_size + 1)],scale_y.inverse_transform(y_test),label='True Label')
        plt.scatter([x for x in range(1, test_size + 1)],
                    scale_y.inverse_transform(y_test),
                    marker='*',
                    label='True Label',
                    s=250)
        for i, name, m in zip(range(100), models_str, models):
            if not name in MAE.keys():
                MAE[name] = []
            if not name in MSE.keys():
                MSE[name] = []
            if not name in R2.keys():
                R2[name] = []
            print("%18s" % name)
            y_vals, y_val_p_s, mae_test, mse_test, r2_test = [], [], [], [], []
            model = clone(m)
            # the stacking model already has cross-validation built in
            if isinstance(model, StackingRegressor):
                model.fit(x_train, y_train)
                train_pred = model.predict(x_train)
                test_pred = model.predict(x_test)
                MAE[name] = np.append(MAE[name], mae(test_pred, y_test))
                MSE[name] = np.append(MSE[name], mse(test_pred, y_test))
                R2[name] = np.append(R2[name], model.score(x_test, y_test))
                print("{:20s}{:6.4f}{:10.4f}{:10.3f}".format(
                    "train", mae(train_pred, y_train),
                    mse(train_pred, y_train), model.score(x_train, y_train)))
                print("{:20s}{:6.4f}{:10.4f}{:10.3f}".format(
                    "test", MAE[name][-1], MSE[name][-1], R2[name][-1]))
            else:
                # cross-validation
                if k > 1:
                    kf = RepeatedKFold(n_splits=k,
                                       n_repeats=10,
                                       random_state=seed)
                else:
                    kf = LeaveOneOut()
                for t, v in kf.split(x_train):
                    model.fit(x_train[t], y_train[t])  # fitting
                    y_val_p = model.predict(x_train[v])
                    y_vals = np.append(y_vals, y_train[v])
                    y_val_p_s = np.append(y_val_p_s, y_val_p)
                test_pred = model.predict(x_test)
                mse_test = np.append(mse_test, mse(y_test, test_pred))
                mae_test = np.append(mae_test, mae(y_test, test_pred))
                r2_test = np.append(r2_test, model.score(x_test, y_test))
                matrix = {
                    'val': {
                        'mae': mae(y_vals, y_val_p_s),
                        'mse': mse(y_vals, y_val_p_s),
                        'r2': r2_score(y_vals, y_val_p_s)
                    },
                    'test': {
                        'mae': mae_test.mean(),
                        'mse': mse_test.mean(),
                        'r2': r2_test.mean()
                    },
                }
                print("{:20s}{:6.4f}{:10.4f}{:10.3f}".format(
                    "val",
                    matrix['val']['mae'],
                    matrix['val']['mse'],
                    matrix['val']['r2'],
                ))
                print("{:20s}{:6.4f}{:10.4f}{:10.3f}".format(
                    "test", matrix['test']['mae'], matrix['test']['mse'],
                    matrix['test']['r2']))
                joblib.dump(model, 'save/%s%d.model' % (name, time))
                MAE[name] = np.append(MAE[name], matrix['test']['mae'])
                MSE[name] = np.append(MSE[name], matrix['test']['mse'])
                R2[name] = np.append(R2[name], matrix['test']['r2'])

            plt.plot([x for x in range(1, test_size + 1)],
                     scale_y.inverse_transform(model.predict(x_test)),
                     marker='o',
                     linestyle=':',
                     label=name,
                     c=colors.pop())
            # plt.scatter([x+i*0.2 for x in range(1, test_size + 1)], scale_y.inverse_transform(model.predict(x_test)),
            #             label=name,c=randomcolor())
            plt.legend(edgecolor='black', loc=1, prop=font2, ncol=2)  # show the legend
            plt.xlabel(u"Test Data", fontdict=font1)  # x-axis label
            plt.ylabel('Density (g/cm3)', fontdict=font1)  # y-axis label
            plt.title('Prediction on GI20', fontdict=font1)  # title
        plt.ioff()
        print()  # all models finished for this run; each run uses a different split
        plt.show()
    print("---------%d次训练测试平均得分----------" % len(seeds))
    print("{:20s}{:10s}{:10s}{:10s}".format("方法", "MAE", "MSE", "R2"))
    for name in MAE.keys():
        print("{:20s}{:6.4f}{:10.4f}{:10.3f}".format(name, np.mean(MAE[name]),
                                                     np.mean(MSE[name]),
                                                     np.mean(R2[name])))
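# Hedged usage sketch for train() above; it assumes the HRB95 data file and the
# module-level helpers it relies on (loadXY, scale_y, mae, mse, colors, font1,
# font2) are defined in the original script.
if __name__ == '__main__':
    train(seeds=[1, 2, 3], k=5, datafilepath='./data/HRB95.txt', test_size=5)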
glm = TweedieGLM(power=0, max_iter=1000)
mars = Earth()
gb = HistGradientBoostingRegressor()
estimators = [
    ('mars', mars),
    ('gb', gb)
]

final_estimator = Pipeline([
                        ('poly', PolynomialFeatures(2)),
                        ('scale', StandardScaler()),
                        ('pca', PCA()),
                        ('regressor', LinearRegression())])

stack = StackingRegressor(
    estimators=estimators,
    final_estimator=final_estimator
)

model = Pipeline([('transformer', transformer),
                  ('model', stack)])

# Log-link helpers for the target: `add` is the forward transform (log) and
# `subtract` is its inverse (exp); the small offset keeps log() defined at zero.
offset = 1e-9
def add(y):
    return np.log(y + offset)

def subtract(y):
    return (np.exp(y) - offset)


link = Pipeline([('function', FunctionTransformer(add, subtract, validate=True))])
scorer = get_scorer('neg_root_mean_squared_error')
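
# A hedged sketch of how these pieces might be wired together: wrap the stacked
# pipeline in a TransformedTargetRegressor so the model is fit on log(y + offset)
# and predictions are mapped back with exp(y) - offset, then evaluate with the
# RMSE scorer defined above. `X` and `y` are assumed to be the training
# features/target from the original script.
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import cross_val_score

log_linked_model = TransformedTargetRegressor(regressor=model, transformer=link)
cv_rmse = -cross_val_score(log_linked_model, X, y, scoring=scorer, cv=5)
print('CV RMSE: %.3f +/- %.3f' % (cv_rmse.mean(), cv_rmse.std()))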
    def train_algs_cv(self):
        """

        TRAIN USING CROSS VALIDATION

        """

        st.subheader("Results using cross validation")
        self.chosen_models_names = []
        self.chosen_models = []

        if len(self.algorithms) == 0:
            st.warning('You should select at least one algorithm')
            return

        X_train = self.raw_data.drop(self.out_col, axis=1)
        Y_train = self.raw_data[self.out_col]

        for alg in self.algorithms:

            if alg == 'LinearSVR':
                from sklearn.svm import LinearSVR
                svc = LinearSVR()
                svc_scores = cross_val_score(svc,
                                             X_train,
                                             Y_train,
                                             scoring='r2',
                                             cv=self.k_cv)
                st.write('LinearSVR score :', svc_scores.mean())

                self.chosen_models_names.append('LinearSVR')
                self.chosen_models.append(svc)

            elif alg == 'RidgeCV':
                from sklearn.linear_model import RidgeCV
                rid = RidgeCV()
                rid_scores = cross_val_score(rid,
                                             X_train,
                                             Y_train,
                                             scoring='r2',
                                             cv=self.k_cv)
                st.write('RidgeCV score :', rid_scores.mean())

                self.chosen_models_names.append('RidgeCV')
                self.chosen_models.append(rid)

            elif alg == 'Random Forest Regressor':
                from sklearn.ensemble import RandomForestRegressor
                rfc = RandomForestRegressor()
                rfc_scores = cross_val_score(rfc,
                                             X_train,
                                             Y_train,
                                             scoring='r2',
                                             cv=self.k_cv)
                st.write('Random Forest Regressor score :', rfc_scores.mean())

                self.chosen_models_names.append('Random Forest Regressor')
                self.chosen_models.append(rfc)

            elif alg == 'Adaboost':
                from sklearn.ensemble import AdaBoostRegressor
                ada = AdaBoostRegressor()
                ada_scores = cross_val_score(ada,
                                             X_train,
                                             Y_train,
                                             scoring='r2',
                                             cv=self.k_cv)
                st.write('Adaboost score :', ada_scores.mean())

                self.chosen_models_names.append('Adaboost')
                self.chosen_models.append(ada)

            elif alg == 'XGBoost':
                import xgboost as xgb
                # use a distinct name so the model does not shadow the module
                xgb_reg = xgb.XGBRegressor(n_estimators=300)
                xgb_scores = cross_val_score(xgb_reg,
                                             X_train,
                                             Y_train,
                                             scoring='r2',
                                             cv=self.k_cv)
                st.write('XGBoost score :', xgb_scores.mean())

                self.chosen_models_names.append('XGBoost')
                self.chosen_models.append(xgb_reg)

        if self.meta_model_check:
            if self.meta_model_type == "voting":
                from sklearn.ensemble import VotingRegressor
                stack = VotingRegressor(estimators=list(
                    zip(self.chosen_models_names, self.chosen_models)))
                stack_scores = cross_val_score(stack,
                                               X_train,
                                               Y_train,
                                               scoring='r2',
                                               cv=self.k_cv)
                st.write('voting score :', stack_scores.mean())

            else:
                from sklearn.ensemble import StackingRegressor

                if self.meta_model == "GradientBoostingRegressor":
                    from sklearn.ensemble import GradientBoostingRegressor
                    stack = StackingRegressor(
                        estimators=list(
                            zip(self.chosen_models_names, self.chosen_models)),
                        final_estimator=GradientBoostingRegressor())

                elif self.meta_model == "RandomForestRegressor":
                    from sklearn.ensemble import RandomForestRegressor
                    stack = StackingRegressor(
                        estimators=list(
                            zip(self.chosen_models_names, self.chosen_models)),
                        final_estimator=RandomForestRegressor())

                stack_scores = cross_val_score(stack,
                                               X_train,
                                               Y_train,
                                               scoring='r2',
                                               cv=self.k_cv)
                st.write(self.meta_model + ' stack score using cv :',
                         stack_scores.mean())
Example #29
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR


@pytest.mark.parametrize(
    "X, y, estimator",
    [(*make_classification(n_samples=10),
      StackingClassifier(
          estimators=[('lr', LogisticRegression()), (
              'svm', LinearSVC()), ('rf', RandomForestClassifier())])),
     (*make_classification(n_samples=10),
      VotingClassifier(
          estimators=[('lr', LogisticRegression()), (
              'svm', LinearSVC()), ('rf', RandomForestClassifier())])),
     (*make_regression(n_samples=10),
      StackingRegressor(
          estimators=[('lr', LinearRegression()), (
              'svm', LinearSVR()), ('rf', RandomForestRegressor())])),
     (*make_regression(n_samples=10),
      VotingRegressor(
          estimators=[('lr', LinearRegression()), (
              'svm', LinearSVR()), ('rf', RandomForestRegressor())]))],
    ids=[
        'stacking-classifier', 'voting-classifier', 'stacking-regressor',
        'voting-regressor'
    ])
def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator):
    # check that the behavior of `estimators`, `estimators_`,
    # `named_estimators`, `named_estimators_` is consistent across all
    # ensemble classes and when using `set_params()`.

    # before fit
Example #30
def learner(X, y, tes, X_train, X_test, y_train, y_test):

    # In stacking, the most important thing is model diversification: linear models,
    # SVMs, KNN, decision trees, and several variations of each. The variations use
    # different values of each model's key parameters. While we did not have time to
    # tune the parameters of every model (except the meta-learner, CatBoost), we made
    # educated guesses to get as much variability as possible. (A small diversity-check
    # sketch follows after this function.)

    estimators_1 = [
        ('xgb',
         XGBRegressor(random_state=2020,
                      objective='reg:squarederror',
                      learning_rate=0.05)), ('lr', LinearRegression()),
        ('rf', RandomForestRegressor(random_state=2020)),
        ('lgb', LGBMRegressor(learning_rate=0.2,
                              random_state=2020)), ('svr', SVR(degree=2)),
        ('lasso', Lasso(random_state=2020)), ('RGF', RGFRegressor()),
        ('kneiba', KNeighborsRegressor(n_neighbors=4)),
        ('cat', CatBoostRegressor(logging_level='Silent', random_state=2020))
    ]

    predictions_1 = StackingRegressor(estimators=estimators_1,
                                      final_estimator=CatBoostRegressor(
                                          logging_level='Silent',
                                          depth=6,
                                          bagging_temperature=5,
                                          random_state=2020)).fit(
                                              X_train, y_train).predict(tes)

    estimators_2 = [('xgb',
                     XGBRegressor(objective='reg:squarederror',
                                  learning_rate=0.2,
                                  random_state=2020)),
                    ('lr', LinearRegression()),
                    ('rf', RandomForestRegressor(random_state=2020)),
                    ('lgb', LGBMRegressor(learning_rate=0.05,
                                          random_state=2020)),
                    ('svr', SVR(degree=5)), ('RGF', RGFRegressor()),
                    ('lasso', Lasso(random_state=2020)),
                    ('kneiba', KNeighborsRegressor(n_neighbors=6)),
                    ('cat',
                     CatBoostRegressor(logging_level='Silent',
                                       random_state=2020))]

    predictions_2 = StackingRegressor(estimators=estimators_2,
                                      final_estimator=CatBoostRegressor(
                                          logging_level='Silent',
                                          depth=6,
                                          bagging_temperature=5,
                                          random_state=2020)).fit(
                                              X_train, y_train).predict(tes)

    predictions_cat_1 = CatBoostRegressor(logging_level='Silent',
                                          depth=6,
                                          bagging_temperature=5,
                                          random_state=2020).fit(
                                              X_train, y_train).predict(tes)

    # Further averaging, blending, and retraining to generalise well.
    # Although the blend weights sum to more than one, it still works well in
    # practice; these weights are definitely parameters worth tuning.
    stack = [x * 0.56 + y * 0.51 for x, y in zip(predictions_1, predictions_2)]
    stack_2 = [x * 0.56 + y * 0.51 for x, y in zip(stack, predictions_cat_1)]

    X, y = tes.copy(), stack_2
    preds_ridge = Ridge(random_state=2020).fit(X, y).predict(X)

    # We added a new feature to the test dataset: the wards were clustered into 150
    # clusters, and CatBoost's categorical encoder was used to encode the cluster labels.
    X['cluster'] = KMeans(150, random_state=2020).fit(X).predict(X)
    preds_cat = CatBoostRegressor(random_state=2020,
                                  verbose=False,
                                  depth=6,
                                  bagging_temperature=5,
                                  cat_features=['cluster']).fit(X,
                                                                y).predict(X)

    # Blend the Ridge and CatBoost predictions.
    final_blend_2 = [x * 0.2 + y * 0.8 for x, y in zip(preds_ridge, preds_cat)]

    # Clipping the values to the range 0-90 was also important, since we know the target variable lies between 0 and 100.
    final_blend_2 = np.clip(final_blend_2, a_min=0, a_max=90)

    # Apply a form of regularization to the final blend by subtracting a constant from the predictions and clipping again.
    exp = final_blend_2 - 0.48
    exp = np.clip(exp, a_min=0, a_max=90)

    ## Retraining predictions

    # Retrain on the test data, using the predictions of the stacked regressors as the target.
    # We also added the clusters, but had to manually mean-encode them against the target
    # variable, since LinearRegression cannot handle categorical variables.
    X = tes.copy()

    X['cluster'] = KMeans(150, random_state=2020).fit(X).predict(X)
    X['target'] = exp
    X['encoded'] = X['cluster'].map(X.groupby('cluster')['target'].mean())
    y = X.target
    X = X.drop(['cluster', 'target'], axis=1)
    preds_1 = CatBoostRegressor(verbose=False, random_state=2020).fit(
        X, y).predict(X) * 0.7 + LinearRegression().fit(X, y).predict(X) * 0.3
    preds_2 = CatBoostRegressor(verbose=False, random_state=2020).fit(
        X, y).predict(X) * 0.5 + LinearRegression().fit(X, y).predict(X) * 0.5
    preds_3 = CatBoostRegressor(verbose=False, random_state=2020).fit(
        X, y).predict(X) * 0.6 + LinearRegression().fit(X, y).predict(X) * 0.4

    final = [
        x * 0.3 + y * 0.3 + z * 0.4
        for x, y, z in zip(preds_1, preds_2, preds_3)
    ]

    ## Further retraining of predictions

    # Retrain again, this time using Regularized Greedy Forests and CatBoost.
    X['final'] = final
    y = X.final
    X = X.drop('final', axis=1)
    preds_1 = CatBoostRegressor(verbose=False, random_state=2020).fit(
        X, y).predict(X) * 0.7 + RGFRegressor().fit(X, y).predict(X) * 0.3
    preds_2 = CatBoostRegressor(verbose=False, random_state=2020).fit(
        X, y).predict(X) * 0.5 + RGFRegressor().fit(X, y).predict(X) * 0.5
    preds_3 = CatBoostRegressor(verbose=False, random_state=2020).fit(
        X, y).predict(X) * 0.6 + RGFRegressor().fit(X, y).predict(X) * 0.4

    final2 = [
        x * 0.3 + y * 0.3 + z * 0.4
        for x, y, z in zip(preds_1, preds_2, preds_3)
    ]

    return final2
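
# Hedged diversity-check sketch referenced in the comment at the top of learner():
# out-of-fold predictions of a few of the base models are compared through their
# pairwise correlation -- lower correlation suggests the models are diverse enough
# to be worth stacking. It assumes the model classes above (XGBRegressor,
# RandomForestRegressor, Lasso, KNeighborsRegressor) are already imported.
import pandas as pd
from sklearn.model_selection import cross_val_predict


def diversity_report(X_train, y_train):
    base_models = {
        'xgb': XGBRegressor(random_state=2020, objective='reg:squarederror'),
        'rf': RandomForestRegressor(random_state=2020),
        'lasso': Lasso(random_state=2020),
        'knn': KNeighborsRegressor(n_neighbors=4),
    }
    # cross-validated (out-of-fold) predictions for each base model
    oof = {name: cross_val_predict(model, X_train, y_train, cv=5)
           for name, model in base_models.items()}
    # pairwise Pearson correlation of the out-of-fold predictions
    return pd.DataFrame(oof).corr()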