Example no. 1
def test_features_in_secondary():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    rf = RandomForestRegressor(random_state=2)
    ridge = Ridge(random_state=0)
    svr_rbf = SVR(kernel='rbf')
    stack = StackingRegressor(regressors=[svr_lin, lr, ridge, rf],
                              meta_regressor=svr_rbf,
                              use_features_in_secondary=True)

    stack.fit(X1, y).predict(X1)
    mse = 0.14
    got = np.mean((stack.predict(X1) - y)**2)
    print(got)
    assert round(got, 2) == mse

    stack = StackingRegressor(regressors=[svr_lin, lr, ridge, rf],
                              meta_regressor=svr_rbf,
                              use_features_in_secondary=False)

    # dense
    stack.fit(X1, y).predict(X1)
    mse = 0.12
    got = np.mean((stack.predict(X1) - y)**2)
    print(got)
    assert round(got, 2) == mse
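These tests reference module-level fixtures (X1, X2, y, y2, w) that the snippets do not show. A minimal sketch of a plausible setup, modeled on the data-generation code in Example no. 34 (all values assumed):

import numpy as np

# Assumed fixtures for the test snippets in these examples (shapes inferred from use).
np.random.seed(1)
X1 = np.sort(5 * np.random.rand(40, 1), axis=0)  # one feature
X2 = np.sort(5 * np.random.rand(40, 2), axis=0)  # two features
y = np.sin(X1).ravel()                           # univariate target
y[::5] += 3 * (0.5 - np.random.rand(8))          # add noise, as in Example no. 34
y2 = np.column_stack((y, y))                     # multivariate target for test_multivariate_class
w = np.random.random(40)                         # sample weights for test_sample_weight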
Example no. 2
def test_features_in_secondary():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    rf = RandomForestRegressor(n_estimators=10, random_state=2)
    ridge = Ridge(random_state=0)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stack = StackingRegressor(regressors=[svr_lin, lr, ridge, rf],
                              meta_regressor=svr_rbf,
                              use_features_in_secondary=True)

    stack.fit(X1, y).predict(X1)
    mse = 0.14
    got = np.mean((stack.predict(X1) - y) ** 2)
    print(got)
    assert round(got, 2) == mse

    stack = StackingRegressor(regressors=[svr_lin, lr, ridge, rf],
                              meta_regressor=svr_rbf,
                              use_features_in_secondary=False)

    # dense
    stack.fit(X1, y).predict(X1)
    mse = 0.12
    got = np.mean((stack.predict(X1) - y) ** 2)
    print(got)
    assert round(got, 2) == mse
Example no. 3
    def test(self):
        df = pd.read_csv('MorganMACCS.csv')
        baseDf = df
        extractDf = df['CAS'].isin(ejectCAS)
        df = df[~df['CAS'].isin(ejectCAS)]
        y = df['logTox']
        dropList = ['CAS', 'toxValue', 'logTox', 'HDonor', 'HAcceptors',
                    'AromaticHeterocycles', 'AromaticCarbocycles',
                    'FractionCSP3']
        # dropList = ['CAS', 'toxValue', 'logTox']
        X = df.drop(columns=dropList)
        # Normalize: drop all-zero fingerprint-bit columns, z-score the rest
        for name in X.columns:
            if name.isdecimal():
                if X[name].sum() == 0:
                    print(name)
                    X = X.drop(columns=name)
            else:
                std = X[name].std()
                mean = X[name].mean()
                X[name] = X[name].apply(lambda x: (x - mean) / std)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                            random_state=2)

        # cols = np.arange(1, 550, 1).tolist()
        # cols = X.columns.tolist()
        cols = [1, 2, 3]
        # Initializing Classifiers
        reg1 = Ridge(random_state=1)
        #reg2 = ExtraTreesRegressor()
        reg2 = ExtraTreesRegressor(n_estimators=50, max_features=50,
                                   min_samples_split=5, max_depth=50,
                                   min_samples_leaf=5)
        reg3 = SVR(gamma='auto', kernel='linear')
        reg4 = LGBMRegressor(boosting_type='gbdt', num_leaves=60,
                             learning_rate=0.06)
        pls = PLSRegression(n_components=3)
        pipe1 = make_pipeline(ColumnSelector(cols=cols), ExtraTreesRegressor(n_estimators=50))
        #linear =SGDRegressor(max_iter=1000)
        rgf = RGFRegressor(max_leaf=1000, algorithm="RGF", test_interval=100,
                           loss="LS", verbose=False, l2=1.0)
        nbrs = KNeighborsRegressor(2)
        pipe2 = make_pipeline(ColumnSelector(cols=cols), KNeighborsRegressor(31))

        meta = ExtraTreesRegressor(n_estimators=50, max_features=7,
                                   min_samples_split=5, max_depth=50,
                                   min_samples_leaf=5)

        stackReg = StackingRegressor(regressors=[reg1, reg2, reg3, pipe1,
                                                 pls, nbrs, rgf],
                                     meta_regressor=meta, verbose=1)
        stackReg.fit(X_train, y_train)
        y_pred = stackReg.predict(X_train)
        y_val = stackReg.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))

        reg4.fit(X_train, y_train)
        y_pred = reg4.predict(X_train)
        y_val = reg4.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))
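calcRMSE and calcCorr are not defined in this snippet; a minimal sketch of what they might compute, inferred from how they are called above (implementations assumed):

import numpy as np

def calcRMSE(y_pred, y_true):
    # root mean squared error between predictions and targets
    return np.sqrt(np.mean((np.asarray(y_pred) - np.asarray(y_true)) ** 2))

def calcCorr(y_pred, y_true):
    # Pearson correlation coefficient between predictions and targets
    return np.corrcoef(np.asarray(y_pred), np.asarray(y_true))[0, 1]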
Example no. 4
class RegressorBlender:
    def __init__(self, x_train, x_test, y_train, y_test=None):
        x_train.drop(['Unnamed: 0', 'Id'], axis=1, inplace=True)
        x_test.drop(['Unnamed: 0', 'Id'], axis=1, inplace=True)
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train['y'].values
        if y_test is not None:
            self.y_test = y_test['y'].values

    def reg_blend(self):
        meta_reg = LinearRegression()
        reg1 = model.svm_regressor()
        reg2 = model.randomforest_regressor()
        reg3 = model.xgb_regressor()
        self.blend = StackingRegressor(regressors=[reg1, reg2, reg3],
                                       meta_regressor=meta_reg)
        self.blend.fit(self.x_train, self.y_train)
        return self.blend

    def score(self):
        scores = cross_val_score(self.blend,
                                 X=self.x_train,
                                 y=self.y_train,
                                 cv=10,
                                 verbose=2)
        return scores

    def prediction(self):
        y_pred = self.blend.predict(self.x_test)
        return y_pred
Example no. 5
class Blend:
    def __init__(self, x_train, x_test, y_train):
        x_train.drop(['Unnamed: 0', 'PromoInterval', 'Date'],
                     axis=1,
                     inplace=True)
        x_test.drop(['Unnamed: 0', 'Id', 'PromoInterval', 'Date'],
                    axis=1,
                    inplace=True)
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train['Sales'].values

    def blending(self):
        meta_reg = LinearRegression()
        reg1 = model.svm_regressor()
        reg2 = model.randomforest_regressor()
        reg3 = model.xgb_regressor()
        self.blend = StackingRegressor(regressors=[reg1, reg2, reg3],
                                       meta_regressor=meta_reg)
        self.blend.fit(self.x_train, self.y_train)
        return self.blend

    def score(self):
        scores = cross_val_score(self.blend,
                                 X=self.x_train,
                                 y=self.y_train,
                                 cv=10,
                                 verbose=2)  # scoring='neg_mean_squared_error'
        return scores

    def prediction(self):
        y_pred = self.blend.predict(self.x_test)
        y_pred = np.expm1(y_pred)
        return y_pred
Example no. 6
def test_predict_meta_features():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf')
    ridge = Ridge(random_state=1)
    stregr = StackingRegressor(regressors=[lr, ridge], meta_regressor=svr_rbf)
    X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.3)
    stregr.fit(X_train, y_train)
    test_meta_features = stregr.predict(X_test)
    assert test_meta_features.shape[0] == X_test.shape[0]
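Despite the variable name, the test above calls the ordinary predict. mlxtend's StackingRegressor also exposes predict_meta_features, which returns the base regressors' predictions fed to the meta-regressor; a short sketch against the same fitted model:

# Inspect the meta-features directly: one column per base regressor.
meta_features = stregr.predict_meta_features(X_test)
assert meta_features.shape == (X_test.shape[0], 2)  # two base regressors here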
Example no. 7
def test_multivariate_class():
    lr = LinearRegression()
    ridge = Ridge(random_state=1)
    meta = LinearRegression(normalize=True)
    stregr = StackingRegressor(regressors=[lr, ridge], meta_regressor=meta)
    stregr.fit(X2, y2).predict(X2)
    mse = 0.122
    got = np.mean((stregr.predict(X2) - y2)**2)
    assert round(got, 3) == mse
Example no. 8
 def stacking(self):
     from sklearn.svm import SVR
     from sklearn.pipeline import make_pipeline
     from sklearn.preprocessing import RobustScaler, MinMaxScaler
     from sklearn.preprocessing import StandardScaler
     from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
     from xgboost import XGBRegressor
     import lightgbm as lgb
     from lightgbm import LGBMRegressor
     import xgboost as xgb
     from mlxtend.regressor import StackingRegressor
     import scipy as sc
     s = make_pipeline(RobustScaler(), SVR(kernel='rbf', C=10, gamma=0.005))
     rf = make_pipeline(
         RandomForestRegressor(random_state=641,
                               n_estimators=250,
                               max_depth=9))
     GBoost = GradientBoostingRegressor(n_estimators=330,
                                        learning_rate=0.01,
                                        max_depth=12,
                                        max_features='sqrt',
                                        min_samples_leaf=1,
                                        min_samples_split=42,
                                        loss='ls',
                                        random_state=40,
                                        subsample=1)
     model_xgb = xgb.XGBRegressor(colsample_bytree=1,
                                  gamma=5,
                                  learning_rate=0.01,
                                  max_depth=11,
                                  min_child_weight=1.7817,
                                  n_estimators=500,
                                  reg_alpha=0.8,
                                  reg_lambda=5,
                                  subsample=0.5213,
                                  silent=1,
                                  seed=1024,
                                  nthread=-1)
     model_lgb = LGBMRegressor(objective='regression',
                               num_leaves=4,
                               learning_rate=0.05,
                               n_estimators=290,
                               max_bin=147,
                               subsample=0.65,
                               colsample_bytree=0.7,
                               feature_fraction_seed=46,
                               subsample_freq=9,
                               min_child_samples=20,
                               min_child_weight=0.001)
     regressors = [s, rf, GBoost, model_lgb, model_xgb]
     stregr = StackingRegressor(regressors=regressors,
                                meta_regressor=model_xgb)
     stregr.fit(self.X_train, self.y_train)
     print("the model is staking and the test's pearsonr is: ",
           sc.stats.pearsonr(self.y_test, stregr.predict(self.X_test))[0])
     return stregr
Example no. 9
def test_predict_meta_features():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf')
    ridge = Ridge(random_state=1)
    stregr = StackingRegressor(regressors=[lr, ridge],
                               meta_regressor=svr_rbf)
    X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.3)
    stregr.fit(X_train, y_train)
    test_meta_features = stregr.predict(X_test)
    assert test_meta_features.shape[0] == X_test.shape[0]
Example no. 10
def test_multivariate_class():
    lr = LinearRegression()
    ridge = Ridge(random_state=1)
    meta = LinearRegression(normalize=True)
    stregr = StackingRegressor(regressors=[lr, ridge],
                               meta_regressor=meta)
    stregr.fit(X2, y2).predict(X2)
    mse = 0.122
    got = np.mean((stregr.predict(X2) - y2) ** 2)
    assert round(got, 3) == mse
Example no. 11
def test_multivariate():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    stregr.fit(X2, y).predict(X2)
    mse = 0.218
    got = np.mean((stregr.predict(X2) - y) ** 2)
    assert round(got, 3) == mse
Example no. 12
def test_multivariate():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    stregr.fit(X2, y).predict(X2)
    mse = 0.22
    got = np.mean((stregr.predict(X2) - y)**2)
    assert round(got, 2) == mse
Example no. 13
def test_different_models():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    stregr.fit(X1, y).predict(X1)
    mse = 0.21
    got = np.mean((stregr.predict(X1) - y)**2)
    assert round(got, 2) == mse
Example no. 14
def test_multivariate_class():
    lr = LinearRegression()
    ridge = Ridge(random_state=1)
    meta = LinearRegression(normalize=True)
    stregr = StackingRegressor(regressors=[lr, ridge], meta_regressor=meta)
    stregr.fit(X2, y2).predict(X2)
    mse = 0.12
    got = np.mean((stregr.predict(X2) - y2)**2.)
    # there seems to be an issue with the following test on Windows
    # sometimes via Appveyor
    assert round(got, 2) == mse, got
Example no. 15
def test_different_models():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    y_pred = stregr.fit(X1, y).predict(X1)
    mse = 0.214
    got = np.mean((stregr.predict(X1) - y) ** 2)
    assert round(got, 3) == mse
Example no. 16
def test_multivariate_class():
    lr = LinearRegression()
    ridge = Ridge(random_state=1)
    meta = LinearRegression(normalize=True)
    stregr = StackingRegressor(regressors=[lr, ridge],
                               meta_regressor=meta)
    stregr.fit(X2, y2).predict(X2)
    mse = 0.12
    got = np.mean((stregr.predict(X2) - y2) ** 2.)
    # there seems to be an issue with the following test on Windows
    # sometimes via Appveyor
    assert round(got, 2) == mse, got
Example no. 17
 def mlx_reg_1(self):
     lr, lr_pred = self.linear_regr()
     rf, rf_pred = self.random_forest_regr()
     lasso, lasso_pred = self.lasso_regr()
     sclf = StackingRegresorMLX(
         regressors=[lr, rf, lasso],
         meta_regressor=RandomForestRegressor(ccp_alpha=0.1,
                                              max_features="auto",
                                              n_estimators=30)
     )
     sclf.fit(self.x_train, self.y_train)
     return sclf.predict(self.x_test)
Example no. 18
def test_sparse_matrix_inputs_and_features_in_secondary():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    rf = RandomForestRegressor(n_estimators=10, random_state=2)
    ridge = Ridge(random_state=0)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stack = StackingRegressor(regressors=[svr_lin, lr, ridge, rf],
                              meta_regressor=svr_rbf,
                              use_features_in_secondary=True)

    # dense
    stack.fit(X1, y).predict(X1)
    mse = 0.14
    got = np.mean((stack.predict(X1) - y)**2)
    assert round(got, 2) == mse

    # sparse
    stack.fit(sparse.csr_matrix(X1), y)
    mse = 0.14
    got = np.mean((stack.predict(sparse.csr_matrix(X1)) - y)**2)
    assert round(got, 2) == mse
Example no. 19
def regressionStacking(df):

    # StackingRegressor expects ndarray input data

    X_train, X_test, y_train, y_test = trainDataSplit(df)

    randomforest_regressor = RandomForestRegressor()

    # # lightgbm is not a scikit-learn estimator, so mlxtend does not support it
    # lgb_train = lightgbm.Dataset(X_train, y_train)
    # lgb_eval = lightgbm.Dataset(X_test, y_test, reference=lgb_train)
    #
    # # specify your configurations as a dict
    # params = {
    #     'task': 'train',
    #     'boosting_type': 'gbdt',
    #     'objective': 'regression',
    #     'metric': {'l2', 'auc'},
    #     'num_leaves': 2 ** 10,
    #     'learning_rate': 1.0,
    #     'feature_fraction': 0.9,
    #     'bagging_fraction': 0.8,
    #     'bagging_freq': 5,
    #     'verbose': 0
    # }
    # lightgbm_regressor = lightgbm.train(params,
    #                            lgb_train,
    #                            num_boost_round=20,
    #                            valid_sets=lgb_eval,
    #                            early_stopping_rounds=5)

    lasso_regressor = Lasso()

    dnn_regressor = MLPRegressor()

    linearRegression_regressor = LinearRegression()

    stacking_regressor = StackingRegressor(
        regressors=[randomforest_regressor, lasso_regressor, dnn_regressor],
        meta_regressor=linearRegression_regressor)

    stacking_regressor.fit(X_train, y_train)

    y_pred = stacking_regressor.predict(X_test)

    criterion_df, predict_result = predictResultOutput(stacking_regressor,
                                                       X_test, y_test, y_pred)

    # save model
    joblib.dump(stacking_regressor, 'stacking.model')

    return criterion_df, predict_result
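Per the comment at the top of regressionStacking, StackingRegressor expects ndarray inputs; if trainDataSplit returns pandas objects, a conversion along these lines may be needed (a sketch, assuming DataFrame/Series return values):

import numpy as np

# Convert pandas outputs to ndarrays before stacking.
X_train, X_test, y_train, y_test = trainDataSplit(df)
X_train, X_test = np.asarray(X_train), np.asarray(X_test)
y_train, y_test = np.asarray(y_train), np.asarray(y_test)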
Example no. 20
def sbg_mlxtend_ensamble(iterate):
    iterate += 501
    lin_mod = linear_model.LinearRegression()
    bsn_rdg = linear_model.BayesianRidge()
    elstc_nt = ElasticNet(alpha=0.2, l1_ratio=1)
    ridge = Ridge(alpha=0.01, tol=0.1, solver='sag')
    svr_rbf = svm.SVR(kernel='rbf', C=1e3, gamma=0.1)
    sgd_reg = linear_model.SGDRegressor(penalty='l2', alpha=0.001, n_iter=1000)
    lasso_reg = linear_model.Lasso(alpha=1,
                                   max_iter=3000,
                                   normalize=True,
                                   selection='random',
                                   tol=0.001)
    rndm_frst = RandomForestRegressor(max_depth=5, n_estimators=10)

    stregr = StackingRegressor(regressors=[sgd_reg, rndm_frst],
                               meta_regressor=ridge)

    X_train, X_test, y_train, y_test = train_test_split(df_X,
                                                        df_Y2,
                                                        test_size=0.20,
                                                        random_state=iterate)
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    stregr.fit(X_train, y_train)
    y_pred = stregr.predict(X_test)

    #print("Mean Squared Error: %.4f"
    #      % np.mean((y_pred - y_test.values) ** 2))
    #print('Variance Score: %.4f' % stregr.score(X_test, y_test.values))

    dev_Memory = abs(y_pred - y_test.values)
    mean_dev = np.mean(dev_Memory)
    mse_Memory = np.sqrt(np.sum(dev_Memory**2) / dev_Memory.size)
    mape = np.mean(dev_Memory / y_test.values)
    max_pe = np.max(dev_Memory)
    max_ne = np.max(np.negative(dev_Memory))
    new_data1 = pd.DataFrame(y_pred)
    new_data2 = pd.DataFrame(y_test.values)
    new_data = pd.merge(new_data1,
                        new_data2,
                        left_index=True,
                        right_index=True)

    filename12 = r'C:\Users\epatdeb\AlphaCANDI\SBG_Rawinput_1.6\latest\Logs\AlphaCandi17_MlxEnsmbl_Memory.log'
    logging.basicConfig(filename=filename12, level=logging.DEBUG)
    logging.info(
        "tensor_bp sbg_mlxtend_ensamble iter:%s \n \n y_pred/y_test: \n %s \n mae:%s mse:%s mape:%s max_pe:%s max_ne:%s",
        iterate, new_data, mean_dev, mse_Memory, mape, max_pe, max_ne)
    logging.shutdown()

    return mean_squared_error(y_test, y_pred), mean_dev, mape
Example no. 21
def Gbc():
    from sklearn.ensemble import GradientBoostingClassifier, AdaBoostRegressor
    from sklearn.linear_model import LogisticRegression
    from mlxtend.regressor import StackingRegressor
    from sklearn.svm import SVR
    adaboost = AdaBoostRegressor()
    lr = LogisticRegression()
    gb = GradientBoostingClassifier()
    svr = SVR(kernel='linear')
    svr_rbf = SVR(kernel='rbf')
    regressors = [svr, adaboost, gb]
    stregr = StackingRegressor(regressors=regressors, meta_regressor=svr_rbf)
    stregr.fit(X_train, y_train)
    outpred = stregr.predict(X_valid)
    evaluate_strategy(outpred)
Example no. 22
def test_sample_weight():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    pred1 = stregr.fit(X1, y, sample_weight=w).predict(X1)
    mse = 0.22
    got = np.mean((stregr.predict(X1) - y)**2)
    assert round(got, 2) == mse
    # make sure that this is not equivalent to the model with no weight
    pred2 = stregr.fit(X1, y).predict(X1)
    maxdiff = np.max(np.abs(pred1 - pred2))
    assert maxdiff > 1e-3, "max diff is %.4f" % maxdiff
Example no. 23
def test_sample_weight():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    pred1 = stregr.fit(X1, y, sample_weight=w).predict(X1)
    mse = 0.22
    got = np.mean((stregr.predict(X1) - y) ** 2)
    assert round(got, 2) == mse
    # make sure that this is not equivalent to the model with no weight
    pred2 = stregr.fit(X1, y).predict(X1)
    maxdiff = np.max(np.abs(pred1 - pred2))
    assert maxdiff > 1e-3, "max diff is %.4f" % maxdiff
Example no. 24
def train_model(X_train, y_train):
    clf1 = LinearSVR()
    clf2 = LinearRegression()
    clf3 = Ridge()
    clf4 = LGBMRegressor()

    svr_linear = LinearSVR()
    sr = StackingRegressor(regressors=[clf1, clf2, clf3, clf4],
                           meta_regressor=svr_linear)

    sr.fit(X_train, y_train)
    result = sr.predict(X_train)
    score = get_rmse_score(result, y_train)
    print("RMSE Score train: %.4f" % score)
    return sr
Example no. 25
    def train(self, X, y):
        features = X
        labels = y

        #test train split
        X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.25,
                                                            random_state=4)

        #Ridge
        regcv = linear_model.RidgeCV(
            alphas=[0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75])
        regcv.fit(features, labels)
        regcv.alpha_
        reg = linear_model.Ridge(alpha=regcv.alpha_)
        reg.fit(features, labels)

        # GB
        params = {
            'n_estimators': 100,
            'max_depth': 5,
            'min_samples_split': 2,
            'learning_rate': 0.1,
            'loss': 'ls'
        }
        gbr = ensemble.GradientBoostingRegressor(**params)
        gbr.fit(features, labels)

        #blended model
        meta = linear_model.LinearRegression()
        blender = StackingRegressor(regressors=[reg, gbr], meta_regressor=meta)
        _ = blender.fit(features, labels)
        y_pred = blender.predict(X_test)

        print "***** TRAINING STATS ********"
        scores = cross_val_score(blender, features, labels, cv=10)
        print("Accuracy: %0.2f (+/- %0.2f)" %
              (scores.mean(), scores.std() * 2))

        mean_diff = np.mean(np.abs(np.exp(Y_test) - np.exp(y_pred)))
        p_mean_diff = np.mean(mean_diff / np.exp(Y_test))
        print "Mean Error:\t %.0f/%0.3f%%" % (mean_diff, p_mean_diff * 100)
        print "***** TRAINING STATS ********"

        return blender
Example no. 26
  def train(self, X,y):
    features = X
    labels = y

    #test train split
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25, random_state=4)

    #Ridge
    regcv = linear_model.RidgeCV(alphas=[0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75])
    regcv.fit(features, labels)
    regcv.alpha_  
    reg = linear_model.Ridge(alpha=regcv.alpha_)
    reg.fit(features, labels)

    # GB
    params = {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 2,
              'learning_rate': 0.1, 'loss': 'ls'}
    gbr = ensemble.GradientBoostingRegressor(**params)
    gbr.fit(features, labels)


    #blended model
    meta = linear_model.LinearRegression()
    blender = StackingRegressor(regressors=[reg, gbr], meta_regressor=meta)
    _ = blender.fit(features, labels)
    y_pred = blender.predict(X_test)

    print "***** TRAINING STATS ********"
    scores = cross_val_score(blender, features, labels, cv=10)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    mean_diff = np.mean(np.abs(np.exp(Y_test)-np.exp(y_pred)))
    p_mean_diff = np.mean(mean_diff/np.exp(Y_test))
    print "Mean Error:\t %.0f/%0.3f%%" % (mean_diff, p_mean_diff*100)
    print "***** TRAINING STATS ********"
    
    return blender
Example no. 27
    def stackModel(self):
        train_X = self.X.as_matrix()
        train_Y = self.Y.as_matrix()

        test_X = self.Test.as_matrix()

        # train_X = data_scaler(train_X)

        X_train, X_test, y_train, y_test = train_test_split(train_X, train_Y, test_size=0.2, random_state=1)

        gbdt = GradientBoostingRegressor(loss='ls', alpha=0.9,
                                         n_estimators=500,
                                         learning_rate=0.05,
                                         max_depth=8,
                                         subsample=0.8,
                                         min_samples_split=9,
                                         max_leaf_nodes=10)
        xgb = XGBRegressor(max_depth=5, n_estimators=500, learning_rate=0.05, silent=False)
        lr = LinearRegression()
        rfg = RandomForestRegressor(bootstrap=False, max_features=0.05, min_samples_leaf=11, min_samples_split=8,
                                    n_estimators=100)
        svr_rbf = SVR(kernel='rbf')

        stregr = StackingRegressor(regressors=[gbdt, xgb, lr, rfg], meta_regressor=svr_rbf)

        stregr.fit(X_train, y_train)
        stregr.predict(X_train)

        # Evaluate and visualize the fit

        print("Mean Squared Error: %.6f" % np.mean((stregr.predict(X_train) - y_train) ** 2) ** 0.5)
        error(stregr.predict(X_test), y_test)

        # online
        result = stregr.predict(test_X)
        save_to_file(result, self.uid, "../result/result_12.09_2_stacking.csv")

        with plt.style.context(('seaborn-whitegrid')):
            plt.scatter(X_train, y_train, c='lightgray')
            plt.plot(X_train, stregr.predict(X_train), c='darkgreen', lw=2)

        plt.show()
Example no. 28
plt.ylabel('Accuracy')
plt.show()

# In[368]:

from mlxtend.regressor import StackingRegressor
lr = LinearRegression()
sclf = StackingRegressor(regressors=[grid_search, abr, rfr], meta_regressor=lr)
print('3-fold cross validation:\n')
for clf, label in zip([grid_search, abr, rfr, sclf],
                      ['grid_search', 'abr', 'rfr', 'StackingRegressor']):
    scores = cross_val_score(clf, X, y)

    print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
          (scores.mean(), scores.std(), label))
sclf.fit(X_train, y_train)
predictions = sclf.predict(X_test)

# In[370]:

train_sizes, train_score, test_score = learning_curve(
    sclf, X, y, train_sizes=[0.1, 0.2, 0.4, 0.6, 0.8, 1], cv=3)
train_error = 1 - np.mean(train_score, axis=1)
test_error = 1 - np.mean(test_score, axis=1)
plt.plot(train_sizes, 1 - train_error, 'o-', color='r', label='training')
plt.plot(train_sizes, 1 - test_error, 'o-', color='g', label='testing')
plt.legend(loc='best')
plt.xlabel('training examples')
plt.ylabel('Accuracy')
plt.show()
Example no. 29
                  num_boost_round=2889,
                  early_stopping_rounds=50,
                  evals=watchlist)

rfreg = RandomForestRegressor(random_state=1, max_depth=15)
ridge_reg = Ridge(normalize=True)
lasso_reg = Lasso()
linear_reg = LinearRegression(normalize=True)
stacking_reg = StackingRegressor(regressors=[rfreg, ridge_reg, lasso_reg],
                                 meta_regressor=linear_reg)

feature = [x for x in train_zero_var.columns if x not in ['Value']]
# X_train, X_test, y_train, y_test = train_test_split(train_zero_var[feature], train_zero_var['Value'], test_size=0.2,
#                                                     random_state=0)
stacking_reg.fit(X_train, y_train)
stacking_test = pd.DataFrame(stacking_reg.predict(X_test))
stacking_test.columns = ['stacking_pred']
y_test = pd.DataFrame(y_test)
y_test.columns = ['Value']
mean_squared_error(stacking_test['stacking_pred'], y_test['Value'])

train_zero_var = train_zero_var.reset_index()

# predict for Random Forest
rf_pred = pd.DataFrame()
for idx in range(0, 5):
    train = train_zero_var[train_zero_var['index'] % 5 != idx]
    test = train_zero_var[train_zero_var['index'] % 5 == idx]
    stacking_feature = [
        x for x in train.columns if x not in ['index', 'Value']
    ]
Example no. 30
                               nthread=-1)
gbm_b = GradientBoostingRegressor(learning_rate=0.05,
                                  n_estimators=2000,
                                  max_depth=4,
                                  max_features='log2',
                                  min_samples_leaf=15,
                                  min_samples_split=10,
                                  loss='huber')

stackmodel = StackingRegressor(
    regressors=[ElNet_b, lasso_b, ridge_b, svr_b, model_xgb_b, gbm_b],
    meta_regressor=Lasso(alpha=0.00035))

stackmodel.fit(x_train, y_train)

stacked = stackmodel.predict(x_test)
rmse_stacked = np.sqrt(mean_squared_error(y_train,
                                          stackmodel.predict(x_train)))
stacked_pred = np.expm1(stacked)

# Averaged model
ensembled = np.expm1((0.25 * ridge.predict(x_test).reshape(-1, 1)) +
                     (0.2 * ElNet.predict(x_test).reshape(-1, 1)) +
                     (0.2 * lasso.predict(x_test).reshape(-1, 1)) +
                     (0.15 * model_xgb.predict(x_test).reshape(-1, 1)) +
                     (0.2 * GBoost.predict(x_test).reshape(-1, 1)))

# Print the performance of each model
obj = pd.DataFrame([[
    score_ridge, rdg_trainRMSE, rdg_testRMSE,
    rmse_ridge_test - rmse_ridge_train, 0.11866
Example no. 31
def predict():
    '''
    For rendering results on HTML GUI
    '''
    features = [x for x in request.form.values()]
    #final_features = [np.array(int_features)]
    #prediction = model.predict(final_features)

    #output = round(prediction[0], 2)

    features = np.array(features)
    features = features.reshape(1, 6)
    features = pd.DataFrame(data=features,
                            columns=[
                                'Name', 'Genre', 'Comments', 'Likes',
                                'Popularity', 'Followers'
                            ])
    df = pd.read_csv('data.csv')
    cv = {'Comments': int, 'Likes': int, 'Popularity': int, 'Followers': int}
    df = df.astype(cv)
    features = features.astype(cv)
    #x=df[df['Views']==0].index

    df.drop(index=df[df['Views'] < df['Likes']].index, axis=1, inplace=True)
    df.drop(index=df[df['Views'] < df['Comments']].index, axis=1, inplace=True)
    df.drop(index=df[df['Views'] < df['Popularity']].index,
            axis=1,
            inplace=True)

    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    (df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))
    df = df[~((df < (Q1 - 3 * IQR)) | (df > (Q3 + 3 * IQR))).any(axis=1)]

    df = df.drop(
        columns=['Unique_ID', 'Country', 'Song_Name', 'Timestamp', 'index'])

    y = df['Views']
    df = df.drop(columns=['Views'])

    be = BinaryEncoder()
    df = be.fit_transform(df)
    f = be.transform(features)

    X = df.iloc[:, :]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)

    rg1 = AdaBoostRegressor()
    rg1.fit(X_train, y_train)
    #ypred=rg1.predict(X_test)
    #sqrt(mean_squared_error(y_test,ypred))

    rg2 = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1)
    # para={'n_estimators':[250,300],'learning_rate':[1,0.1,0.01]}
    # grid=GridSearchCV(estimator=rg8,param_grid=para,verbose=1,cv=10,n_jobs=-1)
    rg2.fit(X_train, y_train)
    #ypred=rg2.predict(X_test)
    #sqrt(mean_squared_error(y_test,ypred))

    rg3 = RandomForestRegressor(random_state=0, n_estimators=20, max_depth=15)
    # para={'n_estimators':[5,10,30,20],'max_depth':[5,8,20,17]}
    # grid=GridSearchCV(estimator=rg9,param_grid=para,cv=10,verbose=1,n_jobs=-1)
    rg3.fit(X_train, y_train)
    #ypred=rg3.predict(X_test)
    #sqrt(mean_squared_error(y_test,ypred))

    rg6 = StackingRegressor([rg1, rg2], meta_regressor=rg3)
    rg6.fit(X_train, y_train)
    #ypred=rg6.predict(X_test)
    #sqrt(mean_squared_error(y_test,ypred))
    f = f.iloc[:, :]
    y_pred = rg6.predict(f)

    y_pred = y_pred.astype(int)

    return render_template(
        'index.html', prediction_text='Number of Views is {}'.format(y_pred))
Example no. 32
#==============================================================================
#   4) LGBMRegressor model
#==============================================================================
#    from lightgbm import LGBMRegressor
#    
#    model_lgb = LGBMRegressor()
#==============================================================================
#     5) Model stacking (fusion)
#==============================================================================
    from mlxtend.regressor import StackingRegressor    
#
    regressors = [model_xgb,model_rfg,model_gb]
    model = StackingRegressor(regressors=regressors, meta_regressor=model_xgb)

#    model = model_gb
    model.fit(train_text, train_labels)
    
#    print('The parameters of the best model are: ')
#    print(model.best_params_)
  
    preds = model.predict(train_text)
    print('The pearsonr of training set is {}'.format(pearsonr(list(train_labels), list(preds))[0]))
    print('The MSE of training set is {}'.format(mean_squared_error(list(train_labels), list(preds))))
      
    #==============================================================================
    # Predict on the test set
    #==============================================================================   
    preds = model.predict(test_text)
    
    print('The pearsonr of test set is {}'.format(pearsonr(list(test_labels), list(preds))[0]))
    print('The MSE of test set is {}'.format(mean_squared_error(list(test_labels), list(preds))))
Example no. 33
    'max_bin': 8192,
    'verbosity': 10
}

modelL1 = lgb.LGBMRegressor(**params)
modelL2 = lgb.LGBMRegressor(**params2)
metaregr = Ridge(solver="sag", max_iter=300)
stregr = StackingRegressor(regressors=[modelR1, modelR3, modelL1, modelL2],
                           meta_regressor=metaregr,
                           verbose=10)
stregr.fit(X, y)
print('Weights/Iter of Regressors=', stregr.coef_)
#preds = stregr.predict(X)
#print('RMSE=', mean_squared_error(y, preds)**0.5)

pred_test = stregr.predict(X_test)

submission['price'] = np.expm1(pred_test)
submission.to_csv("FF_LR_Meta.csv", index=False)

#==============================================================================
# GridSearch Cross Validation
# paramsGSCV = {
#           'ridge__alpha': [0.4, 1.0],
#           'lgbmregressor__max_depth': [4, 5],
#           'meta-ridge__alpha': [1.0]
#         }
# stregr.get_params().keys() #Print list of available parameters
# grid = GridSearchCV(estimator=stregr,
#                     param_grid=paramsGSCV,
#                     cv=4,
Example no. 34
y[::5] += 3 * (0.5 - np.random.rand(8))

# Initializing models

lr = LinearRegression()
svr_lin = SVR(kernel='linear')
ridge = Ridge(random_state=1)
svr_rbf = SVR(kernel='rbf')

stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                           meta_regressor=svr_rbf)

# Training the stacking regressor

stregr.fit(X, y)
stregr.predict(X)

# Evaluate and visualize the fit

print("Mean Squared Error: %.4f" % np.mean((stregr.predict(X) - y)**2))
print('Variance Score: %.4f' % stregr.score(X, y))

with plt.style.context(('seaborn-whitegrid')):
    plt.scatter(X, y, c='lightgray')
    plt.plot(X, stregr.predict(X), c='darkgreen', lw=2)

plt.show()

# Example 2 - Stacked Regression and GridSearch

from sklearn.model_selection import GridSearchCV
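The snippet stops at the import; a sketch of how the grid search typically continues (the hyperparameter grid is illustrative, and the exact parameter keys depend on the mlxtend version; stregr.get_params().keys() lists them, with the 'meta-<name>__' prefix convention visible in Example no. 36):

# Illustrative grid over base and meta regressors (keys assumed).
params = {'ridge__alpha': [0.1, 1.0, 10.0],
          'svr__C': [0.1, 1.0, 10.0],
          'meta-svr__C': [0.1, 1.0, 10.0]}

grid = GridSearchCV(estimator=stregr, param_grid=params, cv=5, refit=True)
grid.fit(X, y)
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))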
Example no. 35
def main():
    print("Reading in Data")
    # final prediction
    train = pd.read_csv('cleaned_train20180129_111517.csv')
    test = pd.read_csv('cleaned_test20180129_111517.csv')

    # validate results on leaderboard A
    #train = pd.read_csv('cleaned_train20180129_102513.csv')
    #test = pd.read_csv('cleaned_test20180129_102513.csv')

    test = test.drop(['id'], axis=1)
    train = train.drop(['id'], axis=1)
    y_train = train['血糖']

    # pred_proba: blood-glucose group weights for the test set
    threshold = 6.5
    test_num = len(test)
    train_num = len(train)
    bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat, X_train, pred_proba = fuck_columns(
        train, test, threshold)

    print("linear model 开始训练")
    pred_bigger, pred_less, linear_bigger, linear_less = linear_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine predictions with the weights
    linear_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("lasso model 开始训练")
    pred_bigger, pred_less, lasso_bigger, lasso_less = lasso_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine predictions with the weights
    lasso_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("ENet model 开始训练")
    pred_bigger, pred_less, ENet_bigger, ENet_less = ENet_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine predictions with the weights
    ENet_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("集成模型开始训练...")
    print("RandomForestRegressor...")
    pred_bigger, pred_less, rf_bigger, rf_less = rf_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine predictions with the weights
    rf_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("GradientBoostingRegressor...")
    pred_bigger, pred_less, gb_bigger, gb_less = GBoost_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine predictions with the weights
    gb_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("LGBMRegressor...")
    pred_bigger, pred_less, lgb_bigger, lgb_less = LGBM_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine predictions with the weights
    lgb_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("XGBRegressor...")
    pred_bigger, pred_less, xgb_bigger, xgb_less = xgb_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine predictions with the weights
    xgb_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])
    '''
    Stacking Learning
    '''
    print("StackingRegressor...")
    stacked_averaged_bigger_models = StackingRegressor(
        regressors=[linear_bigger, lasso_bigger, ENet_bigger],
        meta_regressor=gb_bigger)
    stacked_averaged_less_models = StackingRegressor(
        regressors=[linear_less, lasso_less, ENet_less],
        meta_regressor=gb_less)
    # fit the stacked models
    stacked_averaged_bigger_models.fit(bigger_thr_X, bigger_thr_y)
    stacked_averaged_less_models.fit(less_thr_X, less_thr_y)
    # predict on the test set
    stacked_bigger_pred = stacked_averaged_bigger_models.predict(test_concat)
    stacked_less_pred = stacked_averaged_less_models.predict(test_concat)
    # combine predictions with the weights
    stacked_pred_res = np.array([
        stacked_less_pred[i] * pred_proba[i][0] +
        stacked_bigger_pred[i] * pred_proba[i][1] for i in range(test_num)
    ])

    ensemble = stacked_pred_res * 0.40 + xgb_pred_res * 0.40 + lgb_pred_res * 0.20
    # blend the linear model with the stacking ensemble
    new_ensemble = np.array([
        linear_pred_res[i] * pred_proba[i][0] + ensemble[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    sub = pd.DataFrame({'pred': ensemble})
    sub_wig = pd.DataFrame({'pred': new_ensemble})
    sub.to_csv('submission_b.csv', header=None, index=False)
    sub_wig.to_csv('submission_b_wig.csv', header=None, index=False)
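All of the list comprehensions above apply the same per-sample weighted blend; a vectorized equivalent using the same variables (a sketch, assuming array-like inputs):

import numpy as np

# Vectorized form of the repeated weighted blends:
# pred_proba[:, 0] weights the 'less' model, pred_proba[:, 1] the 'bigger' one.
proba = np.asarray(pred_proba)
blended = np.asarray(pred_less) * proba[:, 0] + np.asarray(pred_bigger) * proba[:, 1]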
Example no. 36
def useXYtrain(x, y, times):
    flag = 0
    for i in range(0, len(Selected_learnerCode)):
        if Selected_learnerCode[i] != '':
            flag += 1
    if flag == 0:
        print('No proper learner\n')
        return
    stacking_MSE = [[], [], [], [], [], []]
    MSE = [[], [], [], [], [], [], []]
    R_square = [[], [], [], [], [], [], [], []]

    Ada_MSE = []
    Ada_r_square = []

    for i in range(0, times):
        print('Trial ' + str(i + 1) + ':\n')
        Learners_map = {}
        Learners = []
        X_train, X_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.20)
        svr = SVR(C=1.0, epsilon=0.2)
        parameters = {
            'C': np.logspace(-3, 3, 7),
            'gamma': np.logspace(-3, 3, 7)
        }
        print("GridSearch starting...")
        clfsvr = GridSearchCV(svr,
                              parameters,
                              n_jobs=-1,
                              scoring='neg_mean_squared_error')
        clfsvr.fit(X_train, y_train)

        print('The parameters of the best model are: ')
        print(clfsvr.best_params_)
        y_pred = clfsvr.best_estimator_.predict(X_test)
        # drawTrain(y_pred, y_test, 'SVR', i)
        # SVR_MSE.append(mean_squared_error(y_test, y_pred))

        yy = clfsvr.best_estimator_.predict(x)
        R_square[0].append(drawTrain(y, yy, 'SVR', i))
        MSE[0].append(mean_squared_error(y_test, y_pred))

        if 'SVR' in Selected_learnerCode:
            print('SVR Mean squared error is ' +
                  str(mean_squared_error(y_test, y_pred)) + "\n")
            Learners.append(clfsvr.best_estimator_)

        Learners_map['SVR'] = svr
        """ann = Regressor(layers = [Layer("Sigmoid", units=14),
                                   Layer("Linear")],
                         learning_rate = 0.02,
                         random_state = 2018,
                         n_iter = 10)

        ann.fit(X_train,y_train)
        y_pred = ann.predict(X_test)
        print('ANN Mean squared error is ' + str(mean_squared_error(y_test, y_pred)) + "\n")"""

        parameters = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 1000]}
        rfr = RandomForestRegressor(n_estimators=200, random_state=0)
        # drawTrain(rfr, x, y, 'RFR', i)
        # rfr = RandomForestRegressor(n_estimators=200, random_state=0)
        clfrfr = GridSearchCV(rfr,
                              parameters,
                              n_jobs=-1,
                              scoring='neg_mean_squared_error')
        clfrfr.fit(X_train, y_train)
        print('The parameters of the best model are: ')
        print(clfrfr.best_params_)
        y_pred = clfrfr.best_estimator_.predict(X_test)
        yy = clfrfr.best_estimator_.predict(x)
        MSE[1].append(mean_squared_error(y_test, y_pred))
        R_square[1].append(drawTrain(y, yy, 'RFR', i))
        # RFR_MSE.append(mean_squared_error(y_test, y_pred))

        if 'RFR' in Selected_learnerCode:
            print('RFR Mean squared error is ' +
                  str(mean_squared_error(y_test, y_pred)) + "\n")
            Learners.append(clfrfr.best_estimator_)

        Learners_map['RFR'] = rfr

        parameters = {'alpha': np.logspace(-2, 2, 5)}
        lasso = Lasso(alpha=0.05, random_state=1, max_iter=1000)
        # drawTrain(lasso, x, y, 'LASSO', i)
        clflasso = GridSearchCV(lasso,
                                parameters,
                                n_jobs=-1,
                                scoring='neg_mean_squared_error')
        clflasso.fit(X_train, y_train)
        yy = clflasso.best_estimator_.predict(x)
        print('The parameters of the best model are: ')
        print(clflasso.best_params_)
        y_pred = clflasso.best_estimator_.predict(X_test)
        R_square[2].append(drawTrain(y, yy, 'LASSO', i))
        MSE[2].append(mean_squared_error(y_test, y_pred))

        if 'LASSO' in Selected_learnerCode:
            print('LASSO  Mean squared error is ' +
                  str(mean_squared_error(y_test, y_pred)) + "\n")
            # file.write('LASSO  Mean squared error is ' + str(mean_squared_error(y_test, y_pred)) + "\n")
            Learners.append(clflasso.best_estimator_)

        Learners_map['LASSO'] = lasso

        # drawTrain(ENet, X_train, y_train,X_test,y_test, 'Elastic NET', i)
        parameters = {
            'alpha': np.logspace(-2, 2, 5),
            'l1_ratio': np.linspace(0, 1.0, 11)
        }
        # ENet = ElasticNet(alpha=0.05, l1_ratio=.9, random_state=3)
        # drawTrain(ENet, x, y, 'Elastic NET', i)
        ENet = ElasticNet(alpha=0.05, l1_ratio=.9, random_state=3)
        clfENet = GridSearchCV(ENet,
                               parameters,
                               n_jobs=-1,
                               scoring='neg_mean_squared_error')
        clfENet.fit(X_train, y_train)
        print('The parameters of the best model are: ')
        print(clfENet.best_params_)
        y_pred = clfENet.best_estimator_.predict(X_test)
        yy = clfENet.best_estimator_.predict(x)
        MSE[3].append(mean_squared_error(y_test, y_pred))
        R_square[3].append(drawTrain(y, yy, 'Elastic Net', i))
        if 'ENET' in Selected_learnerCode:
            print('Elastic Net Mean squared error is ' +
                  str(mean_squared_error(y_test, y_pred)) + "\n")
            Learners.append(clfENet.best_estimator_)

        Learners_map['ENET'] = ENet

        parameters = {'n_estimators': [100, 500, 1000, 2000, 3000, 5000]}
        GBoost = GradientBoostingRegressor(n_estimators=3000,
                                           learning_rate=0.05,
                                           max_depth=4,
                                           max_features='sqrt',
                                           min_samples_leaf=15,
                                           min_samples_split=10,
                                           loss='huber',
                                           random_state=5)
        clfGBoost = GridSearchCV(GBoost,
                                 parameters,
                                 n_jobs=-1,
                                 scoring='neg_mean_squared_error')
        clfGBoost.fit(X_train, y_train)
        print('The parameters of the best model are: ')
        print(clfGBoost.best_params_)
        y_pred = clfGBoost.best_estimator_.predict(X_test)
        yy = clfGBoost.best_estimator_.predict(x)
        MSE[4].append(mean_squared_error(y_test, y_pred))
        # GBoost_MSE.append(mean_squared_error(y_test, y_pred))
        R_square[4].append(drawTrain(y, yy, 'Gradient Boosting', i))
        if 'GBOOST' in Selected_learnerCode:
            print('GBoost squared error is ' +
                  str(mean_squared_error(y_test, y_pred)) + "\n")
            Learners.append(clfGBoost.best_estimator_)

        Learners_map['GBOOST'] = GBoost

        # Adaboost
        # Adaboost = AdaBoostRegressor(base_estimator=SVR(C=1.0, epsilon=0.2))
        Adaboost = AdaBoostRegressor()
        Adaboost.fit(X_train, y_train)
        y_pred = Adaboost.predict(X_test)
        yy = Adaboost.predict(x)
        R_square[5].append(drawTrain(y, yy, 'Adaboost', i))
        print('Adaboost squared error is ' +
              str(mean_squared_error(y_test, y_pred)) + "\n")
        Ada_MSE.append(mean_squared_error(y_test, y_pred))

        # BAGGING
        baggingModel = baggingAveragingModels(
            models=(clfsvr.best_estimator_, clfrfr.best_estimator_,
                    clfENet.best_estimator_, clfGBoost.best_estimator_,
                    clflasso.best_estimator_))
        baggingModel.fit(X_train, y_train)
        y_pred = baggingModel.predict(X_test)
        MSE[5].append(mean_squared_error(y_test, y_pred))
        yy = baggingModel.predict(x)
        R_square[6].append(drawTrain(y, yy, 'Bagging', i))
        print('Bagging before selected squared error is ' +
              str(mean_squared_error(y_test, y_pred)) + "\n")

        baggingModel = baggingAveragingModels(models=tuple(Learners))
        # drawTrain(baggingModel, X_train, y_train,X_test,y_test, 'Bagging', i)
        # baggingModel = baggingAveragingModels(models=tuple(Learners))

        baggingModel.fit(X_train, y_train)
        y_pred = baggingModel.predict(X_test)
        MSE[6].append(mean_squared_error(y_test, y_pred))
        yy = baggingModel.predict(x)
        R_square[7].append(drawTrain(y, yy, 'Bagging', i))

        print('Bagging after selected squared error is ' +
              str(mean_squared_error(y_test, y_pred)) + "\n")
        stacking_R_square = [[], [], [], [], [], []]
        All_learner = ['SVR', 'RFR', 'LASSO', 'ENET', 'GBOOST']
        for k in range(0, len(Selected_learnerCode)):
            """learnerList = []
            for kk in range(0,len(Selected_learnerCode)):
                if Selected_learnerCode[kk]!='' :
                    learnerList.append(Learners_map[Selected_learnerCode[kk]])"""
            """stacked_averaged_models = StackingAveragedModels(base_models=tuple(learnerList),
                                                             meta_model=Learners_map[All_learner[k]])
            drawTrain(stacked_averaged_models, X_train, y_train,X_test,y_test, 'stacking with '+All_learner[k], i)"""
            # stacked_averaged_models = StackingAveragedModels(base_models=tuple(learnerList),
            #                                                 meta_model=Learners_map[All_learner[k]])
            params = {}
            """
            if 'SVR' in Selected_learnerCode:
                params['svr__C'] = np.logspace(-3, 3, 7)
                params['svr__gamma'] = np.logspace(-3, 3, 7)

            if 'RFR' in Selected_learnerCode:
                params['randomforestregressor__n_estimators'] =[10, 50, 100, 500, 1000]

            if 'LASSO' in Selected_learnerCode:
                params['lasso__alpha'] = np.logspace(-2, 2, 5)

            if 'ENET' in Selected_learnerCode:
                params['elasticnet__alpha'] = np.logspace(-2, 2, 5)

            if 'GBOOST' in Selected_learnerCode:
                params['gradientboostingregressor__n_estimators']= [100, 500, 1000, 2000, 3000, 5000]"""

            if k == 0:
                params['meta-svr__C'] = np.logspace(-3, 3, 7)
                params['meta-svr__gamma'] = np.logspace(-3, 3, 7)
            if k == 1:
                params['meta-randomforestregressor__n_estimators'] = [
                    10, 50, 100, 500, 1000
                ]
            if k == 2:
                params['meta-lasso__alpha'] = np.logspace(-2, 2, 5)
            if k == 3:
                params['meta-elasticnet__alpha'] = np.logspace(-2, 2, 5)
            if k == 4:
                params['meta-gradientboostingregressor__n_estimators'] = [
                    100, 500, 1000, 2000, 3000, 5000
                ]
            """
            params = {'svr__C': np.logspace(-3, 3, 7),
                      'svr__gamma': np.logspace(-3, 3, 7),
                      'randomforestregressor__n_estimators': [10, 50, 100, 500, 1000],
                      'lasso__alpha': np.logspace(-2, 2, 5),
                      'elasticnet__alpha':np.logspace(-2, 2, 5),
                      'gradientboostingregressor__n_estimators': [100, 500, 1000, 2000, 3000, 5000],
                      }"""
            stacked_averaged_models = StackingRegressor(
                regressors=Learners,
                meta_regressor=Learners_map[All_learner[k]])
            grid = GridSearchCV(estimator=stacked_averaged_models,
                                param_grid=params)
            grid.fit(X_train, y_train)
            y_pred = grid.best_estimator_.predict(X_test)
            yy = grid.best_estimator_.predict(x)
            stacking_R_square[k].append(
                drawTrain(y, yy, 'stacking with ' + All_learner[k], i))
            print('Stacking with metamodel is ' + All_learner[k] +
                  ' squared error is ' +
                  str(mean_squared_error(y_test, y_pred)) + "\n")
            # file.write('Stacking with metamodel is lasso squared error is ' + str(mean_squared_error(y_test, y_pred)) + "\n")
            stacking_MSE[k].append(mean_squared_error(y_test, y_pred))

        # stacked_averaged_models = StackingAveragedModels(base_models=tuple(learnerList),
        #                                                 meta_model=baggingModel)
        # drawTrain(stacked_averaged_models, X_train, y_train, X_test, y_test, 'stacking with Bagging models'  , i)
        """stacked_averaged_models = StackingAveragedModels(base_models=tuple(learnerList),
                                                         meta_model=Learners_map[All_learner[k]])"""
        stacked_averaged_models = StackingRegressor(
            regressors=Learners, meta_regressor=baggingModel)
        # grid = GridSearchCV(estimator=stacked_averaged_models, param_grid=params)
        stacked_averaged_models.fit(X_train, y_train)
        y_pred = stacked_averaged_models.predict(X_test)
        yy = stacked_averaged_models.predict(x)
        stacking_R_square[5].append(
            drawTrain(y, yy, 'stacking with bagging', i))
        print('Stacking with metamodel is bagging models squared error is ' +
              str(mean_squared_error(y_test, y_pred)) + "\n")
        # file.write('Stacking with metamodel is lasso squared error is ' + str(mean_squared_error(y_test, y_pred)) + "\n")
        stacking_MSE[5].append(mean_squared_error(y_test, y_pred))

        gc.collect()

    print("Adaboost mean is " + str(np.mean(Ada_MSE)))

    min_stacking_MSE = []
    for i in range(0, times):
        minMSE = stacking_MSE[0][i]
        for j in range(1, 6):
            if stacking_MSE[j][i] < minMSE:
                minMSE = stacking_MSE[j][i]
        min_stacking_MSE.append(minMSE)

    plot_x = np.linspace(1, times, times)
    if len(MSE[0]) > 0:
        plt.plot(plot_x, MSE[0], 'b')
    if len(MSE[1]) > 0:
        plt.plot(plot_x, MSE[1], 'r')
    if len(MSE[2]) > 0:
        plt.plot(plot_x, MSE[2], 'y')
    if len(MSE[3]) > 0:
        plt.plot(plot_x, MSE[3], 'k')
    if len(MSE[4]) > 0:
        plt.plot(plot_x, MSE[4], 'g')
    if len(MSE[5]) > 0:
        plt.plot(plot_x, MSE[5], 'm')
    if len(MSE[6]) > 0:
        plt.plot(plot_x, MSE[6], color='coral', linestyle=':', marker='|')
    plt.plot(plot_x, min_stacking_MSE, color='cyan')
    plt.xlabel('Repeat times')
    plt.ylabel('MSE')
    plt.legend(
        ('SVR avg = ' + str(np.mean(MSE[0])), 'RFR avg = ' +
         str(np.mean(MSE[1])), 'Lasso avg=' + str(np.mean(MSE[2])),
         'Enet avg=' + str(np.mean(MSE[3])), 'Gboost avg = ' +
         str(np.mean(MSE[4])), 'Bagging before avg = ' + str(np.mean(MSE[5])),
         'Bagging after avg = ' + str(np.mean(MSE[6])),
         'St-LIBS avg = ' + str(np.mean(min_stacking_MSE))),
        loc='upper right')
    plt.title('Different learning machine')
    plt.savefig('DifferentLearner.png')
    plt.clf()
    plt.plot()

    plot_x = np.linspace(1, times, times)
    plt.plot(plot_x, Ada_MSE, 'b')
    plt.plot(plot_x, MSE[6], 'r')
    plt.plot(plot_x, min_stacking_MSE, 'g')
    plt.legend(('Adaboost avg = ' + str(np.mean(Ada_MSE)),
                'Bagging avg = ' + str(np.mean(MSE[6])),
                'St-LIBS avg = ' + str(np.mean(min_stacking_MSE))),
               loc='upper right')
    plt.title('Bagging VS St-LIBS VS Adaboost')
    plt.xlabel('Repeat times')
    plt.ylabel('MSE')
    plt.savefig('Bagging VS St-LIBS&Adaboost.png')
    plt.clf()
    plt.plot()

    plot_x = np.linspace(1, times, times)
    if len(stacking_MSE[0]) > 0:
        plt.plot(plot_x, stacking_MSE[0], 'b')
    if len(stacking_MSE[1]) > 0:
        plt.plot(plot_x, stacking_MSE[1], 'r')
    if len(stacking_MSE[2]) > 0:
        plt.plot(plot_x, stacking_MSE[2], 'y')
    if len(stacking_MSE[3]) > 0:
        plt.plot(plot_x, stacking_MSE[3], 'k')
    if len(stacking_MSE[4]) > 0:
        plt.plot(plot_x, stacking_MSE[4], 'g')
    if len(stacking_MSE[5]) > 0:
        plt.plot(plot_x, stacking_MSE[5], 'm')
    plt.legend(('SVR avg = ' + str(np.mean(stacking_MSE[0])),
                'RFR avg = ' + str(np.mean(stacking_MSE[1])),
                'Lasso avg=' + str(np.mean(stacking_MSE[2])),
                'Enet avg=' + str(np.mean(stacking_MSE[3])),
                'Gboost avg = ' + str(np.mean(stacking_MSE[4])),
                'Bagging avg = ' + str(np.mean(stacking_MSE[5]))),
               loc='upper right')
    plt.title('Different meta-learning machine (Adaboost avg MSE=' +
              str(np.mean(Ada_MSE)) + ')')
    plt.xlabel('Repeat times')
    plt.ylabel('MSE')
    plt.savefig('DifferentMetaLearner.png')
    plt.clf()
    plt.plot()
    index = ['SVR', 'RFR', 'LASSO', 'ENET', 'Gboost', 'BAGGING1', 'BAGGING2']
    mse_file = pd.DataFrame(index=index, data=MSE)
    mse_file.to_csv('MSE.csv', encoding='utf-8')

    index = ['SVR', 'RFR', 'LASSO', 'ENET', 'Gboost', 'BAGGING']
    mse_file = pd.DataFrame(index=index, data=stacking_MSE)
    mse_file.to_csv('stacking_MSE.csv', encoding='utf-8')

    mse_file = pd.DataFrame(data=min_stacking_MSE)
    mse_file.to_csv('min_stacking_MSE.csv', encoding='utf-8')

    index = [
        'SVR', 'RFR', 'LASSO', 'ENET', 'Gboost', 'Adaboost', 'BAGGING1',
        'BAGGING2'
    ]
    r_file = pd.DataFrame(index=index, data=R_square)
    r_file.to_csv('R_square.csv', encoding='utf-8')

    index = ['SVR', 'RFR', 'LASSO', 'ENET', 'Gboost', 'BAGGING']
    mse_file = pd.DataFrame(index=index, data=stacking_R_square)
    mse_file.to_csv('stacking_R_square.csv', encoding='utf-8')
Example no. 37
y[::5] += 3 * (0.5 - np.random.rand(8))

# Initializing models

lr = LinearRegression()
svr_lin = SVR(kernel='linear')
ridge = Ridge(random_state=1)
svr_rbf = SVR(kernel='rbf')

stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                           meta_regressor=svr_rbf)

# Training the stacking regressor

stregr.fit(X, y)
stregr.predict(X)

# Evaluate and visualize the fit

print("Mean Squared Error: %.4f"
      % np.mean((stregr.predict(X) - y) ** 2))
print('Variance Score: %.4f' % stregr.score(X, y))

with plt.style.context(('seaborn-whitegrid')):
    plt.scatter(X, y, c='lightgray')
    plt.plot(X, stregr.predict(X), c='darkgreen', lw=2)

plt.show()

print(stregr)
Example no. 38
model_rf = RandomForestRegressor(n_estimators=200,
                                 max_features=0.26326530612244903,
                                 criterion='mse')
model_extra_tree = ExtraTreesRegressor(n_estimators=200, criterion='mse')
model_gb = GradientBoostingRegressor(n_estimators=100,
                                     max_depth=5,
                                     random_state=43)
model_lr = LinearRegression()
svr_rbf = SVR(kernel='rbf')
svr_lin = SVR(kernel='linear')
ridge = Ridge()
model_xgb2 = XGBRegressor(max_depth=10, n_estimators=100)
model_vote = VotingClassifier(
    estimators=[('xgb', model_xgb), ('rf', model_rf), ('gb', model_gb)])
sclf = StackingRegressor(regressors=[model_extra_tree, model_xgb2, model_rf],
                         meta_regressor=model_lr)

time_split = TimeSeriesSplit(n_splits=5)
print(cross_val_score(sclf,
                      X=train.as_matrix(),
                      y=target.as_matrix(),
                      scoring=SMAPE,
                      cv=time_split).mean())

sclf.fit(X=train, y=target)
preds = sclf.predict(test)
sample_submission['y'] = preds
print(sample_submission[sample_submission['y'] < 0])
sample_submission['y'] = sample_submission['y'].map(lambda x: x if x > 0 else 0.0)
sample_submission.to_csv("my_submission_24_2.tsv", sep=',', index=False)