Example #1
            print('In-sample R2 score for XGB:')
            print(r2_score(y_train, model.predict(dtrain)))

            print("OOS R2 score for XGB:")
            r2 = r2_score(dvalid.get_label(), model.predict(dvalid))
            print(r2)
            '''Train the stacked models then predict the test data'''

            stacked_pipeline = make_pipeline(
                StackingEstimator(estimator=LassoLarsCV(normalize=True)),
                StackingEstimator(
                    estimator=GradientBoostingRegressor(learning_rate=0.001,
                                                        loss="huber",
                                                        max_depth=3,
                                                        max_features=0.55,
                                                        min_samples_leaf=18,
                                                        min_samples_split=14,
                                                        subsample=0.7)),
                #StackingEstimator(estimator=BayesianRidge()),
                #StackingEstimator(estimator=ElasticNetCV()),
                #StackingEstimator(estimator=HuberRegressor()),
                #StackingEstimator(estimator=LassoCV(eps=0.001, n_alphas=100, alphas=None, fit_intercept=True, normalize=True, precompute='auto', max_iter=1000, tol=0.0001, copy_X=True, cv=None, verbose=False, n_jobs=1, positive=False, random_state=None, selection='cyclic')),
                #StackingEstimator(estimator=LassoLarsIC()),
                #StackingEstimator(estimator=LinearRegression()),
                #StackingEstimator(estimator=OrthogonalMatchingPursuitCV()),
                #StackingEstimator(estimator=RANSACRegressor()),
                #StackingEstimator(estimator=RidgeCV()),
                #LassoLarsCV()      # .6
                #LinearRegression() # .6
                #ElasticNetCV()     # worse
# Calculate R squared on the test set
r_squared = lcv.score(X_test, y_test)
print('The model explains {0:.1%} of the test set variance'.format(r_squared))

# Create a mask for coefficients not equal to zero
lcv_mask = lcv.coef_ != 0
print('{} features out of {} selected'.format(sum(lcv_mask), len(lcv_mask)))

--------------------------------------------------
# Exercise_11 
#1
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingRegressor

# Select 10 features with RFE on a GradientBoostingRegressor, drop 3 features on each step
rfe_gb = RFE(estimator=GradientBoostingRegressor(), 
             n_features_to_select=10, step=3, verbose=1)
rfe_gb.fit(X_train, y_train)
#2
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingRegressor

# Select 10 features with RFE on a GradientBoostingRegressor, drop 3 features on each step
rfe_gb = RFE(estimator=GradientBoostingRegressor(), 
             n_features_to_select=10, step=3, verbose=1)
rfe_gb.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_gb.score(X_test, y_test)
print('The model can explain {0:.1%} of the variance in the test set'.format(r_squared))
#3
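# A plausible continuation (not from the original, which is cut off here): the mask of
# features kept by RFE could be read from the fitted selector, e.g.
# rfe_gb_mask = rfe_gb.support_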
Example #3
    def feature_selection(self, X, y, method):
        """
        purpose:    select feature
        input:  X:train data
                y:lable
                method: uesed method
        return:
        """
        X_indices = np.arange(X.shape[-1])

        score = []

        # Removing features with low variance
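        # A minimal sketch (not in the original) of this step, assuming sklearn's
        # VarianceThreshold; kept commented out like the examples below:
        # from sklearn.feature_selection import VarianceThreshold
        # X_new = VarianceThreshold(threshold=0.0).fit_transform(X)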

        # correlation coefficient
        # SelectKBest(lambda X, Y: np.array(list(map(lambda x: pearsonr(x, Y), X.T))).T, k=2).fit_transform(data, target)

        # mutual information
        # SelectKBest(lambda X, Y: np.array(list(map(lambda x: mic(x, Y), X.T))).T, k=2).fit_transform(data, target)

        # Univariate feature selection (for classification)
        if method == 'chi-squared':
            skb = SelectKBest(chi2)
            skb.fit_transform(X, y)
            score = skb.scores_

        # Univariate feature selection (for regression)
        if method == 'f_regression':
            skb = SelectKBest(f_regression)
            skb.fit_transform(X, y)
            score = skb.scores_

        # L1-based feature selection (for classification)
        if method == 'LinearSVC':
            lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
            sfm = SelectFromModel(lsvc, prefit=True)
            X_new = sfm.transform(X)

        # L1-based feature selection (for regression)
        elif method == 'LassoCV':
            lasso = LassoCV().fit(X, y)
            score = lasso.coef_
            sfm = SelectFromModel(lasso, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Tree-based feature selection (for classification)
        elif method == 'ExtraTreesClassifier':
            clf = ExtraTreesClassifier()
            clf = clf.fit(X, y)
            print(clf.feature_importances_)
            sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Tree-based feature selection (for regression)
        elif method == 'ExtraTreesRegressor':
            clf = ExtraTreesRegressor()
            clf = clf.fit(X, y)
            score = clf.feature_importances_
            sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Tree-based feature selection (for classifier)
        elif method == 'GradientBoostingClassifier':
            clf = GradientBoostingClassifier(learning_rate=0.01)
            clf = clf.fit(X, y)
            score = clf.feature_importances_
            sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Tree-based feature selection (for regression)
        elif method == 'GradientBoostingRegressor':
            clf = GradientBoostingRegressor(learning_rate=0.01)
            clf = clf.fit(X, y)
            score = clf.feature_importances_
            sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Print the feature ranking
        indices = np.argsort(score)[::-1]
        print("Feature ranking:")
        for f in X_indices:
            print("feature %d: %s  (%f)" %
                  (indices[f], self.columns[indices[f]], score[indices[f]]))

        #draw plot
        plt.figure()
        # plt.bar(indices, score, width=0.2, color='r')
        plt.barh(indices, score, height=0.2, color='r')
        plt.title(method)
        plt.xlabel("score")
        plt.ylabel("feature")
        plt.grid(axis='x')
        plt.show()

        pass
Example #4
def GenerateGradientBoostModel(X_train, Y_train):
    gradient_boost_reg = GradientBoostingRegressor(n_estimators=300,
                                                   learning_rate=0.05,
                                                   random_state=0)
    grad_boost_model = gradient_boost_reg.fit(X_train, Y_train)
    return grad_boost_model
Example #5
df_regOutput = pd.DataFrame()
df_booleanOutput = pd.DataFrame()

df = pd.read_csv('data/2007_weather_preprocess.csv')
#df, attributes = preprocess.preprocess(df)
attributes = list(df.columns.values)[1:]
attributes.remove('DateTime')
attributes.remove('PredDelay')

# Initialise Regressors
regressors = {
    'gbr_reg':
    GradientBoostingRegressor(n_estimators=100,
                              learning_rate=0.1,
                              max_depth=1,
                              random_state=0,
                              loss='ls'),
    'ada_reg':
    AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                      n_estimators=300,
                      random_state=np.random.RandomState(1))
}

# Initialise Classifiers
classifiers = {
    'svm_clf': svm.SVC(),
    'bernolli_rbm_clf': BernoulliRBM(n_components=2),
    'decision_tree_clf': tree.DecisionTreeClassifier()
}
Example #6
def model_training_regressor(X, Y, test_ratio, verbose_mode, name):
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=test_ratio,
                                                        shuffle=False)

    if name == "MLP":
        model = MLPRegressor(hidden_layer_sizes=(200, 50),
                             activation='relu',
                             solver='adam',
                             alpha=0.0002,
                             batch_size='auto',
                             learning_rate='adaptive',
                             learning_rate_init=0.01,
                             power_t=0.5,
                             max_iter=10000,
                             shuffle=True,
                             random_state=None,
                             tol=0.0001,
                             verbose=verbose_mode,
                             warm_start=False,
                             momentum=0.9,
                             nesterovs_momentum=True,
                             early_stopping=False,
                             validation_fraction=0.1,
                             beta_1=0.9,
                             beta_2=0.999,
                             epsilon=1e-08,
                             n_iter_no_change=10).fit(X_train, y_train)

        return get_model_performance(model, X_test, y_test)

    elif name == "NaiveBayes":
        model = linear_model.BayesianRidge().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)

    elif name == "SVM":
        model = svm.LinearSVR(random_state=0, tol=1e-5).fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)

    elif name == "DT":
        model = tree.DecisionTreeRegressor().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)

    elif name == "KNN":
        model = neighbors.KNeighborsRegressor().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)

    elif name == "RandomForest":
        model = RandomForestRegressor().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)

    elif name == "Adaboost":
        model = AdaBoostRegressor().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)

    elif name == "GradientBoost":
        model = GradientBoostingRegressor().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)

    else:
        ret = dict()
        print("no available model")
        return ret
Example #7
    def train(self):
        if self.config['method'] == 'regression':
            print('Building regression model')
            print('Fetching data')
            self.get_df_reg()
            print('Data Fetched')
            print('Splitting data')
            df_x = self.df_reg.iloc[:, 3:]
            df_y = self.df_reg.iloc[:, 1]
            x_train, x_test, y_train, y_test = train_test_split(df_x,
                                                                df_y,
                                                                test_size=0.2,
                                                                random_state=1)
            print('Data split')
            print('Size of x_train', x_train.shape)
            print('Size of y_train', y_train.shape)
            print('Size of x_test', x_test.shape)
            print('Size of y_test', y_test.shape)

            if self.config['model'] == 'svr':
                print('Support vector regressor')
                model = SVR(kernel=self.config['svr_kernel'])
            if self.config['model'] == 'knr':
                print('K-nearest neighbors regressor')
                model = KNeighborsRegressor(n_jobs=12)
            if self.config['model'] == 'dtr':
                print('Decision tree regressor')
                model = DecisionTreeRegressor()
            if self.config['model'] == 'rf':
                print('Random forest regressor')
                model = RandomForestRegressor(n_jobs=12)
            if self.config['model'] == 'et':
                print('Extra trees regressor')
                model = ExtraTreesRegressor(n_jobs=12)
            if self.config['model'] == 'gbr':
                print('Gradient boosting regressor')
                model = GradientBoostingRegressor()

            try:
                model
            except BaseException:
                print('Invalid model configuration. Check config.ini')
                return

            model.fit(x_train, y_train)
            pred = pd.Series(model.predict(df_x))
            self.df_reg.insert(2, 'Predicted_current', pred)
            print('R^2 score', model.score(x_test, y_test))

            print('Converting to binary classification')
            y_test_list, y_pred_list, _, _ = self.to_bin_cl(
                x_test, y_test, model)
            _, _, bin_y, bin_y_pred = self.to_bin_cl(df_x, df_y, model)
            conf_mat = confusion_matrix(y_true=y_test_list, y_pred=y_pred_list)
            print('Converted to binary classification')

            self.df_reg.insert(3, 'Actual_class', bin_y)
            self.df_reg.insert(4, 'Predicted_class', bin_y_pred)

            print('Confusion matrix:\n', conf_mat)
            p = conf_mat[0][0] / (conf_mat[0][0] + conf_mat[1][0])
            r = conf_mat[0][0] / (conf_mat[0][0] + conf_mat[0][1])
            print(
                'Accuracy is',
                np.sum(np.array(y_test_list) == y_pred_list) /
                len(y_pred_list))
            print('Precision is', p)
            print('Recall is', r)
            print('F1-score is', self.get_f_score(p, r, 1))
            print('F0.5-score is', self.get_f_score(p, r, 0.5))
            print('F2-score is', self.get_f_score(p, r, 2))

            # joblib.dump(model,'models/'+self.config['model']+'.model')
            self.save_result()
Example #8
y = 3 * X[:, 0]**2 + 0.05 * np.random.randn(100)

from sklearn.tree import DecisionTreeRegressor
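
# Manual gradient boosting: each tree below is fit on the residual errors of the
# previous one, so the ensemble prediction is the sum of the three trees' predictions.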

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)
y2 = y - tree_reg1.predict(X)

tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)
y3 = y2 - tree_reg2.predict(X)

tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

X_new = np.array([[0.8]])
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

from sklearn.ensemble import GradientBoostingRegressor

grbt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1)
grbt.fit(X, y)

grbt.predict(X_new)


grbt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
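
# A plausible continuation (not in the original, which is cut off here): with more
# estimators, staged_predict can pick the best tree count on a validation split;
# X_val and y_val are hypothetical names.
# from sklearn.metrics import mean_squared_error
# errors = [mean_squared_error(y_val, y_pred) for y_pred in grbt.staged_predict(X_val)]
# best_n_estimators = int(np.argmin(errors)) + 1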



Example #9
    # assign to training data and test data
    for yyyy, month, tmax, tmin in data:

        if tmax.startswith("Missing"):
            max_test_X.append([int(yyyy), month_dict[month], float(tmin)])
        elif tmin.startswith("Missing"):
            min_test_X.append([int(yyyy), month_dict[month], float(tmax)])
        else:
            max_train_X.append([int(yyyy), month_dict[month], float(tmin)])
            max_train_y.append(float(tmax))
            min_train_X.append([int(yyyy), month_dict[month], float(tmax)])
            min_train_y.append(float(tmin))

    # training
    gbr_max = GradientBoostingRegressor()
    gbr_max.fit(max_train_X, max_train_y)

    gbr_min = GradientBoostingRegressor()
    gbr_min.fit(min_train_X, min_train_y)

    # predict
    #print(max_train_X)
    #print(max_test_X)
    index_max = 0
    index_min = 0
    for yyyy, month, tmax, tmin in data:
        if tmax.startswith("Missing"):
            y = gbr_max.predict([max_test_X[index_max]])
            print("%.1f" % y)
            index_max += 1
Example #10
LinearRegressionModel = LinearRegressionModel.fit(train_X, train_y_ln)

RidgeModel = Ridge(normalize=True)
RidgeModel = RidgeModel.fit(train_X, train_y_ln)

LassoModel = Lasso().fit(train_X, train_y_ln)

# Non-linear models

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

DecisionTreeModel = DecisionTreeRegressor().fit(train_X, train_y_ln)
RandomForestModel = RandomForestRegressor().fit(train_X, train_y_ln)
GradientBoostingModel = GradientBoostingRegressor().fit(train_X, train_y_ln)

f = open('./models/LinearRegressionModel.pkl', 'xb')
pickle.dump(LinearRegressionModel, f)
f.close()

f = open('./models/RidgeModel.pkl', 'xb')
pickle.dump(RidgeModel, f)
f.close()

f = open('./models/LassoModel.pkl', 'xb')
pickle.dump(LassoModel, f)
f.close()

f = open('./models/DecisionTreeModel.pkl', 'wb')
pickle.dump(DecisionTreeModel, f)
print("RMSE for Test data = "+str(RMSE_test_RF))


# In[86]:


print(r2_score(y_train, pred_train_RF)) #train
print(r2_score(y_test, pred_test_RF)) #test


# # Gradient Boosting :

# In[87]:


fit_GB = GradientBoostingRegressor().fit(X_train, y_train)


# In[88]:


#prediction on train data
pred_train_GB = fit_GB.predict(X_train)

#prediction on test data
pred_test_GB = fit_GB.predict(X_test)


# In[89]:

### 2.RF
# Random Forest usually performs best with max_features set near the square root of the number of features (a sketch applying this follows the fit below).
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

rf = RandomForestClassifier(max_depth=2, random_state=0)
rf.fit(train_x, train_y)

y_pre = rf.predict(val_x)
y_pre[y_pre > 0.5] = 1
y_pre[y_pre < 0.5] = 0
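
# A minimal sketch (not from the original) applying the sqrt heuristic noted above:
rf_sqrt = RandomForestClassifier(max_depth=2, max_features='sqrt', random_state=0)
rf_sqrt.fit(train_x, train_y)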

### 3.GBDT
from sklearn.ensemble import GradientBoostingRegressor
gbdt = GradientBoostingRegressor(loss='ls',
                                 learning_rate=0.1,
                                 n_estimators=100,
                                 max_depth=3)
gbdt.fit(train_x, train_y)

y_pre = gbdt.predict(val_x)
y_pre[y_pre >= 0.5] = 1
y_pre[y_pre < 0.5] = 0

### 4.knn
from sklearn import neighbors

knn = neighbors.KNeighborsClassifier(n_neighbors=8, leaf_size=30, p=3)
knn.fit(x, y)

### 5.svm
# http://blog.csdn.net/u013709270/article/details/53365744 (multi-class SVM)
Example #13
X_train, X_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=0)

categorical = ['normalizeHolidayName', 'isPaidTimeOff']
numerical = ['vendorID', 'passengerCount', 'tripDistance', 'hour_of_day', 'day_of_week', 
             'day_of_month', 'month_num', 'snowDepth', 'precipTime', 'precipDepth', 'temperature']

numeric_transformations = [([f], Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])) for f in numerical]
    
categorical_transformations = [([f], OneHotEncoder(handle_unknown='ignore', sparse=False)) for f in categorical]

transformations = numeric_transformations + categorical_transformations

clf = Pipeline(steps=[('preprocessor', DataFrameMapper(transformations)),
                      ('regressor', GradientBoostingRegressor())])

clf.fit(X_train, y_train)

y_predict = clf.predict(X_test)
y_actual = y_test.values.flatten().tolist()
rmse = math.sqrt(mean_squared_error(y_actual, y_predict))
print('The RMSE score on test data for GradientBoostingRegressor: ', rmse)


# ## Global Explanation Using TabularExplainer
# 
# **Global Model Explanation** is a holistic understanding of how the model makes decisions. It provides insight into which features are most important and their relative influence on the model's predictions.
# 
# [TabularExplainer](https://docs.microsoft.com/en-us/python/api/azureml-explain-model/azureml.explain.model.tabularexplainer?view=azure-ml-py) uses one of three explainers: TreeExplainer, DeepExplainer, or KernelExplainer, and automatically selects the most appropriate one for our use case. You can learn more about the underlying model explainers at [Azure Model Interpretability](https://docs.microsoft.com/en-us/azure/machine-learning/service/machine-learning-interpretability-explainability).
# 
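# Below is a minimal sketch (not part of the original notebook) of how TabularExplainer is typically invoked; the `interpret.ext.blackbox` import path and the argument names follow the azureml-interpret documentation and are assumptions here.

# In[ ]:

from interpret.ext.blackbox import TabularExplainer

# Treat the fitted pipeline as a black-box model and compute a global explanation
explainer = TabularExplainer(clf, initialization_examples=X_train, features=list(X_train.columns))
global_explanation = explainer.explain_global(X_test)
print(global_explanation.get_feature_importance_dict())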
                                    cv=kf))
    return (rmse)


#LASSO Regression:
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
#Elastic Net Regression
ENet = make_pipeline(RobustScaler(),
                     ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
#Kernel Ridge Regression :
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
#Gradient Boosting Regression:
GBoost = GradientBoostingRegressor(n_estimators=3000,
                                   learning_rate=0.05,
                                   max_depth=4,
                                   max_features='sqrt',
                                   min_samples_leaf=15,
                                   min_samples_split=10,
                                   loss='huber',
                                   random_state=5)
#XGBoost:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603,
                             gamma=0.0468,
                             learning_rate=0.05,
                             max_depth=3,
                             min_child_weight=1.7817,
                             n_estimators=2200,
                             reg_alpha=0.4640,
                             reg_lambda=0.8571,
                             subsample=0.5213,
                             silent=1,
                             random_state=7,
def BuildGBRT(train_samples,
              dev_samples,
              test_samples,
              model_path,
              n_calls,
              cast_to_zero=True,
              optimizer='gp',
              measurement_time='day',
              measurement_unit='$m^3/s$'):
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    sMin = train_samples.min(axis=0)
    sMax = train_samples.max(axis=0)
    norm = pd.concat([sMax, sMin], axis=1)
    norm = pd.DataFrame(norm.values,
                        columns=['sMax', 'sMin'],
                        index=train_samples.columns.values)
    norm.to_csv(model_path + 'norm.csv')
    joblib.dump(norm, model_path + 'norm.pkl')

    train_samples = 2 * (train_samples - sMin) / (sMax - sMin) - 1
    dev_samples = 2 * (dev_samples - sMin) / (sMax - sMin) - 1
    test_samples = 2 * (test_samples - sMin) / (sMax - sMin) - 1
    cal_samples = pd.concat([train_samples, dev_samples], axis=0)
    cal_samples = cal_samples.sample(frac=1)
    train_y = train_samples['Y']
    train_x = train_samples.drop('Y', axis=1)
    dev_y = dev_samples['Y']
    dev_x = dev_samples.drop('Y', axis=1)
    test_y = test_samples['Y']
    test_x = test_samples.drop('Y', axis=1)
    cal_y = cal_samples['Y']
    cal_x = cal_samples.drop('Y', axis=1)

    predictor_columns = list(train_x.columns)
    joblib.dump(predictor_columns, model_path + 'predictor_columns.pkl')

    # Get the feature num
    n_features = cal_x.shape[1]
    reg = GradientBoostingRegressor(n_estimators=100, random_state=0)
    # The list hyper-parameters we want
    space = [
        Integer(1, 25, name='max_depth'),
        Real(10**-5, 10**0, 'log-uniform', name='learning_rate'),
        Integer(1, n_features, name='max_features'),
        Integer(2, 100, name='min_samples_split'),
        Integer(1, 100, name='min_samples_leaf'),
    ]

    @use_named_args(space)
    def objective(**params):
        reg.set_params(**params)
        return -np.mean(
            cross_val_score(reg,
                            cal_x,
                            cal_y,
                            cv=10,
                            n_jobs=-1,
                            scoring='neg_mean_squared_error'))

    start = time.process_time()
    if optimizer == 'gp':
        res = gp_minimize(objective,
                          space,
                          n_calls=n_calls,
                          random_state=0,
                          verbose=True,
                          n_jobs=-1)
    elif optimizer == 'fr_bt':
        res = forest_minimize(objective,
                              space,
                              n_calls=n_calls,
                              base_estimator='ET',
                              random_state=0,
                              verbose=True,
                              n_jobs=-1)
    elif optimizer == 'fr_rf':
        res = forest_minimize(objective,
                              space,
                              n_calls=n_calls,
                              base_estimator='RF',
                              random_state=0,
                              verbose=True,
                              n_jobs=-1)
    elif optimizer == 'dm':
        res = dummy_minimize(objective, space, n_calls=n_calls)
    end = time.process_time()
    time_cost = end - start

    dump(res, model_path + 'tune_history.pkl', store_objective=False)
    # returned_results = load(model_path+'tune_history.pkl')
    DIMENSION_GBRT = [
        'max depth', 'learning rate', 'max features', 'min samples split',
        'min samples leaf'
    ]
    plot_objective_(res,
                    dimensions=DIMENSION_GBRT,
                    fig_savepath=model_path + 'objective.png')
    plot_evaluations_(res,
                      dimensions=DIMENSION_GBRT,
                      fig_savepath=model_path + 'evaluation.png')
    plot_convergence_(res, fig_savepath=model_path + 'convergence.png')

    # logger.info('Best score=%.4f'%res.fun)
    # logger.info("""Best parameters:
    # - max_depth=%d
    # - learning_rate=%.6f
    # - max_features=%d
    # - min_samples_split=%d
    # - min_samples_leaf=%d""" % (res.x[0], res.x[1], res.x[2], res.x[3],
    #                             res.x[4]))
    # logger.info('Time cost:{}'.format(time_cost))

    params_dict = {
        'max_depth': res.x[0],
        'learning_rate': res.x[1],
        'max_features': res.x[2],
        'min_samples_split': res.x[3],
        'min_samples_leaf': res.x[4],
        'time_cost': time_cost,
        'n_calls': n_calls,
    }

    params_df = pd.DataFrame(params_dict, index=[0])
    params_df.to_csv(model_path + 'optimized_params.csv')

    GBR = GradientBoostingRegressor(max_depth=res.x[0],
                                    learning_rate=res.x[1],
                                    max_features=res.x[2],
                                    min_samples_split=res.x[3],
                                    min_samples_leaf=res.x[4]).fit(
                                        cal_x, cal_y)

    joblib.dump(GBR, model_path + 'model.pkl')

    GBR = joblib.load(model_path + 'model.pkl')
    train_predictions = GBR.predict(train_x)
    dev_predictions = GBR.predict(dev_x)
    test_predictions = GBR.predict(test_x)
    train_y = (train_y.values).flatten()
    dev_y = (dev_y.values).flatten()
    test_y = (test_y.values).flatten()
    sMax = sMax[sMax.shape[0] - 1]
    sMin = sMin[sMin.shape[0] - 1]
    train_y = np.multiply(train_y + 1, sMax - sMin) / 2 + sMin
    dev_y = np.multiply(dev_y + 1, sMax - sMin) / 2 + sMin
    test_y = np.multiply(test_y + 1, sMax - sMin) / 2 + sMin
    train_predictions = np.multiply(train_predictions + 1,
                                    sMax - sMin) / 2 + sMin
    dev_predictions = np.multiply(dev_predictions + 1, sMax - sMin) / 2 + sMin
    test_predictions = np.multiply(test_predictions + 1,
                                   sMax - sMin) / 2 + sMin
    if cast_to_zero:
        train_predictions[train_predictions < 0.0] = 0.0
        dev_predictions[dev_predictions < 0.0] = 0.0
        test_predictions[test_predictions < 0.0] = 0.0
    dump_pred_results(
        path=model_path + 'opt_pred.csv',
        train_y=train_y,
        train_predictions=train_predictions,
        dev_y=dev_y,
        dev_predictions=dev_predictions,
        test_y=test_y,
        test_predictions=test_predictions,
        time_cost=time_cost,
    )
    plot_rela_pred(train_y,
                   train_predictions,
                   measurement_time=measurement_time,
                   measurement_unit=measurement_unit,
                   fig_savepath=model_path + 'TRAIN-PRED.png')
    plot_rela_pred(dev_y,
                   dev_predictions,
                   measurement_time=measurement_time,
                   measurement_unit=measurement_unit,
                   fig_savepath=model_path + "DEV-PRED.png")
    plot_rela_pred(test_y,
                   test_predictions,
                   measurement_time=measurement_time,
                   measurement_unit=measurement_unit,
                   fig_savepath=model_path + "TEST-PRED.png")
    plot_error_distribution(test_y,
                            test_predictions,
                            fig_savepath=model_path + "TEST-ERROR-DSTRI.png")
    plt.show()
    plt.close('all')
    oof_test = np.zeros((ntest, ))
    oof_test_skf = np.empty((NFOLDS, ntest))
    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        # use 4 of the 5 training folds to fit the model
        x_te = x_train[test_index]  # the remaining fold is used for prediction
        clf.train(x_tr, y_tr)
        # predict the held-out fold (the other 4 folds trained the model); after 5
        # iterations every training sample has an out-of-fold prediction
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)  # predict the full test set
    oof_test[:] = oof_test_skf.mean(axis=0)  # each fold's model predicts the test set once (5 in total); take the mean
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


et = ExtraTreeRegressor()
rr = RandomForestRegressor()
NN = NearestNeighbors()
x_train = train
x_test = test
et_oof_train, et_oof_test = get_oof(et, x_train, y_train,
                                    x_test)  # Extra Trees
rr_oof_train, rr_oof_test = get_oof(rr, x_train, y_train, x_test)
nn_oof_train, nn_oof_test = get_oof(NN, x_train, y_train, x_test)

x_train = np.concatenate((et_oof_train, rr_oof_train, nn_oof_train), axis=1)
x_test = np.concatenate((et_oof_test, rr_oof_test, nn_oof_test), axis=1)

gb = GradientBoostingRegressor().fit(x_train, y_train)
predictions = gb.predict(x_test)
Example #17
def gbdt(x_train, y_train, x_test):
    model = GradientBoostingRegressor()
    model.fit(x_train, y_train)  # fit the gradient boosting model
    predicted = model.predict(x_test)
    return (predicted)
#param_grid = {
#    'loss':['ls','lad','huber'],
#    'learning_rate': [0.01, 0.02, 0.05, 0.1,0.2],
#    'n_estimators': [100, 200, 400, 800, 1000],
#    'max_depth':[3,4,5,6],
#    'alpha':[0.7,0.8,0.9]}
##fit_params = {'categorical_feature':[2,3,4,5,6]}
#
#gbm = GridSearchCV(gbdt, param_grid)
#
#gbm.fit(X_train, y_train)
#print('Best parameters found by grid search are:', gbm.best_params_)

## Model training
gbdt = GradientBoostingRegressor(loss='ls', learning_rate=0.2, n_estimators=1000, subsample=1,
                                 min_samples_split=2, min_samples_leaf=1, max_depth=3, alpha=0.7,
                                 verbose=0)

gbdt.fit(X_train, y_train)

# Plot the feature importance distribution
feature_importance = gbdt.feature_importances_
plt.figure()
plt.scatter(np.arange(1, len(feature_importance) + 1), feature_importance, c='r', zorder=10)
plt.plot(np.arange(1, len(feature_importance) + 1), feature_importance)
plt.xlabel('Feature index')
plt.ylabel('Feature importance')


## Show the fit on the training portion
plt.figure()
filename = "blogData_train.csv"
train_data = pd.read_csv(filename, header=None)
#train_data = train_data.iloc[np.random.permutation(len(train_data))]
train_output = train_data[len(train_data.columns) - 1]
del train_data[len(train_data.columns) - 1]

filename = "blogData_test-2012.02.01.00_00.csv"
test_data = pd.read_csv(filename, header=None)
#test_data = test_data.iloc[np.random.permutation(len(test_data))]
test_output = test_data[len(test_data.columns) - 1]
del test_data[len(test_data.columns) - 1]

reg = LinearRegression()
rf = RandomForestRegressor()
gradBoost = GradientBoostingRegressor()
ada = AdaBoostRegressor()

#n_estimators=500

regressors = [reg, rf, gradBoost, ada]
regressor_names = [
    "Linear Regression", "Random Forests", "Gradient Boosting", "Adaboost"
]

#regressors = ada
#regressor_names = "Adaboost"

for regressor, regressor_name in zip(regressors, regressor_names):

    regressor.fit(train_data, train_output)
Example #20
@author: sandra_chang
"""

from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import uniform
import numpy as np

wine = datasets.load_wine()
# Split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.25, random_state=4)

# Build the model
clf = GradientBoostingRegressor(random_state=7)

clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(metrics.mean_squared_error(y_test, y_pred))

# Define the hyperparameter combinations to search over
n_estimators =  np.arange(20,200,20)
max_depth = np.arange(1,7)

#param_grid = dict(C=uniform(loc=0, scale=4), penalty=['l2', 'l1'])
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

## Build the search object with the model and the parameter dictionary (n_jobs=-1 uses all CPU cores in parallel)
random_search = RandomizedSearchCV(clf, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1, cv=3)
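
# A plausible continuation (not in the original snippet, which is cut off here):
# run the search and report the best parameters found
random_search.fit(x_train, y_train)
print(random_search.best_params_)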
Example #21
    def development_models_pred_test(self, plotTest):
        '''
        plotTest: Parameter to make the distribution plot of test or not
        This function develops the models and makes the predictions
        '''
        def __get_mape(y_true, y_pred):
            """
            Compute mean absolute percentage error (MAPE)
            """
            y_true, y_pred = np.array(y_true), np.array(y_pred)
            return round(np.mean(np.abs((y_true - y_pred) / y_true)) * 100, 4)

        estimatorXGB = {
            'random_state': [22],
            'max_depth': stats.randint(1, 100),
            'max_leaves': stats.randint(1, 100),
            'learning_rate': stats.uniform(0.1, 0.8),
            'min_child_weight': stats.randint(1, 100),
            'subsample': stats.uniform(0.1, 1),
            'n_estimators': stats.randint(1, 100)
        }
        model_xgbRandomSearch = RandomizedSearchCV(
            XGBRegressor(),
            estimatorXGB,
            scoring='neg_mean_squared_error',
            n_jobs=5,
            cv=5,
            random_state=22).fit(self.X_train_scaled, self.y_train_scaled)

        estimatorLGB = {
            'random_state': [22],
            'max_depth': stats.randint(1, 50),
            'num_leaves': stats.randint(1, 25),
            'max_leaves': stats.randint(1, 25),
            'learning_rate': stats.uniform(0.1, 1),
            'min_child_weight': stats.randint(1, 50),
            'subsample': stats.uniform(0.1, 1),
            'n_estimators': stats.randint(1, 100)
        }
        model_lgbRandomSearch = RandomizedSearchCV(LGBMRegressor(),
                                                   estimatorLGB,
                                                   scoring='r2',
                                                   n_jobs=5,
                                                   cv=5,
                                                   random_state=22).fit(
                                                       self.X_train_scaled,
                                                       self.y_train_scaled)

        estimatorLR = {'n_jobs': stats.randint(1, 5)}
        model_lrRandomSearch = RandomizedSearchCV(LinearRegression(),
                                                  estimatorLR,
                                                  scoring='r2',
                                                  n_jobs=5,
                                                  cv=5,
                                                  random_state=22).fit(
                                                      self.X_train_scaled,
                                                      self.y_train_scaled)

        estimatorGradBoost = {
            'random_state': [22],
            'n_estimators': stats.randint(1, 100),
            'max_depth': stats.randint(1, 50),
            'learning_rate': stats.uniform(0.1, 1),
            'min_weight_fraction_leaf': stats.uniform(0.1, 1),
            'min_samples_split': stats.randint(2, 100)
        }
        model_GradBoostRandomSearch = RandomizedSearchCV(
            GradientBoostingRegressor(),
            estimatorGradBoost,
            scoring='neg_mean_squared_error',
            n_jobs=5,
            cv=5,
            random_state=22).fit(self.X_train_scaled, self.y_train_scaled)

        estimatorRandForest = {
            'random_state': [22],
            'n_estimators': stats.randint(1, 100),
            'max_depth': stats.randint(1, 50),
            'min_samples_split': stats.randint(2, 100),
            'min_samples_leaf': stats.randint(1, 100),
            'max_leaf_nodes': stats.randint(2, 100)
        }
        model_RandForestRandomSearch = RandomizedSearchCV(
            RandomForestRegressor(),
            estimatorRandForest,
            scoring='neg_mean_squared_error',
            n_jobs=5,
            cv=5,
            random_state=22).fit(self.X_train_scaled, self.y_train_scaled)

        pred_train_xgb_scaled = model_xgbRandomSearch.predict(
            self.X_train_scaled)
        pred_train_xgb = pred_train_xgb_scaled * math.sqrt(
            self.scaler.var_[0]) + self.scaler.mean_[0]
        pred_train_lgb_scaled = model_lgbRandomSearch.predict(
            self.X_train_scaled)
        pred_train_lgb = pred_train_lgb_scaled * math.sqrt(
            self.scaler.var_[0]) + self.scaler.mean_[0]
        pred_train_lr_scaled = model_lrRandomSearch.predict(
            self.X_train_scaled)
        pred_train_lr = pred_train_lr_scaled * math.sqrt(
            self.scaler.var_[0]) + self.scaler.mean_[0]
        pred_train_GradBoost_scaled = model_GradBoostRandomSearch.predict(
            self.X_train_scaled)
        pred_train_GradBoost = pred_train_GradBoost_scaled * math.sqrt(
            self.scaler.var_[0]) + self.scaler.mean_[0]
        pred_train_RandForest_scaled = model_RandForestRandomSearch.predict(
            self.X_train_scaled)
        pred_train_RandForest = pred_train_RandForest_scaled * math.sqrt(
            self.scaler.var_[0]) + self.scaler.mean_[0]

        models = [
            model_xgbRandomSearch, model_lgbRandomSearch, model_lrRandomSearch,
            model_GradBoostRandomSearch, model_RandForestRandomSearch
        ]
        namesModels = ['xgb', 'lgb', 'lr', 'GradBoost', 'RandForest']

        def __pred_test(test, models, namesModels):

            for i, model in enumerate(models):
                var = 'pred_' + namesModels[i]
                pred = model.predict(self.X_test_scaled)
                test[var + '_scaled'] = pred
                test[var] = test[var + '_scaled'] * test[
                    self.varPredict + '_std'] + test[self.varPredict + '_mean']
                test.drop([var + '_scaled'], axis=1, inplace=True)

            return test

        test_copy = self.test.copy()
        test_copy = __pred_test(test_copy, models, namesModels)
        varsPred = [elem for elem in test_copy.columns if 'pred' in elem]
        test_copy['pred_ensamble'] = test_copy[varsPred].mean(axis=1)

        dfMetricsTrainTest = pd.DataFrame({
            'model':
            namesModels,
            'RMSE': [
                round(
                    math.sqrt(mean_squared_error(self.y_train,
                                                 pred_train_xgb)), 3),
                round(
                    math.sqrt(mean_squared_error(self.y_train,
                                                 pred_train_lgb)), 3),
                round(
                    math.sqrt(mean_squared_error(self.y_train, pred_train_lr)),
                    3),
                round(
                    math.sqrt(
                        mean_squared_error(self.y_train,
                                           pred_train_GradBoost)), 3),
                round(
                    math.sqrt(
                        mean_squared_error(self.y_train,
                                           pred_train_RandForest)), 3)
            ],
            'MAPE (%)': [
                __get_mape(self.y_train, pred_train_xgb),
                __get_mape(self.y_train, pred_train_lgb),
                __get_mape(self.y_train, pred_train_lr),
                __get_mape(self.y_train, pred_train_GradBoost),
                __get_mape(self.y_train, pred_train_RandForest)
            ],
            'RMSE_pred_test': [
                round(
                    math.sqrt(
                        mean_squared_error(test_copy[[self.varPredict]],
                                           test_copy[['pred_xgb']])), 3),
                round(
                    math.sqrt(
                        mean_squared_error(test_copy[[self.varPredict]],
                                           test_copy[['pred_lgb']])), 3),
                round(
                    math.sqrt(
                        mean_squared_error(test_copy[[self.varPredict]],
                                           test_copy[['pred_lr']])), 3),
                round(
                    math.sqrt(
                        mean_squared_error(test_copy[[self.varPredict]],
                                           test_copy[['pred_GradBoost']])), 3),
                round(
                    math.sqrt(
                        mean_squared_error(test_copy[[self.varPredict]],
                                           test_copy[['pred_RandForest']])), 3)
            ],
            'MAPE_pred_test (%)': [
                __get_mape(test_copy[[self.varPredict]],
                           test_copy[['pred_xgb']]),
                __get_mape(test_copy[[self.varPredict]],
                           test_copy[['pred_lgb']]),
                __get_mape(test_copy[[self.varPredict]],
                           test_copy[['pred_lr']]),
                __get_mape(test_copy[[self.varPredict]],
                           test_copy[['pred_GradBoost']]),
                __get_mape(test_copy[[self.varPredict]],
                           test_copy[['pred_RandForest']])
            ]
        })

        if plotTest:
            rcParams['figure.figsize'] = 10, 8  # width 10, height 8
            ax = test_copy.plot(x='Date',
                                y=[self.varPredict] + varsPred +
                                ['pred_ensamble'],
                                style=['g-', 'y-', 'b-'],
                                grid=True)
            ax.legend(['test'] + varsPred + ['pred_ensamble'])
            ax.set_xlabel("Date")
            ax.set_ylabel("USD")
            ax.set_title("Zoom in to test set")

        fileSave = self.path + '/' + self.idYahoo + '/output/metricas/metrics_train_' + self.varPredict + '_ml.csv'
        dfMetricsTrainTest.to_csv(fileSave, sep=';', index=False)
        self.testPred, self.dfMetricsTrainTest, self.models = test_copy, dfMetricsTrainTest, models
        trainingModel_ml.__saveModels(self, models, self.scaler, self.path,
                                      self.idYahoo)
Example #22
        model = GridSearchCV(SVR(kernel='rbf'), cv=5, param_grid={"C": c_param, "gamma": gamma_param})
        model_name = "SVR"
    elif selected_model == Model.KRR:
        model = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5, param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3], "gamma": numpy.logspace(-2, 2, 5)})
        model_name = "KRR"
    elif selected_model == Model.REGRESSION_TREE:
        model = DecisionTreeRegressor(criterion="mse")
        model_name = "REGRESSION_TREE"
    elif selected_model == Model.RANDOM_FOREST:
        model = RandomForestRegressor(criterion="mse", n_estimators=20, min_samples_split=4, min_weight_fraction_leaf=0.01)
        model_name = "FOREST"
    elif selected_model == Model.EXTRA_TREE_REGRESSOR:
        model = ExtraTreesRegressor(criterion="mse")
        model_name = "EXTRA_TREE_REGRESSOR"
    elif selected_model == Model.GRADIENT_BOOSTING_REGRESSOR:
        model = GradientBoostingRegressor(loss="lad", n_estimators=200)
        model_name = "GRADIENT_BOOSTING_REGRESSOR"
    elif selected_model == Model.BAGGING_REGRESSOR:
        model = BaggingRegressor(oob_score=True)
        model_name = "BAGGING_REGRESSOR"
    elif selected_model == Model.ADABOOST_REGRESSOR:
        model = AdaBoostRegressor(loss="linear")
        model_name = "ADABOOST_REGRESSOR"
    else:
        Support.colored_print("No method selected!", "red")
        sys.exit(0)
    Support.colored_print("Method selected: " + model_name, "green")

    Support.colored_print("Training...", "green")
    t0 = time.time()
    model.fit(X[:train_size], y[:train_size])
Example #23
print(features_40_percent_sale_price_corr)
model_eval_helper(features_40_percent_sale_price_corr, LinearRegression())

# Again, we should have scaled our data before we trained our linear regression model. But since we won't use linear regression models from now on, we will skip scaling the data.

# ### GradientBoostingRegressor
#
# Now we'll use GradientBoostingRegressor as a more sophisticated model. Ensemble methods like GradientBoostingRegressor usually perform extremely well in Kaggle competitions. Further, we don't have to worry about feature scaling or about having too many features.

# In[92]:

from sklearn.ensemble import GradientBoostingRegressor

# In[93]:

reg = GradientBoostingRegressor(n_estimators=200, max_depth=2)
reg

# In[94]:

model_eval_helper(final_features, model=reg)

# In[95]:

df_importances = pd.DataFrame(reg.feature_importances_,
                              index=final_features,
                              columns=["Importance"])
df_importances.sort_values("Importance", ascending=False, inplace=True)

print(df_importances)
    'SVR': {
        'model': SVR(),
        'param': {
            'clf__C': [0.1, 1, 10, 100],
            'clf__gamma': [1, 0.1, 0.01, 0.001],
            'clf__kernel': ['rbf', 'poly', 'sigmoid'],
        },
    },

    #         'XGB':{ "model":XGBRegressor(),
    #               "param":{"clf__learning_rate": [0.05,1,5],'clf__n_estimators': [100,50],
    # #                        "clf__max_depth": [5,10,15]
    #                   },
    #             },
    'GradientBoost': {
        "model": GradientBoostingRegressor(),
        "param": {
            "clf__model__n_estimators": [500, 600, 700, 800, 1000],
            #                         "clf__max_depth": [2, 3, 4]
        },
    },
    'decisionTree': {
        "model": GradientBoostingRegressor(),
        "param": {
            "clf__criterion": ['mse', 'mae'],
            'clf__min_samples_leaf': [5, 10, 15, 20, 25],
            'clf__max_depth': [6, 9, 12, 15, 20],
        },
    },
}
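
# A minimal sketch (not from the original) of how a spec dictionary like the one above is
# typically consumed; `model_specs`, `X_train` and `y_train` are hypothetical names, and the
# pipeline step is named 'clf' to match the 'clf__' parameter prefixes:
# for name, spec in model_specs.items():
#     pipe = Pipeline([('scaler', StandardScaler()), ('clf', spec['model'])])
#     search = GridSearchCV(pipe, spec['param'], cv=5, scoring='neg_mean_squared_error')
#     search.fit(X_train, y_train)
#     print(name, search.best_params_)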
                                 Y_train,
                                 cv=kfold,
                                 scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# ensembles
ensembles = []
ensembles.append(('ScaledAB',
                  Pipeline([('Scaler', StandardScaler()),
                            ('AB', AdaBoostRegressor())])))
ensembles.append(('ScaledGBM',
                  Pipeline([('Scaler', StandardScaler()),
                            ('GBM', GradientBoostingRegressor())])))
ensembles.append(('ScaledRF',
                  Pipeline([('Scaler', StandardScaler()),
                            ('RF', RandomForestRegressor())])))
ensembles.append(('ScaledET',
                  Pipeline([('Scaler', StandardScaler()),
                            ('ET', ExtraTreesRegressor())])))
results = []
names = []
for name, model in ensembles:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(model,
                                 X_train,
                                 Y_train,
                                 cv=kfold,
                                 scoring=scoring)
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from random import shuffle
from sklearn.datasets import fetch_california_housing
# Note: this import path is from pre-0.24 scikit-learn; newer releases expose
# partial dependence plots via sklearn.inspection instead.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.linear_model import LinearRegression

# Download data
tmp = fetch_california_housing()

num_samples = tmp['data'].shape[0]
feature_names = tmp['feature_names']
y = tmp['target']
X = tmp['data']

clf = GradientBoostingRegressor(loss="ls")
clf.fit(X,y)

plt.close("all")
plt.figure(figsize=[10,10])
ax = plt.gca()
plot_partial_dependence(clf, X, feature_names, feature_names, n_cols=3, ax=ax)
plt.tight_layout()
plt.show()

clf2 = LinearRegression()
clf2.fit(X,y)

MSE_boosting = np.mean((y-clf.predict(X))**2)
MSE_LR = np.mean((y-clf2.predict(X))**2)
dat1 = df.loc[:, ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']]

X_train, X_test, y_train, y_test = train_test_split(dat1, target, test_size = 0.2, random_state=42)
y_train = y_train.values.ravel()

models = []
models.append(('SVR', SVR()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('DT', DecisionTreeRegressor()))
models.append(('RF', RandomForestRegressor()))
models.append(('l', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('R', Ridge()))
models.append(('BR', BayesianRidge()))
models.append(('GBR', GradientBoostingRegressor()))
models.append(('AB', AdaBoostRegressor()))
models.append(('ET', ExtraTreesRegressor()))
models.append(('BgR', BaggingRegressor()))

scoring = 'neg_mean_squared_error'

results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=42)
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
Example #28
 def gredient_boosting(self):
     model = GradientBoostingRegressor()
     return self.fiting_model(model)
data = data.drop(['origin', 'destination', 'train_type', 'train_class', 'fare'], axis=1)
data = pd.concat([one_hot_encoding, data], axis=1)

data = data.astype(float)

# keep every column except column 45, which is the target
X = data.iloc[:, list(range(0, 45)) + list(range(46, 52))]
Y = data.iloc[:, [45]]

#split
X_train1, X_test1, y_train1, y_test1 = train_test_split(X.values, Y.values, test_size=.9, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train1, y_train1, test_size=.1, random_state=42)

#model
model = GradientBoostingRegressor(n_estimators=100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(mse)
R2 = r2_score(y_test, y_pred)
print(R2)
score_train = model.score(X_train, y_train)
print(score_train)
score_test = model.score(X_test,y_test)
print(score_test)

# learning curve
# adapted from the scikit-learn example: https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
def plot_curve():
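    # A minimal sketch of the body (the original is cut off here), following the
    # scikit-learn example referenced above; model, X_train, y_train and cv come from
    # the code above, while np and plt are assumed to be imported.
    from sklearn.model_selection import learning_curve
    train_sizes, train_scores, test_scores = learning_curve(
        model, X_train, y_train, cv=cv, n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 5))
    plt.plot(train_sizes, train_scores.mean(axis=1), 'o-', label='Training score')
    plt.plot(train_sizes, test_scores.mean(axis=1), 'o-', label='Cross-validation score')
    plt.xlabel('Training examples')
    plt.ylabel('R^2 score')
    plt.legend(loc='best')
    plt.show()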
Example #30
    else:
        break
selected_features_BE = cols
X = df[selected_features_BE]
X = df[["CRuns", "OrtCWalks", "CWalks"]]
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=46)
gbm_cv_model_best_params_ = {
    'learning_rate': 0.01,
    'loss': 'lad',
    'max_depth': 5,
    'n_estimators': 500,
    'subsample': 0.5
}
gbm_tuned = GradientBoostingRegressor(**gbm_cv_model_best_params_).fit(
    X_train, y_train)
y_pred = gbm_tuned.predict(X_test)
gbm_final = np.sqrt(mean_squared_error(y_test, y_pred))
print(gbm_final)

import pickle

pickle.dump(gbm_tuned, open('regression_model.pkl', 'wb'))

print("Model Kaydedildi")