Exemple #1
0
def est_HGB(est):
    """Return a histogram gradient-boosting regressor and its search grid.

    Args:
        est: Ignored. The name is rebound to a newly constructed estimator
            before returning, so whatever the caller passes in is discarded.

    Returns:
        tuple: ``(estimator, grid)`` where ``estimator`` is a
        default-constructed ``ensemble.HistGradientBoostingRegressor`` and
        ``grid`` is a one-element list holding a dict that maps parameter
        names to tuples of candidate values.
    """
    # NOTE(review): 'ls', 'lad' and 'huber' look like loss names of the
    # classic GradientBoostingRegressor -- confirm that the histogram-based
    # estimator accepts them in the scikit-learn version used here.
    search_space = {
        'warm_start': (False, True),
        'max_depth': (1, 10, 100, None),
        'min_samples_leaf': (2, 5, 10),
        'loss': ('ls', 'lad', 'huber', 'quantile'),
        'max_leaf_nodes': (2, 10, 20, 30, 40, 50, 100),
    }
    est = ensemble.HistGradientBoostingRegressor()
    return est, [search_space]
Exemple #2
0
def hist_gra(diamonds,
             test_s,
             type_i='dum',
             learn_rate=0.16,
             make_pred=True,
             verb=0):
    """Train a histogram gradient-boosting regressor to predict ``price``.

    The data is split into train/test parts, a model is fitted on the train
    part and its test-set mean squared error is printed and returned.
    Optionally a second model is trained on the full data and used to write
    a submission CSV.

    Args:
        diamonds: Table with a ``price`` column (the target); every other
            column is used as a feature, except ``Unnamed: 0`` and
            ``level_0`` which are dropped when present.
        test_s: Forwarded to ``train_test_split`` as ``test_size``.
        type_i: Tag inserted into the file names
            ``output/diamonds_test_<type_i>.csv`` (read) and
            ``output/pred_HG_<type_i>.csv`` (written).
        learn_rate: ``learning_rate`` of the regressor.
        make_pred: When true, generate the submission file.
        verb: ``verbose`` level of the regressor.

    Returns:
        The mean squared error on the held-out test split.
    """
    # Bookkeeping columns that must not be used as features (presumably
    # index artefacts from earlier CSV / reset_index operations).
    extra_cols = ('Unnamed: 0', 'level_0')

    X = diamonds.drop(columns=['price'])
    X = X.drop(columns=[col for col in extra_cols if col in X.columns])
    y = diamonds['price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_s)
    params = {'learning_rate': learn_rate, 'warm_start': True, 'verbose': verb}

    clf = ensemble.HistGradientBoostingRegressor(**params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    print("For the HistGradient Boosting Regressor the MSE is: %.4f" % mse)

    if make_pred:
        print('Generating submission file ...')
        # Train the submission model on a *fresh* estimator. Calling fit()
        # again on ``clf`` would, because of warm_start=True, reuse the
        # ensemble already grown on the training split instead of learning
        # from the full data set.
        final_clf = ensemble.HistGradientBoostingRegressor(**params)
        final_clf.fit(X, y)

        X_sub = pd.read_csv('output/diamonds_test_' + type_i + '.csv')
        X_sub = X_sub.reset_index().set_index('index')
        X_sub = X_sub.drop(
            columns=[col for col in extra_cols if col in X_sub.columns])

        y_sub = final_clf.predict(X_sub)
        # Prices are written as non-negative integers.
        y_sub = pd.DataFrame({
            'id': range(len(y_sub)),
            'price': np.absolute(y_sub.astype(int))
        })
        y_sub.to_csv('output/pred_HG_' + type_i + '.csv', index=False)

    return mse
Exemple #3
0
    def comp_boosting(self):
        """Fit two boosted regressors and store their test-set MSEs.

        The exercise text asks for component-wise boosting with (i) linear
        models, (ii) splines and (iii) trees, plus variable selection
        frequencies and the regression coefficients of the first model.
        What this method actually does:

        * slot (i): fits a ``GradientBoostingRegressor``;
        * slot (ii): fits nothing, its result is stored as 0;
        * slot (iii): fits a ``HistGradientBoostingRegressor``.

        Both models are trained on ``self.Xtrain`` / ``self.ytrain`` and
        scored on ``self.Xtest`` / ``self.ytest``. The mean squared errors
        are saved as ``self.BST1E1P6``, ``self.BST2E1P6`` and
        ``self.BST3E1P6`` for Exercise 7. Selection frequencies and
        coefficients are not computed.

        Returns:
            int: Always 1.
        """
        import sklearn.ensemble as skle

        def holdout_mse(regressor):
            # Fit on the training data and return the MSE on the test data.
            fitted = regressor.fit(self.Xtrain, self.ytrain)
            predicted = fitted.predict(self.Xtest).tolist()
            observed = np.array(self.ytest.values.tolist())
            return np.mean((observed - predicted)**2)

        # Linear-models slot.
        linear_slot_mse = holdout_mse(skle.GradientBoostingRegressor())

        # Splines slot: not implemented.

        # Trees slot. The experimental import is kept ahead of the estimator
        # lookup; presumably older scikit-learn releases need it to expose
        # HistGradientBoostingRegressor -- TODO confirm.
        from sklearn.experimental import enable_hist_gradient_boosting
        tree_slot_mse = holdout_mse(skle.HistGradientBoostingRegressor())

        # Save these values for Exercise 7.
        self.BST1E1P6 = linear_slot_mse
        self.BST2E1P6 = 0  # placeholder for the missing spline MSE
        self.BST3E1P6 = tree_slot_mse

        return 1
Exemple #4
0
# Persist the fitted scalers. The context managers guarantee the files are
# flushed and closed as soon as each dump has finished.
with open('scaler_x_shear.pkl', 'wb') as scaler_file:
    dump(sc_x, scaler_file)
with open('scaler_y_shear.pkl', 'wb') as scaler_file:
    dump(sc_y, scaler_file)

print('Training Features Shape:', x_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', x_test.shape)
print('Testing Labels Shape:', y_test.shape)

# Candidate values explored by the grid search.
# NOTE(review): the 'poisson' loss needs non-negative targets; if sc_y
# standardises the targets those candidates will fail to fit -- confirm.
hyper_params = [{'warm_start': (True, False,),
                 'max_depth': (None,),
                 'min_samples_leaf': (1, 5, 10, 15, 20, 25, 50, 100,),
                 'loss': ('least_squares', 'least_absolute_deviation', 'poisson',),
                 'max_leaf_nodes': (2, 10, 20, 30, 40, 50, 100,),
}]

est = ensemble.HistGradientBoostingRegressor()
gs = GridSearchCV(est, cv=10, param_grid=hyper_params, verbose=2, n_jobs=n_jobs, scoring='r2')

t0 = time.time()
gs.fit(x_train, y_train.ravel())
runtime = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s" % runtime)

# Map targets and predictions back to the original scale once per split
# instead of once per metric. The estimator was fitted on a flattened
# target, so its predictions are 1-D; they are reshaped to a single column
# before being handed to the scaler (assumes y_train / y_test are
# single-column 2-D arrays -- TODO confirm).
y_train_true = sc_y.inverse_transform(y_train)
y_train_pred = sc_y.inverse_transform(gs.predict(x_train).reshape(-1, 1))
y_test_true = sc_y.inverse_transform(y_test)
y_test_pred = sc_y.inverse_transform(gs.predict(x_test).reshape(-1, 1))

train_score_mse = mean_squared_error(y_train_true, y_train_pred)
train_score_mae = mean_absolute_error(y_train_true, y_train_pred)
train_score_evs = explained_variance_score(y_train_true, y_train_pred)
train_score_me = max_error(y_train_true, y_train_pred)
train_score_r2 = r2_score(y_train_true, y_train_pred)

test_score_mse = mean_squared_error(y_test_true, y_test_pred)
test_score_mae = mean_absolute_error(y_test_true, y_test_pred)
Exemple #5
0
def gradient_boosting(df_train, df_test, target, features):
    """Train, evaluate and visualise a gradient-boosting model in Streamlit.

    A pipeline of ``StandardScaler`` followed by
    ``HistGradientBoostingRegressor(max_iter=200)`` is fitted on the training
    frame and evaluated on the test frame. MSE, R^2 and timings are written
    to the page, followed by a joint plot of actual vs. predicted values.
    When the sidebar checkbox is ticked, permutation feature importances are
    computed on the test data and shown as a box plot.

    Args:
        df_train: Training table; indexed with ``features`` and ``target``.
        df_test: Test table; indexed with ``features`` and ``target``.
        target: Name of the column to predict.
        features: Names of the columns used as model inputs.

    Returns:
        None. All results are rendered through Streamlit.
    """
    st.header("Results")
    t_start = time.time()
    with st.spinner("Training in progress..."):
        X_train = df_train[features]
        y_train = df_train[target]

        model = pipeline.Pipeline([
            ('scaler', preprocessing.StandardScaler()),
            ('gboost', ensemble.HistGradientBoostingRegressor(max_iter=200))
        ]).fit(X_train, y_train)
        t_train = time.time() - t_start

    t_start = time.time()
    with st.spinner("Testing in progress...."):
        X_test = df_test[features]
        y_test = df_test[target]
        y_test_pred = model.predict(X_test)
        # scikit-learn metrics expect (y_true, y_pred). MSE is symmetric,
        # but R^2 is not: swapping the arguments normalises by the variance
        # of the predictions and reports a wrong score.
        mse = metrics.mean_squared_error(y_test, y_test_pred)
        r2 = metrics.r2_score(y_test, y_test_pred)
        t_test = time.time() - t_start

    st.write(f"**MSE**: {mse:.2f}")
    st.write(f"**R^2 Score**: {r2:.3f}")
    st.write(f"_Time train: {t_train:.3f} s, test: {t_test:.3f} s_")

    # Data vectors are passed by keyword; recent seaborn releases no longer
    # accept them positionally.
    joint = sns.jointplot(x=y_test, y=y_test_pred,
                          alpha=0.1, s=1.0, color="black")
    joint.set_axis_labels("Actual Pelvis Moment", 'Predicted Pelvis Moment')
    joint.ax_joint.grid()
    st.pyplot()

    st.sidebar.header("Feature importances")
    do_imps = st.sidebar.checkbox("Compute Feature Importance", value=False)

    if do_imps:
        n_repeats = st.sidebar.number_input(label="Repeats",
                                            value=10,
                                            min_value=1)

        st.header("Feature importances")
        st.write("The permutation feature importance is defined to be the "
                 "decrease in a model score when a single feature value is "
                 "randomly shuffled.")

        with st.spinner("Computing permutation importance...."):
            imps = inspection.permutation_importance(model,
                                                     X_test,
                                                     y_test,
                                                     n_repeats=n_repeats,
                                                     random_state=42,
                                                     n_jobs=-1)
            # Ascending order of mean importance, so the most important
            # feature ends up at the top of the horizontal box plot.
            sorted_imps_idx = imps.importances_mean.argsort()

        fig, ax = plt.subplots()
        ax.boxplot(imps.importances[sorted_imps_idx].T,
                   vert=False,
                   showfliers=False,
                   labels=X_test.columns[sorted_imps_idx])
        fig.tight_layout()
        st.pyplot()
Exemple #6
0
print('Training Features Shape:', x_train.shape)
print('Training Labels Shape:',   y_train.shape)
print('Testing Features Shape:',  x_test.shape)
print('Testing Labels Shape:',    y_test.shape)

# Parameters of the wrapped regressor are addressed through the
# ``estimator__`` prefix of MultiOutputRegressor, see:
# https://stackoverflow.com/questions/43532811/gridsearch-over-multioutputregressor/52562463
# https://coderoad.ru/43532811/GridSearch-%D0%B7%D0%B0-MultiOutputRegressor
# NOTE(review): the 'poisson' loss needs non-negative targets; if sc_y
# standardises the targets those candidates will fail to fit -- confirm.
hyper_params = [{'estimator__warm_start': (True, False,),
                 'estimator__max_depth': (None,),
                 'estimator__min_samples_leaf': (1, 5, 10, 15, 20, 25, 50, 100,),
                 'estimator__loss': ('least_squares', 'least_absolute_deviation', 'poisson',),
                 'estimator__max_leaf_nodes': (2, 10, 20, 30, 40, 50, 100,),
}]

est = ensemble.HistGradientBoostingRegressor(random_state=69)
gs = GridSearchCV(MultiOutputRegressor(est), cv=10, param_grid=hyper_params, verbose=2, n_jobs=n_jobs, scoring='r2', refit=True)

t0 = time.time()
gs.fit(x_train, y_train)
runtime = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s" % runtime)

# Map targets and predictions back to the original scale once per split;
# every metric below reuses these arrays instead of re-running predict()
# and inverse_transform() for each score.
y_train_true = sc_y.inverse_transform(y_train)
y_train_pred = sc_y.inverse_transform(gs.predict(x_train))
y_test_true = sc_y.inverse_transform(y_test)
y_test_pred = sc_y.inverse_transform(gs.predict(x_test))

train_score_mse = mean_squared_error(y_train_true, y_train_pred)
train_score_mae = mean_absolute_error(y_train_true, y_train_pred)
train_score_evs = explained_variance_score(y_train_true, y_train_pred)
# max_error is left out here -- presumably because it does not support
# multi-output targets (TODO confirm).
train_score_r2 = r2_score(y_train_true, y_train_pred)

test_score_mse = mean_squared_error(y_test_true, y_test_pred)
test_score_mae = mean_absolute_error(y_test_true, y_test_pred)