def est_HGB(est):
    """Return a HistGradientBoostingRegressor and a hyper-parameter grid for it.

    Args:
        est: Ignored. The argument is kept for interface compatibility only;
            a freshly constructed estimator is always returned.

    Returns:
        A tuple ``(est, hp)``: ``est`` is a default-constructed
        ``ensemble.HistGradientBoostingRegressor`` and ``hp`` is a one-element
        list holding the parameter grid (list-of-dicts layout, e.g. for a
        grid search).
    """
    hp = [{
        'warm_start': (False, True),
        'max_depth': (1, 10, 100, None),
        'min_samples_leaf': (2, 5, 10),
        # The previous values ('ls', 'lad', 'huber', 'quantile') are the loss
        # names of GradientBoostingRegressor; the histogram-based estimator
        # rejects 'ls', 'lad' and 'huber'. 'ls' and 'lad' are mapped to their
        # histogram-based spellings; the other two have no counterpart here.
        # NOTE(review): these spellings belong to scikit-learn releases before
        # the rename to 'squared_error' / 'absolute_error' -- confirm against
        # the installed version.
        'loss': ('least_squares', 'least_absolute_deviation'),
        'max_leaf_nodes': (2, 10, 20, 30, 40, 50, 100),
    }]
    est = ensemble.HistGradientBoostingRegressor()
    return est, hp
def _drop_index_artifacts(frame):
    """Return *frame* without the 'Unnamed: 0' / 'level_0' columns, if present."""
    stale = [name for name in ('Unnamed: 0', 'level_0') if name in frame.columns]
    return frame.drop(columns=stale)


def hist_gra(diamonds, test_s, type_i='dum', learn_rate=0.16, make_pred=True, verb=0):
    """Fit a HistGradientBoostingRegressor on the price data and report its MSE.

    Args:
        diamonds: DataFrame with a 'price' column (the target); every other
            column except 'Unnamed: 0' and 'level_0' is used as a feature.
        test_s: Forwarded to ``train_test_split`` as ``test_size``.
        type_i: Suffix used in the file names
            'output/diamonds_test_<type_i>.csv' (read) and
            'output/pred_HG_<type_i>.csv' (written).
        learn_rate: ``learning_rate`` of the regressor.
        make_pred: When true, refit on the full data and write a submission
            file with the predictions for the test CSV.
        verb: ``verbose`` level of the regressor.

    Returns:
        The mean squared error measured on the held-out split.
    """
    X = _drop_index_artifacts(diamonds.drop(columns=['price']))
    y = diamonds['price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_s)

    params = {'learning_rate': learn_rate, 'warm_start': True, 'verbose': verb}
    clf = ensemble.HistGradientBoostingRegressor(**params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    print("For the HistGradient Boosting Regressor the MSE is: %.4f" % mse)

    if make_pred:
        print('Generating submission file ...')
        # Fit a *fresh* estimator on the full data. Calling fit() again on
        # `clf` would, because of warm_start=True, resume from the trees built
        # on the training split and add no new iterations once max_iter has
        # been reached -- the full data would effectively be ignored.
        final_clf = ensemble.HistGradientBoostingRegressor(**params)
        final_clf.fit(X, y)

        X_sub = pd.read_csv('output/diamonds_test_' + type_i + '.csv')
        X_sub = X_sub.reset_index().set_index('index')
        X_sub = _drop_index_artifacts(X_sub)

        y_sub = final_clf.predict(X_sub)
        # Predictions are truncated to integers and forced non-negative.
        y_sub = pd.DataFrame({
            'id': range(len(y_sub)),
            'price': np.absolute(y_sub.astype(int)),
        })
        y_sub.to_csv('output/pred_HG_' + type_i + '.csv', index=False)

    return mse
def comp_boosting(self):
    """Fit boosted regressors and store their test-set MSE on the instance.

    Exercise text: fit a component-wise boosting model using (i) linear
    models, (ii) splines and (iii) trees; report the variable selection
    frequencies in all three cases and the regression coefficients for the
    first model.

    What the code does: slot (i) is filled with a default
    ``GradientBoostingRegressor``, slot (ii) is not implemented (its MSE is
    stored as 0) and slot (iii) uses ``HistGradientBoostingRegressor``. The
    selection frequencies and coefficients are not reported.

    Reads ``self.Xtrain``, ``self.ytrain``, ``self.Xtest`` and ``self.ytest``;
    writes ``self.BST1E1P6``, ``self.BST2E1P6`` and ``self.BST3E1P6`` (saved
    for Exercise 7).

    Returns:
        Always 1.
    """
    import sklearn.ensemble as skle

    def _test_mse(regressor):
        # Fit on the training split and score on the test split.
        fitted = regressor.fit(self.Xtrain, self.ytrain)
        predicted = fitted.predict(self.Xtest).tolist()
        observed = np.array(self.ytest.values.tolist())
        return np.mean((observed - predicted) ** 2)

    # (i) "Linear models" slot.
    first_slot_mse = _test_mse(skle.GradientBoostingRegressor())

    # (ii) Splines: not implemented.

    # (iii) Trees. The experimental import is executed for its side effect
    # before the histogram-based estimator is looked up.
    from sklearn.experimental import enable_hist_gradient_boosting
    third_slot_mse = _test_mse(skle.HistGradientBoostingRegressor())

    # Save these values for Exercise 7.
    self.BST1E1P6 = first_slot_mse
    self.BST2E1P6 = 0  # placeholder for the spline MSE
    self.BST3E1P6 = third_slot_mse

    return 1
# Persist both scalers. The context managers close the pickle files
# deterministically; passing a bare open() to dump() leaked the handles.
with open('scaler_x_shear.pkl', 'wb') as scaler_x_file:
    dump(sc_x, scaler_x_file)
with open('scaler_y_shear.pkl', 'wb') as scaler_y_file:
    dump(sc_y, scaler_y_file)

print('Training Features Shape:', x_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', x_test.shape)
print('Testing Labels Shape:', y_test.shape)

# Grid explored by the cross-validated search below.
# NOTE(review): the 'poisson' loss needs non-negative targets; if sc_y
# produces negative values those grid cells will fail to fit -- confirm.
hyper_params = [{
    'warm_start': (True, False,),
    'max_depth': (None,),
    'min_samples_leaf': (1, 5, 10, 15, 20, 25, 50, 100,),
    'loss': ('least_squares', 'least_absolute_deviation', 'poisson',),
    'max_leaf_nodes': (2, 10, 20, 30, 40, 50, 100,),
}]

est = ensemble.HistGradientBoostingRegressor()
gs = GridSearchCV(est, cv=10, param_grid=hyper_params, verbose=2,
                  n_jobs=n_jobs, scoring='r2')

t0 = time.time()
gs.fit(x_train, y_train.ravel())
runtime = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s" % runtime)

# Metrics are reported in the original target units, so both the reference
# values and the predictions go through sc_y.inverse_transform. Each array is
# computed once and shared by all metrics instead of re-predicting per metric.
y_train_orig = sc_y.inverse_transform(y_train)
y_train_fit = sc_y.inverse_transform(gs.predict(x_train))

train_score_mse = mean_squared_error(y_train_orig, y_train_fit)
train_score_mae = mean_absolute_error(y_train_orig, y_train_fit)
train_score_evs = explained_variance_score(y_train_orig, y_train_fit)
train_score_me = max_error(y_train_orig, y_train_fit)
train_score_r2 = r2_score(y_train_orig, y_train_fit)

y_test_orig = sc_y.inverse_transform(y_test)
y_test_fit = sc_y.inverse_transform(gs.predict(x_test))

test_score_mse = mean_squared_error(y_test_orig, y_test_fit)
test_score_mae = mean_absolute_error(y_test_orig, y_test_fit)
def gradient_boosting(df_train, df_test, target, features):
    """Train and evaluate a histogram-based gradient boosting model in Streamlit.

    A pipeline of ``StandardScaler`` followed by
    ``HistGradientBoostingRegressor(max_iter=200)`` is fitted on ``df_train``
    and scored on ``df_test``. MSE, R^2 and the train/test timings are written
    to the page together with a joint plot of actual vs. predicted values.
    If the sidebar checkbox is ticked, permutation feature importances are
    computed on the test data and shown as a box plot.

    Args:
        df_train: Training data; indexed with ``features`` and ``target``.
        df_test: Test data; indexed with ``features`` and ``target``.
        target: Column label of the value to predict.
        features: Column labels used as model inputs.

    Returns:
        None. All results are rendered through Streamlit.
    """
    st.header("Results")

    t_start = time.time()
    with st.spinner("Training in progress..."):
        X_train = df_train[features]
        y_train = df_train[target]
        model = pipeline.Pipeline([
            ('scaler', preprocessing.StandardScaler()),
            ('gboost', ensemble.HistGradientBoostingRegressor(max_iter=200)),
        ]).fit(X_train, y_train)
    t_train = time.time() - t_start

    t_start = time.time()
    with st.spinner("Testing in progress...."):
        X_test = df_test[features]
        y_test = df_test[target]
        y_test_pred = model.predict(X_test)
        # scikit-learn metrics take (y_true, y_pred). MSE is symmetric, but
        # R^2 is not: with the arguments swapped the score is normalised by
        # the variance of the predictions instead of the observed values.
        mse = metrics.mean_squared_error(y_test, y_test_pred)
        r2 = metrics.r2_score(y_test, y_test_pred)
    t_test = time.time() - t_start

    st.write(f"**MSE**: {mse:.2f}")
    st.write(f"**R^2 Score**: {r2:.3f}")
    st.write(f"_Time train: {t_train:.3f} s, test: {t_test:.3f} s_")

    # Data vectors are passed by keyword: newer seaborn releases no longer
    # accept them positionally.
    grid = sns.jointplot(x=y_test, y=y_test_pred, alpha=0.1, s=1.0,
                         color="black")
    grid.set_axis_labels("Actual Pelvis Moment", 'Predicted Pelvis Moment')
    grid.ax_joint.grid()
    st.pyplot()

    st.sidebar.header("Feature importances")
    do_imps = st.sidebar.checkbox("Compute Feature Importance", value=False)
    if not do_imps:
        return

    n_repeats = st.sidebar.number_input(label="Repeats", value=10, min_value=1)
    st.header("Feature importances")
    st.write("The permutation feature importance is defined to be the "
             "decrease in a model score when a single feature value is "
             "randomly shuffled.")
    with st.spinner("Computing permutation importance...."):
        imps = inspection.permutation_importance(
            model, X_test, y_test,
            n_repeats=n_repeats, random_state=42, n_jobs=-1)

    # Order the features by mean importance for the box plot.
    sorted_imps_idx = imps.importances_mean.argsort()
    fig, ax = plt.subplots()
    ax.boxplot(imps.importances[sorted_imps_idx].T,
               vert=False, showfliers=False,
               labels=X_test.columns[sorted_imps_idx])
    fig.tight_layout()
    st.pyplot()
print('Training Features Shape:', x_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', x_test.shape)
print('Testing Labels Shape:', y_test.shape)

# Grid search over an estimator wrapped in MultiOutputRegressor: the wrapped
# estimator's parameters are addressed with the 'estimator__' prefix.
# https://stackoverflow.com/questions/43532811/gridsearch-over-multioutputregressor/52562463
# https://coderoad.ru/43532811/GridSearch-%D0%B7%D0%B0-MultiOutputRegressor
# NOTE(review): the 'poisson' loss needs non-negative targets; if sc_y
# produces negative values those grid cells will fail to fit -- confirm.
hyper_params = [{
    'estimator__warm_start': (True, False,),
    'estimator__max_depth': (None,),
    'estimator__min_samples_leaf': (1, 5, 10, 15, 20, 25, 50, 100,),
    'estimator__loss': ('least_squares', 'least_absolute_deviation', 'poisson',),
    'estimator__max_leaf_nodes': (2, 10, 20, 30, 40, 50, 100,),
}]

est = ensemble.HistGradientBoostingRegressor(random_state=69)
gs = GridSearchCV(MultiOutputRegressor(est), cv=10, param_grid=hyper_params,
                  verbose=2, n_jobs=n_jobs, scoring='r2', refit=True)

t0 = time.time()
gs.fit(x_train, y_train)
runtime = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s" % runtime)

# Metrics are reported in the original target units, so both the reference
# values and the predictions go through sc_y.inverse_transform. Each array is
# computed once and shared by all metrics instead of re-predicting per metric.
y_train_orig = sc_y.inverse_transform(y_train)
y_train_fit = sc_y.inverse_transform(gs.predict(x_train))

train_score_mse = mean_squared_error(y_train_orig, y_train_fit)
train_score_mae = mean_absolute_error(y_train_orig, y_train_fit)
train_score_evs = explained_variance_score(y_train_orig, y_train_fit)
# max_error is left out here -- presumably because it does not support
# multi-output targets (TODO confirm).
train_score_r2 = r2_score(y_train_orig, y_train_fit)

y_test_orig = sc_y.inverse_transform(y_test)
y_test_fit = sc_y.inverse_transform(gs.predict(x_test))

test_score_mse = mean_squared_error(y_test_orig, y_test_fit)
test_score_mae = mean_absolute_error(y_test_orig, y_test_fit)