def test_early_stopping_regression(scoring, validation_fraction,
                                   n_iter_no_change, tol):

    max_iter = 200

    X, y = make_regression(random_state=0)

    gb = HistGradientBoostingRegressor(
        verbose=1,  # just for coverage
        min_samples_leaf=5,  # easier to overfit fast
        scoring=scoring,
        tol=tol,
        validation_fraction=validation_fraction,
        max_iter=max_iter,
        n_iter_no_change=n_iter_no_change,
        random_state=0
    )
    gb.fit(X, y)

    if n_iter_no_change is not None:
        assert n_iter_no_change <= gb.n_iter_ < max_iter
    else:
        assert gb.n_iter_ == max_iter
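# The pytest parametrization for this test is not part of the snippet. Below
# is a plausible sketch of it; the exact grid is an assumption, chosen only
# so that both branches of the final assert (early stopping enabled vs.
# disabled) are exercised. It would be applied as a decorator on the test.
import pytest

early_stopping_params = pytest.mark.parametrize(
    'scoring, validation_fraction, n_iter_no_change, tol', [
        ('loss', 0.1, 5, 1e-7),    # early stopping on a validation split
        ('loss', None, 5, 1e-1),   # early stopping on the training loss
        (None, None, None, None),  # no early stopping: n_iter_ == max_iter
    ])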
best_loss = gs.best_params_['loss']

outF = open("output.txt", "w")
print('best_warm_start = ', best_warm_start, file=outF)
print('best_max_depth = ', best_max_depth, file=outF)
print('best_loss = ', best_loss, file=outF)
outF.close()

regr = HistGradientBoostingRegressor(
    warm_start=best_warm_start,
    max_depth=best_max_depth,
    loss=best_loss,
)

t0 = time.time()
regr.fit(x_train, y_train.ravel())
regr_fit = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s"
      % regr_fit)

t0 = time.time()
y_regr = regr.predict(x_test)
regr_predict = time.time() - t0
print("Prediction for %d inputs in %.6f s"
      % (x_test.shape[0], regr_predict))

# Re-open the file in append mode and log the timings as well.
outF = open("output.txt", "a")
print("Complexity and bandwidth selected and model fitted in %.6f s"
      % regr_fit, file=outF)
print("Prediction for %d inputs in %.6f s"
      % (x_test.shape[0], regr_predict), file=outF)
outF.close()
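# The `gs` object and the `best_warm_start` / `best_max_depth` variables come
# from a grid search that is not part of the snippet. A minimal sketch of
# what it could look like follows; only the searched parameter names follow
# from the snippet, the candidate values are assumptions (the loss names
# match the older scikit-learn spelling used elsewhere in this document).
from sklearn.model_selection import GridSearchCV

param_grid = {
    'warm_start': [True, False],
    'max_depth': [None, 5, 10],
    'loss': ['least_squares', 'least_absolute_deviation'],
}
gs = GridSearchCV(HistGradientBoostingRegressor(), param_grid, cv=3)
gs.fit(x_train, y_train.ravel())
best_warm_start = gs.best_params_['warm_start']
best_max_depth = gs.best_params_['max_depth']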
def test_poisson_y_positive(y):
    # Test that a ValueError is raised if any y_i < 0 or if sum(y_i) <= 0.
    err_msg = r"loss='poisson' requires non-negative y and sum\(y\) > 0."
    gbdt = HistGradientBoostingRegressor(loss='poisson', random_state=0)
    with pytest.raises(ValueError, match=err_msg):
        gbdt.fit(np.zeros(shape=(len(y), 1)), y)
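# The parametrization of `y` is not shown in the snippet. A plausible sketch
# with one case per failure mode; the exact values are assumptions, and it
# would be applied as a decorator on the test above.
poisson_y_cases = pytest.mark.parametrize(
    'y', [
        np.array([1.0, -2.0, 3e-4]),  # contains a negative value
        np.array([0.0, 0.0, 0.0]),    # sum(y) == 0
    ])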
display.figure_.subplots_adjust(hspace=0.3)

# %%
# Gradient boosting
# .................
#
# Let's now fit a :class:`sklearn.ensemble.HistGradientBoostingRegressor` and
# compute the partial dependence on the same features.
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

print("Training HistGradientBoostingRegressor...")
tic = time()
est = HistGradientBoostingRegressor()
est.fit(X_train, y_train)
print(f"done in {time() - tic:.3f}s")
print(f"Test R2 score: {est.score(X_test, y_test):.2f}")

# %%
# Here, we used the default hyperparameters for the gradient boosting model
# without any preprocessing, as tree-based models are naturally robust to
# monotonic transformations of numerical features.
#
# Note that on this tabular dataset, Gradient Boosting Machines are both
# significantly faster to train and more accurate than neural networks. It is
# also significantly cheaper to tune their hyperparameters (the defaults tend
# to work well, which is not often the case for neural networks).
#
# We will plot the partial dependence, both the individual curves (ICE) and
# the averaged one (PDP). We limit the plot to 50 ICE curves so as not to
# overcrowd it.
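# The plotting code itself is not included in the snippet. A sketch of what
# it could look like, assuming a scikit-learn version (around 0.24, matching
# the `enable_hist_gradient_boosting` import) that provides
# `plot_partial_dependence` with `kind` and `subsample`; newer versions use
# `PartialDependenceDisplay.from_estimator` instead. The feature indices are
# placeholders: the example would reuse the feature list from the earlier
# plot.
from sklearn.inspection import plot_partial_dependence

features = [0, 1]  # placeholder: reuse the features shown earlier
display = plot_partial_dependence(
    est, X_train, features,
    kind='both',        # overlay the ICE curves and their PDP average
    subsample=50,       # limit to 50 ICE curves, as announced above
    grid_resolution=20,
    random_state=0,
)
display.figure_.subplots_adjust(hspace=0.3)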
def find_best_params_histGradientBoosting(X_train, y_train, X_val, y_val,
                                          param1_list, param2_list,
                                          param3_list, param4_list):
    param1_name = "max_iter"
    param2_name = "max_leaf_nodes"
    param3_name = "max_depth"
    param4_name = "max_bins"

    print("mean_absolute_error [{0},{1},{2},{3}]: mean absolute error".format(
        param1_name, param2_name, param3_name, param4_name))

    # One score per combination of the four parameters.
    scores_mean_absolute_error = np.zeros(
        (len(param1_list), len(param2_list),
         len(param3_list), len(param4_list)))
    best_score_mean_absolute_error = float("inf")
    best_parameters = [0, 0, 0, 0]

    for i in range(len(param1_list)):
        for k in range(len(param2_list)):
            for t in range(len(param3_list)):
                for j in range(len(param4_list)):
                    param1 = param1_list[i]
                    param2 = param2_list[k]
                    param3 = param3_list[t]
                    param4 = param4_list[j]

                    regressor = HistGradientBoostingRegressor(
                        loss='least_squares', learning_rate=0.05,
                        max_iter=param1, max_leaf_nodes=param2,
                        max_depth=param3, min_samples_leaf=20,
                        l2_regularization=0.6, max_bins=param4,
                        scoring=None, validation_fraction=0.6,
                        n_iter_no_change=None, tol=1e-07, verbose=0,
                        random_state=0)
                    regressor.fit(X_train, y_train)

                    predictions = regressor.predict(X_val)
                    mean_absolute_error_ = mean_absolute_error(
                        y_val, predictions)
                    print("mean_absolute_error [{0},{1},{2},{3}]: {4}".format(
                        param1, param2, param3, param4,
                        mean_absolute_error_))

                    # Index with all four loop variables so that no score is
                    # overwritten (the original indexed a 2-D array with
                    # [i, j], silently discarding the max_leaf_nodes and
                    # max_depth dimensions).
                    scores_mean_absolute_error[i, k, t, j] = \
                        mean_absolute_error_

                    if mean_absolute_error_ < best_score_mean_absolute_error:
                        best_score_mean_absolute_error = mean_absolute_error_
                        best_parameters = [param1, param2, param3, param4]

    print("\n-------------------------")
    print("best_mean_absolute_error: {0}".format(
        best_score_mean_absolute_error))
    print("{0}: {1}".format(param1_name, best_parameters[0]))
    print("{0}: {1}".format(param2_name, best_parameters[1]))
    print("{0}: {1}".format(param3_name, best_parameters[2]))
    print("{0}: {1}".format(param4_name, best_parameters[3]))
    print("-------------------------\n")

    best_param1 = best_parameters[0]
    best_param2 = best_parameters[1]
    best_param3 = best_parameters[2]
    best_param4 = best_parameters[3]

    return (best_param1, best_param2, best_param3, best_param4,
            scores_mean_absolute_error)
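# A hypothetical invocation of the helper above; the candidate value lists
# are assumptions, not taken from the original script.
best_max_iter, best_max_leaf_nodes, best_max_depth, best_max_bins, scores = \
    find_best_params_histGradientBoosting(
        X_train, y_train, X_val, y_val,
        param1_list=[100, 200, 400],  # max_iter
        param2_list=[15, 31, 63],     # max_leaf_nodes
        param3_list=[None, 6, 12],    # max_depth
        param4_list=[128, 255],       # max_bins
    )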
def test_missing_values_minmax_imputation():
    # Compare the built-in missing value handling of Histogram GBC with an
    # a priori missing value imputation strategy that should yield the same
    # results in terms of decision function.
    #
    # Each feature (containing NaNs) is replaced by 2 features:
    # - one where the nans are replaced by min(feature) - 1
    # - one where the nans are replaced by max(feature) + 1
    # A split where nans go to the left has an equivalent split in the
    # first (min) feature, and a split where nans go to the right has an
    # equivalent split in the second (max) feature.
    #
    # Assuming the data is such that there is never a tie to select the best
    # feature to split on during training, the learned decision trees should
    # be strictly equivalent (learn a sequence of splits that encode the
    # same decision function).
    #
    # The MinMaxImputer transformer is meant to be a toy implementation of
    # the "Missing In Attributes" (MIA) missing value handling for decision
    # trees:
    # https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305
    # The implementation of MIA as an imputation transformer was suggested by
    # "Remark 3" in https://arxiv.org/abs/1902.06931

    class MinMaxImputer(BaseEstimator, TransformerMixin):

        def fit(self, X, y=None):
            mm = MinMaxScaler().fit(X)
            self.data_min_ = mm.data_min_
            self.data_max_ = mm.data_max_
            return self

        def transform(self, X):
            X_min, X_max = X.copy(), X.copy()
            for feature_idx in range(X.shape[1]):
                nan_mask = np.isnan(X[:, feature_idx])
                X_min[nan_mask, feature_idx] = self.data_min_[feature_idx] - 1
                X_max[nan_mask, feature_idx] = self.data_max_[feature_idx] + 1
            return np.concatenate([X_min, X_max], axis=1)

    def make_missing_value_data(n_samples=int(1e4), seed=0):
        rng = np.random.RandomState(seed)
        X, y = make_regression(n_samples=n_samples, n_features=4,
                               random_state=rng)

        # Pre-bin the data to ensure a deterministic handling by the 2
        # strategies and also make it easier to insert np.nan in a
        # structured way:
        X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X)

        # First feature has missing values completely at random:
        rnd_mask = rng.rand(X.shape[0]) > 0.9
        X[rnd_mask, 0] = np.nan

        # Second and third features have missing values for extreme values
        # (censoring missingness):
        low_mask = X[:, 1] == 0
        X[low_mask, 1] = np.nan

        high_mask = X[:, 2] == X[:, 2].max()
        X[high_mask, 2] = np.nan

        # Make the last feature nan pattern very informative:
        y_max = np.percentile(y, 70)
        y_max_mask = y >= y_max
        y[y_max_mask] = y_max
        X[y_max_mask, 3] = np.nan

        # Check that there is at least one missing value in each feature:
        for feature_idx in range(X.shape[1]):
            assert any(np.isnan(X[:, feature_idx]))

        # Let's use a test set to check that the learned decision function
        # is the same as evaluated on unseen data. Otherwise it could just
        # be the case that we find two independent ways to overfit the
        # training set.
        return train_test_split(X, y, random_state=rng)

    # n_samples needs to be large enough to minimize the likelihood of
    # having several candidate splits with the same gain value in a given
    # tree.
    X_train, X_test, y_train, y_test = make_missing_value_data(
        n_samples=int(1e4), seed=0)

    # Use a small number of leaf nodes and iterations so as to keep
    # under-fitting models, to minimize the likelihood of ties when training
    # the model.
    gbm1 = HistGradientBoostingRegressor(max_iter=100, max_leaf_nodes=5,
                                         random_state=0)
    gbm1.fit(X_train, y_train)

    gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1))
    gbm2.fit(X_train, y_train)

    # Check that the models reach the same score:
    assert gbm1.score(X_train, y_train) == \
        pytest.approx(gbm2.score(X_train, y_train))

    assert gbm1.score(X_test, y_test) == \
        pytest.approx(gbm2.score(X_test, y_test))

    # Check that the individual predictions match, as a finer grained
    # decision function check.
    assert_allclose(gbm1.predict(X_train), gbm2.predict(X_train))
    assert_allclose(gbm1.predict(X_test), gbm2.predict(X_test))
print('MAE is {}'.format(test_score_mae))
print('MSE is {}'.format(test_score_mse))
print('EVS is {}'.format(test_score_evs))
# print('ME is {}'.format(test_score_me))
print('R2 score is {}'.format(test_score_r2))
print()
print("Best parameters set found on development set:")
print(gs.best_params_)
print()

# Re-train with best parameters
regr = HistGradientBoostingRegressor(**gs.best_params_, random_state=69)
regr = MultiOutputRegressor(regr)

t0 = time.time()
regr.fit(x_train, y_train)
regr_fit = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s"
      % regr_fit)

t0 = time.time()
y_regr = regr.predict(x_test)
regr_predict = time.time() - t0
print("Prediction for %d inputs in %.6f s"
      % (x_test.shape[0], regr_predict))

with open('output.log', 'w') as f:
    print("Training time: %.6f s" % regr_fit, file=f)
    print("Prediction time: %.6f s" % regr_predict, file=f)
    print(" ", file=f)
    print("The model performance for training set", file=f)
    print("--------------------------------------", file=f)
    print('MAE is {}'.format(train_score_mae), file=f)
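# The `test_score_*` and `train_score_*` values printed above are computed
# earlier in the script and not shown. A hedged sketch of how they could be
# derived with sklearn.metrics (the use of the fitted grid search `gs` for
# the predictions is an assumption):
from sklearn.metrics import (mean_absolute_error, mean_squared_error,
                             explained_variance_score, r2_score)

y_pred = gs.predict(x_test)  # predictions from the refit grid-search model
test_score_mae = mean_absolute_error(y_test, y_pred)
test_score_mse = mean_squared_error(y_test, y_pred)
test_score_evs = explained_variance_score(y_test, y_pred)
test_score_r2 = r2_score(y_test, y_pred)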
y_train_df = augmented_df[["actual_adr"]]
X_train_df = augmented_df.drop(["actual_adr"], axis=1)

#%%
X_train, X_test, y_train, y_test = (
    X_train_df.to_numpy(),
    X_test_df.to_numpy(),
    y_train_df["actual_adr"].to_numpy(),
    y_test_df["actual_adr"].to_numpy(),
)
print(f"X_train shape {X_train.shape}, y_train shape {y_train.shape}")
print(f"X_test shape {X_test.shape}, y_test shape {y_test.shape}")

#%% evaluate performance on the test set with a model fit on training data
eval_reg = HistGradientBoostingRegressor(random_state=1126)
eval_reg.fit(X_train, y_train)

print("-" * 10, "regression report", "-" * 10)
report = regression_report(y_test, eval_reg.predict(X_test), X_test.shape[1])
print(report)

print("-" * 10, "evaluation of label", "-" * 10)
label_df = data.get_true_label(
    columns=["adr", "revenue", "is_canceled", "label"])
pred_label_df = data.predict_label(eval_reg, X_test_df, reg_out="adr")

#%%
print("[ label evaluation ]")
report_label = evaluate_by_label(pred_label_df, label_df, target="label")
print(report_label)

print("[ revenue_per_day evaluation ]")
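# `regression_report`, `data`, and `evaluate_by_label` are project-specific
# helpers not shown here. A minimal stand-in for `regression_report`
# consistent with its call signature might look like the following; the
# interpretation of the third argument as a feature count used for an
# adjusted R^2 is an assumption.
from sklearn.metrics import mean_absolute_error, r2_score

def regression_report(y_true, y_pred, n_features):
    """Hypothetical stand-in: reports MAE, R^2, and adjusted R^2."""
    n = len(y_true)
    r2 = r2_score(y_true, y_pred)
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - n_features - 1)
    return (f"MAE:         {mean_absolute_error(y_true, y_pred):.4f}\n"
            f"R2:          {r2:.4f}\n"
            f"adjusted R2: {adj_r2:.4f}")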
def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
                                     max_leaf_nodes):
    # Make sure sklearn has the same predictions as lightgbm for easy
    # targets.
    #
    # In particular when the sizes of the trees are bounded and the number
    # of samples is large enough, the structure of the prediction trees
    # found by LightGBM and sklearn should be exactly identical.
    #
    # Notes:
    # - Several candidate splits may have equal gains when the number of
    #   samples in a node is low (and because of float errors). Therefore
    #   the predictions on the test set might differ if the structure of
    #   the tree is not exactly the same. To avoid this issue we only
    #   compare the predictions on the test set when the number of samples
    #   is large enough and max_leaf_nodes is low enough.
    # - To ignore discrepancies caused by small differences in the binning
    #   strategy, data is pre-binned if n_samples > 255.
    # - We don't check the least_absolute_deviation loss here. This is
    #   because LightGBM's computation of the median (used for the initial
    #   value of raw_prediction) is a bit off (they'll e.g. return midpoints
    #   when there is no need to). Since these tests only run 1 iteration,
    #   the discrepancy between the initial values leads to biggish
    #   differences in the predictions. These differences are much smaller
    #   with more iterations.

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255

    X, y = make_regression(n_samples=n_samples, n_features=5,
                           n_informative=5, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(
            np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=rng)

    est_sklearn = HistGradientBoostingRegressor(
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    # less than 1% of the predictions are different up to the 3rd decimal
    assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-3) < .011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        # less than 1% of the predictions are different up to the 4th
        # decimal
        assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-4) < .01
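# The parametrization for this test is not included in the snippet. A
# plausible sketch consistent with the comments above; the exact grid is an
# assumption, chosen so that both code paths are exercised (n_samples > 255
# triggers pre-binning, and n_samples >= 1000 with max_leaf_nodes < 10
# enables the test-set comparison). The marks would be stacked as decorators
# on the test above.
same_predictions_params = [
    pytest.mark.parametrize('seed', range(5)),
    pytest.mark.parametrize('min_samples_leaf', (1, 20)),
    pytest.mark.parametrize('n_samples, max_leaf_nodes',
                            [(255, 4096), (1000, 8)]),
]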
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

n_samples, n_features = 1000, 20
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
# positive integer target correlated with X[:, 5] with many zeros:
y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
glm = PoissonRegressor()
gbdt = HistGradientBoostingRegressor(loss='poisson', learning_rate=.01)
glm.fit(X_train, y_train)
gbdt.fit(X_train, y_train)
print(glm.score(X_test, y_test))
print(gbdt.score(X_test, y_test))

##############################################################################
# Rich visual representation of estimators
# -----------------------------------------
# Estimators can now be visualized in notebooks by enabling the
# `display='diagram'` option. This is particularly useful to summarise the
# structure of pipelines and other composite estimators, with interactivity
# to provide detail. Click on the example image below to expand Pipeline
# elements. See :ref:`visualizing_composite_estimators` for how you can use
# this feature.

from sklearn import set_config
from sklearn.pipeline import make_pipeline
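# The snippet cuts off right after the imports. A minimal sketch of how the
# diagram display would be enabled; `set_config(display='diagram')` is the
# real scikit-learn API, while the pipeline itself is an illustrative
# assumption, not the pipeline from the original example.
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

set_config(display='diagram')  # enable the HTML diagram representation

# Evaluating `clf` as the last expression of a notebook cell renders the
# interactive diagram instead of the plain text repr.
clf = make_pipeline(StandardScaler(), LogisticRegression())
clf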