def test_same_predictions_regression(seed, min_samples_leaf, n_samples, max_leaf_nodes):
    """Check that sklearn and LightGBM agree on an easy regression target.

    When tree size is bounded and there are enough samples, both libraries
    are expected to grow exactly the same tree.

    Notes
    -----
    - With few samples per node several candidate splits can have equal
      gains (also because of float errors), so the tree structure - and
      therefore the test-set predictions - may differ. The test set is only
      compared when ``n_samples >= 1000`` and ``max_leaf_nodes < 10``.
    - To rule out differences caused by the binning strategy, the data is
      pre-binned when ``n_samples > 255``.
    """
    shuffle_rng = np.random.RandomState(seed=seed)
    n_bins = 256

    X, y = make_regression(n_samples=n_samples, n_features=5,
                           n_informative=5, random_state=0)
    if n_samples > 255:
        # Bin the data, then cast to float32 so that the estimator does not
        # treat it as pre-binned.
        binned = _BinMapper(max_bins=n_bins).fit_transform(X)
        X = binned.astype(np.float32)

    X_train, X_test, y_train, _ = train_test_split(
        X, y, random_state=shuffle_rng)

    est_sklearn = HistGradientBoostingRegressor(
        max_iter=1,
        max_bins=n_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')
    for estimator in (est_lightgbm, est_sklearn):
        estimator.fit(X_train, y_train)

    # X must be handled as numerical data, not as pre-binned data.
    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)

    def disagreement(features, tol):
        # Fraction of samples whose two predictions differ by more than tol.
        delta = est_lightgbm.predict(features) - est_sklearn.predict(features)
        return np.mean(abs(delta) > tol)

    # Less than 1% of the predictions differ up to the 3rd decimal.
    assert disagreement(X_train, 1e-3) < .011

    if n_samples >= 1000 and max_leaf_nodes < 10:
        # Less than 1% of the predictions differ up to the 4th decimal.
        assert disagreement(X_test, 1e-4) < .01
def test_HistGradientBoostingRegressor():
    """Check SHAP additivity for a HistGradientBoostingRegressor.

    For every row, the SHAP values plus the explainer's expected value must
    reproduce the model prediction to within 1e-4.
    """
    # The experimental import must run before the estimator import below.
    from sklearn.experimental import enable_hist_gradient_boosting
    from sklearn.ensemble import HistGradientBoostingRegressor

    # Train a tree-based model on the diabetes data shipped with shap.
    X, y = shap.datasets.diabetes()
    model = HistGradientBoostingRegressor(max_iter=1000, max_depth=6)
    model = model.fit(X, y)

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)

    reconstructed = shap_values.sum(1) + explainer.expected_value
    worst_gap = np.max(np.abs(reconstructed - model.predict(X)))
    assert worst_gap < 1e-4
def onnxrt_python_RandomForestRegressor_dtype(
        self, dtype, n=37, full=False, use_hist=False, ntrees=10,
        runtime='python'):
    """Convert a tree-ensemble regressor to ONNX and compare predictions.

    A regressor is trained on iris, converted with ``to_onnx`` and run
    through ``OnnxInference``; its output is compared with ``clr.predict``.

    Parameters
    ----------
    dtype : numpy dtype used for the test inputs and the conversion sample.
    n : overwritten inside the body (34 when ``full`` else 37), so the
        value passed by the caller has no effect.
    full : use default estimator hyper-parameters and split seed 13;
        otherwise a small model (``ntrees``, ``max_depth=4``) and seed 11.
    use_hist : train ``HistGradientBoostingRegressor`` instead of
        ``RandomForestRegressor``.
    ntrees : number of trees / iterations when ``full`` is false.
    runtime : not used by this body.
    """
    data = load_iris()
    split_seed = 13 if full else 11
    X_train, X_test, y_train, _ = train_test_split(
        data.data, data.target, random_state=split_seed)
    X_test = X_test.astype(dtype)

    if use_hist:
        clr = (HistGradientBoostingRegressor() if full
               else HistGradientBoostingRegressor(
                   max_iter=ntrees, max_depth=4))
    else:
        clr = (RandomForestRegressor(n_jobs=1) if full
               else RandomForestRegressor(
                   n_estimators=ntrees, n_jobs=1, max_depth=4))
    clr.fit(X_train, y_train)

    model_def = to_onnx(clr, X_train.astype(dtype), rewrite_ops=True)
    oinf = OnnxInference(model_def)
    text = "\n".join(str(node.ops_) for node in oinf.sequence_)
    self.assertIn("TreeEnsembleRegressor", text)

    # Keep a short window of test rows, then append two slightly perturbed
    # copies of its first row.
    n = 34 if full else 37
    X_test = X_test[n:n + 5]
    first_row = X_test[:1]
    X_test = numpy.vstack(
        [X_test, first_row.copy() * 1.01, first_row.copy() * 0.99])

    got = oinf.run({'X': X_test})
    self.assertEqual(sorted(got), ['variable'])
    lexp = clr.predict(X_test)

    if dtype == numpy.float32:
        self.assertEqualArray(lexp, got['variable'], decimal=5)
    else:
        try:
            self.assertEqualArray(lexp, got['variable'])
        except AssertionError as e:
            # Attach the ONNX model to the failure to ease debugging.
            raise AssertionError(
                "---------\n{}\n-----".format(model_def)) from e

    tree_runtime = oinf.sequence_[0].ops_.rt_
    self.assertEqual(tree_runtime.same_mode_, True)
    self.assertNotEmpty(tree_runtime.nodes_modes_)
def test_zero_sample_weights_regression():
    """A zero sample weight must be equivalent to dropping the sample."""
    features = [[1, 0], [1, 0], [1, 0], [0, 1]]
    targets = [0, 0, 1, 0]
    # The first two training samples are ignored through a weight of 0, so
    # the only remaining [1, 0] sample has target 1.
    weights = [0, 0, 1, 1]

    regressor = HistGradientBoostingRegressor(min_samples_leaf=1).fit(
        features, targets, sample_weight=weights)

    prediction = regressor.predict([[1, 0]])[0]
    assert prediction > 0.5
def GBM(X_train, X_test, y_train, y_test, loss, mode):
    """Fit a HistGradientBoostingRegressor and predict train and test rows.

    Parameters
    ----------
    X_train, X_test : pandas objects holding the features.
    y_train, y_test : array-likes holding the targets.
    loss : forwarded to ``HistGradientBoostingRegressor(loss=...)``.
    mode : ``'val'`` stacks train and test and returns 5-fold
        cross-validated predictions; ``'test'`` fits on the training data
        and predicts both sets.

    Returns
    -------
    (y_prediction, y_prediction_train) : predictions for the test rows and
        for the training rows, in that order.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'val'`` nor ``'test'`` (this used to
        surface as an ``UnboundLocalError`` at the ``return``).
    """
    if mode not in ('val', 'test'):
        raise ValueError(
            "mode must be 'val' or 'test', got {!r}".format(mode))

    parameters = {
        'max_depth': 40,
        'min_samples_leaf': 1,
        'learning_rate': 0.01,
        'loss': loss
    }
    GradientBoostingRegressorObject = HistGradientBoostingRegressor(
        random_state=1, **parameters)

    if mode == 'val':
        # DataFrame.append was removed in pandas 2.0; concat is the
        # equivalent call and also exists in older versions.
        import pandas as pd
        X = pd.concat([X_train, X_test])
        y = np.append(y_train, y_test)
        y_pred = cross_val_predict(GradientBoostingRegressorObject, X, y, cv=5)
        # Slice from the front: y_pred[-0:] would return every row if
        # y_test were empty.
        n_train = len(y_train)
        return y_pred[n_train:], y_pred[:n_train]

    GradientBoostingRegressorObject.fit(X_train, y_train)
    y_prediction = GradientBoostingRegressorObject.predict(X_test)
    y_prediction_train = GradientBoostingRegressorObject.predict(X_train)
    return y_prediction, y_prediction_train
def forecast_hgbr(df, forecast, day, seed, num_epochs, all):
    """Forecast one day of PV power with a HistGradientBoostingRegressor.

    The model is trained only on rows of ``df`` with ``zenith < 87`` (no
    point learning PV output at night). The whole forecast day is then
    predicted, because its zenith values may differ from the training
    ones, and rows with ``zenith > 87`` are set to 0 afterwards.

    Parameters
    ----------
    df : historical frame with at least ``zenith`` and ``pv_power``.
    forecast : frame indexed so that ``forecast.loc['YYYY-MM-DD']`` selects
        a day; its ``prediction`` column is written in place.
    day : object with ``strftime`` identifying the day to forecast.
    seed, num_epochs : not used by this body.
    all : forwarded unchanged to ``ann_inputs``.
    """
    daylight = df[df['zenith'] < 87]

    # Inputs and target for training.
    train_inputs = ann_inputs(daylight, all)
    target = daylight['pv_power']
    X_train = train_inputs.values
    y_train = target.values.reshape(len(target), 1)

    print('Creating Regressor ...')
    reg = HistGradientBoostingRegressor(max_iter=500)
    print('Fitting model ...')
    reg.fit(X_train, y_train)

    print('Making prediction ...')
    day_key = day.strftime('%Y-%m-%d')
    forecast_day = forecast.loc[day_key].copy()
    X_test = ann_inputs(forecast_day, all).values
    sk_pred = reg.predict(X_test)
    print('Prediction completed ...')

    forecast_day['prediction'] = sk_pred
    # Night-time rows are forced to zero output.
    is_night = forecast_day['zenith'] > 87
    forecast_day.loc[is_night, 'prediction'] = 0.0
    forecast.loc[day_key, 'prediction'] = forecast_day['prediction']
def q3_main(df, df_train):
    """Fit a gradient-boosting WQI model and store its predictions in ``df``.

    Columns 3 to 14 of ``df_train`` are standardised and used both to fit
    the regressor and as the prediction input, so the predictions are
    in-sample. The result is written to ``df['WQI']`` (``df`` is modified
    in place) and ``df`` is returned.
    """
    feature_frame = df_train[df_train.columns[3:15]]
    target = df_train['WQI']

    scaler = StandardScaler()
    fit_inputs = scaler.fit_transform(feature_frame)
    predict_inputs = scaler.transform(feature_frame)

    booster = HistGradientBoostingRegressor()
    booster.fit(fit_inputs, target)
    predicted = booster.predict(predict_inputs)

    df_val = pd.DataFrame({'Predicted WQI': predicted})
    df['WQI'] = df_val
    return df
model.fit(X, y) ### Machine learning x2_all = np.arange(34) y2_all = np.zeros_like(x2_all) results2 = {} for v in variants: idx = np.where(dirty_jt == v)[0] n_emp = len(idx) xc = X[idx[0]] unq_sen = np.arange(34) mean_salary = [] for k, sen in enumerate(unq_sen): xc[300] = sen mean_salary.append(model.predict(xc[np.newaxis, :])[0]) results2[v] = unq_sen, np.array(mean_salary) y2_all = y2_all + np.array(mean_salary) y2_all /= len(variants) variants2 = [ '0361 project manager', '2128 project manager', '9109 project manager', 'manager project', 'mgr project', 'project manager' ] ### Plug-in estimates results3 = {} for v in variants2: m1 = (dirty_jt == v) seniority_cat, y_cat = Xs[m1, 300], salary[m1]
learning_rate=0.1, loss='least_squares', # max_leaf_nodes=51, # min_samples_leaf=20, # l2_regularization=200, verbose=0 ) model.fit(train_X, train_Y) print("models fit") print("train_X length: ", len(train_X)) print("test_X length: ", len(test_X)) print("train MIN:{0} MAX:{1}".format(str(min(train_Y)), str(max(train_Y)))) print("test MIN:{0} MAX:{1}".format(str(min(test_Y)), str(max(test_Y)))) start = time.time() pred_Y = model.predict(test_X) end = time.time() T_i += end - start R2_score_avg[k_i - 1] = r2_score(test_Y, pred_Y) mae_avg[k_i - 1] = mean_absolute_error(test_Y, pred_Y) mse_avg[k_i - 1] = mean_squared_error(test_Y, pred_Y) mape_avg[k_i - 1] = np.mean(np.abs((pred_Y - test_Y) / test_Y)) * 100 my_metric_avg[k_i - 1] = 0 temp_min = 0 temp_min_i = 0 for y_i in range(len(test_Y)): if test_Y[y_i] == 0: continue if np.abs(test_Y[y_i] - pred_Y[y_i]) / test_Y[y_i] > temp_min: temp_min = np.abs(test_Y[y_i] - pred_Y[y_i]) / test_Y[y_i] temp_min_i = y_i
def test_missing_values_minmax_imputation():
    """Native NaN handling must match an explicit min/max imputation."""
    # Compare the built-in missing value handling of Histogram GBC with an
    # a-priori missing value imputation strategy that should yield the same
    # results in terms of decision function.
    #
    # Each feature (containing NaNs) is replaced by 2 features:
    # - one where the nans are replaced by min(feature) - 1
    # - one where the nans are replaced by max(feature) + 1
    # A split where nans go to the left has an equivalent split in the
    # first (min) feature, and a split where nans go to the right has an
    # equivalent split in the second (max) feature.
    #
    # Assuming the data is such that there is never a tie to select the best
    # feature to split on during training, the learned decision trees should be
    # strictly equivalent (learn a sequence of splits that encode the same
    # decision function).
    #
    # The MinMaxImputer transformer is meant to be a toy implementation of the
    # "Missing In Attributes" (MIA) missing value handling for decision trees
    # https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305
    # The implementation of MIA as an imputation transformer was suggested by
    # "Remark 3" in :arxiv:`1902.06931`

    class MinMaxImputer(TransformerMixin, BaseEstimator):
        # Doubles every feature: NaNs become (train min - 1) in the first
        # copy and (train max + 1) in the second copy.

        def fit(self, X, y=None):
            # MinMaxScaler is only used to obtain the per-feature min/max.
            mm = MinMaxScaler().fit(X)
            self.data_min_ = mm.data_min_
            self.data_max_ = mm.data_max_
            return self

        def transform(self, X):
            X_min, X_max = X.copy(), X.copy()
            for feature_idx in range(X.shape[1]):
                nan_mask = np.isnan(X[:, feature_idx])
                X_min[nan_mask, feature_idx] = self.data_min_[feature_idx] - 1
                X_max[nan_mask, feature_idx] = self.data_max_[feature_idx] + 1
            # Output has twice as many columns as the input.
            return np.concatenate([X_min, X_max], axis=1)

    def make_missing_value_data(n_samples=int(1e4), seed=0):
        # The same RandomState is consumed by make_regression, the random
        # mask and train_test_split, so the order of these calls matters.
        rng = np.random.RandomState(seed)
        X, y = make_regression(n_samples=n_samples, n_features=4,
                               random_state=rng)

        # Pre-bin the data to ensure a deterministic handling by the 2
        # strategies and also make it easier to insert np.nan in a structured
        # way:
        X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X)

        # First feature has missing values completely at random:
        rnd_mask = rng.rand(X.shape[0]) > 0.9
        X[rnd_mask, 0] = np.nan

        # Second and third features have missing values for extreme values
        # (censoring missingness):
        low_mask = X[:, 1] == 0
        X[low_mask, 1] = np.nan

        high_mask = X[:, 2] == X[:, 2].max()
        X[high_mask, 2] = np.nan

        # Make the last feature nan pattern very informative: the target is
        # clipped at its 70th percentile and exactly those rows get a NaN.
        y_max = np.percentile(y, 70)
        y_max_mask = y >= y_max
        y[y_max_mask] = y_max
        X[y_max_mask, 3] = np.nan

        # Check that there is at least one missing value in each feature:
        for feature_idx in range(X.shape[1]):
            assert any(np.isnan(X[:, feature_idx]))

        # Let's use a test set to check that the learned decision function is
        # the same as evaluated on unseen data. Otherwise it could just be the
        # case that we find two independent ways to overfit the training set.
        return train_test_split(X, y, random_state=rng)

    # n_samples need to be large enough to minimize the likelihood of having
    # several candidate splits with the same gain value in a given tree.
    X_train, X_test, y_train, y_test = make_missing_value_data(
        n_samples=int(1e4), seed=0)

    # Use a small number of leaf nodes and iterations so as to keep
    # under-fitting models to minimize the likelihood of ties when training the
    # model.
    gbm1 = HistGradientBoostingRegressor(max_iter=100, max_leaf_nodes=5,
                                         random_state=0)
    gbm1.fit(X_train, y_train)

    # Same hyper-parameters, but fed with the explicitly imputed features.
    gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1))
    gbm2.fit(X_train, y_train)

    # Check that the models reach the same score:
    assert gbm1.score(X_train, y_train) == pytest.approx(gbm2.score(X_train, y_train))
    assert gbm1.score(X_test, y_test) == pytest.approx(gbm2.score(X_test, y_test))

    # Check that the individual predictions match as a finer grained
    # decision function check.
    assert_allclose(gbm1.predict(X_train), gbm2.predict(X_train))
    assert_allclose(gbm1.predict(X_test), gbm2.predict(X_test))
print('R2 score is {}'.format(test_score_r2)) print() print("Best parameters set found on development set:") print(gs.best_params_) print() # Re-train with best parameters regr = HistGradientBoostingRegressor(**gs.best_params_) t0 = time.time() regr.fit(x_train, y_train.ravel()) regr_fit = time.time() - t0 print("Complexity and bandwidth selected and model fitted in %.6f s" % regr_fit) t0 = time.time() y_regr = regr.predict(x_test) regr_predict = time.time() - t0 print("Prediction for %d inputs in %.6f s" % (x_test.shape[0], regr_predict)) with open('output.log', 'w') as f: print("Training time: %.6f s" % regr_fit, file=f) print("Prediction time: %.6f s" % regr_predict, file=f) print(" ", file=f) print("The model performance for training set", file=f) print("--------------------------------------", file=f) print('MAE is {}'.format(train_score_mae), file=f) print('MSE is {}'.format(train_score_mse), file=f) print('EVS is {}'.format(train_score_evs), file=f) print('ME is {}'.format(train_score_me), file=f) print('R2 score is {}'.format(train_score_r2), file=f) print(" ", file=f)
#---------------------------------------------------------------------- # Treinar e testar um regressor HistGradientBoosting #---------------------------------------------------------------------- print(' ') print(' REGRESSOR HIST GRADIENT BOOSTING:') print(' ') hgb = HistGradientBoostingRegressor(l2_regularization=12.0, max_iter=70, learning_rate=0.1, loss='least_absolute_deviation') hgb = hgb.fit(x_treino, y_treino) y_resposta_treino = hgb.predict(x_treino) y_resposta_teste = hgb.predict(x_teste) print(' Métrica DENTRO da amostra FORA da amostra ') print(' ------- ----------------- --------------- ') mse_in = mean_squared_error(y_treino, y_resposta_treino) rmse_in = math.sqrt(mse_in) r2_in = r2_score(y_treino, y_resposta_treino) medae_in = median_absolute_error(y_treino, y_resposta_treino) msle_in = mean_squared_log_error(y_treino, y_resposta_treino) rmspe_in = rmspe(y_treino, y_resposta_treino) mse_out = mean_squared_error(y_teste, y_resposta_teste) rmse_out = math.sqrt(mse_out) r2_out = r2_score(y_teste, y_resposta_teste)
tree_method='hist', ) model.fit(features_train, labels_train) elif args.library == 'catboost': from catboost import CatBoostRegressor model = CatBoostRegressor(grow_policy='Lossguide', learning_rate=0.1, n_estimators=100, num_leaves=255, train_dir='data/catboost_info', verbose=False) model.fit(features_train, labels_train, silent=True) # Make predictions on the test data. if args.library == 'h2o': predictions = model.predict(data_test).as_data_frame() else: predictions = model.predict(features_test) # Compute metrics. mse = mean_squared_error(predictions, labels_test) # Compute memory usage. f = open("/proc/self/status", "r") for line in f.readlines(): if line.startswith("VmHWM"): memory = line.split(":")[1].strip() print(json.dumps({ 'mse': mse, 'memory': memory,
X_train_df.to_numpy(), X_test_df.to_numpy(), y_train_df["adr"].to_numpy(), y_test_df["adr"].to_numpy(), y_train_df["is_canceled"].to_numpy(), y_test_df["is_canceled"].to_numpy(), ) print(f"X_train shape {X_train.shape}, y_train shape {y_train_adr.shape}") print(f"X_test shape {X_test.shape}, y_test shape {y_test_adr.shape}") #%% evaluate performance with training data eval_reg = HistGradientBoostingRegressor(random_state=1126) eval_reg.fit(X_train.copy(), y_train_adr.copy()) print("-" * 10, "regression report", "-" * 10) report = regression_report(y_test_adr.copy(), eval_reg.predict(X_test.copy()), X_test.shape[1]) print(report) eval_clf = RandomForestClassifier(n_estimators=150, max_depth=50, random_state=1126) # eval_clf = HistGradientBoostingClassifier(random_state=1129) eval_clf.fit(X_train.copy(), y_train_canceled.copy()) print("-" * 10, "classification report", "-" * 10) report = classification_report(y_test_canceled.copy(), eval_clf.predict(X_test.copy())) print(report) #%% pred_df = predict(eval_clf, eval_reg, X_test_df)
def test_pipeline(target_options, data_options, model_options, out_options, plot_options):
    """Fit a regbm model, optionally compare it with sklearn, and plot.

    NOTE(review): the ``TestHelper.__name`` calls rely on private name
    mangling, so this function presumably lives inside ``class TestHelper``
    - confirm against the enclosing scope.
    """
    # target_options: function + function domain + function representation + function short name
    # data options: data length
    # model options: bins + patience + early stopping delta + ...
    # out options: verbose (and the "sklearn" flag read below)

    # generate dataset
    x_train, y_train, x_valid, y_valid = TestHelper.__generate_data_uniform(
        target_options, data_options)

    # print experiment conditions
    if out_options['verbose'] >= 3:
        TestHelper.__print_conditions(model_options, data_options, x_train.shape[1])

    # fit model
    def fit_wrapper():
        # Builds the regbm model, fits it and returns (model, history).
        model = regbm.Boosting(model_options['min_bins'], model_options['max_bins'], model_options['patience'], False, THREAD_COUNT)
        start_time = time.time()  # get start time to count the time of execution
        history = model.fit(
            x_train, y_train, x_valid, y_valid,
            model_options['tree_count'], model_options['tree_depth'],
            model_options['feature_fold_size'],
            model_options['learning_rate'], model_options['reg'],
            model_options['es_delta'], model_options['batch_part'],
            model_options['random_batches'],
            model_options['random_hist_thresholds'],
            model_options['remove_regularization_later'])
        exec_time = time.time() - start_time
        if out_options['verbose'] >= 1:
            print(f"Fit time = {exec_time} seconds")
        return model, history

    history = None
    model = None
    model, history = fit_wrapper()
    if out_options['verbose'] >= 1:
        print(f"Model fit finished")
    if out_options['verbose'] >= 3:
        print(f"Real tree count: {history.trees_number()}")

    # fit Sklearn model to compare with
    if out_options["sklearn"]:
        sk_model = HistGradientBoostingRegressor(
            learning_rate=model_options['learning_rate'],
            max_depth=model_options['tree_depth'],
            max_iter=model_options['tree_count'])
        sk_model.fit(x_train, y_train)

    # evaluate both models
    preds = model.predict(x_valid)
    if out_options['verbose'] >= 2:
        print("Evaluation:")
        model_mae = mae(y_valid, preds)
        print(f"regbm model MAE: {model_mae}")
    # model_mae is only defined when verbose >= 2, which this condition
    # also requires.
    if out_options['sklearn'] and out_options['verbose'] >= 2:
        sk_preds = sk_model.predict(x_valid)
        sklearn_mae = mae(y_valid, sk_preds)
        print(f"Sklearn model MAE: {sklearn_mae}")
        print(f"Sklearn better {model_mae / sklearn_mae} times")

    # make plots
    if plot_options['need_plots']:
        model_name = TestHelper.__get_model_name(target_options, model_options)
        TestHelper.__plot_losses(history, model_name + "_loss")
        # The sklearn model is passed only when it was actually fitted.
        TestHelper.__plot_predictions(
            target_options, model, plot_options, model_name + "_pred",
            sk_model if out_options["sklearn"] else None)
# preprocess the data X_train, X_val, X_test, y_train, y_val, y_test = preprocess_serial() print("X_train.shape, y_train.shape") print(X_train.shape, y_train.shape) print("X_test.shape, y_test.shape") print(X_test.shape, y_test.shape) # create model using the best hyperparameters found in the parallel implementation model = HistGradientBoostingRegressor(learning_rate=0.5, max_depth=8) # fit data start = time() model.fit(X_train, y_train) dt = time() - start print("Time to fit: %f" % dt) # predict data y_pred = model.predict(X_test) # calculate stats mse = mean_squared_error(y_test, y_pred) mae = mean_absolute_error(y_test, y_pred) print(f"MSE: {mse}, MAE: {mae}") # save predictions to csv pred_df = pd.DataFrame(y_pred, columns=['Weighted_Price']) pred_df.to_csv('../data/predictions/gboost_sklearn_y_pred.csv', index=False) print("Done")
# The experimental import has to run before HistGradientBoostingRegressor
# can be imported from sklearn.ensemble.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

# Baseline: default loss.
hist = HistGradientBoostingRegressor(random_state=42)
hist.fit(X_train, y_train)
hist_pred = hist.predict(X_test)
compute_metrics(y_test, hist_pred)

# Same model with the Poisson loss.
hist_poisson = HistGradientBoostingRegressor(loss='poisson', random_state=42)
hist_poisson.fit(X_train, y_train)
hist_poisson_pred = hist_poisson.predict(X_test)
compute_metrics(y_test, hist_poisson_pred)

# Side-by-side histograms of the true targets and of both prediction sets,
# sharing the y axis so that the counts are comparable.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 6), sharey=True)
ax1.hist(y_test, bins=30, alpha=0.5)
ax1.set_title("Test data")
ax2.hist(hist_pred, bins=30, alpha=0.5)
ax2.set_title("Default Hist")
ax3.hist(hist_poisson_pred, bins=30, alpha=0.5)
ax3.set_title("Poisson Hist");
# n = 100: RMSE: 53.12355810994833, R2: 0.8753501517096134 # n = 200: RMSE: 52.54485102807364, R2: 0.8780511319644563 # On Test set # n = 10: RMSE: 170.82834497702, R2: -0.16691087536150695 # n = 100: RMSE: 165.85154036170636, R2: -0.09990921233067707 # n = 200: RMSE: 165.4933971927544, R2: -0.09516400893693411 # %% # Histogram-based Gradient Boosting Regression Tree ## Define/train model hist_boost = HistGradientBoostingRegressor(max_iter = 10000, learning_rate = 0.00001, loss = 'least_squares') hist_boost.fit(X_train, y_train) # Predictions y_hat = hist_boost.predict(X_test) rmse = np.sqrt(mean_squared_error(y_test, y_hat)) r2 = r2_score(y_test, y_hat) print('RMSE: ', rmse) print('R2: ', r2) # On Test set # max_iter=100, lr=0.1: RMSE: 164.95847973452217, R2: -0.08809574406033427 # max_iter=500, lr=0.1: RMSE: 175.5928128791024, R2: -0.23290976591255608 # max_iter=100, lr=0.01: RMSE: 160.164493919987, R2: -0.025770753165551108 # max_iter=500, lr=0.01: RMSE: 162.1181697268406, R2: -0.05094794352731302 # max_iter = 1000, lr=0.0001: RMSE: 158.41835764485302, R2: -0.0035264732961632905 # max_iter = 10000, lr=0.00001: RMSE: 158.41827324121243, R2: -0.003525403959673934 # %%
["revenue"], test_ratio=0.3) X_train, X_test, y_train, y_test = ( X_train_df.to_numpy(), X_test_df.to_numpy(), y_train_df["revenue"].to_numpy(), y_test_df["revenue"].to_numpy(), ) print(f"X_train shape {X_train.shape}, y_train shape {y_train.shape}") print(f"X_test shape {X_test.shape}, y_test shape {y_test.shape}") #%% evaluate performance with training data eval_reg = HistGradientBoostingRegressor(random_state=1129) eval_reg.fit(X_train, y_train) print("-" * 10, "regression report", "-" * 10) report = regression_report(y_test, eval_reg.predict(X_test), X_test.shape[1]) print(report) print("-" * 10, "evaluation of label", "-" * 10) label_df = data.get_true_label( columns=["adr", "revenue", "is_canceled", "label"]) pred_label_df = data.predict_label(eval_reg, X_test_df) print("[ label evaluation ]") report_label = evaluate_by_label(pred_label_df, label_df, target="label") print(report_label) print("[ revenue_per_day evaluation ]") report_revenue = evaluate_by_label(pred_label_df, label_df, target="revenue")
X_train_df.to_numpy(), X_test_df.to_numpy(), y_train_df["adr"].to_numpy(), y_test_df["adr"].to_numpy(), y_train_df["is_canceled"].to_numpy(), y_test_df["is_canceled"].to_numpy(), ) print(f"X_train shape {X_train.shape}, y_train shape {y_train_adr.shape}") print(f"X_test shape {X_test.shape}, y_test shape {y_test_adr.shape}") #%% evaluate performance with training data eval_reg = HistGradientBoostingRegressor(random_state=1129) eval_reg.fit(X_train.copy(), y_train_adr.copy()) print("-" * 10, "regression report", "-" * 10) report = regression_report( y_test_adr.copy(), eval_reg.predict(X_test.copy()), X_test.shape[1] ) print(report) # eval_clf = RandomForestClassifier(random_state=1129) eval_clf = HistGradientBoostingClassifier(random_state=1129) eval_clf.fit(X_train.copy(), y_train_canceled.copy()) print("-" * 10, "classification report", "-" * 10) report = classification_report( y_test_canceled.copy(), eval_clf.predict(X_test.copy()) ) print(report) #%% pred_df = predict(eval_clf, eval_reg, X_test_df) pred_label_df = data.to_label(pred_df)
"""Hydro_Model

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1aMtPXxewSC8pS3Wp1Ay7Z8kthRyY-Ko6

Trains a HistGradientBoostingRegressor that predicts WQI from the columns
of clean.csv and pickles the fitted model to pickle_model.pkl.
"""
import pandas as pd
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
import pickle

data = pd.read_csv('clean.csv')

# Everything except the identifier, date and target/label columns is used
# as a feature; WQI is the regression target.
X = data.drop(['Unnamed: 0', 'ID', 'Date', 'WQI', 'Label'], axis=1)
Y = data['WQI']
X = np.array(X)
Y = np.array(Y)

model = HistGradientBoostingRegressor().fit(X, Y)
# Smoke-check that the fitted model can predict. The original called
# predict on the undefined name `x`, which raised NameError before the
# model was ever saved.
model.predict(X)

pkl_filename = "pickle_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)
print("Model Trained and Saved")
def test_same_predictions_regression(seed, min_samples_leaf, n_samples, max_leaf_nodes):
    """Check that sklearn and LightGBM agree on an easy regression target.

    When tree size is bounded and there are enough samples, both libraries
    are expected to grow exactly the same tree.

    Notes
    -----
    - With few samples per node several candidate splits can have equal
      gains (also because of float errors), so the tree structure - and
      therefore the test-set predictions - may differ. The test set is only
      compared when ``n_samples >= 1000`` and ``max_leaf_nodes < 10``.
    - To rule out differences caused by the binning strategy, the data is
      pre-binned when ``n_samples > 255``.
    - The absolute_error loss is not checked here: LightGBM's median (used
      for the initial raw prediction) is slightly off (e.g. it returns
      midpoints when not needed). With a single iteration that offset
      yields biggish prediction differences; they shrink with more
      iterations.
    """
    pytest.importorskip("lightgbm")

    shuffle_rng = np.random.RandomState(seed=seed)
    max_bins = 255

    X, y = make_regression(
        n_samples=n_samples, n_features=5, n_informative=5, random_state=0
    )
    if n_samples > 255:
        # Bin the data, then cast to float32 so that the estimator does not
        # treat it as pre-binned.
        binned = _BinMapper(n_bins=max_bins + 1).fit_transform(X)
        X = binned.astype(np.float32)

    X_train, X_test, y_train, _ = train_test_split(
        X, y, random_state=shuffle_rng
    )

    est_sklearn = HistGradientBoostingRegressor(
        max_iter=1,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")
    for estimator in (est_lightgbm, est_sklearn):
        estimator.fit(X_train, y_train)

    # X must be handled as numerical data, not as pre-binned data.
    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)

    def disagreement(features, tol):
        # Fraction of samples whose two predictions differ by more than tol.
        delta = est_lightgbm.predict(features) - est_sklearn.predict(features)
        return np.mean(abs(delta) > tol)

    # Less than 1% of the predictions differ up to the 3rd decimal.
    assert disagreement(X_train, 1e-3) < 0.011

    if n_samples >= 1000 and max_leaf_nodes < 10:
        # Less than 1% of the predictions differ up to the 4th decimal.
        assert disagreement(X_test, 1e-4) < 0.01
# Train and test scores as a function of the base estimator's tree depth.
plt.style.use('seaborn-colorblind')
p = sns.lineplot(depths, scores_train, label = 'train')
p = sns.lineplot(depths, scores_test, label = 'test')
p.set_xlabel('Глубина решающего дерева (базового оценщика)', fontsize = 10)
p.set_ylabel('Коэффициент детерминации модели', fontsize = 10)
p.set_title('Качество стабилизируется с увеличением глубины', fontsize = 15)
plt.show()

# Examine the results of the best estimator in more detail
reg6 = HistGradientBoostingRegressor(max_depth = 10, max_iter = 500, random_state = 42)
reg6.fit(X_train, y_train)

# Figure 9 -- deviation of our predictions from the actual data, and more
y_pred6 = reg6.predict(X_test)
diff6 = y_test - y_pred6  # residuals: actual minus predicted
plt.style.use('seaborn-colorblind')
f, axes = plt.subplots(1, 2, figsize = (20, 10), sharex = False)

# Left panel: histograms of actual and predicted values on the same axes.
sns.distplot(y_test, hist = True, bins = 100, kde = False, color = 'Green', label = 'Actual', ax = axes[0])
sns.distplot(y_pred6, hist = True, bins = 100, kde = False, color = 'Red', label = 'Predicted', ax = axes[0])
axes[0].legend(loc = 'upper left', frameon = False)
axes[0].set_xlabel('', fontsize = 10)
axes[0].set_title('Гистограммы доходностей акций', fontsize = 15)

# Right panel: residual histogram with a fitted `norm` curve overlaid.
sns.distplot(diff6, hist = True, bins = 50, fit = norm, kde = False, color = 'Blue', ax = axes[1])
axes[1].set_xlabel('', fontsize = 10)
axes[1].set_title('Остатки модели', fontsize = 15)
plt.show()
@author: Jie.Hu """ # dt ========================================================================= ''' 8: Hist Gradient Boosting''' from sklearn.experimental import enable_hist_gradient_boosting from sklearn.ensemble import HistGradientBoostingRegressor mod_hgb = HistGradientBoostingRegressor(validation_fraction=0.2, n_iter_no_change=20, tol=0.001, random_state=1337) mod_hgb.fit(X_train, y_train) # Predicting the Test set results y_pred = mod_hgb.predict(X_test) mape = np.mean(np.abs( (np.expm1(y_test) - np.expm1(y_pred)) / np.expm1(y_test))) * 100 #Print model report: print("\nSVM Model Report") print("MAPE : %.2f" % mape) # hyperparameters tuning def my_scorer(y_true, y_pred): mape = np.mean( np.abs((np.expm1(y_true) - np.expm1(y_pred)) / np.expm1(y_true))) * 100 return mape my_func = make_scorer(my_scorer, greater_is_better=False)
r_squared = model.score(X_test, y_test) mse = mean_squared_error(y_test, y_pred) rmse = np.sqrt(mse) print('R-squared: ' + str(r_squared)) print('Mean Squared Error: ' + str(rmse)) def scatter_plot(y_test, y_pred, model_name): plt.figure(figsize=(10, 6)) sns.residplot(y_test, y_pred, lowess=True, color='#4682b4', line_kws={ 'lw': 2, 'color': 'r' }) plt.title(str('Price vs Residuals for ' + model_name)) plt.xlabel('Price', fontsize=16) plt.xticks(fontsize=13) plt.yticks(fontsize=13) plt.show() hist = HistGradientBoostingRegressor() hist.fit(X_train, y_train) y_pred = hist.predict(X_test) rmse(hist, y_test, y_pred, X_train, y_train) scatter_plot(y_test, y_pred, 'Histogram-based Gradient Boosting Regression Tree')
params["max_iter"] = g_search.best_params_["max_iter"] # Tune tree-specific parameters (max_depth, min_samples_leaf) tree_features_param_grid = { "max_depth": range(4, 10), "min_samples_leaf": range(10, 80, 5), } hgbr_2 = HistGradientBoostingRegressor(**params) r_search = RandomizedSearchCV(hgbr_2, param_distributions=tree_features_param_grid) _ = r_search.fit(X_train, y_train) params["max_depth"], params["min_samples_leaf"] = ( r_search.best_params_["max_depth"], r_search.best_params_["min_samples_leaf"], ) # Best model with optimized hyperparameters best_model = HistGradientBoostingRegressor(**params) _ = best_model.fit(X_train, y_train) log_test_preds = best_model.predict(X_test) test_preds = np.exp(log_test_preds) submission_name = "hgbr" submission_string = "../submissions/" + submission_name + "_submission.csv" result_df = pd.DataFrame({"Id": X_test.index, "SalePrice": test_preds}) result_df.to_csv(submission_string, index=False)
rmse_sr = np.sqrt(mean_squared_error(y_val_scaled, y_pred_sr))
all_rmse.iloc[i - 1, 5] = rmse_sr

# Voting
r1 = LinearRegression()
r2 = RandomForestRegressor(n_estimators=100, random_state=0)
regr_vr = VotingRegressor([('lr', r1), ('rf', r2)])
# Fit the voting ensemble itself. The original fitted the unrelated `regr`
# object here, leaving regr_vr unfitted when predict() was called.
regr_vr.fit(x_train_scaled, y_train_scaled)
y_pred_vr = regr_vr.predict(x_val_scaled)
rmse_vr = np.sqrt(mean_squared_error(y_val_scaled, y_pred_vr))
all_rmse.iloc[i - 1, 6] = rmse_vr

# Histogram-based Gradient Boosting
regr_hgbr = HistGradientBoostingRegressor(random_state=0)
# Same fix as above: fit regr_hgbr, not `regr`.
regr_hgbr.fit(x_train_scaled, y_train_scaled)
y_pred_hgbr = regr_hgbr.predict(x_val_scaled)
rmse_hgbr = np.sqrt(mean_squared_error(y_val_scaled, y_pred_hgbr))
all_rmse.iloc[i - 1, 7] = rmse_hgbr

# LightGBM
regr_lgbm = lgb.LGBMRegressor(random_state=0, n_jobs=-1)
regr_lgbm.fit(x_train_scaled, y_train_scaled)
y_pred_lgbm = regr_lgbm.predict(x_val_scaled)
rmse_lgbm = np.sqrt(mean_squared_error(y_val_scaled, y_pred_lgbm))
all_rmse.iloc[i - 1, 8] = rmse_lgbm

# XGBoost
regr_xgb = XGBRegressor(random_state=0, n_jobs=-1)
regr_xgb.fit(x_train_scaled, y_train_scaled)
y_pred_xgb = regr_xgb.predict(x_val_scaled)
rmse_xgb = np.sqrt(mean_squared_error(y_val_scaled, y_pred_xgb))
# Write the subtask 2 result: a single column named after label column 11.
task2_df = pd.DataFrame(data=res, columns=[train_labels.columns[11]])
task2_df.to_csv('subtask2.csv', index=False, header=True)
t2 = time.time()
print('subtask2, time taken: ', t2 - t1)
print(task2_df)

#subtask 3
t1 = time.time()
reg = HistGradientBoostingRegressor(random_state=1510)
# One output column per regression target (label columns 12..15).
res = np.empty(shape=(len(patient_test_data), 4))
for feat in range(12, 16):
    # The same estimator object is refitted for each target.
    reg.fit(patient_train_data, train_labels.to_numpy()[:, feat])
    res[:, feat - 12] = reg.predict(patient_test_data)
task3_df = pd.DataFrame(data=res, columns=train_labels.columns[12:])
task3_df.to_csv('subtask3.csv', index=False, header=True)
t2 = time.time()
print('subtask3, time taken: ', t2 - t1)
print(task3_df)

# Optional: reload previously written results instead of recomputing them.
#task1_df = pd.read_csv('subtask1.csv')
#task2_df = pd.read_csv('subtask2.csv')
#task3_df = pd.read_csv('subtask3.csv')

#combine results
pids = pd.DataFrame(data=test_features.pid.unique(), columns=['pid'])
# The last column of each prepared frame is the label; all other columns
# are predictors.
predictors_train = train_prepared.iloc[:,:-1]
labels_train = np.array(train_prepared.iloc[:,-1:]).flatten()
predictors_test = test_prepared.iloc[:,:-1]
labels_test = np.array(test_prepared.iloc[:,-1:]).flatten()

# Model (alternative configurations are kept commented out)
# model = BayesianRidge(alpha_1 = 5.661182937742398, alpha_2 = 8.158544161338462, lambda_1 = 7.509288525874375, lambda_2 = 0.08383802954777253)
model = HistGradientBoostingRegressor(l2_regularization=0.1923237939031256, learning_rate=0.10551346041298326, loss='least_absolute_deviation', max_depth=4, max_leaf_nodes=32, min_samples_leaf=4, warm_start=False)
# model = HistGradientBoostingRegressor(l2_regularization=0.02021888460670551, learning_rate=0.04277282248041758,
#                                       loss='least_squares', max_depth=4, max_leaf_nodes=32, min_samples_leaf=16,
#                                       warm_start=True)
# model = SVR(C=2.9468542209755357, coef0=-0.6868465520687694, degree=4, epsilon=0.18702907953343395, gamma=0.1632449384464454, kernel='rbf', shrinking=True)
model.fit(predictors_train, labels_train)
yhat = model.predict(predictors_test)

# Linear regression of the observed labels on the predictions
# scatter(yhat, labels_test)
# plt.xlabel("Prediction ML", fontsize=16)
# plt.ylabel("log(Visitation Rate)", fontsize=16)
m, b = np.polyfit(yhat, labels_test, 1)
# plot(yhat, yhat)
X_reg, y_reg = yhat.reshape(-1, 1), labels_test.reshape(-1, 1)
reg = LinearRegression().fit(X_reg, y_reg)
# NOTE(review): the score is computed but its value is discarded here.
reg.score(X_reg, y_reg)

# Density difference (observed-predicted), organic vs not-organic
kwargs = dict(hist_kws={'alpha': .4}, kde_kws={'linewidth': 1})
plt.figure()
# is_organic is true where the management code equals 3.
df = pd.DataFrame({'obs':labels_test, 'pred':yhat, 'is_organic':[ x == 3 for x in test_management.management ]})
sys.path.append(project_path)

### Feature list: one name per line, lines starting with '#' are disabled.
with open('car_price_feat.txt') as f:
    # Skip empty lines too: the original indexed x[0] on every line and
    # raised IndexError on the empty string left by a trailing newline.
    feat_list = [
        line for line in f.read().split('\n')
        if line and line[0] != '#'
    ]

### Raw train / test data.
data_train = pd.read_csv(f'{project_path}/data/car_price_train.201908.csv')
data_test = pd.read_csv(f'{project_path}/data/car_price_test.201908.csv')

### Restrict both sets to a single model series.
series_name = '宝马5系'
d_train = data_train[data_train.model_series == series_name]
d_test = data_test[data_test.model_series == series_name]

### The encoding map is built from the full data (all series), then applied
### to the selected series only.
label_encode_map, f_map = DataProcess.gencode(
    pd.concat([data_train, data_test]), feat_list)
en_train, en_test = DataProcess.encode_process(
    d_train[feat_list], feat_list,
    label_encode_map), DataProcess.encode_process(d_test[feat_list],
                                                  feat_list,
                                                  label_encode_map)

#### Fit and evaluate the regressor.
est = HistGradientBoostingRegressor(max_iter=200,
                                    learning_rate=0.3,
                                    max_depth=6,
                                    min_samples_leaf=20,
                                    max_leaf_nodes=40)
est.fit(en_train, d_train.price)
pred = est.predict(en_test)
evaluate(d_test, pred)

### R2
print(est.score(en_test, d_test.price))
def train(self):
    """Build test-set features, then random-search regressor hyper-parameters.

    Steps performed:

    1. Read the training target from the 'Correct First Attempt' column of
       ``self.train_data`` (NaN is replaced by 0) and the pre-computed
       training features from ``./data/feature.csv``.
    2. Load the lookup tables ``intelligent_table.json``,
       ``kc_difficulty.json`` and ``problem.json`` from ``./data``.
    3. Build the test feature frame from ``self.test_data``. This adds the
       'Problem Unit' and 'Problem Section' columns to ``self.test_data``
       in place. Rows whose 'Correct First Attempt' is NaN are dropped from
       the feature frame.
    4. Repeatedly sample a hyper-parameter combination, fit a
       ``HistGradientBoostingRegressor`` on the training features, threshold
       the test predictions at 0.5 and score them with ``MSER`` until the
       score is 0.35 or lower. The best combination and its score are logged
       once the search stops.

    Raises:
        KeyError: if a column of the test feature frame does not match the
            column at the same position in the training features.
    """
    write_save_log("start to train")

    # Training target: first-attempt correctness, with NaN treated as 0.
    first_attempt_index = list(self.columns).index('Correct First Attempt')
    correct_answer = np.array([
        0 if np.isnan(row[first_attempt_index]) else row[first_attempt_index]
        for row in self.train_data.values
    ])

    features = pd.read_csv("./data/feature.csv")
    # for col in features.columns:
    #     print(features[col].describe())

    with open('./data/intelligent_table.json', 'r') as f:
        intelligent_table = json.load(f)
    with open('./data/kc_difficulty.json', 'r') as f:
        kc_table = json.load(f)
    with open('./data/problem.json', 'r') as f:
        problem_table = json.load(f)

    # Generate features for the test data.
    test_features_pd = pd.DataFrame()
    problem_unit = []
    problem_section = []
    problem_values = []
    for row in self.test_data.values:
        dict_re = dict(zip(self.test_data.columns, row))
        # 'Problem Hierarchy' is split on ", " into a unit part and a
        # section part; the "Unit " / "Section " labels are stripped.
        hierarchy = dict_re["Problem Hierarchy"].split(", ")
        problem_unit.append(re.sub("Unit ", "", hierarchy[0]))
        problem_section.append(re.sub("Section ", "", hierarchy[1]))
        # Steps missing from the problem table fall back to its 'mean' entry.
        step_name = dict_re["Step Name"]
        if step_name in problem_table:
            problem_values.append(problem_table[step_name])
        else:
            problem_values.append(problem_table['mean'])
    self.test_data["Problem Unit"] = problem_unit
    self.test_data["Problem Section"] = problem_section

    # Hash-encode the student ID.
    self.hash_encoder_generator(test_features_pd, "Anon Student Id", self.test_data)
    # self.one_hot_encoder_generator(test_features_pd, "Anon Student Id", self.test_data)
    write_save_log("ID feature generated")
    # Hash-encode Problem Name.
    self.hash_encoder_generator(test_features_pd, "Problem Name", self.test_data)
    write_save_log("Problem Name feature generated")
    # Hash-encode Problem Unit.
    self.hash_encoder_generator(test_features_pd, "Problem Unit", self.test_data)
    write_save_log("Problem Unit feature generated")
    # Hash-encode Problem Section.
    self.hash_encoder_generator(test_features_pd, "Problem Section", self.test_data)
    write_save_log("Problem Section feature generated")
    # Problem View is copied over unchanged.
    test_features_pd["Problem View"] = self.test_data["Problem View"]
    self.hash_encoder_generator(test_features_pd, "Step Name", self.test_data)

    intel_values = []
    kc_values = []
    test_answer = []
    kc_length = []
    remove_list = []
    oppo_feature = []
    for index_count, row in enumerate(self.test_data.values):
        dict_re = dict(zip(self.test_data.columns, row))
        # Rows without a label cannot be scored: remember their position so
        # they can be dropped from the feature frame afterwards.
        if np.isnan(dict_re["Correct First Attempt"]):
            remove_list.append(index_count)
        else:
            test_answer.append(dict_re["Correct First Attempt"])
        intel_values.append(intelligent_table[dict_re["Anon Student Id"]])

        stu_kc = dict_re["KC(Default)"]
        sum_difficult = 0
        kc_num = 0
        oppo_value = 0
        if isinstance(stu_kc, str):
            kc_names = stu_kc.split("~~")
            oppo_list = dict_re["Opportunity(Default)"].split("~~")
            kc_num = len(kc_names)
            # Pair each KC with the opportunity count at the same position.
            # Looking the position up with list.index() would always return
            # the first occurrence and mis-pair a KC that is listed twice.
            for position, true_kc in enumerate(kc_names):
                difficulty = kc_table[true_kc]
                oppo_value += int(oppo_list[position]) * difficulty
                sum_difficult += difficulty
            sum_difficult /= kc_num
        else:
            # Non-string KC value (e.g. a missing cell): use the table mean.
            oppo_value = kc_table["mean"]
            sum_difficult = kc_table["mean"]
        kc_values.append(sum_difficult)
        kc_length.append(kc_num)
        oppo_feature.append(oppo_value)

    test_features_pd["kc difficulty"] = kc_values
    test_features_pd["kc number"] = kc_length
    test_features_pd["person_intelligent"] = intel_values
    test_features_pd["oppo value"] = oppo_feature
    test_features_pd['Problem difficulty'] = problem_values
    test_features_pd.drop(remove_list, inplace=True)

    # The regressor is fitted and queried on raw ``.values``, so the test
    # columns must line up positionally with the training columns. Neither
    # frame changes during the search, so verify once before any fitting.
    for position, column in enumerate(test_features_pd.columns):
        if column != features.columns[position]:
            raise KeyError("feature order error!")

    parameter_range = {
        "random_state": list(range(0, 40)),
        "max_iter": list(range(100, 500)),
        "loss": ['least_squares', 'least_absolute_deviation', 'poisson'],
        "learning_rate": [0.1 * i for i in range(1, 7)],
        "l2_regularization": [0.1 * i for i in range(1, 10)],
    }
    best_score = 1
    best_policy = {}
    # NOTE: the search has no iteration cap; it only stops once a trial
    # reaches a score of 0.35 or lower.
    while best_score > 0.35:
        # Each entry is a one-element list, as returned by random.sample.
        random_state = {
            key: random.sample(value, 1)
            for key, value in parameter_range.items()
        }
        write_save_log(random_state)
        # clf1 = HistGradientBoostingRegressor()
        # clf2 = AdaBoostRegressor()
        #
        # clf = VotingRegressor(estimators=[('hgb', clf1), ('rf', clf2)], weights=[2, 1])
        clf = HistGradientBoostingRegressor(
            random_state=random_state["random_state"][0],
            max_iter=random_state["max_iter"][0],
            loss=random_state['loss'][0],
            learning_rate=random_state['learning_rate'][0],
            l2_regularization=random_state['l2_regularization'][0])
        clf.fit(features.values, correct_answer)
        res = clf.predict(test_features_pd.values)
        # Threshold the regression output into 0/1 answers.
        re_res = [1 if value >= 0.5 else 0 for value in res]
        re_score = MSER(re_res, test_answer)
        write_save_log("result error: {}".format(re_score))
        if best_score > re_score:
            best_score = re_score
            best_policy = copy.deepcopy(random_state)
    write_save_log("\nbest policy and score\n" + str(best_policy))
    write_save_log(str(best_score) + '\n')