def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
                                     max_leaf_nodes):
    """Check that sklearn and LightGBM agree on an easy regression target.

    With bounded tree size and enough samples, both libraries are expected
    to build the same tree and therefore return (almost) the same
    predictions.

    Notes
    -----
    - With few samples per node several candidate splits can have equal
      gains (also because of float errors), so the tree structures may
      differ. Test-set predictions are therefore only compared when
      ``n_samples`` is large enough and ``max_leaf_nodes`` is small enough.
    - To ignore discrepancies caused by small differences in the binning
      strategy, the data is pre-binned when ``n_samples > 255``.
    """
    n_bins = 256
    random_state = np.random.RandomState(seed=seed)

    X, y = make_regression(n_samples=n_samples, n_features=5,
                           n_informative=5, random_state=0)

    if n_samples > 255:
        # Bin the data, then cast to float32 so that the estimator does not
        # treat it as pre-binned input.
        binned = _BinMapper(max_bins=n_bins).fit_transform(X)
        X = binned.astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state)

    # A single boosting iteration with learning_rate=1 and no early stopping.
    est_sklearn = HistGradientBoostingRegressor(
        max_iter=1,
        max_bins=n_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # X must be treated as numerical data, not as pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    def mismatch_rate(data, tol):
        # Fraction of samples whose two predictions differ by more than tol.
        gap = abs(est_lightgbm.predict(data) - est_sklearn.predict(data))
        return np.mean(gap > tol)

    # Less than ~1% of the training predictions differ at the 3rd decimal.
    assert mismatch_rate(X_train, 1e-3) < .011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        # Less than 1% of the test predictions differ at the 4th decimal.
        assert mismatch_rate(X_test, 1e-4) < .01
def test_HistGradientBoostingRegressor():
    """Check SHAP additivity for a HistGradientBoostingRegressor.

    The per-sample sum of SHAP values plus the explainer's expected value
    must match the model prediction to within 1e-4.
    """
    # The experimental import is only needed for its side effect of enabling
    # the estimator on the sklearn versions that require it.
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
    from sklearn.ensemble import HistGradientBoostingRegressor

    # Train a tree-based model on the diabetes data set.
    X, y = shap.datasets.diabetes()
    model = HistGradientBoostingRegressor(max_iter=1000, max_depth=6)
    model = model.fit(X, y)

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)

    reconstructed = shap_values.sum(1) + explainer.expected_value
    worst_gap = np.max(np.abs(reconstructed - model.predict(X)))
    assert worst_gap < 1e-4
Exemple #3
0
    def onnxrt_python_RandomForestRegressor_dtype(
            self, dtype, n=37, full=False, use_hist=False, ntrees=10,
            runtime='python'):
        """Convert a tree regressor to ONNX and compare both predictions.

        A regressor is fitted on the iris data, converted with ``to_onnx``
        and run through ``OnnxInference``; its output is compared with
        ``clr.predict`` on a few test rows.

        :param dtype: dtype the test rows and the conversion sample are
            cast to
        :param n: overwritten below (34 when *full*, 37 otherwise), so the
            value passed by the caller has no effect
        :param full: use default hyperparameters instead of a small model
        :param use_hist: use HistGradientBoostingRegressor instead of
            RandomForestRegressor
        :param ntrees: number of trees / iterations of the small model
        :param runtime: not used by this method
        """
        iris = load_iris()
        X, y = iris.data, iris.target
        split_seed = 13 if full else 11
        X_train, X_test, y_train, _ = train_test_split(
            X, y, random_state=split_seed)
        X_test = X_test.astype(dtype)

        # Pick the estimator: default hyperparameters when `full`,
        # otherwise a small model limited to `ntrees` trees of depth 4.
        if use_hist and full:
            clr = HistGradientBoostingRegressor()
        elif use_hist:
            clr = HistGradientBoostingRegressor(
                max_iter=ntrees, max_depth=4)
        elif full:
            clr = RandomForestRegressor(n_jobs=1)
        else:
            clr = RandomForestRegressor(
                n_estimators=ntrees, n_jobs=1, max_depth=4)
        clr.fit(X_train, y_train)

        model_def = to_onnx(clr, X_train.astype(dtype),
                            rewrite_ops=True)
        oinf = OnnxInference(model_def)

        # The converted graph must contain a TreeEnsembleRegressor node.
        text = "\n".join(str(node.ops_) for node in oinf.sequence_)
        self.assertIn("TreeEnsembleRegressor", text)

        # Keep five test rows, then append two slightly perturbed copies
        # of the first kept row.
        n = 34 if full else 37
        X_test = X_test[n:n + 5]
        first_row = X_test[:1]
        X_test = numpy.vstack([X_test, first_row.copy() * 1.01,
                               first_row.copy() * 0.99])

        got = oinf.run({'X': X_test})
        self.assertEqual(sorted(got), ['variable'])
        expected = clr.predict(X_test)
        if dtype == numpy.float32:
            self.assertEqualArray(expected, got['variable'], decimal=5)
        else:
            try:
                self.assertEqualArray(expected, got['variable'])
            except AssertionError as e:
                # Attach the ONNX model to the failure for debugging.
                raise AssertionError(
                    "---------\n{}\n-----".format(model_def)) from e

        runtime_op = oinf.sequence_[0].ops_.rt_
        self.assertEqual(runtime_op.same_mode_, True)
        self.assertNotEmpty(runtime_op.nodes_modes_)
def test_zero_sample_weights_regression():
    """A zero sample weight must be equivalent to dropping the sample."""
    X = [[1, 0], [1, 0], [1, 0], [0, 1]]
    y = [0, 0, 1, 0]
    # The first two training samples get weight 0 and should be ignored, so
    # the only remaining [1, 0] sample has target 1.
    weights = [0, 0, 1, 1]

    gb = HistGradientBoostingRegressor(min_samples_leaf=1)
    gb.fit(X, y, sample_weight=weights)

    prediction = gb.predict([[1, 0]])
    assert prediction[0] > 0.5
Exemple #5
0
def GBM(X_train, X_test, y_train, y_test, loss, mode):
    """Predict train and test targets with a HistGradientBoostingRegressor.

    Parameters
    ----------
    X_train, X_test : pandas DataFrame or Series
        Training and test features. In ``'val'`` mode they are concatenated
        with ``pandas.concat``.
    y_train, y_test : array-like
        Training and test targets.
    loss : str
        Forwarded as the ``loss`` parameter of the regressor.
    mode : {'val', 'test'}
        ``'val'``: 5-fold cross-validated predictions over the concatenation
        of the train and test data, split back into the two parts.
        ``'test'``: fit on the training data, predict both sets.

    Returns
    -------
    y_prediction, y_prediction_train
        Predictions for the test part and for the training part.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'val'`` nor ``'test'``.
    """
    # Validate early: an unknown mode used to fall through both branches and
    # fail with an UnboundLocalError at the return statement.
    if mode not in ('val', 'test'):
        raise ValueError(f"mode must be 'val' or 'test', got {mode!r}")

    parameters = {
        'max_depth': 40,
        'min_samples_leaf': 1,
        'learning_rate': 0.01,
        'loss': loss,
    }
    model = HistGradientBoostingRegressor(random_state=1, **parameters)

    if mode == 'val':
        import pandas as pd

        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        X = pd.concat([X_train, X_test])
        y = np.append(y_train, y_test)
        y_pred = cross_val_predict(model, X, y, cv=5)
        # Slice from the train length so an empty test set yields an empty
        # prediction (y_pred[-0:] would return every prediction).
        n_train = len(y_train)
        y_prediction_train = y_pred[:n_train]
        y_prediction = y_pred[n_train:]
    else:
        model.fit(X_train, y_train)
        y_prediction = model.predict(X_test)
        y_prediction_train = model.predict(X_train)

    return y_prediction, y_prediction_train
Exemple #6
0
def forecast_hgbr(df, forecast, day, seed, num_epochs, all):
    """Forecast 'pv_power' for one day with a HistGradientBoostingRegressor.

    The model is trained on the rows of ``df`` whose 'zenith' is below 87
    and the prediction is written into the 'prediction' column of
    ``forecast`` for the given day. Nothing is returned.

    ``seed`` and ``num_epochs`` are accepted but not used here; ``all`` is
    forwarded unchanged to ``ann_inputs``.
    """
    # Train on day-time rows only (zenith < 87): do not try to learn PV
    # output at night. The whole forecast day is still predicted below
    # because its zenith values may differ from the training ones.
    daylight = df[df['zenith'] < 87]
    features = ann_inputs(daylight, all)
    target = daylight['pv_power']

    X_train = features.values
    y_train = target.values.reshape(len(target), 1)

    print('Creating Regressor ...')
    reg = HistGradientBoostingRegressor(max_iter=500)
    print('Fitting model ...')
    reg.fit(X_train, y_train)

    print('Making prediction ...')
    day_key = day.strftime('%Y-%m-%d')
    forecast_day = forecast.loc[day_key].copy()
    X_test = ann_inputs(forecast_day, all).values

    sk_pred = reg.predict(X_test)
    print('Prediction completed ...')

    forecast_day['prediction'] = sk_pred
    # Rows with zenith > 87 are forced to zero output.
    night = forecast_day['zenith'] > 87
    forecast_day.loc[night, 'prediction'] = 0.0
    forecast.loc[day_key, 'prediction'] = forecast_day['prediction']
Exemple #7
0
def q3_main(df, df_train):
    """Fit a HistGradientBoostingRegressor on ``df_train`` and store its
    predictions in ``df['WQI']``.

    Features are columns 3 to 14 of ``df_train``, standardised with a
    ``StandardScaler``; the target is the 'WQI' column. ``df`` is modified
    in place and also returned.

    NOTE(review): the rows used for prediction are the training rows of
    ``df_train``, not rows of ``df`` -- confirm this is intended.
    """
    feature_columns = df_train.columns[3:15]
    x_train = df_train[feature_columns]
    y_train = df_train['WQI']
    x_test = df_train[feature_columns]

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    regressor_gb = HistGradientBoostingRegressor()
    regressor_gb.fit(x_train, y_train)
    y_pred = regressor_gb.predict(x_test)

    df['WQI'] = pd.DataFrame({'Predicted WQI': y_pred})
    return df
Exemple #8
0
model.fit(X, y)

### Machine learning
# Model-based curves: for every job-title variant, take one matching row,
# sweep the value in column 300 over 0..33 and record the model prediction
# for each value (column 300 is presumably the seniority feature -- verify).
x2_all = np.arange(34)
y2_all = np.zeros_like(x2_all)

results2 = {}
for v in variants:
    idx = np.where(dirty_jt == v)[0]
    n_emp = len(idx)
    # Template row: the first row whose dirty job title equals this variant.
    # NOTE(review): if X is a NumPy array, X[idx[0]] is a view, so writing to
    # xc[300] below also modifies X in place -- confirm whether a copy was
    # intended.
    xc = X[idx[0]]
    unq_sen = np.arange(34)
    mean_salary = []
    for k, sen in enumerate(unq_sen):
        xc[300] = sen
        mean_salary.append(model.predict(xc[np.newaxis, :])[0])
    results2[v] = unq_sen, np.array(mean_salary)
    # Accumulate the curves; they are averaged over the variants below.
    y2_all = y2_all + np.array(mean_salary)

y2_all /= len(variants)

variants2 = [
    '0361 project manager', '2128 project manager', '9109 project manager',
    'manager project', 'mgr project', 'project manager'
]

### Plug-in estimates
# For each variant, select the rows with that exact dirty job title and read
# column 300 of Xs together with the matching salaries.
results3 = {}
for v in variants2:
    m1 = (dirty_jt == v)
    seniority_cat, y_cat = Xs[m1, 300], salary[m1]
Exemple #9
0
            learning_rate=0.1,
            loss='least_squares',
            # max_leaf_nodes=51,
            # min_samples_leaf=20,
            # l2_regularization=200,
            verbose=0
        )
        model.fit(train_X, train_Y)
        print("models fit")
        print("train_X length: ", len(train_X))
        print("test_X length: ", len(test_X))
        print("train MIN:{0} MAX:{1}".format(str(min(train_Y)), str(max(train_Y))))
        print("test MIN:{0} MAX:{1}".format(str(min(test_Y)), str(max(test_Y))))

        start = time.time()
        pred_Y = model.predict(test_X)
        end = time.time()
        T_i += end - start
        R2_score_avg[k_i - 1] = r2_score(test_Y, pred_Y)
        mae_avg[k_i - 1] = mean_absolute_error(test_Y, pred_Y)
        mse_avg[k_i - 1] = mean_squared_error(test_Y, pred_Y)
        mape_avg[k_i - 1] = np.mean(np.abs((pred_Y - test_Y) / test_Y)) * 100
        my_metric_avg[k_i - 1] = 0
        temp_min = 0
        temp_min_i = 0
        for y_i in range(len(test_Y)):
            if test_Y[y_i] == 0:
                continue
            if np.abs(test_Y[y_i] - pred_Y[y_i]) / test_Y[y_i] > temp_min:
                temp_min = np.abs(test_Y[y_i] - pred_Y[y_i]) / test_Y[y_i]
                temp_min_i = y_i
Exemple #10
0
def test_missing_values_minmax_imputation():
    # Compare the built-in missing value handling of Histogram GBC with an
    # a-priori missing value imputation strategy that should yield the same
    # results in terms of decision function.
    #
    # Each feature (containing NaNs) is replaced by 2 features:
    # - one where the nans are replaced by min(feature) - 1
    # - one where the nans are replaced by max(feature) + 1
    # A split where nans go to the left has an equivalent split in the
    # first (min) feature, and a split where nans go to the right has an
    # equivalent split in the second (max) feature.
    #
    # Assuming the data is such that there is never a tie to select the best
    # feature to split on during training, the learned decision trees should be
    # strictly equivalent (learn a sequence of splits that encode the same
    # decision function).
    #
    # The MinMaxImputer transformer is meant to be a toy implementation of the
    # "Missing In Attributes" (MIA) missing value handling for decision trees
    # https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305
    # The implementation of MIA as an imputation transformer was suggested by
    # "Remark 3" in :arxiv:`1902.06931`

    class MinMaxImputer(TransformerMixin, BaseEstimator):
        """Duplicate every feature, imputing NaNs with min - 1 and max + 1."""

        def fit(self, X, y=None):
            """Record the per-feature minimum and maximum of X."""
            mm = MinMaxScaler().fit(X)
            self.data_min_ = mm.data_min_
            self.data_max_ = mm.data_max_
            return self

        def transform(self, X):
            """Return [X_min, X_max] stacked column-wise.

            X_min has NaNs replaced by data_min_ - 1 and X_max has NaNs
            replaced by data_max_ + 1, so the output has twice as many
            columns as X.
            """
            X_min, X_max = X.copy(), X.copy()

            for feature_idx in range(X.shape[1]):
                nan_mask = np.isnan(X[:, feature_idx])
                X_min[nan_mask, feature_idx] = self.data_min_[feature_idx] - 1
                X_max[nan_mask, feature_idx] = self.data_max_[feature_idx] + 1

            return np.concatenate([X_min, X_max], axis=1)

    def make_missing_value_data(n_samples=int(1e4), seed=0):
        """Build a train/test split of a 4-feature regression problem in
        which every feature contains NaNs following a different pattern."""
        rng = np.random.RandomState(seed)
        X, y = make_regression(n_samples=n_samples,
                               n_features=4,
                               random_state=rng)

        # Pre-bin the data to ensure a deterministic handling by the 2
        # strategies and also make it easier to insert np.nan in a structured
        # way:
        X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X)

        # First feature has missing values completely at random:
        rnd_mask = rng.rand(X.shape[0]) > 0.9
        X[rnd_mask, 0] = np.nan

        # Second and third features have missing values for extreme values
        # (censoring missingness):
        low_mask = X[:, 1] == 0
        X[low_mask, 1] = np.nan

        high_mask = X[:, 2] == X[:, 2].max()
        X[high_mask, 2] = np.nan

        # Make the last feature nan pattern very informative: targets at or
        # above the 70th percentile are clipped to it and marked as NaN.
        y_max = np.percentile(y, 70)
        y_max_mask = y >= y_max
        y[y_max_mask] = y_max
        X[y_max_mask, 3] = np.nan

        # Check that there is at least one missing value in each feature:
        for feature_idx in range(X.shape[1]):
            assert any(np.isnan(X[:, feature_idx]))

        # Let's use a test set to check that the learned decision function is
        # the same as evaluated on unseen data. Otherwise it could just be the
        # case that we find two independent ways to overfit the training set.
        return train_test_split(X, y, random_state=rng)

    # n_samples need to be large enough to minimize the likelihood of having
    # several candidate splits with the same gain value in a given tree.
    X_train, X_test, y_train, y_test = make_missing_value_data(
        n_samples=int(1e4), seed=0)

    # Use a small number of leaf nodes and iterations so as to keep
    # under-fitting models to minimize the likelihood of ties when training the
    # model.
    gbm1 = HistGradientBoostingRegressor(max_iter=100,
                                         max_leaf_nodes=5,
                                         random_state=0)
    gbm1.fit(X_train, y_train)

    # Same estimator configuration, but fed with the explicitly imputed data.
    gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1))
    gbm2.fit(X_train, y_train)

    # Check that the models reach the same score:
    assert gbm1.score(X_train,
                      y_train) == pytest.approx(gbm2.score(X_train, y_train))

    assert gbm1.score(X_test,
                      y_test) == pytest.approx(gbm2.score(X_test, y_test))

    # Check that the individual predictions match as a finer grained
    # decision function check.
    assert_allclose(gbm1.predict(X_train), gbm2.predict(X_train))
    assert_allclose(gbm1.predict(X_test), gbm2.predict(X_test))
Exemple #11
0
# Report the test R2 score and the hyperparameters selected by the search.
print('R2 score is {}'.format(test_score_r2))
print()
print("Best parameters set found on development set:")
print(gs.best_params_)
print()

# Re-train with best parameters
regr = HistGradientBoostingRegressor(**gs.best_params_)

# Time the fit on the training data (y is flattened to 1-D first).
t0 = time.time()
regr.fit(x_train, y_train.ravel())
regr_fit = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s" % regr_fit)

# Time the prediction on the test data.
t0 = time.time()
y_regr = regr.predict(x_test)
regr_predict = time.time() - t0
print("Prediction for %d inputs in %.6f s" % (x_test.shape[0], regr_predict))

# Write the timings and the training-set metrics to output.log
# (the file is overwritten on every run).
with open('output.log', 'w') as f:
    print("Training time: %.6f s" % regr_fit, file=f)
    print("Prediction time: %.6f s" % regr_predict, file=f)
    print(" ", file=f)
    print("The model performance for training set", file=f)
    print("--------------------------------------", file=f)
    print('MAE is {}'.format(train_score_mae), file=f)
    print('MSE is {}'.format(train_score_mse), file=f)
    print('EVS is {}'.format(train_score_evs), file=f)
    print('ME is {}'.format(train_score_me), file=f)
    print('R2 score is {}'.format(train_score_r2), file=f)
    print(" ", file=f)
Exemple #12
0
#----------------------------------------------------------------------
# Train and test a HistGradientBoosting regressor
#----------------------------------------------------------------------

print(' ')
print(' REGRESSOR HIST GRADIENT BOOSTING:')
print(' ')

hgb = HistGradientBoostingRegressor(l2_regularization=12.0,
                                    max_iter=70,
                                    learning_rate=0.1,
                                    loss='least_absolute_deviation')

hgb = hgb.fit(x_treino, y_treino)

# Predictions on the training set (in-sample) and on the test set
# (out-of-sample).
y_resposta_treino = hgb.predict(x_treino)
y_resposta_teste = hgb.predict(x_teste)

# Table header: metric / in-sample / out-of-sample.
print(' Métrica   DENTRO da amostra   FORA da amostra ')
print(' -------   -----------------   --------------- ')

# In-sample metrics.
mse_in = mean_squared_error(y_treino, y_resposta_treino)
rmse_in = math.sqrt(mse_in)
r2_in = r2_score(y_treino, y_resposta_treino)
medae_in = median_absolute_error(y_treino, y_resposta_treino)
msle_in = mean_squared_log_error(y_treino, y_resposta_treino)
rmspe_in = rmspe(y_treino, y_resposta_treino)

# Out-of-sample metrics.
mse_out = mean_squared_error(y_teste, y_resposta_teste)
rmse_out = math.sqrt(mse_out)
r2_out = r2_score(y_teste, y_resposta_teste)
Exemple #13
0
        tree_method='hist',
    )
    model.fit(features_train, labels_train)
elif args.library == 'catboost':
    from catboost import CatBoostRegressor
    model = CatBoostRegressor(grow_policy='Lossguide',
                              learning_rate=0.1,
                              n_estimators=100,
                              num_leaves=255,
                              train_dir='data/catboost_info',
                              verbose=False)
    model.fit(features_train, labels_train, silent=True)

# Make predictions on the test data.
if args.library == 'h2o':
    predictions = model.predict(data_test).as_data_frame()
else:
    predictions = model.predict(features_test)

# Compute metrics.
mse = mean_squared_error(predictions, labels_test)

# Compute memory usage.
f = open("/proc/self/status", "r")
for line in f.readlines():
    if line.startswith("VmHWM"):
        memory = line.split(":")[1].strip()

print(json.dumps({
    'mse': mse,
    'memory': memory,
        X_train_df.to_numpy(),
        X_test_df.to_numpy(),
        y_train_df["adr"].to_numpy(),
        y_test_df["adr"].to_numpy(),
        y_train_df["is_canceled"].to_numpy(),
        y_test_df["is_canceled"].to_numpy(),
    )
    print(f"X_train shape {X_train.shape}, y_train shape {y_train_adr.shape}")
    print(f"X_test shape {X_test.shape}, y_test shape {y_test_adr.shape}")

    #%% evaluate performance with training data
    eval_reg = HistGradientBoostingRegressor(random_state=1126)
    eval_reg.fit(X_train.copy(), y_train_adr.copy())
    print("-" * 10, "regression report", "-" * 10)
    report = regression_report(y_test_adr.copy(),
                               eval_reg.predict(X_test.copy()),
                               X_test.shape[1])
    print(report)

    eval_clf = RandomForestClassifier(n_estimators=150,
                                      max_depth=50,
                                      random_state=1126)
    # eval_clf = HistGradientBoostingClassifier(random_state=1129)
    eval_clf.fit(X_train.copy(), y_train_canceled.copy())
    print("-" * 10, "classification report", "-" * 10)
    report = classification_report(y_test_canceled.copy(),
                                   eval_clf.predict(X_test.copy()))
    print(report)

    #%%
    pred_df = predict(eval_clf, eval_reg, X_test_df)
Exemple #15
0
    def test_pipeline(target_options, data_options, model_options, out_options,
                      plot_options):
        """Fit a regbm booster on generated data and report its quality.

        Optionally fits a sklearn HistGradientBoostingRegressor for
        comparison, prints the MAE of both models and draws plots.

        target_options: function + function domain + function representation
            + function short name
        data_options: data length
        model_options: bins + patience + early stopping delta + ...
        out_options: 'verbose' level and 'sklearn' comparison switch
        plot_options: 'need_plots' switch, also forwarded to the plot helper
        """
        # generate dataset
        x_train, y_train, x_valid, y_valid = TestHelper.__generate_data_uniform(
            target_options, data_options)

        verbosity = out_options['verbose']

        # print experiment conditions
        if verbosity >= 3:
            TestHelper.__print_conditions(model_options, data_options,
                                          x_train.shape[1])

        # fit model and measure how long the fit takes
        model = regbm.Boosting(model_options['min_bins'],
                               model_options['max_bins'],
                               model_options['patience'], False,
                               THREAD_COUNT)
        # Positional arguments of model.fit, in order, after the four arrays.
        fit_keys = ('tree_count', 'tree_depth', 'feature_fold_size',
                    'learning_rate', 'reg', 'es_delta', 'batch_part',
                    'random_batches', 'random_hist_thresholds',
                    'remove_regularization_later')
        started = time.time()
        history = model.fit(x_train, y_train, x_valid, y_valid,
                            *(model_options[key] for key in fit_keys))
        elapsed = time.time() - started

        if verbosity >= 1:
            print(f"Fit time = {elapsed} seconds")
            print("Model fit finished")
        if verbosity >= 3:
            print(f"Real tree count: {history.trees_number()}")

        # fit Sklearn model to compare with
        use_sklearn = out_options["sklearn"]
        if use_sklearn:
            sk_model = HistGradientBoostingRegressor(
                learning_rate=model_options['learning_rate'],
                max_depth=model_options['tree_depth'],
                max_iter=model_options['tree_count'])
            sk_model.fit(x_train, y_train)

        # evaluate both models
        preds = model.predict(x_valid)

        if verbosity >= 2:
            print("Evaluation:")
            model_mae = mae(y_valid, preds)
            print(f"regbm model MAE: {model_mae}")
            if use_sklearn:
                sk_preds = sk_model.predict(x_valid)
                sklearn_mae = mae(y_valid, sk_preds)
                print(f"Sklearn model MAE: {sklearn_mae}")
                print(f"Sklearn better {model_mae / sklearn_mae} times")

        # make plots
        if plot_options['need_plots']:
            model_name = TestHelper.__get_model_name(target_options,
                                                     model_options)
            TestHelper.__plot_losses(history, model_name + "_loss")
            reference_model = sk_model if use_sklearn else None
            TestHelper.__plot_predictions(
                target_options, model, plot_options, model_name + "_pred",
                reference_model)
Exemple #16
0
    # preprocess the data
    X_train, X_val, X_test, y_train, y_val, y_test = preprocess_serial()

    print("X_train.shape, y_train.shape")
    print(X_train.shape, y_train.shape)
    print("X_test.shape, y_test.shape")
    print(X_test.shape, y_test.shape)

    # create model using the best hyperparameters found in the parallel implementation
    model = HistGradientBoostingRegressor(learning_rate=0.5, max_depth=8)

    # fit data
    start = time()
    model.fit(X_train, y_train)
    dt = time() - start
    print("Time to fit: %f" % dt)

    # predict data
    y_pred = model.predict(X_test)

    # calculate stats
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"MSE: {mse}, MAE: {mae}")

    # save predictions to csv
    pred_df = pd.DataFrame(y_pred, columns=['Weighted_Price'])
    pred_df.to_csv('../data/predictions/gboost_sklearn_y_pred.csv',
                   index=False)
    print("Done")
Exemple #17
0
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

# Baseline: default loss.
hist = HistGradientBoostingRegressor(random_state=42)
hist.fit(X_train, y_train)
hist_pred = hist.predict(X_test)

compute_metrics(y_test, hist_pred)

# Same estimator with the Poisson loss.
hist_poisson = HistGradientBoostingRegressor(loss='poisson', random_state=42)
hist_poisson.fit(X_train, y_train)

hist_poisson_pred = hist_poisson.predict(X_test)

compute_metrics(y_test, hist_poisson_pred)

# Side-by-side histograms (shared y axis) of the test targets and of the
# predictions of both models.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 6), sharey=True)
ax1.hist(y_test, bins=30, alpha=0.5)
ax1.set_title("Test data")
ax2.hist(hist_pred, bins=30, alpha=0.5)
ax2.set_title("Default Hist")
ax3.hist(hist_poisson_pred, bins=30, alpha=0.5)
ax3.set_title("Poisson Hist");
# n = 100: RMSE:  53.12355810994833, R2:  0.8753501517096134
# n = 200: RMSE:  52.54485102807364, R2:  0.8780511319644563

# On Test set
# n = 10: RMSE:  170.82834497702, R2:  -0.16691087536150695
# n = 100: RMSE:  165.85154036170636, R2:  -0.09990921233067707
# n = 200: RMSE:  165.4933971927544, R2:  -0.09516400893693411

# %%
# Histogram-based Gradient Boosting Regression Tree
## Define/train model
# Many iterations with a very small learning rate; the results of the
# settings tried so far are listed at the end of this cell.
hist_boost = HistGradientBoostingRegressor(max_iter = 10000, learning_rate = 0.00001, loss = 'least_squares')
hist_boost.fit(X_train, y_train)

# Predictions
y_hat = hist_boost.predict(X_test)

# RMSE and R2 on the test set.
rmse = np.sqrt(mean_squared_error(y_test, y_hat))
r2 = r2_score(y_test, y_hat)

print('RMSE: ', rmse) 
print('R2: ', r2)
# On Test set
# max_iter=100, lr=0.1: RMSE:  164.95847973452217, R2:  -0.08809574406033427
# max_iter=500, lr=0.1: RMSE:  175.5928128791024, R2:  -0.23290976591255608
# max_iter=100, lr=0.01: RMSE:  160.164493919987, R2:  -0.025770753165551108
# max_iter=500, lr=0.01: RMSE:  162.1181697268406, R2:  -0.05094794352731302
# max_iter = 1000, lr=0.0001: RMSE:  158.41835764485302, R2:  -0.0035264732961632905
# max_iter = 10000, lr=0.00001: RMSE:  158.41827324121243, R2:  -0.003525403959673934

# %%
Exemple #19
0
        ["revenue"], test_ratio=0.3)
    X_train, X_test, y_train, y_test = (
        X_train_df.to_numpy(),
        X_test_df.to_numpy(),
        y_train_df["revenue"].to_numpy(),
        y_test_df["revenue"].to_numpy(),
    )
    print(f"X_train shape {X_train.shape}, y_train shape {y_train.shape}")
    print(f"X_test shape {X_test.shape}, y_test shape {y_test.shape}")

    #%% evaluate performance with training data
    eval_reg = HistGradientBoostingRegressor(random_state=1129)
    eval_reg.fit(X_train, y_train)

    print("-" * 10, "regression report", "-" * 10)
    report = regression_report(y_test, eval_reg.predict(X_test),
                               X_test.shape[1])
    print(report)

    print("-" * 10, "evaluation of label", "-" * 10)
    label_df = data.get_true_label(
        columns=["adr", "revenue", "is_canceled", "label"])
    pred_label_df = data.predict_label(eval_reg, X_test_df)

    print("[ label evaluation ]")
    report_label = evaluate_by_label(pred_label_df, label_df, target="label")
    print(report_label)
    print("[ revenue_per_day evaluation ]")
    report_revenue = evaluate_by_label(pred_label_df,
                                       label_df,
                                       target="revenue")
Exemple #20
0
        X_train_df.to_numpy(),
        X_test_df.to_numpy(),
        y_train_df["adr"].to_numpy(),
        y_test_df["adr"].to_numpy(),
        y_train_df["is_canceled"].to_numpy(),
        y_test_df["is_canceled"].to_numpy(),
    )
    print(f"X_train shape {X_train.shape}, y_train shape {y_train_adr.shape}")
    print(f"X_test shape {X_test.shape}, y_test shape {y_test_adr.shape}")

    #%% evaluate performance with training data
    eval_reg = HistGradientBoostingRegressor(random_state=1129)
    eval_reg.fit(X_train.copy(), y_train_adr.copy())
    print("-" * 10, "regression report", "-" * 10)
    report = regression_report(
        y_test_adr.copy(), eval_reg.predict(X_test.copy()), X_test.shape[1]
    )
    print(report)

    # eval_clf = RandomForestClassifier(random_state=1129)
    eval_clf = HistGradientBoostingClassifier(random_state=1129)
    eval_clf.fit(X_train.copy(), y_train_canceled.copy())
    print("-" * 10, "classification report", "-" * 10)
    report = classification_report(
        y_test_canceled.copy(), eval_clf.predict(X_test.copy())
    )
    print(report)

    #%%
    pred_df = predict(eval_clf, eval_reg, X_test_df)
    pred_label_df = data.to_label(pred_df)
Exemple #21
0
"""Hydro_Model

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1aMtPXxewSC8pS3Wp1Ay7Z8kthRyY-Ko6
"""

import pandas as pd
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
import pickle

# Load the cleaned data set.
data = pd.read_csv('clean.csv')

# Features: every column except the index/identifier columns, the date, the
# target ('WQI') and the 'Label' column.
X = data.drop(['Unnamed: 0', 'ID', 'Date', 'WQI', 'Label'], axis=1)

# Regression target.
Y = data['WQI']

X = np.array(X)
Y = np.array(Y)

model = HistGradientBoostingRegressor().fit(X, Y)
# Smoke-test the fitted model on the training features. This call must use
# `X`: no lowercase `x` is defined in this script, so `model.predict(x)`
# raised a NameError before the model could be saved.
model.predict(X)

pkl_filename = "pickle_model.pkl"

# Persist the fitted model.
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)
print("Model Trained and Saved")
Exemple #22
0
def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
                                     max_leaf_nodes):
    """Check that sklearn and LightGBM agree on an easy regression target.

    With bounded tree size and enough samples, both libraries are expected
    to build the same tree and therefore return (almost) the same
    predictions.

    Notes
    -----
    - With few samples per node several candidate splits can have equal
      gains (also because of float errors), so the tree structures may
      differ. Test-set predictions are therefore only compared when
      ``n_samples`` is large enough and ``max_leaf_nodes`` is small enough.
    - To ignore discrepancies caused by small differences in the binning
      strategy, the data is pre-binned when ``n_samples > 255``.
    - The absolute_error loss is not checked here: LightGBM's computation of
      the median (used for the initial value of raw_prediction) is a bit off
      (it may e.g. return midpoints when there is no need to). Since this
      test runs a single iteration, the discrepancy between the initial
      values leads to fairly large differences in the predictions; these
      differences are much smaller with more iterations.
    """
    pytest.importorskip("lightgbm")

    n_bins = 255
    random_state = np.random.RandomState(seed=seed)

    X, y = make_regression(n_samples=n_samples,
                           n_features=5,
                           n_informative=5,
                           random_state=0)

    if n_samples > 255:
        # Bin the data, then cast to float32 so that the estimator does not
        # treat it as pre-binned input.
        binned = _BinMapper(n_bins=n_bins + 1).fit_transform(X)
        X = binned.astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state)

    # A single boosting iteration with learning_rate=1 and no early stopping.
    est_sklearn = HistGradientBoostingRegressor(
        max_iter=1,
        max_bins=n_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # X must be treated as numerical data, not as pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    def mismatch_rate(data, tol):
        # Fraction of samples whose two predictions differ by more than tol.
        gap = abs(est_lightgbm.predict(data) - est_sklearn.predict(data))
        return np.mean(gap > tol)

    # Less than ~1% of the training predictions differ at the 3rd decimal.
    assert mismatch_rate(X_train, 1e-3) < 0.011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        # Less than 1% of the test predictions differ at the 4th decimal.
        assert mismatch_rate(X_test, 1e-4) < 0.01
Exemple #23
0
# Train/test score as a function of the base estimator's tree depth.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them as
# positional arguments of lineplot.
plt.style.use('seaborn-colorblind')
p = sns.lineplot(x = depths, y = scores_train, label = 'train')
p = sns.lineplot(x = depths, y = scores_test, label = 'test')
p.set_xlabel('Глубина решающего дерева (базового оценщика)', fontsize = 10)
p.set_ylabel('Коэффициент детерминации модели', fontsize = 10)
p.set_title('Качество стабилизируется с увеличением глубины', fontsize = 15)

plt.show()


# Examine the results of the best estimator in more detail
reg6 = HistGradientBoostingRegressor(max_depth = 10, max_iter = 500, random_state = 42)
reg6.fit(X_train, y_train)

# Figure 9 -- deviation of our predictions from the actual data, and more
y_pred6 = reg6.predict(X_test)
diff6 = y_test - y_pred6

# Left panel: histograms of actual vs. predicted values.
# Right panel: histogram of the residuals with a fitted normal curve.
plt.style.use('seaborn-colorblind')
f, axes = plt.subplots(1, 2, figsize = (20, 10), sharex = False)
sns.distplot(y_test, hist = True, bins = 100, kde = False, color = 'Green', label = 'Actual', ax = axes[0])
sns.distplot(y_pred6, hist = True, bins = 100, kde = False, color = 'Red', label = 'Predicted', ax = axes[0])
axes[0].legend(loc = 'upper left', frameon = False)
axes[0].set_xlabel('', fontsize = 10)
axes[0].set_title('Гистограммы доходностей акций', fontsize = 15)
sns.distplot(diff6, hist = True, bins = 50, fit = norm, kde = False, color = 'Blue', ax = axes[1])
axes[1].set_xlabel('', fontsize = 10)
axes[1].set_title('Остатки модели', fontsize = 15)

plt.show()
@author: Jie.Hu
"""

# dt =========================================================================
''' 8: Hist Gradient Boosting'''
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
# Histogram-based gradient boosting with early stopping: 20% of the training
# data is held out for validation and training stops after 20 iterations
# without an improvement larger than `tol`.
mod_hgb = HistGradientBoostingRegressor(validation_fraction=0.2,
                                        n_iter_no_change=20,
                                        tol=0.001,
                                        random_state=1337)
mod_hgb.fit(X_train, y_train)

# Predicting the Test set results
y_pred = mod_hgb.predict(X_test)
# MAPE in percent, computed after undoing the transform with expm1
# (presumably the targets were log1p-transformed upstream -- confirm).
mape = np.mean(np.abs(
    (np.expm1(y_test) - np.expm1(y_pred)) / np.expm1(y_test))) * 100
# Print model report (the header previously mislabelled this model as SVM).
print("\nHist Gradient Boosting Model Report")
print("MAPE : %.2f" % mape)


# hyperparameters tuning
def my_scorer(y_true, y_pred):
    """Return the mean absolute percentage error (in percent).

    Both inputs are mapped through ``np.expm1`` before the error is
    computed, so the percentage is measured on the back-transformed scale.

    Parameters
    ----------
    y_true : array-like
        Reference values on the transformed scale.
    y_pred : array-like
        Predicted values on the transformed scale.

    Returns
    -------
    float
        ``mean(|actual - predicted| / actual) * 100`` on the expm1 scale.
    """
    actual = np.expm1(y_true)
    predicted = np.expm1(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return np.mean(relative_error) * 100


# Wrap the MAPE function as a scikit-learn scorer. greater_is_better=False
# makes the scorer return the negated MAPE, so model-selection tools (which
# maximise the score) end up preferring the lowest error.
my_func = make_scorer(my_scorer, greater_is_better=False)
Exemple #25
0
    r_squared = model.score(X_test, y_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print('R-squared: ' + str(r_squared))
    print('Mean Squared Error: ' + str(rmse))


def scatter_plot(y_test, y_pred, model_name):
    """Show a residual plot of ``y_pred`` regressed on ``y_test``.

    Parameters
    ----------
    y_test : array-like
        Values placed on the x axis (labelled 'Price').
    y_pred : array-like
        Values whose residuals, after regressing on ``y_test``, are plotted.
    model_name : str
        Model name appended to the figure title.
    """
    plt.figure(figsize=(10, 6))
    # x/y are passed by keyword: seaborn >= 0.12 rejects positional data
    # vectors, and the keyword form is accepted by older releases as well.
    sns.residplot(x=y_test,
                  y=y_pred,
                  lowess=True,
                  color='#4682b4',
                  line_kws={
                      'lw': 2,
                      'color': 'r'
                  })
    plt.title('Price vs Residuals for ' + model_name)
    plt.xlabel('Price', fontsize=16)
    plt.xticks(fontsize=13)
    plt.yticks(fontsize=13)
    plt.show()


# Fit a HistGradientBoostingRegressor with default hyperparameters and
# predict on the held-out features.
hist = HistGradientBoostingRegressor()
hist.fit(X_train, y_train)
y_pred = hist.predict(X_test)

# Report metrics with the `rmse` helper (its definition starts outside this
# excerpt) and draw the residual plot for this model.
rmse(hist, y_test, y_pred, X_train, y_train)
scatter_plot(y_test, y_pred,
             'Histogram-based Gradient Boosting Regression Tree')
Exemple #26
0
# Carry the max_iter selected by the earlier search (`g_search`, defined
# outside this excerpt) into the shared parameter dict.
params["max_iter"] = g_search.best_params_["max_iter"]

# Tune tree-specific parameters (max_depth, min_samples_leaf)
tree_features_param_grid = {
    "max_depth": range(4, 10),
    "min_samples_leaf": range(10, 80, 5),
}
hgbr_2 = HistGradientBoostingRegressor(**params)
# NOTE(review): no random_state is passed to RandomizedSearchCV, so the
# sampled candidates (and the selected parameters) can differ between runs.
r_search = RandomizedSearchCV(hgbr_2,
                              param_distributions=tree_features_param_grid)
_ = r_search.fit(X_train, y_train)

# Record the winning tree parameters alongside the previously tuned ones.
params["max_depth"], params["min_samples_leaf"] = (
    r_search.best_params_["max_depth"],
    r_search.best_params_["min_samples_leaf"],
)

# Best model with optimized hyperparameters
best_model = HistGradientBoostingRegressor(**params)
_ = best_model.fit(X_train, y_train)

# Predictions are exponentiated before being written out -- presumably the
# model was trained on log(SalePrice); confirm against the training code.
log_test_preds = best_model.predict(X_test)
test_preds = np.exp(log_test_preds)

submission_name = "hgbr"
submission_string = "../submissions/" + submission_name + "_submission.csv"

# The index of X_test is written as the "Id" column of the submission.
result_df = pd.DataFrame({"Id": X_test.index, "SalePrice": test_preds})
result_df.to_csv(submission_string, index=False)
Exemple #27
0
    rmse_sr = np.sqrt(mean_squared_error(y_val_scaled, y_pred_sr))
    all_rmse.iloc[i - 1, 5] = rmse_sr

    # Voting
    # Ensemble of a linear model and a random forest.
    r1 = LinearRegression()
    r2 = RandomForestRegressor(n_estimators=100, random_state=0)
    regr_vr = VotingRegressor([('lr', r1), ('rf', r2)])
    # Fit the voting ensemble itself (the code previously fitted `regr`,
    # leaving `regr_vr` unfitted so that predict() could not succeed).
    regr_vr.fit(x_train_scaled, y_train_scaled)
    y_pred_vr = regr_vr.predict(x_val_scaled)
    rmse_vr = np.sqrt(mean_squared_error(y_val_scaled, y_pred_vr))
    # Column 6 of the results table holds the voting-regressor RMSE.
    all_rmse.iloc[i - 1, 6] = rmse_vr

    # Histogram-based Gradient Boosting
    regr_hgbr = HistGradientBoostingRegressor(random_state=0)
    # Fit this estimator (the code previously fitted `regr`, leaving
    # `regr_hgbr` unfitted so that predict() could not succeed).
    regr_hgbr.fit(x_train_scaled, y_train_scaled)
    y_pred_hgbr = regr_hgbr.predict(x_val_scaled)
    rmse_hgbr = np.sqrt(mean_squared_error(y_val_scaled, y_pred_hgbr))
    # Column 7 of the results table holds the hist-gradient-boosting RMSE.
    all_rmse.iloc[i - 1, 7] = rmse_hgbr

    # LightGBM
    regr_lgbm = lgb.LGBMRegressor(random_state=0, n_jobs=-1)
    regr_lgbm.fit(x_train_scaled, y_train_scaled)
    y_pred_lgbm = regr_lgbm.predict(x_val_scaled)
    rmse_lgbm = np.sqrt(mean_squared_error(y_val_scaled, y_pred_lgbm))
    all_rmse.iloc[i - 1, 8] = rmse_lgbm

    # XGBoost
    regr_xgb = XGBRegressor(random_state=0, n_jobs=-1)
    regr_xgb.fit(x_train_scaled, y_train_scaled)
    y_pred_xgb = regr_xgb.predict(x_val_scaled)
    rmse_xgb = np.sqrt(mean_squared_error(y_val_scaled, y_pred_xgb))
Exemple #28
0
task2_df = pd.DataFrame(data=res, columns=[train_labels.columns[11]])

task2_df.to_csv('subtask2.csv', index=False, header=True)

t2 = time.time()
print('subtask2, time taken: ', t2 - t1)
print(task2_df)

#subtask 3
# Fit one regressor per target (label columns 12..15) and collect the
# test-set predictions column by column.
t1 = time.time()
reg = HistGradientBoostingRegressor(random_state=1510)

# Convert the label frame to an array once, outside the loop.
train_label_values = train_labels.to_numpy()

res = np.empty(shape=(len(patient_test_data), 4))
for feat in range(12, 16):
    # The same estimator object is re-fitted for every target column.
    reg.fit(patient_train_data, train_label_values[:, feat])
    res[:, feat - 12] = reg.predict(patient_test_data)

task3_df = pd.DataFrame(data=res, columns=train_labels.columns[12:])

task3_df.to_csv('subtask3.csv', index=False, header=True)

t2 = time.time()
print('subtask3, time taken: ', t2 - t1)
print(task3_df)

#task1_df = pd.read_csv('subtask1.csv')
#task2_df = pd.read_csv('subtask2.csv')
#task3_df = pd.read_csv('subtask3.csv')

#combine results
pids = pd.DataFrame(data=test_features.pid.unique(), columns=['pid'])
Exemple #29
0
    predictors_train = train_prepared.iloc[:,:-1]
    labels_train     = np.array(train_prepared.iloc[:,-1:]).flatten()
    predictors_test  = test_prepared.iloc[:,:-1]
    labels_test      = np.array(test_prepared.iloc[:,-1:]).flatten()

    # Model
    # model = BayesianRidge(alpha_1 = 5.661182937742398, alpha_2 = 8.158544161338462, lambda_1 = 7.509288525874375, lambda_2 = 0.08383802954777253)
    model = HistGradientBoostingRegressor(l2_regularization=0.1923237939031256, learning_rate=0.10551346041298326,
                                              loss='least_absolute_deviation', max_depth=4, max_leaf_nodes=32,
                                              min_samples_leaf=4, warm_start=False)
    # model = HistGradientBoostingRegressor(l2_regularization=0.02021888460670551, learning_rate=0.04277282248041758,
    #                                           loss='least_squares', max_depth=4, max_leaf_nodes=32, min_samples_leaf=16,
    #                                           warm_start=True)
    # model = SVR(C=2.9468542209755357, coef0=-0.6868465520687694, degree=4, epsilon=0.18702907953343395, gamma=0.1632449384464454, kernel='rbf', shrinking=True)
    model.fit(predictors_train, labels_train)
    yhat = model.predict(predictors_test)

    # Linear regression
    # scatter(yhat, labels_test)
    # plt.xlabel("Prediction ML", fontsize=16)
    # plt.ylabel("log(Visitation Rate)", fontsize=16)
    m, b = np.polyfit(yhat, labels_test, 1)
    # plot(yhat, yhat)
    X_reg, y_reg = yhat.reshape(-1, 1), labels_test.reshape(-1, 1)
    reg = LinearRegression().fit(X_reg, y_reg)
    reg.score(X_reg, y_reg)

    # Density difference (observed-predicted), organic vs not-organic
    kwargs = dict(hist_kws={'alpha': .4}, kde_kws={'linewidth': 1})
    plt.figure()
    df = pd.DataFrame({'obs':labels_test, 'pred':yhat, 'is_organic':[ x == 3 for x in test_management.management ]})
Exemple #30
0
    sys.path.append(project_path)
    ###
    with open('car_price_feat.txt') as f:
        feat_list = list(filter(lambda x: x[0] != '#', f.read().split('\n')))
    ###
    data_train = pd.read_csv(f'{project_path}/data/car_price_train.201908.csv')
    data_test = pd.read_csv(f'{project_path}/data/car_price_test.201908.csv')
    ###
    series_name = '宝马5系'
    d_train = data_train[data_train.model_series == series_name]
    d_test = data_test[data_test.model_series == series_name]
    ###
    label_encode_map, f_map = DataProcess.gencode(
        pd.concat([data_train, data_test]), feat_list)
    en_train, en_test = DataProcess.encode_process(
        d_train[feat_list], feat_list,
        label_encode_map), DataProcess.encode_process(d_test[feat_list],
                                                      feat_list,
                                                      label_encode_map)
    ####
    est = HistGradientBoostingRegressor(max_iter=200,
                                        learning_rate=0.3,
                                        max_depth=6,
                                        min_samples_leaf=20,
                                        max_leaf_nodes=40)
    est.fit(en_train, d_train.price)
    pred = est.predict(en_test)
    evaluate(d_test, pred)
    ### R2
    print(est.score(en_test, d_test.price))
    def train(self):
        """Random-search HistGradientBoostingRegressor settings and log the best.

        Steps visible in this method:

        1. Build the training target from the 'Correct First Attempt' column
           of ``self.train_data`` (NaN is replaced by 0) and load the training
           features from ``./data/feature.csv``.
        2. Load three JSON lookup tables from ``./data``.
        3. Build a feature frame for ``self.test_data`` (adding the
           "Problem Unit" and "Problem Section" columns to it in place),
           dropping rows whose 'Correct First Attempt' is NaN.
        4. Repeatedly sample hyperparameters, fit a regressor, threshold its
           predictions at 0.5 and score them with ``MSER``, keeping the
           lowest score seen; the loop ends once that score is <= 0.35.

        NOTE(review): the loop has no iteration cap, so it does not terminate
        if no sampled configuration reaches the 0.35 threshold.
        """
        write_save_log("start to train")
        # Training target: one value per training row, NaN mapped to 0.
        correct_answer = []
        first_attempt_index = list(self.columns).index('Correct First Attempt')
        for row in self.train_data.values:
            re_cor = row[first_attempt_index]
            if np.isnan(re_cor):
                re_cor = 0
            correct_answer.append(re_cor)

        correct_answer = np.array(correct_answer)
        # Pre-computed training features; the column order is compared with
        # the test features further down.
        features = pd.read_csv("./data/feature.csv")

        # for col in features.columns:
        #     print(features[col].describe())

        # Lookup tables. From their use below: intelligent_table is keyed by
        # "Anon Student Id", kc_table by knowledge-component name (plus a
        # "mean" fallback) and problem_table by "Step Name" (plus 'mean').
        with open('./data/intelligent_table.json', 'r') as f:
            intelligent_table = json.loads(f.read())

        with open('./data/kc_difficulty.json', 'r') as f:
            kc_table = json.loads(f.read())

        with open('./data/problem.json', 'r') as f:
            problem_table = json.loads(f.read())

        # generate features for the test data

        test_features_pd = pd.DataFrame()

        # First pass over the test rows: split "Problem Hierarchy" into unit
        # and section, and look up a per-step value in problem_table.
        problem_unit = []
        problem_section = []
        problem_values = []
        for row in self.test_data.values:
            dict_re = dict(zip(self.test_data.columns, row))
            # "Problem Hierarchy" is split on ", "; the first part has its
            # "Unit " prefix removed and the second its "Section " prefix.
            unit = dict_re["Problem Hierarchy"].split(", ")[0]
            unit = re.sub("Unit ", "", unit)
            section = dict_re["Problem Hierarchy"].split(", ")[1]
            section = re.sub("Section ", "", section)
            problem_unit.append(unit)
            problem_section.append(section)
            # Steps missing from the table fall back to its 'mean' entry.
            if dict_re["Step Name"] in problem_table.keys():
                problem_values.append(problem_table[dict_re["Step Name"]])
            else:
                problem_values.append(problem_table['mean'])
        # Note: this adds two columns to self.test_data in place.
        self.test_data["Problem Unit"] = problem_unit
        self.test_data["Problem Section"] = problem_section

        # hash encoder ID
        # (hash_encoder_generator is defined outside this excerpt; presumably
        # it adds encoded columns for the named field to test_features_pd --
        # confirm. The call order matters because the resulting column order
        # is checked against the training features below.)
        self.hash_encoder_generator(test_features_pd, "Anon Student Id",
                                    self.test_data)
        # self.one_hot_encoder_generator(test_features_pd, "Anon Student Id", self.test_data)
        write_save_log("ID feature generated")
        # hash encoder Problem Name
        self.hash_encoder_generator(test_features_pd, "Problem Name",
                                    self.test_data)
        write_save_log("Problem Name feature generated")

        # hash encoder Problem Unit
        self.hash_encoder_generator(test_features_pd, "Problem Unit",
                                    self.test_data)
        write_save_log("Problem Unit feature generated")

        # hash encoder Problem Section
        self.hash_encoder_generator(test_features_pd, "Problem Section",
                                    self.test_data)
        write_save_log("Problem Section feature generated")

        # directly add problem view
        test_features_pd["Problem View"] = self.test_data["Problem View"]

        self.hash_encoder_generator(test_features_pd, "Step Name",
                                    self.test_data)

        # Second pass over the test rows: per-row student value, knowledge-
        # component (KC) difficulty features and the test labels.
        intel_values = []
        kc_values = []
        test_answer = []
        kc_length = []
        index_count = 0
        remove_list = []  # positions of rows without a label
        oppo_feature = []
        for row in self.test_data.values:
            dict_re = dict(zip(self.test_data.columns, row))
            # Rows without a label are remembered for removal; labelled rows
            # contribute their label to test_answer.
            if np.isnan(dict_re["Correct First Attempt"]):
                remove_list.append(index_count)
            else:
                test_answer.append(dict_re["Correct First Attempt"])
            # NOTE(review): raises KeyError for a student id that is missing
            # from intelligent_table.
            intel_values.append(intelligent_table[dict_re["Anon Student Id"]])

            stu_kc = dict_re["KC(Default)"]
            sum_difficult = 0
            kc_num = 0
            oppo_value = 0
            # A string value holds "~~"-separated KC names; any other type
            # (e.g. NaN for a missing value) takes the fallback branch.
            if type(stu_kc) == str:
                oppo_list = dict_re["Opportunity(Default)"].split("~~")
                kc_num = len(stu_kc.split("~~"))
                for true_kc in stu_kc.split("~~"):
                    # Opportunity count at the KC's position, weighted by the
                    # KC's table value. NOTE(review): .index() returns the
                    # first match, so a repeated KC name always reuses the
                    # first occurrence's opportunity value.
                    oppo_value += int(oppo_list[stu_kc.split("~~").index(
                        true_kc)]) * kc_table[true_kc]
                    sum_difficult += kc_table[true_kc]
                # Average table value over the row's KCs.
                sum_difficult /= len(stu_kc.split("~~"))
            else:
                oppo_value = kc_table["mean"]
                sum_difficult = kc_table["mean"]
            kc_values.append(sum_difficult)
            kc_length.append(kc_num)
            oppo_feature.append(oppo_value)

            index_count += 1

        test_features_pd["kc difficulty"] = kc_values
        test_features_pd["kc number"] = kc_length
        test_features_pd["person_intelligent"] = intel_values
        test_features_pd["oppo value"] = oppo_feature
        test_features_pd['Problem difficulty'] = problem_values
        # Drop unlabelled rows so the features line up with test_answer.
        # drop() matches index labels, so this assumes test_features_pd has
        # a default 0..n-1 index -- TODO confirm.
        test_features_pd.drop(remove_list, inplace=True)

        # Candidate values for the random hyperparameter search.
        parameter_range = {
            "random_state": [i for i in range(0, 40)],
            "max_iter": [i for i in range(100, 500)],
            "loss": ['least_squares', 'least_absolute_deviation', 'poisson'],
            "learning_rate": [0.1 * i for i in range(1, 7)],
            "l2_regularization": [0.1 * i for i in range(1, 10)],
        }
        best_score = 1
        bes_policy = {}
        # Keep sampling configurations until the best score is <= 0.35.
        while best_score > 0.35:
            # Despite its name, `random_state` is a dict holding one sampled
            # value per hyperparameter, each wrapped in a one-element list.
            random_state = {}
            for key, value in parameter_range.items():
                random_state[key] = random.sample(value, 1)
            write_save_log(random_state)

            # clf1 = HistGradientBoostingRegressor()
            # clf2 = AdaBoostRegressor()
            #
            # clf = VotingRegressor(estimators=[('hgb', clf1), ('rf', clf2)], weights=[2, 1])

            clf = HistGradientBoostingRegressor(
                random_state=random_state["random_state"][0],
                max_iter=random_state["max_iter"][0],
                loss=random_state['loss'][0],
                learning_rate=random_state['learning_rate'][0],
                l2_regularization=random_state['l2_regularization'][0])

            clf.fit(features.values, correct_answer)

            # Guard against a column-order mismatch between the test features
            # built above and the training feature file. Only the first
            # len(test_features_pd.columns) training columns are compared.
            for i in range(len(test_features_pd.columns)):
                if test_features_pd.columns[i] != features.columns[i]:
                    raise KeyError("feature order error!")

            # Threshold the regression output at 0.5 to get 0/1 predictions.
            res = clf.predict(test_features_pd.values)
            re_res = []
            for i in res:
                if i >= 0.5:
                    re_res.append(1)
                else:
                    re_res.append(0)

            # MSER is defined outside this excerpt; presumably an error
            # metric where lower is better (the minimum is kept) -- confirm.
            re_score = MSER(re_res, test_answer)

            write_save_log("result error: {}".format(re_score))
            if best_score > re_score:
                best_score = re_score
                bes_policy = copy.deepcopy(random_state)
            write_save_log("\nbest policy and score\n" + str(bes_policy))
            write_save_log(str(best_score) + '\n')