Example #1
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: required for scikit-learn < 1.0
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import ParameterGrid


def GBM_grid_search(X_train, y_train, X_val, y_val):

    parameters = {
        'max_depth': 40,
        'min_samples_leaf': 1,
        'learning_rate': 0.01
    }
    param_grid = {
        'loss': [  #'poisson', 
            'least_squares', 'least_absolute_deviation'
        ]
    }

    GradientBoostingRegressorObject = HistGradientBoostingRegressor(
        random_state=1, **parameters)

    best_score = float('-inf')
    for g in ParameterGrid(param_grid):
        GradientBoostingRegressorObject.set_params(**g)
        GradientBoostingRegressorObject.fit(X_train, y_train)
        # save if best (compute the validation score once per grid point)
        val_score = GradientBoostingRegressorObject.score(X_val, y_val)
        if val_score > best_score:
            best_score = val_score
            best_grid = g

    return (best_grid['loss'])
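
# A minimal usage sketch: the synthetic data, the train/validation split, and
# the assumption of a scikit-learn release (< 1.0) where the
# 'least_squares'/'least_absolute_deviation' loss names above are still valid
# are illustrative only.
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_regression(n_samples=1000, n_features=10, random_state=0)
X_tr, X_val, y_tr, y_val = train_test_split(X_demo, y_demo, random_state=0)
print(GBM_grid_search(X_tr, y_tr, X_val, y_val))  # prints the best-scoring loss name
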
def test_missing_values_resilience(problem, missing_proportion,
                                   expected_min_score_classification,
                                   expected_min_score_regression):
    # Make sure the estimators can deal with missing values and still yield
    # decent predictions

    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 2
    if problem == 'regression':
        X, y = make_regression(n_samples=n_samples,
                               n_features=n_features,
                               n_informative=n_features,
                               random_state=rng)
        gb = HistGradientBoostingRegressor()
        expected_min_score = expected_min_score_regression
    else:
        X, y = make_classification(n_samples=n_samples,
                                   n_features=n_features,
                                   n_informative=n_features,
                                   n_redundant=0,
                                   n_repeated=0,
                                   random_state=rng)
        gb = HistGradientBoostingClassifier()
        expected_min_score = expected_min_score_classification

    mask = rng.binomial(1, missing_proportion, size=X.shape).astype(bool)
    X[mask] = np.nan

    gb.fit(X, y)

    assert gb.score(X, y) > expected_min_score
def test_least_absolute_deviation():
    # For coverage only.
    X, y = make_regression(n_samples=500, random_state=0)
    gbdt = HistGradientBoostingRegressor(loss='least_absolute_deviation',
                                         random_state=0)
    gbdt.fit(X, y)
    assert gbdt.score(X, y) > .9
Example #4
    sys.path.append(project_path)
    ###
    with open('car_price_feat.txt') as f:
        feat_list = list(filter(lambda x: x and x[0] != '#', f.read().split('\n')))
    ###
    data_train = pd.read_csv(f'{project_path}/data/car_price_train.201908.csv')
    data_test = pd.read_csv(f'{project_path}/data/car_price_test.201908.csv')
    ###
    series_name = '宝马5系'  # BMW 5 Series
    d_train = data_train[data_train.model_series == series_name]
    d_test = data_test[data_test.model_series == series_name]
    ###
    label_encode_map, f_map = DataProcess.gencode(
        pd.concat([data_train, data_test]), feat_list)
    en_train = DataProcess.encode_process(d_train[feat_list], feat_list,
                                          label_encode_map)
    en_test = DataProcess.encode_process(d_test[feat_list], feat_list,
                                         label_encode_map)
    ####
    est = HistGradientBoostingRegressor(max_iter=200,
                                        learning_rate=0.3,
                                        max_depth=6,
                                        min_samples_leaf=20,
                                        max_leaf_nodes=40)
    est.fit(en_train, d_train.price)
    pred = est.predict(en_test)
    evaluate(d_test, pred)
    ### R2
    print(est.score(en_test, d_test.price))
Example #5
def test_missing_values_minmax_imputation():
    # Compare the built-in missing value handling of Histogram GBC with an
    # a-priori missing value imputation strategy that should yield the same
    # results in terms of decision function.
    #
    # Each feature (containing NaNs) is replaced by 2 features:
    # - one where the nans are replaced by min(feature) - 1
    # - one where the nans are replaced by max(feature) + 1
    # A split where nans go to the left has an equivalent split in the
    # first (min) feature, and a split where nans go to the right has an
    # equivalent split in the second (max) feature.
    #
    # Assuming the data is such that there is never a tie to select the best
    # feature to split on during training, the learned decision trees should be
    # strictly equivalent (learn a sequence of splits that encode the same
    # decision function).
    #
    # The MinMaxImputer transformer is meant to be a toy implementation of the
    # "Missing In Attributes" (MIA) missing value handling for decision trees
    # https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305
    # The implementation of MIA as an imputation transformer was suggested by
    # "Remark 3" in :arxiv:`1902.06931`.

    class MinMaxImputer(TransformerMixin, BaseEstimator):
        def fit(self, X, y=None):
            mm = MinMaxScaler().fit(X)
            self.data_min_ = mm.data_min_
            self.data_max_ = mm.data_max_
            return self

        def transform(self, X):
            X_min, X_max = X.copy(), X.copy()

            for feature_idx in range(X.shape[1]):
                nan_mask = np.isnan(X[:, feature_idx])
                X_min[nan_mask, feature_idx] = self.data_min_[feature_idx] - 1
                X_max[nan_mask, feature_idx] = self.data_max_[feature_idx] + 1

            return np.concatenate([X_min, X_max], axis=1)

    def make_missing_value_data(n_samples=int(1e4), seed=0):
        rng = np.random.RandomState(seed)
        X, y = make_regression(n_samples=n_samples,
                               n_features=4,
                               random_state=rng)

        # Pre-bin the data to ensure a deterministic handling by the 2
        # strategies and also make it easier to insert np.nan in a structured
        # way:
        X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X)

        # First feature has missing values completely at random:
        rnd_mask = rng.rand(X.shape[0]) > 0.9
        X[rnd_mask, 0] = np.nan

        # Second and third features have missing values for extreme values
        # (censoring missingness):
        low_mask = X[:, 1] == 0
        X[low_mask, 1] = np.nan

        high_mask = X[:, 2] == X[:, 2].max()
        X[high_mask, 2] = np.nan

        # Make the last feature nan pattern very informative:
        y_max = np.percentile(y, 70)
        y_max_mask = y >= y_max
        y[y_max_mask] = y_max
        X[y_max_mask, 3] = np.nan

        # Check that there is at least one missing value in each feature:
        for feature_idx in range(X.shape[1]):
            assert any(np.isnan(X[:, feature_idx]))

        # Let's use a test set to check that the learned decision function is
        # the same as evaluated on unseen data. Otherwise it could just be the
        # case that we find two independent ways to overfit the training set.
        return train_test_split(X, y, random_state=rng)

    # n_samples needs to be large enough to minimize the likelihood of having
    # several candidate splits with the same gain value in a given tree.
    X_train, X_test, y_train, y_test = make_missing_value_data(
        n_samples=int(1e4), seed=0)

    # Use a small number of leaf nodes and iterations so that the models
    # underfit, which minimizes the likelihood of ties when training them.
    gbm1 = HistGradientBoostingRegressor(max_iter=100,
                                         max_leaf_nodes=5,
                                         random_state=0)
    gbm1.fit(X_train, y_train)

    gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1))
    gbm2.fit(X_train, y_train)

    # Check that the models reach the same score:
    assert gbm1.score(X_train,
                      y_train) == pytest.approx(gbm2.score(X_train, y_train))

    assert gbm1.score(X_test,
                      y_test) == pytest.approx(gbm2.score(X_test, y_test))

    # Check that the individual predictions match, as a finer-grained
    # decision function check.
    assert_allclose(gbm1.predict(X_train), gbm2.predict(X_train))
    assert_allclose(gbm1.predict(X_test), gbm2.predict(X_test))
Example #6
def test_absolute_error():
    # For coverage only.
    X, y = make_regression(n_samples=500, random_state=0)
    gbdt = HistGradientBoostingRegressor(loss="absolute_error", random_state=0)
    gbdt.fit(X, y)
    assert gbdt.score(X, y) > 0.9

def EvaluateInput(index):
    """Compare the regressor's predicted action at `index` with the true one."""
    global Correct, Wrong
    if index < Forecaster.window_size:
        return "Please choose an index with enough window values"
    CorrectNextCP = Forecaster.dataset[index][2]
    CurrentCP = Forecaster.dataset[index - 1][2]
    CorrectAction = Forecaster.ActualOutput(CurrentCP, CorrectNextCP)
    # Use the preceding window of rows as the regressor input.
    InputToRegressor = Forecaster.dataset[index - Forecaster.window_size:index]

    # Flatten the window into a single sample for prediction.
    FlattenedInput = np.reshape(
        InputToRegressor,
        (1, InputToRegressor.shape[0] * InputToRegressor.shape[1]))
    Result = est.predict(FlattenedInput)[0]

    PredictedAction = Forecaster.ActualOutput(CurrentCP, Result)

    if PredictedAction == CorrectAction:
        print("Correct")
        Correct += 1
    else:
        print("Wrong")
        Wrong += 1


print(est.score(Forecaster.FlattenedXInput, Forecaster.YInput))

EvaluateInput(10)
print(Correct / (Correct + Wrong))
#!/usr/bin/env python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

n_samples, n_features = 1000, 20
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
# positive integer target correlated with X[:, 5] with many zeros:
y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
glm = PoissonRegressor()
gbdt = HistGradientBoostingRegressor(loss='poisson', learning_rate=.01)
glm.fit(X_train, y_train)
gbdt.fit(X_train, y_train)
print(glm.score(X_test, y_test))
print(gbdt.score(X_test, y_test))
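
# R^2 (the default .score) is not the most natural metric for a count target;
# a minimal sketch, reusing the split above, that compares both models with
# the mean Poisson deviance instead:
from sklearn.metrics import mean_poisson_deviance

for name, model in [("PoissonRegressor", glm), ("HistGradientBoosting", gbdt)]:
    deviance = mean_poisson_deviance(y_test, model.predict(X_test))
    print(f"{name}: mean Poisson deviance = {deviance:.4f}")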
fig.suptitle('Partial dependence of house value on non-location features\n'
             'for the California housing dataset, with MLPRegressor')
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

##############################################################################
# Partial Dependence computation for Gradient Boosting
# ----------------------------------------------------
#
# Let's now fit a GradientBoostingRegressor and compute the partial dependence
# plots for either one or two variables at a time.

print("Training GradientBoostingRegressor...")
tic = time()
est = HistGradientBoostingRegressor()
est.fit(X_train, y_train)
print("done in {:.3f}s".format(time() - tic))
print("Test R2 score: {:.2f}".format(est.score(X_test, y_test)))

##############################################################################
# Here, we used the default hyperparameters for the gradient boosting model
# without any preprocessing as tree-based models are naturally robust to
# monotonic transformations of numerical features.
#
# Note that on this tabular dataset, Gradient Boosting Machines are both
# significantly faster to train and more accurate than neural networks. It is
# also significantly cheaper to tune their hyperparameters (the defaults tend
# to work well, which is often not the case for neural networks).
#
# Finally, as we will see next, computing partial dependence plots for
# tree-based models is also orders of magnitude faster, making it cheap to
# compute partial dependence plots for pairs of interacting features:
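
##############################################################################
# A minimal sketch of such a plot (assuming, as above, that ``X_train`` is the
# California housing feature DataFrame, so the column names below resolve; the
# exact feature choice is illustrative):

from sklearn.inspection import plot_partial_dependence

print("Computing partial dependence plots...")
tic = time()
features = ['MedInc', 'AveOccup', ('AveOccup', 'HouseAge')]
plot_partial_dependence(est, X_train, features, n_jobs=3, grid_resolution=20)
print("done in {:.3f}s".format(time() - tic))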
Example #10
#
# Below we give an example on a larger dataset and compare the computation
# time with the earlier experiment from the previous section.

# %%
from time import time
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

histogram_gradient_boosting = HistGradientBoostingRegressor(max_iter=200,
                                                            random_state=0)

start_time = time()
histogram_gradient_boosting.fit(X_train, y_train)
fit_time_histogram_gradient_boosting = time() - start_time

start_time = time()
score_histogram_gradient_boosting = histogram_gradient_boosting.score(
    X_test, y_test)
score_time_histogram_gradient_boosting = time() - start_time

print("Histogram gradient boosting decision tree")
print(f"R2 score: {score_histogram_gradient_boosting:.3f}")
print(f"Fit time: {fit_time_histogram_gradient_boosting:.2f} s")
print(f"Score time: {score_time_histogram_gradient_boosting:.5f} s\n")

# %% [markdown]
# The histogram gradient boosting is the best algorithm in terms of score.
# It also scales well as the number of samples increases, while the normal
# gradient boosting does not.
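
# %% [markdown]
# A minimal sketch of that scaling claim (the `GradientBoostingRegressor`
# baseline and the synthetic dataset sizes are illustrative assumptions; the
# largest setting may take a minute for the non-histogram model):

# %%
from time import time
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

for n in (1_000, 10_000, 50_000):
    X_demo, y_demo = make_regression(n_samples=n, n_features=20, random_state=0)
    for model in (GradientBoostingRegressor(n_estimators=100, random_state=0),
                  HistGradientBoostingRegressor(max_iter=100, random_state=0)):
        start = time()
        model.fit(X_demo, y_demo)
        print(f"{model.__class__.__name__:<30} n_samples={n:>6}: "
              f"{time() - start:.2f} s")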
Example #11
# Algorithm #5: gradient boosting
reg5 = GradientBoostingRegressor(n_estimators = 500, max_features = 'auto', random_state = 42)
reg5.fit(X_train, y_train)

train_score_5 = reg5.score(X_train, y_train)
test_score_5 = reg5.score(X_test, y_test)

print('Score on train: ' + str(train_score_5)) # Score on train: 0.843
print('Score on test: ' + str(test_score_5)) # Score on test: 0.606


# Algorithm #6: fast (histogram-based) gradient boosting
reg6 = HistGradientBoostingRegressor(max_iter = 500, random_state = 42)
reg6.fit(X_train, y_train)

train_score_6 = reg6.score(X_train, y_train)
test_score_6 = reg6.score(X_test, y_test)

print('Score on train: ' + str(train_score_6)) # Score on train: 0.993
print('Score on test: ' + str(test_score_6)) # Score on test: 0.674


# Sweep over different decision-tree depths for the gradient boosting model
all_time = datetime.now()

hgb_scores = {}
for i in range(2, 31):
    timer = datetime.now()
    print(i)
    reg6 = HistGradientBoostingRegressor(max_depth = i, max_iter = 500, random_state = 42)
    reg6.fit(X_train, y_train)
Example #12
def modeling_compare(X, y):
    import pandas as pd
    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import Ridge
    from sklearn.linear_model import RidgeCV
    from sklearn.model_selection import RepeatedKFold
    from sklearn.linear_model import ElasticNet
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import PoissonRegressor
    from sklearn.experimental import enable_hist_gradient_boosting
    from sklearn.ensemble import HistGradientBoostingRegressor
    from sklearn.linear_model import Lasso
    from sklearn.linear_model import SGDRegressor
    from sklearn.neural_network import MLPClassifier
    from sklearn.ensemble import VotingRegressor

    models_lab = [
        'Linear Regression', 'Ridge', 'Ridge with tuning hyperparameters',
        'Elastic Net', 'Random Forest', 'Poisson Regression',
        'Gradient Boosting regression', 'Lasso', 'Stochastic Gradient Descent',
        'Neural Network', 'Voting Regression'
    ]

    reg1 = LinearRegression().fit(X, y)
    reg2 = Ridge().fit(X, y)
    reg3 = Ridge(alpha=0.2).fit(X, y)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    # RidgeCV requires strictly positive alphas, hence the grid starts at 0.01
    reg3 = RidgeCV(alphas=np.arange(0.01, 1, 0.01),
                   cv=cv,
                   scoring='neg_mean_absolute_error').fit(X, y)
    reg4 = ElasticNet().fit(X, y)
    reg5 = RandomForestRegressor().fit(X, y)
    reg6 = PoissonRegressor().fit(X, y)
    reg7 = HistGradientBoostingRegressor(loss='poisson',
                                         learning_rate=.01).fit(X, y)
    reg8 = Lasso().fit(X, y)
    reg9 = SGDRegressor(loss='squared_loss', penalty='l2').fit(X, y)
    # Note: MLPClassifier treats the target as discrete class labels, so its
    # score below is an accuracy rather than R^2.
    reg10 = MLPClassifier(solver='lbfgs',
                          alpha=1e-5,
                          hidden_layer_sizes=(17, 10),
                          random_state=1).fit(X, y)

    # VotingRegressor without NN
    ereg = VotingRegressor(estimators=[
        ('lr', reg1), ('rd', reg2), ('rs', reg3), ('en', reg4), ('rf', reg5),
        ('pr', reg6), ('gb', reg7), ('ls', reg8), ('gd', reg9)
    ]).fit(X, y)

    models_obj = [
        reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10, ereg
    ]

    score = [
        reg1.score(X, y),
        reg2.score(X, y),
        reg3.score(X, y),
        reg4.score(X, y),
        reg5.score(X, y),
        reg6.score(X, y),
        reg7.score(X, y),
        reg8.score(X, y),
        reg9.score(X, y),
        reg10.score(X, y),
        ereg.score(X, y)
    ]

    score_df = pd.DataFrame()
    score_df['models_lab'] = models_lab
    score_df['models_obj'] = models_obj
    score_df['score'] = score

    return (score_df)
from sklearn.inspection import plot_partial_dependence
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.datasets import fetch_california_housing

df = fetch_california_housing(as_frame=True)
print(df)
x = df['data']
y = df['target']

est = HistGradientBoostingRegressor().fit(x, y)
est.score(x, y)
features = ['MedInc', 'HouseAge', ['MedInc', 'HouseAge']]
plot_partial_dependence(est, x, features=features)
"""We can clearly see an interaction between the two features: 
for a median income > 4.5, the house price depends on HouseAge, while for
MedInc < 4.5 there is no strong dependence between house price and HouseAge.
This makes sense: wealthier areas build more brand-new houses, which affects
house prices. If we lived in Sao Paulo, a very low income would not affect the
relationship between prices and population, because everyone is poor.
"""

from matplotlib import pyplot as plt
plt.gca()
plt.show()
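
# A minimal sketch that backs the MedInc/HouseAge interaction reading above up
# numerically, assuming the fitted `est` and the DataFrame `x` from this script
# (and scikit-learn >= 0.24 for the `kind` argument): compute the joint partial
# dependence grid for ('MedInc', 'HouseAge') and compare how much the prediction
# moves with HouseAge at the lowest vs. the highest MedInc grid value.
import numpy as np
from sklearn.inspection import partial_dependence

pd_results = partial_dependence(est, x, features=['MedInc', 'HouseAge'],
                                kind='average')
pd_surface = pd_results['average'][0]  # shape: (n_MedInc_values, n_HouseAge_values)
print('PD range over HouseAge at low MedInc :', np.ptp(pd_surface[0]))
print('PD range over HouseAge at high MedInc:', np.ptp(pd_surface[-1]))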
"""Disadvantages of PDP:
- The realistic maximum number of features in a partial dependence function is two.
- The assumption of independence is the biggest issue with PD plots: it is assumed
  that the feature(s) for which the partial dependence is computed are not correlated
  with other features. One solution to this problem is Accumulated Local Effects (ALE)
  plots, which work with the conditional instead of the marginal distribution.
- By plotting the individual conditional expectation curves instead of the aggregated line,