Code Example #1
def run(argv=None):
    """Emulate a HP search and monitor fit time."""
    args = parser.parse_args(argv)

    imputers = {
        'Mean': SimpleImputer(strategy='mean'),
        'Mean+mask': SimpleImputer(strategy='mean', add_indicator=True),
        'Med': SimpleImputer(strategy='median'),
        'Med+mask': SimpleImputer(strategy='median', add_indicator=True),
        'Iterative': IterativeImputer(max_iter=args.max_iter),
        'Iterative+mask': IterativeImputer(add_indicator=True,
                                           max_iter=args.max_iter),
        'IterativeR': IterativeImputer(estimator=RidgeCV(),
                                       max_iter=args.max_iter),
        'IterativeR+mask': IterativeImputer(estimator=RidgeCV(),
                                            add_indicator=True,
                                            max_iter=args.max_iter),
        'KNN': KNNImputer(),
        'KNN+mask': KNNImputer(add_indicator=True),

    }

    task_name = args.task_name
    est = args.est
    imp = imputers.get(args.imp, None)

    if task_name is None or est is None:
        logger.info('No argv given.')
        task_name = 'TB/shock_hemo'
        est = 'HGBC'

    task = tasks[task_name]
    logger.info(f'Argv given. Task {task.meta.tag}. est {est}.')

    t0 = time()
    logger.info('Getting X.')
    X = task.X
    logger.info('Getting y.')
    y = task.y

    logger.info(f'X shape before splits: {X.shape}')

    # Simulate the outer CV (the one of KFold)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2)

    # Simulate the inner CV (the one of RandomSearchCV)
    X_train2, X_test2, y_train2, _ = train_test_split(X_train, y_train, test_size=0.2)

    # Now X has the same shape as in real experiment
    logger.info(f'X shape: {X_train2.shape}')

    t_X_ready = time()

    if imp is not None:
        logger.info(f'Fitting imputer {args.imp}')
        imp.fit(X_train2, y_train2)
        t_fit_imp = time()
        logger.info('Imputer fitted.')

        logger.info('Transforming X_train')
        imp.transform(X_train2)
        t_tra1_imp = time()
        logger.info('X_train transformed')

        logger.info('Transforming X_test')
        imp.transform(X_test2)
        t_tra2_imp = time()
        logger.info('X_test transformed')

    t_fits = [time()]

    for learning_rate in param_space['learning_rate']:
        for max_depth in param_space['max_depth']:
            if est == 'HGBC':
                estimator = HistGradientBoostingClassifier(
                    learning_rate=learning_rate,
                    max_depth=max_depth
                )
            elif est == 'HGBR':
                estimator = HistGradientBoostingRegressor(
                    loss='least_absolute_deviation',
                    learning_rate=learning_rate,
                    max_depth=max_depth
                )
            else:
                raise ValueError(f'Unknown estimator {est}')

            logger.info(f'Params: LR {learning_rate} MD {max_depth}')
            logger.info('Fitting estimator.')
            estimator.fit(X_train2, y_train2)
            t_fits.append(time())
            logger.info('Estimator fitted.')

    t_fits = np.diff(t_fits)

    data = {
        'task_tag': [task.meta.tag],
        'imp': [args.imp],
        'imp_params': [repr({'max_iter': args.max_iter})],
        'X_shape': [repr(X.shape)],
        'X_train_shape': [repr(X_train2.shape)],
        'X_test_shape': [repr(X_test2.shape)],
        'time_X_ready': [t_X_ready-t0],
        'time_fit_imp': np.around([0 if imp is None else t_fit_imp-t_X_ready], 2),
        'time_tra1_imp': np.around([0 if imp is None else t_tra1_imp-t_fit_imp], 2),
        'time_tra2_imp': np.around([0 if imp is None else t_tra2_imp-t_tra1_imp], 2),
        'time_fits': [repr(np.around(t_fits.tolist(), 2))],
        'time_fits_mean': [np.around(t_fits.mean(), 2)]
    }

    new_df = pd.DataFrame(data)

    df = None
    filepath = 'results/fit_time.csv'
    if os.path.exists(filepath):
        df = pd.read_csv(filepath, index_col=0)

    if df is not None:
        new_df = pd.concat([df, new_df])

    new_df.to_csv(filepath)
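
# Note: `parser`, `tasks`, `logger` and `param_space` are module-level objects
# defined elsewhere in the original project. As a purely illustrative
# assumption, a `param_space` compatible with the nested loops above could be:
param_space = {
    'learning_rate': [0.05, 0.1, 0.3],  # assumed values
    'max_depth': [3, 6, 9],             # assumed values
}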
Code Example #2
def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
                                     max_leaf_nodes):
    # Make sure sklearn has the same predictions as lightgbm for easy targets.
    #
    # In particular, when the size of the trees is bounded and the number of
    # samples is large enough, the structure of the prediction trees found by
    # LightGBM and sklearn should be exactly identical.
    #
    # Notes:
    # - Several candidate splits may have equal gains when the number of
    #   samples in a node is low (and because of float errors). Therefore the
    #   predictions on the test set might differ if the structure of the tree
    #   is not exactly the same. To avoid this issue we only compare the
    #   predictions on the test set when the number of samples is large enough
    #   and max_leaf_nodes is low enough.
    # - To ignore discrepancies caused by small differences in the binning
    #   strategy, data is pre-binned if n_samples > 255.
    # - We don't check the least_absolute_deviation loss here. This is because
    #   LightGBM's computation of the median (used for the initial value of
    #   raw_prediction) is a bit off (they'll e.g. return midpoints when there
    #   is no need to.). Since these tests only run 1 iteration, the
    #   discrepancy between the initial values leads to biggish differences in
    #   the predictions. These differences are much smaller with more
    #   iterations.
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255

    X, y = make_regression(n_samples=n_samples,
                           n_features=5,
                           n_informative=5,
                           random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingRegressor(
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    # less than 1% of the predictions are different up to the 3rd decimal
    assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-3) < .011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        # less than 1% of the predictions are different up to the 4th decimal
        assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-4) < .01
Code Example #3
# classes even more optimized for large datasets, called
# `HistGradientBoostingClassifier` and `HistGradientBoostingRegressor`. Each
# feature in the dataset `X` is first binned by computing histograms, which are
# later used to evaluate the potential splits. The number of splits to evaluate
# is then much smaller. This algorithm becomes much more efficient than
# gradient boosting when the dataset has 10,000+ samples.
#
# Below we give an example on a large dataset and compare its computation time
# with that of the experiment in the previous section.

# %%
from time import time
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

histogram_gradient_boosting = HistGradientBoostingRegressor(max_iter=200,
                                                            random_state=0)

start_time = time()
histogram_gradient_boosting.fit(X_train, y_train)
fit_time_histogram_gradient_boosting = time() - start_time

start_time = time()
score_histogram_gradient_boosting = histogram_gradient_boosting.score(
    X_test, y_test)
score_time_histogram_gradient_boosting = time() - start_time

print("Historgram gradient boosting decision tree")
print(f"R2 score: {score_histogram_gradient_boosting:.3f}")
print(f"Fit time: {fit_time_histogram_gradient_boosting:.2f} s")
print(f"Score time: {score_time_histogram_gradient_boosting:.5f} s\n")
Code Example #4
def test_poisson_y_positive(y):
    # Test that ValueError is raised if either one y_i < 0 or sum(y_i) <= 0.
    err_msg = r"loss='poisson' requires non-negative y and sum\(y\) > 0."
    gbdt = HistGradientBoostingRegressor(loss="poisson", random_state=0)
    with pytest.raises(ValueError, match=err_msg):
        gbdt.fit(np.zeros(shape=(len(y), 1)), y)
Code Example #5
pipe_KN = Pipeline([('scl', StandardScaler()),
                    ('est', KNeighborsRegressor())])

#pipe_KN_PCA = Pipeline([('scl', StandardScaler()),
#                        ('pca', PCA(n_components=2)),
#                        ('est', KNeighborsRegressor())])

pipe_GB = Pipeline([('scl', StandardScaler()),
                    ('est', GradientBoostingRegressor(random_state=seed))])

#pipe_GB_PCA = Pipeline([('scl', StandardScaler()),
#                        ('pca', PCA(n_components=2)),
#                        ('est', GradientBoostingRegressor(random_state=seed))])

pipe_HGB = Pipeline([('scl', StandardScaler()),
                     ('est', HistGradientBoostingRegressor(random_state=seed))])

#pipe_HGB_PCA = Pipeline([('scl', StandardScaler()),
#                         ('pca', PCA(n_components=2)),
#                         ('est', HistGradientBoostingRegressor(random_state=seed))])

pipe_B = Pipeline([('scl', StandardScaler()),
                   ('est', BaggingRegressor(random_state=seed))])

#pipe_B_PCA = Pipeline([('scl', StandardScaler()),
#                       ('pca', PCA(n_components=2)),
#                       ('est', BaggingRegressor(random_state=seed))])

# Set grid search params
grid_params_DT = [{'est__criterion': ('mse', 'friedman_mse', 'mae'),
                   'est__splitter': ('best', 'random'),
Code Example #6
    'for the California housing dataset, with MLPRegressor')
display.figure_.subplots_adjust(hspace=0.3)

# %%
# Gradient boosting
# .................
#
# Let's now fit a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` and
# compute the partial dependence on the same features.

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

print("Training HistGradientBoostingRegressor...")
tic = time()
est = HistGradientBoostingRegressor()
est.fit(X_train, y_train)
print(f"done in {time() - tic:.3f}s")
print(f"Test R2 score: {est.score(X_test, y_test):.2f}")

# %%
# Here, we used the default hyperparameters for the gradient boosting model
# without any preprocessing as tree-based models are naturally robust to
# monotonic transformations of numerical features.
#
# Note that on this tabular dataset, Gradient Boosting Machines are both
# significantly faster to train and more accurate than neural networks. It is
# also significantly cheaper to tune their hyperparameters (the defaults tend
# to work well while this is not often the case for neural networks).
#
# We will plot the partial dependence, both individual (ICE) and averaged ones.
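
# A minimal sketch of the partial dependence plot referred to above (the
# feature list is an assumed subset of the California housing columns, and
# kind="both" requires scikit-learn >= 0.24):
from sklearn.inspection import plot_partial_dependence

features = ["MedInc", "AveOccup", "HouseAge", "AveRooms"]  # assumed
display = plot_partial_dependence(
    est, X_train, features, kind="both", subsample=50, random_state=0,
)
display.figure_.suptitle(
    "Partial dependence of house value on non-location features\n"
    "for the California housing dataset, with HistGradientBoostingRegressor"
)
display.figure_.subplots_adjust(hspace=0.3)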
Code Example #7
# the prediction performance of each individual model.
#
# Here, we combine 3 learners (linear and non-linear) and use a ridge regressor
# to combine their outputs.

from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV

estimators = [
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Lasso', LassoCV()),
    ('Gradient Boosting', HistGradientBoostingRegressor(random_state=0))
]
stacking_regressor = StackingRegressor(
    estimators=estimators, final_estimator=RidgeCV()
)


###############################################################################
# We used the Boston data set (prediction of house prices). We check the
# performance of each individual predictor as well as the stack of the
# regressors.

import time
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_validate, cross_val_predict
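
# The snippet is truncated here; a rough sketch of the evaluation it describes,
# assuming X, y are loaded from the (now deprecated) Boston dataset:
X, y = load_boston(return_X_y=True)

for name, est in estimators + [('Stacking Regressor', stacking_regressor)]:
    start = time.time()
    scores = cross_validate(est, X, y,
                            scoring=['r2', 'neg_mean_absolute_error'],
                            n_jobs=-1)
    elapsed = time.time() - start
    print(f"{name}: R2 = {np.mean(scores['test_r2']):.3f} "
          f"(evaluated in {elapsed:.1f} s)")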
Code Example #8
# 'poisson' loss as well.

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

n_samples, n_features = 1000, 20
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
# positive integer target correlated with X[:, 5] with many zeros:
y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
glm = PoissonRegressor()
gbdt = HistGradientBoostingRegressor(loss='poisson', learning_rate=.01)
glm.fit(X_train, y_train)
gbdt.fit(X_train, y_train)
print(glm.score(X_test, y_test))
print(gbdt.score(X_test, y_test))

##############################################################################
# Rich visual representation of estimators
# -----------------------------------------
# Estimators can now be visualized in notebooks by enabling the
# `display='diagram'` option. This is particularly useful to summarise the
# structure of pipelines and other composite estimators, with interactivity to
# provide detail.  Click on the example image below to expand Pipeline
# elements.  See :ref:`visualizing_composite_estimators` for how you can use
# this feature.
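
# A minimal sketch of enabling the diagram display mentioned above
# (`set_config(display='diagram')` is the documented way to turn it on):
from sklearn import set_config
set_config(display='diagram')

# In a notebook, displaying a composite estimator (e.g. a Pipeline, or the
# `gbdt` model fitted above) as the last expression of a cell now renders an
# interactive HTML diagram instead of the plain text repr.
gbdt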
Code Example #9
# df_test  = pd.read_csv("./data/test_3B.csv")

print('train:', df_train.shape)
print('test:', df_test.shape)

# extract all the feature columns for ML
X = df_train[df_train.columns[:-1]].copy()
y = df_train["SalePrice_Log"].copy()
final_test = df_test.copy()  # final test X

#%% Models with optimal params determined by 10-folds CV
lasso_opt = Lasso(alpha=0.00423)
ridge_opt = Ridge(alpha=1)
knn_opt = KNeighborsRegressor(n_neighbors=10, weights='distance')
rf = RandomForestRegressor(random_state=2)  # use default params: N=100
boost = HistGradientBoostingRegressor()  # default 'max_iter': 100
dt = DecisionTreeRegressor(random_state=5)  # use default params
SVMrbf_opt = SVR(kernel='rbf', C=1.0, gamma=0.01)
SVMli = SVR(kernel='linear')
SVMpoly_opt = SVR(kernel='poly', degree=3)

estimators_list = [('Lasso', lasso_opt), ('Ridge', ridge_opt),
                   ('KNN', knn_opt), ('RF', rf), ('DT', dt), ('Boost', boost),
                   ('SVMrbf', SVMrbf_opt), ('SVMli', SVMli),
                   ('SVMpoly', SVMpoly_opt)]

#%% do SFS-model stacking
start_time = time.time()
estimators_sel, RMSE_best, RMSE_all_steps = SFS_stack_models(estimators_list,
                                                             X,
                                                             y,
Code Example #10
    "Partial dependence of house value on non-location features\n"
    "for the California housing dataset, with MLPRegressor")
display.figure_.subplots_adjust(hspace=0.3)

# %%
# Gradient boosting
# .................
#
# Let's now fit a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` and
# compute the partial dependence on the same features.

from sklearn.ensemble import HistGradientBoostingRegressor

print("Training HistGradientBoostingRegressor...")
tic = time()
est = HistGradientBoostingRegressor(random_state=0)
est.fit(X_train, y_train)
print(f"done in {time() - tic:.3f}s")
print(f"Test R2 score: {est.score(X_test, y_test):.2f}")

# %%
# Here, we used the default hyperparameters for the gradient boosting model
# without any preprocessing as tree-based models are naturally robust to
# monotonic transformations of numerical features.
#
# Note that on this tabular dataset, Gradient Boosting Machines are both
# significantly faster to train and more accurate than neural networks. It is
# also significantly cheaper to tune their hyperparameters (the defaults tend
# to work well while this is not often the case for neural networks).
#
# We will plot the partial dependence, both individual (ICE) and averaged ones.
Code Example #11
# Gradient boosting estimator with dropped categorical features
# -------------------------------------------------------------
# As a baseline, we create an estimator where the categorical features are
# dropped:

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

dropper = make_column_transformer(
    ('drop', make_column_selector(dtype_include='category')),
    remainder='passthrough')
hist_dropped = make_pipeline(dropper,
                             HistGradientBoostingRegressor(random_state=42))

# %%
# Gradient boosting estimator with one-hot encoding
# -------------------------------------------------
# Next, we create a pipeline that will one-hot encode the categorical features
# and let the rest of the numerical data pass through:

from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='ignore'),
     make_column_selector(dtype_include='category')),
    remainder='passthrough')

hist_one_hot = make_pipeline(one_hot_encoder,
Code Example #12
categories = [
    ["clear", "misty", "rain"],
    ["spring", "summer", "fall", "winter"],
    ["False", "True"],
    ["False", "True"],
]
ordinal_encoder = OrdinalEncoder(categories=categories)

gbrt_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ("categorical", ordinal_encoder, categorical_columns),
        ],
        remainder="passthrough",
    ),
    HistGradientBoostingRegressor(categorical_features=range(4)),
)

# %%
#
# Let's evaluate our gradient boosting model with the mean absolute error of the
# relative demand averaged across our 5 time-based cross-validation splits:


def evaluate(model, X, y, cv):
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error"],
Code Example #13
"""Hydro_Model

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1aMtPXxewSC8pS3Wp1Ay7Z8kthRyY-Ko6
"""

import pandas as pd
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
import pickle

data = pd.read_csv('clean.csv')

X = data.drop(['Unnamed: 0', 'ID', 'Date', 'WQI', 'Label'], axis=1)

Y = data['WQI']

X = np.array(X)
Y = np.array(Y)

model = HistGradientBoostingRegressor().fit(X, Y)
model.predict(X)

pkl_filename = "pickle_model.pkl"

with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)
print("Model Trained and Saved")
Code Example #14
        #RMSE    ~  540 ($)
        #RUNTIME ~   20 (min)

    'RandomForestRegressor':RandomForestRegressor(),
        #RMSE    ~  551 ($)
        #RUNTIME ~   4 (min)

    ======================================== Experimental Models ========================================
"""

highest_score = 530

ESTIMATORS_STACK = [
    # to be used with StackingRegressor() estimator
    ('RandomForestRegressor', RandomForestRegressor()),
    ('HistGradientBoostingRegressor', HistGradientBoostingRegressor())
]

models = {
    'RandomForestRegressor': RandomForestRegressor(),
    #RMSE    ~  551 ($)
    #RUNTIME ~   4 (min)

    #'StackingRegressor':StackingRegressor(estimators=ESTIMATORS_STACK),
    #COMMENT: Can stack multiple estimators and generate a regression from it.
    #RMSE    ~  528 ($)
    #RUNTIME ~   30 (min)

    #'ARDRegression':ARDRegression(),
    #RMSE    ~ UNKNOWN (needs more RAM)
    #RUNTIME ~ UNKNOWN (min)
Code Example #15
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import OrdinalEncoder


tree_preprocessor = ColumnTransformer(
    [
        ("categorical", OrdinalEncoder(),
            ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
        ("numeric", "passthrough",
            ["VehAge", "DrivAge", "BonusMalus", "Density"]),
    ],
    remainder="drop",
)
poisson_gbrt = Pipeline([
    ("preprocessor", tree_preprocessor),
    ("regressor", HistGradientBoostingRegressor(loss="poisson",
                                                max_leaf_nodes=128)),
])
poisson_gbrt.fit(df_train, df_train["Frequency"],
                 regressor__sample_weight=df_train["Exposure"])

print("Poisson Gradient Boosted Trees evaluation:")
score_estimator(poisson_gbrt, df_test)

# %%
# Like the Poisson GLM above, the gradient boosted trees model minimizes
# the Poisson deviance. However, because of a higher predictive power,
# it reaches lower values of Poisson deviance.
#
# Evaluating models with a single train / test split is prone to random
# fluctuations. If computing resources allow, it should be verified that
# cross-validated performance metrics would lead to similar conclusions.
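
# A sketch of the cross-validated check suggested above, assuming the same
# df_train / "Frequency" / "Exposure" columns (note that the scorer itself
# does not apply the exposure weights here):
from sklearn.model_selection import cross_val_score

cv_deviance = -cross_val_score(
    poisson_gbrt,
    df_train,
    df_train["Frequency"],
    scoring="neg_mean_poisson_deviance",
    cv=5,
    fit_params={"regressor__sample_weight": df_train["Exposure"]},
)
print(f"CV mean Poisson deviance: {cv_deviance.mean():.4f} "
      f"+/- {cv_deviance.std():.4f}")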
Code Example #16
# top_5_precip = pd.concat([top_5_flow, p_data], axis=1, join='inner').drop('66401_0', axis = 1).dropna()

# top_5_train_labels = np.array(top_5['66401_0'])
# top_5_train_features = np.array(top_5.drop('66401_0', axis = 1))

# top_5_test_labels = np.array(top_5_flow[:date_cut_off])
# top_5_test_features = np.array(top_5_precip.loc[:date_cut_off])
# top_5_test_features_index = top_5_precip.loc[:date_cut_off].index

test_features = data1.loc[date_cut_off:]

test_features1 = np.array(test_features.drop(['66401_0', '66401_48'], axis=1))
test_features2 = np.array(test_features.drop('66401_0', axis=1))

rf = HistGradientBoostingRegressor(max_iter=100, random_state=42)
rf.fit(train_features1, train_labels)
predictions1 = rf.predict(test_features1)
predict1 = pd.Series(predictions1,
                     index=test_features.index,
                     name='100 HistGB Predicted Flow')
# median1 = actual1[actual1 > actual1.quantile(0.95)].median()
# predict3 = predict2[predict2 > median1]

# rf = HistGradientBoostingRegressor(max_iter = 200, random_state = 42)
# rf.fit(train_features1, train_labels)
# predictions2 = rf.predict(test_features1)
# predict2 = pd.Series(predictions2, index=test_features.index, name='200 HistGB Predicted Flow')

# rf = RandomForestRegressor(n_estimators = 200, random_state = None, n_jobs=3)
# rf.fit(top_5_train_features, top_5_train_labels)
Code Example #17
File: parse.py  Project: Catadanna/Kaggle_NCAAW_2020
    "AdaBoost_Regressor",
    "LogisticRegression",
    "TheilSen_Regressor",
    "Huber_Regressor",
    "CatBoost_Classifier",
    "CatBoost_Regressor",
]

classifiers = [
    RidgeClassifier(),
    RidgeClassifierCV(),
    XGBRegressor(),
    GradientBoostingClassifier(verbose=0),
    GradientBoostingRegressor(verbose=0),
    HistGradientBoostingClassifier(verbose=0),
    HistGradientBoostingRegressor(verbose=0),
    ExtraTreesClassifier(verbose=0),
    ExtraTreesRegressor(verbose=0),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=500, verbose=0),
    RandomForestRegressor(max_depth=5, n_estimators=500, verbose=0),
    AdaBoostClassifier(**PARAMS_ADABOOST),
    AdaBoostRegressor(**PARAMS_ADABOOST),
    LogisticRegression(max_iter=10000, verbose=0),
    TheilSenRegressor(verbose=False),
    HuberRegressor(),
    CatBoostClassifier(**PARAMS_CATBOOST),
    CatBoostRegressor(**PARAMS_CATBOOST_REGRESSOR),
]
Code Example #18
print(' %7s   %17.4f   %15.4f ' %
      ('MedAE:', round(medae_in, 2), round(medae_out, 2)))
print(' %7s   %17.4f   %15.4f ' %
      ('MSLE:', round(msle_in, 2), round(msle_out, 2)))
print(' %7s   %17.4f   %15.4f ' % ('rmspe', rmspe_in, rmspe_out))

#----------------------------------------------------------------------
# Train and test a HistGradientBoosting regressor
#----------------------------------------------------------------------

print(' ')
print(' HIST GRADIENT BOOSTING REGRESSOR:')
print(' ')

hgb = HistGradientBoostingRegressor(l2_regularization=12.0,
                                    max_iter=70,
                                    learning_rate=0.1,
                                    loss='least_absolute_deviation')

hgb = hgb.fit(x_treino, y_treino)

y_resposta_treino = hgb.predict(x_treino)
y_resposta_teste = hgb.predict(x_teste)

print('  Metric   IN-sample           OUT-of-sample   ')
print(' -------   -----------------   --------------- ')

mse_in = mean_squared_error(y_treino, y_resposta_treino)
rmse_in = math.sqrt(mse_in)
r2_in = r2_score(y_treino, y_resposta_treino)
medae_in = median_absolute_error(y_treino, y_resposta_treino)
msle_in = mean_squared_log_error(y_treino, y_resposta_treino)
Code Example #19
def create_estimators(
        preprocessing: ColumnTransformer) -> Dict[str, GridSearchCV]:
    """Create estimators for model fitting.

    ElasticNet, HistGradientBoost and RandomForest regressors are added. 
    More can be added.

    Args:
        preprocessing: Preprocessing Pipeline

    Returns:
        Key-value pairs of estimator name and instantiated estimator
    """
    # ElasticNet (alpha = 1.0 -> Lasso)
    param_grid = {
        "regressor__regressor__alpha": (0.001, 0.01, 0.1, 1.0),
        "regressor__regressor__l1_ratio": (0.05, 0.2, 0.5, 0.7, 0.9, 1.0),
    }
    en_pipe = TransformedTargetRegressor(
        regressor=Pipeline([("preprocessor", preprocessing),
                            ("regressor", ElasticNet())]),
        transformer=StandardScaler(),
    )
    en_search = GridSearchCV(
        en_pipe,
        param_grid=param_grid,
        cv=5,
    )

    # RandomForest
    param_grid = {
        "regressor__regressor__n_estimators": [50, 100, 200],
        "regressor__regressor__max_depth": [5, 6, 7, 15],
    }
    rf_pipe = TransformedTargetRegressor(
        regressor=Pipeline([("preprocessor", preprocessing),
                            ("regressor", RandomForestRegressor())]),
        transformer=StandardScaler(),
    )
    rf_search = GridSearchCV(
        rf_pipe,
        param_grid=param_grid,
        cv=5,
    )

    # HistGradientBoost
    param_grid = {
        "regressor__regressor__l2_regularization": [0.0, 0.1, 1.0],
        "regressor__regressor__max_depth": [6, 15],
        "regressor__regressor__max_iter": [100, 200],
    }
    hgb_pipe = TransformedTargetRegressor(
        regressor=Pipeline([
            ("preprocessor", preprocessing),
            ("regressor", HistGradientBoostingRegressor()),
        ]),
        transformer=StandardScaler(),
    )
    hgb_search = GridSearchCV(
        hgb_pipe,
        param_grid=param_grid,
        cv=5,
    )

    return {
        "ElasticNet": en_search,
        "RandomForest": rf_search,
        "HistGradientBoost": hgb_search,
    }
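
# Hypothetical usage of create_estimators(); `preprocessing`, `X_train` and
# `y_train` are assumed to be defined elsewhere in the project:
searches = create_estimators(preprocessing)
for name, search in searches.items():
    search.fit(X_train, y_train)
    print(name, search.best_params_, round(search.best_score_, 3))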
Code Example #20
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)

n_samples = 5000
f_0 = rng.rand(n_samples)  # positive correlation with y
f_1 = rng.rand(n_samples)  # negative correlation with y
X = np.c_[f_0, f_1]
noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
y = 5 * f_0 + np.sin(10 * np.pi * f_0) - 5 * f_1 - np.cos(
    10 * np.pi * f_1) + noise

fig, ax = plt.subplots()

# Without any constraint
gbdt = HistGradientBoostingRegressor()
gbdt.fit(X, y)
disp = PartialDependenceDisplay.from_estimator(
    gbdt,
    X,
    features=[0, 1],
    line_kw={
        "linewidth": 4,
        "label": "unconstrained",
        "color": "tab:blue"
    },
    ax=ax,
)

# With positive and negative constraints
gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1])
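
# The snippet ends here; a plausible continuation (a sketch, not the original
# code) would fit the constrained model and overlay its partial dependence on
# the same axes as the unconstrained one:
gbdt.fit(X, y)
PartialDependenceDisplay.from_estimator(
    gbdt,
    X,
    features=[0, 1],
    line_kw={
        "linewidth": 4,
        "label": "constrained",
        "color": "tab:orange",
    },
    ax=disp.axes_,
)
plt.show()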
Code Example #21
def test_absolute_error():
    # For coverage only.
    X, y = make_regression(n_samples=500, random_state=0)
    gbdt = HistGradientBoostingRegressor(loss="absolute_error", random_state=0)
    gbdt.fit(X, y)
    assert gbdt.score(X, y) > 0.9
Code Example #22
# preprocessed output from the 3 learners.

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV

lasso_pipeline = make_pipeline(processor_lin, LassoCV())

rf_pipeline = make_pipeline(processor_nlin,
                            RandomForestRegressor(random_state=42))

gradient_pipeline = make_pipeline(
    processor_nlin, HistGradientBoostingRegressor(random_state=0))

estimators = [('Random Forest', rf_pipeline), ('Lasso', lasso_pipeline),
              ('Gradient Boosting', gradient_pipeline)]

stacking_regressor = StackingRegressor(estimators=estimators,
                                       final_estimator=RidgeCV())

# %%
# Measure and plot the results
##############################################################################
#
# Now we can use the Ames Housing dataset to make the predictions. We check the
# performance of each individual predictor as well as of the stack of the
# regressors.
#
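
# A brief sketch of one way to measure and plot the results, assuming `X` and
# `y` hold the Ames housing features and target loaded earlier in the example:
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict, cross_validate

scores = cross_validate(stacking_regressor, X, y,
                        scoring=['r2', 'neg_mean_absolute_error'], n_jobs=-1)
print(f"Stacking regressor: R2 = {scores['test_r2'].mean():.2f}")

y_pred = cross_val_predict(stacking_regressor, X, y, n_jobs=-1)
plt.scatter(y, y_pred, s=5, alpha=0.5)
plt.xlabel('Observed sale price')
plt.ylabel('Cross-validated prediction')
plt.show()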
Code Example #23
def test_missing_values_minmax_imputation():
    # Compare the built-in missing value handling of Histogram GBC with an
    # a priori missing value imputation strategy that should yield the same
    # results in terms of decision function.
    #
    # Each feature (containing NaNs) is replaced by 2 features:
    # - one where the nans are replaced by min(feature) - 1
    # - one where the nans are replaced by max(feature) + 1
    # A split where nans go to the left has an equivalent split in the
    # first (min) feature, and a split where nans go to the right has an
    # equivalent split in the second (max) feature.
    #
    # Assuming the data is such that there is never a tie to select the best
    # feature to split on during training, the learned decision trees should be
    # strictly equivalent (learn a sequence of splits that encode the same
    # decision function).
    #
    # The MinMaxImputer transformer is meant to be a toy implementation of the
    # "Missing In Attributes" (MIA) missing value handling for decision trees
    # https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305
    # The implementation of MIA as an imputation transformer was suggested by
    # "Remark 3" in https://arxiv.org/abs/1902.06931

    class MinMaxImputer(TransformerMixin, BaseEstimator):
        def fit(self, X, y=None):
            mm = MinMaxScaler().fit(X)
            self.data_min_ = mm.data_min_
            self.data_max_ = mm.data_max_
            return self

        def transform(self, X):
            X_min, X_max = X.copy(), X.copy()

            for feature_idx in range(X.shape[1]):
                nan_mask = np.isnan(X[:, feature_idx])
                X_min[nan_mask, feature_idx] = self.data_min_[feature_idx] - 1
                X_max[nan_mask, feature_idx] = self.data_max_[feature_idx] + 1

            return np.concatenate([X_min, X_max], axis=1)

    def make_missing_value_data(n_samples=int(1e4), seed=0):
        rng = np.random.RandomState(seed)
        X, y = make_regression(n_samples=n_samples,
                               n_features=4,
                               random_state=rng)

        # Pre-bin the data to ensure a deterministic handling by the 2
        # strategies and also make it easier to insert np.nan in a structured
        # way:
        X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X)

        # First feature has missing values completely at random:
        rnd_mask = rng.rand(X.shape[0]) > 0.9
        X[rnd_mask, 0] = np.nan

        # Second and third features have missing values for extreme values
        # (censoring missingness):
        low_mask = X[:, 1] == 0
        X[low_mask, 1] = np.nan

        high_mask = X[:, 2] == X[:, 2].max()
        X[high_mask, 2] = np.nan

        # Make the last feature nan pattern very informative:
        y_max = np.percentile(y, 70)
        y_max_mask = y >= y_max
        y[y_max_mask] = y_max
        X[y_max_mask, 3] = np.nan

        # Check that there is at least one missing value in each feature:
        for feature_idx in range(X.shape[1]):
            assert any(np.isnan(X[:, feature_idx]))

        # Let's use a test set to check that the learned decision function is
        # the same as evaluated on unseen data. Otherwise it could just be the
        # case that we find two independent ways to overfit the training set.
        return train_test_split(X, y, random_state=rng)

    # n_samples needs to be large enough to minimize the likelihood of having
    # several candidate splits with the same gain value in a given tree.
    X_train, X_test, y_train, y_test = make_missing_value_data(
        n_samples=int(1e4), seed=0)

    # Use a small number of leaf nodes and iterations so as to keep
    # under-fitting models to minimize the likelihood of ties when training the
    # model.
    gbm1 = HistGradientBoostingRegressor(max_iter=100,
                                         max_leaf_nodes=5,
                                         random_state=0)
    gbm1.fit(X_train, y_train)

    gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1))
    gbm2.fit(X_train, y_train)

    # Check that the models reach the same score:
    assert gbm1.score(X_train,
                      y_train) == pytest.approx(gbm2.score(X_train, y_train))

    assert gbm1.score(X_test,
                      y_test) == pytest.approx(gbm2.score(X_test, y_test))

    # Check the individual prediction match as a finer grained
    # decision function check.
    assert_allclose(gbm1.predict(X_train), gbm2.predict(X_train))
    assert_allclose(gbm1.predict(X_test), gbm2.predict(X_test))
Code Example #24
    # parameters = {
    #     'learning_rate':[0.05, 0.10, 0.15, 0.20],
    #     'max_depth':[3, 4, 5, 6, 7, 8, 9], 
    #     'l2_regularization':[0.0, 0.0001, 0.001, 0.01, 0.1]
    # }

    # model = HistGradientBoostingClassifier(max_iter=300)

    # GDSCV = GridSearchCV(estimator=model, param_grid=parameters, cv=5, scoring='r2', n_jobs=-1)
    # GDSCV.fit(X_train, y_train)
    # print(GDSCV.best_params_)

    # cv_score = cross_val_score(model, X_train, y_train, cv=5, scoring='r2', n_jobs=-1)
    # print("Cross-validation score is {score:.3f},"
    #     " standard deviation is {err:.3f}"
    #     .format(score = cv_score.mean(), err = cv_score.std()))

    model = HistGradientBoostingRegressor(max_iter=300, l2_regularization=params[i][0], learning_rate=params[i][1], max_depth=int(params[i][2]))

    model = model.fit(X_train, y_train)
    y_pred_label = model.predict(X_test)
    y_pred[label] = y_pred_label

    print(label, ': finished! \n')

df_pred = pd.DataFrame(y_pred)

compression_options = dict(method='zip', archive_name='prediction.csv')
df_pred.to_csv('prediction.zip', index=False, float_format='%.3f', compression=compression_options)

print('All finished!')
Code Example #25
     (1, (0.05, 0.95), "'grid_resolution' must be strictly greater than 1")]
)
def test_grid_from_X_error(grid_resolution, percentiles, err_msg):
    X = np.asarray([[1, 2], [3, 4]])
    with pytest.raises(ValueError, match=err_msg):
        _grid_from_X(
            X, grid_resolution=grid_resolution, percentiles=percentiles
        )


@pytest.mark.parametrize('target_feature', range(5))
@pytest.mark.parametrize('est, method', [
    (LinearRegression(), 'brute'),
    (GradientBoostingRegressor(random_state=0), 'brute'),
    (GradientBoostingRegressor(random_state=0), 'recursion'),
    (HistGradientBoostingRegressor(random_state=0), 'brute'),
    (HistGradientBoostingRegressor(random_state=0), 'recursion')]
)
def test_partial_dependence_helpers(est, method, target_feature):
    # Check that what is returned by _partial_dependence_brute or
    # _partial_dependence_recursion is equivalent to manually setting a target
    # feature to a given value, and computing the average prediction over all
    # samples.
    # This also checks that the brute and recursion methods give the same
    # output.
    # Note that even on the trainset, the brute and the recursion methods
    # aren't always strictly equivalent, in particular when the slow method
    # generates unrealistic samples that have low mass in the joint
    # distribution of the input features, and when some of the features are
    # dependent. Hence the high tolerance on the checks.
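
    # As an illustrative sketch (not the original test body), the "manual"
    # average prediction the comment describes can be computed like this:
    def manual_average_prediction(estimator, X, feature_idx, value):
        X_mod = X.copy()
        X_mod[:, feature_idx] = value  # force the target feature to `value`
        return estimator.predict(X_mod).mean()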
Code Example #26
#####################################
# Available optimisations on this machine.

from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation

print(code_optimisation())

########################################
# Training and converting a model
# +++++++++++++++++++++++++++++++

data = make_regression(50000, 20)
X, y = data
X_train, X_test, y_train, y_test = train_test_split(X, y)

hgb = HistGradientBoostingRegressor(max_iter=100, max_depth=6)
hgb.fit(X_train, y_train)
print(hgb)

########################################
# Let's get more statistics about the model itself.
pprint(analyze_model(hgb))

#################################
# And let's convert it.

register_rewritten_operators()
onx = to_onnx(hgb, X_train[:1].astype(numpy.float32))
oinf = OnnxInference(onx, runtime='python_compiled')
print(oinf)
Code Example #27
from sklearn.inspection import plot_partial_dependence
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.datasets import fetch_california_housing

df = fetch_california_housing(as_frame=True)
print(df)
x = df['data']
y = df['target']

est = HistGradientBoostingRegressor().fit(x, y)
est.score(x, y)
features = ['MedInc', 'HouseAge', ['MedInc', 'HouseAge']]
plot_partial_dependence(est, x, features=features)
"""We can clearly see an interaction between the two features: 
for an Median income > 4.5, the House price is Dependent on HouseAge, 
for MedIncome  <  4.5 NO STRONG depedence HousePricing and HouseAge.
Makes sense ! Many Rich people Create more brand new Houses so they affect the House prices.
if we lived in Sao Paolo the very low income will not affect the  Relationship(Prices, Population)
because everyone is poor. 
"""

from matplotlib import pyplot as plt
plt.gca()
plt.show()
"""Disadvantages of PDP:
- The realistic maximum number of features in a partial dependence function is two.
- The assumption of independence is the biggest issue with PD plots. It is assumed that the feature(s) for 
which the partial dependence is computed are not correlated with other features.
One solution to this problem is Accumulated Local Effect plots or short ALE plots 
that work with the conditional instead of the marginal distribution.
-By plotting the individual conditional expectation curves instead of the aggregated line,
Code Example #28
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

hist = HistGradientBoostingRegressor(random_state=42)
hist.fit(X_train, y_train)
hist_pred = hist.predict(X_test)

compute_metrics(y_test, hist_pred)

hist_poisson = HistGradientBoostingRegressor(loss='poisson', random_state=42)
hist_poisson.fit(X_train, y_train)

hist_poisson_pred = hist_poisson.predict(X_test)

compute_metrics(y_test, hist_poisson_pred)

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 6), sharey=True)
ax1.hist(y_test, bins=30, alpha=0.5)
ax1.set_title("Test data")
ax2.hist(hist_pred, bins=30, alpha=0.5)
ax2.set_title("Default Hist")
ax3.hist(hist_poisson_pred, bins=30, alpha=0.5)
ax3.set_title("Poisson Hist");
Code Example #29
def pick_algorithm():
    # Choose the right model
    pipelines = []
    pipelines.append(('ScaledLR',
                      Pipeline([('Scaler', StandardScaler()),
                                ('LR', LinearRegression())])))
    pipelines.append(('ScaledSGDR',
                      Pipeline([('Scaler', StandardScaler()),
                                ('SGDR', SGDRegressor())])))
    pipelines.append(('ScaledHR',
                      Pipeline([('Scaler', StandardScaler()),
                                ('HR', HuberRegressor())])))
    pipelines.append(('ScaledLARS',
                      Pipeline([('Scaler', StandardScaler()),
                                ('LARS', Lars())])))
    pipelines.append(('ScaledLL',
                      Pipeline([('Scaler', StandardScaler()),
                                ('LL', LassoLars())])))
    pipelines.append(('ScaledORP',
                      Pipeline([('Scaler', StandardScaler()),
                                ('ORP', OrthogonalMatchingPursuit())])))
    pipelines.append(('ScaledPAR',
                      Pipeline([('Scaler', StandardScaler()),
                                ('PAR', PassiveAggressiveRegressor())])))
    pipelines.append(('ScaledBR',
                      Pipeline([('Scaler', StandardScaler()),
                                ('BR', BayesianRidge())])))
    pipelines.append(('ScaledRIDGE',
                      Pipeline([('Scaler', StandardScaler()),
                                ('RIDGE', Ridge())])))
    pipelines.append(('ScaledLASSO',
                      Pipeline([('Scaler', StandardScaler()),
                                ('LASSO', Lasso())])))
    pipelines.append(('ScaledEN',
                      Pipeline([('Scaler', StandardScaler()),
                                ('EN', ElasticNet())])))
    pipelines.append(('ScaledKNN',
                      Pipeline([('Scaler', StandardScaler()),
                                ('KNN', KNeighborsRegressor())])))
    pipelines.append(('ScaledCART',
                      Pipeline([('Scaler', StandardScaler()),
                                ('CART', DecisionTreeRegressor())])))
    pipelines.append(('ScaledGBM',
                      Pipeline([('Scaler', StandardScaler()),
                                ('GBM', GradientBoostingRegressor())])))
    pipelines.append(('ScaledABR',
                      Pipeline([('Scaler', StandardScaler()),
                                ('ABR', AdaBoostRegressor())])))
    pipelines.append(('ScaledBAR',
                      Pipeline([('Scaler', StandardScaler()),
                                ('BAR', BaggingRegressor())])))
    pipelines.append(('ScaledHGBR',
                      Pipeline([('Scaler', StandardScaler()),
                                ('HGBR', HistGradientBoostingRegressor())])))

    results = []
    names = []
    for name, model in pipelines:
        kfold = KFold(n_splits=10, random_state=args.seed)
        cv_results = cross_val_score(model,
                                     xtrain,
                                     ytrain,
                                     cv=kfold,
                                     scoring='neg_mean_squared_error')
        results.append(cv_results)
        names.append(name)
        msg = f"{name}: {cv_results.mean()}"
        print(msg)
Code Example #30
File: globals.py  Project: john-james-sf/Ames
#                                ESTIMATORS                                   #
# =========================================================================== #
regressors = {}
regressors.update({"Linear Regression": LinearRegression()})
regressors.update({"Lasso": Lasso()})
regressors.update({"Ridge": Ridge()})
regressors.update({"ElasticNet": ElasticNet()})

ensembles = {}
ensembles.update({"AdaBoost": AdaBoostRegressor()})
ensembles.update({"Bagging": BaggingRegressor()})
ensembles.update({"Extra Trees": ExtraTreesRegressor()})
ensembles.update({"Gradient Boosting": GradientBoostingRegressor()})
ensembles.update({"Random Forest": RandomForestRegressor()})
ensembles.update(
    {"Histogram Gradient Boosting": HistGradientBoostingRegressor()})

# =========================================================================== #
#                             HYPERPARAMETERS                                 #
# =========================================================================== #
# Parameter Grid
regressor_parameters = {}
regressor_parameters.update(
    {"Linear Regression": {
        "estimator__normalize": [False]
    }})
regressor_parameters.update({
    "Lasso": {
        "estimator__alpha":
        [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.25, 0.50, 0.75, 1.0],
        "estimator__n_jobs": [-1]