# Assumed imports for the dataset tests below; GradientBoostingRegressor is
# taken to come from a pygbm-style library that supports multi-output
# regression via the `predict_multi` method used throughout these tests.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from pygbm import GradientBoostingRegressor


def test_scm1d():
    df = pd.read_csv('scm1d.csv')
    target = df.loc[:, df.columns.str.contains('L')]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=1658.0 /
                                                        8145.0,
                                                        random_state=42,
                                                        shuffle=True)
    gb = GradientBoostingRegressor(
        l2_regularization=0.07054193143238725,
        min_samples_leaf=23,
        learning_rate=0.12336530854190006,
        max_iter=1999,
        n_iter_no_change=None,
    )
    # scaler = StandardScaler()
    # scaler.fit(X_train)
    # train_scaled = scaler.transform(X_train)
    # test_scaled = scaler.transform(X_test)
    # scalery = StandardScaler()
    # scalery.fit(y_train)
    # y_Train = scalery.transform(y_train)
    # y_Test = scalery.transform(y_test)
    gb.fit(X_train, y_train)
    y_preds = gb.predict_multi(X_test)
    r2 = r2_score(y_test, y_preds, multioutput='uniform_average')
    print(r2)
def test_atp1d():
    df = pd.read_csv('atp1d.csv')
    target = df.loc[:, df.columns.str.startswith('LBL')]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=0.5,
                                                        random_state=42,
                                                        shuffle=True)
    gb = GradientBoostingRegressor(l2_regularization=0.003391634274257872,
                                   min_samples_leaf=10,
                                   learning_rate=0.1088115324113492,
                                   max_iter=199,
                                   n_iter_no_change=20)
    scaler = StandardScaler()
    scaler.fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)
    scalery = StandardScaler()
    scalery.fit(y_train)
    y_Train = scalery.transform(y_train)
    y_Test = scalery.transform(y_test)
    gb.fit(train_scaled, y_Train)
    y_preds = gb.predict_multi(test_scaled)
    r2 = r2_score(y_Test, y_preds, multioutput='uniform_average')
    print(r2)
def test_edm():
    df = pd.read_csv('edm.csv')
    target = df.loc[:, ['DFlow', 'DGap']]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=0.5,
                                                        random_state=42,
                                                        shuffle=True)
    gb = GradientBoostingRegressor(
        l2_regularization=0.880826520747869,
        min_samples_leaf=12,
        learning_rate=0.22445307581959334,
        max_iter=279,
        n_iter_no_change=23,
    )
    scaler = StandardScaler()
    scaler.fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)
    scalery = StandardScaler()
    scalery.fit(y_train)
    y_Train = scalery.transform(y_train)
    y_Test = scalery.transform(y_test)
    gb.fit(train_scaled, y_Train)
    y_preds = gb.predict_multi(test_scaled)
    r2 = r2_score(y_Test, y_preds, multioutput='uniform_average')
    print(r2)
def test_wq():
    df = pd.read_csv('water-quality.csv')
    target = df.loc[:, df.columns.str.startswith('x')]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=0.5,
                                                        random_state=42,
                                                        shuffle=True)
    gb = GradientBoostingRegressor(l2_regularization=0.07509314619453317,
                                   min_samples_leaf=15,
                                   learning_rate=0.01948991297099692,
                                   max_iter=300,
                                   n_iter_no_change=17)
    scaler = StandardScaler()
    scaler.fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)
    scalery = StandardScaler()
    scalery.fit(y_train)
    y_Train = scalery.transform(y_train)
    y_Test = scalery.transform(y_test)
    gb.fit(train_scaled, y_Train)
    y_preds = gb.predict_multi(test_scaled)
    r2 = r2_score(y_Test, y_preds, multioutput='uniform_average')
    print(r2)
def test_scm20d():
    df = pd.read_csv('scm20d.csv')
    target = df.loc[:, df.columns.str.contains('L')]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    c = 1503.0 / 7463.0
    X_train, X_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=c,
                                                        random_state=42,
                                                        shuffle=True)
    gb = GradientBoostingRegressor(
        l2_regularization=0.8640187696889217,
        min_samples_leaf=19,
        learning_rate=0.1164232801613771,
        max_iter=1998,
        n_iter_no_change=None,
    )
    scaler = StandardScaler()
    scaler.fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)
    scalery = StandardScaler()
    scalery.fit(y_train)
    y_Train = scalery.transform(y_train)
    y_Test = scalery.transform(y_test)
    gb.fit(train_scaled, y_Train)
    y_preds = gb.predict_multi(test_scaled)
    r2 = r2_score(y_Test, y_preds, multioutput='uniform_average')
    print(r2)
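The five dataset tests above repeat the same scale/fit/predict/score steps. Below is a minimal sketch of how that shared pattern could be factored into a helper; the helper name is hypothetical, and `predict_multi` is used exactly as in the tests above:

def _scale_fit_and_score(gb, X_train, X_test, y_train, y_test):
    # Standardize features and (multi-output) targets using the training split only.
    x_scaler = StandardScaler().fit(X_train)
    y_scaler = StandardScaler().fit(y_train)
    gb.fit(x_scaler.transform(X_train), y_scaler.transform(y_train))
    # Multi-output prediction, scored with an averaged R2 as in the tests above.
    y_preds = gb.predict_multi(x_scaler.transform(X_test))
    return r2_score(y_scaler.transform(y_test), y_preds,
                    multioutput='uniform_average')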
Example #6
def test_early_stopping_regression(scoring, validation_split, tol):

    max_iter = 500
    n_iter_no_change = 5

    X, y = make_regression(random_state=0)

    gb = GradientBoostingRegressor(
        verbose=1,  # just for coverage
        scoring=scoring,
        tol=tol,
        validation_split=validation_split,
        max_iter=max_iter,
        n_iter_no_change=n_iter_no_change,
        random_state=0)
    gb.fit(X, y)

    if scoring is not None:
        assert n_iter_no_change <= gb.n_iter_ < max_iter
    else:
        assert gb.n_iter_ == max_iter
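test_early_stopping_regression expects its scoring, validation_split, and tol arguments from a pytest parametrization that is not shown in this listing. A hedged sketch of such a decorator follows; the value combinations are illustrative assumptions, not the original suite's grid:

import pytest

@pytest.mark.parametrize('scoring, validation_split, tol', [
    ('neg_mean_squared_error', 0.1, 1e-7),   # scorer on a held-out validation split
    ('neg_mean_squared_error', None, 1e-1),  # scorer evaluated on the training data
    (None, 0.1, 1e-7),                       # built-in loss on a held-out split
    (None, None, 1e-1),                      # built-in loss on the training data
])
def test_early_stopping_regression_cases(scoring, validation_split, tol):
    # Delegates to the test body defined above.
    test_early_stopping_regression(scoring, validation_split, tol)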
Example #7
def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
                                     max_leaf_nodes):
    # Make sure pygbm has the same predictions as LGBM for easy targets.
    #
    # In particular, when the size of the trees is bounded and the number of
    # samples is large enough, the structure of the prediction trees found by
    # LightGBM and PyGBM should be exactly identical.
    #
    # Notes:
    # - Several candidate splits may have equal gains when the number of
    #   samples in a node is low (and because of float errors). Therefore the
    #   predictions on the test set might differ if the structure of the tree
    #   is not exactly the same. To avoid this issue we only compare the
    #   predictions on the test set when the number of samples is large enough
    #   and max_leaf_nodes is low enough.
    # - To avoid discrepancies caused by small differences in the binning
    #   strategy, the data is pre-binned when n_samples > 255.

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256

    X, y = make_regression(n_samples=n_samples,
                           n_features=5,
                           n_informative=5,
                           random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_pygbm = GradientBoostingRegressor(max_iter=max_iter,
                                          max_bins=max_bins,
                                          learning_rate=1,
                                          n_iter_no_change=None,
                                          min_samples_leaf=min_samples_leaf,
                                          max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    # less than 1% of the predictions are different up to the 3rd decimal
    assert np.mean(abs(pred_lgbm - pred_pygbm) > 1e-3) < .011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        # less than 1% of the predictions are different up to the 4th decimal
        assert np.mean(abs(pred_lgbm - pred_pygbm) > 1e-4) < .01
Example #8
def test_pre_binned_data():
    # Make sure that:
    # - training on numerical data and predicting on numerical data is the
    #   same as training on binned data and predicting on binned data
    # - training on numerical data and predicting on numerical data is the
    #   same as training on numerical data and predicting on binned data
    # - training on binned data and predicting on numerical data is not
    #   possible.

    X, y = make_regression(random_state=0)
    gbdt = GradientBoostingRegressor(scoring=None, random_state=0)
    mapper = BinMapper(random_state=0)
    X_binned = mapper.fit_transform(X)

    fit_num_pred_num = gbdt.fit(X, y).predict(X)
    fit_binned_pred_binned = gbdt.fit(X_binned, y).predict(X_binned)
    fit_num_pred_binned = gbdt.fit(X, y).predict(X_binned)

    assert_allclose(fit_num_pred_num, fit_binned_pred_binned)
    assert_allclose(fit_num_pred_num, fit_num_pred_binned)

    assert_raises_regex(ValueError,
                        'This estimator was fitted with pre-binned data ',
                        gbdt.fit(X_binned, y).predict, X)
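As the earlier comment about astype(np.float32) suggests, the pre-binned vs. numerical distinction hinges on the dtype that BinMapper produces. A small follow-up sketch, reusing only names that already appear above, makes the difference visible:

X, y = make_regression(random_state=0)
X_binned = BinMapper(random_state=0).fit_transform(X)
# Binned data comes back as small integer bin indices, while the raw X stays
# floating point; that integer dtype is what the estimator appears to key on
# when deciding that the input is pre-binned.
print(X.dtype, X_binned.dtype)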
Example #9
"""This example illustrates the use of scikit-learn's GridSearchCV.

The grid search is used to determine the best learning rate."""

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from pygbm import GradientBoostingRegressor

rng = np.random.RandomState(0)

n_samples = int(1e6)
X, y = make_regression(n_samples, random_state=rng)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

clf = GradientBoostingRegressor(max_iter=10,
                                scoring=None,
                                verbose=1,
                                random_state=rng)
param_grid = {'learning_rate': [1, .1, .01, .001]}
cv = KFold(n_splits=3, shuffle=True, random_state=rng)  # shuffle is needed for random_state to take effect
gs = GridSearchCV(clf, param_grid=param_grid, cv=cv)
gs.fit(X_train, y_train)

print(f'Best param: {gs.best_params_}')
print(f'R2 coefficient: {gs.score(X_test, y_test)}')
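Beyond the best parameter, GridSearchCV records a score for every candidate; a short follow-up sketch using its standard cv_results_ attribute lists the mean cross-validated score for each learning rate tried above:

for lr, score in zip(gs.cv_results_['param_learning_rate'],
                     gs.cv_results_['mean_test_score']):
    print(f'learning_rate={lr}: mean CV score={score:.4f}')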
Example #10
"""This example illustrates the use of scikit-learn's GridSearchCV.

The grid search is used to determine the best learning rate."""

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from pygbm import GradientBoostingRegressor

rng = np.random.RandomState(0)

n_samples = int(1e6)
X, y = make_regression(n_samples, random_state=rng)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

clf = GradientBoostingRegressor(max_iter=10,
                                n_iter_no_change=None,
                                verbose=1,
                                random_state=rng)
param_grid = {'learning_rate': [1, .1, .01, .001]}
cv = KFold(n_splits=3, shuffle=True, random_state=rng)  # shuffle is needed for random_state to take effect
gs = GridSearchCV(clf, param_grid=param_grid, cv=cv)
gs.fit(X_train, y_train)

print(f'Best param: {gs.best_params_}')
print(f'R2 coefficient: {gs.score(X_test, y_test)}')
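This second script differs from the previous one only in how early stopping is switched off: n_iter_no_change=None here versus scoring=None above. Judging by test_early_stopping_regression earlier in this listing, either setting leaves the model training for the full max_iter iterations.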