Example 1
def test_plot_grower(tmpdir, classification_data):
    pytest.importorskip('graphviz')
    from pygbm.plotting import plot_tree

    dataset = Dataset(classification_data[0], classification_data[1])
    n_trees_per_iteration = 1
    loss = BinaryCrossEntropy()

    clf = GradientBoostingClassifier()

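    # Reproduce the gradient/hessian initialization that fit() would normally
    # perform, so that the grower can be exercised in isolation.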
    gradients, hessians = loss.init_gradients_and_hessians(
        n_samples=dataset.shape[0], prediction_dim=n_trees_per_iteration)
    y = clf._encode_y(dataset.y)
    baseline_prediction_ = loss.get_baseline_prediction(y, 1)
    raw_predictions = np.zeros(shape=(dataset.shape[0], n_trees_per_iteration),
                               dtype=baseline_prediction_.dtype)
    raw_predictions += baseline_prediction_

    loss.update_gradients_and_hessians(gradients, hessians, y, raw_predictions)

    options = OptionSet(clf.parameter_dict)
    options['max_leaf_nodes'] = 5

    grower = TreeGrower(dataset, gradients, hessians, options)
    grower.grow()
    filename = tmpdir.join('plot_grower.pdf')
    plot_tree(grower, view=False, filename=filename)
    assert filename.exists()
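The test above relies on pytest's built-in tmpdir fixture and on a classification_data fixture that is not shown in the snippet. A minimal sketch of what such a fixture could look like, assuming scikit-learn's make_classification as the data source (the fixture body is an illustration, not the original one):

import pytest
from sklearn.datasets import make_classification


@pytest.fixture
def classification_data():
    # Hypothetical fixture: a small binary classification problem returned
    # as an (X, y) tuple, matching how the test above indexes it.
    return make_classification(n_samples=150, n_features=5,
                               n_informative=3, n_redundant=0,
                               random_state=0)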
Example 2
def test_early_stopping_classification(data, scoring, validation_split,
                                       n_iter_no_change, tol):

    max_iter = 500

    X, y = data
    if validation_split is not None:
        X, X_test, y, y_test = train_test_split(X,
                                                y,
                                                test_size=validation_split,
                                                random_state=42)
        eval_set = (X_test, y_test)
    else:
        eval_set = None

    gb = GradientBoostingClassifier(
        verbose=True,  # just for coverage
        scoring=scoring,
        tol=tol,
        max_iter=max_iter,
        n_iter_no_change=n_iter_no_change,
        random_state=0)
    gb.fit(X, y, eval_set=eval_set)

    if n_iter_no_change != -1:
        assert n_iter_no_change <= gb.n_iter_ < max_iter
    else:
        assert gb.n_iter_ == max_iter
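The test above takes scoring, validation_split, n_iter_no_change and tol from a pytest parametrization that is not included in the snippet (data is assumed to be a fixture or parameter yielding an (X, y) pair). A plausible decorator, with illustrative values only:

@pytest.mark.parametrize(
    'scoring, validation_split, n_iter_no_change, tol', [
        ('accuracy', .1, 5, 1e-7),    # scorer on a held-out validation set
        ('accuracy', None, 5, 1e-7),  # scorer on the training data
        (None, .1, 5, 1e-7),          # loss on a held-out validation set
        (None, None, -1, 1e-7),       # n_iter_no_change=-1: no early stopping
    ])
def test_early_stopping_classification(data, scoring, validation_split,
                                       n_iter_no_change, tol):
    ...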
Example 3
def test_plot_estimator_and_lightgbm(tmpdir):
    pytest.importorskip('graphviz')
    lightgbm = pytest.importorskip('lightgbm')
    from pygbm.plotting import plot_tree

    n_classes = 3
    X, y = make_classification(n_samples=150,
                               n_classes=n_classes,
                               n_features=5,
                               n_informative=3,
                               n_redundant=0,
                               random_state=0)

    n_trees = 3
    est_pygbm = GradientBoostingClassifier(max_iter=n_trees,
                                           n_iter_no_change=None)
    est_pygbm.fit(X, y)
    est_lightgbm = lightgbm.LGBMClassifier(n_estimators=n_trees)
    est_lightgbm.fit(X, y)

    n_total_trees = n_trees * n_classes
    for i in range(n_total_trees):
        filename = tmpdir.join('plot_mixed_predictors.pdf')
        plot_tree(est_pygbm,
                  est_lightgbm=est_lightgbm,
                  tree_index=i,
                  view=False,
                  filename=filename)
        assert filename.exists()
Example 4
def test_early_stopping_loss(n_samples, max_iter, n_iter_no_change, tree_type):
    # Make sure that when scoring is None, early stopping is done w.r.t. the
    # loss. Using scoring='neg_log_loss' and scoring=None should be
    # equivalent, since the loss is precisely the negative log-likelihood.

    X, y = make_classification(n_samples, random_state=0)

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

    clf_scoring = GradientBoostingClassifier(max_iter=max_iter,
                                             scoring='neg_log_loss',
                                             n_iter_no_change=n_iter_no_change,
                                             tol=1e-4,
                                             verbose=True,
                                             random_state=0,
                                             tree_type=tree_type)
    clf_scoring.fit(X, y, eval_set=(X_val, y_val))

    clf_loss = GradientBoostingClassifier(max_iter=max_iter,
                                          scoring=None,
                                          n_iter_no_change=n_iter_no_change,
                                          tol=1e-4,
                                          verbose=True,
                                          random_state=0,
                                          tree_type=tree_type)
    clf_loss.fit(X, y, eval_set=(X_val, y_val))

    assert n_iter_no_change < clf_loss.n_iter_ < max_iter
    assert clf_loss.n_iter_ == clf_scoring.n_iter_
Example 5
def test_early_stopping_loss():
    # Make sure that when scoring is None, early stopping is done w.r.t. the
    # loss. Using scoring='neg_log_loss' and scoring=None should be
    # equivalent, since the loss is precisely the negative log-likelihood.
    n_samples = int(1e3)
    max_iter = 100
    n_iter_no_change = 5

    X, y = make_classification(n_samples, random_state=0)

    clf_scoring = GradientBoostingClassifier(max_iter=max_iter,
                                             scoring='neg_log_loss',
                                             validation_split=.1,
                                             n_iter_no_change=n_iter_no_change,
                                             tol=1e-4,
                                             verbose=1,
                                             random_state=0)
    clf_scoring.fit(X, y)

    clf_loss = GradientBoostingClassifier(max_iter=max_iter,
                                          scoring=None,
                                          validation_split=.1,
                                          n_iter_no_change=n_iter_no_change,
                                          tol=1e-4,
                                          verbose=1,
                                          random_state=0)
    clf_loss.fit(X, y)

    assert n_iter_no_change < clf_loss.n_iter_ < max_iter
    assert clf_loss.n_iter_ == clf_scoring.n_iter_
Example 6
def test_one_sample_one_feature():
    # Until numba issue #3569 is fixed, we raise an informative error message
    # when X has only one sample or one feature in fit (it's OK in predict).
    # The array is both F and C contiguous, and numba can't compile.
    gb = GradientBoostingClassifier()
    for X, y in (([[1, 2]], [0]), ([[1], [2]], [0, 1])):
        assert_raises_regex(
            ValueError,
            'Passing only one sample or one feature is not supported yet.',
            gb.fit, X, y)
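assert_raises_regex is presumably imported from sklearn.utils.testing. For reference, an equivalent check written with plain pytest (same behaviour, different helper):

import pytest


def test_one_sample_one_feature():
    gb = GradientBoostingClassifier()
    for X, y in (([[1, 2]], [0]), ([[1], [2]], [0, 1])):
        with pytest.raises(ValueError,
                           match='Passing only one sample or one feature is '
                                 'not supported yet.'):
            gb.fit(X, y)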
Example 7
def test_early_stopping_classification(data, scoring, validation_split, tol):

    max_iter = 500
    n_iter_no_change = 5

    X, y = data

    gb = GradientBoostingClassifier(
        verbose=1,  # just for coverage
        scoring=scoring,
        tol=tol,
        validation_split=validation_split,
        max_iter=max_iter,
        n_iter_no_change=n_iter_no_change,
        random_state=0)
    gb.fit(X, y)

    if scoring is not None:
        assert n_iter_no_change <= gb.n_iter_ < max_iter
    else:
        assert gb.n_iter_ == max_iter
Example 8
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    n_samples = n_samples
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples,
                               n_classes=2,
                               n_features=5,
                               n_informative=5,
                               n_redundant=0,
                               random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_pygbm = GradientBoostingClassifier(loss='binary_crossentropy',
                                           max_iter=max_iter,
                                           max_bins=max_bins,
                                           learning_rate=1,
                                           n_iter_no_change=None,
                                           min_samples_leaf=min_samples_leaf,
                                           max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    assert np.mean(pred_pygbm == pred_lightgbm) > .89

    acc_lgbm = accuracy_score(y_train, pred_lightgbm)
    acc_pygbm = accuracy_score(y_train, pred_pygbm)
    np.testing.assert_almost_equal(acc_lgbm, acc_pygbm)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        acc_lgbm = accuracy_score(y_test, pred_lightgbm)
        acc_pygbm = accuracy_score(y_test, pred_pygbm)
        np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)
Example 9
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    n_samples = n_samples
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples,
                               n_classes=2,
                               n_features=5,
                               n_informative=5,
                               n_redundant=0,
                               random_state=0)

    if n_samples > 255:
        X = BinMapper(max_bins=max_bins).fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_pygbm = GradientBoostingClassifier(loss='binary_crossentropy',
                                           max_iter=max_iter,
                                           max_bins=max_bins,
                                           learning_rate=1,
                                           validation_split=None,
                                           scoring=None,
                                           min_samples_leaf=min_samples_leaf,
                                           max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    assert np.mean(pred_pygbm == pred_lightgbm) > .89

    acc_lgbm = accuracy_score(y_train, pred_lightgbm)
    acc_pygbm = accuracy_score(y_train, pred_pygbm)
    np.testing.assert_almost_equal(acc_lgbm, acc_pygbm)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        acc_lgbm = accuracy_score(y_test, pred_lightgbm)
        acc_pygbm = accuracy_score(y_test, pred_pygbm)
        np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)
Example 10
def test_same_predictions_multiclass_classification(seed, min_samples_leaf,
                                                    n_samples, max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    n_samples = n_samples
    max_iter = 1
    max_bins = 256
    lr = 1

    X, y = make_classification(n_samples=n_samples,
                               n_classes=3,
                               n_features=5,
                               n_informative=5,
                               n_redundant=0,
                               n_clusters_per_class=1,
                               random_state=0)

    if n_samples > 255:
        X = BinMapper(max_bins=max_bins).fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_pygbm = GradientBoostingClassifier(loss='categorical_crossentropy',
                                           max_iter=max_iter,
                                           max_bins=max_bins,
                                           learning_rate=lr,
                                           validation_split=None,
                                           scoring=None,
                                           min_samples_leaf=min_samples_leaf,
                                           max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    assert np.mean(pred_pygbm == pred_lightgbm) > .89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_pygbm = est_pygbm.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_pygbm) < 1e-2) > .75

    acc_lgbm = accuracy_score(y_train, pred_lightgbm)
    acc_pygbm = accuracy_score(y_train, pred_pygbm)
    np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        proba_lightgbm = est_lightgbm.predict_proba(X_test)
        proba_pygbm = est_pygbm.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_pygbm) < 1e-2) > .75

        acc_lgbm = accuracy_score(y_test, pred_lightgbm)
        acc_pygbm = accuracy_score(y_test, pred_pygbm)
        np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)
Example 11
def should_stop(scores, n_iter_no_change, tol, tree_type):
    gbdt = GradientBoostingClassifier(n_iter_no_change=n_iter_no_change,
                                      tol=tol,
                                      tree_type=tree_type)
    gbdt._validate_parameters()
    return gbdt._should_stop(scores)
Example 12
        if (hasattr(check, 'func')
                and check.func is estimator_checks.check_classifiers_train):
            continue  # same, wrapped in a functools.partial object.

        try:
            check(name, estimator)
        except SkipTest as exception:
            # the only SkipTest thrown currently results from not
            # being able to import pandas.
            warnings.warn(str(exception), SkipTestWarning)


@pytest.mark.skipif(int(os.environ.get("NUMBA_DISABLE_JIT", 0)) == 1,
                    reason="Potentially long")
@pytest.mark.parametrize('Estimator', (
    GradientBoostingRegressor(),
    GradientBoostingClassifier(n_iter_no_change=None, min_samples_leaf=5),
))
def test_estimator_checks(Estimator):
    # Run the check_estimator() test suite on GBRegressor and GBClassifier.

    # Notes:
    # - Can't do early stopping with classifier because often
    #   validation_split=.1 leads to test_size=2 < n_classes and
    #   train_test_split raises an error.
    # - Also, need to set a low min_samples_leaf for
    #   check_classifiers_classes() to pass: with only 30 samples on the
    #   dataset, the root is never split with min_samples_leaf=20 and only the
    #   majority class is predicted.
    custom_check_estimator(Estimator)
Example 13
def should_stop(scores, n_iter_no_change, tol):
    gbdt = GradientBoostingClassifier(n_iter_no_change=n_iter_no_change,
                                      tol=tol)
    return gbdt._should_stop(scores)
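A hedged sketch of how this helper might be exercised; the score sequences and the expected outcomes are illustrative assumptions, not values taken from the original test suite:

def test_should_stop():
    # With only 4 scores recorded and n_iter_no_change=5, stopping is not
    # expected yet.
    assert not should_stop([1.0] * 4, n_iter_no_change=5, tol=1e-3)
    # Six identical scores: none of the last 5 improves on the reference
    # score by more than tol, so stopping is expected.
    assert should_stop([1.0] * 6, n_iter_no_change=5, tol=1e-3)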
Example 14
        if (hasattr(check, 'func')
                and check.func is estimator_checks.check_classifiers_train):
            continue  # same, wrapped in a functools.partial object.

        try:
            check(name, estimator)
        except SkipTest as exception:
            # the only SkipTest thrown currently results from not
            # being able to import pandas.
            warnings.warn(str(exception), SkipTestWarning)


@pytest.mark.skipif(int(os.environ.get("NUMBA_DISABLE_JIT", 0)) == 1,
                    reason="Potentially long")
@pytest.mark.parametrize('Estimator', (
    GradientBoostingRegressor(),
    GradientBoostingClassifier(scoring=None, min_samples_leaf=5),
))
def test_estimator_checks(Estimator):
    # Run the check_estimator() test suite on GBRegressor and GBClassifier.

    # Notes:
    # - Can't do early stopping with classifier because often
    #   validation_split=.1 leads to test_size=2 < n_classes and
    #   train_test_split raises an error.
    # - Also, need to set a low min_samples_leaf for
    #   check_classifiers_classes() to pass: with only 30 samples on the
    #   dataset, the root is never split with min_samples_leaf=20 and only the
    #   majority class is predicted.
    custom_check_estimator(Estimator)
Example 15
data = np.ascontiguousarray(df.values[:, 1:])
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=50000, random_state=0)

if subsample is not None:
    data_train, target_train = data_train[:subsample], target_train[:subsample]

n_samples, n_features = data_train.shape
print(f"Training set with {n_samples} records with {n_features} features.")

print("JIT compiling code for the pygbm model...")
tic = time()
pygbm_model = GradientBoostingClassifier(learning_rate=lr, max_iter=1,
                                         max_bins=max_bins,
                                         max_leaf_nodes=n_leaf_nodes,
                                         n_iter_no_change=None,
                                         random_state=0,
                                         verbose=False,
                                         tree_type=tree_type)
pygbm_model.fit(data_train[:100], target_train[:100])
pygbm_model.predict(data_train[:100])  # prediction code is also jitted
toc = time()
print(f"done in {toc - tic:.3f}s")

print("Fitting a pygbm model...")
tic = time()
pygbm_model = GradientBoostingClassifier(loss='binary_crossentropy',
                                         learning_rate=lr, max_iter=n_trees,
                                         max_bins=max_bins,
                                         max_leaf_nodes=n_leaf_nodes,
                                         n_iter_no_change=None,
Example 16
data = np.ascontiguousarray(df.values[:, 1:])
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=50000, random_state=0)

if subsample is not None:
    data_train, target_train = data_train[:subsample], target_train[:subsample]

n_samples, n_features = data_train.shape
print(f"Training set with {n_samples} records with {n_features} features.")

print("JIT compiling code for the pygbm model...")
tic = time()
pygbm_model = GradientBoostingClassifier(learning_rate=lr,
                                         max_iter=1,
                                         max_bins=max_bins,
                                         max_leaf_nodes=n_leaf_nodes,
                                         random_state=0,
                                         scoring=None,
                                         verbose=0,
                                         validation_split=None)
pygbm_model.fit(data_train[:100], target_train[:100])
pygbm_model.predict(data_train[:100])  # prediction code is also jitted
toc = time()
print(f"done in {toc - tic:.3f}s")

print("Fitting a pygbm model...")
tic = time()
pygbm_model = GradientBoostingClassifier(loss='binary_crossentropy',
                                         learning_rate=lr,
                                         max_iter=n_trees,
                                         max_bins=max_bins,
                                         max_leaf_nodes=n_leaf_nodes,
Example 17
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from pygbm import GradientBoostingClassifier

rng = np.random.RandomState(0)

n_samples = int(1e6)
X, y = make_classification(n_samples, random_state=rng)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

print('Early stopping on held-out validation data')
clf = GradientBoostingClassifier(max_iter=100,
                                 scoring='neg_log_loss',
                                 validation_split=.1,
                                 n_iter_no_change=5,
                                 tol=1e-4,
                                 verbose=1,
                                 random_state=rng)
clf.fit(X_train, y_train)
print(f'Early stopped at iteration {clf.n_iter_}')
print(f'Mean accuracy: {clf.score(X_test, y_test)}')

print('Early stopping on training data')
clf = GradientBoostingClassifier(max_iter=100,
                                 scoring='neg_log_loss',
                                 validation_split=None,
                                 n_iter_no_change=5,
                                 tol=1e-4,
                                 verbose=1,
                                 random_state=rng)