コード例 #1
0
def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
                                     max_leaf_nodes):
    # Make sure pygbm has the same predictions as LGBM for easy targets.
    #
    # In particular when the size of the trees are bound and the number of
    # samples is large enough, the structure of the prediction trees found by
    # LightGBM and PyGBM should be exactly identical.
    #
    # Notes:
    # - Several candidate splits may have equal gains when the number of
    #   samples in a node is low (and because of float errors). Therefore the
    #   predictions on the test set might differ if the structure of the tree
    #   is not exactly the same. To avoid this issue we only compare the
    #   predictions on the test set when the number of samples is large enough
    #   and max_leaf_nodes is low enough.
    # - To ignore  discrepancies caused by small differences the binning
    #   strategy, data is pre-binned if n_samples > 255.

    rng = np.random.RandomState(seed=seed)
    n_samples = n_samples
    max_iter = 1
    max_bins = 256

    X, y = make_regression(n_samples=n_samples,
                           n_features=5,
                           n_informative=5,
                           random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_pygbm = GradientBoostingRegressor(max_iter=max_iter,
                                          max_bins=max_bins,
                                          learning_rate=1,
                                          n_iter_no_change=None,
                                          min_samples_leaf=min_samples_leaf,
                                          max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    # We need X to be treated an numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    # less than 1% of the predictions are different up to the 3rd decimal
    assert np.mean(abs(pred_lgbm - pred_pygbm) > 1e-3) < .011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        # less than 1% of the predictions are different up to the 4th decimal
        assert np.mean(abs(pred_lgbm - pred_pygbm) > 1e-4) < .01
コード例 #2
0
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    n_samples = n_samples
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples,
                               n_classes=2,
                               n_features=5,
                               n_informative=5,
                               n_redundant=0,
                               random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_pygbm = GradientBoostingClassifier(loss='binary_crossentropy',
                                           max_iter=max_iter,
                                           max_bins=max_bins,
                                           learning_rate=1,
                                           n_iter_no_change=None,
                                           min_samples_leaf=min_samples_leaf,
                                           max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    # We need X to be treated an numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    assert np.mean(pred_pygbm == pred_lightgbm) > .89

    acc_lgbm = accuracy_score(y_train, pred_lightgbm)
    acc_pygbm = accuracy_score(y_train, pred_pygbm)
    np.testing.assert_almost_equal(acc_lgbm, acc_pygbm)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        acc_lgbm = accuracy_score(y_test, pred_lightgbm)
        acc_pygbm = accuracy_score(y_test, pred_pygbm)
        np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)
コード例 #3
0
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    n_samples = n_samples
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples,
                               n_classes=2,
                               n_features=5,
                               n_informative=5,
                               n_redundant=0,
                               random_state=0)

    if n_samples > 255:
        X = BinMapper(max_bins=max_bins).fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_pygbm = GradientBoostingClassifier(loss='binary_crossentropy',
                                           max_iter=max_iter,
                                           max_bins=max_bins,
                                           learning_rate=1,
                                           validation_split=None,
                                           scoring=None,
                                           min_samples_leaf=min_samples_leaf,
                                           max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    assert np.mean(pred_pygbm == pred_lightgbm) > .89

    acc_lgbm = accuracy_score(y_train, pred_lightgbm)
    acc_pygbm = accuracy_score(y_train, pred_pygbm)
    np.testing.assert_almost_equal(acc_lgbm, acc_pygbm)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        acc_lgbm = accuracy_score(y_test, pred_lightgbm)
        acc_pygbm = accuracy_score(y_test, pred_pygbm)
        np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)
コード例 #4
0
tic = time()
pygbm_model = GradientBoostingClassifier(loss='binary_crossentropy',
                                         learning_rate=lr,
                                         max_iter=n_trees,
                                         max_bins=max_bins,
                                         max_leaf_nodes=n_leaf_nodes,
                                         random_state=0,
                                         scoring=None,
                                         verbose=1,
                                         validation_split=None)
pygbm_model.fit(data_train, target_train)
toc = time()
predicted_test = pygbm_model.predict(data_test)
roc_auc = roc_auc_score(target_test, predicted_test)
acc = accuracy_score(target_test, predicted_test)
print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")

if hasattr(numba, 'threading_layer'):
    print("Threading layer chosen: %s" % numba.threading_layer())

if not args.no_lightgbm:
    print("Fitting a LightGBM model...")
    tic = time()
    lightgbm_model = get_lightgbm_estimator(pygbm_model)
    lightgbm_model.fit(data_train, target_train)
    toc = time()
    predicted_test = lightgbm_model.predict(data_test)
    roc_auc = roc_auc_score(target_test, predicted_test)
    acc = accuracy_score(target_test, predicted_test)
    print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")
コード例 #5
0
def test_same_predictions_multiclass_classification(seed, min_samples_leaf,
                                                    n_samples, max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    n_samples = n_samples
    max_iter = 1
    max_bins = 256
    lr = 1

    X, y = make_classification(n_samples=n_samples,
                               n_classes=3,
                               n_features=5,
                               n_informative=5,
                               n_redundant=0,
                               n_clusters_per_class=1,
                               random_state=0)

    if n_samples > 255:
        X = BinMapper(max_bins=max_bins).fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_pygbm = GradientBoostingClassifier(loss='categorical_crossentropy',
                                           max_iter=max_iter,
                                           max_bins=max_bins,
                                           learning_rate=lr,
                                           validation_split=None,
                                           scoring=None,
                                           min_samples_leaf=min_samples_leaf,
                                           max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    assert np.mean(pred_pygbm == pred_lightgbm) > .89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_pygbm = est_pygbm.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_pygbm) < 1e-2) > .75

    acc_lgbm = accuracy_score(y_train, pred_lightgbm)
    acc_pygbm = accuracy_score(y_train, pred_pygbm)
    np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        proba_lightgbm = est_lightgbm.predict_proba(X_train)
        proba_pygbm = est_pygbm.predict_proba(X_train)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_pygbm) < 1e-2) > .75

        acc_lgbm = accuracy_score(y_test, pred_lightgbm)
        acc_pygbm = accuracy_score(y_test, pred_pygbm)
        np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)