Example #1
def test_bin_mapper_idempotence(n_bins_small, n_bins_large):
    assert n_bins_large >= n_bins_small
    data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1)
    mapper_small = BinMapper(max_bins=n_bins_small)
    mapper_large = BinMapper(max_bins=n_bins_large)
    binned_small = mapper_small.fit_transform(data)
    binned_large = mapper_large.fit_transform(binned_small)
    assert_array_equal(binned_small, binned_large)
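The idempotence checked above can be illustrated without pygbm. A minimal NumPy-only sketch (a toy model of the idea, not BinMapper's actual code): once values have been reduced to integer bin codes 0..k-1, re-binning them against cut points placed between consecutive codes reproduces the codes exactly, provided at least k bins are available.

import numpy as np

codes = np.repeat(np.arange(5), 3)           # data that is already binned: 0..4
thresholds = np.arange(4) + 0.5              # cut points at 0.5, 1.5, 2.5, 3.5
rebinned = np.searchsorted(thresholds, codes, side='right')
assert np.array_equal(rebinned, codes)       # re-binning is the identity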
Example #2
def test_subsample():
    # Make sure bin thresholds are different when applying subsampling
    mapper_no_subsample = BinMapper(subsample=None, random_state=0).fit(DATA)
    mapper_subsample = BinMapper(subsample=256, random_state=0).fit(DATA)

    for feature in range(DATA.shape[1]):
        with pytest.raises(AssertionError):
            np.testing.assert_array_almost_equal(
                mapper_no_subsample.numerical_thresholds_[feature],
                mapper_subsample.numerical_thresholds_[feature],
                decimal=3)
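For context, a NumPy-only sketch of why the assertion above holds (a toy, not BinMapper's internals): quantile thresholds estimated from a 256-point subsample are close to, but almost never identical to, the thresholds computed from the full data, and that small difference is exactly what the test detects.

import numpy as np

rng = np.random.RandomState(0)
data = rng.normal(size=30000)
subsample = rng.choice(data, size=256, replace=False)
quantiles = np.linspace(0, 100, 11)[1:-1]    # the 9 inner deciles
full_thresholds = np.percentile(data, quantiles)
sub_thresholds = np.percentile(subsample, quantiles)
# Close, but in general not identical: this is the difference the test asserts.
print(np.abs(full_thresholds - sub_thresholds).max())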
Example #3
def test_min_samples_leaf_root(n_samples, min_samples_leaf):
    # Make sure root node isn't split if n_samples is not at least twice
    # min_samples_leaf
    rng = np.random.RandomState(seed=0)

    max_bins = 255

    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    mapper = BinMapper(max_bins=max_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(np.float32)
    all_hessians = np.ones(shape=1, dtype=np.float32)
    grower = TreeGrower(X,
                        all_gradients,
                        all_hessians,
                        max_bins=max_bins,
                        shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()
    if n_samples >= min_samples_leaf * 2:
        assert len(grower.finalized_leaves) >= 2
    else:
        assert len(grower.finalized_leaves) == 1
Example #4
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins,
                          constant_hessian, noise):
    rng = np.random.RandomState(seed=0)
    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    if noise:
        y_scale = y.std()
        y += rng.normal(scale=noise, size=n_samples) * y_scale
    mapper = BinMapper(max_bins=n_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(np.float32)
    if constant_hessian:
        all_hessians = np.ones(shape=1, dtype=np.float32)
    else:
        all_hessians = np.ones_like(all_gradients)
    grower = TreeGrower(X,
                        all_gradients,
                        all_hessians,
                        max_bins=n_bins,
                        shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()
    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    if n_samples >= min_samples_leaf:
        for node in predictor.nodes:
            if node['is_leaf']:
                assert node['count'] >= min_samples_leaf
    else:
        assert predictor.nodes.shape[0] == 1
        assert predictor.nodes[0]['is_leaf']
        assert predictor.nodes[0]['count'] == n_samples
Example #5
def test_boston_dataset(max_bins):
    boston = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=42)

    mapper = BinMapper(max_bins=max_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)
    X_test_binned = mapper.transform(X_test)

    # Init gradients and hessians to those of the least squares loss
    gradients = -y_train.astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)

    min_samples_leaf = 8
    max_leaf_nodes = 31
    grower = TreeGrower(X_train_binned, gradients, hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes, max_bins=max_bins,
                        n_bins_per_feature=mapper.n_bins_per_feature_)
    grower.grow()

    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    assert r2_score(y_train, predictor.predict_binned(X_train_binned)) > 0.85
    assert r2_score(y_test, predictor.predict_binned(X_test_binned)) > 0.70

    assert_allclose(predictor.predict(X_train),
                    predictor.predict_binned(X_train_binned))

    assert_allclose(predictor.predict(X_test),
                    predictor.predict_binned(X_test_binned))

    assert r2_score(y_train, predictor.predict(X_train)) > 0.85
    assert r2_score(y_test, predictor.predict(X_test)) > 0.70
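The test above walks through the typical low-level pygbm workflow: bin the raw features, grow a tree on gradients and hessians, turn the grower into a predictor, and predict on either binned or raw data. The sketch below repeats that workflow on synthetic data; it reuses only the calls and parameter names visible in the example above (BinMapper, TreeGrower, make_predictor, bin_thresholds_), and exact signatures may differ between pygbm versions.

import numpy as np
from sklearn.datasets import make_regression
from pygbm.binning import BinMapper
from pygbm.grower import TreeGrower

X, y = make_regression(n_samples=1000, n_features=5, random_state=0)

mapper = BinMapper(max_bins=255, random_state=0)
X_binned = mapper.fit_transform(X)

# Least squares: gradients are -y, hessians are constant (as in the test above).
gradients = -y.astype(np.float32)
hessians = np.ones(1, dtype=np.float32)

grower = TreeGrower(X_binned, gradients, hessians, max_bins=255,
                    min_samples_leaf=8, max_leaf_nodes=31)
grower.grow()
predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

# Predictions on raw and on binned data should agree, as asserted above.
print(predictor.predict(X)[:3])
print(predictor.predict_binned(X_binned)[:3])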
Example #6
def test_boston_dataset():
    boston = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=42)

    mapper = BinMapper(random_state=42)
    X_train_binned = mapper.fit_transform(X_train)
    X_test_binned = mapper.transform(X_test)

    gradients = y_train.astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)

    min_samples_leaf = 8
    max_leaf_nodes = 31
    grower = TreeGrower(X_train_binned,
                        gradients,
                        hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes)
    grower.grow()

    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    assert r2_score(y_train, predictor.predict_binned(X_train_binned)) > 0.75
    assert r2_score(y_test, predictor.predict_binned(X_test_binned)) > 0.65

    assert_allclose(predictor.predict(X_train),
                    predictor.predict_binned(X_train_binned))

    assert_allclose(predictor.predict(X_test),
                    predictor.predict_binned(X_test_binned))

    assert r2_score(y_train, predictor.predict(X_train)) > 0.75
    assert r2_score(y_test, predictor.predict(X_test)) > 0.65
Example #7
def test_pre_binned_data():
    # Make sure ValueError is raised when predictor.predict() is called while
    # the predictor does not have any numerical thresholds.

    X, y = make_regression()

    # Init gradients and hessians to those of the least squares loss
    gradients = -y.astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)

    mapper = BinMapper(random_state=0)
    X_binned = mapper.fit_transform(X)
    grower = TreeGrower(X_binned,
                        gradients,
                        hessians,
                        n_bins_per_feature=mapper.n_bins_per_feature_)
    grower.grow()
    predictor = grower.make_predictor(numerical_thresholds=None)

    assert_raises_regex(ValueError,
                        'This predictor does not have numerical thresholds',
                        predictor.predict, X)

    assert_raises_regex(ValueError, 'binned_data dtype should be uint8',
                        predictor.predict_binned, X)

    predictor.predict_binned(X_binned)  # No error

    predictor = grower.make_predictor(
        numerical_thresholds=mapper.numerical_thresholds_)
    assert_raises_regex(ValueError, 'X has uint8 dtype', predictor.predict,
                        X_binned)
Example #8
def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
                                     max_leaf_nodes):
    # Make sure pygbm has the same predictions as LGBM for easy targets.
    #
    # In particular, when the size of the trees is bounded and the number of
    # samples is large enough, the structure of the prediction trees found by
    # LightGBM and PyGBM should be exactly identical.
    #
    # Notes:
    # - Several candidate splits may have equal gains when the number of
    #   samples in a node is low (and because of float errors). Therefore the
    #   predictions on the test set might differ if the structure of the tree
    #   is not exactly the same. To avoid this issue we only compare the
    #   predictions on the test set when the number of samples is large enough
    #   and max_leaf_nodes is low enough.
    # - To avoid discrepancies caused by small differences in the binning
    #   strategy, the data is pre-binned when n_samples > 255.

    rng = np.random.RandomState(seed=seed)
    n_samples = n_samples
    max_iter = 1
    max_bins = 256

    X, y = make_regression(n_samples=n_samples,
                           n_features=5,
                           n_informative=5,
                           random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_pygbm = GradientBoostingRegressor(max_iter=max_iter,
                                          max_bins=max_bins,
                                          learning_rate=1,
                                          n_iter_no_change=None,
                                          min_samples_leaf=min_samples_leaf,
                                          max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    # less than ~1% of the predictions differ by more than 1e-3
    assert np.mean(abs(pred_lgbm - pred_pygbm) > 1e-3) < .011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        # less than 1% of the predictions differ by more than 1e-4
        assert np.mean(abs(pred_lgbm - pred_pygbm) > 1e-4) < .01
Example #9
def test_n_bins_per_feature(max_bins, diff):
    # Check that n_bins_per_feature is n_unique_values when
    # n_unique_values <= max_bins, else max_bins.

    n_unique_values = max_bins + diff
    X = list(range(n_unique_values)) * 2
    X = np.array(X).reshape(-1, 1)
    mapper = BinMapper(max_bins=max_bins).fit(X)
    assert np.all(mapper.n_bins_per_feature_ == min(max_bins, n_unique_values))
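The rule checked above amounts to min(max_bins, n_unique_values) per feature. A NumPy-only illustration with a hypothetical helper (not part of pygbm):

import numpy as np

def effective_n_bins(column, max_bins):
    # One bin per distinct value, capped at max_bins (hypothetical helper).
    return min(max_bins, len(np.unique(column)))

assert effective_n_bins(np.array([0, 1, 1, 2]), max_bins=255) == 3
assert effective_n_bins(np.arange(1000), max_bins=255) == 255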
Example #10
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    n_samples = n_samples
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples,
                               n_classes=2,
                               n_features=5,
                               n_informative=5,
                               n_redundant=0,
                               random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_pygbm = GradientBoostingClassifier(loss='binary_crossentropy',
                                           max_iter=max_iter,
                                           max_bins=max_bins,
                                           learning_rate=1,
                                           n_iter_no_change=None,
                                           min_samples_leaf=min_samples_leaf,
                                           max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    assert np.mean(pred_pygbm == pred_lightgbm) > .89

    acc_lgbm = accuracy_score(y_train, pred_lightgbm)
    acc_pygbm = accuracy_score(y_train, pred_pygbm)
    np.testing.assert_almost_equal(acc_lgbm, acc_pygbm)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        acc_lgbm = accuracy_score(y_test, pred_lightgbm)
        acc_pygbm = accuracy_score(y_test, pred_pygbm)
        np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)
Example #11
def test_same_predictions_easy_target(seed, n_samples, max_leaf_nodes):
    # Make sure pygbm has the same predictions as LGBM for very easy targets.
    #
    # In particular, when the size of the trees is bounded and the number of
    # samples is large enough, the structure of the prediction trees found by
    # LightGBM and PyGBM should be exactly identical.
    #
    # Notes:
    # - Several candidate splits may have equal gains when the number of
    #   samples in a node is low (and because of float errors). Therefore the
    #   predictions on the test set might differ if the structure of the tree
    #   is not exactly the same. To avoid this issue we only compare the
    #   predictions on the test set when the number of samples is large enough
    #   and max_leaf_nodes is low enough.
    # - To avoid discrepancies caused by small differences in the binning
    #   strategy, the data is pre-binned when n_samples > 255.

    lb = pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    n_samples = n_samples
    min_samples_leaf = 1  # XXX: changing this breaks the test
    max_iter = 1

    # data = linear target, 5 features, 3 irrelevant.
    X = rng.normal(size=(n_samples, 5))
    y = X[:, 0] - X[:, 1]
    if n_samples > 255:
        X = BinMapper().fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_lightgbm = lb.LGBMRegressor(n_estimators=max_iter,
                                    min_data_in_bin=1,
                                    learning_rate=1,
                                    min_data_in_leaf=min_samples_leaf,
                                    num_leaves=max_leaf_nodes)
    est_pygbm = GradientBoostingMachine(max_iter=max_iter,
                                        learning_rate=1,
                                        validation_split=None,
                                        scoring=None,
                                        min_samples_leaf=min_samples_leaf,
                                        max_leaf_nodes=max_leaf_nodes)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    pred_lgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    np.testing.assert_array_almost_equal(pred_lgbm, pred_pygbm, decimal=3)

    if max_leaf_nodes < 10 and n_samples > 1000:
        pred_lgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        np.testing.assert_array_almost_equal(pred_lgbm, pred_pygbm, decimal=3)
Example #12
def test_bin_mapper_small_random_data(n_samples, n_bins):
    data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1)
    assert len(np.unique(data)) == n_samples

    mapper = BinMapper(max_bins=n_bins, random_state=42)
    binned = mapper.fit_transform(data)

    assert binned.shape == data.shape
    assert binned.dtype == np.uint8
    assert_array_equal(binned.ravel()[np.argsort(data.ravel())],
                       np.arange(n_samples))
Example #13
def test_plot_grower(tmpdir):
    pytest.importorskip('graphviz')
    from pygbm.plotting import plot_tree

    X_binned = BinMapper().fit_transform(X)
    gradients = np.asarray(y, dtype=np.float32).copy()
    hessians = np.ones(1, dtype=np.float32)
    grower = TreeGrower(X_binned, gradients, hessians, max_leaf_nodes=5)
    grower.grow()
    filename = tmpdir.join('plot_grower.pdf')
    plot_tree(grower, view=False, filename=filename)
    assert filename.exists()
Example #14
def test_bin_mapper_repeated_values_invariance(n_distinct):
    rng = np.random.RandomState(42)
    distinct_values = rng.normal(size=n_distinct)
    assert len(np.unique(distinct_values)) == n_distinct

    repeated_indices = rng.randint(low=0, high=n_distinct, size=1000)
    data = distinct_values[repeated_indices]
    rng.shuffle(data)
    assert_array_equal(np.unique(data), np.sort(distinct_values))

    data = data.reshape(-1, 1)

    mapper_1 = BinMapper(max_bins=n_distinct)
    binned_1 = mapper_1.fit_transform(data)
    assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct))

    # Adding more bins to the mapper yields the same results (same thresholds)
    mapper_2 = BinMapper(max_bins=min(256, n_distinct * 3))
    binned_2 = mapper_2.fit_transform(data)

    assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0])
    assert_array_equal(binned_1, binned_2)
Example #15
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    n_samples = n_samples
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples,
                               n_classes=2,
                               n_features=5,
                               n_informative=5,
                               n_redundant=0,
                               random_state=0)

    if n_samples > 255:
        X = BinMapper(max_bins=max_bins).fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_pygbm = GradientBoostingClassifier(loss='binary_crossentropy',
                                           max_iter=max_iter,
                                           max_bins=max_bins,
                                           learning_rate=1,
                                           validation_split=None,
                                           scoring=None,
                                           min_samples_leaf=min_samples_leaf,
                                           max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    assert np.mean(pred_pygbm == pred_lightgbm) > .89

    acc_lgbm = accuracy_score(y_train, pred_lightgbm)
    acc_pygbm = accuracy_score(y_train, pred_pygbm)
    np.testing.assert_almost_equal(acc_lgbm, acc_pygbm)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        acc_lgbm = accuracy_score(y_test, pred_lightgbm)
        acc_pygbm = accuracy_score(y_test, pred_pygbm)
        np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)
Example #16
def test_pre_binned_data():
    # Make sure that:
    # - training on numerical data and predicting on numerical data is the
    #   same as training on binned data and predicting on binned data
    # - training on numerical data and predicting on numerical data is the
    #   same as training on numerical data and predicting on binned data
    # - training on binned data and predicting on numerical data is not
    #   possible.

    X, y = make_regression(random_state=0)
    gbdt = GradientBoostingRegressor(scoring=None, random_state=0)
    mapper = BinMapper(random_state=0)
    X_binned = mapper.fit_transform(X)

    fit_num_pred_num = gbdt.fit(X, y).predict(X)
    fit_binned_pred_binned = gbdt.fit(X_binned, y).predict(X_binned)
    fit_num_pred_binned = gbdt.fit(X, y).predict(X_binned)

    assert_allclose(fit_num_pred_num, fit_binned_pred_binned)
    assert_allclose(fit_num_pred_num, fit_num_pred_binned)

    assert_raises_regex(ValueError,
                        'This estimator was fitted with pre-binned data ',
                        gbdt.fit(X_binned, y).predict, X)
Example #17
def test_bin_mapper_random_data(n_bins):
    n_samples, n_features = DATA.shape

    expected_count_per_bin = n_samples // n_bins
    tol = int(0.05 * expected_count_per_bin)

    mapper = BinMapper(max_bins=n_bins, random_state=42).fit(DATA)
    binned = mapper.transform(DATA)

    assert binned.shape == (n_samples, n_features)
    assert binned.dtype == np.uint8
    assert_array_equal(binned.min(axis=0), np.array([0, 0]))
    assert_array_equal(binned.max(axis=0), np.array([n_bins - 1, n_bins - 1]))
    assert len(mapper.numerical_thresholds_) == n_features
    for i in range(len(mapper.numerical_thresholds_)):
        assert mapper.numerical_thresholds_[i].shape == (n_bins - 1, )
        assert mapper.numerical_thresholds_[i].dtype == DATA.dtype
    assert np.all(mapper.n_bins_per_feature_ == n_bins)

    # Check that the binned data is approximately balanced across bins.
    for feature_idx in range(n_features):
        for bin_idx in range(n_bins):
            count = (binned[:, feature_idx] == bin_idx).sum()
            assert abs(count - expected_count_per_bin) < tol
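The balance check at the end of this test reflects a general property of quantile binning, sketched below with NumPy only (a toy, not BinMapper's code): thresholds at evenly spaced quantiles put roughly n_samples / n_bins points into each bin.

import numpy as np

rng = np.random.RandomState(42)
data = rng.normal(size=10000)
n_bins = 10

# n_bins - 1 thresholds at the inner, evenly spaced quantiles.
thresholds = np.percentile(data, np.linspace(0, 100, n_bins + 1)[1:-1])
binned = np.searchsorted(thresholds, data, side='right')
counts = np.bincount(binned, minlength=n_bins)
print(counts)   # every count is close to 10000 / 10 = 1000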
Example #18
    def fit(self, X, y):
        fit_start_time = time()
        acc_find_split_time = 0.  # time spent finding the best splits
        acc_apply_split_time = 0.  # time spent splitting nodes
        # time spent predicting X for gradient and hessians update
        acc_prediction_time = 0.
        # TODO: add support for mixed-typed (numerical + categorical) data
        # TODO: add support for missing data
        # TODO: add support for pre-binned data (pass-through)?
        X, y = check_X_y(X, y, dtype=[np.float32, np.float64])
        y = y.astype(np.float32, copy=False)
        rng = check_random_state(self.random_state)
        if self.verbose:
            print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ",
                  end="",
                  flush=True)
        tic = time()
        self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng)
        X_binned = self.bin_mapper_.fit_transform(X)
        toc = time()
        if self.verbose:
            duration = toc - tic
            throughput = X.nbytes / duration
            print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)")
        if self.validation_split is not None:
            X_binned_train, X_binned_val, y_train, y_val = train_test_split(
                X_binned,
                y,
                test_size=self.validation_split,
                stratify=y,
                random_state=rng)
            # Histogram computation is faster on feature-aligned data.
            X_binned_train = np.asfortranarray(X_binned_train)
        else:
            X_binned_train, y_train = X_binned, y
            X_binned_val, y_val = None, None

        # Subsample the training set for score-based monitoring.
        subsample_size = 10000
        if X_binned_train.shape[0] < subsample_size:
            X_binned_small_train = np.ascontiguousarray(X_binned_train)
            y_small_train = y_train
        else:
            indices = rng.choice(np.arange(X_binned_train.shape[0]),
                                 subsample_size)
            X_binned_small_train = X_binned_train[indices]
            y_small_train = y_train[indices]

        if self.verbose:
            print("Fitting gradient boosted rounds:")
        # TODO: plug custom loss functions
        y_pred = np.zeros_like(y_train, dtype=np.float32)
        gradients = np.asarray(y_train, dtype=np.float32).copy()
        hessians = np.ones(1, dtype=np.float32)
        self.predictors_ = predictors = []
        self.train_scores_ = []
        if self.validation_split is not None:
            self.validation_scores_ = []
        scorer = check_scoring(self, self.scoring)
        gb_start_time = time()
        # TODO: compute training loss and use it for early stopping if no
        # validation data is provided?
        self.n_iter_ = 0
        while True:
            should_stop = self._stopping_criterion(gb_start_time, scorer,
                                                   X_binned_small_train,
                                                   y_small_train, X_binned_val,
                                                   y_val)
            if should_stop or self.n_iter_ == self.max_iter:
                break
            shrinkage = 1. if self.n_iter_ == 0 else self.learning_rate
            grower = TreeGrower(X_binned_train,
                                gradients,
                                hessians,
                                n_bins=self.max_bins,
                                max_leaf_nodes=self.max_leaf_nodes,
                                max_depth=self.max_depth,
                                min_samples_leaf=self.min_samples_leaf,
                                shrinkage=shrinkage)
            grower.grow()
            predictor = grower.make_predictor(
                bin_thresholds=self.bin_mapper_.bin_thresholds_)
            predictors.append(predictor)
            self.n_iter_ += 1
            tic_pred = time()
            leaves_data = [(l.value, l.sample_indices)
                           for l in grower.finalized_leaves]
            _update_y_pred(leaves_data, y_pred)
            gradients = y_train - y_pred
            toc_pred = time()
            acc_prediction_time += toc_pred - tic_pred

            acc_apply_split_time += grower.total_apply_split_time
            acc_find_split_time += grower.total_find_split_time
        if self.verbose:
            duration = time() - fit_start_time
            n_leaf_nodes = sum(p.get_n_leaf_nodes() for p in self.predictors_)
            print(f"Fit {len(self.predictors_)} trees in {duration:.3f} s, "
                  f"({n_leaf_nodes} total leaf nodes)")
            print('{:<32} {:.3f}s'.format('Time spent finding best splits:',
                                          acc_find_split_time))
            print('{:<32} {:.3f}s'.format('Time spent applying splits:',
                                          acc_apply_split_time))
            print('{:<32} {:.3f}s'.format('Time spent predicting:',
                                          acc_prediction_time))
        self.train_scores_ = np.asarray(self.train_scores_)
        if self.validation_split is not None:
            self.validation_scores_ = np.asarray(self.validation_scores_)
        return self
Example #19
import numpy as np
from sklearn.datasets import make_classification
from pygbm.binning import BinMapper
from pygbm.grower import TreeGrower
from pygbm import plotting

rng = np.random.RandomState(0)

n_samples = int(1e7)
n_leaf_nodes = 5
X, y = make_classification(n_samples=n_samples,
                           n_classes=2,
                           n_features=5,
                           n_informative=3,
                           n_redundant=0,
                           random_state=rng)

bin_mapper_ = BinMapper(random_state=rng)
X_binned = bin_mapper_.fit_transform(X)

gradients = np.asarray(y, dtype=np.float32).copy()
hessians = np.ones(1, dtype=np.float32)

# First run to trigger the compilation of numba jit methods to avoid recording
# the compiler overhead in the profile report.
TreeGrower(X_binned, gradients, hessians, max_leaf_nodes=n_leaf_nodes).grow()

# New run to collect timing statistics that will be included in the plot.
grower = TreeGrower(X_binned, gradients, hessians, max_leaf_nodes=n_leaf_nodes)
grower.grow()
plotting.plot_tree(grower)
Example #20
from time import time

import numpy as np
from numpy.testing import assert_allclose
from sklearn.datasets import make_regression
from pygbm.binning import BinMapper
from pygbm import GradientBoostingRegressor

n_samples = int(5e6)

X, y = make_regression(n_samples=n_samples, n_features=5)
est = GradientBoostingRegressor(max_iter=1,
                                scoring=None,
                                validation_split=None,
                                random_state=0)
est.fit(X, y)
predictor = est.predictors_[0][0]

bin_mapper = BinMapper(random_state=0)
X_binned = bin_mapper.fit_transform(X)

X_binned_c = np.ascontiguousarray(X_binned)
print("Compiling predictor code...")
tic = time()
predictor.predict_binned(np.asfortranarray(X_binned[:100]))
predictor.predict_binned(X_binned_c[:100])
predictor.predict(np.asfortranarray(X[:100]))
predictor.predict(X[:100])
toc = time()
print(f"done in {toc - tic:0.3f}s")

data_size = X_binned.nbytes
print("Computing predictions (F-contiguous binned data)...")
tic = time()
Example #21
def test_same_predictions_multiclass_classification(seed, min_samples_leaf,
                                                    n_samples, max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    n_samples = n_samples
    max_iter = 1
    max_bins = 256
    lr = 1

    X, y = make_classification(n_samples=n_samples,
                               n_classes=3,
                               n_features=5,
                               n_informative=5,
                               n_redundant=0,
                               n_clusters_per_class=1,
                               random_state=0)

    if n_samples > 255:
        X = BinMapper(max_bins=max_bins).fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_pygbm = GradientBoostingClassifier(loss='categorical_crossentropy',
                                           max_iter=max_iter,
                                           max_bins=max_bins,
                                           learning_rate=lr,
                                           validation_split=None,
                                           scoring=None,
                                           min_samples_leaf=min_samples_leaf,
                                           max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    assert np.mean(pred_pygbm == pred_lightgbm) > .89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_pygbm = est_pygbm.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_pygbm) < 1e-2) > .75

    acc_lgbm = accuracy_score(y_train, pred_lightgbm)
    acc_pygbm = accuracy_score(y_train, pred_pygbm)
    np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        proba_lightgbm = est_lightgbm.predict_proba(X_test)
        proba_pygbm = est_pygbm.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_pygbm) < 1e-2) > .75

        acc_lgbm = accuracy_score(y_test, pred_lightgbm)
        acc_pygbm = accuracy_score(y_test, pred_pygbm)
        np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)
Example #22
    def fit(self, X, y):
        """Fit the gradient boosting model.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The input samples. If ``X.dtype == np.uint8``, the data is
            assumed to be pre-binned and the prediction methods
            (``predict``, ``predict_proba``) will only accept pre-binned
            data as well.

        y : array-like, shape=(n_samples,)
            Target values.

        Returns
        -------
        self : object
        """

        fit_start_time = time()
        acc_find_split_time = 0.  # time spent finding the best splits
        acc_apply_split_time = 0.  # time spent splitting nodes
        # time spent predicting X for gradient and hessians update
        acc_prediction_time = 0.
        # TODO: add support for mixed-typed (numerical + categorical) data
        # TODO: add support for missing data
        self.multi_output = len(y.ravel()) != len(y)
        if self.multi_output:
            self.prediction_dim = y.shape[1]
        else:
            self.prediction_dim = 1
        X, y = check_X_y(X,
                         y,
                         dtype=[np.float32, np.float64, np.uint8],
                         multi_output=self.multi_output)
        y = self._encode_y(y)
        if X.shape[0] == 1 or X.shape[1] == 1:
            raise ValueError(
                'Passing only one sample or one feature is not supported yet. '
                'See numba issue #3569.')
        rng = check_random_state(self.random_state)

        self._validate_parameters(X)
        self.n_features_ = X.shape[1]  # used for validation in predict()

        if X.dtype == np.uint8:  # data is pre-binned
            if self.verbose:
                print("X is pre-binned.")
            X_binned = X
            self.bin_mapper_ = None
            numerical_thresholds = None
            n_bins_per_feature = X.max(axis=0).astype(np.uint32)
        else:
            if self.verbose:
                print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ",
                      end="",
                      flush=True)
            tic = time()
            self.bin_mapper_ = BinMapper(max_bins=self.max_bins,
                                         random_state=rng)
            X_binned = self.bin_mapper_.fit_transform(X)
            numerical_thresholds = self.bin_mapper_.numerical_thresholds_
            n_bins_per_feature = self.bin_mapper_.n_bins_per_feature_
            toc = time()

            if self.verbose:
                duration = toc - tic
                throughput = X.nbytes / duration
                print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)")

        self.loss_ = self._get_loss()

        do_early_stopping = (self.n_iter_no_change is not None
                             and self.n_iter_no_change > 0)

        if do_early_stopping and self.validation_split is not None:
            # stratify for classification
            stratify = y if hasattr(self.loss_, 'predict_proba') else None

            X_binned_train, X_binned_val, y_train, y_val = train_test_split(
                X_binned,
                y,
                test_size=self.validation_split,
                stratify=stratify,
                random_state=rng)
            if X_binned_train.size == 0 or X_binned_val.size == 0:
                raise ValueError(
                    f'Not enough data (n_samples={X_binned.shape[0]}) to '
                    f'perform early stopping with validation_split='
                    f'{self.validation_split}. Use more training data or '
                    f'adjust validation_split.')
            # Predicting is faster on C-contiguous arrays, training is faster
            # on Fortran arrays.
            X_binned_val = np.ascontiguousarray(X_binned_val)
            X_binned_train = np.asfortranarray(X_binned_train)
        else:
            X_binned_train, y_train = X_binned, y
            X_binned_val, y_val = None, None

        # Subsample the training set for score-based monitoring.
        if do_early_stopping:
            subsample_size = 10000
            n_samples_train = X_binned_train.shape[0]
            if n_samples_train > subsample_size:
                indices = rng.choice(X_binned_train.shape[0], subsample_size)
                X_binned_small_train = X_binned_train[indices]
                y_small_train = y_train[indices]
            else:
                X_binned_small_train = X_binned_train
                y_small_train = y_train
            # Predicting is faster on C-contiguous arrays.
            X_binned_small_train = np.ascontiguousarray(X_binned_small_train)

        if self.verbose:
            print("Fitting gradient boosted rounds:")

        n_samples = X_binned_train.shape[0]
        self.baseline_prediction_ = self.loss_.get_baseline_prediction(
            y_train, self.prediction_dim)
        # raw_predictions are the accumulated values predicted by the trees
        # for the training data.
        raw_predictions = np.zeros(shape=(n_samples, self.prediction_dim),
                                   dtype=self.baseline_prediction_.dtype)
        if not self.multi_output:
            raw_predictions = raw_predictions.ravel()
        raw_predictions += self.baseline_prediction_

        # gradients and hessians are 1D arrays of size
        # n_samples * n_trees_per_iteration
        gradients, hessians = self.loss_.init_gradients_and_hessians(
            n_samples=n_samples, prediction_dim=self.prediction_dim)
        if not self.multi_output:
            gradients = gradients.ravel()
        # predictors_ is a matrix of TreePredictor objects with shape
        # (n_iter_, n_trees_per_iteration)
        self.predictors_ = predictors = []

        # scorer_ is a callable with signature (est, X, y) and calls
        # est.predict() or est.predict_proba() depending on its nature.
        self.scorer_ = check_scoring(self, self.scoring)
        self.train_scores_ = []
        self.validation_scores_ = []
        if do_early_stopping:
            # Add predictions of the initial model (before the first tree)
            self.train_scores_.append(self._get_scores(X_binned_train,
                                                       y_train))

            if self.validation_split is not None:
                self.validation_scores_.append(
                    self._get_scores(X_binned_val, y_val))

        for iteration in range(self.max_iter):

            if self.verbose:
                iteration_start_time = time()
                print(f"[{iteration + 1}/{self.max_iter}] ",
                      end='',
                      flush=True)

            # Update gradients and hessians, inplace
            self.loss_.update_gradients_and_hessians(gradients, hessians,
                                                     y_train, raw_predictions)

            predictors.append([])
            if self.multi_output:
                proj_gradients, proj_hessians = (
                    self.randomly_project_gradients_and_hessians(gradients,
                                                                 hessians))
            else:
                proj_gradients = gradients.ravel()
                proj_hessians = hessians.ravel()

            # Build `n_trees_per_iteration` trees.
            for k, (gradients_at_k, hessians_at_k) in enumerate(
                    zip(
                        np.array_split(proj_gradients,
                                       self.n_trees_per_iteration_),
                        np.array_split(proj_hessians,
                                       self.n_trees_per_iteration_))):
                # the xxxx_at_k arrays are **views** on the original arrays.
                # Note that for binary classif and regressions,
                # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to the
                # whole array.

                grower = TreeGrower(X_binned_train,
                                    gradients_at_k,
                                    hessians_at_k,
                                    max_bins=self.max_bins,
                                    n_bins_per_feature=n_bins_per_feature,
                                    max_leaf_nodes=self.max_leaf_nodes,
                                    max_depth=self.max_depth,
                                    min_samples_leaf=self.min_samples_leaf,
                                    l2_regularization=self.l2_regularization,
                                    shrinkage=self.learning_rate)
                grower.grow()

                if self.multi_output:
                    for l in grower.finalized_leaves:
                        l.residual = (
                            -self.learning_rate *
                            np.sum(a=gradients[l.sample_indices, :], axis=0) /
                            (l.sum_hessians + self.l2_regularization +
                             np.finfo(np.float64).eps))
                    leaves_data = [(l.residual, l.sample_indices)
                                   for l in grower.finalized_leaves]
                else:
                    leaves_data = [(l.value, l.sample_indices)
                                   for l in grower.finalized_leaves]

                acc_apply_split_time += grower.total_apply_split_time
                acc_find_split_time += grower.total_find_split_time

                predictor = grower.make_predictor(numerical_thresholds)
                predictors[-1].append(predictor)

                tic_pred = time()

                # prepare leaves_data so that _update_raw_predictions can be
                # @njitted

                _update_raw_predictions(leaves_data, raw_predictions)
                toc_pred = time()
                acc_prediction_time += toc_pred - tic_pred

            should_early_stop = False
            if do_early_stopping:
                should_early_stop = self._check_early_stopping(
                    X_binned_small_train, y_small_train, X_binned_val, y_val)

            if self.verbose:
                self._print_iteration_stats(iteration_start_time,
                                            do_early_stopping)

            if should_early_stop:
                break

        if self.verbose:
            duration = time() - fit_start_time
            n_total_leaves = sum(
                predictor.get_n_leaf_nodes()
                for predictors_at_ith_iteration in self.predictors_
                for predictor in predictors_at_ith_iteration)
            n_predictors = sum(
                len(predictors_at_ith_iteration)
                for predictors_at_ith_iteration in self.predictors_)
            print(f"Fit {n_predictors} trees in {duration:.3f} s, "
                  f"({n_total_leaves} total leaves)")
            print(f"{'Time spent finding best splits:':<32} "
                  f"{acc_find_split_time:.3f}s")
            print(f"{'Time spent applying splits:':<32} "
                  f"{acc_apply_split_time:.3f}s")
            print(f"{'Time spent predicting:':<32} "
                  f"{acc_prediction_time:.3f}s")

        self.train_scores_ = np.asarray(self.train_scores_)
        self.validation_scores_ = np.asarray(self.validation_scores_)
        return self
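The leaves_data lists built in the fit method above (and in the similar one below) pair each finalized leaf's value with the indices of the samples routed to it, so that updating the raw predictions can stay in a small numba-compilable helper. The following is a plain-NumPy sketch of what _update_raw_predictions conceptually does; it is illustrative only, not pygbm's actual helper.

import numpy as np

def update_raw_predictions(leaves_data, raw_predictions):
    # Add each finalized leaf's value to the predictions of its samples.
    for leaf_value, sample_indices in leaves_data:
        raw_predictions[sample_indices] += leaf_value

raw = np.zeros(6, dtype=np.float32)
leaves = [(0.5, np.array([0, 2, 4])), (-1.0, np.array([1, 3, 5]))]
update_raw_predictions(leaves, raw)
print(raw)   # [ 0.5 -1.   0.5 -1.   0.5 -1. ]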
Example #23
    def fit(self, X, y):
        """Fit the gradient boosting model.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The input samples.

        y : array-like, shape=(n_samples,)
            Target values.

        Returns
        -------
        self : object
        """

        fit_start_time = time()
        acc_find_split_time = 0.  # time spent finding the best splits
        acc_apply_split_time = 0.  # time spent splitting nodes
        # time spent predicting X for gradient and hessians update
        acc_prediction_time = 0.
        # TODO: add support for mixed-typed (numerical + categorical) data
        # TODO: add support for missing data
        # TODO: add support for pre-binned data (pass-through)?
        # TODO: test input checking
        X, y = check_X_y(X, y, dtype=[np.float32, np.float64])
        y = self._encode_y(y)
        if X.shape[0] == 1 or X.shape[1] == 1:
            raise ValueError(
                'Passing only one sample or one feature is not supported yet. '
                'See numba issue #3569.'
            )
        rng = check_random_state(self.random_state)

        self._validate_parameters()
        self.n_features_ = X.shape[1]  # used for validation in predict()

        if self.verbose:
            print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ", end="",
                  flush=True)
        tic = time()
        self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng)
        X_binned = self.bin_mapper_.fit_transform(X)
        toc = time()
        if self.verbose:
            duration = toc - tic
            throughput = X.nbytes / duration
            print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)")

        self.loss_ = self._get_loss()

        if self.scoring is not None and self.validation_split is not None:
            # stratify for classification
            stratify = y if hasattr(self.loss_, 'predict_proba') else None

            X_binned_train, X_binned_val, y_train, y_val = train_test_split(
                X_binned, y, test_size=self.validation_split,
                stratify=stratify, random_state=rng)
            if X_binned_train.size == 0 or X_binned_val.size == 0:
                raise ValueError(
                    f'Not enough data (n_samples={X_binned.shape[0]}) to '
                    f'perform early stopping with validation_split='
                    f'{self.validation_split}. Use more training data or '
                    f'adjust validation_split.'
                )
            # Histogram computation is faster on feature-aligned data.
            X_binned_train = np.asfortranarray(X_binned_train)
        else:
            X_binned_train, y_train = X_binned, y
            X_binned_val, y_val = None, None

        # Subsample the training set for score-based monitoring.
        subsample_size = 10000
        if X_binned_train.shape[0] < subsample_size:
            X_binned_small_train = np.ascontiguousarray(X_binned_train)
            y_small_train = y_train
        else:
            indices = rng.choice(
                np.arange(X_binned_train.shape[0]), subsample_size)
            X_binned_small_train = X_binned_train[indices]
            y_small_train = y_train[indices]

        if self.verbose:
            print("Fitting gradient boosted rounds:")

        n_samples = X_binned_train.shape[0]
        # values predicted by the trees. Used as-is in regression, and
        # transformed into probas and / or classes for classification
        raw_predictions = np.zeros(
            shape=(n_samples, self.n_trees_per_iteration_),
            dtype=y_train.dtype
        )
        # gradients and hessians are 1D arrays of size
        # n_samples * n_trees_per_iteration
        gradients, hessians = self.loss_.init_gradients_and_hessians(
            n_samples=n_samples,
            n_trees_per_iteration=self.n_trees_per_iteration_
        )
        # predictors_ is a matrix of TreePredictor objects with shape
        # (n_iter_, n_trees_per_iteration)
        self.predictors_ = predictors = []

        scorer = check_scoring(self, self.scoring)
        self.train_scores_ = []
        if self.scoring is not None:
            # Add predictions of the initial model (before the first tree)
            predicted_train = self._predict_binned(X_binned_train)
            score_train = scorer._sign * scorer._score_func(y_train,
                                                            predicted_train)
            self.train_scores_.append(score_train)

            if self.validation_split is not None:
                self.validation_scores_ = []
                predicted_val = self._predict_binned(X_binned_val)
                score_val = scorer._sign * scorer._score_func(y_val,
                                                              predicted_val)
                self.validation_scores_.append(score_val)

        for iteration in range(self.max_iter):

            if self.verbose:
                iteration_start_time = time()
                print(f"[{iteration + 1}/{self.max_iter}] ", end='',
                      flush=True)

            # Update gradients and hessians, inplace
            self.loss_.update_gradients_and_hessians(gradients, hessians,
                                                     y_train, raw_predictions)

            predictors.append([])

            # Build `n_trees_per_iteration` trees.
            for k, (gradients_at_k, hessians_at_k) in enumerate(zip(
                    np.array_split(gradients, self.n_trees_per_iteration_),
                    np.array_split(hessians, self.n_trees_per_iteration_))):
                # the xxxx_at_k arrays are **views** on the original arrays.
                # Note that for binary classif and regressions,
                # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to the
                # whole array.

                grower = TreeGrower(
                    X_binned_train, gradients_at_k, hessians_at_k,
                    max_bins=self.max_bins,
                    n_bins_per_feature=self.bin_mapper_.n_bins_per_feature_,
                    max_leaf_nodes=self.max_leaf_nodes,
                    max_depth=self.max_depth,
                    min_samples_leaf=self.min_samples_leaf,
                    l2_regularization=self.l2_regularization,
                    shrinkage=self.learning_rate)
                grower.grow()

                acc_apply_split_time += grower.total_apply_split_time
                acc_find_split_time += grower.total_find_split_time

                predictor = grower.make_predictor(
                    bin_thresholds=self.bin_mapper_.bin_thresholds_)
                predictors[-1].append(predictor)

                tic_pred = time()

                # prepare leaves_data so that _update_raw_predictions can be
                # @njitted
                leaves_data = [(l.value, l.sample_indices)
                               for l in grower.finalized_leaves]
                _update_raw_predictions(leaves_data, raw_predictions[:, k])
                toc_pred = time()
                acc_prediction_time += toc_pred - tic_pred

            should_stop = self._check_early_stopping(
                scorer, X_binned_small_train, y_small_train,
                X_binned_val, y_val)

            if self.verbose:
                self._print_iteration_stats(iteration_start_time)

            if should_stop:
                break

        if self.verbose:
            duration = time() - fit_start_time
            n_total_leaves = sum(
                predictor.get_n_leaf_nodes()
                for predictors_at_ith_iteration in self.predictors_
                for predictor in predictors_at_ith_iteration)
            n_predictors = sum(
                len(predictors_at_ith_iteration)
                for predictors_at_ith_iteration in self.predictors_)
            print(f"Fit {n_predictors} trees in {duration:.3f} s, "
                  f"({n_total_leaves} total leaves)")
            print(f"{'Time spent finding best splits:':<32} "
                  f"{acc_find_split_time:.3f}s")
            print(f"{'Time spent applying splits:':<32} "
                  f"{acc_apply_split_time:.3f}s")
            print(f"{'Time spent predicting:':<32} "
                  f"{acc_prediction_time:.3f}s")

        self.train_scores_ = np.asarray(self.train_scores_)
        if self.scoring is not None and self.validation_split is not None:
            self.validation_scores_ = np.asarray(self.validation_scores_)
        return self
Example #24
def test_bin_mapper_identity_small(n_bins, scale, offset):
    data = np.arange(n_bins).reshape(-1, 1) * scale + offset
    binned = BinMapper(max_bins=n_bins).fit_transform(data)
    assert_array_equal(binned, np.arange(n_bins).reshape(-1, 1))
Example #25
def test_bin_mapper_identity_repeated_values(n_bins, n_distinct, multiplier):
    data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1)
    binned = BinMapper(max_bins=n_bins).fit_transform(data)
    assert_array_equal(data, binned)
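Both identity checks above follow from the fact that this style of binning depends only on the ordering of the values: with enough bins each distinct value keeps its own code, and a positive rescale or offset of the data changes nothing. A NumPy-only sketch of that invariance (a toy, not BinMapper's code):

import numpy as np

def midpoint_thresholds(values):
    # Cut points half-way between consecutive distinct values.
    distinct = np.unique(values)
    return (distinct[:-1] + distinct[1:]) / 2

data = np.arange(8, dtype=np.float64)
scaled = data * 3.5 + 100.0

codes = np.searchsorted(midpoint_thresholds(data), data, side='right')
codes_scaled = np.searchsorted(midpoint_thresholds(scaled), scaled, side='right')

assert np.array_equal(codes, np.arange(8))
assert np.array_equal(codes, codes_scaled)   # invariant under positive affine maps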