def test_bin_mapper_idempotence(max_bins_small, max_bins_large):
    """Check that binning already-binned data leaves it unchanged.

    Random data is binned with ``max_bins_small`` non-missing bins, and the
    binned output is then passed through a second mapper that is allowed
    ``max_bins_large`` non-missing bins. Both outputs must be equal.

    Parameters
    ----------
    max_bins_small : int
        Number of non-missing bins of the first mapper.
    max_bins_large : int
        Number of non-missing bins of the second mapper; must be at least
        ``max_bins_small``.
    """
    assert max_bins_large >= max_bins_small
    data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1)

    # n_bins is max_bins + 1: max_bins only counts the non-missing bins.
    mapper_small = _BinMapper(n_bins=max_bins_small + 1)
    # The second mapper must use max_bins_large; it previously reused
    # max_bins_small, which left the max_bins_large parameter untested.
    mapper_large = _BinMapper(n_bins=max_bins_large + 1)

    binned_small = mapper_small.fit_transform(data)
    binned_large = mapper_large.fit_transform(binned_small)
    assert_array_equal(binned_small, binned_large)
def test_subsample():
    """Bin thresholds must change when the mapper is fit with subsampling.

    Two mappers sharing the same ``random_state`` are fit on ``DATA``, one
    with ``subsample=None`` and one with ``subsample=256``; for every feature
    their thresholds must not be close to each other.
    """
    full_fit = _BinMapper(subsample=None, random_state=0).fit(DATA)
    subsampled_fit = _BinMapper(subsample=256, random_state=0).fit(DATA)

    n_features = DATA.shape[1]
    for idx in range(n_features):
        thresholds_full = full_fit.bin_thresholds_[idx]
        thresholds_sub = subsampled_fit.bin_thresholds_[idx]
        assert not np.allclose(thresholds_full, thresholds_sub, rtol=1e-4)
def test_bin_mapper_random_data(max_bins):
    """Check shapes, dtypes, thresholds and bin balance on ``DATA``.

    Parameters
    ----------
    max_bins : int
        Number of bins available for non-missing values.
    """
    n_samples, n_features = DATA.shape

    target_count = n_samples // max_bins
    tolerance = int(0.05 * target_count)

    # max_bins is the number of bins for non-missing values, so the mapper
    # is given one more bin than that.
    mapper = _BinMapper(n_bins=max_bins + 1, random_state=42).fit(DATA)
    binned = mapper.transform(DATA)

    # Output layout and value range.
    assert binned.shape == (n_samples, n_features)
    assert binned.dtype == np.uint8
    top_bin = max_bins - 1
    assert_array_equal(binned.min(axis=0), np.array([0, 0]))
    assert_array_equal(binned.max(axis=0), np.array([top_bin, top_bin]))

    # One threshold array per feature, with max_bins - 1 cut points each.
    assert len(mapper.bin_thresholds_) == n_features
    for thresholds in mapper.bin_thresholds_:
        assert thresholds.shape == (max_bins - 1, )
        assert thresholds.dtype == DATA.dtype

    assert np.all(mapper.n_bins_non_missing_ == max_bins)

    # Every bin of every feature should hold roughly the same sample count.
    for feature_idx in range(n_features):
        column = binned[:, feature_idx]
        for bin_idx in range(max_bins):
            in_bin = (column == bin_idx).sum()
            assert abs(in_bin - target_count) < tolerance
def test_min_samples_leaf_root(n_samples, min_samples_leaf):
    """The root must not be split unless n_samples >= 2 * min_samples_leaf.

    Parameters
    ----------
    n_samples : int
        Number of generated samples.
    min_samples_leaf : int
        Value forwarded to ``TreeGrower``.
    """
    rng = np.random.RandomState(seed=0)
    n_bins = 256

    # Three features; the target is linear in the first two only.
    features = rng.normal(size=(n_samples, 3))
    target = features[:, 0] - features[:, 1]
    binned = _BinMapper(n_bins=n_bins).fit_transform(features)

    gradients = target.astype(G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)
    grower = TreeGrower(binned, gradients, hessians,
                        n_bins=n_bins, shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()

    n_leaves = len(grower.finalized_leaves)
    if n_samples < min_samples_leaf * 2:
        assert n_leaves == 1
    else:
        assert n_leaves >= 2
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins,
                          constant_hessian, noise):
    """Every leaf of the grown tree must respect ``min_samples_leaf``.

    Parameters
    ----------
    n_samples : int
        Number of generated samples.
    min_samples_leaf : int
        Value forwarded to ``TreeGrower``.
    n_bins : int
        Number of bins used by both the mapper and the grower.
    constant_hessian : bool
        If true, a hessian array of shape 1 is used; otherwise it has the
        same shape as the gradients.
    noise : float
        Scale of the optional Gaussian noise added to the target (relative
        to the standard deviation of the target). Falsy means no noise.
    """
    rng = np.random.RandomState(seed=0)

    # Three features; the target is linear in the first two only.
    features = rng.normal(size=(n_samples, 3))
    target = features[:, 0] - features[:, 1]
    if noise:
        target_scale = target.std()
        target += rng.normal(scale=noise, size=n_samples) * target_scale

    mapper = _BinMapper(n_bins=n_bins)
    binned = mapper.fit_transform(features)

    gradients = target.astype(G_H_DTYPE)
    if constant_hessian:
        hessian_shape = 1
    else:
        hessian_shape = gradients.shape
    hessians = np.ones(shape=hessian_shape, dtype=G_H_DTYPE)

    grower = TreeGrower(binned, gradients, hessians,
                        n_bins=n_bins, shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()
    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    if n_samples < min_samples_leaf:
        # Too few samples: the tree must be a single leaf with all of them.
        assert predictor.nodes.shape[0] == 1
        assert predictor.nodes[0]['is_leaf']
        assert predictor.nodes[0]['count'] == n_samples
        return

    for node in predictor.nodes:
        if node['is_leaf']:
            assert node['count'] >= min_samples_leaf
def test_boston_dataset(n_bins):
    """Grow a single tree on the Boston data and check its R2 scores.

    Parameters
    ----------
    n_bins : int
        Number of bins used by both the mapper and the grower.
    """
    features, target = load_boston(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, random_state=42)

    mapper = _BinMapper(n_bins=n_bins, random_state=42)
    binned_train = mapper.fit_transform(X_train)

    # Init gradients and hessians to that of least squares loss
    gradients = -y_train.astype(G_H_DTYPE)
    hessians = np.ones(1, dtype=G_H_DTYPE)

    grower = TreeGrower(
        binned_train, gradients, hessians,
        min_samples_leaf=8,
        max_leaf_nodes=31,
        n_bins=n_bins,
        n_bins_non_missing=mapper.n_bins_non_missing_,
    )
    grower.grow()

    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    train_score = r2_score(y_train, predictor.predict(X_train))
    assert train_score > 0.85
    test_score = r2_score(y_test, predictor.predict(X_test))
    assert test_score > 0.70
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    """Compare binary classification predictions against LightGBM.

    Same as test_same_predictions_regression but for classification.

    Parameters
    ----------
    seed : int
        Seed of the random state used for the train/test split.
    min_samples_leaf : int
        Forwarded to the estimator.
    n_samples : int
        Number of samples to generate.
    max_leaf_nodes : int
        Forwarded to the estimator.
    """
    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255

    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                               n_informative=5, n_redundant=0, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss='binary_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > .89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    # The held-out comparison is only done for small trees fit on enough
    # samples.
    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > .89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
def test_n_bins_non_missing(n_bins, diff):
    """Check ``n_bins_non_missing_`` against the number of unique values.

    It must equal the number of unique values when there are few of them,
    and ``n_bins - 1`` otherwise.

    Parameters
    ----------
    n_bins : int
        Total number of bins given to the mapper.
    diff : int
        Offset added to ``n_bins`` to get the number of unique values.
    """
    n_unique_values = n_bins + diff

    # Each unique value appears exactly twice, as a single column.
    repeated = list(range(n_unique_values)) * 2
    column = np.array(repeated).reshape(-1, 1)

    mapper = _BinMapper(n_bins=n_bins).fit(column)

    expected = min(n_bins - 1, n_unique_values)
    assert np.all(mapper.n_bins_non_missing_ == expected)
def test_infinite_values():
    """Check thresholds and binned output when the data contains +/-inf."""
    X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1)

    bin_mapper = _BinMapper()
    bin_mapper.fit(X)

    thresholds = bin_mapper.bin_thresholds_[0]
    assert_allclose(thresholds, [-np.inf, .5, ALMOST_INF])
    assert bin_mapper.n_bins_non_missing_ == [4]

    # Each of the four distinct values gets its own bin, in sorted order.
    expected_binned_X = np.array([0, 1, 2, 3]).reshape(-1, 1)
    actual_binned_X = bin_mapper.transform(X)
    assert_array_equal(actual_binned_X, expected_binned_X)
def test_bin_mapper_small_random_data(n_samples, max_bins):
    """With all-distinct values, the bin index must equal the value's rank.

    Parameters
    ----------
    n_samples : int
        Number of random values to draw.
    max_bins : int
        Number of bins available for non-missing values.
    """
    rng = np.random.RandomState(42)
    data = rng.normal(size=n_samples).reshape(-1, 1)
    assert len(np.unique(data)) == n_samples

    # max_bins is the number of bins for non-missing values
    mapper = _BinMapper(n_bins=max_bins + 1, random_state=42)
    binned = mapper.fit_transform(data)

    assert binned.shape == data.shape
    assert binned.dtype == np.uint8

    # Reading the bins in the sorted order of the data must give 0..n-1.
    sort_order = np.argsort(data.ravel())
    assert_array_equal(binned.ravel()[sort_order], np.arange(n_samples))
def test_bin_mapper_repeated_values_invariance(n_distinct):
    """Extra bins must not change the binning of few distinct values.

    Data made of ``n_distinct`` repeated values is binned once with exactly
    ``n_distinct`` non-missing bins and once with more bins; thresholds and
    binned output must match.

    Parameters
    ----------
    n_distinct : int
        Number of distinct values in the generated data.
    """
    rng = np.random.RandomState(42)
    distinct_values = rng.normal(size=n_distinct)
    assert len(np.unique(distinct_values)) == n_distinct

    repeated_indices = rng.randint(low=0, high=n_distinct, size=1000)
    data = distinct_values[repeated_indices]
    rng.shuffle(data)
    assert_array_equal(np.unique(data), np.sort(distinct_values))

    data = data.reshape(-1, 1)

    mapper_1 = _BinMapper(n_bins=n_distinct + 1)
    binned_1 = mapper_1.fit_transform(data)
    assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct))

    # Adding more bins to the mapper yields the same results (same thresholds)
    # n_bins may not exceed 256 (see the message checked in
    # test_invalid_n_bins), so the cap is applied to the final value rather
    # than before adding the extra bin, which could previously yield 257.
    mapper_2 = _BinMapper(n_bins=min(256, n_distinct * 3 + 1))
    binned_2 = mapper_2.fit_transform(data)

    assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0])
    assert_array_equal(binned_1, binned_2)
def test_max_depth(max_depth):
    """The deepest finalized leaf must sit exactly at ``max_depth``.

    Parameters
    ----------
    max_depth : int
        Value forwarded to ``TreeGrower``.
    """
    rng = np.random.RandomState(seed=0)
    n_bins = 256
    n_samples = 1000

    # Three features; the target is linear in the first two only.
    features = rng.normal(size=(n_samples, 3))
    target = features[:, 0] - features[:, 1]
    binned = _BinMapper(n_bins=n_bins).fit_transform(features)

    gradients = target.astype(G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)
    grower = TreeGrower(binned, gradients, hessians, max_depth=max_depth)
    grower.grow()

    deepest = max(leaf.depth for leaf in grower.finalized_leaves)
    assert deepest == max_depth
def test_split_on_nan_with_infinite_values():
    """A split on nan must still be respected when +inf samples exist.

    The threshold is set to +inf on a split on nan, so this test guards
    against edge-case bugs that could follow from that. The private API is
    used so that predict_binned() can be tested as well.
    """
    X = np.array([0, 1, np.inf, np.nan, np.nan]).reshape(-1, 1)
    # the gradient values will force a split on nan situation
    gradients = np.array([0, 0, 0, 100, 100], dtype=G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    bin_mapper = _BinMapper()
    X_binned = bin_mapper.fit_transform(X)

    n_bins_non_missing = 3
    grower = TreeGrower(
        X_binned, gradients, hessians,
        n_bins_non_missing=n_bins_non_missing,
        has_missing_values=True,
        min_samples_leaf=1,
    )
    grower.grow()

    predictor = grower.make_predictor(
        bin_thresholds=bin_mapper.bin_thresholds_)

    # sanity check: this was a split on nan
    root = predictor.nodes[0]
    assert root['threshold'] == np.inf
    assert root['bin_threshold'] == n_bins_non_missing - 1

    # Make sure in particular that the +inf sample is mapped to the left child
    # Note that lightgbm "fails" here and will assign the inf sample to the
    # right child, even though it's a "split on nan" situation.
    predictions = predictor.predict(X)
    predictions_binned = predictor.predict_binned(
        X_binned,
        missing_values_bin_idx=bin_mapper.missing_values_bin_idx_)

    expected = -gradients
    assert np.all(predictions == expected)
    assert np.all(predictions_binned == expected)
def test_missing_values_support(n_bins, n_bins_non_missing, X_trans_expected):
    """Check how missing values are binned and reported by the mapper.

    Makes sure nans are mapped to the last bin and that the _BinMapper
    attributes are correct.

    Parameters
    ----------
    n_bins : int
        Total number of bins given to the mapper.
    n_bins_non_missing : sequence of int
        Expected ``n_bins_non_missing_``, one entry per feature.
    X_trans_expected : array-like
        Expected output of ``transform`` on the fixed input below.
    """
    # np.nan is used rather than the np.NaN alias, which NumPy 2.0 removed.
    X = [[1,      1,      0],
         [np.nan, np.nan, 0],
         [2,      1,      0],
         [np.nan, 2,      1],
         [3,      2,      1],
         [4,      1,      0]]
    X = np.array(X)

    mapper = _BinMapper(n_bins=n_bins)
    mapper.fit(X)

    assert_array_equal(mapper.n_bins_non_missing_, n_bins_non_missing)

    # k non-missing bins are delimited by k - 1 thresholds.
    for feature_idx in range(X.shape[1]):
        assert len(mapper.bin_thresholds_[feature_idx]) == \
            n_bins_non_missing[feature_idx] - 1

    assert mapper.missing_values_bin_idx_ == n_bins - 1

    X_trans = mapper.transform(X)
    assert_array_equal(X_trans, X_trans_expected)
def test_binning_train_validation_are_separated():
    """Training and validation data must be binned separately.

    See issue 13926. The mapper stored on the fitted estimator is compared
    with a mapper fit on the whole dataset.
    """
    validation_fraction = .2
    rng = np.random.RandomState(0)

    gb = HistGradientBoostingClassifier(
        n_iter_no_change=5,
        validation_fraction=validation_fraction,
        random_state=rng,
    )
    gb.fit(X_classification, y_classification)
    train_mapper = gb.bin_mapper_

    # Note that since the data is small there is no subsampling and the
    # random_state doesn't matter
    full_mapper = _BinMapper(random_state=0)
    full_mapper.fit(X_classification)

    n_samples = X_classification.shape[0]
    n_expected_train = int((1 - validation_fraction) * n_samples)
    assert np.all(train_mapper.n_bins_non_missing_ == n_expected_train)
    assert np.all(train_mapper.n_bins_non_missing_ !=
                  full_mapper.n_bins_non_missing_)
def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
                                     max_leaf_nodes):
    """Compare regression predictions against LightGBM on an easy target.

    Parameters
    ----------
    seed : int
        Seed of the random state used for the train/test split.
    min_samples_leaf : int
        Forwarded to the estimator.
    n_samples : int
        Number of samples to generate.
    max_leaf_nodes : int
        Forwarded to the estimator.
    """
    # Make sure the scikit-learn estimator has the same predictions as
    # lightgbm for easy targets.
    #
    # In particular when the size of the trees are bound and the number of
    # samples is large enough, the structure of the prediction trees found by
    # LightGBM and by the scikit-learn estimator should be exactly identical.
    #
    # Notes:
    # - Several candidate splits may have equal gains when the number of
    #   samples in a node is low (and because of float errors). Therefore the
    #   predictions on the test set might differ if the structure of the tree
    #   is not exactly the same. To avoid this issue we only compare the
    #   predictions on the test set when the number of samples is large enough
    #   and max_leaf_nodes is low enough.
    # - To ignore discrepancies caused by small differences in the binning
    #   strategy, data is pre-binned if n_samples > 255.
    # - We don't check the least_absolute_deviation loss here. This is because
    #   LightGBM's computation of the median (used for the initial value of
    #   raw_prediction) is a bit off (they'll e.g. return midpoints when there
    #   is no need to.). Since these tests only run 1 iteration, the
    #   discrepancy between the initial values leads to biggish differences in
    #   the predictions. These differences are much smaller with more
    #   iterations.

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255

    X, y = make_regression(n_samples=n_samples, n_features=5,
                           n_informative=5, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingRegressor(
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    # about 1% (strictly less than 1.1%) of the predictions are allowed to
    # differ up to the 3rd decimal
    assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-3) < .011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        # less than 1% of the predictions are different up to the 4th decimal
        assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-4) < .01
def test_bin_mapper_identity_small(max_bins, scale, offset):
    """Scaled and shifted 0..max_bins-1 must be binned back to 0..max_bins-1.

    Parameters
    ----------
    max_bins : int
        Number of bins available for non-missing values; also the number of
        samples generated.
    scale : number
        Multiplicative factor applied to the integer ramp.
    offset : number
        Additive shift applied after scaling.
    """
    ramp = np.arange(max_bins).reshape(-1, 1)
    data = ramp * scale + offset

    # max_bins is the number of bins for non-missing values
    mapper = _BinMapper(n_bins=max_bins + 1)
    binned = mapper.fit_transform(data)

    assert_array_equal(binned, ramp)
def test_invalid_n_bins(n_bins):
    """Fitting with an out-of-range ``n_bins`` must raise a ValueError.

    Parameters
    ----------
    n_bins : int
        Invalid number of bins to pass to the mapper.
    """
    template = 'n_bins={} should be no smaller than 3 and no larger than 256'
    expected_message = template.format(n_bins)

    with pytest.raises(ValueError, match=expected_message):
        _BinMapper(n_bins=n_bins).fit(DATA)
def test_bin_mapper_n_features_transform():
    """transform must reject input with a different number of features."""
    mapper = _BinMapper(n_bins=42, random_state=42).fit(DATA)

    # Repeating every column twice doubles the number of features.
    widened = np.repeat(DATA, 2, axis=1)

    err_msg = 'This estimator was fitted with 2 features but 4 got passed'
    with pytest.raises(ValueError, match=err_msg):
        mapper.transform(widened)
def test_bin_mapper_identity_repeated_values(max_bins, n_distinct, multiplier):
    """Repeated small integers must be binned to themselves.

    Parameters
    ----------
    max_bins : int
        Number of bins available for non-missing values.
    n_distinct : int
        Number of distinct integer values (0 to n_distinct - 1).
    multiplier : int
        How many times the sequence of distinct values is repeated.
    """
    values = list(range(n_distinct)) * multiplier
    data = np.array(values).reshape(-1, 1)

    # max_bins is the number of bins for non-missing values
    mapper = _BinMapper(n_bins=max_bins + 1)
    binned = mapper.fit_transform(data)

    assert_array_equal(data, binned)