def test_bin_mapper_idempotence(n_bins_small, n_bins_large):
    assert n_bins_large >= n_bins_small
    data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1)
    mapper_small = _BinMapper(max_bins=n_bins_small)
    mapper_large = _BinMapper(max_bins=n_bins_large)
    binned_small = mapper_small.fit_transform(data)
    binned_large = mapper_large.fit_transform(binned_small)
    assert_array_equal(binned_small, binned_large)

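# Illustrative sketch, not part of the test suite: the idempotence property
# checked above holds because fit_transform maps raw values to bin indices
# 0 .. max_bins - 1, and re-binning those indices with at least as many bins
# is the identity. The constructor argument name (max_bins vs. n_bins) varies
# across scikit-learn versions; this sketch assumes the max_bins spelling
# used in the test above.
def _demo_bin_mapper_idempotence():
    data = np.random.RandomState(0).normal(size=1000).reshape(-1, 1)
    binned_once = _BinMapper(max_bins=32).fit_transform(data)
    binned_twice = _BinMapper(max_bins=64).fit_transform(binned_once)
    np.testing.assert_array_equal(binned_once, binned_twice)
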
def test_subsample():
    # Make sure bin thresholds are different when applying subsampling
    mapper_no_subsample = _BinMapper(subsample=None, random_state=0).fit(DATA)
    mapper_subsample = _BinMapper(subsample=256, random_state=0).fit(DATA)

    for feature in range(DATA.shape[1]):
        assert not np.allclose(mapper_no_subsample.bin_thresholds_[feature],
                               mapper_subsample.bin_thresholds_[feature],
                               rtol=1e-4)

def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
                                     max_leaf_nodes):
    # Make sure sklearn has the same predictions as lightgbm for easy targets.
    #
    # In particular when the sizes of the trees are bounded and the number of
    # samples is large enough, the structure of the prediction trees found by
    # LightGBM and sklearn should be exactly identical.
    #
    # Notes:
    # - Several candidate splits may have equal gains when the number of
    #   samples in a node is low (and because of float errors). Therefore the
    #   predictions on the test set might differ if the structure of the tree
    #   is not exactly the same. To avoid this issue we only compare the
    #   predictions on the test set when the number of samples is large enough
    #   and max_leaf_nodes is low enough.
    # - To ignore discrepancies caused by small differences in the binning
    #   strategy, data is pre-binned if n_samples > 255.

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256

    X, y = make_regression(n_samples=n_samples, n_features=5,
                           n_informative=5, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingRegressor(
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    # less than 1% of the predictions are different up to the 3rd decimal
    assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-3) < .011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        # less than 1% of the predictions are different up to the 4th decimal
        assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-4) < .01

def test_actual_n_bins(max_bins, diff):
    # Check that actual_n_bins is n_unique_values when
    # n_unique_values <= max_bins, else max_bins.
    n_unique_values = max_bins + diff
    X = list(range(n_unique_values)) * 2
    X = np.array(X).reshape(-1, 1)
    mapper = _BinMapper(max_bins=max_bins).fit(X)
    assert np.all(mapper.actual_n_bins_ == min(max_bins, n_unique_values))

def test_bin_mapper_small_random_data(n_samples, n_bins):
    data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1)
    assert len(np.unique(data)) == n_samples

    mapper = _BinMapper(max_bins=n_bins, random_state=42)
    binned = mapper.fit_transform(data)

    assert binned.shape == data.shape
    assert binned.dtype == np.uint8
    assert_array_equal(binned.ravel()[np.argsort(data.ravel())],
                       np.arange(n_samples))

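# Illustrative sketch, not part of the test suite: when all values are
# distinct and n_bins >= n_samples, each sample lands in its own bin, so
# binning reduces to a rank transform. This is the property the argsort
# assertion in test_bin_mapper_small_random_data relies on.
def _demo_binning_is_a_rank_transform():
    data = np.random.RandomState(0).normal(size=8).reshape(-1, 1)
    binned = _BinMapper(max_bins=8).fit_transform(data)
    ranks = np.argsort(np.argsort(data.ravel()))  # rank of each sample
    np.testing.assert_array_equal(binned.ravel(), ranks)
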
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                               n_informative=5, n_redundant=0, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss='binary_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > .89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > .89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)

def test_bin_mapper_repeated_values_invariance(n_distinct):
    rng = np.random.RandomState(42)
    distinct_values = rng.normal(size=n_distinct)
    assert len(np.unique(distinct_values)) == n_distinct

    repeated_indices = rng.randint(low=0, high=n_distinct, size=1000)
    data = distinct_values[repeated_indices]
    rng.shuffle(data)
    assert_array_equal(np.unique(data), np.sort(distinct_values))

    data = data.reshape(-1, 1)

    mapper_1 = _BinMapper(max_bins=n_distinct)
    binned_1 = mapper_1.fit_transform(data)
    assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct))

    # Adding more bins to the mapper yields the same results (same thresholds)
    mapper_2 = _BinMapper(max_bins=min(256, n_distinct * 3))
    binned_2 = mapper_2.fit_transform(data)

    assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0])
    assert_array_equal(binned_1, binned_2)

def test_extra_tree_classifier_proba(load_func):
    X, y = load_func(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Data binning
    binner = _BinMapper(random_state=random_state)
    X_train_binned = binner.fit_transform(X_train)
    X_test_binned = binner.transform(X_test)

    # Ours
    model = ExtraTreeClassifier(random_state=random_state)
    model.fit(X_train_binned, y_train)
    actual_pred = model.predict(X_test_binned)
    actual_proba = model.predict_proba(X_test_binned)

    # Sklearn
    model = sklearn_ExtraTreeClassifier(random_state=random_state)
    model.fit(X_train_binned, y_train)
    expected_pred = model.predict(X_test_binned)
    expected_proba = model.predict_proba(X_test_binned)

    assert_array_equal(actual_pred, expected_pred)
    assert_array_equal(actual_proba, expected_proba)

def test_binning_train_validation_are_separated():
    # Make sure training and validation data are binned separately.
    # See issue 13926
    rng = np.random.RandomState(0)
    validation_fraction = .2
    gb = HistGradientBoostingClassifier(
        early_stopping=True,
        validation_fraction=validation_fraction,
        random_state=rng
    )
    gb.fit(X_classification, y_classification)
    mapper_training_data = gb.bin_mapper_

    # Note that since the data is small there is no subsampling and the
    # random_state doesn't matter
    mapper_whole_data = _BinMapper(random_state=0)
    mapper_whole_data.fit(X_classification)

    n_samples = X_classification.shape[0]
    assert np.all(mapper_training_data.n_bins_non_missing_ ==
                  int((1 - validation_fraction) * n_samples))
    assert np.all(mapper_training_data.n_bins_non_missing_ !=
                  mapper_whole_data.n_bins_non_missing_)

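# Illustrative sketch, not part of the test suite: the invariant used above
# is that continuous random data has all-distinct values, so (with enough
# bins available) the mapper keeps one bin per distinct non-missing value.
# The bin count therefore reveals how many samples were actually binned,
# which is how the test detects that only the training fraction, not the
# whole dataset, was passed to the mapper. The attribute name matches the
# scikit-learn version used in the test above.
def _demo_bin_count_tracks_sample_count():
    data = np.random.RandomState(0).normal(size=50).reshape(-1, 1)
    mapper = _BinMapper(random_state=0).fit(data)
    # One bin per distinct value: 50 unique samples -> 50 non-missing bins.
    assert np.all(mapper.n_bins_non_missing_ == 50)
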
def test_forest_regressor_workflow(load_func):
    n_estimators = 100  # to avoid oob warning
    random_state = 42

    X, y = load_func(return_X_y=True)

    # Data binning
    binner = _BinMapper(random_state=random_state)
    X_binned = binner.fit_transform(X)

    # Random Forest
    model = RandomForestRegressor(n_estimators=n_estimators,
                                  random_state=random_state)
    model.fit(X_binned, y)
    model.predict(X_binned)

    # Extremely Random Forest
    model = ExtraTreesRegressor(n_estimators=n_estimators,
                                random_state=random_state)
    model.fit(X_binned, y)
    model.predict(X_binned)

def test_binning_train_validation_are_separated():
    # Make sure training and validation data are binned separately.
    # See issue 13926
    rng = np.random.RandomState(0)
    validation_fraction = .2
    gb = HistGradientBoostingClassifier(
        n_iter_no_change=5,
        validation_fraction=validation_fraction,
        random_state=rng
    )
    gb.fit(X_classification, y_classification)
    mapper_training_data = gb.bin_mapper_

    # Note that since the data is small there is no subsampling and the
    # random_state doesn't matter
    mapper_whole_data = _BinMapper(random_state=0)
    mapper_whole_data.fit(X_classification)

    n_samples = X_classification.shape[0]
    assert np.all(mapper_training_data.actual_n_bins_ ==
                  int((1 - validation_fraction) * n_samples))
    assert np.all(mapper_training_data.actual_n_bins_ !=
                  mapper_whole_data.actual_n_bins_)

def test_min_samples_leaf_root(n_samples, min_samples_leaf):
    # Make sure the root node isn't split if n_samples is not at least twice
    # min_samples_leaf
    rng = np.random.RandomState(seed=0)

    n_bins = 256

    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    mapper = _BinMapper(n_bins=n_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(G_H_DTYPE)
    all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
    grower = TreeGrower(X, all_gradients, all_hessians,
                        n_bins=n_bins, shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()
    if n_samples >= min_samples_leaf * 2:
        assert len(grower.finalized_leaves) >= 2
    else:
        assert len(grower.finalized_leaves) == 1

def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins,
                          constant_hessian, noise):
    rng = np.random.RandomState(seed=0)
    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    if noise:
        y_scale = y.std()
        y += rng.normal(scale=noise, size=n_samples) * y_scale

    mapper = _BinMapper(n_bins=n_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(G_H_DTYPE)
    shape_hessian = 1 if constant_hessian else all_gradients.shape
    all_hessians = np.ones(shape=shape_hessian, dtype=G_H_DTYPE)
    grower = TreeGrower(
        X,
        all_gradients,
        all_hessians,
        n_bins=n_bins,
        shrinkage=1.0,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=n_samples,
    )
    grower.grow()
    predictor = grower.make_predictor(binning_thresholds=mapper.bin_thresholds_)

    if n_samples >= min_samples_leaf:
        for node in predictor.nodes:
            if node["is_leaf"]:
                assert node["count"] >= min_samples_leaf
    else:
        assert predictor.nodes.shape[0] == 1
        assert predictor.nodes[0]["is_leaf"]
        assert predictor.nodes[0]["count"] == n_samples

def test_boston_dataset(n_bins):
    X, y = load_boston(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)

    mapper = _BinMapper(n_bins=n_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)

    # Init gradients and hessians to that of least squares loss
    gradients = -y_train.astype(G_H_DTYPE)
    hessians = np.ones(1, dtype=G_H_DTYPE)

    min_samples_leaf = 8
    max_leaf_nodes = 31
    grower = TreeGrower(X_train_binned, gradients, hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes, n_bins=n_bins,
                        n_bins_non_missing=mapper.n_bins_non_missing_)
    grower.grow()

    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    assert r2_score(y_train, predictor.predict(X_train)) > 0.85
    assert r2_score(y_test, predictor.predict(X_test)) > 0.70

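# Illustrative sketch, not part of the test suite: why the grower tests above
# initialize gradients to -y and hessians to 1. For the least squares loss
# L(y, p) = 0.5 * (y - p) ** 2 we have dL/dp = p - y and d2L/dp2 = 1, so at
# the initial prediction p = 0 the gradient is -y and the hessian is the
# constant 1 (stored as a size-1 array and broadcast by the grower).
def _demo_least_squares_gradients():
    y = np.array([1.0, -2.0, 0.5])
    p = np.zeros_like(y)        # initial raw prediction
    gradients = p - y           # equals -y at p = 0
    hessians = np.ones(1)       # constant hessian for least squares
    assert np.allclose(gradients, -y)
    assert hessians.shape == (1,)
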
def test_bin_mapper_random_data(n_bins):
    n_samples, n_features = DATA.shape

    expected_count_per_bin = n_samples // n_bins
    tol = int(0.05 * expected_count_per_bin)

    mapper = _BinMapper(max_bins=n_bins, random_state=42).fit(DATA)
    binned = mapper.transform(DATA)

    assert binned.shape == (n_samples, n_features)
    assert binned.dtype == np.uint8
    assert_array_equal(binned.min(axis=0), np.array([0, 0]))
    assert_array_equal(binned.max(axis=0),
                       np.array([n_bins - 1, n_bins - 1]))
    assert len(mapper.bin_thresholds_) == n_features
    for bin_thresholds_feature in mapper.bin_thresholds_:
        assert bin_thresholds_feature.shape == (n_bins - 1,)
        assert bin_thresholds_feature.dtype == DATA.dtype
    assert np.all(mapper.actual_n_bins_ == n_bins)

    # Check that the binned data is approximately balanced across bins.
    for feature_idx in range(n_features):
        for bin_idx in range(n_bins):
            count = (binned[:, feature_idx] == bin_idx).sum()
            assert abs(count - expected_count_per_bin) < tol

def test_crf_oob_normal(load_func):
    oob_n_estimators = 100
    X_train, y_train = load_func(return_X_y=True)

    binner = _BinMapper(random_state=random_state)
    X_train_binned = binner.fit_transform(X_train)

    # Ours
    model = ExtraTreesClassifier(n_estimators=oob_n_estimators,
                                 bootstrap=True,
                                 oob_score=True,
                                 random_state=random_state)
    model.fit(X_train_binned, y_train)
    actual_proba = model.oob_decision_function_

    # Sklearn
    model = sklearn_ExtraTreesClassifier(n_estimators=oob_n_estimators,
                                         bootstrap=True,
                                         oob_score=True,
                                         random_state=random_state)
    model.fit(X_train_binned, y_train)
    expected_proba = model.oob_decision_function_

    assert_array_equal(actual_proba, expected_proba)

def test_categorical_feature(n_bins):
    # Basic test for categorical features. We make sure that categories are
    # mapped into [0, n_categories - 1] and that nans are mapped to the last
    # bin.
    X = np.array([[4] * 500 + [1] * 3 + [10] * 4 + [0] * 4 + [13] + [7] * 5
                  + [np.nan] * 2], dtype=X_DTYPE).T
    known_categories = [np.unique(X[~np.isnan(X)])]

    bin_mapper = _BinMapper(n_bins=n_bins,
                            is_categorical=np.array([True]),
                            known_categories=known_categories).fit(X)
    assert bin_mapper.n_bins_non_missing_ == [6]
    assert_array_equal(bin_mapper.bin_thresholds_[0], [0, 1, 4, 7, 10, 13])

    X = np.array([[0, 1, 4, np.nan, 7, 10, 13]], dtype=X_DTYPE).T
    expected_trans = np.array([[0, 1, 2, n_bins - 1, 3, 4, 5]]).T
    assert_array_equal(bin_mapper.transform(X), expected_trans)

    # For unknown categories, the mapping is incorrect / undefined. This never
    # happens in practice. This check is only for illustration purposes.
    X = np.array([[-1, 100]], dtype=X_DTYPE).T
    expected_trans = np.array([[0, 6]]).T
    assert_array_equal(bin_mapper.transform(X), expected_trans)

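# Illustrative sketch, not part of the test suite: for categorical features,
# bin_thresholds_[f] stores the sorted known categories, and transform maps
# each category to its index in that array (with NaN sent to the last bin).
# The expected_trans values in test_categorical_feature follow from a simple
# sorted lookup over the known categories:
def _demo_categorical_bin_lookup():
    categories = np.array([0.0, 1.0, 4.0, 7.0, 10.0, 13.0])
    values = np.array([0.0, 1.0, 4.0, 7.0, 10.0, 13.0])
    bins = np.searchsorted(categories, values)  # index of each category
    np.testing.assert_array_equal(bins, [0, 1, 2, 3, 4, 5])
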
def test_regression_dataset(n_bins):
    X, y = make_regression(
        n_samples=500, n_features=10, n_informative=5, random_state=42
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    mapper = _BinMapper(n_bins=n_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)

    # Init gradients and hessians to that of least squares loss
    gradients = -y_train.astype(G_H_DTYPE)
    hessians = np.ones(1, dtype=G_H_DTYPE)

    min_samples_leaf = 10
    max_leaf_nodes = 30
    grower = TreeGrower(
        X_train_binned,
        gradients,
        hessians,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
        n_bins=n_bins,
        n_bins_non_missing=mapper.n_bins_non_missing_,
    )
    grower.grow()

    predictor = grower.make_predictor(binning_thresholds=mapper.bin_thresholds_)

    known_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)
    f_idx_map = np.zeros(0, dtype=np.uint32)

    y_pred_train = predictor.predict(X_train, known_cat_bitsets, f_idx_map,
                                     n_threads)
    assert r2_score(y_train, y_pred_train) > 0.82

    y_pred_test = predictor.predict(X_test, known_cat_bitsets, f_idx_map,
                                    n_threads)
    assert r2_score(y_test, y_pred_test) > 0.67

def test_bin_mapper_identity_small(max_bins, scale, offset):
    data = np.arange(max_bins).reshape(-1, 1) * scale + offset
    # max_bins is the number of bins for non-missing values
    n_bins = max_bins + 1
    binned = _BinMapper(n_bins=n_bins).fit_transform(data)
    assert_array_equal(binned, np.arange(max_bins).reshape(-1, 1))

import copy
import pytest

from deepforest._layer import Layer
from deepforest._estimator import Estimator

# Load utils
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.datasets import load_digits, load_boston
from sklearn.model_selection import train_test_split


# Load data and binning
X, y = load_digits(return_X_y=True)
binner = _BinMapper(random_state=142)
X_binned = binner.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_binned, y, test_size=0.42, random_state=42
)

# Parameters
layer_kwargs = {
    "layer_idx": 0,
    "n_classes": 10,
    "n_estimators": 1,
    "n_trees": 10,
    "max_depth": 3,
    "min_samples_leaf": 10,
    "partial_mode": False,
    "buffer": None,
    "n_jobs": -1,
}

def test_bin_mapper_identity_repeated_values(max_bins, n_distinct, multiplier):
    data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1)
    # max_bins is the number of bins for non-missing values
    n_bins = max_bins + 1
    binned = _BinMapper(n_bins=n_bins).fit_transform(data)
    assert_array_equal(data, binned)

def test_bin_mapper_identity_repeated_values(n_bins, n_distinct, multiplier):
    data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1)
    binned = _BinMapper(max_bins=n_bins).fit_transform(data)
    assert_array_equal(data, binned)

def test_bin_mapper_n_features_transform():
    mapper = _BinMapper(max_bins=42, random_state=42).fit(DATA)
    err_msg = 'This estimator was fitted with 2 features but 4 got passed'
    with pytest.raises(ValueError, match=err_msg):
        mapper.transform(np.repeat(DATA, 2, axis=1))

def test_bin_mapper_identity_small(n_bins, scale, offset):
    data = np.arange(n_bins).reshape(-1, 1) * scale + offset
    binned = _BinMapper(max_bins=n_bins).fit_transform(data)
    assert_array_equal(binned, np.arange(n_bins).reshape(-1, 1))

def test_invalid_n_bins(n_bins):
    err_msg = (
        "n_bins={} should be no smaller than 3 and no larger than 256"
        .format(n_bins)
    )
    with pytest.raises(ValueError, match=err_msg):
        _BinMapper(n_bins=n_bins).fit(DATA)

def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
                                     max_leaf_nodes):
    # Make sure sklearn has the same predictions as lightgbm for easy targets.
    #
    # In particular when the sizes of the trees are bounded and the number of
    # samples is large enough, the structure of the prediction trees found by
    # LightGBM and sklearn should be exactly identical.
    #
    # Notes:
    # - Several candidate splits may have equal gains when the number of
    #   samples in a node is low (and because of float errors). Therefore the
    #   predictions on the test set might differ if the structure of the tree
    #   is not exactly the same. To avoid this issue we only compare the
    #   predictions on the test set when the number of samples is large enough
    #   and max_leaf_nodes is low enough.
    # - To ignore discrepancies caused by small differences in the binning
    #   strategy, data is pre-binned if n_samples > 255.
    # - We don't check the absolute_error loss here. This is because
    #   LightGBM's computation of the median (used for the initial value of
    #   raw_prediction) is a bit off (they'll e.g. return midpoints when there
    #   is no need to). Since these tests only run 1 iteration, the
    #   discrepancy between the initial values leads to biggish differences in
    #   the predictions. These differences are much smaller with more
    #   iterations.
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255

    X, y = make_regression(n_samples=n_samples, n_features=5,
                           n_informative=5, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingRegressor(
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    # less than 1% of the predictions are different up to the 3rd decimal
    assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-3) < 0.011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        # less than 1% of the predictions are different up to the 4th decimal
        assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-4) < 0.01

def test_bin_mapper_n_features_transform():
    mapper = _BinMapper(n_bins=42, random_state=42).fit(DATA)
    err_msg = "This estimator was fitted with 2 features but 4 got passed"
    with pytest.raises(ValueError, match=err_msg):
        mapper.transform(np.repeat(DATA, 2, axis=1))

def test_same_predictions_multiclass_classification(seed, min_samples_leaf,
                                                    n_samples, max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255
    lr = 1

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=3,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        n_clusters_per_class=1,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss="categorical_crossentropy",
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=lr,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_sklearn = est_sklearn.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

        # compare probabilities on the test set here (the train set was
        # already checked above)
        proba_lightgbm = est_lightgbm.predict_proba(X_test)
        proba_sklearn = est_sklearn.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)