def test_early_stopping_on_test_set_with_warm_start():
    # Non regression test for #16661 where second fit fails with
    # warm_start=True, early_stopping is on, and no validation set
    X, y = make_classification(random_state=0)
    gb = HistGradientBoostingClassifier(
        max_iter=1, scoring='loss', warm_start=True, early_stopping=True,
        n_iter_no_change=1, validation_fraction=None)

    gb.fit(X, y)
    # does not raise on second call
    gb.set_params(max_iter=2)
    gb.fit(X, y)
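# Illustrative sketch (not part of the test suite): the warm-start pattern the
# regression test above exercises. With warm_start=True, a second call to
# fit() after raising max_iter resumes boosting from the existing ensemble
# instead of retraining from scratch. The helper name and parameter values
# below are illustrative assumptions, not fixtures from this file.
def _warm_start_usage_sketch():
    X, y = make_classification(random_state=0)
    gb = HistGradientBoostingClassifier(
        max_iter=5, warm_start=True, early_stopping=True, scoring='loss',
        n_iter_no_change=5, validation_fraction=None, random_state=0)
    gb.fit(X, y)                   # trains up to 5 iterations
    n_first = gb.n_iter_
    gb.set_params(max_iter=10)
    gb.fit(X, y)                   # resumes, adding up to 5 more iterations
    assert gb.n_iter_ >= n_first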
def test_categorical_encoding_strategies():
    # Check native categorical handling vs different encoding strategies. We
    # make sure that native encoding needs only 1 split to achieve a perfect
    # prediction on a simple dataset. In contrast, OneHotEncoded data needs
    # more depth / splits, and treating categories as ordered (just using
    # OrdinalEncoder) requires even more depth.

    # dataset with one random continuous feature, and one categorical feature
    # with values in [0, 5], e.g. from an OrdinalEncoder.
    # class == 1 iff categorical value in {0, 2, 4}
    rng = np.random.RandomState(0)
    n_samples = 10_000
    f1 = rng.rand(n_samples)
    f2 = rng.randint(6, size=n_samples)
    X = np.c_[f1, f2]
    y = np.zeros(shape=n_samples)
    y[X[:, 1] % 2 == 0] = 1

    # make sure dataset is balanced so that the baseline_prediction doesn't
    # influence predictions too much with max_iter = 1
    assert 0.49 < y.mean() < 0.51

    clf_cat = HistGradientBoostingClassifier(
        max_iter=1, max_depth=1, categorical_features=[False, True]
    )

    # Using native categorical encoding, we get perfect predictions with just
    # one split
    assert cross_val_score(clf_cat, X, y).mean() == 1

    # quick sanity check for the bitset: 0, 2, 4 = 2**0 + 2**2 + 2**4 = 21
    expected_left_bitset = [21, 0, 0, 0, 0, 0, 0, 0]
    left_bitset = clf_cat.fit(X, y)._predictors[0][0].raw_left_cat_bitsets[0]
    assert_array_equal(left_bitset, expected_left_bitset)

    # Treating categories as ordered, we need more depth / more splits to get
    # the same predictions
    clf_no_cat = HistGradientBoostingClassifier(
        max_iter=1, max_depth=4, categorical_features=None
    )
    assert cross_val_score(clf_no_cat, X, y).mean() < 0.9

    clf_no_cat.set_params(max_depth=5)
    assert cross_val_score(clf_no_cat, X, y).mean() == 1

    # Using OHEd data, we need fewer splits than with pure OEd data, but we
    # still need more splits than with the native categorical splits
    ct = make_column_transformer(
        (OneHotEncoder(sparse=False), [1]), remainder="passthrough"
    )
    X_ohe = ct.fit_transform(X)
    clf_no_cat.set_params(max_depth=2)
    assert cross_val_score(clf_no_cat, X_ohe, y).mean() < 0.9

    clf_no_cat.set_params(max_depth=3)
    assert cross_val_score(clf_no_cat, X_ohe, y).mean() == 1
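# Illustrative sketch (not part of the test suite): the minimal way to declare
# the second column as categorical via categorical_features, which is what
# lets a single split send categories {0, 2, 4} one way and {1, 3, 5} the
# other in the test above. The helper name and toy data are illustrative
# assumptions, not fixtures from this file; the data generation mirrors the
# test so that the perfect-fit claim holds.
def _native_categorical_usage_sketch():
    rng = np.random.RandomState(0)
    n_samples = 10_000
    X = np.c_[rng.rand(n_samples), rng.randint(6, size=n_samples)]
    y = (X[:, 1] % 2 == 0).astype(np.float64)

    clf = HistGradientBoostingClassifier(
        max_iter=1, max_depth=1, categorical_features=[False, True],
        random_state=0)
    clf.fit(X, y)
    # a single split on the categorical bitset is enough for a perfect fit
    assert clf.score(X, y) == 1.0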
print(f"Number of features: {n_features}") print(f"Number of categorical features: {n_categorical_features}") print(f"Number of numerical features: {n_numerical_features}") X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) is_categorical = [True] * n_categorical_features + [False] * n_numerical_features est = HistGradientBoostingClassifier( loss="log_loss", learning_rate=lr, max_iter=n_trees, max_bins=max_bins, max_leaf_nodes=n_leaf_nodes, categorical_features=is_categorical, early_stopping=False, random_state=0, verbose=verbose, ) fit(est, X_train, y_train, "sklearn") predict(est, X_test, y_test) if args.lightgbm: est = get_equivalent_estimator(est, lib="lightgbm", n_classes=n_classes) est.set_params(max_cat_to_onehot=1) # dont use OHE categorical_features = [ f_idx for (f_idx, is_cat) in enumerate(is_categorical) if is_cat ] fit(est, X_train, y_train, "lightgbm", categorical_feature=categorical_features) predict(est, X_test, y_test)