from sklearn.datasets import make_classification
from sklearn.ensemble import HistGradientBoostingClassifier


def test_early_stopping_on_test_set_with_warm_start():
    # Non-regression test for #16661, where the second fit failed when
    # warm_start=True, early stopping is on, and there is no validation set
    X, y = make_classification(random_state=0)
    gb = HistGradientBoostingClassifier(
        max_iter=1, scoring='loss', warm_start=True, early_stopping=True,
        n_iter_no_change=1, validation_fraction=None)

    gb.fit(X, y)
    # does not raise on second call
    gb.set_params(max_iter=2)
    gb.fit(X, y)
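
For context, here is a minimal sketch (assuming a recent scikit-learn release) of the more common early-stopping setup, where the stopping criterion is scored on a held-out validation fraction instead of on the training set:

from sklearn.datasets import make_classification
from sklearn.ensemble import HistGradientBoostingClassifier

X, y = make_classification(random_state=0)
gb_val = HistGradientBoostingClassifier(
    max_iter=100, scoring='loss', early_stopping=True,
    n_iter_no_change=5, validation_fraction=0.1, random_state=0)
gb_val.fit(X, y)
# n_iter_ can be smaller than max_iter if the validation loss stops improving
print(gb_val.n_iter_)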
Example #2
import numpy as np
from numpy.testing import assert_array_equal

from sklearn.compose import make_column_transformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder


def test_categorical_encoding_strategies():
    # Check native categorical handling vs different encoding strategies. We
    # make sure that native encoding needs only 1 split to achieve a perfect
    # prediction on a simple dataset. In contrast, OneHotEncoded data needs
    # more depth / splits, and treating categories as ordered (just using
    # OrdinalEncoder) requires even more depth.

    # dataset with one random continuous feature, and one categorical feature
    # with values in [0, 5], e.g. from an OrdinalEncoder.
    # class == 1 iff categorical value in {0, 2, 4}
    rng = np.random.RandomState(0)
    n_samples = 10_000
    f1 = rng.rand(n_samples)
    f2 = rng.randint(6, size=n_samples)
    X = np.c_[f1, f2]
    y = np.zeros(shape=n_samples)
    y[X[:, 1] % 2 == 0] = 1

    # make sure dataset is balanced so that the baseline_prediction doesn't
    # influence predictions too much with max_iter = 1
    assert 0.49 < y.mean() < 0.51

    clf_cat = HistGradientBoostingClassifier(
        max_iter=1, max_depth=1, categorical_features=[False, True]
    )

    # Using native categorical encoding, we get perfect predictions with just
    # one split
    assert cross_val_score(clf_cat, X, y).mean() == 1

    # quick sanity check for the bitset: 0, 2, 4 = 2**0 + 2**2 + 2**4 = 21
    expected_left_bitset = [21, 0, 0, 0, 0, 0, 0, 0]
    left_bitset = clf_cat.fit(X, y)._predictors[0][0].raw_left_cat_bitsets[0]
    assert_array_equal(left_bitset, expected_left_bitset)

    # Treating categories as ordered, we need more depth / more splits to get
    # the same predictions
    clf_no_cat = HistGradientBoostingClassifier(
        max_iter=1, max_depth=4, categorical_features=None
    )
    assert cross_val_score(clf_no_cat, X, y).mean() < 0.9

    clf_no_cat.set_params(max_depth=5)
    assert cross_val_score(clf_no_cat, X, y).mean() == 1

    # Using one-hot encoded data, we need fewer splits than with purely
    # ordinal-encoded data, but we still need more splits than with the
    # native categorical splits
    ct = make_column_transformer(
        (OneHotEncoder(sparse=False), [1]), remainder="passthrough"
    )
    X_ohe = ct.fit_transform(X)
    clf_no_cat.set_params(max_depth=2)
    assert cross_val_score(clf_no_cat, X_ohe, y).mean() < 0.9

    clf_no_cat.set_params(max_depth=3)
    assert cross_val_score(clf_no_cat, X_ohe, y).mean() == 1
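
A standalone arithmetic check (not part of the test above) of where the expected bitset value comes from: categories 0, 2 and 4 going to the left child correspond to bits 0, 2 and 4 being set in the first 32-bit word of the bitset.

# 2**0 + 2**2 + 2**4 = 1 + 4 + 16 = 21, i.e. binary 10101
assert sum(2 ** c for c in (0, 2, 4)) == 21
assert bin(21) == '0b10101'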
Example #3
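# Excerpt from a benchmark script comparing native categorical support across
# libraries: X, y, lr, n_trees, max_bins, n_leaf_nodes, verbose, n_classes,
# the feature counts, the fit/predict timing helpers and the parsed `args`
# are defined earlier in the full script.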
print(f"Number of features: {n_features}")
print(f"Number of categorical features: {n_categorical_features}")
print(f"Number of numerical features: {n_numerical_features}")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

is_categorical = [True] * n_categorical_features + [False] * n_numerical_features
est = HistGradientBoostingClassifier(
    loss="log_loss",
    learning_rate=lr,
    max_iter=n_trees,
    max_bins=max_bins,
    max_leaf_nodes=n_leaf_nodes,
    categorical_features=is_categorical,
    early_stopping=False,
    random_state=0,
    verbose=verbose,
)

fit(est, X_train, y_train, "sklearn")
predict(est, X_test, y_test)

if args.lightgbm:
    est = get_equivalent_estimator(est, lib="lightgbm", n_classes=n_classes)
    est.set_params(max_cat_to_onehot=1)  # don't use OHE
    categorical_features = [
        f_idx for (f_idx, is_cat) in enumerate(is_categorical) if is_cat
    ]
    fit(est, X_train, y_train, "lightgbm", categorical_feature=categorical_features)
    predict(est, X_test, y_test)
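
The `fit` and `predict` calls above refer to timing helpers defined elsewhere in the benchmark script. A minimal sketch of what such helpers might look like (hypothetical stand-ins, not the benchmark's actual code):

from time import time
from sklearn.metrics import accuracy_score

def fit(est, X_train, y_train, libname, **fit_params):
    # Fit the estimator and report the wall-clock training time.
    tic = time()
    est.fit(X_train, y_train, **fit_params)
    print(f"fit with {libname} took {time() - tic:.3f}s")

def predict(est, X_test, y_test):
    # Score on the held-out set and report prediction time and accuracy.
    tic = time()
    predicted = est.predict(X_test)
    print(f"predict took {time() - tic:.3f}s, "
          f"accuracy: {accuracy_score(y_test, predicted):.4f}")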