def test_warm_start_smaller_n_estimators(): # Test if warm start'ed second fit with smaller n_estimators raises error. X, y = make_hastie_10_2(n_samples=20, random_state=1) clf = BaggingClassifier(n_estimators=5, warm_start=True) clf.fit(X, y) clf.set_params(n_estimators=4) assert_raises(ValueError, clf.fit, X, y)
def test_bagging_classifier_with_missing_inputs(): # Check that BaggingClassifier can accept X with missing/infinite data X = np.array([ [1, 3, 5], [2, None, 6], [2, np.nan, 6], [2, np.inf, 6], [2, np.NINF, 6], ]) y = np.array([3, 6, 6, 6, 6]) classifier = DecisionTreeClassifier() pipeline = make_pipeline(FunctionTransformer(replace), classifier) pipeline.fit(X, y).predict(X) bagging_classifier = BaggingClassifier(pipeline) bagging_classifier.fit(X, y) y_hat = bagging_classifier.predict(X) assert y.shape == y_hat.shape bagging_classifier.predict_log_proba(X) bagging_classifier.predict_proba(X) # Verify that exceptions can be raised by wrapper classifier classifier = DecisionTreeClassifier() pipeline = make_pipeline(classifier) assert_raises(ValueError, pipeline.fit, X, y) bagging_classifier = BaggingClassifier(pipeline) assert_raises(ValueError, bagging_classifier.fit, X, y)
def test_estimators_samples_deterministic(): # This test is a regression test to check that with a random step # (e.g. SparseRandomProjection) and a given random state, the results # generated at fit time can be identically reproduced at a later time using # data saved in object attributes. Check issue #9524 for full discussion. iris = load_iris() X, y = iris.data, iris.target base_pipeline = make_pipeline(SparseRandomProjection(n_components=2), LogisticRegression()) clf = BaggingClassifier(base_estimator=base_pipeline, max_samples=0.5, random_state=0) clf.fit(X, y) pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy() estimator = clf.estimators_[0] estimator_sample = clf.estimators_samples_[0] estimator_feature = clf.estimators_features_[0] X_train = (X[estimator_sample])[:, estimator_feature] y_train = y[estimator_sample] estimator.fit(X_train, y_train) assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef)
def test_base(): # Check BaseEnsemble methods. ensemble = BaggingClassifier( base_estimator=Perceptron(random_state=None), n_estimators=3) iris = load_iris() ensemble.fit(iris.data, iris.target) ensemble.estimators_ = [] # empty the list and create estimators manually ensemble._make_estimator() random_state = np.random.RandomState(3) ensemble._make_estimator(random_state=random_state) ensemble._make_estimator(random_state=random_state) ensemble._make_estimator(append=False) assert 3 == len(ensemble) assert 3 == len(ensemble.estimators_) assert isinstance(ensemble[0], Perceptron) assert ensemble[0].random_state is None assert isinstance(ensemble[1].random_state, int) assert isinstance(ensemble[2].random_state, int) assert ensemble[1].random_state != ensemble[2].random_state np_int_ensemble = BaggingClassifier(base_estimator=Perceptron(), n_estimators=np.int32(3)) np_int_ensemble.fit(iris.data, iris.target)
def test_bagging_small_max_features(): # Check that Bagging estimator can accept low fractional max_features X = np.array([[1, 2], [3, 4]]) y = np.array([1, 0]) bagging = BaggingClassifier(LogisticRegression(), max_features=0.3, random_state=1) bagging.fit(X, y)
def test_oob_score_consistency(): # Make sure OOB scores are identical when random_state, estimator, and # training data are fixed and fitting is done twice X, y = make_hastie_10_2(n_samples=200, random_state=1) bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5, oob_score=True, random_state=1) assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_
def test_oob_score_removed_on_warm_start(): X, y = make_hastie_10_2(n_samples=2000, random_state=1) clf = BaggingClassifier(n_estimators=50, oob_score=True) clf.fit(X, y) clf.set_params(warm_start=True, oob_score=False, n_estimators=100) clf.fit(X, y) assert_raises(AttributeError, getattr, clf, "oob_score_")
def test_bagging_sample_weight_unsupported_but_passed(): estimator = BaggingClassifier(DummyZeroEstimator()) rng = check_random_state(0) estimator.fit(iris.data, iris.target).predict(iris.data) assert_raises(ValueError, estimator.fit, iris.data, iris.target, sample_weight=rng.randint(10, size=(iris.data.shape[0])))
def test_max_samples_consistency(): # Make sure validated max_samples and original max_samples are identical # when valid integer max_samples supplied by user max_samples = 100 X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1) bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=max_samples, max_features=0.5, random_state=1) bagging.fit(X, y) assert bagging._max_samples == max_samples
def test_warm_start_equal_n_estimators(): # Test that nothing happens when fitting without increasing n_estimators X, y = make_hastie_10_2(n_samples=20, random_state=1) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=83) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) # modify X to nonsense values, this should not change anything X_train += 1. assert_warns_message( UserWarning, "Warm-start fitting without increasing n_estimators does not", clf.fit, X_train, y_train) assert_array_equal(y_pred, clf.predict(X_test))
def test_warm_start(random_state=42): # Test if fitting incrementally with warm start gives a forest of the # right size and the same results as a normal fit. X, y = make_hastie_10_2(n_samples=20, random_state=1) clf_ws = None for n_estimators in [5, 10]: if clf_ws is None: clf_ws = BaggingClassifier(n_estimators=n_estimators, random_state=random_state, warm_start=True) else: clf_ws.set_params(n_estimators=n_estimators) clf_ws.fit(X, y) assert len(clf_ws) == n_estimators clf_no_ws = BaggingClassifier(n_estimators=10, random_state=random_state, warm_start=False) clf_no_ws.fit(X, y) assert (set([tree.random_state for tree in clf_ws ]) == set([tree.random_state for tree in clf_no_ws]))
def test_estimators_samples(): # Check that format of estimators_samples_ is correct and that results # generated at fit time can be identically reproduced at a later time # using data saved in object attributes. X, y = make_hastie_10_2(n_samples=200, random_state=1) bagging = BaggingClassifier(LogisticRegression(), max_samples=0.5, max_features=0.5, random_state=1, bootstrap=False) bagging.fit(X, y) # Get relevant attributes estimators_samples = bagging.estimators_samples_ estimators_features = bagging.estimators_features_ estimators = bagging.estimators_ # Test for correct formatting assert len(estimators_samples) == len(estimators) assert len(estimators_samples[0]) == len(X) // 2 assert estimators_samples[0].dtype.kind == 'i' # Re-fit single estimator to test for consistent sampling estimator_index = 0 estimator_samples = estimators_samples[estimator_index] estimator_features = estimators_features[estimator_index] estimator = estimators[estimator_index] X_train = (X[estimator_samples])[:, estimator_features] y_train = y[estimator_samples] orig_coefs = estimator.coef_ estimator.fit(X_train, y_train) new_coefs = estimator.coef_ assert_array_almost_equal(orig_coefs, new_coefs)
def test_warm_start_equivalence(): # warm started classifier with 5+5 estimators should be equivalent to # one classifier with 10 estimators X, y = make_hastie_10_2(n_samples=20, random_state=1) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) clf_ws = BaggingClassifier(n_estimators=5, warm_start=True, random_state=3141) clf_ws.fit(X_train, y_train) clf_ws.set_params(n_estimators=10) clf_ws.fit(X_train, y_train) y1 = clf_ws.predict(X_test) clf = BaggingClassifier(n_estimators=10, warm_start=False, random_state=3141) clf.fit(X_train, y_train) y2 = clf.predict(X_test) assert_array_almost_equal(y1, y2)
def test_bagging_with_pipeline(): estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1), DecisionTreeClassifier()), max_features=2) estimator.fit(iris.data, iris.target) assert isinstance(estimator[0].steps[-1][1].random_state, int)