def test_subsample_heuristic(self):
        """Test subsample=auto heuristic"""
        # With only 112 rows the heuristic keeps the full sample.
        task = RuleFitC('s=auto')
        X, y = datasets.make_hastie_10_2(n_samples=112)
        task._modify_parameters(X, y)
        self.assertEqual(task.parameters['subsample'], 1.0)

        # With 500 rows it subsamples roughly half of them.
        task = RuleFitC('s=auto')
        X, y = datasets.make_hastie_10_2(n_samples=500)
        task._modify_parameters(X, y)
        self.assertEqual(round(task.parameters['subsample'], 2), 0.49)
Example #2
    def test_clf_early_stop_gridsearch_weights(self, mocklogloss):
        """Test that clf passes weights to the loss function when early stopping is in effect during gridsearch."""
        def weight_loss(actual, pred, weights):
            print("Test")
            # Raising here signals to the surrounding test that the expected
            # weights actually reached the loss function.
            if np.all(weights[actual == 1] == 10.) and \
               np.all(weights[actual == -1] == 1.):
                raise ValueError("Weights passed successfully")
            else:
                assert False
                return np.sum(pred) - 50.0
        mocklogloss.method = weight_loss
        x, Y = make_hastie_10_2(n_samples=300, random_state=41)
        X = Container()
        X.add(x)
        Z = Partition(X.shape[0], max_reps=2, max_folds=0)
        Z.set(max_reps=1, max_folds=1)
        wt = {'weight': pd.Series(2.0 + 9.0 * (Y == 1).astype(float))}

        # Add weights to container
        X.initialize(wt)
        task = ESGBC('s=1;n=10;md=[2];ls=1;lr=[0.1, 0.000001];t_m=Weighted LogLoss')

        task.fit(X, Y, Z)
        # Assert the patched loss function was passed the weights
        self.assertTrue(mocklogloss.called)
        # The third argument is weight, we should be passed two values
        passed_weights = mocklogloss.call_args[0][2]
        passed_actuals = mocklogloss.call_args[0][0]
        self.assertEqual(len(np.unique(passed_weights)), 2)
        print(passed_weights)
        self.assertTrue(np.all(passed_weights[passed_actuals == -1] == 2))
        self.assertTrue(np.all(passed_weights[passed_actuals == 1] == 11))
def test_staged_predict_proba():
    # Test whether staged predict proba eventually gives
    # the same prediction.
    X, y = datasets.make_hastie_10_2(n_samples=1200,
                                     random_state=1)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingClassifier(n_estimators=20)
    # test raise NotFittedError if not fitted
    assert_raises(NotFittedError, lambda X: np.fromiter(
        clf.staged_predict_proba(X), dtype=np.float64), X_test)

    clf.fit(X_train, y_train)

    # test if prediction for last stage equals ``predict``
    for y_pred in clf.staged_predict(X_test):
        assert_equal(y_test.shape, y_pred.shape)

    assert_array_equal(clf.predict(X_test), y_pred)

    # test if prediction for last stage equals ``predict_proba``
    for staged_proba in clf.staged_predict_proba(X_test):
        assert_equal(y_test.shape[0], staged_proba.shape[0])
        assert_equal(2, staged_proba.shape[1])

    assert_array_almost_equal(clf.predict_proba(X_test), staged_proba)
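As a standalone illustration of the API the test above exercises (not part of the original suite; all names below are illustrative), staged_predict_proba lets you score every intermediate model in a single pass:

import numpy as np
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(n_samples=1200, random_state=1)
X_train, y_train = X[:200], y[:200]
X_test, y_test = X[200:], y[200:]

clf = GradientBoostingClassifier(n_estimators=20).fit(X_train, y_train)

# staged_predict_proba yields one (n_samples, 2) array per boosting stage,
# so per-stage test accuracy needs no refitting.
for stage, proba in enumerate(clf.staged_predict_proba(X_test), start=1):
    accuracy = np.mean((proba[:, 1] > 0.5) == (y_test == 1))
    print("stage {}: accuracy {:.3f}".format(stage, accuracy))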
Example #4
def check_warm_start_oob(name):
    # Test that the warm start computes oob score when asked.
    X, y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
    ForestEstimator = FOREST_ESTIMATORS[name]
    # Use 15 estimators to avoid 'some inputs do not have OOB scores' warning.
    clf = ForestEstimator(n_estimators=15, max_depth=3, warm_start=False,
                          random_state=1, bootstrap=True, oob_score=True)
    clf.fit(X, y)

    clf_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=False,
                            random_state=1, bootstrap=True, oob_score=False)
    clf_2.fit(X, y)

    clf_2.set_params(warm_start=True, oob_score=True, n_estimators=15)
    clf_2.fit(X, y)

    assert_true(hasattr(clf_2, 'oob_score_'))
    assert_equal(clf.oob_score_, clf_2.oob_score_)

    # Test that oob_score is computed even if we don't need to train
    # additional trees.
    clf_3 = ForestEstimator(n_estimators=15, max_depth=3, warm_start=True,
                            random_state=1, bootstrap=True, oob_score=False)
    clf_3.fit(X, y)
    assert_true(not(hasattr(clf_3, 'oob_score_')))

    clf_3.set_params(oob_score=True)
    ignore_warnings(clf_3.fit)(X, y)

    assert_equal(clf.oob_score_, clf_3.oob_score_)
def test_warm_start_smaller_n_estimators(Cls):
    # Test if warm start with smaller n_estimators raises error
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est.fit(X, y)
    est.set_params(n_estimators=99)
    assert_raises(ValueError, est.fit, X, y)
Example #6
def check_warm_start(name, random_state=42):
    # Test if fitting incrementally with warm start gives a forest of the
    # right size and the same results as a normal fit.
    X, y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
    ForestEstimator = FOREST_ESTIMATORS[name]
    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = ForestEstimator(n_estimators=n_estimators,
                                     random_state=random_state,
                                     warm_start=True)
        else:
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert_equal(len(clf_ws), n_estimators)

    clf_no_ws = ForestEstimator(n_estimators=10, random_state=random_state,
                                warm_start=False)
    clf_no_ws.fit(X, y)

    assert_equal(set([tree.random_state for tree in clf_ws]),
                 set([tree.random_state for tree in clf_no_ws]))

    assert_array_equal(clf_ws.apply(X), clf_no_ws.apply(X),
                       err_msg="Failed with {0}".format(name))
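The warm-start idiom that check_warm_start verifies can be shown standalone on a public estimator; a minimal sketch (not taken from the test suite):

from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import RandomForestClassifier

X, y = make_hastie_10_2(n_samples=100, random_state=1)

# Fit 5 trees, then grow the same forest to 10 instead of refitting from scratch.
clf = RandomForestClassifier(n_estimators=5, warm_start=True, random_state=42)
clf.fit(X, y)
clf.set_params(n_estimators=10)
clf.fit(X, y)  # trains only the 5 new trees
assert len(clf.estimators_) == 10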
def test_warm_start_smaller_n_estimators():
    # Test if a warm-started second fit with smaller n_estimators raises an error.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BaggingClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    assert_raises(ValueError, clf.fit, X, y)
def test_classification_synthetic():
    # Test GradientBoostingClassifier on synthetic dataset used by
    # Hastie et al. in ESLII Example 12.7.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    for loss in ('deviance', 'exponential'):

        gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=2,
                                          max_depth=1, loss=loss,
                                          learning_rate=1.0, random_state=0)
        gbrt.fit(X_train, y_train)
        error_rate = (1.0 - gbrt.score(X_test, y_test))
        assert error_rate < 0.09, \
            "GB(loss={}) failed with error {}".format(loss, error_rate)

        gbrt = GradientBoostingClassifier(n_estimators=200, min_samples_split=2,
                                          max_depth=1, loss=loss,
                                          learning_rate=1.0, subsample=0.5,
                                          random_state=0)
        gbrt.fit(X_train, y_train)
        error_rate = (1.0 - gbrt.score(X_test, y_test))
        assert error_rate < 0.08, ("Stochastic GradientBoostingClassifier(loss={}) "
                                   "failed with error {}".format(loss, error_rate))
def test_warm_start_sparse(Cls):
    # Test that all sparse matrix types are supported
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    sparse_matrix_type = [csr_matrix, csc_matrix, coo_matrix]
    est_dense = Cls(n_estimators=100, max_depth=1, subsample=0.5,
                    random_state=1, warm_start=True)
    est_dense.fit(X, y)
    est_dense.predict(X)
    est_dense.set_params(n_estimators=200)
    est_dense.fit(X, y)
    y_pred_dense = est_dense.predict(X)

    for sparse_constructor in sparse_matrix_type:
        X_sparse = sparse_constructor(X)

        est_sparse = Cls(n_estimators=100, max_depth=1, subsample=0.5,
                         random_state=1, warm_start=True)
        est_sparse.fit(X_sparse, y)
        est_sparse.predict(X)
        est_sparse.set_params(n_estimators=200)
        est_sparse.fit(X_sparse, y)
        y_pred_sparse = est_sparse.predict(X)

        assert_array_almost_equal(est_dense.oob_improvement_[:100],
                                  est_sparse.oob_improvement_[:100])
        assert_array_almost_equal(y_pred_dense, y_pred_sparse)
 def test_most_freq_clf_proba(self):
     X, y = datasets.make_hastie_10_2(random_state=13, n_samples=100)
     prior_pos = (y == 1).mean()
     clf = _DummyClassifier(strategy='most_frequent').fit(X, y)
     proba = clf.predict_proba(X)
     np.testing.assert_array_equal(proba[:, 1], np.ones(X.shape[0]) * prior_pos)
     np.testing.assert_array_equal(proba[:, 0], np.ones(X.shape[0]) * (1 - prior_pos))
def test_monitor_early_stopping(Cls):
    # Test if monitor return value works.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)

    est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5)
    est.fit(X, y, monitor=early_stopping_monitor)
    assert_equal(est.n_estimators, 20)  # this is not altered
    assert_equal(est.estimators_.shape[0], 10)
    assert_equal(est.train_score_.shape[0], 10)
    assert_equal(est.oob_improvement_.shape[0], 10)

    # try refit
    est.set_params(n_estimators=30)
    est.fit(X, y)
    assert_equal(est.n_estimators, 30)
    assert_equal(est.estimators_.shape[0], 30)
    assert_equal(est.train_score_.shape[0], 30)

    est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5,
              warm_start=True)
    est.fit(X, y, monitor=early_stopping_monitor)
    assert_equal(est.n_estimators, 20)
    assert_equal(est.estimators_.shape[0], 10)
    assert_equal(est.train_score_.shape[0], 10)
    assert_equal(est.oob_improvement_.shape[0], 10)

    # try refit
    est.set_params(n_estimators=30, warm_start=False)
    est.fit(X, y)
    assert_equal(est.n_estimators, 30)
    assert_equal(est.train_score_.shape[0], 30)
    assert_equal(est.estimators_.shape[0], 30)
    assert_equal(est.oob_improvement_.shape[0], 30)
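The early_stopping_monitor referenced above is not shown in this listing; a definition consistent with the assertions (fitting stops after 10 stages) is essentially the one used in scikit-learn's own tests:

def early_stopping_monitor(i, est, locals):
    """Return True on the 10th iteration to stop fitting early."""
    return i == 9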
def test_max_feature_auto():
    # Test if max features is set properly for floats and str.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    _, n_features = X.shape

    X_train = X[:2000]
    y_train = y[:2000]

    gbrt = GradientBoostingClassifier(n_estimators=1, max_features='auto')
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, int(np.sqrt(n_features)))

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features='auto')
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, n_features)

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features=0.3)
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, int(n_features * 0.3))

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features='sqrt')
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, int(np.sqrt(n_features)))

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features='log2')
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, int(np.log2(n_features)))

    gbrt = GradientBoostingRegressor(n_estimators=1,
                                     max_features=0.01 / X.shape[1])
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, 1)
Example #13
def test_estimators_samples():
    # Check that format of estimators_samples_ is correct and that results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(LogisticRegression(), max_samples=0.5,
                                max_features=0.5, random_state=1,
                                bootstrap=False)
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert_equal(len(estimators_samples), len(estimators))
    assert_equal(len(estimators_samples[0]), len(X) // 2)
    assert_equal(estimators_samples[0].dtype.kind, 'i')

    # Re-fit single estimator to test for consistent sampling
    estimator_index = 0
    estimator_samples = estimators_samples[estimator_index]
    estimator_features = estimators_features[estimator_index]
    estimator = estimators[estimator_index]

    X_train = (X[estimator_samples])[:, estimator_features]
    y_train = y[estimator_samples]

    orig_coefs = estimator.coef_
    estimator.fit(X_train, y_train)
    new_coefs = estimator.coef_

    assert_array_almost_equal(orig_coefs, new_coefs)
def test_warm_start_smaller_n_estimators():
    # Test if a warm-started second fit with smaller n_estimators raises an error.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = EasyEnsembleClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    with pytest.raises(ValueError):
        clf.fit(X, y)
Example #15
def test_max_leaf_nodes_max_depth():
    """Test preceedence of max_leaf_nodes over max_depth. """
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    k = 4
    for name, TreeEstimator in ALL_TREES.items():
        est = TreeEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y)
        tree = est.tree_
        assert_greater(tree.max_depth, 1)
def test_warm_start_smaller_n_estimators():
    # Test if warm start with smaller n_estimators raises error
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]:
        est = Cls(n_estimators=100, max_depth=1, warm_start=True)
        est.fit(X, y)
        est.set_params(n_estimators=99)
        assert_raises(ValueError, est.fit, X, y)
Example #17
def test_max_samples_consistency():
    # Make sure validated max_samples and original max_samples are identical
    # when valid integer max_samples supplied by user
    max_samples = 100
    X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1)
    bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=max_samples,
                                max_features=0.5, random_state=1)
    bagging.fit(X, y)
    assert_equal(bagging._max_samples, max_samples)
Example #18
def test_oob_score_consistency():
    # Make sure OOB scores are identical when random_state, estimator, and
    # training data are fixed and fitting is done twice
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
                                max_features=0.5, oob_score=True,
                                random_state=1)
    assert_equal(bagging.fit(X, y).oob_score_, bagging.fit(X, y).oob_score_)
Example #19
def check_warm_start_smaller_n_estimators(name):
    # Test if warm start second fit with smaller n_estimators raises error.
    X, y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
    ForestEstimator = FOREST_ESTIMATORS[name]
    clf = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    assert_raises(ValueError, clf.fit, X, y)
def test_min_impurity_decrease(GBEstimator):
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)

    est = GBEstimator(min_impurity_decrease=0.1)
    est.fit(X, y)
    for tree in est.estimators_.flat:
        # Simply check if the parameter is passed on correctly. Tree tests
        # will suffice for the actual working of this param
        assert_equal(tree.min_impurity_decrease, 0.1)
Example #21
def test_oob_score_removed_on_warm_start():
    X, y = make_hastie_10_2(n_samples=2000, random_state=1)

    clf = BaggingClassifier(n_estimators=50, oob_score=True)
    clf.fit(X, y)

    clf.set_params(warm_start=True, oob_score=False, n_estimators=100)
    clf.fit(X, y)

    assert_raises(AttributeError, getattr, clf, "oob_score_")
def test_complete_classification():
    """Test greedy trees with max_depth + 1 leafs. """
    from sklearn.tree._tree import TREE_LEAF
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    k = 4
    est = GradientBoostingClassifier(n_estimators=20, max_depth=None, random_state=1,
                                     max_leaf_nodes=k+1).fit(X, y)
    tree = est.estimators_[0, 0].tree_
    assert_equal(tree.max_depth, k)
    assert_equal(tree.children_left[tree.children_left == TREE_LEAF].shape[0], k + 1)
def test_min_impurity_split(GBEstimator):
    # Test if min_impurity_split of base estimators is set
    # Regression test for #8006
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)

    est = GBEstimator(min_impurity_split=0.1)
    est = assert_warns_message(DeprecationWarning, "min_impurity_decrease",
                               est.fit, X, y)
    for tree in est.estimators_.flat:
        assert_equal(tree.min_impurity_split, 0.1)
def test_warm_start_equal_n_estimators(Cls):
    # Test if warm start with equal n_estimators does nothing
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1)
    est.fit(X, y)

    est2 = clone(est)
    est2.set_params(n_estimators=est.n_estimators, warm_start=True)
    est2.fit(X, y)

    assert_array_almost_equal(est2.predict(X), est.predict(X))
def test_min_impurity_decrease():
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    all_estimators = [GradientBoostingRegressor, GradientBoostingClassifier]

    for GBEstimator in all_estimators:
        est = GBEstimator(min_impurity_decrease=0.1)
        est.fit(X, y)
        for tree in est.estimators_.flat:
            # Simply check if the parameter is passed on correctly. Tree tests
            # will suffice for the actual working of this param
            assert_equal(tree.min_impurity_decrease, 0.1)
Example #26
 def _f(x):
     # iris = load_iris()
     X, y = make_hastie_10_2(random_state=0)
     x = np.ravel(x)
     f = np.zeros(x.shape)
     for i in range(f.size):
         clf = RandomForestClassifier(n_estimators=1, min_samples_leaf=int(np.round(x[i])), random_state=0)
         # scores = cross_val_score(clf, iris.data, iris.target)
         scores = cross_val_score(clf, X, y, cv=5)
         f[i] = -scores.mean()
     return f.ravel()
Example #27
def main():

    # generate synthetic binary classification data
    # (name refers to example 10.2 in ESL textbook...see refs below)
    X, y = make_hastie_10_2()

    # perform train/test split (no need to shuffle)
    split_pt = int(TRAIN_PCT * len(X))
    X_train, X_test = X[:split_pt], X[split_pt:]
    y_train, y_test = y[:split_pt], y[split_pt:]

    # single dec stump
    stump_clf = DecisionTreeClassifier(max_depth=1)
    stump_clf.fit(X_train, y_train)
    stump_score = round(stump_clf.score(X_test, y_test), 3)
    print('decision stump acc = {}\t(max_depth = 1)'.format(stump_score))

    # single dec tree (max_depth=3)
    tree_clf = DecisionTreeClassifier(max_depth=3)
    tree_clf.fit(X_train, y_train)
    tree_score = round(tree_clf.score(X_test, y_test), 3)
    print('decision tree acc = {}\t(max_depth = 3)\n'.format(tree_score))

    # gbt: a powerful ensemble technique
    gbt_scores = list()
    for k in (10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500):
        print('fitting gbt for n_estimators = {}...'.format(k))

        gbt_clf = GradientBoostingClassifier(
            n_estimators=k,         # number of weak learners for this iteration
            max_depth=1,            # weak learners are dec stumps
            learning_rate=1.0)      # regularization (shrinkage) hyperparam

        gbt_clf.fit(X_train, y_train)
        gbt_scores.append(round(gbt_clf.score(X_test, y_test), 3))

    print('\ngbt accuracy =\n{}\n'.format(gbt_scores))

    # stochastic gbt (using subsampling)
    sgbt_scores = list()
    for k in (10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500):
        print('fitting sgbt for n_estimators = {}...'.format(k))

        sgbt_clf = GradientBoostingClassifier(
            n_estimators=k,         # number of weak learners for this iteration
            max_depth=1,            # weak learners are dec stumps
            subsample=0.5,          # % of training set used by each base learner
            learning_rate=1.0)      # regularization (shrinkage) hyperparam

        sgbt_clf.fit(X_train, y_train)
        sgbt_scores.append(round(sgbt_clf.score(X_test, y_test), 3))

    print('\nsgbt accuracy =\n{}'.format(sgbt_scores))
def test_min_impurity_split():
    # Test if min_impurity_split of base estimators is set
    # Regression test for #8006
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    all_estimators = [GradientBoostingRegressor,
                      GradientBoostingClassifier]

    for GBEstimator in all_estimators:
        est = GBEstimator(min_impurity_split=0.1).fit(X, y)
        for tree in est.estimators_.flat:
            assert_equal(tree.min_impurity_split, 0.1)
def test_warm_start_oob_switch(Cls):
    # Test if oob can be turned on during warm start.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est.fit(X, y)
    est.set_params(n_estimators=110, subsample=0.5)
    est.fit(X, y)

    assert_array_equal(est.oob_improvement_[:100], np.zeros(100))
    # the last 10 are not zeros
    assert_array_equal(est.oob_improvement_[-10:] == 0.0,
                       np.zeros(10, dtype=bool))
Example #30
def test_min_impurity_decrease():
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    all_estimators = [RandomForestClassifier, RandomForestRegressor,
                      ExtraTreesClassifier, ExtraTreesRegressor]

    for Estimator in all_estimators:
        est = Estimator(min_impurity_decrease=0.1)
        est.fit(X, y)
        for tree in est.estimators_:
            # Simply check if the parameter is passed on correctly. Tree tests
            # will suffice for the actual working of this param
            assert_equal(tree.min_impurity_decrease, 0.1)
def test_warm_start_equivalence():
    # warm started classifier with 5+5 estimators should be equivalent to
    # one classifier with 10 estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf_ws = BaggingClassifier(n_estimators=5, warm_start=True,
                               random_state=3141)
    clf_ws.fit(X_train, y_train)
    clf_ws.set_params(n_estimators=10)
    clf_ws.fit(X_train, y_train)
    y1 = clf_ws.predict(X_test)

    clf = BaggingClassifier(n_estimators=10, warm_start=False,
                            random_state=3141)
    clf.fit(X_train, y_train)
    y2 = clf.predict(X_test)

    assert_array_almost_equal(y1, y2)
Example #32
def check_warm_start_clear(name):
    # Test if fit clears state and grows a new forest when warm_start==False.
    X, y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
    ForestEstimator = FOREST_ESTIMATORS[name]
    clf = ForestEstimator(n_estimators=5,
                          max_depth=1,
                          warm_start=False,
                          random_state=1)
    clf.fit(X, y)

    clf_2 = ForestEstimator(n_estimators=5,
                            max_depth=1,
                            warm_start=True,
                            random_state=2)
    clf_2.fit(X, y)  # inits state
    clf_2.set_params(warm_start=False, random_state=1)
    clf_2.fit(X, y)  # clears old state and equals clf

    assert_array_almost_equal(clf_2.apply(X), clf.apply(X))
Example #33
def check_warm_start_oob(name):
    """Test that the warm start computes oob score when asked."""
    X, y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
    ForestEstimator = FOREST_ESTIMATORS[name]
    # Use 15 estimators to avoid 'some inputs do not have OOB scores' warning.
    clf = ForestEstimator(n_estimators=15,
                          max_depth=3,
                          warm_start=False,
                          random_state=1,
                          bootstrap=True,
                          oob_score=True)
    clf.fit(X, y)

    clf_2 = ForestEstimator(n_estimators=5,
                            max_depth=3,
                            warm_start=False,
                            random_state=1,
                            bootstrap=True,
                            oob_score=False)
    clf_2.fit(X, y)

    clf_2.set_params(warm_start=True, oob_score=True, n_estimators=15)
    clf_2.fit(X, y)

    assert_true(hasattr(clf_2, 'oob_score_'))
    assert_equal(clf.oob_score_, clf_2.oob_score_)

    # Test that oob_score is computed even if we don't need to train
    # additional trees.
    clf_3 = ForestEstimator(n_estimators=15,
                            max_depth=3,
                            warm_start=True,
                            random_state=1,
                            bootstrap=True,
                            oob_score=False)
    clf_3.fit(X, y)
    assert_true(not (hasattr(clf_3, 'oob_score_')))

    clf_3.set_params(oob_score=True)
    ignore_warnings(clf_3.fit)(X, y)

    assert_equal(clf.oob_score_, clf_3.oob_score_)
Example #34
def test_validation_weights_xgbclassifier():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import make_hastie_10_2

    # prepare training and test data
    X, y = make_hastie_10_2(n_samples=2000, random_state=42)
    labels, y = np.unique(y, return_inverse=True)
    X_train, X_test = X[:1600], X[1600:]
    y_train, y_test = y[:1600], y[1600:]

    # instantiate model
    param_dist = {'objective': 'binary:logistic', 'n_estimators': 2,
                  'random_state': 123}
    clf = xgb.sklearn.XGBClassifier(**param_dist)

    # train it using instance weights only in the training set
    weights_train = np.random.choice([1, 2], len(X_train))
    clf.fit(X_train, y_train,
            sample_weight=weights_train,
            eval_set=[(X_test, y_test)],
            eval_metric='logloss',
            verbose=False)

    # evaluate logloss metric on test set *without* using weights
    evals_result_without_weights = clf.evals_result()
    logloss_without_weights = evals_result_without_weights["validation_0"]["logloss"]

    # now use weights for the test set
    np.random.seed(0)
    weights_test = np.random.choice([1, 2], len(X_test))
    clf.fit(X_train, y_train,
            sample_weight=weights_train,
            eval_set=[(X_test, y_test)],
            sample_weight_eval_set=[weights_test],
            eval_metric='logloss',
            verbose=False)
    evals_result_with_weights = clf.evals_result()
    logloss_with_weights = evals_result_with_weights["validation_0"]["logloss"]

    # check that the logloss in the test set is actually different when using weights
    # than when not using them
    assert all((logloss_with_weights[i] != logloss_without_weights[i] for i in [0, 1]))
def test_warm_start():
    # Test if warm start equals fit.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]:
        est = Cls(n_estimators=200, max_depth=1)
        est.fit(X, y)

        est_ws = Cls(n_estimators=100, max_depth=1, warm_start=True)
        est_ws.fit(X, y)
        est_ws.set_params(n_estimators=200)
        est_ws.fit(X, y)

        if Cls is GradientBoostingRegressor:
            assert_array_almost_equal(est_ws.predict(X), est.predict(X))
        else:
            # Random state is preserved and hence predict_proba must also be
            # same
            assert_array_equal(est_ws.predict(X), est.predict(X))
            assert_array_almost_equal(est_ws.predict_proba(X),
                                      est.predict_proba(X))
Example #36
    def test_hastie(self):

        np.random.seed(1)
        n_samples = 200
        test_size = 0.2
        n_est = 100
        max_depth = 10
        lr = 1.0

        X, y = make_hastie_10_2(n_samples)
        poly = PolynomialFeatures()
        X = poly.fit_transform(X)
        y[y < 0] = 0
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size)

        model_palo = PaloBst(distribution="bernoulli",
                             n_estimators=n_est,
                             learning_rate=lr,
                             max_depth=max_depth)
        model_sklr = GradientBoostingClassifier(n_estimators=n_est,
                                                learning_rate=lr,
                                                max_depth=max_depth)
        model_palo.warmup()

        t_start = time.time()
        model_palo.fit(X_train, y_train)
        t_elapsed_palo = time.time() - t_start
        y_hat = model_palo.predict_proba(X_test)[:, 1]
        auc_palo = roc_auc_score(y_test, y_hat)

        t_start = time.time()
        model_sklr.fit(X_train, y_train)
        t_elapsed_sklr = time.time() - t_start
        y_hat = model_sklr.predict_proba(X_test)[:, 1]
        auc_sklr = roc_auc_score(y_test, y_hat)

        print(f"Runtime(PaloBst): {t_elapsed_palo:.3f} seconds")
        print(f"Runtime(sklearn): {t_elapsed_sklr:.3f} seconds")
        print(f"AUROC(PaloBst): {auc_palo:.3f}")
        print(f"AUROC(sklearn): {auc_sklr:.3f}")
Example #37
def test_estimators_samples():
    # Check that format of estimators_samples_ is correct and that results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)

    # remap the y outside of the BalancedBaggingclassifier
    # _, y = np.unique(y, return_inverse=True)
    bagging = BalancedBaggingClassifier(
        LogisticRegression(solver="lbfgs", multi_class="auto"),
        max_samples=0.5,
        max_features=0.5,
        random_state=1,
        bootstrap=False,
    )
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert len(estimators_samples) == len(estimators)
    assert len(estimators_samples[0]) == len(X) // 2
    assert estimators_samples[0].dtype.kind == "i"

    # Re-fit single estimator to test for consistent sampling
    estimator_index = 0
    estimator_samples = estimators_samples[estimator_index]
    estimator_features = estimators_features[estimator_index]
    estimator = estimators[estimator_index]

    X_train = (X[estimator_samples])[:, estimator_features]
    y_train = y[estimator_samples]

    orig_coefs = estimator.steps[-1][1].coef_
    estimator.fit(X_train, y_train)
    new_coefs = estimator.steps[-1][1].coef_

    assert_allclose(orig_coefs, new_coefs)
Example #38
def show():
    # number of AdaBoost iterations
    n_estimators = 200
    # generate the Hastie 10.2 dataset
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    # of the 12,000 samples, the first 2,000 form the test set, the rest the training set
    train_x, train_y = X[2000:], y[2000:]
    test_x, test_y = X[:2000], y[:2000]
    # weak classifier: a decision stump
    dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
    dt_stump.fit(train_x, train_y)
    dt_stump_err = 1.0 - dt_stump.score(test_x, test_y)
    # fully grown decision tree classifier
    dt = DecisionTreeClassifier()
    dt.fit(train_x, train_y)
    dt_err = 1.0 - dt.score(test_x, test_y)
    # AdaBoost classifier
    ada = AdaBoostClassifier(base_estimator=dt_stump,
                             n_estimators=n_estimators)
    ada.fit(train_x, train_y)
    # visualize the error rates of the three classifiers
    fig = plt.figure()
    # SimHei font setup kept from the original, whose labels were in Chinese
    plt.rcParams['font.sans-serif'] = ['SimHei']
    ax = fig.add_subplot(111)
    ax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-', label='decision stump error rate')
    ax.plot([1, n_estimators], [dt_err] * 2, 'k--', label='decision tree error rate')
    ada_err = np.zeros((n_estimators, ))
    # walk the staged predictions: i is the iteration, pred_y the prediction
    for i, pred_y in enumerate(ada.staged_predict(test_x)):
        # record the error rate at this iteration
        ada_err[i] = zero_one_loss(pred_y, test_y)
    # plot the AdaBoost error rate at each iteration
    ax.plot(np.arange(n_estimators) + 1,
            ada_err,
            label='AdaBoost test error rate',
            color='orange')
    ax.set_xlabel('number of iterations')
    ax.set_ylabel('error rate')
    leg = ax.legend(loc='upper right', fancybox=True)
    plt.show()
Example #39
def test():

    X, y = make_hastie_10_2(n_samples=100000)
    y[y == -1.0] = 0.0  # AlphaTree accepts [0, 1] not [-1, 1]
    n, m = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    models = {
        "alpha_1-c45": C45Tree(max_depth=10),
        "alpha_2-cart": GiniTree(max_depth=10),
        "alpha_3": AlphaTree(alpha=3.0, max_depth=10),
        "sklearn": DecisionTreeClassifier(max_depth=10),
    }

    print("\n")
    print("-----------------------------------------------------")
    print(" model_name     train_time     predict_time   auc    ")
    print("-----------------------------------------------------")
    print(" {0:12}   {1:12}   {2:12}   {3:.5f}".format("baseline", "-", "-",
                                                       0.5))

    for name, model in models.items():

        # Fit
        start = time.time()
        model.fit(X_train, y_train)
        time_fit = time.time() - start

        # Predict
        start = time.time()
        y_hat = model.predict(X_test)
        time_pred = time.time() - start

        # Error
        auc = roc_auc_score(y_test, y_hat)
        print(" {0:12}   {1:.5f} sec    {2:.5f} sec    {3:.5f}".format(
            name, time_fit, time_pred, auc))

    print("-----------------------------------------------------")
    print("\n")
Example #40
def test_monitor_early_stopping():
    """Test if monitor return value works. """
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)

    for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]:
        est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5)
        est.fit(X, y, monitor=early_stopping_monitor)
        assert_equal(est.n_estimators, 20)  # this is not altered
        assert_equal(est.estimators_.shape[0], 10)
        assert_equal(est.train_score_.shape[0], 10)
        assert_equal(est.oob_improvement_.shape[0], 10)
        assert_equal(est._oob_score_.shape[0], 10)

        # try refit
        est.set_params(n_estimators=30)
        est.fit(X, y)
        assert_equal(est.n_estimators, 30)
        assert_equal(est.estimators_.shape[0], 30)
        assert_equal(est.train_score_.shape[0], 30)
        assert_equal(est.oob_improvement_.shape[0], 30)

        est = Cls(n_estimators=20,
                  max_depth=1,
                  random_state=1,
                  subsample=0.5,
                  warm_start=True)
        est.fit(X, y, monitor=early_stopping_monitor)
        assert_equal(est.n_estimators, 20)
        assert_equal(est.estimators_.shape[0], 10)
        assert_equal(est.train_score_.shape[0], 10)
        assert_equal(est.oob_improvement_.shape[0], 10)
        assert_equal(est._oob_score_.shape[0], 10)

        # try refit
        est.set_params(n_estimators=30, warm_start=False)
        est.fit(X, y)
        assert_equal(est.n_estimators, 30)
        assert_equal(est.train_score_.shape[0], 30)
        assert_equal(est.estimators_.shape[0], 30)
        assert_equal(est.oob_improvement_.shape[0], 30)
Example #41
def test():
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    X = X.astype(np.float32)

    labels, y = np.unique(y, return_inverse=True)

    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    df = pd.DataFrame(X_train)
    print(df.describe())
    s = pd.Series(y_train)
    s = s.astype('str')
    print(s.describe())

    clf = ensemble.GradientBoostingClassifier(n_estimators=50)
    clf.fit(X_train, y_train)

    analyze_clf(clf)

    print(clf.score(X_train, y_train))
    print(clf.score(X_test, y_test))
def test_warm_start(random_state=42):
    # Test if fitting incrementally with warm start gives a forest of the
    # right size and the same results as a normal fit.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = BaggingClassifier(n_estimators=n_estimators,
                                       random_state=random_state,
                                       warm_start=True)
        else:
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert len(clf_ws) == n_estimators

    clf_no_ws = BaggingClassifier(n_estimators=10, random_state=random_state,
                                  warm_start=False)
    clf_no_ws.fit(X, y)

    assert (set([tree.random_state for tree in clf_ws]) ==
            set([tree.random_state for tree in clf_no_ws]))
Example #43
def test_classification_synthetic(loss):
    # Test GradientBoostingClassifier on synthetic dataset used by
    # Hastie et al. in ESLII Example 12.7.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=2,
                                      max_depth=1, loss=loss,
                                      learning_rate=1.0, random_state=0)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert error_rate < 0.09

    gbrt = GradientBoostingClassifier(n_estimators=200, min_samples_split=2,
                                      max_depth=1, loss=loss,
                                      learning_rate=1.0, subsample=0.5,
                                      random_state=0)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert error_rate < 0.08
Example #44
def test_warm_start_equal_n_estimators():
    # Test that nothing happens when fitting without increasing n_estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf = SerialBaggingClassifier(n_estimators=5,
                                  warm_start=True,
                                  random_state=83)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    # modify X to nonsense values; this should not change anything
    X_train += 1.0

    assert_warns_message(
        UserWarning,
        "Warm-start fitting without increasing n_estimators does not",
        clf.fit,
        X_train,
        y_train,
    )
    assert_array_equal(y_pred, clf.predict(X_test))
Example #45
def test_estimators_samples():
    # Check that format of estimators_samples_ is correct
    X, y = make_hastie_10_2(n_samples=100, random_state=1)

    # remap the y outside of the SMOTEBagging
    # _, y = np.unique(y, return_inverse=True)
    bagging = SMOTEBagging(LogisticRegression(),
                           max_samples=0.5,
                           max_features=0.5,
                           random_state=0,
                           bootstrap=False)
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert len(estimators_samples) == len(estimators)
    assert len(estimators_samples[0]) == len(X)
    assert estimators_samples[0].dtype.kind == 'b'
Example #46
def test_estimators_samples():
    # Check that format of estimators_samples_ is correct and that results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(
        LogisticRegression(),
        max_samples=0.5,
        max_features=0.5,
        random_state=1,
        bootstrap=False,
    )
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert len(estimators_samples) == len(estimators)
    assert len(estimators_samples[0]) == len(X) // 2
    assert estimators_samples[0].dtype.kind == "i"

    # Re-fit single estimator to test for consistent sampling
    estimator_index = 0
    estimator_samples = estimators_samples[estimator_index]
    estimator_features = estimators_features[estimator_index]
    estimator = estimators[estimator_index]

    X_train = (X[estimator_samples])[:, estimator_features]
    y_train = y[estimator_samples]

    orig_coefs = estimator.coef_
    estimator.fit(X_train, y_train)
    new_coefs = estimator.coef_

    assert_array_almost_equal(orig_coefs, new_coefs)
Example #47
def check_warm_start_equal_n_estimators(name):
    """Test if warm start with equal n_estimators does nothing and returns the
    same forest and raises a warning."""
    X, y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
    ForestEstimator = FOREST_ESTIMATORS[name]
    clf = ForestEstimator(n_estimators=5,
                          max_depth=3,
                          warm_start=True,
                          random_state=1)
    clf.fit(X, y)

    clf_2 = ForestEstimator(n_estimators=5,
                            max_depth=3,
                            warm_start=True,
                            random_state=1)
    clf_2.fit(X, y)
    # Now clf_2 equals clf.

    clf_2.set_params(random_state=2)
    assert_warns(UserWarning, clf_2.fit, X, y)
    # If we had fit the trees again we would have got a different forest as we
    # changed the random state.
    assert_array_equal(clf.apply(X), clf_2.apply(X))
Example #48
def main():

    # generate synthetic binary classification data
    # (name refers to example 10.2 in ESL textbook...see refs below)
    X, y = make_hastie_10_2()

    # perform train/test split (no need to shuffle)
    split_pt = int(TRAIN_PCT * len(X))
    X_train, X_test = X[:split_pt], X[split_pt:]
    y_train, y_test = y[:split_pt], y[split_pt:]

    # single dec stump
    stump_clf = DecisionTreeClassifier(max_depth=1)
    stump_clf.fit(X_train, y_train)
    stump_score = round(stump_clf.score(X_test, y_test), 3)
    print('decision stump acc = {}\t(max_depth = 1)'.format(stump_score))

    # single dec tree (max_depth=5)
    tree_clf = DecisionTreeClassifier(max_depth=5)
    tree_clf.fit(X_train, y_train)
    tree_score = round(tree_clf.score(X_test, y_test), 3)
    print('decision tree acc = {}\t(max_depth = 5)\n'.format(tree_score))

    # gbt: a powerful ensemble technique
    gbt_scores = list()
    for k in (10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500):
        print('fitting gbt for n_estimators = {}...'.format(k))

        gbt_clf = GradientBoostingClassifier(
            n_estimators=k,  # number of weak learners for this iteration
            max_depth=1,  # weak learners are dec stumps
            learning_rate=1.0)  # regularization (shrinkage) hyperparam

        gbt_clf.fit(X_train, y_train)
        gbt_scores.append(round(gbt_clf.score(X_test, y_test), 3))

    print('\ngbt accuracy =\n{}'.format(gbt_scores))
Example #49
def train_onnx():
    # Example BDT creation from: https://scikit-learn.org/stable/modules/ensemble.html
    from sklearn.datasets import make_hastie_10_2
    from sklearn.ensemble import GradientBoostingClassifier

    # Make a random dataset from sklearn's 'hastie' generator
    X, y = make_hastie_10_2(random_state=0)
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    # Train a BDT
    clf = GradientBoostingClassifier(n_estimators=20,
                                     learning_rate=1.0,
                                     max_depth=3,
                                     random_state=0).fit(X_train, y_train)

    from skl2onnx.common.data_types import FloatTensorType
    initial_type = [('float_input', FloatTensorType([None, X_train.shape[1]]))]
    clf = onnxmltools.convert_sklearn(clf,
                                      'Hastie model',
                                      initial_types=initial_type)
    onnx.save(clf, 'hastie_bdt.onnx')

    return clf, X_test, y_test, 'hastie_bdt.onnx'
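Running the exported file is outside the snippet above; with onnxruntime installed (an assumption, not shown in the source), a minimal inference sketch could be:

import numpy as np
import onnxruntime as rt

sess = rt.InferenceSession('hastie_bdt.onnx', providers=['CPUExecutionProvider'])
input_name = sess.get_inputs()[0].name  # 'float_input', as declared above

# With skl2onnx's default classifier conversion, the first output holds the
# predicted labels and a second output carries the class scores.
outputs = sess.run(None, {input_name: X_test.astype(np.float32)})
pred_labels = outputs[0]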
def test_classification_synthetic():
    """Test GradientBoostingClassifier on synthetic dataset used by
    Hastie et al. in ESLII Example 12.7. """
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=2,
                                      max_depth=1,
                                      learning_rate=1.0, random_state=0)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert error_rate < 0.085, \
        "GB failed with error %.4f" % error_rate

    gbrt = GradientBoostingClassifier(n_estimators=200, min_samples_split=2,
                                      max_depth=1,
                                      learning_rate=1.0, subsample=0.5,
                                      random_state=0)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert error_rate < 0.08, \
        "Stochastic GB failed with error %.4f" % error_rate
Example #51
def check_decision_path(name):
    X, y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
    n_samples = X.shape[0]
    ForestEstimator = FOREST_ESTIMATORS[name]
    est = ForestEstimator(n_estimators=5,
                          max_depth=1,
                          warm_start=False,
                          random_state=1)
    est.fit(X, y)
    indicator, n_nodes_ptr = est.decision_path(X)

    assert_equal(indicator.shape[1], n_nodes_ptr[-1])
    assert_equal(indicator.shape[0], n_samples)
    assert_array_equal(np.diff(n_nodes_ptr),
                       [e.tree_.node_count for e in est.estimators_])

    # Assert that leaves index are correct
    leaves = est.apply(X)
    for est_id in range(leaves.shape[1]):
        leave_indicator = [
            indicator[i, n_nodes_ptr[est_id] + j]
            for i, j in enumerate(leaves[:, est_id])
        ]
        assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples))
#          Raghav RV <*****@*****.**>
# License: BSD 3 clause

import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn import ensemble
from sklearn import datasets
from sklearn.model_selection import train_test_split

data_list = [
    datasets.load_iris(return_X_y=True),
    datasets.make_classification(n_samples=800, random_state=0),
    datasets.make_hastie_10_2(n_samples=2000, random_state=0),
]
names = ["Iris Data", "Classification Data", "Hastie Data"]

n_gb = []
score_gb = []
time_gb = []
n_gbes = []
score_gbes = []
time_gbes = []

n_estimators = 200

for X, y in data_list:
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
Example #53
# License: BSD 3 clause

import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn import ensemble
from sklearn import datasets
from sklearn.model_selection import train_test_split

print(__doc__)

data_list = [datasets.load_iris(), datasets.load_digits()]
data_list = [(d.data, d.target) for d in data_list]
data_list += [datasets.make_hastie_10_2()]
names = ['Iris Data', 'Digits Data', 'Hastie Data']

n_gb = []
score_gb = []
time_gb = []
n_gbes = []
score_gbes = []
time_gbes = []

n_estimators = 500

for X, y in data_list:
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
Example #54
def test_min_weight_fraction_leaf():
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    X = X.astype(np.float32)
    for name in FOREST_ESTIMATORS:
        yield check_min_weight_fraction_leaf, name, X, y
Example #55
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.metrics import zero_one_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# number of AdaBoost iterations
n_estimators = 200
# generate the Hastie 10.2 dataset
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
# of the 12,000 samples, the first 2,000 form the training set here, the rest the test set
test_x, test_y = X[2000:], y[2000:]
train_x, train_y = X[:2000], y[:2000]
# weak classifier: a decision stump
dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
dt_stump.fit(train_x, train_y)
dt_stump_err = 1.0 - dt_stump.score(test_x, test_y)
# fully grown decision tree classifier
dt = DecisionTreeClassifier()
dt.fit(train_x, train_y)
dt_err = 1.0 - dt.score(test_x, test_y)
# AdaBoost classifier
ada = AdaBoostClassifier(base_estimator=dt_stump, n_estimators=n_estimators)
ada.fit(train_x, train_y)
# visualize the error rates of the three classifiers
fig = plt.figure()
# SimHei font setup kept from the original, whose labels were in Chinese
plt.rcParams['font.sans-serif'] = ['SimHei']
ax = fig.add_subplot(111)
ax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-', label='decision stump error rate')
Example #56
    return error_rate(pred_train, y_train), error_rate(pred_test, y_test)

# plot error curve
def plot_error_rate(er_train, er_test):
    df_err = pd.DataFrame([er_train, er_test]).T
    df_err.columns = ["Train", "Test"]
    plot1 = df_err.plot(linewidth=3, figsize=(8, 6),
                        color=["lightblue", "darkblue"], grid=True)
    plot1.set_xlabel('Number of iterations', fontsize=12)
    plot1.set_xticklabels(range(0, 450, 50))
    plot1.set_ylabel('Error rate', fontsize=12)
    plot1.set_title('Error rate vs number of iterations', fontsize=16)
    plt.axhline(y=er_test[0], linewidth=1, color='red', ls='dashed')

# just do it
if __name__ == '__main__':
    x, y = make_hastie_10_2()
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    svm_clf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                      decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
                      max_iter=-1, probability=False, random_state=None, shrinking=True,
                      tol=0.001, verbose=False)
    er_train, er_test = initclf(X_train, y_train, X_test, y_test, svm_clf)
    er_train, er_test = [er_train], [er_test]
    # boosting rounds from 10 to 400 in steps of 10
    for i in range(10, 410, 10):
        er_train_i, er_test_i = adaboost(X_train, y_train, X_test, y_test, i, svm_clf)
        er_train.append(er_train_i)
        er_test.append(er_test_i)
    plot_error_rate(er_train, er_test)
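The helpers error_rate, initclf, and adaboost are defined earlier in this snippet's source and omitted here; minimal versions consistent with the calls above might look like this sketch (assumes numpy as np and the ±1 labels produced by make_hastie_10_2):

import numpy as np

def error_rate(pred, Y):
    # fraction of misclassified samples
    return float(np.sum(pred != Y)) / len(Y)

def initclf(X_train, y_train, X_test, y_test, clf):
    # baseline: fit the classifier once, without boosting
    clf.fit(X_train, y_train)
    return (error_rate(clf.predict(X_train), y_train),
            error_rate(clf.predict(X_test), y_test))

def adaboost(X_train, y_train, X_test, y_test, M, clf):
    # discrete AdaBoost: M rounds over a sample-reweighted base classifier
    w = np.ones(len(X_train)) / len(X_train)
    pred_train = np.zeros(len(X_train))
    pred_test = np.zeros(len(X_test))
    for _ in range(M):
        clf.fit(X_train, y_train, sample_weight=w)
        pred_train_i = clf.predict(X_train)
        pred_test_i = clf.predict(X_test)
        miss = (pred_train_i != y_train).astype(float)
        err_m = np.dot(w, miss) / w.sum()
        alpha_m = 0.5 * np.log((1.0 - err_m) / max(err_m, 1e-16))
        w = w * np.exp(alpha_m * (2.0 * miss - 1.0))  # up-weight mistakes
        pred_train += alpha_m * pred_train_i
        pred_test += alpha_m * pred_test_i
    return (error_rate(np.sign(pred_train), y_train),
            error_rate(np.sign(pred_test), y_test))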
Example #57
# also load the iris dataset
# and randomly permute it
iris = datasets.load_iris()
rng = check_random_state(0)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]

# Make regression dataset
X_reg, y_reg = datasets.make_regression(n_samples=500,
                                        n_features=10,
                                        random_state=1)

# also make a hastie_10_2 dataset
hastie_X, hastie_y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
hastie_X = hastie_X.astype(np.float32)

# Get the default backend in joblib to test parallelism and interaction with
# different backends
DEFAULT_JOBLIB_BACKEND = joblib.parallel.get_active_backend()[0].__class__

FOREST_CLASSIFIERS = {
    "ExtraTreesClassifier": ExtraTreesClassifier,
    "RandomForestClassifier": RandomForestClassifier,
}

FOREST_REGRESSORS = {
    "ExtraTreesRegressor": ExtraTreesRegressor,
    "RandomForestRegressor": RandomForestRegressor,
}
Example #58
def test_max_leaf_nodes_max_depth():
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for name in FOREST_ESTIMATORS:
        yield check_max_leaf_nodes_max_depth, name, X, y
Example #59
def test_warm_start_with_oob_score_fails():
    # Check that using oob_score and warm_start simultaneously fails
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BaggingClassifier(n_estimators=5, warm_start=True, oob_score=True)
    with pytest.raises(ValueError):
        clf.fit(X, y)
Example #60
        for sample, style in (('train', '--'), ('test', '-')):
            sample_score_mean = results['mean_%s_%s' % (sample, scorer)]
            sample_score_std = results['std_%s_%s' % (sample, scorer)]
            ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                            sample_score_mean + sample_score_std,
                            alpha=0.1 if sample == 'test' else 0, color=color)
            ax.plot(X_axis, sample_score_mean, style, color=color,
                    alpha=1 if sample == 'test' else 0.7,
                    label="%s (%s)" % (scorer, sample))
    
        best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0]
        best_score = results['mean_test_%s' % scorer][best_index]
    
        # Plot a dotted vertical line at the best score for that scorer marked by x
        ax.plot([X_axis[best_index], ] * 2, [0, best_score],
                linestyle='-.', color=color, marker='x', markeredgewidth=3, ms=8)
    
        # Annotate the best score for that scorer
        ax.annotate("%0.2f" % best_score,
                    (X_axis[best_index], best_score + 0.005))
    
    plt.legend(loc="best")
    plt.grid(False)
    plt.show()
    
if __name__ == '__main__':
    X, y = make_hastie_10_2(n_samples=8000, random_state=42)
    clf = DecisionTreeClassifier(random_state=42)
    title = 'Decision Tree Classifier using multiple scorers simultaneously'
    params = {'max_depth': range(1, 52, 2)}
    GridSearch(clf, X, y, title, params)