def test_subsample_heuristic(self):
    """Test subsample=auto heuristic."""
    task = RuleFitC('s=auto')
    X, y = datasets.make_hastie_10_2(n_samples=112)
    task._modify_parameters(X, y)
    self.assertEqual(task.parameters['subsample'], 1.0)

    task = RuleFitC('s=auto')
    X, y = datasets.make_hastie_10_2(n_samples=500)
    task._modify_parameters(X, y)
    self.assertEqual(round(task.parameters['subsample'], 2), 0.49)


def test_clf_early_stop_gridsearch_weights(self, mocklogloss):
    """Test that the classifier passes weights to the loss function when
    early stopping is in effect during gridsearch.
    """
    def weight_loss(actual, pred, weights):
        if np.all(weights[actual == 1] == 10.) and \
                np.all(weights[actual == -1] == 1.):
            raise ValueError("Weights passed successfully")
        else:
            assert False
        return np.sum(pred) - 50.0

    mocklogloss.method = weight_loss
    x, Y = make_hastie_10_2(n_samples=300, random_state=41)
    X = Container()
    X.add(x)
    Z = Partition(X.shape[0], max_reps=2, max_folds=0)
    Z.set(max_reps=1, max_folds=1)
    wt = {'weight': pd.Series(2.0 + 9.0 * (Y == 1).astype(float))}
    # Add weights to container
    X.initialize(wt)

    task = ESGBC('s=1;n=10;md=[2];ls=1;lr=[0.1, 0.000001];'
                 't_m=Weighted LogLoss')
    task.fit(X, Y, Z)

    # Assert the patched loss function was passed the weights
    self.assertTrue(mocklogloss.called)
    # The third positional argument is the weight vector; it should contain
    # exactly two distinct values (2 for the negative class, 11 for the
    # positive class, from the series built above).
    passed_weights = mocklogloss.call_args[0][2]
    passed_actuals = mocklogloss.call_args[0][0]
    self.assertEqual(len(np.unique(passed_weights)), 2)
    self.assertTrue(np.all(passed_weights[passed_actuals == -1] == 2))
    self.assertTrue(np.all(passed_weights[passed_actuals == 1] == 11))


def test_staged_predict_proba():
    # Test whether staged predict proba eventually gives
    # the same prediction.
    X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingClassifier(n_estimators=20)

    # test raise NotFittedError if not fitted
    assert_raises(NotFittedError, lambda X: np.fromiter(
        clf.staged_predict_proba(X), dtype=np.float64), X_test)

    clf.fit(X_train, y_train)

    # test if prediction for last stage equals ``predict``
    for y_pred in clf.staged_predict(X_test):
        assert_equal(y_test.shape, y_pred.shape)
    assert_array_equal(clf.predict(X_test), y_pred)

    # test if prediction for last stage equals ``predict_proba``
    for staged_proba in clf.staged_predict_proba(X_test):
        assert_equal(y_test.shape[0], staged_proba.shape[0])
        assert_equal(2, staged_proba.shape[1])
    assert_array_almost_equal(clf.predict_proba(X_test), staged_proba)


def check_warm_start_oob(name):
    # Test that the warm start computes oob score when asked.
    X, y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
    ForestEstimator = FOREST_ESTIMATORS[name]
    # Use 15 estimators to avoid 'some inputs do not have OOB scores' warning.
    clf = ForestEstimator(n_estimators=15, max_depth=3, warm_start=False,
                          random_state=1, bootstrap=True, oob_score=True)
    clf.fit(X, y)

    clf_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=False,
                            random_state=1, bootstrap=True, oob_score=False)
    clf_2.fit(X, y)

    clf_2.set_params(warm_start=True, oob_score=True, n_estimators=15)
    clf_2.fit(X, y)

    assert_true(hasattr(clf_2, 'oob_score_'))
    assert_equal(clf.oob_score_, clf_2.oob_score_)

    # Test that oob_score is computed even if we don't need to train
    # additional trees.
    clf_3 = ForestEstimator(n_estimators=15, max_depth=3, warm_start=True,
                            random_state=1, bootstrap=True, oob_score=False)
    clf_3.fit(X, y)
    assert_true(not hasattr(clf_3, 'oob_score_'))

    clf_3.set_params(oob_score=True)
    ignore_warnings(clf_3.fit)(X, y)

    assert_equal(clf.oob_score_, clf_3.oob_score_)


def test_warm_start_smaller_n_estimators(Cls):
    # Test if warm start with smaller n_estimators raises error.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est.fit(X, y)
    est.set_params(n_estimators=99)
    assert_raises(ValueError, est.fit, X, y)


def check_warm_start(name, random_state=42):
    # Test if fitting incrementally with warm start gives a forest of the
    # right size and the same results as a normal fit.
    X, y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
    ForestEstimator = FOREST_ESTIMATORS[name]
    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = ForestEstimator(n_estimators=n_estimators,
                                     random_state=random_state,
                                     warm_start=True)
        else:
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert_equal(len(clf_ws), n_estimators)

    clf_no_ws = ForestEstimator(n_estimators=10, random_state=random_state,
                                warm_start=False)
    clf_no_ws.fit(X, y)

    assert_equal(set([tree.random_state for tree in clf_ws]),
                 set([tree.random_state for tree in clf_no_ws]))

    assert_array_equal(clf_ws.apply(X), clf_no_ws.apply(X),
                       err_msg="Failed with {0}".format(name))


def test_warm_start_smaller_n_estimators():
    # Test if warm-started second fit with smaller n_estimators raises error.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BaggingClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    assert_raises(ValueError, clf.fit, X, y)


def test_classification_synthetic():
    # Test GradientBoostingClassifier on synthetic dataset used by
    # Hastie et al. in ESLII Example 12.7.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    for loss in ('deviance', 'exponential'):
        gbrt = GradientBoostingClassifier(n_estimators=100,
                                          min_samples_split=1,
                                          max_depth=1, loss=loss,
                                          learning_rate=1.0, random_state=0)
        gbrt.fit(X_train, y_train)
        error_rate = (1.0 - gbrt.score(X_test, y_test))
        assert error_rate < 0.09, \
            "GB(loss={}) failed with error {}".format(loss, error_rate)

        # loss=loss added here so the assertion message below matches the
        # estimator actually being tested.
        gbrt = GradientBoostingClassifier(n_estimators=200,
                                          min_samples_split=1,
                                          max_depth=1, loss=loss,
                                          learning_rate=1.0, subsample=0.5,
                                          random_state=0)
        gbrt.fit(X_train, y_train)
        error_rate = (1.0 - gbrt.score(X_test, y_test))
        assert error_rate < 0.08, \
            ("Stochastic GradientBoostingClassifier(loss={}) "
             "failed with error {}".format(loss, error_rate))


def test_warm_start_sparse(Cls):
    # Test that all sparse matrix types are supported.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    sparse_matrix_type = [csr_matrix, csc_matrix, coo_matrix]
    est_dense = Cls(n_estimators=100, max_depth=1, subsample=0.5,
                    random_state=1, warm_start=True)
    est_dense.fit(X, y)
    est_dense.predict(X)
    est_dense.set_params(n_estimators=200)
    est_dense.fit(X, y)
    y_pred_dense = est_dense.predict(X)

    for sparse_constructor in sparse_matrix_type:
        X_sparse = sparse_constructor(X)

        est_sparse = Cls(n_estimators=100, max_depth=1, subsample=0.5,
                         random_state=1, warm_start=True)
        est_sparse.fit(X_sparse, y)
        est_sparse.predict(X)
        est_sparse.set_params(n_estimators=200)
        est_sparse.fit(X_sparse, y)
        y_pred_sparse = est_sparse.predict(X)

        assert_array_almost_equal(est_dense.oob_improvement_[:100],
                                  est_sparse.oob_improvement_[:100])
        assert_array_almost_equal(y_pred_dense, y_pred_sparse)


def test_most_freq_clf_proba(self):
    X, y = datasets.make_hastie_10_2(random_state=13, n_samples=100)
    prior_pos = (y == 1).mean()
    clf = _DummyClassifier(strategy='most_frequent').fit(X, y)
    proba = clf.predict_proba(X)
    np.testing.assert_array_equal(proba[:, 1],
                                  np.ones(X.shape[0]) * prior_pos)
    np.testing.assert_array_equal(proba[:, 0],
                                  np.ones(X.shape[0]) * (1 - prior_pos))


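# Side note (sketch, not from the original tests): scikit-learn's own
# DummyClassifier returns these empirical class priors from predict_proba
# when strategy='prior', which is the behaviour the custom _DummyClassifier
# above exhibits for strategy='most_frequent'. The helper name below is an
# illustrative assumption.
def _demo_prior_proba():
    from sklearn.datasets import make_hastie_10_2
    from sklearn.dummy import DummyClassifier

    X, y = make_hastie_10_2(random_state=13, n_samples=100)
    proba = DummyClassifier(strategy='prior').fit(X, y).predict_proba(X)
    print(proba[0])  # [P(y == -1), P(y == 1)], identical for every row

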
def test_monitor_early_stopping(Cls):
    # Test if monitor return value works.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)

    est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5)
    est.fit(X, y, monitor=early_stopping_monitor)
    assert_equal(est.n_estimators, 20)  # this is not altered
    assert_equal(est.estimators_.shape[0], 10)
    assert_equal(est.train_score_.shape[0], 10)
    assert_equal(est.oob_improvement_.shape[0], 10)

    # try refit
    est.set_params(n_estimators=30)
    est.fit(X, y)
    assert_equal(est.n_estimators, 30)
    assert_equal(est.estimators_.shape[0], 30)
    assert_equal(est.train_score_.shape[0], 30)

    est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5,
              warm_start=True)
    est.fit(X, y, monitor=early_stopping_monitor)
    assert_equal(est.n_estimators, 20)
    assert_equal(est.estimators_.shape[0], 10)
    assert_equal(est.train_score_.shape[0], 10)
    assert_equal(est.oob_improvement_.shape[0], 10)

    # try refit
    est.set_params(n_estimators=30, warm_start=False)
    est.fit(X, y)
    assert_equal(est.n_estimators, 30)
    assert_equal(est.train_score_.shape[0], 30)
    assert_equal(est.estimators_.shape[0], 30)
    assert_equal(est.oob_improvement_.shape[0], 30)


def test_max_feature_auto():
    # Test if max features is set properly for floats and str.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    _, n_features = X.shape

    X_train = X[:2000]
    y_train = y[:2000]

    gbrt = GradientBoostingClassifier(n_estimators=1, max_features='auto')
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, int(np.sqrt(n_features)))

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features='auto')
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, n_features)

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features=0.3)
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, int(n_features * 0.3))

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features='sqrt')
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, int(np.sqrt(n_features)))

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features='log2')
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, int(np.log2(n_features)))

    gbrt = GradientBoostingRegressor(n_estimators=1,
                                     max_features=0.01 / X.shape[1])
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, 1)


def test_estimators_samples():
    # Check that format of estimators_samples_ is correct and that results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(LogisticRegression(), max_samples=0.5,
                                max_features=0.5, random_state=1,
                                bootstrap=False)
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert_equal(len(estimators_samples), len(estimators))
    assert_equal(len(estimators_samples[0]), len(X) // 2)
    assert_equal(estimators_samples[0].dtype.kind, 'i')

    # Re-fit single estimator to test for consistent sampling
    estimator_index = 0
    estimator_samples = estimators_samples[estimator_index]
    estimator_features = estimators_features[estimator_index]
    estimator = estimators[estimator_index]

    X_train = (X[estimator_samples])[:, estimator_features]
    y_train = y[estimator_samples]

    orig_coefs = estimator.coef_
    estimator.fit(X_train, y_train)
    new_coefs = estimator.coef_

    assert_array_almost_equal(orig_coefs, new_coefs)


def test_warm_start_smaller_n_estimators():
    # Test if warm-started second fit with smaller n_estimators raises error.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = EasyEnsembleClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    with pytest.raises(ValueError):
        clf.fit(X, y)


def test_max_leaf_nodes_max_depth():
    """Test precedence of max_leaf_nodes over max_depth."""
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    k = 4
    for name, TreeEstimator in ALL_TREES.items():
        est = TreeEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y)
        tree = est.tree_
        assert_greater(tree.max_depth, 1)


def test_warm_start_smaller_n_estimators():
    # Test if warm start with smaller n_estimators raises error.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]:
        est = Cls(n_estimators=100, max_depth=1, warm_start=True)
        est.fit(X, y)
        est.set_params(n_estimators=99)
        assert_raises(ValueError, est.fit, X, y)


def test_max_samples_consistency():
    # Make sure validated max_samples and original max_samples are identical
    # when a valid integer max_samples is supplied by the user.
    max_samples = 100
    X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1)
    bagging = BaggingClassifier(KNeighborsClassifier(),
                                max_samples=max_samples,
                                max_features=0.5, random_state=1)
    bagging.fit(X, y)
    assert_equal(bagging._max_samples, max_samples)


def test_oob_score_consistency():
    # Make sure OOB scores are identical when random_state, estimator, and
    # training data are fixed and fitting is done twice.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
                                max_features=0.5, oob_score=True,
                                random_state=1)
    assert_equal(bagging.fit(X, y).oob_score_, bagging.fit(X, y).oob_score_)


def check_warm_start_smaller_n_estimators(name):
    # Test if warm start second fit with smaller n_estimators raises error.
    X, y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
    ForestEstimator = FOREST_ESTIMATORS[name]
    clf = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    assert_raises(ValueError, clf.fit, X, y)


def test_min_impurity_decrease(GBEstimator):
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)

    est = GBEstimator(min_impurity_decrease=0.1)
    est.fit(X, y)
    for tree in est.estimators_.flat:
        # Simply check if the parameter is passed on correctly. Tree tests
        # will suffice for the actual working of this param.
        assert_equal(tree.min_impurity_decrease, 0.1)


def test_oob_score_removed_on_warm_start():
    X, y = make_hastie_10_2(n_samples=2000, random_state=1)

    clf = BaggingClassifier(n_estimators=50, oob_score=True)
    clf.fit(X, y)

    clf.set_params(warm_start=True, oob_score=False, n_estimators=100)
    clf.fit(X, y)

    assert_raises(AttributeError, getattr, clf, "oob_score_")


def test_complete_classification():
    """Test greedy trees with max_depth + 1 leaves."""
    from sklearn.tree._tree import TREE_LEAF
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    k = 4

    est = GradientBoostingClassifier(n_estimators=20, max_depth=None,
                                     random_state=1,
                                     max_leaf_nodes=k + 1).fit(X, y)

    tree = est.estimators_[0, 0].tree_
    assert_equal(tree.max_depth, k)
    assert_equal(tree.children_left[tree.children_left == TREE_LEAF].shape[0],
                 k + 1)


def test_min_impurity_split(GBEstimator):
    # Test if min_impurity_split of base estimators is set.
    # Regression test for #8006.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)

    est = GBEstimator(min_impurity_split=0.1)
    est = assert_warns_message(DeprecationWarning, "min_impurity_decrease",
                               est.fit, X, y)
    for tree in est.estimators_.flat:
        assert_equal(tree.min_impurity_split, 0.1)


def test_warm_start_equal_n_estimators(Cls):
    # Test if warm start with equal n_estimators does nothing.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1)
    est.fit(X, y)

    est2 = clone(est)
    est2.set_params(n_estimators=est.n_estimators, warm_start=True)
    est2.fit(X, y)

    assert_array_almost_equal(est2.predict(X), est.predict(X))


def test_min_impurity_decrease():
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    all_estimators = [GradientBoostingRegressor, GradientBoostingClassifier]

    for GBEstimator in all_estimators:
        est = GBEstimator(min_impurity_decrease=0.1)
        est.fit(X, y)
        for tree in est.estimators_.flat:
            # Simply check if the parameter is passed on correctly. Tree tests
            # will suffice for the actual working of this param.
            assert_equal(tree.min_impurity_decrease, 0.1)


def _f(x):
    # iris = load_iris()
    X, y = make_hastie_10_2(random_state=0)
    x = np.ravel(x)
    f = np.zeros(x.shape)
    for i in range(f.size):
        clf = RandomForestClassifier(n_estimators=1,
                                     min_samples_leaf=int(np.round(x[i])),
                                     random_state=0)
        # scores = cross_val_score(clf, iris.data, iris.target)
        scores = cross_val_score(clf, X, y, cv=5)
        f[i] = -scores.mean()
    return f.ravel()


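# Usage sketch (not from the original source): _f maps candidate
# min_samples_leaf values to negative cross-validated accuracy, so it can be
# probed directly on a grid. The grid below and the helper name are
# illustrative assumptions.
def _demo_probe_f():
    leaf_sizes = np.arange(1, 51, 5, dtype=float)
    objective = _f(leaf_sizes)  # one negative mean CV score per candidate
    best_leaf = int(round(leaf_sizes[int(np.argmin(objective))]))
    print("best min_samples_leaf:", best_leaf)

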
def main():
    # generate synthetic binary classification data
    # (name refers to example 10.2 in ESL textbook...see refs below)
    X, y = make_hastie_10_2()

    # perform train/test split (no need to shuffle)
    split_pt = int(TRAIN_PCT * len(X))
    X_train, X_test = X[:split_pt], X[split_pt:]
    y_train, y_test = y[:split_pt], y[split_pt:]

    # single decision stump
    stump_clf = DecisionTreeClassifier(max_depth=1)
    stump_clf.fit(X_train, y_train)
    stump_score = round(stump_clf.score(X_test, y_test), 3)
    print('decision stump acc = {}\t(max_depth = 1)'.format(stump_score))

    # single decision tree (max_depth=3)
    tree_clf = DecisionTreeClassifier(max_depth=3)
    tree_clf.fit(X_train, y_train)
    tree_score = round(tree_clf.score(X_test, y_test), 3)
    print('decision tree acc = {}\t(max_depth = 3)\n'.format(tree_score))

    # gbt: a powerful ensemble technique
    gbt_scores = list()
    for k in (10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500):
        print('fitting gbt for n_estimators = {}...'.format(k))
        gbt_clf = GradientBoostingClassifier(
            n_estimators=k,     # number of weak learners for this iteration
            max_depth=1,        # weak learners are decision stumps
            learning_rate=1.0)  # regularization (shrinkage) hyperparam
        gbt_clf.fit(X_train, y_train)
        gbt_scores.append(round(gbt_clf.score(X_test, y_test), 3))
    print('\ngbt accuracy =\n{}\n'.format(gbt_scores))

    # stochastic gbt (using subsampling)
    sgbt_scores = list()
    for k in (10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500):
        print('fitting sgbt for n_estimators = {}...'.format(k))
        sgbt_clf = GradientBoostingClassifier(
            n_estimators=k,     # number of weak learners for this iteration
            max_depth=1,        # weak learners are decision stumps
            subsample=0.5,      # fraction of training set per base classifier
            learning_rate=1.0)  # regularization (shrinkage) hyperparam
        sgbt_clf.fit(X_train, y_train)
        sgbt_scores.append(round(sgbt_clf.score(X_test, y_test), 3))
    print('\nsgbt accuracy =\n{}'.format(sgbt_scores))


def test_min_impurity_split():
    # Test if min_impurity_split of base estimators is set.
    # Regression test for #8006.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    all_estimators = [GradientBoostingRegressor, GradientBoostingClassifier]

    for GBEstimator in all_estimators:
        est = GBEstimator(min_impurity_split=0.1).fit(X, y)
        for tree in est.estimators_.flat:
            assert_equal(tree.min_impurity_split, 0.1)


def test_warm_start_oob_switch(Cls):
    # Test if oob can be turned on during warm start.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est.fit(X, y)
    est.set_params(n_estimators=110, subsample=0.5)
    est.fit(X, y)

    assert_array_equal(est.oob_improvement_[:100], np.zeros(100))
    # the last 10 are not zeros
    assert_array_equal(est.oob_improvement_[-10:] == 0.0,
                       np.zeros(10, dtype=np.bool))


def test_min_impurity_decrease():
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    all_estimators = [RandomForestClassifier, RandomForestRegressor,
                      ExtraTreesClassifier, ExtraTreesRegressor]

    for Estimator in all_estimators:
        est = Estimator(min_impurity_decrease=0.1)
        est.fit(X, y)
        for tree in est.estimators_:
            # Simply check if the parameter is passed on correctly. Tree tests
            # will suffice for the actual working of this param.
            assert_equal(tree.min_impurity_decrease, 0.1)


def test_warm_start_equivalence():
    # warm started classifier with 5+5 estimators should be equivalent to
    # one classifier with 10 estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=43)

    clf_ws = BaggingClassifier(n_estimators=5, warm_start=True,
                               random_state=3141)
    clf_ws.fit(X_train, y_train)
    clf_ws.set_params(n_estimators=10)
    clf_ws.fit(X_train, y_train)
    y1 = clf_ws.predict(X_test)

    clf = BaggingClassifier(n_estimators=10, warm_start=False,
                            random_state=3141)
    clf.fit(X_train, y_train)
    y2 = clf.predict(X_test)

    assert_array_almost_equal(y1, y2)


def check_warm_start_clear(name):
    # Test if fit clears state and grows a new forest when warm_start==False.
    X, y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
    ForestEstimator = FOREST_ESTIMATORS[name]
    clf = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False,
                          random_state=1)
    clf.fit(X, y)

    clf_2 = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True,
                            random_state=2)
    clf_2.fit(X, y)  # inits state
    clf_2.set_params(warm_start=False, random_state=1)
    clf_2.fit(X, y)  # clears old state and equals clf

    assert_array_almost_equal(clf_2.apply(X), clf.apply(X))


def test_validation_weights_xgbclassifier():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import make_hastie_10_2

    # prepare training and test data
    X, y = make_hastie_10_2(n_samples=2000, random_state=42)
    labels, y = np.unique(y, return_inverse=True)
    X_train, X_test = X[:1600], X[1600:]
    y_train, y_test = y[:1600], y[1600:]

    # instantiate model
    param_dist = {'objective': 'binary:logistic', 'n_estimators': 2,
                  'random_state': 123}
    clf = xgb.sklearn.XGBClassifier(**param_dist)

    # train it using instance weights only in the training set
    weights_train = np.random.choice([1, 2], len(X_train))
    clf.fit(X_train, y_train,
            sample_weight=weights_train,
            eval_set=[(X_test, y_test)],
            eval_metric='logloss',
            verbose=False)

    # evaluate logloss metric on test set *without* using weights
    evals_result_without_weights = clf.evals_result()
    logloss_without_weights = evals_result_without_weights[
        "validation_0"]["logloss"]

    # now use weights for the test set
    np.random.seed(0)
    weights_test = np.random.choice([1, 2], len(X_test))
    clf.fit(X_train, y_train,
            sample_weight=weights_train,
            eval_set=[(X_test, y_test)],
            sample_weight_eval_set=[weights_test],
            eval_metric='logloss',
            verbose=False)
    evals_result_with_weights = clf.evals_result()
    logloss_with_weights = evals_result_with_weights[
        "validation_0"]["logloss"]

    # check that the logloss in the test set is actually different when
    # using weights than when not using them
    assert all((logloss_with_weights[i] != logloss_without_weights[i]
                for i in [0, 1]))


def test_warm_start():
    # Test if warm start equals fit.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]:
        est = Cls(n_estimators=200, max_depth=1)
        est.fit(X, y)

        est_ws = Cls(n_estimators=100, max_depth=1, warm_start=True)
        est_ws.fit(X, y)
        est_ws.set_params(n_estimators=200)
        est_ws.fit(X, y)

        if Cls is GradientBoostingRegressor:
            assert_array_almost_equal(est_ws.predict(X), est.predict(X))
        else:
            # Random state is preserved and hence predict_proba must also be
            # the same.
            assert_array_equal(est_ws.predict(X), est.predict(X))
            assert_array_almost_equal(est_ws.predict_proba(X),
                                      est.predict_proba(X))


def test_hastie(self):
    np.random.seed(1)
    n_samples = 200
    test_size = 0.2
    n_est = 100
    max_depth = 10
    lr = 1.0

    X, y = make_hastie_10_2(n_samples)
    poly = PolynomialFeatures()
    X = poly.fit_transform(X)
    y[y < 0] = 0

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size)

    model_palo = PaloBst(distribution="bernoulli",
                         n_estimators=n_est,
                         learning_rate=lr,
                         max_depth=max_depth)
    model_sklr = GradientBoostingClassifier(n_estimators=n_est,
                                            learning_rate=lr,
                                            max_depth=max_depth)

    model_palo.warmup()

    t_start = time.time()
    model_palo.fit(X_train, y_train)
    t_elapsed_palo = time.time() - t_start
    y_hat = model_palo.predict_proba(X_test)[:, 1]
    auc_palo = roc_auc_score(y_test, y_hat)

    t_start = time.time()
    model_sklr.fit(X_train, y_train)
    t_elapsed_sklr = time.time() - t_start
    y_hat = model_sklr.predict_proba(X_test)[:, 1]
    auc_sklr = roc_auc_score(y_test, y_hat)

    print(f"Runtime(PaloBst): {t_elapsed_palo:.3f} seconds")
    print(f"Runtime(sklearn): {t_elapsed_sklr:.3f} seconds")
    print(f"AUROC(PaloBst): {auc_palo:.3f}")
    print(f"AUROC(sklearn): {auc_sklr:.3f}")


def test_estimators_samples():
    # Check that format of estimators_samples_ is correct and that results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    # remap the y outside of the BalancedBaggingClassifier
    # _, y = np.unique(y, return_inverse=True)
    bagging = BalancedBaggingClassifier(
        LogisticRegression(solver="lbfgs", multi_class="auto"),
        max_samples=0.5,
        max_features=0.5,
        random_state=1,
        bootstrap=False,
    )
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert len(estimators_samples) == len(estimators)
    assert len(estimators_samples[0]) == len(X) // 2
    assert estimators_samples[0].dtype.kind == "i"

    # Re-fit single estimator to test for consistent sampling
    estimator_index = 0
    estimator_samples = estimators_samples[estimator_index]
    estimator_features = estimators_features[estimator_index]
    estimator = estimators[estimator_index]

    X_train = (X[estimator_samples])[:, estimator_features]
    y_train = y[estimator_samples]

    orig_coefs = estimator.steps[-1][1].coef_
    estimator.fit(X_train, y_train)
    new_coefs = estimator.steps[-1][1].coef_

    assert_allclose(orig_coefs, new_coefs)


def show():
    # number of AdaBoost iterations
    n_estimators = 200
    # generate the Hastie 10.2 dataset
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    # use the first 2,000 of the 12,000 samples as the test set,
    # the rest as the training set
    train_x, train_y = X[2000:], y[2000:]
    test_x, test_y = X[:2000], y[:2000]

    # weak classifier (decision stump)
    dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
    dt_stump.fit(train_x, train_y)
    dt_stump_err = 1.0 - dt_stump.score(test_x, test_y)

    # full decision tree classifier
    dt = DecisionTreeClassifier()
    dt.fit(train_x, train_y)
    dt_err = 1.0 - dt.score(test_x, test_y)

    # AdaBoost classifier
    ada = AdaBoostClassifier(base_estimator=dt_stump,
                             n_estimators=n_estimators)
    ada.fit(train_x, train_y)

    # visualize the error rates of the three classifiers
    fig = plt.figure()
    # set a font so matplotlib renders the labels correctly
    plt.rcParams['font.sans-serif'] = ['SimHei']
    ax = fig.add_subplot(111)
    ax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-',
            label=u'decision stump error rate')
    ax.plot([1, n_estimators], [dt_err] * 2, 'k--',
            label=u'decision tree error rate')

    ada_err = np.zeros((n_estimators,))
    # iterate over the staged predictions: i is the iteration number,
    # pred_y the prediction at that stage
    for i, pred_y in enumerate(ada.staged_predict(test_x)):
        # record the error rate at this stage
        ada_err[i] = zero_one_loss(pred_y, test_y)
    # plot the AdaBoost error rate at each iteration
    ax.plot(np.arange(n_estimators) + 1, ada_err,
            label='AdaBoost test error rate', color='orange')

    ax.set_xlabel('number of iterations')
    ax.set_ylabel('error rate')
    leg = ax.legend(loc='upper right', fancybox=True)
    plt.show()


def test():
    X, y = make_hastie_10_2(n_samples=100000)
    y[y == -1.0] = 0.0  # AlphaTree accepts [0, 1] not [-1, 1]
    n, m = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    models = {
        "alpha_1-c45": C45Tree(max_depth=10),
        "alpha_2-cart": GiniTree(max_depth=10),
        "alpha_3": AlphaTree(alpha=3.0, max_depth=10),
        "sklearn": DecisionTreeClassifier(max_depth=10),
    }

    print("\n")
    print("-----------------------------------------------------")
    print(" model_name   train_time   predict_time   auc ")
    print("-----------------------------------------------------")
    print(" {0:12} {1:12} {2:12} {3:.5f}".format("baseline", "-", "-", 0.5))
    for name, model in models.items():
        # Fit
        start = time.time()
        model.fit(X_train, y_train)
        time_fit = time.time() - start

        # Predict
        start = time.time()
        y_hat = model.predict(X_test)
        time_pred = time.time() - start

        # Error
        auc = roc_auc_score(y_test, y_hat)

        print(" {0:12} {1:.5f} sec  {2:.5f} sec  {3:.5f}".format(
            name, time_fit, time_pred, auc))
    print("-----------------------------------------------------")
    print("\n")


def test_monitor_early_stopping():
    """Test if monitor return value works."""
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)

    for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]:
        est = Cls(n_estimators=20, max_depth=1, random_state=1,
                  subsample=0.5)
        est.fit(X, y, monitor=early_stopping_monitor)
        assert_equal(est.n_estimators, 20)  # this is not altered
        assert_equal(est.estimators_.shape[0], 10)
        assert_equal(est.train_score_.shape[0], 10)
        assert_equal(est.oob_improvement_.shape[0], 10)
        assert_equal(est._oob_score_.shape[0], 10)

        # try refit
        est.set_params(n_estimators=30)
        est.fit(X, y)
        assert_equal(est.n_estimators, 30)
        assert_equal(est.estimators_.shape[0], 30)
        assert_equal(est.train_score_.shape[0], 30)
        assert_equal(est.oob_improvement_.shape[0], 30)

        est = Cls(n_estimators=20, max_depth=1, random_state=1,
                  subsample=0.5, warm_start=True)
        est.fit(X, y, monitor=early_stopping_monitor)
        assert_equal(est.n_estimators, 20)
        assert_equal(est.estimators_.shape[0], 10)
        assert_equal(est.train_score_.shape[0], 10)
        assert_equal(est.oob_improvement_.shape[0], 10)
        assert_equal(est._oob_score_.shape[0], 10)

        # try refit
        est.set_params(n_estimators=30, warm_start=False)
        est.fit(X, y)
        assert_equal(est.n_estimators, 30)
        assert_equal(est.train_score_.shape[0], 30)
        assert_equal(est.estimators_.shape[0], 30)
        assert_equal(est.oob_improvement_.shape[0], 30)


def test():
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    X = X.astype(np.float32)
    labels, y = np.unique(y, return_inverse=True)
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    df = pd.DataFrame(X_train)
    print(df.describe())

    s = pd.Series(y_train)
    s = s.astype('str')
    print(s.describe())

    clf = ensemble.GradientBoostingClassifier(n_estimators=50)
    clf.fit(X_train, y_train)
    analyze_clf(clf)
    print(clf.score(X_train, y_train))
    print(clf.score(X_test, y_test))


def test_warm_start(random_state=42):
    # Test if fitting incrementally with warm start gives a forest of the
    # right size and the same results as a normal fit.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = BaggingClassifier(n_estimators=n_estimators,
                                       random_state=random_state,
                                       warm_start=True)
        else:
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert len(clf_ws) == n_estimators

    clf_no_ws = BaggingClassifier(n_estimators=10,
                                  random_state=random_state,
                                  warm_start=False)
    clf_no_ws.fit(X, y)

    assert (set([tree.random_state for tree in clf_ws]) ==
            set([tree.random_state for tree in clf_no_ws]))


def test_classification_synthetic(loss):
    # Test GradientBoostingClassifier on synthetic dataset used by
    # Hastie et al. in ESLII Example 12.7.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=2,
                                      max_depth=1, loss=loss,
                                      learning_rate=1.0, random_state=0)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert error_rate < 0.09

    gbrt = GradientBoostingClassifier(n_estimators=200, min_samples_split=2,
                                      max_depth=1, loss=loss,
                                      learning_rate=1.0, subsample=0.5,
                                      random_state=0)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert error_rate < 0.08


def test_warm_start_equal_n_estimators():
    # Test that nothing happens when fitting without increasing n_estimators.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=43)

    clf = SerialBaggingClassifier(n_estimators=5, warm_start=True,
                                  random_state=83)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # modify X to nonsense values; this should not change anything
    X_train += 1.0

    assert_warns_message(
        UserWarning,
        "Warm-start fitting without increasing n_estimators does not",
        clf.fit,
        X_train,
        y_train,
    )
    assert_array_equal(y_pred, clf.predict(X_test))


def test_estimators_samples():
    # Check that format of estimators_samples_ is correct.
    X, y = make_hastie_10_2(n_samples=100, random_state=1)
    # remap the y outside of the SMOTEBagging
    # _, y = np.unique(y, return_inverse=True)
    bagging = SMOTEBagging(LogisticRegression(), max_samples=0.5,
                           max_features=0.5, random_state=0,
                           bootstrap=False)
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert len(estimators_samples) == len(estimators)
    assert len(estimators_samples[0]) == len(X)
    assert estimators_samples[0].dtype.kind == 'b'


def test_estimators_samples():
    # Check that format of estimators_samples_ is correct and that results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(
        LogisticRegression(),
        max_samples=0.5,
        max_features=0.5,
        random_state=1,
        bootstrap=False,
    )
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert len(estimators_samples) == len(estimators)
    assert len(estimators_samples[0]) == len(X) // 2
    assert estimators_samples[0].dtype.kind == "i"

    # Re-fit single estimator to test for consistent sampling
    estimator_index = 0
    estimator_samples = estimators_samples[estimator_index]
    estimator_features = estimators_features[estimator_index]
    estimator = estimators[estimator_index]

    X_train = (X[estimator_samples])[:, estimator_features]
    y_train = y[estimator_samples]

    orig_coefs = estimator.coef_
    estimator.fit(X_train, y_train)
    new_coefs = estimator.coef_

    assert_array_almost_equal(orig_coefs, new_coefs)


def check_warm_start_equal_n_estimators(name):
    """Test if warm start with equal n_estimators does nothing, returns
    the same forest, and raises a warning."""
    X, y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
    ForestEstimator = FOREST_ESTIMATORS[name]
    clf = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True,
                          random_state=1)
    clf.fit(X, y)

    clf_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True,
                            random_state=1)
    clf_2.fit(X, y)
    # Now clf_2 equals clf.

    clf_2.set_params(random_state=2)
    assert_warns(UserWarning, clf_2.fit, X, y)
    # If we had fit the trees again we would have got a different forest as
    # we changed the random state.
    assert_array_equal(clf.apply(X), clf_2.apply(X))


def main():
    # generate synthetic binary classification data
    # (name refers to example 10.2 in ESL textbook...see refs below)
    X, y = make_hastie_10_2()

    # perform train/test split (no need to shuffle)
    split_pt = int(TRAIN_PCT * len(X))
    X_train, X_test = X[:split_pt], X[split_pt:]
    y_train, y_test = y[:split_pt], y[split_pt:]

    # single decision stump
    stump_clf = DecisionTreeClassifier(max_depth=1)
    stump_clf.fit(X_train, y_train)
    stump_score = round(stump_clf.score(X_test, y_test), 3)
    print('decision stump acc = {}\t(max_depth = 1)'.format(stump_score))

    # single decision tree (max_depth=5)
    tree_clf = DecisionTreeClassifier(max_depth=5)
    tree_clf.fit(X_train, y_train)
    tree_score = round(tree_clf.score(X_test, y_test), 3)
    print('decision tree acc = {}\t(max_depth = 5)\n'.format(tree_score))

    # gbt: a powerful ensemble technique
    gbt_scores = list()
    for k in (10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500):
        print('fitting gbt for n_estimators = {}...'.format(k))
        gbt_clf = GradientBoostingClassifier(
            n_estimators=k,     # number of weak learners for this iteration
            max_depth=1,        # weak learners are decision stumps
            learning_rate=1.0)  # regularization (shrinkage) hyperparam
        gbt_clf.fit(X_train, y_train)
        gbt_scores.append(round(gbt_clf.score(X_test, y_test), 3))
    print('\ngbt accuracy =\n{}'.format(gbt_scores))


def train_onnx():
    # Example BDT creation from:
    # https://scikit-learn.org/stable/modules/ensemble.html
    from sklearn.datasets import make_hastie_10_2
    from sklearn.ensemble import GradientBoostingClassifier

    # Make a random dataset from sklearn's 'hastie' generator
    X, y = make_hastie_10_2(random_state=0)
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    # Train a BDT
    clf = GradientBoostingClassifier(n_estimators=20, learning_rate=1.0,
                                     max_depth=3,
                                     random_state=0).fit(X_train, y_train)

    # Convert the fitted model to ONNX and save it
    from skl2onnx.common.data_types import FloatTensorType
    initial_type = [('float_input',
                     FloatTensorType([None, X_train.shape[1]]))]
    onnx_model = onnxmltools.convert_sklearn(clf, 'Hastie model',
                                             initial_types=initial_type)
    onnx.save(onnx_model, 'hastie_bdt.onnx')
    return onnx_model, X_test, y_test, 'hastie_bdt.onnx'


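# Usage sketch (assumes onnxruntime is installed; it is not imported by the
# original snippet). Scores the file written by train_onnx() to sanity-check
# the conversion; the helper name is an illustrative assumption.
def _demo_score_onnx():
    import numpy as np
    import onnxruntime as rt

    _, X_test, y_test, model_path = train_onnx()
    sess = rt.InferenceSession(model_path)
    input_name = sess.get_inputs()[0].name
    # The model was exported with a FloatTensorType input, so cast to float32.
    outputs = sess.run(None, {input_name: X_test[:5].astype(np.float32)})
    print("first predicted labels:", outputs[0])

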
def test_classification_synthetic():
    """Test GradientBoostingClassifier on synthetic dataset used by
    Hastie et al. in ESLII Example 12.7.
    """
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=1,
                                      max_depth=1,
                                      learning_rate=1.0, random_state=0)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert error_rate < 0.085, \
        "GB failed with error %.4f" % error_rate

    gbrt = GradientBoostingClassifier(n_estimators=200, min_samples_split=1,
                                      max_depth=1,
                                      learning_rate=1.0, subsample=0.5,
                                      random_state=0)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert error_rate < 0.08, \
        "Stochastic GB failed with error %.4f" % error_rate


def check_decision_path(name):
    X, y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
    n_samples = X.shape[0]
    ForestEstimator = FOREST_ESTIMATORS[name]
    est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False,
                          random_state=1)
    est.fit(X, y)
    indicator, n_nodes_ptr = est.decision_path(X)

    assert_equal(indicator.shape[1], n_nodes_ptr[-1])
    assert_equal(indicator.shape[0], n_samples)
    assert_array_equal(np.diff(n_nodes_ptr),
                       [e.tree_.node_count for e in est.estimators_])

    # Assert that leaf node indices are correct
    leaves = est.apply(X)
    for est_id in range(leaves.shape[1]):
        leave_indicator = [
            indicator[i, n_nodes_ptr[est_id] + j]
            for i, j in enumerate(leaves[:, est_id])
        ]
        assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples))


# Raghav RV <*****@*****.**>
# License: BSD 3 clause

import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn import ensemble
from sklearn import datasets
from sklearn.model_selection import train_test_split

data_list = [
    datasets.load_iris(return_X_y=True),
    datasets.make_classification(n_samples=800, random_state=0),
    datasets.make_hastie_10_2(n_samples=2000, random_state=0),
]
names = ["Iris Data", "Classification Data", "Hastie Data"]

n_gb = []
score_gb = []
time_gb = []
n_gbes = []
score_gbes = []
time_gbes = []
n_estimators = 200

for X, y in data_list:
    X_train, X_test, y_train, y_test = train_test_split(X, y,


# License: BSD 3 clause

import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn import ensemble
from sklearn import datasets
from sklearn.model_selection import train_test_split

print(__doc__)

data_list = [datasets.load_iris(), datasets.load_digits()]
data_list = [(d.data, d.target) for d in data_list]
data_list += [datasets.make_hastie_10_2()]
names = ['Iris Data', 'Digits Data', 'Hastie Data']

n_gb = []
score_gb = []
time_gb = []
n_gbes = []
score_gbes = []
time_gbes = []
n_estimators = 500

for X, y in data_list:
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,


def test_min_weight_fraction_leaf():
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    X = X.astype(np.float32)
    for name in FOREST_ESTIMATORS:
        yield check_min_weight_fraction_leaf, name, X, y


import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.metrics import zero_one_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# number of AdaBoost iterations
n_estimators = 200
# generate the Hastie 10.2 dataset
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
# use the first 2,000 of the 12,000 samples as the training set,
# the rest as the test set
test_x, test_y = X[2000:], y[2000:]
train_x, train_y = X[:2000], y[:2000]

# weak classifier (decision stump)
dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
dt_stump.fit(train_x, train_y)
dt_stump_err = 1.0 - dt_stump.score(test_x, test_y)

# full decision tree classifier
dt = DecisionTreeClassifier()
dt.fit(train_x, train_y)
dt_err = 1.0 - dt.score(test_x, test_y)

# AdaBoost classifier
ada = AdaBoostClassifier(base_estimator=dt_stump, n_estimators=n_estimators)
ada.fit(train_x, train_y)

# visualize the error rates of the three classifiers
fig = plt.figure()
# set a font so matplotlib renders the labels correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
ax = fig.add_subplot(111)
ax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-',
        label=u'decision stump error rate')


    return error_rate(pred_train, y_train), error_rate(pred_test, y_test)


# plot error curve
def plot_error_rate(er_train, er_test):
    df_err = pd.DataFrame([er_train, er_test]).T
    df_err.columns = ["Train", "Test"]
    plot1 = df_err.plot(linewidth=3, figsize=(8, 6),
                        color=["lightblue", "darkblue"], grid=True)
    plot1.set_xlabel('Number of iterations', fontsize=12)
    plot1.set_xticklabels(range(0, 450, 50))
    plot1.set_ylabel('Error rate', fontsize=12)
    plot1.set_title('Error rate vs number of iterations', fontsize=16)
    plt.axhline(y=er_test[0], linewidth=1, color='red', ls='dashed')


# just do it
if __name__ == '__main__':
    x, y = make_hastie_10_2()
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                        random_state=42)
    svm_clf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                      decision_function_shape='ovr', degree=3, gamma='auto',
                      kernel='rbf', max_iter=-1, probability=False,
                      random_state=None, shrinking=True, tol=0.001,
                      verbose=False)
    er_train, er_test = initclf(X_train, y_train, X_test, y_test, svm_clf)
    er_train, er_test = [er_train], [er_test]
    # evaluate AdaBoost at 10, 20, ..., 400 boosting iterations
    # (the original assigned a tuple named `xrange` and iterated over its
    # three elements, which skipped most iteration counts)
    for i in range(10, 410, 10):
        er_train_i, er_test_i = adaboost(X_train, y_train, X_test, y_test,
                                         i, svm_clf)
        er_train.append(er_train_i)
        er_test.append(er_test_i)
    plot_error_rate(er_train, er_test)


# also load the iris dataset
# and randomly permute it
iris = datasets.load_iris()
rng = check_random_state(0)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]

# Make regression dataset
X_reg, y_reg = datasets.make_regression(n_samples=500, n_features=10,
                                        random_state=1)

# also make a hastie_10_2 dataset
hastie_X, hastie_y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
hastie_X = hastie_X.astype(np.float32)

# Get the default backend in joblib to test parallelism and interaction with
# different backends
DEFAULT_JOBLIB_BACKEND = joblib.parallel.get_active_backend()[0].__class__

FOREST_CLASSIFIERS = {
    "ExtraTreesClassifier": ExtraTreesClassifier,
    "RandomForestClassifier": RandomForestClassifier,
}

FOREST_REGRESSORS = {
    "ExtraTreesRegressor": ExtraTreesRegressor,
    "RandomForestRegressor": RandomForestRegressor,
}


def test_max_leaf_nodes_max_depth():
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for name in FOREST_ESTIMATORS:
        yield check_max_leaf_nodes_max_depth, name, X, y


def test_warm_start_with_oob_score_fails():
    # Check using oob_score and warm_start simultaneously fails.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BaggingClassifier(n_estimators=5, warm_start=True, oob_score=True)
    with pytest.raises(ValueError):
        clf.fit(X, y)


    for sample, style in (('train', '--'), ('test', '-')):
        sample_score_mean = results['mean_%s_%s' % (sample, scorer)]
        sample_score_std = results['std_%s_%s' % (sample, scorer)]
        ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                        sample_score_mean + sample_score_std,
                        alpha=0.1 if sample == 'test' else 0, color=color)
        ax.plot(X_axis, sample_score_mean, style, color=color,
                alpha=1 if sample == 'test' else 0.7,
                label="%s (%s)" % (scorer, sample))

    best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0]
    best_score = results['mean_test_%s' % scorer][best_index]

    # Plot a dotted vertical line at the best score for that scorer,
    # marked by x
    ax.plot([X_axis[best_index], ] * 2, [0, best_score],
            linestyle='-.', color=color, marker='x',
            markeredgewidth=3, ms=8)

    # Annotate the best score for that scorer
    ax.annotate("%0.2f" % best_score,
                (X_axis[best_index], best_score + 0.005))

plt.legend(loc="best")
plt.grid(False)
plt.show()


if __name__ == '__main__':
    X, y = make_hastie_10_2(n_samples=8000, random_state=42)
    clf = DecisionTreeClassifier(random_state=42)
    title = 'Decision Tree Classifier using multiple scorers simultaneously'
    params = {'max_depth': range(1, 52, 2)}
    GridSearch(clf, X, y, title, params)