def test_oneclass_adaboost_proba(): # Test predict_proba robustness for one class label input. # In response to issue #7501 # https://github.com/scikit-learn/scikit-learn/issues/7501 y_t = np.ones(len(X)) clf = AdaBoostClassifier().fit(X, y_t) assert_array_almost_equal(clf.predict_proba(X), np.ones((len(X), 1)))
def test_pickle(): # Check pickability. import pickle # Adaboost classifier for alg in ['SAMME', 'SAMME.R']: obj = AdaBoostClassifier(algorithm=alg) obj.fit(iris.data, iris.target) score = obj.score(iris.data, iris.target) s = pickle.dumps(obj) obj2 = pickle.loads(s) assert type(obj2) == obj.__class__ score2 = obj2.score(iris.data, iris.target) assert score == score2 # Adaboost regressor obj = AdaBoostRegressor(random_state=0) obj.fit(boston.data, boston.target) score = obj.score(boston.data, boston.target) s = pickle.dumps(obj) obj2 = pickle.loads(s) assert type(obj2) == obj.__class__ score2 = obj2.score(boston.data, boston.target) assert score == score2
def test_error(): # Test that it gives proper exception on deficient input. assert_raises(ValueError, AdaBoostClassifier(learning_rate=-1).fit, X, y_class) assert_raises(ValueError, AdaBoostClassifier(algorithm="foo").fit, X, y_class) assert_raises(ValueError, AdaBoostClassifier().fit, X, y_class, sample_weight=np.asarray([-1]))
def test_sample_weight_missing(): from mrex.cluster import KMeans clf = AdaBoostClassifier(KMeans(), algorithm="SAMME") assert_raises(ValueError, clf.fit, X, y_regr) clf = AdaBoostRegressor(KMeans()) assert_raises(ValueError, clf.fit, X, y_regr)
def test_iris(): # Check consistency on dataset iris. classes = np.unique(iris.target) clf_samme = prob_samme = None for alg in ['SAMME', 'SAMME.R']: clf = AdaBoostClassifier(algorithm=alg) clf.fit(iris.data, iris.target) assert_array_equal(classes, clf.classes_) proba = clf.predict_proba(iris.data) if alg == "SAMME": clf_samme = clf prob_samme = proba assert proba.shape[1] == len(classes) assert clf.decision_function(iris.data).shape[1] == len(classes) score = clf.score(iris.data, iris.target) assert score > 0.9, "Failed with algorithm %s and score = %f" % \ (alg, score) # Check we used multiple estimators assert len(clf.estimators_) > 1 # Check for distinct random states (see issue #7408) assert (len(set(est.random_state for est in clf.estimators_)) == len(clf.estimators_)) # Somewhat hacky regression test: prior to # ae7adc880d624615a34bafdb1d75ef67051b8200, # predict_proba returned SAMME.R values for SAMME. clf_samme.algorithm = "SAMME.R" assert_array_less(0, np.abs(clf_samme.predict_proba(iris.data) - prob_samme))
def test_importances(): # Check variable importances. X, y = datasets.make_classification(n_samples=2000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=1) for alg in ['SAMME', 'SAMME.R']: clf = AdaBoostClassifier(algorithm=alg) clf.fit(X, y) importances = clf.feature_importances_ assert importances.shape[0] == 10 assert (importances[:3, np.newaxis] >= importances[3:]).all()
def test_classification_toy(algorithm): # Check classification on a toy dataset. clf = AdaBoostClassifier(algorithm=algorithm, random_state=0) clf.fit(X, y_class) assert_array_equal(clf.predict(T), y_t_class) assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_) assert clf.predict_proba(T).shape == (len(T), 2) assert clf.decision_function(T).shape == (len(T), )
def test_adaboost_consistent_predict(algorithm): # check that predict_proba and predict give consistent results # regression test for: # https://github.com/scikit-learn/scikit-learn/issues/14084 X_train, X_test, y_train, y_test = train_test_split( *datasets.load_digits(return_X_y=True), random_state=42) model = AdaBoostClassifier(algorithm=algorithm, random_state=42) model.fit(X_train, y_train) assert_array_equal(np.argmax(model.predict_proba(X_test), axis=1), model.predict(X_test))
def test_gridsearch(): # Check that base trees can be grid-searched. # AdaBoost classification boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier()) parameters = { 'n_estimators': (1, 2), 'base_estimator__max_depth': (1, 2), 'algorithm': ('SAMME', 'SAMME.R') } clf = GridSearchCV(boost, parameters) clf.fit(iris.data, iris.target) # AdaBoost regression boost = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), random_state=0) parameters = {'n_estimators': (1, 2), 'base_estimator__max_depth': (1, 2)} clf = GridSearchCV(boost, parameters) clf.fit(boston.data, boston.target)
def test_multidimensional_X(): """ Check that the AdaBoost estimators can work with n-dimensional data matrix """ from mrex.dummy import DummyClassifier, DummyRegressor rng = np.random.RandomState(0) X = rng.randn(50, 3, 3) yc = rng.choice([0, 1], 50) yr = rng.randn(50) boost = AdaBoostClassifier(DummyClassifier(strategy='most_frequent')) boost.fit(X, yc) boost.predict(X) boost.predict_proba(X) boost = AdaBoostRegressor(DummyRegressor()) boost.fit(X, yr) boost.predict(X)
def test_sparse_classification(): # Check classification with sparse input. class CustomSVC(SVC): """SVC variant that records the nature of the training set.""" def fit(self, X, y, sample_weight=None): """Modification on fit caries data type for later verification.""" super().fit(X, y, sample_weight=sample_weight) self.data_type_ = type(X) return self X, y = datasets.make_multilabel_classification(n_classes=1, n_samples=15, n_features=5, random_state=42) # Flatten y to a 1d array y = np.ravel(y) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) for sparse_format in [ csc_matrix, csr_matrix, lil_matrix, coo_matrix, dok_matrix ]: X_train_sparse = sparse_format(X_train) X_test_sparse = sparse_format(X_test) # Trained on sparse format sparse_classifier = AdaBoostClassifier( base_estimator=CustomSVC(probability=True), random_state=1, algorithm="SAMME").fit(X_train_sparse, y_train) # Trained on dense format dense_classifier = AdaBoostClassifier( base_estimator=CustomSVC(probability=True), random_state=1, algorithm="SAMME").fit(X_train, y_train) # predict sparse_results = sparse_classifier.predict(X_test_sparse) dense_results = dense_classifier.predict(X_test) assert_array_equal(sparse_results, dense_results) # decision_function sparse_results = sparse_classifier.decision_function(X_test_sparse) dense_results = dense_classifier.decision_function(X_test) assert_array_almost_equal(sparse_results, dense_results) # predict_log_proba sparse_results = sparse_classifier.predict_log_proba(X_test_sparse) dense_results = dense_classifier.predict_log_proba(X_test) assert_array_almost_equal(sparse_results, dense_results) # predict_proba sparse_results = sparse_classifier.predict_proba(X_test_sparse) dense_results = dense_classifier.predict_proba(X_test) assert_array_almost_equal(sparse_results, dense_results) # score sparse_results = sparse_classifier.score(X_test_sparse, y_test) dense_results = dense_classifier.score(X_test, y_test) assert_array_almost_equal(sparse_results, dense_results) # staged_decision_function sparse_results = sparse_classifier.staged_decision_function( X_test_sparse) dense_results = dense_classifier.staged_decision_function(X_test) for sprase_res, dense_res in zip(sparse_results, dense_results): assert_array_almost_equal(sprase_res, dense_res) # staged_predict sparse_results = sparse_classifier.staged_predict(X_test_sparse) dense_results = dense_classifier.staged_predict(X_test) for sprase_res, dense_res in zip(sparse_results, dense_results): assert_array_equal(sprase_res, dense_res) # staged_predict_proba sparse_results = sparse_classifier.staged_predict_proba(X_test_sparse) dense_results = dense_classifier.staged_predict_proba(X_test) for sprase_res, dense_res in zip(sparse_results, dense_results): assert_array_almost_equal(sprase_res, dense_res) # staged_score sparse_results = sparse_classifier.staged_score(X_test_sparse, y_test) dense_results = dense_classifier.staged_score(X_test, y_test) for sprase_res, dense_res in zip(sparse_results, dense_results): assert_array_equal(sprase_res, dense_res) # Verify sparsity of data is maintained during training types = [i.data_type_ for i in sparse_classifier.estimators_] assert all([(t == csc_matrix or t == csr_matrix) for t in types])
names = [ "Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process", "Decision Tree", "Random Forest", "Neural Net", "AdaBoost", "Naive Bayes", "QDA" ] classifiers = [ KNeighborsClassifier(3), SVC(kernel="linear", C=0.025), SVC(gamma=2, C=1), GaussianProcessClassifier(1.0 * RBF(1.0)), DecisionTreeClassifier(max_depth=5), RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), MLPClassifier(alpha=1, max_iter=1000), AdaBoostClassifier(), GaussianNB(), QuadraticDiscriminantAnalysis() ] X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1) rng = np.random.RandomState(2) X += 2 * rng.uniform(size=X.shape) linearly_separable = (X, y) datasets = [ make_moons(noise=0.3, random_state=0),
n_samples=200, n_features=2, n_classes=2, random_state=1) X2, y2 = make_gaussian_quantiles(mean=(3, 3), cov=1.5, n_samples=300, n_features=2, n_classes=2, random_state=1) X = np.concatenate((X1, X2)) y = np.concatenate((y1, -y2 + 1)) # Create and fit an AdaBoosted decision tree bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200) bdt.fit(X, y) plot_colors = "br" plot_step = 0.02 class_names = "AB" plt.figure(figsize=(10, 5)) # Plot the decision boundaries plt.subplot(121) x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
def test_staged_predict(algorithm): # Check staged predictions. rng = np.random.RandomState(0) iris_weights = rng.randint(10, size=iris.target.shape) boston_weights = rng.randint(10, size=boston.target.shape) clf = AdaBoostClassifier(algorithm=algorithm, n_estimators=10) clf.fit(iris.data, iris.target, sample_weight=iris_weights) predictions = clf.predict(iris.data) staged_predictions = [p for p in clf.staged_predict(iris.data)] proba = clf.predict_proba(iris.data) staged_probas = [p for p in clf.staged_predict_proba(iris.data)] score = clf.score(iris.data, iris.target, sample_weight=iris_weights) staged_scores = [ s for s in clf.staged_score( iris.data, iris.target, sample_weight=iris_weights) ] assert len(staged_predictions) == 10 assert_array_almost_equal(predictions, staged_predictions[-1]) assert len(staged_probas) == 10 assert_array_almost_equal(proba, staged_probas[-1]) assert len(staged_scores) == 10 assert_array_almost_equal(score, staged_scores[-1]) # AdaBoost regression clf = AdaBoostRegressor(n_estimators=10, random_state=0) clf.fit(boston.data, boston.target, sample_weight=boston_weights) predictions = clf.predict(boston.data) staged_predictions = [p for p in clf.staged_predict(boston.data)] score = clf.score(boston.data, boston.target, sample_weight=boston_weights) staged_scores = [ s for s in clf.staged_score( boston.data, boston.target, sample_weight=boston_weights) ] assert len(staged_predictions) == 10 assert_array_almost_equal(predictions, staged_predictions[-1]) assert len(staged_scores) == 10 assert_array_almost_equal(score, staged_scores[-1])
n_classes = 3 n_estimators = 30 cmap = plt.cm.RdYlBu plot_step = 0.02 # fine step width for decision surface contours plot_step_coarser = 0.5 # step widths for coarse classifier guesses RANDOM_SEED = 13 # fix the seed on each iteration # Load data iris = load_iris() plot_idx = 1 models = [DecisionTreeClassifier(max_depth=None), RandomForestClassifier(n_estimators=n_estimators), ExtraTreesClassifier(n_estimators=n_estimators), AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=n_estimators)] for pair in ([0, 1], [0, 2], [2, 3]): for model in models: # We only take the two corresponding features X = iris.data[:, pair] y = iris.target # Shuffle idx = np.arange(X.shape[0]) np.random.seed(RANDOM_SEED) np.random.shuffle(idx) X = X[idx] y = y[idx] # Standardize
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1) X_test, y_test = X[2000:], y[2000:] X_train, y_train = X[:2000], y[:2000] dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1) dt_stump.fit(X_train, y_train) dt_stump_err = 1.0 - dt_stump.score(X_test, y_test) dt = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1) dt.fit(X_train, y_train) dt_err = 1.0 - dt.score(X_test, y_test) ada_discrete = AdaBoostClassifier(base_estimator=dt_stump, learning_rate=learning_rate, n_estimators=n_estimators, algorithm="SAMME") ada_discrete.fit(X_train, y_train) ada_real = AdaBoostClassifier(base_estimator=dt_stump, learning_rate=learning_rate, n_estimators=n_estimators, algorithm="SAMME.R") ada_real.fit(X_train, y_train) fig = plt.figure() ax = fig.add_subplot(111) ax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-', label='Decision Stump Error')
def test_base_estimator(): # Test different base estimators. from mrex.ensemble import RandomForestClassifier # XXX doesn't work with y_class because RF doesn't support classes_ # Shouldn't AdaBoost run a LabelBinarizer? clf = AdaBoostClassifier(RandomForestClassifier()) clf.fit(X, y_regr) clf = AdaBoostClassifier(SVC(), algorithm="SAMME") clf.fit(X, y_class) from mrex.ensemble import RandomForestRegressor clf = AdaBoostRegressor(RandomForestRegressor(), random_state=0) clf.fit(X, y_regr) clf = AdaBoostRegressor(SVR(), random_state=0) clf.fit(X, y_regr) # Check that an empty discrete ensemble fails in fit, not predict. X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]] y_fail = ["foo", "bar", 1, 2] clf = AdaBoostClassifier(SVC(), algorithm="SAMME") assert_raises_regexp(ValueError, "worse than random", clf.fit, X_fail, y_fail)
from mrex.ensemble import AdaBoostClassifier from mrex.metrics import accuracy_score from mrex.tree import DecisionTreeClassifier X, y = make_gaussian_quantiles(n_samples=13000, n_features=10, n_classes=3, random_state=1) n_split = 3000 X_train, X_test = X[:n_split], X[n_split:] y_train, y_test = y[:n_split], y[n_split:] bdt_real = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1) bdt_discrete = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1.5, algorithm="SAMME") bdt_real.fit(X_train, y_train) bdt_discrete.fit(X_train, y_train) real_test_errors = [] discrete_test_errors = [] for real_test_predict, discrete_train_predict in zip( bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)):
from mrex.linear_model import LogisticRegression from mrex.naive_bayes import MultinomialNB ESTIMATORS = { "dummy": DummyClassifier(), "random_forest": RandomForestClassifier(max_features="sqrt", min_samples_split=10), "extra_trees": ExtraTreesClassifier(max_features="sqrt", min_samples_split=10), "logistic_regression": LogisticRegression(), "naive_bayes": MultinomialNB(), "adaboost": AdaBoostClassifier(n_estimators=10), } ############################################################################### # Data if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-e', '--estimators', nargs="+", required=True, choices=ESTIMATORS) args = vars(parser.parse_args())