def test_dtype_of_classifier_probas(strategy): y = [0, 2, 1, 1] X = np.zeros(4) model = DummyClassifier(strategy=strategy, random_state=0, constant=0) probas = model.fit(X, y).predict_proba(X) assert probas.dtype == np.float64
def test_constant_strategy_exceptions(): X = [[0], [0], [0], [0]] # ignored y = [2, 1, 2, 2] clf = DummyClassifier(strategy="constant", random_state=0) assert_raises(ValueError, clf.fit, X, y) clf = DummyClassifier(strategy="constant", random_state=0, constant=[2, 0]) assert_raises(ValueError, clf.fit, X, y)
def test_uniform_strategy(): X = [[0]] * 4 # ignored y = [1, 2, 1, 1] clf = DummyClassifier(strategy="uniform", random_state=0) clf.fit(X, y) X = [[0]] * 500 y_pred = clf.predict(X) p = np.bincount(y_pred) / float(len(X)) assert_almost_equal(p[1], 0.5, decimal=1) assert_almost_equal(p[2], 0.5, decimal=1) _check_predict_proba(clf, X, y)
def test_classifier_prediction_independent_of_X(strategy): y = [0, 2, 1, 1] X1 = [[0]] * 4 clf1 = DummyClassifier(strategy=strategy, random_state=0, constant=0) clf1.fit(X1, y) predictions1 = clf1.predict(X1) X2 = [[1]] * 4 clf2 = DummyClassifier(strategy=strategy, random_state=0, constant=0) clf2.fit(X2, y) predictions2 = clf2.predict(X2) assert_array_equal(predictions1, predictions2)
def test_most_frequent_and_prior_strategy_with_2d_column_y(): # non-regression test added in # https://github.com/scikit-learn/scikit-learn/pull/13545 X = [[0], [0], [0], [0]] y_1d = [1, 2, 1, 1] y_2d = [[1], [2], [1], [1]] for strategy in ("most_frequent", "prior"): clf_1d = DummyClassifier(strategy=strategy, random_state=0) clf_2d = DummyClassifier(strategy=strategy, random_state=0) clf_1d.fit(X, y_1d) clf_2d.fit(X, y_2d) assert_array_equal(clf_1d.predict(X), clf_2d.predict(X))
def test_constant_strategy(): X = [[0], [0], [0], [0]] # ignored y = [2, 1, 2, 2] clf = DummyClassifier(strategy="constant", random_state=0, constant=1) clf.fit(X, y) assert_array_equal(clf.predict(X), np.ones(len(X))) _check_predict_proba(clf, X, y) X = [[0], [0], [0], [0]] # ignored y = ['two', 'one', 'two', 'two'] clf = DummyClassifier(strategy="constant", random_state=0, constant='one') clf.fit(X, y) assert_array_equal(clf.predict(X), np.array(['one'] * 4)) _check_predict_proba(clf, X, y)
def test_classification_sample_weight(): X = [[0], [0], [1]] y = [0, 1, 0] sample_weight = [0.1, 1., 0.1] clf = DummyClassifier().fit(X, y, sample_weight) assert_array_almost_equal(clf.class_prior_, [0.2 / 1.2, 1. / 1.2])
def test_constant_strategy_multioutput(): X = [[0], [0], [0], [0]] # ignored y = np.array([[2, 3], [1, 3], [2, 3], [2, 0]]) n_samples = len(X) clf = DummyClassifier(strategy="constant", random_state=0, constant=[1, 0]) clf.fit(X, y) assert_array_equal(clf.predict(X), np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])) _check_predict_proba(clf, X, y)
def test_constant_strategy_sparse_target(): X = [[0]] * 5 # ignored y = sp.csc_matrix(np.array([[0, 1], [4, 0], [1, 1], [1, 4], [1, 1]])) n_samples = len(X) clf = DummyClassifier(strategy="constant", random_state=0, constant=[1, 0]) clf.fit(X, y) y_pred = clf.predict(X) assert sp.issparse(y_pred) assert_array_equal(y_pred.toarray(), np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))]))
def test_most_frequent_and_prior_strategy_sparse_target(): X = [[0]] * 5 # ignored y = sp.csc_matrix(np.array([[1, 0], [1, 3], [4, 0], [0, 1], [1, 0]])) n_samples = len(X) y_expected = np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))]) for strategy in ("most_frequent", "prior"): clf = DummyClassifier(strategy=strategy, random_state=0) clf.fit(X, y) y_pred = clf.predict(X) assert sp.issparse(y_pred) assert_array_equal(y_pred.toarray(), y_expected)
def test_most_frequent_and_prior_strategy_multioutput(): X = [[0], [0], [0], [0]] # ignored y = np.array([[1, 0], [2, 0], [1, 0], [1, 3]]) n_samples = len(X) for strategy in ("prior", "most_frequent"): clf = DummyClassifier(strategy=strategy, random_state=0) clf.fit(X, y) assert_array_equal(clf.predict(X), np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])) _check_predict_proba(clf, X, y) _check_behavior_2d(clf)
def test_uniform_strategy_multioutput(): X = [[0]] * 4 # ignored y = np.array([[2, 1], [2, 2], [1, 2], [1, 1]]) clf = DummyClassifier(strategy="uniform", random_state=0) clf.fit(X, y) X = [[0]] * 500 y_pred = clf.predict(X) for k in range(y.shape[1]): p = np.bincount(y_pred[:, k]) / float(len(X)) assert_almost_equal(p[1], 0.5, decimal=1) assert_almost_equal(p[2], 0.5, decimal=1) _check_predict_proba(clf, X, y) _check_behavior_2d(clf)
def test_uniform_strategy_sparse_target_warning(): X = [[0]] * 5 # ignored y = sp.csc_matrix(np.array([[2, 1], [2, 2], [1, 4], [4, 2], [1, 1]])) clf = DummyClassifier(strategy="uniform", random_state=0) assert_warns_message(UserWarning, "the uniform strategy would not save memory", clf.fit, X, y) X = [[0]] * 500 y_pred = clf.predict(X) for k in range(y.shape[1]): p = np.bincount(y_pred[:, k]) / float(len(X)) assert_almost_equal(p[1], 1 / 3, decimal=1) assert_almost_equal(p[2], 1 / 3, decimal=1) assert_almost_equal(p[4], 1 / 3, decimal=1)
def test_stratified_strategy_sparse_target(): X = [[0]] * 5 # ignored y = sp.csc_matrix(np.array([[4, 1], [0, 0], [1, 1], [1, 4], [1, 1]])) clf = DummyClassifier(strategy="stratified", random_state=0) clf.fit(X, y) X = [[0]] * 500 y_pred = clf.predict(X) assert sp.issparse(y_pred) y_pred = y_pred.toarray() for k in range(y.shape[1]): p = np.bincount(y_pred[:, k]) / float(len(X)) assert_almost_equal(p[1], 3. / 5, decimal=1) assert_almost_equal(p[0], 1. / 5, decimal=1) assert_almost_equal(p[4], 1. / 5, decimal=1)
def test_partial_dependence_pipeline(): # check that the partial dependence support pipeline iris = load_iris() scaler = StandardScaler() clf = DummyClassifier(random_state=42) pipe = make_pipeline(scaler, clf) clf.fit(scaler.fit_transform(iris.data), iris.target) pipe.fit(iris.data, iris.target) features = 0 pdp_pipe, values_pipe = partial_dependence(pipe, iris.data, features=[features]) pdp_clf, values_clf = partial_dependence(clf, scaler.transform(iris.data), features=[features]) assert_allclose(pdp_pipe, pdp_clf) assert_allclose( values_pipe[0], values_clf[0] * scaler.scale_[features] + scaler.mean_[features])
def test_warning_recursion_non_constant_init(): # make sure that passing a non-constant init parameter to a GBDT and using # recursion method yields a warning. gbc = GradientBoostingClassifier(init=DummyClassifier(), random_state=0) gbc.fit(X, y) with pytest.warns( UserWarning, match='Using recursion method with a non-constant init predictor'): partial_dependence(gbc, X, [0], method='recursion') with pytest.warns( UserWarning, match='Using recursion method with a non-constant init predictor'): partial_dependence(gbc, X, [0], method='recursion')
def test_dummy_classifier_on_3D_array(): X = np.array([[['foo']], [['bar']], [['baz']]]) y = [2, 2, 2] y_expected = [2, 2, 2] y_proba_expected = [[1], [1], [1]] cls = DummyClassifier() cls.fit(X, y) y_pred = cls.predict(X) y_pred_proba = cls.predict_proba(X) assert_array_equal(y_pred, y_expected) assert_array_equal(y_pred_proba, y_proba_expected)
def test_most_frequent_and_prior_strategy(): X = [[0], [0], [0], [0]] # ignored y = [1, 2, 1, 1] for strategy in ("most_frequent", "prior"): clf = DummyClassifier(strategy=strategy, random_state=0) clf.fit(X, y) assert_array_equal(clf.predict(X), np.ones(len(X))) _check_predict_proba(clf, X, y) if strategy == "prior": assert_array_almost_equal(clf.predict_proba([X[0]]), clf.class_prior_.reshape((1, -1))) else: assert_array_almost_equal(clf.predict_proba([X[0]]), clf.class_prior_.reshape((1, -1)) > 0.5)
def test_multidimensional_X(): """ Check that the AdaBoost estimators can work with n-dimensional data matrix """ from mrex.dummy import DummyClassifier, DummyRegressor rng = np.random.RandomState(0) X = rng.randn(50, 3, 3) yc = rng.choice([0, 1], 50) yr = rng.randn(50) boost = AdaBoostClassifier(DummyClassifier(strategy='most_frequent')) boost.fit(X, yc) boost.predict(X) boost.predict_proba(X) boost = AdaBoostRegressor(DummyRegressor()) boost.fit(X, yr) boost.predict(X)
from mrex.dummy import DummyClassifier from mrex.datasets import fetch_20newsgroups_vectorized from mrex.metrics import accuracy_score from mrex.utils.validation import check_array from mrex.ensemble import RandomForestClassifier from mrex.ensemble import ExtraTreesClassifier from mrex.ensemble import AdaBoostClassifier from mrex.linear_model import LogisticRegression from mrex.naive_bayes import MultinomialNB ESTIMATORS = { "dummy": DummyClassifier(), "random_forest": RandomForestClassifier(max_features="sqrt", min_samples_split=10), "extra_trees": ExtraTreesClassifier(max_features="sqrt", min_samples_split=10), "logistic_regression": LogisticRegression(), "naive_bayes": MultinomialNB(), "adaboost": AdaBoostClassifier(n_estimators=10), } ############################################################################### # Data
# Normalize features X = X / 255 # Create train-test split (as [Joachims, 2006]) print("Creating train-test split...") n_train = 60000 X_train = X[:n_train] y_train = y[:n_train] X_test = X[n_train:] y_test = y[n_train:] return X_train, X_test, y_train, y_test ESTIMATORS = { "dummy": DummyClassifier(), 'CART': DecisionTreeClassifier(), 'ExtraTrees': ExtraTreesClassifier(), 'RandomForest': RandomForestClassifier(), 'Nystroem-SVM': make_pipeline( Nystroem(gamma=0.015, n_components=1000), LinearSVC(C=100)), 'SampledRBF-SVM': make_pipeline( RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=100)), 'LogisticRegression-SAG': LogisticRegression(solver='sag', tol=1e-1, C=1e4), 'LogisticRegression-SAGA': LogisticRegression(solver='saga', tol=1e-1, C=1e4), 'MultilayerPerceptron': MLPClassifier( hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, solver='sgd', learning_rate_init=0.2, momentum=0.9, verbose=1, tol=1e-4, random_state=1),
def test_string_labels(): X = [[0]] * 5 y = ["paris", "paris", "tokyo", "amsterdam", "berlin"] clf = DummyClassifier(strategy="most_frequent") clf.fit(X, y) assert_array_equal(clf.predict(X), ["paris"] * 5)
def test_classifier_score_with_None(y, y_test): clf = DummyClassifier(strategy="most_frequent") clf.fit(None, y) assert clf.score(None, y_test) == 0.5
def test_classifier_exceptions(): clf = DummyClassifier(strategy="unknown") assert_raises(ValueError, clf.fit, [], []) assert_raises(NotFittedError, clf.predict, []) assert_raises(NotFittedError, clf.predict_proba, [])