def test_dummy_classifier_on_3D_array():
    """Fit and query a default DummyClassifier on a 3D array of strings.

    All targets share one class, so the test asserts that ``predict``
    returns that class for every sample and that ``predict_proba`` returns
    a single column of ones.
    """
    features = np.array([[['foo']], [['bar']], [['baz']]])
    targets = [2, 2, 2]
    expected_labels = [2, 2, 2]
    expected_probas = [[1], [1], [1]]

    dummy = DummyClassifier()
    dummy.fit(features, targets)

    # predict is called before predict_proba, same order as always.
    labels = dummy.predict(features)
    probas = dummy.predict_proba(features)

    assert_array_equal(labels, expected_labels)
    assert_array_equal(probas, expected_probas)
def test_uniform_strategy():
    """Check the label frequencies produced by the "uniform" strategy.

    After fitting on a target with classes 1 and 2, 500 predictions are
    drawn and each class is asserted to appear with frequency 0.5
    (compared to one decimal place).
    """
    train_X = [[0]] * 4  # ignored
    y = [1, 2, 1, 1]
    clf = DummyClassifier(strategy="uniform", random_state=0)
    clf.fit(train_X, y)

    test_X = [[0]] * 500
    predictions = clf.predict(test_X)
    n_test = float(len(test_X))
    frequencies = np.bincount(predictions) / n_test

    for label in (1, 2):
        assert_almost_equal(frequencies[label], 0.5, decimal=1)
    _check_predict_proba(clf, test_X, y)
def test_classifier_prediction_independent_of_X(strategy):
    """Assert that predictions are the same for two different inputs X.

    Two identically configured classifiers are fitted on the same target,
    one with an all-zeros X and one with an all-ones X; their predictions
    must be equal.
    """
    y = [0, 2, 1, 1]

    def fit_and_predict(X):
        # A fresh, identically configured classifier for each input.
        model = DummyClassifier(
            strategy=strategy, random_state=0, constant=0
        )
        model.fit(X, y)
        return model.predict(X)

    predictions_zeros = fit_and_predict([[0]] * 4)
    predictions_ones = fit_and_predict([[1]] * 4)
    assert_array_equal(predictions_zeros, predictions_ones)
def test_most_frequent_and_prior_strategy_with_2d_column_y():
    """Compare predictions for a 1d target and its 2d column equivalent."""
    # non-regression test added in
    # https://github.com/scikit-learn/scikit-learn/pull/13545
    X = [[0], [0], [0], [0]]
    flat_y = [1, 2, 1, 1]
    # Same labels, each wrapped in its own row: [[1], [2], [1], [1]].
    column_y = [[label] for label in flat_y]

    for strategy in ("most_frequent", "prior"):
        from_flat = DummyClassifier(strategy=strategy, random_state=0)
        from_column = DummyClassifier(strategy=strategy, random_state=0)

        from_flat.fit(X, flat_y)
        from_column.fit(X, column_y)

        flat_predictions = from_flat.predict(X)
        column_predictions = from_column.predict(X)
        assert_array_equal(flat_predictions, column_predictions)
def test_constant_strategy():
    """Check the "constant" strategy with integer and string labels.

    For each case the classifier is asserted to predict the configured
    constant for every sample, then ``_check_predict_proba`` is run.
    """
    X = [[0], [0], [0], [0]]  # ignored

    # (target, constant passed to the classifier, expected predictions)
    cases = [
        ([2, 1, 2, 2], 1, np.ones(len(X))),
        (['two', 'one', 'two', 'two'], 'one', np.array(['one'] * 4)),
    ]

    for y, constant, expected in cases:
        clf = DummyClassifier(
            strategy="constant", random_state=0, constant=constant
        )
        clf.fit(X, y)
        assert_array_equal(clf.predict(X), expected)
        _check_predict_proba(clf, X, y)
def test_most_frequent_and_prior_strategy():
    """Check predictions and probabilities of "most_frequent" and "prior".

    Both strategies must predict class 1 for every sample. For "prior"
    the predicted probabilities are compared to ``class_prior_`` itself;
    for "most_frequent" they are compared to ``class_prior_ > 0.5``.
    """
    X = [[0], [0], [0], [0]]  # ignored
    y = [1, 2, 1, 1]

    for strategy in ("most_frequent", "prior"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)
        assert_array_equal(clf.predict(X), np.ones(len(X)))
        _check_predict_proba(clf, X, y)

        first_sample_proba = clf.predict_proba([X[0]])
        prior_row = clf.class_prior_.reshape((1, -1))
        expected = prior_row if strategy == "prior" else prior_row > 0.5
        assert_array_almost_equal(first_sample_proba, expected)
def test_dtype_of_classifier_probas(strategy):
    """Assert that ``predict_proba`` returns a float64 array."""
    targets = [0, 2, 1, 1]
    features = np.zeros(4)

    model = DummyClassifier(strategy=strategy, random_state=0, constant=0)
    model.fit(features, targets)
    probas = model.predict_proba(features)

    assert probas.dtype == np.float64
def test_constant_strategy_sparse_target():
    """Check the "constant" strategy on a sparse two-output target.

    The prediction must be sparse and, once densified, equal a column of
    ones next to a column of zeros (the configured constant ``[1, 0]``).
    """
    X = [[0]] * 5  # ignored
    dense_y = np.array([[0, 1], [4, 0], [1, 1], [1, 4], [1, 1]])
    y = sp.csc_matrix(dense_y)
    n_samples = len(X)

    clf = DummyClassifier(strategy="constant", random_state=0, constant=[1, 0])
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert sp.issparse(y_pred)

    ones_column = np.ones((n_samples, 1))
    zeros_column = np.zeros((n_samples, 1))
    expected = np.hstack([ones_column, zeros_column])
    assert_array_equal(y_pred.toarray(), expected)
def test_constant_strategy_multioutput():
    """Check the "constant" strategy on a dense two-output target.

    With ``constant=[1, 0]`` the prediction must be a column of ones next
    to a column of zeros; ``_check_predict_proba`` is then run.
    """
    X = [[0], [0], [0], [0]]  # ignored
    y = np.array([[2, 3], [1, 3], [2, 3], [2, 0]])
    n_samples = len(X)

    clf = DummyClassifier(strategy="constant", random_state=0, constant=[1, 0])
    clf.fit(X, y)

    predicted = clf.predict(X)
    ones_column = np.ones((n_samples, 1))
    zeros_column = np.zeros((n_samples, 1))
    assert_array_equal(predicted, np.hstack([ones_column, zeros_column]))
    _check_predict_proba(clf, X, y)
def test_most_frequent_and_prior_strategy_sparse_target():
    """Check "most_frequent" and "prior" on a sparse two-output target.

    For both strategies the prediction must be sparse and, once densified,
    equal a column of ones next to a column of zeros.
    """
    X = [[0]] * 5  # ignored
    dense_y = np.array([[1, 0], [1, 3], [4, 0], [0, 1], [1, 0]])
    y = sp.csc_matrix(dense_y)

    n_samples = len(X)
    ones_column = np.ones((n_samples, 1))
    zeros_column = np.zeros((n_samples, 1))
    y_expected = np.hstack([ones_column, zeros_column])

    for strategy in ("most_frequent", "prior"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)

        y_pred = clf.predict(X)
        assert sp.issparse(y_pred)
        assert_array_equal(y_pred.toarray(), y_expected)
def test_most_frequent_and_prior_strategy_multioutput():
    """Check "prior" and "most_frequent" on a dense two-output target.

    For both strategies the prediction must be a column of ones next to a
    column of zeros; the shared ``_check_predict_proba`` and
    ``_check_behavior_2d`` helpers are then run.
    """
    X = [[0], [0], [0], [0]]  # ignored
    y = np.array([[1, 0], [2, 0], [1, 0], [1, 3]])

    n_samples = len(X)
    # The expected prediction does not depend on the strategy.
    expected = np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])

    for strategy in ("prior", "most_frequent"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)

        predicted = clf.predict(X)
        assert_array_equal(predicted, expected)
        _check_predict_proba(clf, X, y)
        _check_behavior_2d(clf)
def test_uniform_strategy_multioutput():
    """Check the "uniform" strategy label frequencies for each output.

    After fitting on a two-output target with classes 1 and 2, 500
    predictions are drawn; in every output column each class is asserted
    to appear with frequency 0.5 (compared to one decimal place).
    """
    train_X = [[0]] * 4  # ignored
    y = np.array([[2, 1], [2, 2], [1, 2], [1, 1]])
    clf = DummyClassifier(strategy="uniform", random_state=0)
    clf.fit(train_X, y)

    test_X = [[0]] * 500
    y_pred = clf.predict(test_X)
    n_test = float(len(test_X))
    n_outputs = y.shape[1]

    for output_idx in range(n_outputs):
        frequencies = np.bincount(y_pred[:, output_idx]) / n_test
        for label in (1, 2):
            assert_almost_equal(frequencies[label], 0.5, decimal=1)

    _check_predict_proba(clf, test_X, y)
    _check_behavior_2d(clf)
def test_stratified_strategy_sparse_target():
    """Check the "stratified" strategy frequencies on a sparse target.

    The prediction for 500 samples must be sparse; in each output column
    the observed frequencies of labels 1, 0 and 4 are compared (to one
    decimal place) with 3/5, 1/5 and 1/5 respectively.
    """
    train_X = [[0]] * 5  # ignored
    dense_y = np.array([[4, 1], [0, 0], [1, 1], [1, 4], [1, 1]])
    y = sp.csc_matrix(dense_y)

    clf = DummyClassifier(strategy="stratified", random_state=0)
    clf.fit(train_X, y)

    test_X = [[0]] * 500
    sparse_pred = clf.predict(test_X)
    assert sp.issparse(sparse_pred)
    y_pred = sparse_pred.toarray()

    n_test = float(len(test_X))
    # (label, expected frequency), checked in this order for every output.
    expected_frequencies = ((1, 3. / 5), (0, 1. / 5), (4, 1. / 5))

    for output_idx in range(y.shape[1]):
        frequencies = np.bincount(y_pred[:, output_idx]) / n_test
        for label, expected in expected_frequencies:
            assert_almost_equal(frequencies[label], expected, decimal=1)
def test_partial_dependence_pipeline():
    """Compare partial dependence of a pipeline with its final estimator.

    The pipeline result on raw data must match the classifier result on
    scaled data, and the pipeline's grid values must match the
    classifier's grid values mapped back through the scaler's
    ``scale_`` and ``mean_``.
    """
    # check that the partial dependence support pipeline
    iris = load_iris()
    data, target = iris.data, iris.target
    feature = 0

    scaler = StandardScaler()
    clf = DummyClassifier(random_state=42)
    pipe = make_pipeline(scaler, clf)

    # Standalone fit on scaled data first, then the pipeline fit.
    clf.fit(scaler.fit_transform(data), target)
    pipe.fit(data, target)

    pdp_pipe, values_pipe = partial_dependence(
        pipe, data, features=[feature]
    )
    scaled_data = scaler.transform(data)
    pdp_clf, values_clf = partial_dependence(
        clf, scaled_data, features=[feature]
    )

    assert_allclose(pdp_pipe, pdp_clf)

    # Undo the scaling on the classifier's grid before comparing.
    unscaled_values = (
        values_clf[0] * scaler.scale_[feature] + scaler.mean_[feature]
    )
    assert_allclose(values_pipe[0], unscaled_values)
def test_classifier_score_with_None(y, y_test):
    """Fit and score a "most_frequent" classifier with ``None`` as X.

    The score on ``y_test`` is asserted to be exactly 0.5.
    """
    clf = DummyClassifier(strategy="most_frequent")
    clf.fit(None, y)

    accuracy = clf.score(None, y_test)
    assert accuracy == 0.5
def test_string_labels():
    """Assert that "most_frequent" predicts the most common string label."""
    X = [[0]] * 5
    y = ["paris", "paris", "tokyo", "amsterdam", "berlin"]
    expected = ["paris"] * 5

    clf = DummyClassifier(strategy="most_frequent")
    clf.fit(X, y)

    predicted = clf.predict(X)
    assert_array_equal(predicted, expected)