Ejemplo n.º 1
0
def test_multiclass_multioutput_estimator_predict_proba():
    seed = 542

    # make test deterministic
    rng = np.random.RandomState(seed)

    # random features
    X = rng.normal(size=(5, 5))

    # random labels
    y1 = np.array(['b', 'a', 'a', 'b', 'a']).reshape(5, 1)  # 2 classes
    y2 = np.array(['d', 'e', 'f', 'e', 'd']).reshape(5, 1)  # 3 classes

    Y = np.concatenate([y1, y2], axis=1)

    clf = MultiOutputClassifier(
        LogisticRegression(solver='liblinear', random_state=seed))

    clf.fit(X, Y)

    y_result = clf.predict_proba(X)
    y_actual = [
        np.array([[0.23481764, 0.76518236], [0.67196072, 0.32803928],
                  [0.54681448, 0.45318552], [0.34883923, 0.65116077],
                  [0.73687069, 0.26312931]]),
        np.array([[0.5171785, 0.23878628, 0.24403522],
                  [0.22141451, 0.64102704, 0.13755846],
                  [0.16751315, 0.18256843, 0.64991843],
                  [0.27357372, 0.55201592, 0.17441036],
                  [0.65745193, 0.26062899, 0.08191907]])
    ]

    for i in range(len(y_actual)):
        assert_almost_equal(y_result[i], y_actual[i])
Ejemplo n.º 2
0
def test_multi_output_exceptions():
    # NotFittedError when fit is not done but score, predict and
    # and predict_proba are called
    moc = MultiOutputClassifier(LinearSVC(random_state=0))
    assert_raises(NotFittedError, moc.predict, y)
    assert_raises(NotFittedError, moc.predict_proba, y)
    assert_raises(NotFittedError, moc.score, X, y)
    # ValueError when number of outputs is different
    # for fit and score
    y_new = np.column_stack((y1, y2))
    moc.fit(X, y)
    assert_raises(ValueError, moc.score, X, y_new)
    # ValueError when y is continuous
    assert_raise_message(ValueError, "Unknown label type", moc.fit, X, X[:, 1])
Ejemplo n.º 3
0
def test_multi_output_classification_partial_fit():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict

    sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)

    # train the multi_target_linear and also get the predictions.
    half_index = X.shape[0] // 2
    multi_target_linear.partial_fit(X[:half_index],
                                    y[:half_index],
                                    classes=classes)

    first_predictions = multi_target_linear.predict(X)
    assert (n_samples, n_outputs) == first_predictions.shape

    multi_target_linear.partial_fit(X[half_index:], y[half_index:])
    second_predictions = multi_target_linear.predict(X)
    assert (n_samples, n_outputs) == second_predictions.shape

    # train the linear classification with each column and assert that
    # predictions are equal after first partial_fit and second partial_fit
    for i in range(3):
        # create a clone with the same state
        sgd_linear_clf = clone(sgd_linear_clf)
        sgd_linear_clf.partial_fit(X[:half_index],
                                   y[:half_index, i],
                                   classes=classes[i])
        assert_array_equal(sgd_linear_clf.predict(X), first_predictions[:, i])
        sgd_linear_clf.partial_fit(X[half_index:], y[half_index:, i])
        assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, i])
Ejemplo n.º 4
0
def test_multiclass_multioutput_estimator():
    # test to check meta of meta estimators
    svc = LinearSVC(random_state=0)
    multi_class_svc = OneVsRestClassifier(svc)
    multi_target_svc = MultiOutputClassifier(multi_class_svc)

    multi_target_svc.fit(X, y)

    predictions = multi_target_svc.predict(X)
    assert (n_samples, n_outputs) == predictions.shape

    # train the forest with each column and assert that predictions are equal
    for i in range(3):
        multi_class_svc_ = clone(multi_class_svc)  # create a clone
        multi_class_svc_.fit(X, y[:, i])
        assert (list(multi_class_svc_.predict(X)) == list(predictions[:, i]))
Ejemplo n.º 5
0
def test_multi_output_classification_partial_fit_sample_weights():
    # weighted classifier
    Xw = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    yw = [[3, 2], [2, 3], [3, 2]]
    w = np.asarray([2., 1., 1.])
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=20)
    clf_w = MultiOutputClassifier(sgd_linear_clf)
    clf_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [3, 2], [2, 3], [3, 2]]
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=20)
    clf = MultiOutputClassifier(sgd_linear_clf)
    clf.fit(X, y)
    X_test = [[1.5, 2.5, 3.5]]
    assert_array_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
Ejemplo n.º 6
0
def test_multi_output_classification_sample_weights():
    # weighted classifier
    Xw = [[1, 2, 3], [4, 5, 6]]
    yw = [[3, 2], [2, 3]]
    w = np.asarray([2., 1.])
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    clf_w = MultiOutputClassifier(forest)
    clf_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
    y = [[3, 2], [3, 2], [2, 3]]
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    clf = MultiOutputClassifier(forest)
    clf.fit(X, y)

    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
    assert_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
Ejemplo n.º 7
0
def test_multi_output_classification():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict, prodict_proba and score

    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    multi_target_forest = MultiOutputClassifier(forest)

    # train the multi_target_forest and also get the predictions.
    multi_target_forest.fit(X, y)

    predictions = multi_target_forest.predict(X)
    assert (n_samples, n_outputs) == predictions.shape

    predict_proba = multi_target_forest.predict_proba(X)

    assert len(predict_proba) == n_outputs
    for class_probabilities in predict_proba:
        assert (n_samples, n_classes) == class_probabilities.shape

    assert_array_equal(np.argmax(np.dstack(predict_proba), axis=1),
                       predictions)

    # train the forest with each column and assert that predictions are equal
    for i in range(3):
        forest_ = clone(forest)  # create a clone with the same state
        forest_.fit(X, y[:, i])
        assert list(forest_.predict(X)) == list(predictions[:, i])
        assert_array_equal(list(forest_.predict_proba(X)),
                           list(predict_proba[i]))
Ejemplo n.º 8
0
def test_multi_output_classification_partial_fit_parallelism():
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
    mor = MultiOutputClassifier(sgd_linear_clf, n_jobs=4)
    mor.partial_fit(X, y, classes)
    est1 = mor.estimators_[0]
    mor.partial_fit(X, y)
    est2 = mor.estimators_[0]
    if cpu_count() > 1:
        # parallelism requires this to be the case for a sane implementation
        assert est1 is not est2
Ejemplo n.º 9
0
def test_multi_output_classification_partial_fit_no_first_classes_exception():
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    assert_raises_regex(
        ValueError, "classes must be passed on the first call "
        "to partial_fit.", multi_target_linear.partial_fit, X, y)
Ejemplo n.º 10
0
def test_multi_output_predict_proba():
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
    param = {'loss': ('hinge', 'log', 'modified_huber')}

    # inner function for custom scoring
    def custom_scorer(estimator, X, y):
        if hasattr(estimator, "predict_proba"):
            return 1.0
        else:
            return 0.0

    grid_clf = GridSearchCV(sgd_linear_clf,
                            param_grid=param,
                            scoring=custom_scorer,
                            cv=3)
    multi_target_linear = MultiOutputClassifier(grid_clf)
    multi_target_linear.fit(X, y)

    multi_target_linear.predict_proba(X)

    # SGDClassifier defaults to loss='hinge' which is not a probabilistic
    # loss function; therefore it does not expose a predict_proba method
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    multi_target_linear.fit(X, y)
    err_msg = "The base estimator should implement predict_proba method"
    with pytest.raises(ValueError, match=err_msg):
        multi_target_linear.predict_proba(X)
Ejemplo n.º 11
0
            RegressorChain(Ridge())
    ]:
        chain.fit(X, Y)
        chain_cv = clone(chain).set_params(cv=3)
        chain_cv.fit(X, Y)
        Y_pred_cv = chain_cv.predict(X)
        Y_pred = chain.predict(X)

        assert Y_pred_cv.shape == Y_pred.shape
        assert not np.all(Y_pred == Y_pred_cv)
        if isinstance(chain, ClassifierChain):
            assert jaccard_score(Y, Y_pred_cv, average='samples') > .4
        else:
            assert mean_squared_error(Y, Y_pred_cv) < .25


@pytest.mark.parametrize('estimator', [
    RandomForestClassifier(n_estimators=2),
    MultiOutputClassifier(RandomForestClassifier(n_estimators=2)),
    ClassifierChain(RandomForestClassifier(n_estimators=2))
])
def test_multi_output_classes_(estimator):
    # Tests classes_ attribute of multioutput classifiers
    # RandomForestClassifier supports multioutput out-of-the-box
    estimator.fit(X, y)
    assert isinstance(estimator.classes_, list)
    assert len(estimator.classes_) == n_outputs
    for estimator_classes, expected_classes in zip(classes,
                                                   estimator.classes_):
        assert_array_equal(estimator_classes, expected_classes)