Example #1
def test_classif_binary(weighting):
    clf = RobustWeightedClassifier(
        max_iter=100,
        weighting=weighting,
        k=0,
        c=1e7,
        burn_in=0,
        multi_class="binary",
        random_state=rng,
    )
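    # Note: loss="log" is the logistic loss; newer scikit-learn versions
    # (>= 1.1) call it "log_loss".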
    clf_not_rob = SGDClassifier(loss="log", random_state=rng)
    clf.fit(X_cb, y_cb)
    clf_not_rob.fit(X_cb, y_cb)
    norm_coef1 = np.linalg.norm(np.hstack([clf.coef_.ravel(), clf.intercept_]))
    norm_coef2 = np.linalg.norm(
        np.hstack([clf_not_rob.coef_.ravel(), clf_not_rob.intercept_])
    )
    coef1 = clf.coef_ / norm_coef1
    coef2 = clf_not_rob.coef_ / norm_coef2

    intercept1 = clf.intercept_ / norm_coef1
    intercept2 = clf_not_rob.intercept_ / norm_coef2

    assert np.linalg.norm(coef1 - coef2) < 0.5
    assert np.linalg.norm(intercept1 - intercept2) < 0.5

    assert len(clf.weights_) == len(X_cb)
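
All of the snippets below reference shared fixtures (rng, X_cb/y_cb, X_c/y_c,
X_cc/y_cc) defined elsewhere in the test module. A plausible reconstruction,
assuming make_blobs-style data with the first three samples of X_cc turned
into outliers (the exact generation code is not shown here):

import numpy as np
import pytest
from sklearn.datasets import make_blobs
from sklearn.linear_model import SGDClassifier
from sklearn_extra.robust import RobustWeightedClassifier, RobustWeightedKMeans

rng = np.random.RandomState(42)

# Binary classification data.
X_cb, y_cb = make_blobs(n_samples=100, centers=2, random_state=rng)

# Multiclass data (three blobs, so y_c is not binary).
X_c, y_c = make_blobs(n_samples=100, centers=3, random_state=rng)

# Corrupted data: clean blobs whose first three samples become far outliers.
X_cc, y_cc = make_blobs(n_samples=100, centers=2, random_state=rng)
X_cc[:3] += 100.0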
Example #2
def test_classif_corrupted_weights(weighting):
    clf = RobustWeightedClassifier(
        max_iter=100,
        weighting=weighting,
        k=5,
        c=1,
        burn_in=0,
        multi_class="binary",
        random_state=rng,
    )
    clf.fit(X_cc, y_cc)
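    # The first three samples of X_cc are the corrupted points; the robust
    # weighting should give them lower weights than the clean samples.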
    assert np.mean(clf.weights_[:3]) < np.mean(clf.weights_[3:])
Example #3
def test_corrupted_classif(loss, weighting, k, c, multi_class):
    clf = RobustWeightedClassifier(
        loss=loss,
        max_iter=100,
        weighting=weighting,
        k=k,
        c=c,
        multi_class=multi_class,
        random_state=rng,
    )
    clf.fit(X_cc, y_cc)
    score = clf.score(X_cc, y_cc)
    assert score > 0.8
Example #4
def test_predict_proba(weighting):
    clf = RobustWeightedClassifier(
        max_iter=100,
        weighting=weighting,
        k=0,
        c=1e7,
        burn_in=0,
        random_state=rng,
    )
    clf_not_rob = SGDClassifier(loss="log", random_state=rng)
    clf.fit(X_c, y_c)
    clf_not_rob.fit(X_c, y_c)
    pred1 = clf.base_estimator_.predict_proba(X_c)[:, 1]
    pred2 = clf_not_rob.predict_proba(X_c)[:, 1]

    assert np.mean((pred1 > 1 / 2) == (pred2 > 1 / 2)) > 0.8
Example #5
def test_not_robust_classif(loss, weighting, multi_class):
    clf = RobustWeightedClassifier(
        loss=loss,
        max_iter=100,
        weighting=weighting,
        k=0,
        c=1e7,
        burn_in=0,
        multi_class=multi_class,
        random_state=rng,
    )
    clf_not_rob = SGDClassifier(loss=loss, random_state=rng)
    clf.fit(X_c, y_c)
    clf_not_rob.fit(X_c, y_c)
    pred1 = clf.base_estimator_.decision_function(X_c)
    pred2 = clf_not_rob.decision_function(X_c)

    assert np.mean((pred1 > 0) == (pred2 > 0)) > 0.8
Example #6
def test_robust_estimator_input_validation_and_fit_check():
    # Invalid parameters
    msg = "max_iter must be > 0, got 0."
    with pytest.raises(ValueError, match=msg):
        RobustWeightedKMeans(max_iter=0).fit(X_cc)

    msg = "c must be > 0, got 0."
    with pytest.raises(ValueError, match=msg):
        RobustWeightedKMeans(c=0).fit(X_cc)

    msg = "burn_in must be >= 0, got -1."
    with pytest.raises(ValueError, match=msg):
        RobustWeightedClassifier(burn_in=-1).fit(X_cc, y_cc)

    msg = "eta0 must be > 0, got 0."
    with pytest.raises(ValueError, match=msg):
        RobustWeightedClassifier(burn_in=1, eta0=0).fit(X_cc, y_cc)

    msg = "k must be integer >= 0, and smaller than floor"
    with pytest.raises(ValueError, match=msg):
        RobustWeightedKMeans(k=-1).fit(X_cc)
Example #7
def test_not_robust_classif(loss, weighting, multi_class):
    clf = RobustWeightedClassifier(
        loss=loss,
        max_iter=100,
        weighting=weighting,
        k=0,
        c=1e7,
        burn_in=0,
        multi_class=multi_class,
        random_state=rng,
    )
    clf_not_rob = SGDClassifier(loss=loss, random_state=rng)
    clf.fit(X_c, y_c)
    clf_not_rob.fit(X_c, y_c)
    pred1 = clf.predict(X_c)
    pred2 = clf_not_rob.predict(X_c)

    assert np.mean((pred1 > 0) == (pred2 > 0)) > 0.8
    assert clf.score(X_c, y_c) == np.mean(pred1 == y_c)
Example #8
estimators = [
    (
        "SGDClassifier, Hinge loss",
        SGDClassifier(loss="hinge", random_state=rng),
    ),
    ("SGDClassifier, log loss", SGDClassifier(loss="log", random_state=rng)),
    (
        "SGDClassifier, modified_huber loss",
        SGDClassifier(loss="modified_huber", random_state=rng),
    ),
    (
        "RobustWeightedClassifier",
        RobustWeightedClassifier(
            max_iter=100,
            weighting="mom",
            k=8,
            random_state=rng,
        ),
        # The parameter k is set larger than the number of outliers
        # because here we know it. max_iter is set to 100. One may want
        # to play with the number of iterations or the optimization
        # scheme of the base_estimator to get good results.
    ),
]


# Helper function to plot the estimators' decision regions
def plot_classif(clf, X, y, ax):
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    h = 0.02  # step size in the mesh
    # The source truncates the helper here; a standard mesh-grid
    # decision-boundary rendering would continue as follows.
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.3)
    ax.scatter(X[:, 0], X[:, 1], c=y, edgecolors="k")
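
A hypothetical driver for this helper (the fit-and-plot loop and figure layout
are assumptions; X and y are the dataset from the original example):

import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, len(estimators), figsize=(4 * len(estimators), 4))
for ax, (name, est) in zip(axes, estimators):
    est.fit(X, y)
    plot_classif(est, X, y, ax)
    ax.set_title(name)
plt.show()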
Example #9
def test_robust_estimator_unsupported_multiclass():
    """Test that warning message is thrown when unsupported weighting."""
    model = RobustWeightedClassifier(multi_class="invalid")
    msg = "No such multiclass method implemented."
    with pytest.raises(ValueError, match=msg):
        model.fit(X_cc, y_cc)
Example #10
def test_robust_estimator_unsupported_weighting():
    """Test that warning message is thrown when unsupported weighting."""
    model = RobustWeightedClassifier(weighting="invalid")
    msg = "No such weighting scheme"
    with pytest.raises(ValueError, match=msg):
        model.fit(X_cc, y_cc)
Example #11
def test_robust_estimator_unsupported_loss():
    """Test that warning message is thrown when unsupported loss."""
    model = RobustWeightedClassifier(loss="invalid")
    msg = "The loss invalid is not supported. "
    with pytest.raises(ValueError, match=msg):
        model.fit(X_cc, y_cc)
Example #12
def test_robust_estimator_max_iter():
    """Test that warning message is thrown when max_iter is reached."""
    model = RobustWeightedClassifier(max_iter=1)
    msg = "Maximum number of iteration reached before"
    with pytest.warns(UserWarning, match=msg):
        model.fit(X_cc, y_cc)
Example #13
def test_robust_no_proba():
    est = RobustWeightedClassifier(loss="hinge").fit(X_c, y_c)
    msg = "Probability estimates are not available for loss='hinge'"
    with pytest.raises(AttributeError, match=msg):
        est.predict_proba(X_c)
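
With loss="hinge" no probability model is fitted, hence the AttributeError; a
log-loss model does expose predict_proba. A minimal sketch on the binary
fixtures (same assumed setup as above):

est = RobustWeightedClassifier(loss="log").fit(X_cb, y_cb)
proba = est.predict_proba(X_cb)  # shape (n_samples, 2), rows sum to 1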
Example #14
def test_robust_estimator_non_binary_y():
    """Test that an error is raised when multi_class="binary" but y is not binary."""
    model = RobustWeightedClassifier(multi_class="binary")
    msg = "y must be binary."
    with pytest.raises(ValueError, match=msg):
        model.fit(X_c, y_c)
Example #15
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn_extra.robust import RobustWeightedClassifier

# Scale the dataset with sklearn RobustScaler (important for this algorithm)
X = RobustScaler().fit_transform(X)

# Using GridSearchCV to tune the parameters alpha, eta0, learning_rate, loss
# and average of SGDClassifier, we obtained the following parameters.

clf_not_rob = SGDClassifier(average=10, learning_rate="optimal", loss="hinge")

# Then, we use this estimator as the base_estimator of RobustWeightedEstimator.
# Using GridSearchCV, we tuned the parameters c and eta0, choosing the
# "huber" weighting because the sample size is not very large.

clf_rob = RobustWeightedClassifier(
    weighting="huber",
    loss="hinge",
    c=1.35,
    eta0=1e-3,
    max_iter=300,
)
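
# For reference, a grid search of the kind described above might look like
# the following sketch; the grid values are illustrative, not the original
# search space, and y is the label vector from the original example.
param_grid = {"c": [0.5, 1.0, 1.35, 2.0], "eta0": [1e-4, 1e-3, 1e-2]}
search = GridSearchCV(
    RobustWeightedClassifier(weighting="huber", loss="hinge", max_iter=300),
    param_grid,
    cv=5,
).fit(X, y)
print(search.best_params_)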

# We run the cross validation M times in order to also get an estimate
# of the variance of the estimators' loss.
M = 10
res = []
for f in range(M):
    rng = np.random.RandomState(f)
    print("\r Progress: %s / %s" % (f + 1, M), end="")
    clf = SGDClassifier(
        average=10, learning_rate="optimal", loss="hinge", random_state=rng
    )
    clf_rob.set_params(random_state=rng)
    # Plausible continuation (the source is truncated here): cross-validate
    # both estimators; accuracy scoring and cv=5 are assumptions.
    res.append([cross_val_score(clf, X, y, cv=5).mean(),
                cross_val_score(clf_rob, X, y, cv=5).mean()])
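
# A summary step like the following (an assumption; the source stops above)
# reports the mean and spread of the cross-validated scores.
res = np.array(res)
print("\nSGD: %.3f +/- %.3f" % (res[:, 0].mean(), res[:, 0].std()))
print("Robust: %.3f +/- %.3f" % (res[:, 1].mean(), res[:, 1].std()))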