def test_classif_binary(weighting):
    """Compare a binary RobustWeightedClassifier with a plain SGDClassifier.

    Both models are fitted on ``(X_cb, y_cb)``. Each model's parameters are
    scaled by the norm of its stacked ``(coef_, intercept_)`` vector, and the
    scaled coefficients and intercepts must be within 0.5 of each other.
    The robust model must also expose one weight per training sample.
    """

    def _unit_scaled_params(estimator):
        # Norm of the full parameter vector (flattened coefficients followed
        # by the intercept), used to scale both parts consistently.
        scale = np.linalg.norm(
            np.hstack([estimator.coef_.ravel(), estimator.intercept_])
        )
        return estimator.coef_ / scale, estimator.intercept_ / scale

    robust = RobustWeightedClassifier(
        max_iter=100,
        weighting=weighting,
        k=0,
        c=1e7,
        burn_in=0,
        multi_class="binary",
        random_state=rng,
    )
    # NOTE(review): scikit-learn renamed loss="log" to "log_loss" in 1.1 and
    # dropped the old spelling in 1.3 -- confirm the targeted version.
    baseline = SGDClassifier(loss="log", random_state=rng)

    # Fit order matters: both estimators draw from the shared ``rng``.
    robust.fit(X_cb, y_cb)
    baseline.fit(X_cb, y_cb)

    robust_coef, robust_intercept = _unit_scaled_params(robust)
    baseline_coef, baseline_intercept = _unit_scaled_params(baseline)

    assert np.linalg.norm(robust_coef - baseline_coef) < 0.5
    assert np.linalg.norm(robust_intercept - baseline_intercept) < 0.5
    assert len(robust.weights_) == len(X_cb)
def test_classif_corrupted_weights(weighting):
    """Check the sample weights learnt on ``(X_cc, y_cc)``.

    The mean weight of the first three samples must be strictly lower than
    the mean weight of all remaining samples (presumably the first three
    rows of ``X_cc`` are the corrupted ones -- confirm against the fixture).
    """
    settings = dict(
        max_iter=100,
        weighting=weighting,
        k=5,
        c=1,
        burn_in=0,
        multi_class="binary",
        random_state=rng,
    )
    robust = RobustWeightedClassifier(**settings)
    robust.fit(X_cc, y_cc)

    sample_weights = robust.weights_
    leading_mean = np.mean(sample_weights[:3])
    remaining_mean = np.mean(sample_weights[3:])
    assert leading_mean < remaining_mean
def test_corrupted_classif(loss, weighting, k, c, multi_class):
    """Fit on ``(X_cc, y_cc)`` and require a training score above 0.8."""
    settings = dict(
        loss=loss,
        max_iter=100,
        weighting=weighting,
        k=k,
        c=c,
        multi_class=multi_class,
        random_state=rng,
    )
    robust = RobustWeightedClassifier(**settings)
    robust.fit(X_cc, y_cc)
    assert robust.score(X_cc, y_cc) > 0.8
def test_predict_proba(weighting):
    """Compare thresholded probabilities with a plain SGDClassifier.

    Both models are fitted on ``(X_c, y_c)``. The probability in column 1,
    thresholded at one half, must agree between the robust model's
    ``base_estimator_`` and the reference model on more than 80% of samples.
    """
    robust = RobustWeightedClassifier(
        max_iter=100,
        weighting=weighting,
        k=0,
        c=1e7,
        burn_in=0,
        random_state=rng,
    )
    # NOTE(review): scikit-learn renamed loss="log" to "log_loss" in 1.1 and
    # dropped the old spelling in 1.3 -- confirm the targeted version.
    baseline = SGDClassifier(loss="log", random_state=rng)

    # Fit order matters: both estimators draw from the shared ``rng``.
    robust.fit(X_c, y_c)
    baseline.fit(X_c, y_c)

    robust_above_half = robust.base_estimator_.predict_proba(X_c)[:, 1] > 1 / 2
    baseline_above_half = baseline.predict_proba(X_c)[:, 1] > 1 / 2

    agreement = np.mean(robust_above_half == baseline_above_half)
    assert agreement > 0.8
def test_not_robust_classif(loss, weighting, multi_class):
    """Compare decision-function signs with a plain SGDClassifier.

    Both models are fitted on ``(X_c, y_c)``. The sign of the decision
    function of the robust model's ``base_estimator_`` must match the sign of
    the reference model's decision function on more than 80% of entries.

    NOTE(review): another test with this exact name is defined later in this
    file; if both live in the same module, the later definition replaces
    this one and pytest never collects it.
    """
    robust = RobustWeightedClassifier(
        loss=loss,
        max_iter=100,
        weighting=weighting,
        k=0,
        c=1e7,
        burn_in=0,
        multi_class=multi_class,
        random_state=rng,
    )
    baseline = SGDClassifier(loss=loss, random_state=rng)

    # Fit order matters: both estimators draw from the shared ``rng``.
    robust.fit(X_c, y_c)
    baseline.fit(X_c, y_c)

    robust_positive = robust.base_estimator_.decision_function(X_c) > 0
    baseline_positive = baseline.decision_function(X_c) > 0

    agreement = np.mean(robust_positive == baseline_positive)
    assert agreement > 0.8
def test_robust_estimator_input_validation_and_fit_check():
    """Check that invalid hyper-parameters raise ValueError when fitting.

    Each case builds an estimator with one invalid setting, fits it, and
    expects a ValueError whose message matches the given pattern.
    """
    # (estimator class, constructor kwargs, fit arguments, expected message)
    invalid_cases = [
        (
            RobustWeightedKMeans,
            {"max_iter": 0},
            (X_cc,),
            "max_iter must be > 0, got 0.",
        ),
        (
            RobustWeightedKMeans,
            {"c": 0},
            (X_cc,),
            "c must be > 0, got 0.",
        ),
        (
            RobustWeightedClassifier,
            {"burn_in": -1},
            (X_cc, y_cc),
            "burn_in must be >= 0, got -1.",
        ),
        (
            RobustWeightedClassifier,
            {"burn_in": 1, "eta0": 0},
            (X_cc, y_cc),
            "eta0 must be > 0, got 0.",
        ),
        (
            RobustWeightedKMeans,
            {"k": -1},
            (X_cc,),
            "k must be integer >= 0, and smaller than floor",
        ),
    ]

    for estimator_cls, params, fit_args, expected_message in invalid_cases:
        with pytest.raises(ValueError, match=expected_message):
            estimator_cls(**params).fit(*fit_args)
def test_not_robust_classif(loss, weighting, multi_class):
    """Compare predictions with a plain SGDClassifier and check ``score``.

    Both models are fitted on ``(X_c, y_c)``. The predictions, compared
    through ``> 0``, must agree on more than 80% of samples, and ``score``
    must equal the fraction of robust predictions that match ``y_c``.

    NOTE(review): another test with this exact name is defined earlier in
    this file; if both live in the same module, this definition replaces the
    earlier one.
    """
    robust = RobustWeightedClassifier(
        loss=loss,
        max_iter=100,
        weighting=weighting,
        k=0,
        c=1e7,
        burn_in=0,
        multi_class=multi_class,
        random_state=rng,
    )
    baseline = SGDClassifier(loss=loss, random_state=rng)

    # Fit order matters: both estimators draw from the shared ``rng``.
    robust.fit(X_c, y_c)
    baseline.fit(X_c, y_c)

    robust_labels = robust.predict(X_c)
    baseline_labels = baseline.predict(X_c)

    # NOTE(review): the comparison is made on ``label > 0`` rather than on
    # the labels themselves -- confirm this is intended for multi-class y.
    agreement = np.mean((robust_labels > 0) == (baseline_labels > 0))
    assert agreement > 0.8

    accuracy = np.mean(robust_labels == y_c)
    assert robust.score(X_c, y_c) == accuracy
# Estimators to compare, as (display name, estimator instance) pairs.
estimators = [
    (
        "SGDClassifier, Hinge loss",
        SGDClassifier(loss="hinge", random_state=rng),
    ),
    ("SGDClassifier, log loss", SGDClassifier(loss="log", random_state=rng)),
    (
        "SGDClassifier, modified_huber loss",
        SGDClassifier(loss="modified_huber", random_state=rng),
    ),
    (
        "RobustWeightedClassifier",
        RobustWeightedClassifier(
            max_iter=100,
            weighting="mom",
            k=8,
            random_state=rng,
        ),
        # The parameter k is set larger than the number of outliers
        # because here we know it. max_iter is set to 100. One may want
        # to play with the number of iterations or the optimization scheme
        # of the base_estimator to get good results.
    ),
]


# Helper function used to represent estimators.
def plot_classif(clf, X, y, ax):
    # Range of the first two feature columns, padded by 0.5 on each side.
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    h = 0.02  # step size in the mesh
def test_robust_estimator_unsupported_multiclass():
    """Test that fitting with an unknown ``multi_class`` raises ValueError."""
    expected_message = "No such multiclass method implemented."
    classifier = RobustWeightedClassifier(multi_class="invalid")
    with pytest.raises(ValueError, match=expected_message):
        classifier.fit(X_cc, y_cc)
def test_robust_estimator_unsupported_weighting():
    """Test that fitting with an unknown ``weighting`` raises ValueError."""
    expected_message = "No such weighting scheme"
    classifier = RobustWeightedClassifier(weighting="invalid")
    with pytest.raises(ValueError, match=expected_message):
        classifier.fit(X_cc, y_cc)
def test_robust_estimator_unsupported_loss():
    """Test that fitting with an unknown ``loss`` raises ValueError.

    NOTE(review): another test with this exact name is defined later in this
    file; if both live in the same module, the later definition replaces
    this one and pytest never collects it.
    """
    expected_message = "The loss invalid is not supported. "
    classifier = RobustWeightedClassifier(loss="invalid")
    with pytest.raises(ValueError, match=expected_message):
        classifier.fit(X_cc, y_cc)
def test_robust_estimator_max_iter():
    """Test that a UserWarning is emitted when ``max_iter`` is reached."""
    expected_message = "Maximum number of iteration reached before"
    classifier = RobustWeightedClassifier(max_iter=1)
    with pytest.warns(UserWarning, match=expected_message):
        classifier.fit(X_cc, y_cc)
def test_robust_no_proba():
    """Test that ``predict_proba`` raises AttributeError with hinge loss."""
    expected_message = "Probability estimates are not available for loss='hinge'"
    hinge_classifier = RobustWeightedClassifier(loss="hinge")
    fitted = hinge_classifier.fit(X_c, y_c)
    with pytest.raises(AttributeError, match=expected_message):
        fitted.predict_proba(X_c)
def test_robust_estimator_unsupported_loss():
    """Test that ``multi_class="binary"`` rejects the target ``y_c``.

    Fitting on ``(X_c, y_c)`` must raise a ValueError matching
    "y must be binary." (presumably ``y_c`` has more than two classes --
    confirm against the fixture).

    NOTE(review): despite its name this test does not exercise ``loss``, and
    another test with this exact name is defined earlier in this file; if
    both live in the same module, this definition replaces the earlier one.
    """
    expected_message = "y must be binary."
    classifier = RobustWeightedClassifier(multi_class="binary")
    with pytest.raises(ValueError, match=expected_message):
        classifier.fit(X_c, y_c)
# Scale the dataset with sklearn RobustScaler (important for this algorithm) X = RobustScaler().fit_transform(X) # Using GridSearchCV, to tune the parameters alpha, eta0, learning_rate, loss # and average of SGDClassifier, we get the following parameters. clf_not_rob = SGDClassifier(average=10, learning_rate="optimal", loss="hinge") # Then, we use this estimator as base_estimator of RobustWeightedEstimator. # Using GridSearchCV, we tuned the parameters c and eta0, with the # choice of "huber" weighting because the sample_size is not very large. clf_rob = RobustWeightedClassifier( weighting="huber", loss="hinge", c=1.35, eta0=1e-3, max_iter=300, ) # We compute M times the cross validations in order to also have an estimate # of the variance of the loss of the estimators. M = 10 res = [] for f in range(M): rng = np.random.RandomState(f) print("\r Progress: %s / %s" % (f + 1, M), end="") clf = SGDClassifier(average=10, learning_rate="optimal", loss="hinge", random_state=rng)