Example #1
def test_that_array_conversion_is_ok():
    # Check that DataFrame inputs and string labels give the same fit as
    # plain arrays with integer labels.
    import pandas as pd

    n_samples = 20
    X, y = simulate_linear(n_samples)
    X_df = pd.DataFrame(X)

    weird = {0: "neg", 1: "pos"}
    y_weird = [weird[yi] for yi in y]

    br = Classifier(tol=1e-17, max_iter=200).fit(X_df, y_weird)
    lr = LogisticRegression(tol=1e-17, max_iter=200).fit(X_df, y_weird)

    assert br.intercept_ == pytest.approx(lr.intercept_, abs=1e-4)
    assert br.coef_ == pytest.approx(lr.coef_, abs=1e-4)

    # Test prediction methods
    assert lr.decision_function(X) == pytest.approx(br.decision_function(X),
                                                    abs=1e-4)
    assert lr.predict_proba(X) == pytest.approx(br.predict_proba(X), abs=1e-4)
    assert lr.predict_log_proba(X) == pytest.approx(br.predict_log_proba(X),
                                                    abs=1e-4)
    assert (lr.predict(X) == br.predict(X)).all()
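
These snippets are excerpts from a larger test module, so names such as pytest, LogisticRegression, Classifier and simulate_linear come from imports and helpers that are not shown here. A minimal sketch of such a header, assuming Classifier is provided by a linlearn package and simulate_linear is a local helper, could look like this:

import numpy as np
import pytest
from sklearn.linear_model import LogisticRegression

# Assumed import location; the package providing Classifier is not shown
# in these excerpts.
from linlearn import Classifier


def simulate_linear(n_samples, n_features=5, random_state=42):
    # Stand-in for the module's own data helper: toy linearly separable data.
    rng = np.random.RandomState(random_state)
    X = rng.randn(n_samples, n_features)
    w = rng.randn(n_features)
    y = (X @ w > 0).astype(int)
    return X, y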
Example #2
def test_fit_same_sklearn_logistic(fit_intercept, penalty, C, l1_ratio,
                                   solver):
    """
    This is a test that checks on many combinations that Classifier gets the
    same coef_ and intercept_ as scikit-learn on simulated data
    """
    n_samples = 128
    n_features = 5
    tol = 1e-10
    max_iter = 300
    verbose = False

    X, y = simulate_true_logistic(
        n_samples=n_samples,
        n_features=n_features,
        fit_intercept=fit_intercept,
    )

    args = {
        "tol": tol,
        "max_iter": max_iter,
        "verbose": verbose,
        "fit_intercept": fit_intercept,
        "random_state": 42,
    }

    if penalty == "none":
        # A single test is required for penalty="none"
        if C != 1.0 or l1_ratio != 0.5:
            return
        clf_scikit = LogisticRegression(penalty=penalty, solver="saga", **args)
    elif penalty == "l2":
        if l1_ratio != 0.5:
            return
        clf_scikit = LogisticRegression(penalty=penalty,
                                        C=C,
                                        solver="saga",
                                        **args)
    elif penalty == "l1":
        if l1_ratio != 0.5:
            return
        clf_scikit = LogisticRegression(penalty=penalty,
                                        C=C,
                                        solver="saga",
                                        **args)
    elif penalty == "elasticnet":
        clf_scikit = LogisticRegression(penalty=penalty,
                                        C=C,
                                        solver="saga",
                                        l1_ratio=l1_ratio,
                                        **args)
    else:
        raise ValueError("Weird penalty %r" % penalty)

    if solver in ["svrg", "saga", "gd"]:
        abs_approx, rel_approx = 1e-2, 1e-2
        args["step_size"] = 2.5
        if solver == "saga":
            args["max_iter"] = 400

    else:
        abs_approx, rel_approx = 1e-6, 1e-6

    clf_scikit.fit(X, y)
    # We compare with saga since it supports all penalties
    # clf_scikit = LogisticRegression(solver="saga", **args).fit(X, y)
    clf_linlearn = Classifier(penalty=penalty,
                              C=C,
                              l1_ratio=l1_ratio,
                              solver=solver,
                              **args)
    clf_linlearn.fit(X, y)

    # scikit-learn's intercept_ does not match for "l1" and "elasticnet"
    # penalties when an intercept is fitted and C is small
    if not (penalty in ["l1", "elasticnet"] and fit_intercept and C < 1e-1):
        # Test the intercept_
        assert clf_scikit.intercept_ == pytest.approx(clf_linlearn.intercept_,
                                                      abs=abs_approx,
                                                      rel=rel_approx)
        # And test prediction methods
        assert clf_scikit.decision_function(X) == pytest.approx(
            clf_linlearn.decision_function(X), abs=abs_approx, rel=rel_approx)
        assert clf_scikit.predict_proba(X) == pytest.approx(
            clf_linlearn.predict_proba(X), abs=abs_approx, rel=rel_approx)
        assert clf_scikit.predict_log_proba(X) == pytest.approx(
            clf_linlearn.predict_log_proba(X), abs=abs_approx, rel=rel_approx)
        assert (clf_scikit.predict(X) == clf_linlearn.predict(X)).all()
        assert clf_scikit.score(X, y) == clf_linlearn.score(X, y)

    # And always test the coef_
    assert clf_scikit.coef_ == pytest.approx(clf_linlearn.coef_,
                                             abs=abs_approx,
                                             rel=rel_approx)
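
The arguments fit_intercept, penalty, C, l1_ratio and solver come from pytest parametrization defined elsewhere in the module (the commented-out tests further below use the same pattern). A sketch of such a grid, where the concrete values and solver names are assumptions, could be:

import pytest

penalties = ("none", "l2", "l1", "elasticnet")
grid_C = (1e-3, 1e-1, 1.0, 1e2)          # assumed values
grid_l1_ratio = (0.1, 0.5, 0.9)          # assumed values
solvers = ("cgd", "gd", "svrg", "saga")  # assumed solver names


@pytest.mark.parametrize("fit_intercept", (False, True))
@pytest.mark.parametrize("penalty", penalties)
@pytest.mark.parametrize("C", grid_C)
@pytest.mark.parametrize("l1_ratio", grid_l1_ratio)
@pytest.mark.parametrize("solver", solvers)
def test_fit_same_sklearn_logistic(fit_intercept, penalty, C, l1_ratio, solver):
    ...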
Example #3
def test_fit_same_sklearn_simulated_multiclass(fit_intercept, penalty, C,
                                               l1_ratio, solver):
    """
    This is a test that checks on many combinations that Classifier gets the
    same coef_ and intercept_ as scikit-learn on simulated data
    """
    tol = 1e-10
    max_iter = 200  # many iterations are needed to reach the required precision
    verbose = False
    step_size = 1.0

    n_samples = 128
    n_features = 5
    n_classes = 3
    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_classes=n_classes,
        n_informative=n_features,
        n_redundant=0,
        n_repeated=0,
        random_state=random_state,
    )
    # X, y = load_iris(return_X_y=True)

    args = {
        "tol": tol,
        "max_iter": max_iter,
        "verbose": verbose,
        "fit_intercept": fit_intercept,
        "random_state": 42,
        "multi_class": "multinomial",
    }

    # if solver in ["svrg", "saga", "gd"] and fit_intercept:
    #     abs_approx, rel_approx = 1e-4, 1e-4
    # else:
    #     abs_approx, rel_approx = 1e-6, 1e-6
    abs_approx, rel_approx = 1e-4, 1e-4

    if penalty == "l1" and C == 1.0 and fit_intercept:
        args["max_iter"] = 300
    if penalty == "elasticnet" and C == 1.0:
        step_size = 1.0
        args["max_iter"] = 600
        abs_approx, rel_approx = 1e-2, 1e-2  #######################################
    if penalty == "elasticnet" and C == 1.0 and l1_ratio == 0.1:
        step_size = 1.0
        args["max_iter"] = 900
        abs_approx, rel_approx = 1e-2, 1e-2  #######################################
    if penalty == "elasticnet" and C == 1.0 and l1_ratio == 0.5:
        args["max_iter"] = 1000
        abs_approx, rel_approx = 1e-2, 1e-2  #######################################

    if penalty == "none":
        # A single test is required for penalty="none"
        if C != 1.0 or l1_ratio != 0.5:
            return
        clf_scikit = LogisticRegression(penalty=penalty, solver="saga", **args)
    elif penalty == "l2":
        if l1_ratio != 0.5:
            return
        clf_scikit = LogisticRegression(penalty=penalty,
                                        C=C,
                                        solver="saga",
                                        **args)
    elif penalty == "l1":
        if l1_ratio != 0.5:
            return
        clf_scikit = LogisticRegression(penalty=penalty,
                                        C=C,
                                        solver="saga",
                                        **args)
    elif penalty == "elasticnet":
        clf_scikit = LogisticRegression(penalty=penalty,
                                        C=C,
                                        solver="saga",
                                        l1_ratio=l1_ratio,
                                        **args)
    else:
        raise ValueError("Weird penalty %r" % penalty)

    if solver == "gd":
        step_size = 1.5
    elif solver in ["svrg", "saga"]:
        step_size = 5.0

    clf_scikit.fit(X, y)
    # We compare with saga since it supports all penalties
    # clf_scikit = LogisticRegression(solver="saga", **args).fit(X, y)
    args.pop("multi_class")
    clf_linlearn = Classifier(penalty=penalty,
                              loss="multilogistic",
                              step_size=step_size,
                              C=C,
                              l1_ratio=l1_ratio,
                              solver=solver,
                              **args)
    clf_linlearn.fit(X, y)

    # scikit-learn's intercept_ does not match for "l1" and "elasticnet"
    # penalties when an intercept is fitted and C is small
    if not (penalty in ["l1", "elasticnet"] and fit_intercept and C < 1e-1):
        # Test the intercept_
        assert clf_scikit.intercept_ == pytest.approx(clf_linlearn.intercept_,
                                                      abs=abs_approx,
                                                      rel=rel_approx)
        # And test prediction methods
        assert clf_scikit.decision_function(X) == pytest.approx(
            clf_linlearn.decision_function(X), abs=abs_approx, rel=rel_approx)
        assert clf_scikit.predict_proba(X) == pytest.approx(
            clf_linlearn.predict_proba(X), abs=abs_approx, rel=rel_approx)
        assert clf_scikit.predict_log_proba(X) == pytest.approx(
            clf_linlearn.predict_log_proba(X), abs=abs_approx, rel=rel_approx)
        assert (clf_scikit.predict(X) == clf_linlearn.predict(X)).all()
        assert clf_scikit.score(X, y) == clf_linlearn.score(X, y)

    # And always test the coef_
    assert clf_scikit.coef_ == pytest.approx(clf_linlearn.coef_,
                                             abs=abs_approx,
                                             rel=rel_approx)
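
Note that in the multiclass case coef_, intercept_ and the prediction outputs are arrays rather than scalars; pytest.approx accepts numpy arrays and checks every entry against the given tolerances, as this small self-contained check illustrates:

import numpy as np
import pytest

a = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
b = a + 1e-6  # perturbation well inside the tolerances below

# pytest.approx compares numpy arrays entrywise
assert a == pytest.approx(b, abs=1e-4, rel=1e-4)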


# @pytest.mark.parametrize("fit_intercept", (False, True))
# @pytest.mark.parametrize("penalty", penalties[1:]) # don't test iris with none penalty
# @pytest.mark.parametrize("C", grid_C)
# @pytest.mark.parametrize("l1_ratio", grid_l1_ratio)
# @pytest.mark.parametrize("solver", solvers)
# def test_fit_same_sklearn_iris(
#     fit_intercept, penalty, C, l1_ratio, solver
# ):
#     """
#     This is a test that checks on many combinations that Classifier gets the
#     same coef_ and intercept_ as scikit-learn on the iris dataset
#     """
#     tol = 1e-10
#     max_iter = 400  # so many iterations needed to reach necessary precision ...
#     verbose = False
#     step_size = 1.0
#
#     X, y = load_iris(return_X_y=True)
#     mean = X.mean(axis=0)
#     std = X.std(axis=0)
#     X = (X - mean) / std
#     # std_scaler = StandardScaler()
#     # X = std_scaler.fit_transform(X)
#
#     args = {
#         "tol": tol,
#         "max_iter": max_iter,
#         "verbose": verbose,
#         "fit_intercept": fit_intercept,
#         "random_state": 42,
#         "multi_class": "multinomial",
#     }
#
#     # if solver in ["svrg", "saga", "gd"] and fit_intercept:
#     #     abs_approx, rel_approx = 1e-4, 1e-4
#     # else:
#     #     abs_approx, rel_approx = 1e-6, 1e-6
#     abs_approx, rel_approx = 1e-3, 1e-3
#
#     if penalty == "elasticnet" and l1_ratio == 0.5:
#         step_size = 1.5
#     if (penalty == "l1") or (penalty == "elasticnet" and l1_ratio == 0.9):
#         step_size = 2.0
#         args["max_iter"] = 1200
#     if penalty == "l1" and fit_intercept:
#         step_size = 3.5
#         args["max_iter"] = 1500
#         abs_approx, rel_approx = 1e-2, 1e-2
#
#     if penalty == "none":
#         # A single test is required for penalty="none"
#         if C != 1.0 or l1_ratio != 0.5:
#             return
#         clf_scikit = LogisticRegression(penalty=penalty, solver="saga", **args)
#     elif penalty == "l2":
#         if l1_ratio != 0.5:
#             return
#         clf_scikit = LogisticRegression(penalty=penalty, C=C, solver="saga", **args)
#     elif penalty == "l1":
#         if l1_ratio != 0.5:
#             return
#         clf_scikit = LogisticRegression(penalty=penalty, C=C, solver="saga", **args)
#     elif penalty == "elasticnet":
#         clf_scikit = LogisticRegression(
#             penalty=penalty, C=C, solver="saga", l1_ratio=l1_ratio, **args
#         )
#     else:
#         raise ValueError("Weird penalty %r" % penalty)
#
#     if solver == "gd":
#         step_size *= 2.6
#     elif solver in ["svrg", "saga"]:
#         step_size *= 4.0
#
#     clf_scikit.fit(X, y)
#     # We compare with saga since it supports all penalties
#     # clf_scikit = LogisticRegression(solver="saga", **args).fit(X, y)
#     args.pop("multi_class")
#     clf_linlearn = Classifier(
#         penalty=penalty,
#         loss="multilogistic",
#         C=C,
#         step_size=step_size,
#         l1_ratio=l1_ratio,
#         solver=solver,
#         **args
#     )
#     clf_linlearn.fit(X, y)
#
#     # And always test the coef_
#
#     assert clf_scikit.coef_ == pytest.approx(
#         clf_linlearn.coef_, abs=abs_approx, rel=rel_approx
#     )
#
#     # For some weird reason scikit's intercept_ does not match for "l1" and
#     # "elasticnet" with intercept and for small C
#     if not (penalty in ["l1", "elasticnet"] and fit_intercept and C < 1e-1):
#         # Test the intercept_
#         assert clf_scikit.intercept_ == pytest.approx(
#             clf_linlearn.intercept_, abs=abs_approx, rel=rel_approx
#         )
#         # And test prediction methods
#         assert clf_scikit.decision_function(X) == pytest.approx(
#             clf_linlearn.decision_function(X), abs=abs_approx, rel=rel_approx
#         )
#         assert clf_scikit.predict_proba(X) == pytest.approx(
#             clf_linlearn.predict_proba(X), abs=abs_approx, rel=rel_approx
#         )
#         assert clf_scikit.predict_log_proba(X) == pytest.approx(
#             clf_linlearn.predict_log_proba(X), abs=abs_approx, rel=rel_approx
#         )
#         assert (clf_scikit.predict(X) == clf_linlearn.predict(X)).all()
#         assert clf_scikit.score(X, y) == clf_linlearn.score(X, y)
#
#
# @pytest.mark.parametrize("fit_intercept", (False, True))
# @pytest.mark.parametrize("penalty", penalties[1:]) # don't test the wine dataset with no penalty
# @pytest.mark.parametrize("C", grid_C)
# @pytest.mark.parametrize("l1_ratio", grid_l1_ratio)
# @pytest.mark.parametrize("solver", solvers)
# def test_fit_same_sklearn_wine(
#     fit_intercept, penalty, C, l1_ratio, solver
# ):
#     """
#     This is a test that checks on many combinations that Classifier gets the
#     same coef_ and intercept_ as scikit-learn on the wine dataset
#     """
#     tol = 1e-10
#     max_iter = 400  # so many iterations needed to reach necessary precision ...
#     verbose = False
#     step_size = 1.0
#
#     X, y = load_wine(return_X_y=True)
#     mean = X.mean(axis=0)
#     std = X.std(axis=0)
#     X = (X - mean) / std
#     # std_scaler = StandardScaler()
#     # X = std_scaler.fit_transform(X)
#
#     args = {
#         "tol": tol,
#         "max_iter": max_iter,
#         "verbose": verbose,
#         "fit_intercept": fit_intercept,
#         "random_state": 42,
#         "multi_class": "multinomial",
#     }
#
#     # if solver in ["svrg", "saga", "gd"] and fit_intercept:
#     #     abs_approx, rel_approx = 1e-4, 1e-4
#     # else:
#     #     abs_approx, rel_approx = 1e-6, 1e-6
#     abs_approx, rel_approx = 1e-3, 1e-3
#
#     if penalty == "l2" and C == 1.0 and fit_intercept:
#         step_size = 2.0
#     if penalty == "l1" and C == 1.0:
#         step_size = 2.0
#         args["max_iter"] = 900
#     if penalty == "elasticnet" and C == 1.0:
#         step_size = 2.0
#         args["max_iter"] = 600
#         if solver == "gd" and l1_ratio == 0.9:
#             abs_approx, rel_approx = 1e-2, 1e-2
#
#     if penalty == "none":
#         # A single test is required for penalty="none"
#         if C != 1.0 or l1_ratio != 0.5:
#             return
#         clf_scikit = LogisticRegression(penalty=penalty, solver="saga", **args)
#     elif penalty == "l2":
#         if l1_ratio != 0.5:
#             return
#         clf_scikit = LogisticRegression(penalty=penalty, C=C, solver="saga", **args)
#     elif penalty == "l1":
#         if l1_ratio != 0.5:
#             return
#         clf_scikit = LogisticRegression(penalty=penalty, C=C, solver="saga", **args)
#     elif penalty == "elasticnet":
#         clf_scikit = LogisticRegression(
#             penalty=penalty, C=C, solver="saga", l1_ratio=l1_ratio, **args
#         )
#     else:
#         raise ValueError("Weird penalty %r" % penalty)
#
#     if solver == "gd":
#         step_size *= 5.0
#     elif solver in ["svrg", "saga"]:
#         step_size *= 15.5
#
#     clf_scikit.fit(X, y)
#     # We compare with saga since it supports all penalties
#     # clf_scikit = LogisticRegression(solver="saga", **args).fit(X, y)
#     args.pop("multi_class")
#     clf_linlearn = Classifier(
#         penalty=penalty,
#         loss="multilogistic",
#         step_size=step_size,
#         C=C,
#         l1_ratio=l1_ratio,
#         solver=solver,
#         **args
#     )
#     clf_linlearn.fit(X, y)
#
#     # And always test the coef_
#
#     assert clf_scikit.coef_ == pytest.approx(
#         clf_linlearn.coef_, abs=abs_approx, rel=rel_approx
#     )
#
#     # For some weird reason scikit's intercept_ does not match for "l1" and
#     # "elasticnet" with intercept and for small C
#     if not (penalty in ["l1", "elasticnet"] and fit_intercept and C < 1e-1):
#         # Test the intercept_
#         assert clf_scikit.intercept_ == pytest.approx(
#             clf_linlearn.intercept_, abs=abs_approx, rel=rel_approx
#         )
#         # And test prediction methods
#         assert clf_scikit.decision_function(X) == pytest.approx(
#             clf_linlearn.decision_function(X), abs=abs_approx, rel=rel_approx
#         )
#         assert clf_scikit.predict_proba(X) == pytest.approx(
#             clf_linlearn.predict_proba(X), abs=abs_approx, rel=rel_approx
#         )
#         assert clf_scikit.predict_log_proba(X) == pytest.approx(
#             clf_linlearn.predict_log_proba(X), abs=abs_approx, rel=rel_approx
#         )
#         assert (clf_scikit.predict(X) == clf_linlearn.predict(X)).all()
#         assert clf_scikit.score(X, y) == clf_linlearn.score(X, y)