# Example 1
async def test_model_random_determinism(c, s, a, b):
    """Two searches seeded identically must reproduce the same result.

    Each ``partial_fit`` call sees only ``n // 10`` samples (d == n // 10),
    so individual models are very unstable; any determinism observed must
    therefore come from the fixed ``random_state`` values alone.
    """
    n_samples, n_features = 50, 5
    X, y = make_classification(n_samples=n_samples,
                               n_features=n_features,
                               chunks=n_samples // 10,
                               random_state=0)
    params = {
        "loss":
        ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
        "average": [True, False],
        "learning_rate": ["constant", "invscaling", "optimal"],
        "eta0": np.logspace(-2, 0, num=1000),
    }

    base_model = SGDClassifier(random_state=1)
    search_kwargs = dict(n_initial_parameters=10, random_state=2, max_iter=10)

    first = InverseDecaySearchCV(base_model, params, **search_kwargs)
    await first.fit(X, y, classes=[0, 1])

    second = InverseDecaySearchCV(clone(base_model), params, **search_kwargs)
    await second.fit(X, y, classes=[0, 1])

    # Same seeds => identical winner, identical hyperparameters, and
    # numerically identical coefficients.
    assert first.best_score_ == second.best_score_
    assert first.best_params_ == second.best_params_
    assert np.allclose(first.best_estimator_.coef_,
                       second.best_estimator_.coef_)
# Example 2
async def test_same_params_with_random_state(c, s, a, b):
    """Identical seeds must draw identical hyperparameter samples."""
    X, y = make_classification(n_samples=100,
                               n_features=10,
                               chunks=10,
                               random_state=0)
    base_model = SGDClassifier(tol=1e-3, penalty="elasticnet", random_state=1)
    params = {"alpha": scipy.stats.uniform(1e-4, 1)}

    # Use InverseDecaySearchCV to decay the models and make sure the same ones
    # are selected
    search_kwargs = dict(n_initial_parameters=10, random_state=2)

    sampled_alphas = []
    for _ in range(2):
        search = InverseDecaySearchCV(clone(base_model), params,
                                      **search_kwargs)
        await search.fit(X, y, classes=[0, 1])
        sampled_alphas.append(search.cv_results_["param_alpha"])

    assert np.allclose(sampled_alphas[0], sampled_alphas[1])
# Example 3
async def test_warns_decay_rate_wanted(c, s, a, b):
    """``decay_rate`` on :class:`IncrementalSearchCV` must raise a
    ``FutureWarning`` pointing at :class:`InverseDecaySearchCV`.

    ``InverseDecaySearchCV`` itself still accepts ``decay_rate`` without
    warning (old behavior is retained).

    Fix: the original was a plain ``def`` using legacy generator-style
    ``yield search.fit(...)``, inconsistent with every sibling test;
    converted to ``async def`` with ``await`` to match.
    """
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)

    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    search = IncrementalSearchCV(
        model, params, max_iter=5, n_initial_parameters=5, decay_rate=1
    )
    match = "decay_rate is deprecated .* Use InverseDecaySearchCV"
    with pytest.warns(FutureWarning, match=match):
        await search.fit(X, y)

    # Make sure old behavior is retained w/o warning
    search = InverseDecaySearchCV(model, params, decay_rate=1)
    await search.fit(X, y)
# Example 4
async def _test_search_basic(decay_rate, input_type, memory, c, s, a, b):
    """Shared body for the basic incremental-search tests.

    Parameters
    ----------
    decay_rate : int
        ``0`` exercises :class:`IncrementalSearchCV`, ``1`` exercises
        :class:`InverseDecaySearchCV`; any other value raises ``ValueError``.
    input_type : str
        ``"dataframe"`` converts inputs to (dask or pandas) DataFrames.
    memory : str
        ``"distributed"`` keeps the data as dask collections;
        ``"local"`` pulls it down to in-memory numpy/pandas objects.
    c, s, a, b
        Distributed client/scheduler/worker test fixtures.

    Returns
    -------
    bool
        Always ``True`` on success (assertions raise otherwise).

    Fix: the ``log_proba`` row-count assertion previously re-checked
    ``proba.shape[0]`` (copy-paste error); it now checks ``log_proba``.
    """
    X, y = make_classification(n_samples=1000, n_features=5, chunks=(100, 5))
    assert isinstance(X, da.Array)
    if memory == "distributed" and input_type == "dataframe":
        X = dd.from_array(X)
        y = dd.from_array(y)
        assert isinstance(X, dd.DataFrame)
    elif memory == "local":
        # Materialize the dask collections on the client.
        X, y = await c.compute([X, y])
        assert isinstance(X, np.ndarray)
        if input_type == "dataframe":
            X, y = pd.DataFrame(X), pd.DataFrame(y)
            assert isinstance(X, pd.DataFrame)

    model = SGDClassifier(tol=1e-3, loss="log", penalty="elasticnet")

    params = {"alpha": np.logspace(-2, 2, 100), "l1_ratio": np.linspace(0.01, 1, 200)}

    kwargs = dict(n_initial_parameters=20, max_iter=10)
    if decay_rate == 0:
        search = IncrementalSearchCV(model, params, **kwargs)
    elif decay_rate == 1:
        search = InverseDecaySearchCV(model, params, **kwargs)
    else:
        raise ValueError()
    await search.fit(X, y, classes=[0, 1])

    # Every model may be fit at most max_iter + 1 times.
    assert search.history_
    for d in search.history_:
        assert d["partial_fit_calls"] <= search.max_iter + 1
    assert isinstance(search.best_estimator_, SGDClassifier)
    assert search.best_score_ > 0
    assert "visualize" not in search.__dict__
    assert search.best_params_
    assert search.cv_results_ and isinstance(search.cv_results_, dict)
    assert {
        "mean_partial_fit_time",
        "mean_score_time",
        "std_partial_fit_time",
        "std_score_time",
        "test_score",
        "rank_test_score",
        "model_id",
        "params",
        "partial_fit_calls",
        "param_alpha",
        "param_l1_ratio",
    }.issubset(set(search.cv_results_.keys()))
    assert len(search.cv_results_["param_alpha"]) == 20

    assert all(isinstance(v, np.ndarray) for v in search.cv_results_.values())
    if decay_rate == 0:
        # Without decay the reported best index must dominate all scores.
        assert (
            search.cv_results_["test_score"][search.best_index_]
            >= search.cv_results_["test_score"]
        ).all()
        assert search.cv_results_["rank_test_score"][search.best_index_] == 1
    else:
        assert all(search.cv_results_["test_score"] >= 0)
        assert all(search.cv_results_["rank_test_score"] >= 1)
    assert all(search.cv_results_["partial_fit_calls"] >= 1)
    # Model identifiers must be unique across the results table.
    assert len(np.unique(search.cv_results_["model_id"])) == len(
        search.cv_results_["model_id"]
    )
    assert sorted(search.model_history_.keys()) == list(range(20))
    assert set(search.model_history_[0][0].keys()) == {
        "model_id",
        "params",
        "partial_fit_calls",
        "partial_fit_time",
        "score",
        "score_time",
        "elapsed_wall_time",
    }

    # Dask Objects are lazy
    X_ = await c.compute(X)

    proba = search.predict_proba(X)
    log_proba = search.predict_log_proba(X)
    # Dask may report the row count as NaN for lazily-known shapes.
    assert proba.shape[1] == 2
    assert proba.shape[0] == 1000 or math.isnan(proba.shape[0])
    assert log_proba.shape[1] == 2
    # Fixed copy-paste bug: check log_proba's row count, not proba's.
    assert log_proba.shape[0] == 1000 or math.isnan(log_proba.shape[0])

    assert isinstance(proba, da.Array)
    assert isinstance(log_proba, da.Array)

    # Eager (computed) inputs must give the same predictions as lazy ones.
    proba_ = search.predict_proba(X_)
    log_proba_ = search.predict_log_proba(X_)

    da.utils.assert_eq(proba, proba_)
    da.utils.assert_eq(log_proba, log_proba_)

    decision = search.decision_function(X_)
    assert decision.shape == (1000,) or math.isnan(decision.shape[0])
    return True