Example 1
def test_random_state_no_seed_different_params():
    # Guarantees that specifying a random state results in the successive
    # halvings having the same random state (and that omitting it yields
    # different ones)

    # This test is required because Hyperband wraps SHAs and the random state
    # needs to be passed correctly.
    values = scipy.stats.uniform(0, 1)
    max_iter = 9
    brackets = _get_hyperband_params(max_iter)
    kwargs = {"value": values}

    h1 = HyperbandSearchCV(ConstantFunction(), kwargs, max_iter=max_iter)
    h2 = HyperbandSearchCV(ConstantFunction(), kwargs, max_iter=max_iter)
    h1._get_SHAs(brackets)
    h2._get_SHAs(brackets)
    assert h1._SHA_seed != h2._SHA_seed

    h1 = HyperbandSearchCV(ConstantFunction(),
                           params,
                           max_iter=max_iter,
                           random_state=0)
    h2 = HyperbandSearchCV(ConstantFunction(),
                           params,
                           max_iter=max_iter,
                           random_state=0)
    h1._get_SHAs(brackets)
    h2._get_SHAs(brackets)
    assert h1._SHA_seed == h2._SHA_seed
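
Nearly every example below exercises `ConstantFunction`, a toy estimator from dask-ml's test utilities whose score is fixed by its `value` hyperparameter (Example 5 relies on `best_score_ == params["value"].max()`, and Example 18's expected keys show it also takes a `sleep` argument). A minimal sketch of what that helper looks like, assumed from how the tests use it rather than taken from dask-ml's actual implementation:

# Hypothetical sketch of the ConstantFunction test helper; the real one
# lives in dask-ml's test utilities and may differ in detail.
from sklearn.base import BaseEstimator


class ConstantFunction(BaseEstimator):
    def __init__(self, value=0, sleep=0):
        self.value = value
        self.sleep = sleep

    def partial_fit(self, X, y=None, **kwargs):
        # Training is a no-op: the score is determined entirely by `value`.
        return self

    def fit(self, X, y=None, **kwargs):
        return self

    def score(self, X, y=None):
        return self.value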
Example 2
def _test_verbosity(c, s, a, b):
    # `max_iter` and `verbose` come from the enclosing parametrized test;
    # this fragment is a nested closure.
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    search = IncrementalSearchCV(model, params, max_iter=max_iter, verbose=verbose)
    yield search.fit(X, y)
    return search
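
The `(c, s, a, b)` signatures throughout these examples come from distributed's test harness: `c` is a `Client`, `s` the `Scheduler`, and `a`/`b` two `Worker`s, all injected by the `@gen_cluster(client=True)` decorator, which has been stripped from several fragments here along with the parametrization supplying closure variables like `max_iter` and `verbose`. A hedged sketch of how such a fragment is typically wired up (the parametrize values are illustrative; `ConstantFunction` is the toy estimator sketched after Example 1):

# Sketch only: shows the harness these fragments assume.
import pytest
import scipy.stats
from distributed.utils_test import gen_cluster
from dask_ml.datasets import make_classification
from dask_ml.model_selection import IncrementalSearchCV


@pytest.mark.parametrize("verbose", [True, False])
def test_verbosity(verbose):
    max_iter = 3

    # gen_cluster(client=True) starts an in-process scheduler and two
    # workers, then passes (client, scheduler, worker, worker) to the test.
    @gen_cluster(client=True)
    async def _run(c, s, a, b):
        X, y = make_classification(n_samples=10, n_features=4, chunks=10)
        search = IncrementalSearchCV(
            ConstantFunction(),
            {"value": scipy.stats.uniform(0, 1)},
            max_iter=max_iter,
            verbose=verbose,
        )
        await search.fit(X, y)

    _run()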
Example 3
def test_params_passed():
    # This makes sure that the "SuccessiveHalvingSearchCV params" key in
    # Hyperband.metadata["brackets"] is correct and that SHAs can be
    # instantiated from it.
    est = ConstantFunction(value=0.4)
    params = {"value": np.linspace(0, 1)}
    params = {
        "aggressiveness": 3.5,
        "max_iter": 253,
        "random_state": 42,
        "scoring": False,
        "test_size": 0.212,
        "tol": 0,
    }
    params["patience"] = (params["max_iter"] // params["aggressiveness"]) + 4
    hyperband = HyperbandSearchCV(est, params, **params)

    for k, v in params.items():
        assert getattr(hyperband, k) == v

    brackets = hyperband.metadata["brackets"]
    SHAs_params = [
        bracket["SuccessiveHalvingSearchCV params"] for bracket in brackets
    ]

    for SHA_params in SHAs_params:
        for k, v in kwargs.items():
            if k == "random_state":
                continue
            assert SHA_params[k] == v
    seeds = [SHA_params["random_state"] for SHA_params in SHAs_params]
    assert len(set(seeds)) == len(seeds)
Example 4
def test_successive_halving_params(c, s, a, b):
    # Makes sure that when SHAs are fit with values from the
    # "SuccessiveHalvingSearchCV params" key, the number of models/calls
    # stays the same as Hyperband's.

    # This sanity check again makes sure parameters are passed correctly
    # (similar to `test_params_passed`)
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    alg = HyperbandSearchCV(model, params, max_iter=27, random_state=42)

    kwargs = [
        v["SuccessiveHalvingSearchCV params"] for v in alg.metadata["brackets"]
    ]
    SHAs = [SuccessiveHalvingSearchCV(model, params, **v) for v in kwargs]

    metadata = alg.metadata["brackets"]
    for k, (true_meta, SHA) in enumerate(zip(metadata, SHAs)):
        yield SHA.fit(X, y)
        n_models = len(SHA.model_history_)
        pf_calls = [
            v[-1]["partial_fit_calls"] for v in SHA.model_history_.values()
        ]
        assert true_meta["n_models"] == n_models
        assert true_meta["partial_fit_calls"] == sum(pf_calls)
Example 5
    def _test_mirrors_paper(c, s, a, b):
        # `max_iter` and `aggressiveness` come from the enclosing
        # parametrized test; this fragment is a nested closure.
        X, y = make_classification(n_samples=10, n_features=4, chunks=10)
        model = ConstantFunction()
        params = {"value": np.random.rand(max_iter)}
        alg = HyperbandSearchCV(
            model,
            params,
            max_iter=max_iter,
            random_state=0,
            aggressiveness=aggressiveness,
        )
        yield alg.fit(X, y)

        assert alg.metadata == alg.metadata_

        assert isinstance(alg.metadata["brackets"], list)
        assert set(alg.metadata.keys()) == {
            "n_models", "partial_fit_calls", "brackets"
        }

        # Looping over alg.metadata["brackets"] is okay because alg.metadata
        # == alg.metadata_
        for bracket in alg.metadata["brackets"]:
            assert set(bracket.keys()) == {
                "n_models",
                "partial_fit_calls",
                "bracket",
                "SuccessiveHalvingSearchCV params",
                "decisions",
            }

        if aggressiveness == 3:
            assert alg.best_score_ == params["value"].max()
Example 6
async def _test_verbosity(c, s, a, b):
    # `Search`, `max_iter`, and `verbose` come from the enclosing
    # parametrized test; this fragment is a nested closure.
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    search = Search(model, params, max_iter=max_iter, verbose=verbose)
    await search.fit(X, y)
    assert search.best_score_ > 0  # ensure search ran
    return search
Example 7
def test_warns_scores_per_fit(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)

    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    search = IncrementalSearchCV(model, params, scores_per_fit=2)
    with pytest.warns(UserWarning, match="deprecated since Dask-ML v1.4.0"):
        yield search.fit(X, y)
Example 8
def test_min_max_iter(c, s, a, b):
    # This test makes sure Hyperband works with max_iter=1.
    # Tests for max_iter < 1 are in test_incremental.py.
    values = scipy.stats.uniform(0, 1)
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)

    max_iter = 1
    h = HyperbandSearchCV(ConstantFunction(), {"value": values}, max_iter=max_iter)
    yield h.fit(X, y)
    assert h.best_score_ > 0
Example 9
async def test_dataframe_inputs(c, s, a, b):
    X = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    X = dd.from_pandas(X, npartitions=2)
    y = pd.Series([False, True, True])
    y = dd.from_pandas(y, npartitions=2)

    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    alg = HyperbandSearchCV(model, params, max_iter=9, random_state=42)
    await alg.fit(X, y)
Example 10
async def test_model_future(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)

    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()
    model_future = await c.scatter(model)

    search = IncrementalSearchCV(model_future, params, max_iter=10)

    await search.fit(X, y, classes=[0, 1])
    assert search.history_
    assert search.best_score_ > 0
Example 11
def test_hyperband_patience(c, s, a, b):
    # Test to make sure that specifying patience=True results in less
    # computation
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    max_iter = 27

    alg = HyperbandSearchCV(model,
                            params,
                            max_iter=max_iter,
                            patience=True,
                            random_state=0)
    yield alg.fit(X, y)

    alg_patience = max_iter // alg.aggressiveness
    actual_decisions = [b.pop("decisions") for b in alg.metadata_["brackets"]]
    paper_decisions = [b.pop("decisions") for b in alg.metadata["brackets"]]

    for paper_iter, actual_iter in zip(paper_decisions, actual_decisions):
        trimmed_paper_iter = {k for k in paper_iter if k <= alg_patience}

        # This makes sure that the algorithm is executed faithfully when
        # patience=True (and the proper decision points are preserved even if
        # other stop-on-plateau points are added)
        assert trimmed_paper_iter.issubset(set(actual_iter))

        # This makes sure models aren't trained for too long
        assert all(x <= alg_patience + 1 for x in actual_iter)

    assert alg.metadata_["partial_fit_calls"] <= alg.metadata[
        "partial_fit_calls"]
    assert alg.best_score_ >= 0.9

    max_iter = 6
    kwargs = dict(max_iter=max_iter, aggressiveness=2)
    alg = HyperbandSearchCV(model, params, patience=2, **kwargs)
    with pytest.warns(UserWarning, match="The goal of `patience`"):
        yield alg.fit(X, y)

    alg = HyperbandSearchCV(model, params, patience=2, tol=np.nan, **kwargs)
    yield alg.fit(X, y)
    assert pd.DataFrame(alg.history_).partial_fit_calls.max() == max_iter

    alg = HyperbandSearchCV(model, params, patience=2, tol=None, **kwargs)
    yield alg.fit(X, y)
    assert pd.DataFrame(alg.history_).partial_fit_calls.max() == max_iter

    alg = HyperbandSearchCV(model, params, patience=1, **kwargs)
    with pytest.raises(ValueError, match="always detect a plateau"):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            yield alg.fit(X, y)
Example 12
def test_cv_results_order_preserved(c, s, a, b):
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    alg = HyperbandSearchCV(model, params, max_iter=9, random_state=42)
    yield alg.fit(X, y)

    info = {k: v[-1] for k, v in alg.model_history_.items()}
    for _, row in pd.DataFrame(alg.cv_results_).iterrows():
        model_info = info[row["model_id"]]
        assert row["bracket"] == model_info["bracket"]
        assert row["params"] == model_info["params"]
        assert np.allclose(row["test_score"], model_info["score"])
Example 13
def test_verbosity_types(c, s, a, b):
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}

    for verbose in [-1.0, 1.2]:
        search = IncrementalSearchCV(model, params, verbose=verbose, max_iter=3)
        with pytest.raises(ValueError, match="0 <= verbose <= 1"):
            yield search.fit(X, y)

    for verbose in [0.0, 0, 1, 1.0, True, False]:
        search = IncrementalSearchCV(model, params, verbose=verbose, max_iter=3)
        yield search.fit(X, y)
Example 14
def test_same_random_state_same_params(c, s, a, b):
    # This makes sure parameters are sampled correctly when random state is
    # specified.

    # This test makes sure random state is *correctly* passed to successive
    # halvings from Hyperband
    seed = 0
    values = scipy.stats.uniform(0, 1)
    h = HyperbandSearchCV(ConstantFunction(), {"value": values},
                          random_state=seed,
                          max_iter=9)

    # Make a search that does passive random sampling
    passive = IncrementalSearchCV(
        ConstantFunction(),
        {"value": values},
        random_state=seed,
        max_iter=2,
        n_initial_parameters=h.metadata["n_models"],
    )
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    yield h.fit(X, y)
    yield passive.fit(X, y)

    # Collect the `value`s Hyperband sampled...
    v_h = h.cv_results_["param_value"]

    # ...and the `value`s the passive random search sampled
    v_passive = passive.cv_results_["param_value"]
    # Sanity check: all values are unique floats
    assert len(set(v_passive)) == len(v_passive)
    assert len(set(v_h)) == len(v_h)

    # Getting the `value`s that are the same for both searches
    same = set(v_passive).intersection(set(v_h))

    passive_models = h.metadata["brackets"][0]["n_models"]
    assert len(same) == passive_models
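
Why a shared seed produces overlapping samples: candidate parameters are drawn the way scikit-learn's `ParameterSampler` draws them (Example 24 uses it directly), and that sampling is deterministic for a fixed `random_state`. A quick illustration:

import scipy.stats
from sklearn.model_selection import ParameterSampler

dist = {"value": scipy.stats.uniform(0, 1)}
draws1 = list(ParameterSampler(dist, n_iter=5, random_state=0))
draws2 = list(ParameterSampler(dist, n_iter=5, random_state=0))
assert draws1 == draws2  # same seed -> identical sampled parameters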
Example 15
async def test_warns_decay_rate(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)

    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    kwargs = dict(max_iter=5, n_initial_parameters=5)
    search = IncrementalSearchCV(model, params, **kwargs)
    match = r"deprecated since Dask-ML v1.4.0."
    with pytest.warns(FutureWarning, match=match):
        await search.fit(X, y)

    # Make sure the fix suggested by the warning message works
    search = IncrementalSearchCV(model, params, decay_rate=None, **kwargs)
    await search.fit(X, y)
Example 16
def test_warns_decay_rate_wanted(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)

    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    search = IncrementalSearchCV(
        model, params, max_iter=5, n_initial_parameters=5, decay_rate=1
    )
    match = "decay_rate is deprecated .* Use InverseDecaySearchCV"
    with pytest.warns(FutureWarning, match=match):
        yield search.fit(X, y)

    # Make sure old behavior is retained w/o warning
    search = InverseDecaySearchCV(model, params, decay_rate=1)
    yield search.fit(X, y)
Example 17
def test_search_patience_infeasible_tol(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))

    rng = check_random_state(42)
    params = {"value": rng.rand(1000)}
    model = ConstantFunction()

    max_iter = 10
    score_increase = -10
    search = IncrementalSearchCV(
        model, params, max_iter=max_iter, patience=3, tol=score_increase,
    )
    yield search.fit(X, y, classes=[0, 1])

    hist = pd.DataFrame(search.history_)
    assert hist.partial_fit_calls.max() == max_iter
Example 18
def test_correct_params(c, s, a, b):
    # Makes sure that Hyperband has the correct parameters.

    # Implemented because Hyperband wraps SHA. Again, this makes sure that
    # parameters are correctly passed to SHA (there was a case where the
    # max_iter flag was not passed to SuccessiveHalvingSearchCV when it
    # should have been)
    est = ConstantFunction()
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    params = {"value": np.linspace(0, 1)}
    search = HyperbandSearchCV(est, params, max_iter=9)

    base = {
        "estimator",
        "estimator__value",
        "estimator__sleep",
        "parameters",
        "max_iter",
        "test_size",
        "patience",
        "tol",
        "random_state",
        "scoring",
        "verbose",
        "prefix",
    }
    assert set(search.get_params().keys()) == base.union({"aggressiveness"})
    meta = search.metadata
    SHAs_params = [
        bracket["SuccessiveHalvingSearchCV params"] for bracket in meta["brackets"]
    ]
    SHA_params = base.union(
        {
            "n_initial_parameters",
            "n_initial_iter",
            "aggressiveness",
            "max_iter",
            "prefix",
        }
    ) - {"estimator__sleep", "estimator__value", "estimator", "parameters"}

    assert all(set(SHA) == SHA_params for SHA in SHAs_params)

    # this is testing to make sure that each SHA has the correct estimator
    yield search.fit(X, y)
    SHAs = search._SuccessiveHalvings_
    assert all(search.estimator is SHA.estimator for SHA in SHAs.values())
    assert all(search.parameters is SHA.parameters for SHA in SHAs.values())
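
`get_params()` here is the standard scikit-learn estimator contract, so the key set asserted above can also be inspected interactively; a small usage sketch:

# Inspect the constructor arguments the search exposes; this is the set the
# test above compares against base.union({"aggressiveness"}).
search = HyperbandSearchCV(ConstantFunction(), {"value": np.linspace(0, 1)}, max_iter=9)
print(sorted(search.get_params()))
print(search.get_params()["max_iter"])  # 9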
Example 19
def test_search_invalid_patience(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)

    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    search = IncrementalSearchCV(model, params, patience=1, max_iter=10)
    with pytest.raises(ValueError, match="patience >= 2"):
        yield search.fit(X, y, classes=[0, 1])

    search = IncrementalSearchCV(model, params, patience=2.0, max_iter=10)
    with pytest.raises(ValueError, match="patience must be an integer"):
        yield search.fit(X, y, classes=[0, 1])

    # Make sure this passes
    search = IncrementalSearchCV(model, params, patience=False, max_iter=10)
    yield search.fit(X, y, classes=[0, 1])
    assert search.history_
Example 20
def test_search_patience_infeasible_tol(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5)

    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    search = SuccessiveHalvingSearchCV(
        model,
        params,
        patience=2,
        tol=np.nan,
        n_initial_parameters=20,
        n_initial_iter=4,
        max_iter=1000,
    )
    yield search.fit(X, y, classes=[0, 1])

    assert search.metadata_["partial_fit_calls"] == search.metadata[
        "partial_fit_calls"]
    assert search.metadata_ == search.metadata
Example 21
def test_history(c, s, a, b):
    # This test is required to make sure Hyperband wraps SHA successfully.
    # Mostly, it's a test to make sure the history is ordered by time.
    #
    # There's also a test in test_incremental to make sure the history has
    # correct values/etc
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    alg = HyperbandSearchCV(model, params, max_iter=9, random_state=42)
    yield alg.fit(X, y)

    assert alg.cv_results_["model_id"].dtype == "<U11"
    assert all(isinstance(v, str) for v in alg.cv_results_["model_id"])
    assert all("bracket=" in h["model_id"] for h in alg.history_)
    assert all("bracket" in h for h in alg.history_)

    # Hyperband does a custom ordering of times
    times = [v["elapsed_wall_time"] for v in alg.history_]
    assert (np.diff(times) >= 0).all()
    # Make sure results are ordered by partial fit calls for each model
    for model_hist in alg.model_history_.values():
        calls = [h["partial_fit_calls"] for h in model_hist]
        assert (np.diff(calls) >= 1).all() or len(calls) == 1
Example 22
def test_logs_dont_repeat(c, s, a, b):
    # This test is necessary to make sure the dask_ml.model_selection logger
    # isn't piped to stdout repeatedly.
    #
    # I developed this test to protect against this case:
    # getLogger("dask_ml.model_selection") is piped to stdout whenever a
    # bracket of Hyperband starts/each time SHA._fit is called
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    search = HyperbandSearchCV(model, params, max_iter=9, random_state=42)
    with captured_logger(logging.getLogger("dask_ml.model_selection")) as logs:
        yield search.fit(X, y)
        assert search.best_score_ > 0  # ensure search ran
        messages = logs.getvalue().splitlines()
    model_creation_msgs = [m for m in messages if "creating" in m]
    n_models = [m.split(" ")[-2] for m in model_creation_msgs]

    bracket_models = [b["n_models"] for b in search.metadata["brackets"]]
    assert len(bracket_models) == len(set(bracket_models))

    # Make sure only one model creation message is printed per bracket
    # (all brackets have unique n_models as asserted above)
    assert len(n_models) == len(set(n_models))
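
`captured_logger` comes from distributed's test utilities: it temporarily attaches a handler and yields a StringIO containing everything the logger emitted. A minimal standalone equivalent, assuming only the behavior this test relies on:

import io
import logging
from contextlib import contextmanager


@contextmanager
def captured_logger(logger, level=logging.INFO):
    # Attach a temporary handler so log records land in a StringIO we control.
    sio = io.StringIO()
    handler = logging.StreamHandler(sio)
    old_level = logger.level
    logger.setLevel(level)
    logger.addHandler(handler)
    try:
        yield sio
    finally:
        logger.removeHandler(handler)
        logger.setLevel(old_level)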
Example 23
    def _test_basic(c, s, a, b):
        # `array_type`, `library`, and `max_iter` come from the enclosing
        # parametrized test; this fragment is a nested closure.
        rng = da.random.RandomState(42)

        n, d = (50, 2)
        # create observations we know linear models can fit
        X = rng.normal(size=(n, d), chunks=n // 2)
        coef_star = rng.uniform(size=d, chunks=d)
        y = da.sign(X.dot(coef_star))

        if array_type == "numpy":
            X, y = yield c.compute((X, y))

        params = {
            "loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            "average": [True, False],
            "learning_rate": ["constant", "invscaling", "optimal"],
            "eta0": np.logspace(-2, 0, num=1000),
        }
        model = SGDClassifier(tol=-np.inf,
                              penalty="elasticnet",
                              random_state=42,
                              eta0=0.1)
        if library == "dask-ml":
            model = Incremental(model)
            params = {"estimator__" + k: v for k, v in params.items()}
        elif library == "ConstantFunction":
            model = ConstantFunction()
            params = {"value": np.linspace(0, 1, num=1000)}

        search = HyperbandSearchCV(model,
                                   params,
                                   max_iter=max_iter,
                                   random_state=42)
        classes = c.compute(da.unique(y))
        yield search.fit(X, y, classes=classes)

        if library == "dask-ml":
            X, y = yield c.compute((X, y))
        score = search.best_estimator_.score(X, y)
        assert score == search.score(X, y)
        assert 0 <= score <= 1

        if library == "ConstantFunction":
            assert score == search.best_score_
        else:
            # These are not equal because IncrementalSearchCV uses a train/test
            # split and we're testing on the entire train dataset, not only the
            # validation/test set.
            assert abs(score - search.best_score_) < 0.1

        assert type(search.best_estimator_) == type(model)
        assert isinstance(search.best_params_, dict)

        num_fit_models = len(set(search.cv_results_["model_id"]))
        num_pf_calls = sum([
            v[-1]["partial_fit_calls"] for v in search.model_history_.values()
        ])
        models = {9: 17, 15: 17, 20: 17, 27: 49, 30: 49, 81: 143}
        pf_calls = {9: 69, 15: 101, 20: 144, 27: 357, 30: 379, 81: 1581}
        assert num_fit_models == models[max_iter]
        assert num_pf_calls == pf_calls[max_iter]

        best_idx = search.best_index_
        if isinstance(model, ConstantFunction):
            assert search.cv_results_["test_score"][best_idx] == max(
                search.cv_results_["test_score"])
        model_ids = {h["model_id"] for h in search.history_}

        if math.log(max_iter, 3) % 1.0 == 0:
            # The good case: max_iter is a power of search.aggressiveness.
            # In this case, assert that more models are tried than max_iter
            assert len(model_ids) > max_iter
        else:
            # Otherwise, allow some padding: almost as many estimators are
            # tried as max_iter. 3 is a fudge factor chosen as the minimum;
            # when max_iter=20, len(model_ids) == 17.
            assert len(model_ids) + 3 >= max_iter

        assert all("bracket" in id_ for id_ in model_ids)
Example 24
async def test_basic(c, s, a, b):
    def _additional_calls(info):
        pf_calls = {k: v[-1]["partial_fit_calls"] for k, v in info.items()}
        ret = {k: int(calls < 10) for k, calls in pf_calls.items()}
        if len(ret) == 1:
            return {list(ret)[0]: 0}

        # Don't train one model (but keep model 0)
        some_keys = set(ret.keys()) - {0}
        key_to_drop = random.choice(list(some_keys))
        return {k: v for k, v in ret.items() if k != key_to_drop}

    X, y = make_classification(n_samples=1000, n_features=5, chunks=100)
    model = ConstantFunction()

    params = {"value": uniform(0, 1)}

    X_test, y_test = X[:100], y[:100]
    X_train = X[100:]
    y_train = y[100:]

    n_parameters = 5
    param_list = list(ParameterSampler(params, n_parameters))

    info, models, history, best = await fit(
        model,
        param_list,
        X_train,
        y_train,
        X_test,
        y_test,
        _additional_calls,
        fit_params={"classes": [0, 1]},
    )

    # Ensure that we touched all data
    keys = {t[0] for t in s.transition_log}
    L = [str(k) in keys for kk in X_train.__dask_keys__() for k in kk]
    assert all(L)

    for model in models.values():
        assert isinstance(model, Future)
        model2 = await model
        assert isinstance(model2, ConstantFunction)

    XX_test = await c.compute(X_test)
    yy_test = await c.compute(y_test)
    model = await models[0]
    assert model.score(XX_test, yy_test) == info[0][-1]["score"]

    # `<` not `==` because we randomly dropped one model every iteration
    assert len(history) < n_parameters * 10
    for h in history:
        assert {
            "partial_fit_time",
            "score_time",
            "score",
            "model_id",
            "params",
            "partial_fit_calls",
        }.issubset(set(h.keys()))

    groups = toolz.groupby("partial_fit_calls", history)
    assert len(groups[1]) > len(groups[2]) > len(groups[3]) > len(groups[max(groups)])
    assert max(groups) == n_parameters

    keys = list(models.keys())
    for key in keys:
        del models[key]

    while c.futures or s.tasks:  # Make sure cleans up cleanly after running
        await asyncio.sleep(0.1)

    # smoke test for ndarray X_test and y_test
    X_test = await c.compute(X_test)
    y_test = await c.compute(y_test)
    info, models, history, best = await fit(
        model,
        param_list,
        X_train,
        y_train,
        X_test,
        y_test,
        _additional_calls,
        fit_params={"classes": [0, 1]},
    )
    assert True  # smoke test to make sure reached
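
The `_additional_calls` callback is the adaptive-training hook of the low-level `fit` used above: it receives `{model_id: [history dicts]}` (oldest first) and returns `{model_id: additional partial_fit calls}`; models omitted from the result are dropped, and a single `{id: 0}` entry names the winner and stops training. A minimal alternative policy, written as a sketch under those assumptions:

def stop_on_plateau(info, patience=3):
    # info maps model_id -> list of history dicts (oldest first).
    out = {}
    for ident, hist in info.items():
        recent = [h["score"] for h in hist[-patience:]]
        if len(recent) == patience and max(recent) <= recent[0]:
            continue  # plateaued: stop training this model
        out[ident] = 1
    if not out:  # everything plateaued; declare the best model the winner
        best = max(info, key=lambda ident: info[ident][-1]["score"])
        return {best: 0}
    return out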
Example 25
def test_history(c, s, a, b):
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    alg = IncrementalSearchCV(model, params, max_iter=9, random_state=42)
    yield alg.fit(X, y)
    gt_zero = lambda x: x >= 0
    gt_one = lambda x: x >= 1

    key_types_and_checks = [
        ("mean_partial_fit_time", float, gt_zero),
        ("mean_score_time", float, gt_zero),
        ("std_partial_fit_time", float, gt_zero),
        ("std_score_time", float, gt_zero),
        ("test_score", float, gt_zero),
        ("rank_test_score", int, gt_one),
        ("model_id", int, None),
        ("partial_fit_calls", int, gt_zero),
        ("params", dict, lambda d: set(d.keys()) == {"value"}),
        ("param_value", float, gt_zero),
    ]
    assert set(alg.cv_results_) == {v[0] for v in key_types_and_checks}
    for column, dtype, condition in key_types_and_checks:
        if dtype:
            assert alg.cv_results_[column].dtype == dtype
        if condition:
            assert all(condition(x) for x in alg.cv_results_[column])

    alg.best_estimator_.fit(X, y)
    alg.best_estimator_.score(X, y)
    alg.score(X, y)

    # Test types/format of all parameters we set after fitting
    assert isinstance(alg.best_index_, int)
    assert isinstance(alg.best_estimator_, ConstantFunction)
    assert isinstance(alg.best_score_, float)
    assert isinstance(alg.best_params_, dict)
    assert isinstance(alg.history_, list)
    assert all(isinstance(h, dict) for h in alg.history_)
    assert isinstance(alg.model_history_, dict)
    assert all(vi in alg.history_ for v in alg.model_history_.values()
               for vi in v)
    assert all(isinstance(v, np.ndarray) for v in alg.cv_results_.values())
    assert isinstance(alg.multimetric_, bool)

    keys = {
        "score",
        "score_time",
        "partial_fit_calls",
        "partial_fit_time",
        "model_id",
        "elapsed_wall_time",
        "params",
    }
    assert all(set(h.keys()) == keys for h in alg.history_)
    times = [v["elapsed_wall_time"] for v in alg.history_]
    assert (np.diff(times) >= 0).all()

    # Test to make sure history_ ordered with wall time
    assert (np.diff([v["elapsed_wall_time"] for v in alg.history_]) >= 0).all()
    for model_hist in alg.model_history_.values():
        calls = [h["partial_fit_calls"] for h in model_hist]
        assert (np.diff(calls) >= 1).all() or len(calls) == 1
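
Since the test above verifies that every `cv_results_` value is a numpy array with one entry per sampled model (mirroring scikit-learn's convention), the whole structure loads directly into pandas for inspection:

# Usage sketch: rank 1 is the best test_score; columns match the
# key_types_and_checks list asserted above.
df = pd.DataFrame(alg.cv_results_)
print(df.sort_values("rank_test_score")[["model_id", "param_value", "test_score"]].head())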