コード例 #1
0
    def _test_mirrors_paper(c, s, a, b):
        """Fit Hyperband and check the layout of its metadata.

        ``max_iter`` and ``aggressiveness`` are taken from the enclosing
        scope. After fitting, the pre-fit ``metadata`` must equal the post-fit
        ``metadata_`` and expose the expected keys at the top level and in
        every bracket.
        """
        X, y = make_classification(n_samples=10, n_features=4, chunks=10)
        estimator = ConstantFunction()
        search_space = {"value": np.random.rand(max_iter)}
        search = HyperbandSearchCV(
            estimator,
            search_space,
            max_iter=max_iter,
            random_state=0,
            aggressiveness=aggressiveness,
        )
        yield search.fit(X, y)

        assert search.metadata == search.metadata_

        assert isinstance(search.metadata["brackets"], list)
        top_level_keys = {"n_models", "partial_fit_calls", "brackets"}
        assert set(search.metadata.keys()) == top_level_keys

        # Iterating over search.metadata["brackets"] also covers
        # search.metadata_ because the two were asserted equal above.
        per_bracket_keys = {
            "n_models",
            "partial_fit_calls",
            "bracket",
            "SuccessiveHalvingSearchCV params",
            "decisions",
        }
        for bracket in search.metadata["brackets"]:
            assert set(bracket.keys()) == per_bracket_keys

        if aggressiveness == 3:
            # The best score must equal the largest sampled "value".
            assert search.best_score_ == search_space["value"].max()
コード例 #2
0
ファイル: test_hyperband.py プロジェクト: stsievert/dask-ml
def test_min_max_iter(c, s, a, b):
    """Make sure Hyperband works with the smallest budget, ``max_iter=1``.

    Tests for ``max_iter < 1`` are in test_incremental.py.
    """
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)

    search = HyperbandSearchCV(
        ConstantFunction(),
        {"value": scipy.stats.uniform(0, 1)},
        max_iter=1,
    )
    yield search.fit(X, y)
    assert search.best_score_ > 0
コード例 #3
0
def test_cv_results_order_preserved(c, s, a, b):
    """Check each ``cv_results_`` row against that model's last history entry.

    The bracket, the parameters and the test score in every row must agree
    with the final record stored for the same model in ``model_history_``.
    """
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    search = HyperbandSearchCV(
        ConstantFunction(),
        {"value": scipy.stats.uniform(0, 1)},
        max_iter=9,
        random_state=42,
    )
    yield search.fit(X, y)

    # Most recent history record of every model, keyed by model ID.
    latest = {
        model_id: records[-1]
        for model_id, records in search.model_history_.items()
    }
    results = pd.DataFrame(search.cv_results_)
    for _, row in results.iterrows():
        final_record = latest[row["model_id"]]
        assert row["bracket"] == final_record["bracket"]
        assert row["params"] == final_record["params"]
        assert np.allclose(row["test_score"], final_record["score"])
コード例 #4
0
def test_params_passed():
    """Check that constructor arguments reach every bracket's SHA params.

    This makes sure that the "SuccessiveHalvingSearchCV params" key in
    ``HyperbandSearchCV.metadata["brackets"]`` is correct: every keyword
    given to Hyperband must be stored on the instance and repeated in each
    bracket's parameters. ``random_state`` is the exception; the brackets
    must all carry distinct seeds.
    """
    est = ConstantFunction(value=0.4)
    # Hyperparameter search space. Kept separate from the constructor
    # keywords below so that it is not overwritten before being used.
    search_space = {"value": np.linspace(0, 1)}
    kwargs = {
        "aggressiveness": 3.5,
        "max_iter": 253,
        "random_state": 42,
        "scoring": False,
        "test_size": 0.212,
        "tol": 0,
    }
    kwargs["patience"] = (kwargs["max_iter"] // kwargs["aggressiveness"]) + 4
    hyperband = HyperbandSearchCV(est, search_space, **kwargs)

    for k, v in kwargs.items():
        assert getattr(hyperband, k) == v

    brackets = hyperband.metadata["brackets"]
    SHAs_params = [
        bracket["SuccessiveHalvingSearchCV params"] for bracket in brackets
    ]

    for SHA_params in SHAs_params:
        for k, v in kwargs.items():
            if k == "random_state":
                # Seeds are checked for uniqueness below instead.
                continue
            assert SHA_params[k] == v
    seeds = [SHA_params["random_state"] for SHA_params in SHAs_params]
    assert len(set(seeds)) == len(seeds)
コード例 #5
0
def test_successive_halving_params(c, s, a, b):
    """Fit SHAs built from Hyperband's metadata and compare the work done.

    Each SuccessiveHalvingSearchCV is created from a bracket's
    "SuccessiveHalvingSearchCV params" entry. After fitting, its number of
    models and its total partial_fit calls must match what Hyperband's
    metadata reports for that bracket. This is another sanity check that
    parameters are passed correctly (similar to `test_params_passed`).
    """
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    alg = HyperbandSearchCV(model, params, max_iter=27, random_state=42)

    sha_kwargs = [
        meta["SuccessiveHalvingSearchCV params"]
        for meta in alg.metadata["brackets"]
    ]
    searches = [
        SuccessiveHalvingSearchCV(model, params, **kw) for kw in sha_kwargs
    ]

    for expected, sha in zip(alg.metadata["brackets"], searches):
        yield sha.fit(X, y)
        histories = sha.model_history_
        # The last record of each model holds its partial_fit call count.
        total_calls = sum(
            records[-1]["partial_fit_calls"] for records in histories.values()
        )
        assert expected["n_models"] == len(histories)
        assert expected["partial_fit_calls"] == total_calls
コード例 #6
0
ファイル: test_hyperband.py プロジェクト: stsievert/dask-ml
def test_correct_params(c, s, a, b):
    """Make sure that Hyperband has the correct parameters.

    Implemented because Hyperband wraps SHA: this checks the parameter names
    reported by ``get_params`` and by each bracket's SHA parameters (there
    was a case where the max_iter= flag was not passed to
    SuccessiveHalvingSearchCV but should have been). After fitting, every
    SHA must hold the very same estimator and parameters objects.
    """
    est = ConstantFunction()
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    params = {"value": np.linspace(0, 1)}
    search = HyperbandSearchCV(est, params, max_iter=9)

    shared_keys = {
        "estimator",
        "estimator__value",
        "estimator__sleep",
        "parameters",
        "max_iter",
        "test_size",
        "patience",
        "tol",
        "random_state",
        "scoring",
        "verbose",
        "prefix",
    }
    assert set(search.get_params().keys()) == shared_keys | {"aggressiveness"}

    bracket_kwargs = [
        bracket["SuccessiveHalvingSearchCV params"]
        for bracket in search.metadata["brackets"]
    ]
    # SHA gains its own sizing arguments and drops the estimator-related keys.
    sha_only = {"n_initial_parameters", "n_initial_iter", "aggressiveness"}
    estimator_keys = {
        "estimator__sleep",
        "estimator__value",
        "estimator",
        "parameters",
    }
    expected_sha_keys = (shared_keys | sha_only) - estimator_keys

    assert all(set(kwargs) == expected_sha_keys for kwargs in bracket_kwargs)

    # this is testing to make sure that each SHA has the correct estimator
    yield search.fit(X, y)
    for sha in search._SuccessiveHalvings_.values():
        assert search.estimator is sha.estimator
    for sha in search._SuccessiveHalvings_.values():
        assert search.parameters is sha.parameters
コード例 #7
0
def tune_dask(
    clf,
    params,
    X_train,
    y_train,
    X_test,
    y_test,
    n_params=-1,
    max_epochs=-1,
    n_jobs=4,
):
    """Run a Hyperband search on dask arrays and report timing and results.

    The training data are wrapped in dask arrays and rechunked so that each
    block holds ``len(X_train) * max_epochs // n_params`` rows; ``n_params``
    is used as Hyperband's ``max_iter``.

    ``X_test`` and ``y_test`` are accepted but not used here, and ``n_jobs``
    is only recorded in the returned summary.

    Returns
    -------
    (search, data)
        The fitted ``HyperbandSearchCV`` and a dict with the best score and
        parameters, the fit time, the start timestamp, the inputs
        ``n_params``/``n_jobs``/``max_epochs`` and Hyperband's ``n_models``
        and ``partial_fit_calls`` metadata.
    """
    estimator = clone(clf).set_params(prefix="dask")

    max_iter = n_params
    chunk_size = len(X_train) * max_epochs // n_params

    print(f"max_iter, chunk_size = {max_iter}, {chunk_size}")
    print(f"n_params = {n_params}")
    X_train = da.from_array(X_train).rechunk(chunks=(chunk_size, -1))
    y_train = da.from_array(y_train).rechunk(chunks=chunk_size)
    print(y_train.chunks)

    search = HyperbandSearchCV(
        estimator,
        params,
        max_iter=max_iter,
        random_state=42,
        test_size=0.2,
        aggressiveness=4,
    )
    # Read the metadata totals before fitting.
    metadata = search.metadata
    totals = {key: metadata[key] for key in ("n_models", "partial_fit_calls")}

    start = time()
    search.fit(X_train, y_train)
    fit_time = time() - start

    summary = {
        "library": "dask",
        "best_score": search.best_score_,
        "best_params": search.best_params_,
        "fit_time": fit_time,
        "start_time": start,
        "n_params": n_params,
        "n_jobs": n_jobs,
        "max_epochs": max_epochs,
    }
    summary.update(totals)
    return search, summary
コード例 #8
0
async def test_dataframe_inputs(c, s, a, b):
    """Fit Hyperband on a dask DataFrame ``X`` and a dask Series ``y``.

    The test has no assertions; it passes if the fit completes without
    raising.
    """
    features = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    X = dd.from_pandas(features, npartitions=2)
    labels = pd.Series([False, True, True])
    y = dd.from_pandas(labels, npartitions=2)

    search = HyperbandSearchCV(
        ConstantFunction(),
        {"value": scipy.stats.uniform(0, 1)},
        max_iter=9,
        random_state=42,
    )
    await search.fit(X, y)
コード例 #9
0
def test_same_random_state_same_params(c, s, a, b):
    """Check parameters are sampled correctly when random state is specified.

    Hyperband and a passive ``IncrementalSearchCV`` are given the same seed
    and the same distribution. The number of sampled values they share must
    equal the number of models in Hyperband's first listed bracket, which
    makes sure the random state is *correctly* passed from Hyperband to its
    successive halvings.
    """
    seed = 0
    values = scipy.stats.uniform(0, 1)
    hyperband = HyperbandSearchCV(
        ConstantFunction(),
        {"value": values},
        random_state=seed,
        max_iter=9,
    )

    # Passive random search sampling as many models as Hyperband does overall
    passive = IncrementalSearchCV(
        ConstantFunction(),
        {"value": values},
        random_state=seed,
        max_iter=2,
        n_initial_parameters=hyperband.metadata["n_models"],
    )
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    yield hyperband.fit(X, y)
    yield passive.fit(X, y)

    # Values sampled by each search
    hyperband_values = hyperband.cv_results_["param_value"]
    passive_values = passive.cv_results_["param_value"]

    # Sanity checks to make sure all sampled floats are unique
    assert len(set(passive_values)) == len(passive_values)
    assert len(set(hyperband_values)) == len(hyperband_values)

    # The `value`s that are the same for both searches
    shared = set(passive_values) & set(hyperband_values)

    n_first_bracket = hyperband.metadata["brackets"][0]["n_models"]
    assert len(shared) == n_first_bracket
コード例 #10
0
def test_history(c, s, a, b):
    """Check the history Hyperband assembles from its wrapped SHAs.

    This test is required to make sure Hyperband wraps SHA successfully;
    mostly it makes sure the history is ordered by time. There is also a test
    in test_incremental to make sure the history has correct values/etc.
    """
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    search = HyperbandSearchCV(
        ConstantFunction(),
        {"value": scipy.stats.uniform(0, 1)},
        max_iter=9,
        random_state=42,
    )
    yield search.fit(X, y)

    result_ids = search.cv_results_["model_id"]
    assert result_ids.dtype == "<U11"
    assert all(isinstance(model_id, str) for model_id in result_ids)
    assert all("bracket=" in record["model_id"] for record in search.history_)
    assert all("bracket" in record for record in search.history_)

    # Hyperband does a custom ordering of times
    wall_times = [record["elapsed_wall_time"] for record in search.history_]
    assert (np.diff(wall_times) >= 0).all()

    # Make sure results are ordered by partial fit calls for each model
    for records in search.model_history_.values():
        call_counts = [record["partial_fit_calls"] for record in records]
        assert (np.diff(call_counts) >= 1).all() or len(call_counts) == 1
コード例 #11
0
def test_logs_dont_repeat(c, s, a, b):
    """Make sure the dask_ml.model_selection logger doesn't repeat messages.

    Guards against this case: getLogger("dask_ml.model_selection") is piped
    to stdout whenever a bracket of Hyperband starts/each time SHA._fit is
    called, which would print the same message several times.
    """
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    search = HyperbandSearchCV(
        ConstantFunction(),
        {"value": scipy.stats.uniform(0, 1)},
        max_iter=9,
        random_state=42,
    )
    logger = logging.getLogger("dask_ml.model_selection")
    with captured_logger(logger) as logs:
        yield search.fit(X, y)
        assert search.best_score_ > 0  # ensure search ran
        messages = logs.getvalue().splitlines()

    # Second-to-last space-separated token of every "creating" message.
    logged_counts = [
        line.split(" ")[-2] for line in messages if "creating" in line
    ]

    per_bracket = [
        bracket["n_models"] for bracket in search.metadata["brackets"]
    ]
    assert len(per_bracket) == len(set(per_bracket))

    # Make sure only one model creation message is printed per bracket
    # (all brackets have unique n_models as asserted above)
    assert len(logged_counts) == len(set(logged_counts))
コード例 #12
0
def test_hyperband_patience(c, s, a, b):
    """Make sure that specifying patience=True results in less computation.

    The decision points recorded after fitting are compared with those from
    the pre-fit metadata. A second search with ``patience=1`` must emit a
    ``UserWarning`` when fit.
    """
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    max_iter = 27

    alg = HyperbandSearchCV(
        model,
        params,
        max_iter=max_iter,
        patience=True,
        random_state=0,
    )
    yield alg.fit(X, y)

    patience_limit = max_iter // alg.aggressiveness
    observed = [
        bracket.pop("decisions") for bracket in alg.metadata_["brackets"]
    ]
    planned = [
        bracket.pop("decisions") for bracket in alg.metadata["brackets"]
    ]

    for planned_iters, observed_iters in zip(planned, observed):
        within_patience = {k for k in planned_iters if k <= patience_limit}

        # This makes sure that the algorithm is executed faithfully when
        # patience=True (and the proper decision points are preserved even if
        # other stop-on-plateau points are added)
        assert within_patience.issubset(set(observed_iters))

        # This makes sure models aren't trained for too long
        assert all(x <= patience_limit + 1 for x in observed_iters)

    calls_done = alg.metadata_["partial_fit_calls"]
    assert calls_done <= alg.metadata["partial_fit_calls"]
    assert alg.best_score_ >= 0.9

    alg = HyperbandSearchCV(model, params, max_iter=max_iter, patience=1)
    with pytest.warns(UserWarning, match="The goal of `patience`"):
        yield alg.fit(X, y)
コード例 #13
0
def test_random_state_no_seed_different_params():
    """Check how ``random_state`` drives the seed used for the SHAs.

    Two unseeded Hyperband searches must have different ``_SHA_seed`` values
    after ``_get_SHAs`` is called, and two searches built with the same
    ``random_state`` must have equal ones. This test is required because
    Hyperband wraps SHAs and the random state needs to be passed correctly.
    """
    max_iter = 9
    brackets = _get_hyperband_params(max_iter)
    space = {"value": scipy.stats.uniform(0, 1)}

    # No random state: the seeds must differ.
    unseeded_a = HyperbandSearchCV(ConstantFunction(), space, max_iter=max_iter)
    unseeded_b = HyperbandSearchCV(ConstantFunction(), space, max_iter=max_iter)
    unseeded_a._get_SHAs(brackets)
    unseeded_b._get_SHAs(brackets)
    assert unseeded_a._SHA_seed != unseeded_b._SHA_seed

    # Same random state: the seeds must match.
    seeded_a = HyperbandSearchCV(
        ConstantFunction(), space, max_iter=9, random_state=0
    )
    seeded_b = HyperbandSearchCV(
        ConstantFunction(), space, max_iter=9, random_state=0
    )
    seeded_a._get_SHAs(brackets)
    seeded_b._get_SHAs(brackets)
    assert seeded_a._SHA_seed == seeded_b._SHA_seed
コード例 #14
0
    def _test_basic(c, s, a, b):
        """Fit Hyperband on a small sign-classification problem and check it.

        ``array_type``, ``library`` and ``max_iter`` are taken from the
        enclosing scope. The checks cover the best score and estimator, the
        number of models and partial_fit calls for the given ``max_iter``,
        and the model IDs recorded in the history.
        """
        rng = da.random.RandomState(42)

        n, d = (50, 2)
        # create observations we know linear models can fit
        X = rng.normal(size=(n, d), chunks=n // 2)
        coef_star = rng.uniform(size=d, chunks=d)
        y = da.sign(X.dot(coef_star))

        if array_type == "numpy":
            X, y = yield c.compute((X, y))

        params = {
            "loss": [
                "hinge",
                "log",
                "modified_huber",
                "squared_hinge",
                "perceptron",
            ],
            "average": [True, False],
            "learning_rate": ["constant", "invscaling", "optimal"],
            "eta0": np.logspace(-2, 0, num=1000),
        }
        model = SGDClassifier(
            tol=-np.inf, penalty="elasticnet", random_state=42, eta0=0.1
        )
        if library == "dask-ml":
            model = Incremental(model)
            params = {"estimator__" + k: v for k, v in params.items()}
        elif library == "ConstantFunction":
            model = ConstantFunction()
            params = {"value": np.linspace(0, 1, num=1000)}

        search = HyperbandSearchCV(
            model, params, max_iter=max_iter, random_state=42
        )
        classes = c.compute(da.unique(y))
        yield search.fit(X, y, classes=classes)

        if library == "dask-ml":
            X, y = yield c.compute((X, y))
        score = search.best_estimator_.score(X, y)
        assert score == search.score(X, y)
        assert 0 <= score <= 1

        if library == "ConstantFunction":
            assert score == search.best_score_
        else:
            # These are not equal because IncrementalSearchCV uses a train/test
            # split and we're testing on the entire train dataset, not only the
            # validation/test set.
            assert abs(score - search.best_score_) < 0.1

        assert type(search.best_estimator_) is type(model)
        assert isinstance(search.best_params_, dict)

        num_fit_models = len(set(search.cv_results_["model_id"]))
        # The last history record of each model holds its final call count.
        num_pf_calls = sum(
            v[-1]["partial_fit_calls"] for v in search.model_history_.values()
        )
        models = {9: 17, 15: 17, 20: 17, 27: 49, 30: 49, 81: 143}
        pf_calls = {9: 69, 15: 101, 20: 144, 27: 357, 30: 379, 81: 1581}
        assert num_fit_models == models[max_iter]
        assert num_pf_calls == pf_calls[max_iter]

        best_idx = search.best_index_
        if isinstance(model, ConstantFunction):
            assert search.cv_results_["test_score"][best_idx] == max(
                search.cv_results_["test_score"]
            )
        model_ids = {h["model_id"] for h in search.history_}

        # Detect an exact power of 3 with integer arithmetic: the float
        # logarithm is not guaranteed to be exact, so testing
        # ``math.log(max_iter, 3) % 1.0 == 0`` can miss a true power of 3.
        exponent = round(math.log(max_iter, 3))
        if 3 ** exponent == max_iter:
            # The good case, when max_iter is a power of the aggressiveness
            # this check assumes (3). In this case, assert that more models
            # are tried than max_iter.
            assert len(model_ids) > max_iter
        else:
            # Otherwise, give some padding "almost as many estimators are tried
            # as max_iter". 3 is a fudge number chosen to be the minimum; when
            # max_iter=20, len(model_ids) == 17.
            assert len(model_ids) + 3 >= max_iter

        assert all("bracket" in id_ for id_ in model_ids)
コード例 #15
0
# Budget: 15 parameter settings, and 15 passes' worth of training examples.
n_params = 15
n_examples = n_params * len(X_train)

max_iter = n_params  # number of times partial_fit will be called
chunks = n_examples // n_params  # number of examples each call sees

print(f"max_iter:{max_iter},chunks:{chunks} ")


# In[12]:


# Hyperband search over `params` with patience enabled and a fixed seed.
ml_hyperband = HyperbandSearchCV(
    estimator=Hb_model,
    parameters=params,
    max_iter=max_iter,
    patience=True,
    random_state=101,
)

# Bare expression: displayed as the cell output in a notebook.
ml_hyperband.metadata["partial_fit_calls"]


# In[13]:


print(" hyperband search")
ml_hyperband


# In[14]:
コード例 #16
0
# Hyperparameter search space: lists are discrete choices, the
# loguniform/uniform entries are distributions to sample from.
params = {
    "module__activation": [
        "relu",
        "elu",
    ],
    "batch_size": [32, 64],
    "optimizer__lr": loguniform(1e-4, 1e-3),
    "optimizer__weight_decay": loguniform(1e-6, 1e-3),
    "optimizer__momentum": uniform(0, 1),
    "optimizer__nesterov": [True],
}

from dask_ml.model_selection import HyperbandSearchCV
# Hyperband search over `params` with a fixed seed and verbose output.
search = HyperbandSearchCV(model,
                           params,
                           random_state=2,
                           verbose=True,
                           max_iter=9)

# Reshape the targets into a single column and persist them before fitting.
y_train2 = y_train.reshape(-1, 1).persist()
search.fit(X_train, y_train2)

# Report the best score, parameters and estimator found by the search.
print(search.best_score_)

print(search.best_params_)

print(search.best_estimator_)

from dask_ml.wrappers import ParallelPostFit
# Wrap the best estimator in ParallelPostFit and score it on the test data.
# NOTE(review): y_test is passed as-is while the training targets were
# reshaped to a column above -- confirm the shapes are compatible.
deployed_model = ParallelPostFit(search.best_estimator_)
deployed_model.score(X_test, y_test)