Example #1

# Imports these snippets rely on (reconstructed; module paths assumed):
from distributed.utils_test import gen_cluster  # test harness (assumed here)
from sklearn.linear_model import SGDClassifier
from tornado import gen

from dask_ml.datasets import make_classification
from dask_ml.model_selection._incremental import fit


# Assumed decorator: ``gen_cluster(client=True)`` supplies the client ``c``,
# scheduler ``s``, and two workers ``a`` and ``b`` used below.
@gen_cluster(client=True)
def test_explicit(c, s, a, b):
    X, y = make_classification(n_samples=1000, n_features=10, chunks=(200, 10))
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = [{"alpha": 0.1}, {"alpha": 0.2}]

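    # ``additional_calls`` is the hook driving the adaptive search: it receives
    # the per-model score history and returns ``{model_id: n_more_calls}``.
    # Models omitted from the return value are dropped from the search, and a
    # value of 0 stops training for that model.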
    def additional_calls(scores):
        """ Progress through predefined updates, checking along the way """
        ts = scores[0][-1]["partial_fit_calls"]
        ts -= 1  # partial_fit_calls = time step + 1
        if ts == 0:
            assert len(scores) == len(params)
            assert len(scores[0]) == 1
            assert len(scores[1]) == 1
            return {k: 2 for k in scores}
        elif ts == 2:
            assert len(scores) == len(params)
            assert len(scores[0]) == 2
            assert len(scores[1]) == 2
            return {0: 1, 1: 0}
        elif ts == 3:
            assert len(scores) == len(params)
            assert len(scores[0]) == 3
            assert len(scores[1]) == 2
            return {0: 3}
        elif ts == 6:
            assert len(scores) == 1
            assert len(scores[0]) == 4
            return {0: 0}
        else:
            raise Exception("unexpected time step: {}".format(ts))

    info, models, history = yield fit(
        model,
        params,
        X,
        y,
        X.blocks[-1],
        y.blocks[-1],
        additional_calls,
        scorer=None,
        fit_params={"classes": [0, 1]},
    )
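    # Each value of ``models`` is a distributed Future wrapping a fitted model.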
    assert all(model.done() for model in models.values())

    models = yield models
    model = models[0]
    meta = info[0][-1]

    assert meta["params"] == {"alpha": 0.1}
    assert meta["partial_fit_calls"] == 6 + 1
    assert len(models) == len(info) == 1
    assert meta["partial_fit_calls"] == history[-1]["partial_fit_calls"]
    assert set(models.keys()) == {0}
    del models[0]

    while s.tasks or c.futures:  # all data clears out
        yield gen.sleep(0.01)
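
Both examples steer dask_ml.model_selection._incremental.fit through the
additional_calls callback. As a point of reference, a minimal sketch of such a
callback (illustrative only, not taken from either test: the name
stop_after_ten and the 10-call budget are invented here) could look like:

def stop_after_ten(info):
    # info maps model_id -> list of history records, newest last.
    out = {}
    for ident, records in info.items():
        if records[-1]["partial_fit_calls"] < 10:
            out[ident] = 1  # ask for one more partial_fit call
    if not out:
        # Every model hit the budget: explicitly stop the best scorer.
        best = max(info, key=lambda ident: info[ident][-1]["score"])
        return {best: 0}
    return out
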
Example #2

import random

import numpy as np
import toolz
from distributed import Future
from distributed.utils_test import gen_cluster  # test harness (assumed here)
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import ParameterSampler
from tornado import gen

from dask_ml.datasets import make_classification
from dask_ml.model_selection._incremental import fit


# Assumed decorator, as in Example #1.
@gen_cluster(client=True)
def test_basic(c, s, a, b):
    X, y = make_classification(n_samples=1000, n_features=5, chunks=100)
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")

    params = {"alpha": np.logspace(-2, 1, num=50), "l1_ratio": [0.01, 1.0]}

    X_test, y_test = X[:100], y[:100]
    X_train = X[100:]
    y_train = y[100:]

    n_parameters = 5
    param_list = list(ParameterSampler(params, n_parameters))

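    # Adaptive hook: request one more partial_fit call for any model with fewer
    # than 10 calls so far, and randomly drop one model (never model 0) each
    # round until a single model remains.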
    def additional_calls(info):
        pf_calls = {k: v[-1]["partial_fit_calls"] for k, v in info.items()}
        ret = {k: int(calls < 10) for k, calls in pf_calls.items()}
        if len(ret) == 1:
            return {list(ret)[0]: 0}

        # Don't train one model
        some_keys = set(ret.keys()) - {0}
        del ret[random.choice(list(some_keys))]
        return ret

    info, models, history, best = yield fit(
        model,
        param_list,
        X_train,
        y_train,
        X_test,
        y_test,
        additional_calls,
        fit_params={"classes": [0, 1]},
    )

    # Ensure that we touched all data
    keys = {t[0] for t in s.transition_log}
    L = [str(k) in keys for kk in X_train.__dask_keys__() for k in kk]
    assert all(L)

    for model in models.values():
        assert isinstance(model, Future)
        model2 = yield model
        assert isinstance(model2, SGDClassifier)
    XX_test, yy_test = yield c.compute([X_test, y_test])
    model = yield models[0]
    assert model.score(XX_test, yy_test) == info[0][-1]["score"]

    # `<` not `==` because we randomly dropped one model
    assert len(history) < n_parameters * 10
    for h in history:
        assert {
            "partial_fit_time",
            "score_time",
            "score",
            "model_id",
            "params",
            "partial_fit_calls",
        }.issubset(set(h.keys()))

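    # Adaptive dropping means fewer models survive to later time steps, so the
    # history groups shrink as partial_fit_calls grows.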
    groups = toolz.groupby("partial_fit_calls", history)
    assert (
        len(groups[1]) > len(groups[2]) > len(groups[3]) > len(groups[max(groups)])
    )
    assert max(groups) == n_parameters

    keys = list(models.keys())
    for key in keys:
        del models[key]

    while c.futures or s.tasks:  # Cleans up cleanly after running
        yield gen.sleep(0.01)

    # smoke test for ndarray X_test and y_test
    X_test, y_test = yield c.compute([X_test, y_test])
    info, models, history, best = yield fit(
        model,
        param_list,
        X_train,
        y_train,
        X_test,
        y_test,
        additional_calls,
        fit_params={"classes": [0, 1]},
    )
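
Note that the final block gathers X_test and y_test into NumPy arrays before
calling fit again, so the validation data may be either dask collections or
in-memory ndarrays.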