# Imports assumed from context: these tests exercise the incremental `fit`
# coroutine (dask-ml style) against a distributed test cluster.
import random

import numpy as np
import toolz
from tornado import gen

from distributed import Future
from distributed.utils_test import gen_cluster
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import ParameterSampler

from dask_ml.datasets import make_classification
from dask_ml.model_selection._incremental import fit


# c, s, a, b are the client, scheduler, and two workers supplied by gen_cluster
@gen_cluster(client=True)
def test_explicit(c, s, a, b):
    X, y = make_classification(n_samples=1000, n_features=10, chunks=(200, 10))
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = [{"alpha": 0.1}, {"alpha": 0.2}]

    def additional_calls(scores):
        """Progress through predefined updates, checking along the way"""
        ts = scores[0][-1]["partial_fit_calls"]
        ts -= 1  # partial_fit_calls = time step + 1
        if ts == 0:
            assert len(scores) == len(params)
            assert len(scores[0]) == 1
            assert len(scores[1]) == 1
            return {k: 2 for k in scores}
        elif ts == 2:
            assert len(scores) == len(params)
            assert len(scores[0]) == 2
            assert len(scores[1]) == 2
            return {0: 1, 1: 0}
        elif ts == 3:
            assert len(scores) == len(params)
            assert len(scores[0]) == 3
            assert len(scores[1]) == 2
            return {0: 3}
        elif ts == 6:
            assert len(scores) == 1
            assert len(scores[0]) == 4
            return {0: 0}
        else:
            raise Exception()

    # fit returns (info, models, history, best), matching the unpacking in
    # test_basic below; best is unused here
    info, models, history, _ = yield fit(
        model,
        params,
        X,
        y,
        X.blocks[-1],
        y.blocks[-1],
        additional_calls,
        scorer=None,
        fit_params={"classes": [0, 1]},
    )

    assert all(model.done() for model in models.values())

    models = yield models  # resolve the futures held in the dict
    model = models[0]
    meta = info[0][-1]

    assert meta["params"] == {"alpha": 0.1}
    assert meta["partial_fit_calls"] == 6 + 1
    assert len(models) == len(info) == 1
    assert meta["partial_fit_calls"] == history[-1]["partial_fit_calls"]
    assert set(models.keys()) == {0}

    del models[0]

    while s.tasks or c.futures:  # all data clears out
        yield gen.sleep(0.01)
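
# The test above pins down the `additional_calls` contract as exercised here:
# the callback receives the per-model history ({model_id: [meta, ...]}, newest
# meta last) and returns {model_id: n_additional_partial_fit_calls}; models
# omitted from the returned dict are dropped, and an all-zero return stops
# training. Below is a minimal sketch of a policy honoring that contract,
# assuming only the "partial_fit_calls" meta key used above; `stop_after` and
# `max_calls` are illustrative names, not part of the library.


def stop_after(max_calls):
    """Build an additional_calls policy that trains every surviving model
    one more partial_fit call per round, up to ``max_calls`` calls total."""

    def additional_calls(info):
        out = {}
        for model_id, records in info.items():
            calls = records[-1]["partial_fit_calls"]
            # 1 requests another call; 0 marks this model as finished.
            out[model_id] = 1 if calls < max_calls else 0
        return out

    return additional_calls
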
@gen_cluster(client=True)
def test_basic(c, s, a, b):
    X, y = make_classification(n_samples=1000, n_features=5, chunks=100)
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")

    params = {"alpha": np.logspace(-2, 1, num=50), "l1_ratio": [0.01, 1.0]}

    X_test, y_test = X[:100], y[:100]
    X_train = X[100:]
    y_train = y[100:]

    n_parameters = 5
    param_list = list(ParameterSampler(params, n_parameters))

    def additional_calls(info):
        pf_calls = {k: v[-1]["partial_fit_calls"] for k, v in info.items()}
        ret = {k: int(calls < 10) for k, calls in pf_calls.items()}

        if len(ret) == 1:
            return {list(ret)[0]: 0}

        # Don't train one model: dropping a key from the returned dict
        # deletes that model
        some_keys = set(ret.keys()) - {0}
        del ret[random.choice(list(some_keys))]
        return ret

    info, models, history, best = yield fit(
        model,
        param_list,
        X_train,
        y_train,
        X_test,
        y_test,
        additional_calls,
        fit_params={"classes": [0, 1]},
    )

    # Ensure that we touched all data
    keys = {t[0] for t in s.transition_log}
    L = [str(k) in keys for kk in X_train.__dask_keys__() for k in kk]
    assert all(L)

    for model in models.values():
        assert isinstance(model, Future)
        model2 = yield model
        assert isinstance(model2, SGDClassifier)

    XX_test, yy_test = yield c.compute([X_test, y_test])
    model = yield models[0]
    assert model.score(XX_test, yy_test) == info[0][-1]["score"]

    # `<` not `==` because we randomly dropped one model
    assert len(history) < n_parameters * 10
    for h in history:
        assert {
            "partial_fit_time",
            "score_time",
            "score",
            "model_id",
            "params",
            "partial_fit_calls",
        }.issubset(set(h.keys()))

    groups = toolz.groupby("partial_fit_calls", history)
    assert len(groups[1]) > len(groups[2]) > len(groups[3]) > len(groups[max(groups)])
    assert max(groups) == n_parameters

    keys = list(models.keys())
    for key in keys:
        del models[key]

    while c.futures or s.tasks:  # ensure the run cleans up after itself
        yield gen.sleep(0.01)

    # smoke test for ndarray X_test and y_test
    X_test, y_test = yield c.compute([X_test, y_test])
    info, models, history, best = yield fit(
        model,
        param_list,
        X_train,
        y_train,
        X_test,
        y_test,
        additional_calls,
        fit_params={"classes": [0, 1]},
    )
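
# The random-drop policy in test_basic exercises model deletion; a more
# realistic policy would keep training only models whose held-out score is
# still improving. A hedged sketch built on the meta keys asserted above
# ("score", "partial_fit_calls"); `stop_on_plateau`, `patience`, `tol`, and
# `max_calls` are made-up names for illustration.


def stop_on_plateau(patience=3, tol=1e-3, max_calls=50):
    def additional_calls(info):
        out = {}
        for model_id, records in info.items():
            if records[-1]["partial_fit_calls"] >= max_calls:
                out[model_id] = 0
                continue
            if len(records) > patience:
                # Compare the newest score with the best score achieved more
                # than `patience` records ago; 0 halts a model that is flat.
                old = max(r["score"] for r in records[:-patience])
                if records[-1]["score"] - old < tol:
                    out[model_id] = 0
                    continue
            out[model_id] = 1
        return out

    return additional_calls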