def test_search_max_iter(c, s, a, b):
    """With max_iter=1, no model may receive more than one partial_fit call."""
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {
        "alpha": np.logspace(-2, 10, 10),
        "l1_ratio": np.linspace(0.01, 1, 20),
    }

    search = IncrementalSearchCV(model, params, n_initial_parameters=10, max_iter=1)
    yield search.fit(X, y, classes=[0, 1])

    for record in search.history_:
        assert record["partial_fit_calls"] <= 1
def test_fit_rechunking():
    """Incremental.fit should work when X is chunked along the feature axis."""
    n_classes = 2
    X, y = make_classification(chunks=20, n_classes=n_classes)

    X = X.rechunk({1: 10})
    # Sanity check: there really is more than one block along axis 1.
    assert X.numblocks[1] > 1

    clf = Incremental(SGDClassifier(max_iter=5))
    clf.fit(X, y, classes=list(range(n_classes)))
def test_warns_scores_per_fit(c, s, a, b):
    """Passing scores_per_fit should emit the deprecation warning."""
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)
    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    search = IncrementalSearchCV(model, params, scores_per_fit=2)
    with pytest.warns(UserWarning, match="deprecated since Dask-ML v1.4.0"):
        yield search.fit(X, y)
def test_no_method_raises():
    """Calling a method the wrapped estimator lacks raises AttributeError."""
    clf = ParallelPostFit(LinearRegression())
    X, y = make_classification(chunks=50)
    clf.fit(X, y)

    # LinearRegression has no predict_proba, so the wrapper must refuse it.
    with pytest.raises(AttributeError) as m:
        clf.predict_proba(X)
    assert m.match("The wrapped estimator .* 'predict_proba' method.")
def test_smaller(c, s, a, b):
    # NOTE(review): the original comment here flagged a possible infinite
    # loop with this tiny grid — presumably a regression test; confirm.
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {"alpha": [0.1, 0.5]}

    search = IncrementalSearchCV(model, params, n_initial_parameters="grid")
    yield search.fit(X, y, classes=[0, 1])

    # Prediction also works on the concrete (in-memory) array.
    (X_,) = yield c.compute([X])
    search.predict(X_)
def test_transform(c, s, a, b):
    """transform delegates to best_estimator_ and keeps the expected shape."""
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = MiniBatchKMeans(random_state=0)
    params = {"n_clusters": [3, 4, 5], "n_init": [1, 2]}

    search = IncrementalSearchCV(model, params, n_initial_parameters="grid")
    yield search.fit(X, y)

    (X_,) = yield c.compute([X])
    transformed = search.transform(X_)
    # KMeans.transform yields one distance column per cluster.
    assert transformed.shape == (100, search.best_estimator_.n_clusters)
def test_big(fit_intercept):
    """Smoke-test the LogisticRegression API with/without an intercept."""
    X, y = make_classification(chunks=50)
    lr = LogisticRegression(fit_intercept=fit_intercept)
    lr.fit(X, y)

    # Exercise the prediction surface; results are not inspected here.
    lr.decision_function(X)
    lr.predict(X)
    lr.predict_proba(X)

    if fit_intercept:
        assert lr.intercept_ is not None
def test_min_max_iter(c, s, a, b):
    # Hyperband must work with max_iter=1; max_iter < 1 is covered in
    # test_incremental.py.
    values = scipy.stats.uniform(0, 1)
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)

    max_iter = 1
    h = HyperbandSearchCV(ConstantFunction(), {"value": values}, max_iter=max_iter)
    yield h.fit(X, y)
    assert h.best_score_ > 0
def test_search_plateau_tol(c, s, a, b):
    # NOTE(review): this file defines test_search_plateau_tol twice; pytest
    # only collects the later definition — consider renaming one of them.
    model = LinearFunction(slope=1)
    params = {"foo": np.linspace(0, 1)}

    # Score grows by 3 over every 3 calls. With tol=1 the model improves
    # enough, so every model trains all the way to max_iter.
    search = IncrementalSearchCV(
        model, params, patience=3, tol=1, max_iter=10, decay_rate=0
    )
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    yield search.fit(X, y)
    assert set(search.cv_results_["partial_fit_calls"]) == {10}

    # With tol=4 the improvement is insufficient, so training stops after
    # the 3-call patience window.
    search = IncrementalSearchCV(
        model, params, patience=3, tol=4, decay_rate=0, max_iter=10
    )
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    yield search.fit(X, y)
    assert set(search.cv_results_["partial_fit_calls"]) == {3}
def _test_verbosity(c, s, a, b):
    # NOTE(review): ``max_iter`` and ``verbose`` are free variables here —
    # presumably bound in an enclosing scope or by the caller; confirm at
    # the call site before reusing this helper.
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}

    search = IncrementalSearchCV(model, params, max_iter=max_iter, verbose=verbose)
    yield search.fit(X, y)
    return search
def single_chunk_count_classification():
    """X, y pair for classification.

    The `X` and `y` have a single block, so chunksize is 100. Useful for
    testing `partial_fit` methods. The `X` data is count data.
    """
    X, y = make_classification(chunks=100, random_state=0)
    # Convert features to non-negative integers ("counts").
    X = (abs(X) * 10).astype(int)
    return X, y
def Xl_blobs():
    """Tuple of (X, labels) for a classification task.

    `X` and the labels are both dask arrays.
    """
    X, labels = make_classification(
        n_samples=1000, n_features=4, chunks=500, random_state=1
    )
    return X, labels
def test_fit_solver(solver):
    """LogisticRegression.fit should succeed for each solver.

    Skips on dask-glm <= 0.2.0, which emits a FutureWarning for dask config.
    """
    import dask_glm
    import packaging.version

    # Fix: distutils.version.LooseVersion is deprecated (PEP 632; distutils
    # was removed in Python 3.12) and the original compared a LooseVersion
    # against a raw string. Use packaging.version, matching the other
    # test_fit_solver definition in this file.
    if packaging.version.parse(dask_glm.__version__) <= packaging.version.parse(
        "0.2.0"
    ):
        pytest.skip("FutureWarning for dask config.")

    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    lr = LogisticRegression(solver=solver)
    lr.fit(X, y)
def test_search_plateau_tol(c, s, a, b):
    # NOTE(review): this file defines test_search_plateau_tol twice; pytest
    # only collects the later definition — consider renaming one of them.
    class LinearFunction(BaseEstimator):
        """Toy estimator whose score grows linearly with partial_fit calls."""

        def __init__(self, intercept=0, slope=1, foo=0):
            self._num_calls = 0
            self.intercept = intercept
            self.slope = slope
            # Fix: sklearn's estimator contract requires every __init__
            # parameter to be stored under an attribute of the same name so
            # that get_params()/clone() work; ``foo`` was previously dropped.
            self.foo = foo
            super().__init__()

        def fit(self, *args):
            return self

        def partial_fit(self, *args, **kwargs):
            self._num_calls += 1
            return self

        def score(self, *args, **kwargs):
            # Score after k partial_fit calls is intercept + slope * k.
            return self.intercept + self.slope * self._num_calls

    model = LinearFunction(slope=1)
    params = {"foo": np.linspace(0, 1)}

    # Every 3 calls, the score increases by 3. With tol=1 the model improved
    # enough, so training runs all the way to max_iter.
    search = IncrementalSearchCV(
        model, params, patience=3, tol=1, max_iter=10, decay_rate=0
    )
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    yield search.fit(X, y)
    assert set(search.cv_results_["partial_fit_calls"]) == {10}

    # Every 3 calls, the score increases by 3. With tol=4 the model didn't
    # improve enough, so training stops after the 3-call patience window.
    search = IncrementalSearchCV(
        model, params, patience=3, tol=4, decay_rate=0, max_iter=10
    )
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    yield search.fit(X, y)
    assert set(search.cv_results_["partial_fit_calls"]) == {3}
def test_numpy_array(c, s, a, b):
    # NOTE(review): test_numpy_array is defined twice in this file; pytest
    # only collects the later definition — consider renaming one of them.
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    # Materialize to in-memory numpy arrays before fitting.
    X, y = yield c.compute([X, y])

    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {
        "alpha": np.logspace(-2, 10, 10),
        "l1_ratio": np.linspace(0.01, 1, 20),
    }
    search = IncrementalSearchCV(model, params, n_initial_parameters=10)
    yield search.fit(X, y, classes=[0, 1])
def test_fit_solver(solver):
    """LogisticRegression.fit should succeed for each solver (skip on old dask-glm)."""
    import dask_glm
    import packaging.version

    installed = packaging.version.parse(dask_glm.__version__)
    if installed <= packaging.version.parse("0.2.0"):
        pytest.skip("FutureWarning for dask config.")

    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    lr = LogisticRegression(solver=solver)
    lr.fit(X, y)
def test_it_works():
    """ParallelPostFit returns lazy dask arrays and matches estimator scoring."""
    clf = ParallelPostFit(GradientBoostingClassifier())
    X, y = make_classification(n_samples=1000, chunks=100)
    clf.fit(X, y)

    assert isinstance(clf.predict(X), da.Array)
    assert isinstance(clf.predict_proba(X), da.Array)

    result = clf.score(X, y)
    expected = clf.estimator.score(X, y)
    assert result == expected
def simple_example():
    """End-to-end demo: fit LogisticRegression on a dask DataFrame's values."""
    X, y = make_classification(n_samples=10000, n_features=2, chunks=50)
    X = dd.from_dask_array(X, columns=["a", "b"])
    y = dd.from_array(y)

    lr = LogisticRegression()
    lr.fit(X.values, y.values)

    print("Predictions =", lr.predict(X.values).compute())
    print("Probabilities =", lr.predict_proba(X.values).compute())
    print("Scores =", lr.score(X.values, y.values).compute())
def test_fit(self):
    """Dask RobustScaler.fit should agree with scikit-learn's within rtol."""
    a = dpp.RobustScaler()
    b = spp.RobustScaler()

    # Bigger data makes the percentile estimates more reliable, and the +3
    # shift keeps values away from 0 so the rtol comparison is meaningful.
    X, y = make_classification(n_samples=1000, chunks=200, random_state=0)
    X = X + 3

    a.fit(X)
    b.fit(X.compute())
    assert_estimator_equal(a, b, rtol=0.2)
async def test_model_future(c, s, a, b):
    """The estimator may be supplied as a distributed Future instead of an object."""
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)
    params = {"value": np.random.RandomState(42).rand(1000)}

    model = ConstantFunction()
    model_future = await c.scatter(model)
    search = IncrementalSearchCV(model_future, params, max_iter=10)

    await search.fit(X, y, classes=[0, 1])
    assert search.history_
    assert search.best_score_ > 0
def test_hyperband_patience(c, s, a, b):
    # Specifying patience=True should result in less computation than the
    # faithful Hyperband schedule while preserving its decision points.
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    max_iter = 27

    alg = HyperbandSearchCV(
        model, params, max_iter=max_iter, patience=True, random_state=0
    )
    yield alg.fit(X, y)

    alg_patience = max_iter // alg.aggressiveness
    actual_decisions = [
        bracket.pop("decisions") for bracket in alg.metadata_["brackets"]
    ]
    paper_decisions = [
        bracket.pop("decisions") for bracket in alg.metadata["brackets"]
    ]

    for paper_iter, actual_iter in zip(paper_decisions, actual_decisions):
        trimmed_paper_iter = {k for k in paper_iter if k <= alg_patience}
        # The algorithm must be executed faithfully when patience=True: the
        # proper decision points are preserved even if other stop-on-plateau
        # points are added ...
        assert trimmed_paper_iter.issubset(set(actual_iter))
        # ... and no model is trained for too long.
        assert all(x <= alg_patience + 1 for x in actual_iter)

    assert alg.metadata_["partial_fit_calls"] <= alg.metadata["partial_fit_calls"]
    assert alg.best_score_ >= 0.9

    max_iter = 6
    kwargs = dict(max_iter=max_iter, aggressiveness=2)

    # An integer patience smaller than the schedule warrants a warning.
    alg = HyperbandSearchCV(model, params, patience=2, **kwargs)
    with pytest.warns(UserWarning, match="The goal of `patience`"):
        yield alg.fit(X, y)

    # With tol disabled (NaN or None), plateau detection never fires.
    alg = HyperbandSearchCV(model, params, patience=2, tol=np.nan, **kwargs)
    yield alg.fit(X, y)
    assert pd.DataFrame(alg.history_).partial_fit_calls.max() == max_iter

    alg = HyperbandSearchCV(model, params, patience=2, tol=None, **kwargs)
    yield alg.fit(X, y)
    assert pd.DataFrame(alg.history_).partial_fit_calls.max() == max_iter

    # patience=1 would always detect a plateau, which must be rejected.
    alg = HyperbandSearchCV(model, params, patience=1, **kwargs)
    with pytest.raises(ValueError, match="always detect a plateau"):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            yield alg.fit(X, y)
def test_cv_results_order_preserved(c, s, a, b):
    """cv_results_ rows must line up with each model's final history entry."""
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}

    alg = HyperbandSearchCV(model, params, max_iter=9, random_state=42)
    yield alg.fit(X, y)

    # Latest history entry per model, keyed by model_id.
    latest = {model_id: hist[-1] for model_id, hist in alg.model_history_.items()}
    for _, row in pd.DataFrame(alg.cv_results_).iterrows():
        entry = latest[row["model_id"]]
        assert row["bracket"] == entry["bracket"]
        assert row["params"] == entry["params"]
        assert np.allclose(row["test_score"], entry["score"])
def test_predict_correct_output_dtype():
    """The lazy wrapper's predict dtype must match the estimator's."""
    X, y = make_classification(chunks=100)
    X_ddf = dd.from_dask_array(X)

    base = LinearRegression(n_jobs=1)
    base.fit(X, y)
    wrap = ParallelPostFit(base)

    base_output = base.predict(X_ddf.compute())
    wrap_output = wrap.predict(X_ddf)
    assert wrap_output.dtype == base_output.dtype
def test_verbosity_types(c, s, a, b):
    """verbose must be within [0, 1]; bools and floats in range are accepted."""
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}

    for bad in [-1.0, 1.2]:
        search = IncrementalSearchCV(model, params, verbose=bad, max_iter=3)
        with pytest.raises(ValueError, match="0 <= verbose <= 1"):
            yield search.fit(X, y)

    for ok in [0.0, 0, 1, 1.0, True, False]:
        search = IncrementalSearchCV(model, params, verbose=ok, max_iter=3)
        yield search.fit(X, y)
def test_gridsearch_func(c, s, a, b):
    """n_initial_parameters="grid" must enumerate the full parameter grid."""
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = SGDClassifier(tol=1e-3)
    params = {
        "alpha": np.logspace(-2, 10, 3),
        "l1_ratio": np.linspace(0.01, 1, 2),
    }

    search = IncrementalSearchCV(model, params, n_initial_parameters="grid")
    yield search.fit(X, y, classes=[0, 1])

    trained = {frozenset(d["params"].items()) for d in search.history_}
    expected = {frozenset(d.items()) for d in ParameterGrid(params)}
    assert trained == expected
def test_numpy_array(c, s, a, b):
    # NOTE(review): test_numpy_array is defined twice in this file; pytest
    # only collects the later definition — consider renaming one of them.
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    # Materialize to in-memory numpy arrays before fitting.
    X, y = yield c.compute([X, y])

    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {
        "alpha": np.logspace(-5, -3, 10),
        "l1_ratio": np.linspace(0, 1, 20),
    }
    search = IncrementalSearchCV(model, params, n_initial_parameters=10, max_iter=10)
    yield search.fit(X, y, classes=[0, 1])

    # Smoke test to ensure the search completed successfully.
    assert search.best_score_ > 0
async def test_warns_decay_rate(c, s, a, b):
    """The default decay_rate should warn; decay_rate=None should not."""
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)
    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()
    kwargs = dict(max_iter=5, n_initial_parameters=5)

    search = IncrementalSearchCV(model, params, **kwargs)
    match = r"deprecated since Dask-ML v1.4.0."
    with pytest.warns(FutureWarning, match=match):
        await search.fit(X, y)

    # Make sure the printed warning message works
    search = IncrementalSearchCV(model, params, decay_rate=None, **kwargs)
    await search.fit(X, y)
def test_warns_decay_rate_wanted(c, s, a, b):
    """Explicit decay_rate warns and points at InverseDecaySearchCV."""
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)
    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    search = IncrementalSearchCV(
        model, params, max_iter=5, n_initial_parameters=5, decay_rate=1
    )
    match = "decay_rate is deprecated .* Use InverseDecaySearchCV"
    with pytest.warns(FutureWarning, match=match):
        yield search.fit(X, y)

    # The old behavior is retained (without a warning) via InverseDecaySearchCV.
    search = InverseDecaySearchCV(model, params, decay_rate=1)
    yield search.fit(X, y)
def test_multiclass():
    """Wrapped predict/predict_proba are lazy and match the estimator (3 classes)."""
    X, y = make_classification(chunks=50, n_classes=3, n_informative=4)
    clf = ParallelPostFit(LogisticRegression(random_state=0))
    clf.fit(X, y)

    for method in ("predict", "predict_proba"):
        result = getattr(clf, method)(X)
        expected = getattr(clf.estimator, method)(X)
        assert isinstance(result, da.Array)
        assert_eq_ar(result, expected)
def test_search_patience_infeasible_tol(c, s, a, b):
    """With an unreachable (negative) tol, patience never stops training early."""
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    rng = check_random_state(42)
    params = {"value": rng.rand(1000)}
    model = ConstantFunction()

    max_iter = 10
    score_increase = -10
    search = IncrementalSearchCV(
        model, params, max_iter=max_iter, patience=3, tol=score_increase
    )
    yield search.fit(X, y, classes=[0, 1])

    # Every model should have trained for the full max_iter calls.
    hist = pd.DataFrame(search.history_)
    assert hist.partial_fit_calls.max() == max_iter