Code example #1
File: test_incremental.py Project: tanayag/dask-ml
def test_search_max_iter(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {"alpha": np.logspace(-2, 10, 10), "l1_ratio": np.linspace(0.01, 1, 20)}

    search = IncrementalSearchCV(model, params, n_initial_parameters=10, max_iter=1)
    yield search.fit(X, y, classes=[0, 1])
    for d in search.history_:
        assert d["partial_fit_calls"] <= 1
Code example #2
File: test_partial.py Project: andrethrill/dask-ml
def test_fit_rechunking():
    n_classes = 2
    X, y = make_classification(chunks=20, n_classes=n_classes)
    X = X.rechunk({1: 10})

    assert X.numblocks[1] > 1

    clf = Incremental(SGDClassifier(max_iter=5))
    clf.fit(X, y, classes=list(range(n_classes)))
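For reference, a sketch of the chunk structure this test sets up (assuming `make_classification`'s defaults of `n_samples=100, n_features=20`):

X, y = make_classification(chunks=20)  # X is 100 x 20, in 20-row blocks
X = X.rechunk({1: 10})                 # re-split the feature axis into 10-column blocks
print(X.numblocks)                     # (5, 2): axis 1 now has more than one block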
Code example #3
def test_warns_scores_per_fit(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)

    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    search = IncrementalSearchCV(model, params, scores_per_fit=2)
    with pytest.warns(UserWarning, match="deprecated since Dask-ML v1.4.0"):
        yield search.fit(X, y)
Code example #4
def test_no_method_raises():
    clf = ParallelPostFit(LinearRegression())
    X, y = make_classification(chunks=50)
    clf.fit(X, y)

    with pytest.raises(AttributeError) as m:
        clf.predict_proba(X)

    assert m.match("The wrapped estimator .* 'predict_proba' method.")
Code example #5
def test_smaller(c, s, a, b):
    # infinite loop
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {"alpha": [0.1, 0.5]}
    search = IncrementalSearchCV(model, params, n_initial_parameters="grid")
    yield search.fit(X, y, classes=[0, 1])
    (X_, ) = yield c.compute([X])
    search.predict(X_)
Code example #6
def test_transform(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = MiniBatchKMeans(random_state=0)
    params = {"n_clusters": [3, 4, 5], "n_init": [1, 2]}
    search = IncrementalSearchCV(model, params, n_initial_parameters="grid")
    yield search.fit(X, y)
    (X_, ) = yield c.compute([X])
    result = search.transform(X_)
    assert result.shape == (100, search.best_estimator_.n_clusters)
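The shape assertion follows from `MiniBatchKMeans.transform`, which returns each sample's distance to every cluster center, so the output has one column per cluster:

# transform output: (n_samples, n_clusters) distances to the cluster centers
# best_estimator_.n_clusters is whichever of {3, 4, 5} won the search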
Code example #7
def test_big(fit_intercept):
    X, y = make_classification(chunks=50)
    lr = LogisticRegression(fit_intercept=fit_intercept)
    lr.fit(X, y)
    lr.decision_function(X)
    lr.predict(X)
    lr.predict_proba(X)
    if fit_intercept:
        assert lr.intercept_ is not None
Code example #8
File: test_hyperband.py Project: stsievert/dask-ml
def test_min_max_iter(c, s, a, b):
    # This test makes sure Hyperband works with max_iter=1.
    # Tests for max_iter < 1 are in test_incremental.py.
    values = scipy.stats.uniform(0, 1)
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)

    max_iter = 1
    h = HyperbandSearchCV(ConstantFunction(), {"value": values}, max_iter=max_iter)
    yield h.fit(X, y)
    assert h.best_score_ > 0
Code example #9
File: test_incremental.py Project: cjnolet/dask-ml
def test_search_plateau_tol(c, s, a, b):
    model = LinearFunction(slope=1)
    params = {"foo": np.linspace(0, 1)}

    # every 3 calls, the score increases by 3. tol=1: the model improved enough
    search = IncrementalSearchCV(
        model, params, patience=3, tol=1, max_iter=10, decay_rate=0
    )
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    yield search.fit(X, y)
    assert set(search.cv_results_["partial_fit_calls"]) == {10}

    # Every 3 calls, score increases by 3. tol=4: model didn't improve enough
    search = IncrementalSearchCV(
        model, params, patience=3, tol=4, decay_rate=0, max_iter=10
    )
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    yield search.fit(X, y)
    assert set(search.cv_results_["partial_fit_calls"]) == {3}
Code example #10
def _test_verbosity(c, s, a, b, max_iter=None, verbose=False):
    # max_iter and verbose are supplied by the calling test
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    search = IncrementalSearchCV(model,
                                 params,
                                 max_iter=max_iter,
                                 verbose=verbose)
    yield search.fit(X, y)
    return search
Code example #11
def single_chunk_count_classification():
    """X, y pair for classification.

    The `X` and `y` have a single block, so chunksize is 100.
    Useful for testing `partial_fit` methods. The `X` data
    is count data (non-negative integers).
    """
    X, y = make_classification(chunks=100, random_state=0)
    X = (abs(X) * 10).astype(int)
    return X, y
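A hypothetical usage sketch for this fixture (the `MultinomialNB` pairing is an assumption; any estimator whose `partial_fit` accepts count features would do):

from sklearn.naive_bayes import MultinomialNB
from dask_ml.wrappers import Incremental

X, y = single_chunk_count_classification()
clf = Incremental(MultinomialNB())
clf.fit(X, y, classes=[0, 1])  # a single block means a single partial_fit call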
Code example #12
def Xl_blobs():
    """
    Tuple of (X, labels) for a classification task. `X`
    and `l` are both dask arrays.
    """
    X, l = make_classification(n_samples=1000,
                               n_features=4,
                               chunks=500,
                               random_state=1)
    return X, l
Code example #13
File: test_glm.py Project: jborchma/dask-ml
def test_fit_solver(solver):
    import dask_glm
    from distutils.version import LooseVersion

    if LooseVersion(dask_glm.__version__) <= "0.2.0":
        pytest.skip("FutureWarning for dask config.")

    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    lr = LogisticRegression(solver=solver)
    lr.fit(X, y)
Code example #14
def test_search_plateau_tol(c, s, a, b):
    class LinearFunction(BaseEstimator):
        def __init__(self, intercept=0, slope=1, foo=0):
            self._num_calls = 0
            self.intercept = intercept
            self.slope = slope
            self.foo = foo  # store it so sklearn's get_params/clone can find it
            super(LinearFunction, self).__init__()

        def fit(self, *args):
            return self

        def partial_fit(self, *args, **kwargs):
            self._num_calls += 1
            return self

        def score(self, *args, **kwargs):
            return self.intercept + self.slope * self._num_calls

    model = LinearFunction(slope=1)
    params = {"foo": np.linspace(0, 1)}

    # every 3 calls, the score increases by 3. tol=1: the model improved enough
    search = IncrementalSearchCV(model,
                                 params,
                                 patience=3,
                                 tol=1,
                                 max_iter=10,
                                 decay_rate=0)
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    yield search.fit(X, y)
    assert set(search.cv_results_["partial_fit_calls"]) == {10}

    # Every 3 calls, score increases by 3. tol=4: model didn't improve enough
    search = IncrementalSearchCV(model,
                                 params,
                                 patience=3,
                                 tol=4,
                                 decay_rate=0,
                                 max_iter=10)
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    yield search.fit(X, y)
    assert set(search.cv_results_["partial_fit_calls"]) == {3}
Code example #15
def test_numpy_array(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    X, y = yield c.compute([X, y])
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {
        "alpha": np.logspace(-2, 10, 10),
        "l1_ratio": np.linspace(0.01, 1, 20)
    }

    search = IncrementalSearchCV(model, params, n_initial_parameters=10)
    yield search.fit(X, y, classes=[0, 1])
Code example #16
File: test_glm.py Project: mmccarty/dask-ml
def test_fit_solver(solver):
    import dask_glm
    import packaging.version

    if packaging.version.parse(
            dask_glm.__version__) <= packaging.version.parse("0.2.0"):
        pytest.skip("FutureWarning for dask config.")

    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    lr = LogisticRegression(solver=solver)
    lr.fit(X, y)
Code example #17
def test_it_works():
    clf = ParallelPostFit(GradientBoostingClassifier())

    X, y = make_classification(n_samples=1000, chunks=100)
    clf.fit(X, y)

    assert isinstance(clf.predict(X), da.Array)
    assert isinstance(clf.predict_proba(X), da.Array)

    result = clf.score(X, y)
    expected = clf.estimator.score(X, y)
    assert result == expected
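What the assertions verify: `ParallelPostFit` delegates training to the wrapped estimator, while the post-fit methods are applied block-wise and stay lazy. A small follow-on sketch:

proba = clf.predict_proba(X)  # lazy dask array; one task per 100-row block
print(proba[:5].compute())    # nothing runs until .compute() is called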
Code example #18
def simple_example():
    X, y = make_classification(n_samples=10000, n_features=2, chunks=50)

    X = dd.from_dask_array(X, columns=["a", "b"])
    y = dd.from_dask_array(y)

    lr = LogisticRegression()
    lr.fit(X.values, y.values)

    print('Predictions =', lr.predict(X.values).compute())
    print('Probabilities =', lr.predict_proba(X.values).compute())
    print('Scores =', lr.score(X.values, y.values).compute())
Code example #19
    def test_fit(self):
        a = dpp.RobustScaler()
        b = spp.RobustScaler()

        # bigger data to make percentile more reliable
        # and not centered around 0 to make rtol work
        X, y = make_classification(n_samples=1000, chunks=200, random_state=0)
        X = X + 3

        a.fit(X)
        b.fit(X.compute())
        assert_estimator_equal(a, b, rtol=0.2)
Code example #20
File: test_incremental.py Project: Cdebus/dask-ml
async def test_model_future(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)

    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()
    model_future = await c.scatter(model)

    search = IncrementalSearchCV(model_future, params, max_iter=10)

    await search.fit(X, y, classes=[0, 1])
    assert search.history_
    assert search.best_score_ > 0
Code example #21
def test_hyperband_patience(c, s, a, b):
    # Test to make sure that specifying patience=True results in less
    # computation
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    max_iter = 27

    alg = HyperbandSearchCV(model,
                            params,
                            max_iter=max_iter,
                            patience=True,
                            random_state=0)
    yield alg.fit(X, y)

    alg_patience = max_iter // alg.aggressiveness
    actual_decisions = [b.pop("decisions") for b in alg.metadata_["brackets"]]
    paper_decisions = [b.pop("decisions") for b in alg.metadata["brackets"]]

    for paper_iter, actual_iter in zip(paper_decisions, actual_decisions):
        trimmed_paper_iter = {k for k in paper_iter if k <= alg_patience}

        # This makes sure that the algorithm is executed faithfully when
        # patience=True (and the proper decision points are preserved even if
        # other stop-on-plateau points are added)
        assert trimmed_paper_iter.issubset(set(actual_iter))

        # This makes sure models aren't trained for too long
        assert all(x <= alg_patience + 1 for x in actual_iter)

    assert alg.metadata_["partial_fit_calls"] <= alg.metadata[
        "partial_fit_calls"]
    assert alg.best_score_ >= 0.9

    max_iter = 6
    kwargs = dict(max_iter=max_iter, aggressiveness=2)
    alg = HyperbandSearchCV(model, params, patience=2, **kwargs)
    with pytest.warns(UserWarning, match="The goal of `patience`"):
        yield alg.fit(X, y)

    alg = HyperbandSearchCV(model, params, patience=2, tol=np.nan, **kwargs)
    yield alg.fit(X, y)
    assert pd.DataFrame(alg.history_).partial_fit_calls.max() == max_iter

    alg = HyperbandSearchCV(model, params, patience=2, tol=None, **kwargs)
    yield alg.fit(X, y)
    assert pd.DataFrame(alg.history_).partial_fit_calls.max() == max_iter

    alg = HyperbandSearchCV(model, params, patience=1, **kwargs)
    with pytest.raises(ValueError, match="always detect a plateau"):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            yield alg.fit(X, y)
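A note on the numbers in the first half of this test (assuming `HyperbandSearchCV`'s default `aggressiveness=3`):

# alg_patience = max_iter // aggressiveness = 27 // 3 = 9
# so with patience=True every model gets at most alg_patience + 1 = 10 calls,
# while the paper's decision points at <= 9 calls must still be honored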
Code example #22
def test_cv_results_order_preserved(c, s, a, b):
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    alg = HyperbandSearchCV(model, params, max_iter=9, random_state=42)
    yield alg.fit(X, y)

    info = {k: v[-1] for k, v in alg.model_history_.items()}
    for _, row in pd.DataFrame(alg.cv_results_).iterrows():
        model_info = info[row["model_id"]]
        assert row["bracket"] == model_info["bracket"]
        assert row["params"] == model_info["params"]
        assert np.allclose(row["test_score"], model_info["score"])
Code example #23
def test_predict_correct_output_dtype():
    X, y = make_classification(chunks=100)
    X_ddf = dd.from_dask_array(X)

    base = LinearRegression(n_jobs=1)
    base.fit(X, y)

    wrap = ParallelPostFit(base)

    base_output = base.predict(X_ddf.compute())
    wrap_output = wrap.predict(X_ddf)

    assert wrap_output.dtype == base_output.dtype
Code example #24
File: test_incremental.py Project: Cdebus/dask-ml
def test_verbosity_types(c, s, a, b):
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}

    for verbose in [-1.0, 1.2]:
        search = IncrementalSearchCV(model, params, verbose=verbose, max_iter=3)
        with pytest.raises(ValueError, match="0 <= verbose <= 1"):
            yield search.fit(X, y)

    for verbose in [0.0, 0, 1, 1.0, True, False]:
        search = IncrementalSearchCV(model, params, verbose=verbose, max_iter=3)
        yield search.fit(X, y)
Code example #25
File: test_incremental.py Project: Cdebus/dask-ml
def test_gridsearch_func(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))

    model = SGDClassifier(tol=1e-3)

    params = {"alpha": np.logspace(-2, 10, 3), "l1_ratio": np.linspace(0.01, 1, 2)}

    search = IncrementalSearchCV(model, params, n_initial_parameters="grid")
    yield search.fit(X, y, classes=[0, 1])

    assert {frozenset(d["params"].items()) for d in search.history_} == {
        frozenset(d.items()) for d in ParameterGrid(params)
    }
Code example #26
File: test_incremental.py Project: Cdebus/dask-ml
def test_numpy_array(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    X, y = yield c.compute([X, y])
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {
        "alpha": np.logspace(-5, -3, 10),
        "l1_ratio": np.linspace(0, 1, 20),
    }

    search = IncrementalSearchCV(model, params, n_initial_parameters=10, max_iter=10)
    yield search.fit(X, y, classes=[0, 1])

    # smoke test to ensure search completed successfully
    assert search.best_score_ > 0
Code example #27
async def test_warns_decay_rate(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)

    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    kwargs = dict(max_iter=5, n_initial_parameters=5)
    search = IncrementalSearchCV(model, params, **kwargs)
    match = r"deprecated since Dask-ML v1.4.0."
    with pytest.warns(FutureWarning, match=match):
        await search.fit(X, y)

    # Make sure the printed warning message works
    search = IncrementalSearchCV(model, params, decay_rate=None, **kwargs)
    await search.fit(X, y)
Code example #28
def test_warns_decay_rate_wanted(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)

    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    search = IncrementalSearchCV(
        model, params, max_iter=5, n_initial_parameters=5, decay_rate=1
    )
    match = "decay_rate is deprecated .* Use InverseDecaySearchCV"
    with pytest.warns(FutureWarning, match=match):
        yield search.fit(X, y)

    # Make sure old behavior is retained w/o warning
    search = InverseDecaySearchCV(model, params, decay_rate=1)
    yield search.fit(X, y)
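Read together with the previous example, the migration path appears to be: passing any explicit `decay_rate` to `IncrementalSearchCV` raises a `FutureWarning`, `decay_rate=None` opts out, and `InverseDecaySearchCV` keeps the old adaptive behavior. A sketch:

# before (warns: "decay_rate is deprecated ... Use InverseDecaySearchCV"):
search = IncrementalSearchCV(model, params, decay_rate=1)
# after (same behavior, no warning):
search = InverseDecaySearchCV(model, params, decay_rate=1)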
Code example #29
def test_multiclass():
    X, y = make_classification(chunks=50, n_classes=3, n_informative=4)
    clf = ParallelPostFit(LogisticRegression(random_state=0))

    clf.fit(X, y)
    result = clf.predict(X)
    expected = clf.estimator.predict(X)

    assert isinstance(result, da.Array)
    assert_eq_ar(result, expected)

    result = clf.predict_proba(X)
    expected = clf.estimator.predict_proba(X)

    assert isinstance(result, da.Array)
    assert_eq_ar(result, expected)
Code example #30
File: test_incremental.py Project: Cdebus/dask-ml
def test_search_patience_infeasible_tol(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))

    rng = check_random_state(42)
    params = {"value": rng.rand(1000)}
    model = ConstantFunction()

    max_iter = 10
    score_increase = -10
    search = IncrementalSearchCV(
        model, params, max_iter=max_iter, patience=3, tol=score_increase,
    )
    yield search.fit(X, y, classes=[0, 1])

    hist = pd.DataFrame(search.history_)
    assert hist.partial_fit_calls.max() == max_iter
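Why the final assertion holds: `tol=-10` means a model only plateaus if its score rises by less than -10 over a `patience=3` window, which can never happen, so early stopping never triggers:

# improvement >= -10 over any window is always true
# -> no plateau -> hist.partial_fit_calls.max() == max_iter == 10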