Example No. 1
def test_grid_search_allows_nans():
    # Test dcv.GridSearchCV with SimpleImputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]

    imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
    p = Pipeline([("imputer", imputer), ("classifier", MockClassifier())])
    dcv.GridSearchCV(p, {"classifier__foo_param": [1, 2, 3]}, cv=2).fit(X, y)
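
MockClassifier is a helper from the dask-ml test suite and is not defined on this page. As a hedged sketch (an assumption, not the project's actual implementation), it can be read as a minimal scikit-learn-style estimator whose only hyperparameter is foo_param and whose score depends solely on that parameter:

from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np

class MockClassifier(BaseEstimator, ClassifierMixin):
    # Sketch: accepts any data, records classes_, and ranks candidates
    # purely by foo_param so the search has deterministic scores.
    def __init__(self, foo_param=0):
        self.foo_param = foo_param

    def fit(self, X, y):
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        return np.zeros(len(X), dtype=int)

    def score(self, X=None, y=None):
        return 1.0 if self.foo_param > 1 else 0.0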
Example No. 2
def test_grid_search_precomputed_kernel_error_nonsquare():
    # Test that grid search returns an error with a non-square precomputed
    # training kernel matrix
    K_train = np.zeros((10, 20))
    y_train = np.ones((10, ))
    clf = SVC(kernel="precomputed")
    cv = dcv.GridSearchCV(clf, {"C": [0.1, 1.0]})
    with pytest.raises(ValueError):
        cv.fit(K_train, y_train)
Example No. 3
def test_grid_search_failing_classifier():
    X, y = make_classification(n_samples=20, n_features=10, random_state=0)
    clf = FailingClassifier()

    # refit=False because we want to test the behaviour of the grid search part
    gs = dcv.GridSearchCV(clf, [{
        'parameter': [0, 1, 2]
    }],
                          scoring='accuracy',
                          refit=False,
                          error_score=0.0)

    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)

    n_candidates = len(gs.cv_results_['params'])

    # Ensure that grid scores were set to zero as required for those fits
    # that are expected to fail.
    def get_cand_scores(i):
        return np.array(
            [gs.cv_results_['split%d_test_score' % s][i]
             for s in range(gs.n_splits_)])

    assert all((np.all(get_cand_scores(cand_i) == 0.0)
                for cand_i in range(n_candidates)
                if gs.cv_results_['param_parameter'][cand_i] ==
                FailingClassifier.FAILING_PARAMETER))

    gs = dcv.GridSearchCV(clf, [{
        'parameter': [0, 1, 2]
    }],
                          scoring='accuracy',
                          refit=False,
                          error_score=float('nan'))

    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)

    n_candidates = len(gs.cv_results_['params'])
    assert all(
        np.all(np.isnan(get_cand_scores(cand_i)))
        for cand_i in range(n_candidates)
        if gs.cv_results_['param_parameter'][cand_i] ==
        FailingClassifier.FAILING_PARAMETER)
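
FailingClassifier is likewise a helper whose definition is not shown. Judging from how error_score and FitFailedWarning are exercised above, a plausible sketch (assumed, including the sentinel value) is an estimator that raises ValueError whenever its parameter equals FAILING_PARAMETER:

from sklearn.base import BaseEstimator

class FailingClassifier(BaseEstimator):
    FAILING_PARAMETER = 2  # assumed sentinel value

    def __init__(self, parameter=None):
        self.parameter = parameter

    def fit(self, X, y=None):
        # Fail deliberately for one of the grid points.
        if self.parameter == self.FAILING_PARAMETER:
            raise ValueError("Failing classifier failed as required")
        return self

    def predict(self, X):
        return [0] * len(X)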
Example No. 4
def test_grid_search_allows_nans():
    # Test dcv.GridSearchCV with Imputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    dcv.GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
Example No. 5
def test_search_cv_results_none_param():
    X, y = [[1], [2], [3], [4], [5]], [0, 0, 0, 0, 1]
    estimators = (DecisionTreeRegressor(), DecisionTreeClassifier())
    est_parameters = {"random_state": [0, None]}
    cv = KFold(random_state=0, n_splits=2, shuffle=True)

    for est in estimators:
        grid_search = dcv.GridSearchCV(est, est_parameters, cv=cv).fit(X, y)
        assert_array_equal(grid_search.cv_results_["param_random_state"],
                           [0, None])
Example No. 6
def test_scheduler_param(scheduler, n_jobs):
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    gs = dcv.GridSearchCV(
        MockClassifier(),
        {"foo_param": [0, 1, 2]},
        cv=3,
        scheduler=scheduler,
        n_jobs=n_jobs,
    )
    gs.fit(X, y)
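
The scheduler and n_jobs arguments come from pytest parametrization that this page omits. A hypothetical parametrization (the values are an assumption; the real fixtures in dask-ml may differ) could look like:

import pytest

@pytest.mark.parametrize(
    "scheduler, n_jobs",
    [("synchronous", 1), ("threading", 4), ("multiprocessing", 2)],
)
def test_scheduler_param(scheduler, n_jobs):
    ...  # body as in the example above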
Example No. 7
def test_search_basic(xy_classification):
    X, y = xy_classification
    param_grid = {'class_weight': [None, 'balanced']}

    a = dms.GridSearchCV(SVC(kernel='rbf'), param_grid)
    a.fit(X, y)

    param_dist = {'C': stats.uniform}
    b = dms.RandomizedSearchCV(SVC(kernel='rbf'), param_dist)
    b.fit(X, y)
Example No. 8
def test_search_basic(xy_classification):
    X, y = xy_classification
    param_grid = {"class_weight": [None, "balanced"]}

    a = dms.GridSearchCV(SVC(kernel="rbf", gamma=0.1), param_grid)
    a.fit(X, y)

    param_dist = {"C": stats.uniform}
    b = dms.RandomizedSearchCV(SVC(kernel="rbf", gamma=0.1), param_dist)
    b.fit(X, y)
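
Both test_search_basic variants rely on an xy_classification fixture that is not shown here. A plausible definition, offered purely as an assumption, is a small synthetic binary classification dataset:

import pytest
from sklearn.datasets import make_classification

@pytest.fixture
def xy_classification():
    # Hypothetical fixture: small, reproducible classification data.
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    return X, y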
Example No. 9
def test_grid_search_sparse():
    # Test that grid search works with both dense and sparse matrices
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = dcv.GridSearchCV(clf, {"C": [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = dcv.GridSearchCV(clf, {"C": [0.1, 1.0]})
    cv.fit(X_[:180].tocoo(), y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert np.mean(y_pred == y_pred2) >= 0.9
    assert C == C2
Example No. 10
def test_y_as_list():
    # Pass y as list in dcv.GridSearchCV
    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)

    clf = CheckingClassifier(check_y=lambda x: isinstance(x, list))
    cv = KFold(n_splits=3)
    grid_search = dcv.GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=cv)
    grid_search.fit(X, y.tolist()).score(X, y)
    assert hasattr(grid_search, "cv_results_")
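
CheckingClassifier comes from scikit-learn's testing utilities: it runs user-supplied check_X / check_y callables on the data it receives before "fitting". A simplified stand-in (a sketch, not the real class) could look like:

from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np

class SimpleCheckingClassifier(BaseEstimator, ClassifierMixin):
    # Sketch: assert properties of X and y at fit time, learn nothing.
    def __init__(self, check_X=None, check_y=None, foo_param=0):
        self.check_X = check_X
        self.check_y = check_y
        self.foo_param = foo_param

    def fit(self, X, y):
        if self.check_X is not None:
            assert self.check_X(X)
        if self.check_y is not None:
            assert self.check_y(y)
        self.classes_ = np.unique(y)
        return self

    def score(self, X=None, y=None):
        return 1.0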
Example No. 11
def test_grid_search_with_multioutput_data():
    # Test search with multi-output estimator

    X, y = make_multilabel_classification(return_indicator=True,
                                          random_state=0)

    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(random_state=0, n_splits=3, shuffle=True)

    estimators = [
        DecisionTreeRegressor(random_state=0),
        DecisionTreeClassifier(random_state=0),
    ]

    scoring = sklearn.metrics.make_scorer(sklearn.metrics.roc_auc_score,
                                          average="weighted")
    # Test with grid search cv
    for est in estimators:
        grid_search = dcv.GridSearchCV(est,
                                       est_parameters,
                                       cv=cv,
                                       scoring=scoring)
        grid_search.fit(X, y)
        res_params = grid_search.cv_results_["params"]
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])

            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = scoring(est, X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    grid_search.cv_results_["split%d_test_score" % i][cand_i],
                )

    # Test with a randomized search
    for est in estimators:
        random_search = dcv.RandomizedSearchCV(est,
                                               est_parameters,
                                               cv=cv,
                                               n_iter=3,
                                               scoring=scoring)
        random_search.fit(X, y)
        res_params = random_search.cv_results_["params"]
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])

            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = scoring(est, X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    random_search.cv_results_["split%d_test_score" %
                                              i][cand_i],
                )
Example No. 12
def test_trivial_cv_results_attr():
    # Test search over a "grid" with only one point.
    # Non-regression test: grid_scores_ wouldn't be set by dcv.GridSearchCV.
    clf = MockClassifier()
    grid_search = dcv.GridSearchCV(clf, {"foo_param": [1]})
    grid_search.fit(X, y)
    assert hasattr(grid_search, "cv_results_")

    random_search = dcv.RandomizedSearchCV(clf, {"foo_param": [0]}, n_iter=1)
    random_search.fit(X, y)
    assert hasattr(random_search, "cv_results_")
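
Several of these examples (test_trivial_cv_results_attr above, and test_gridsearch_nd and test_pickle below) fit against module-level X and y arrays defined elsewhere in the test module. Their exact values are not shown on this page; a purely hypothetical stand-in that would satisfy MockClassifier is:

import numpy as np

# Hypothetical module-level data; the real test module defines its own.
X = np.arange(100).reshape(10, 10)
y = np.array([0] * 5 + [1] * 5)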
Example No. 13
def test_search_train_scores_set_to_false():
    X = np.arange(6).reshape(6, -1)
    y = [0, 0, 0, 1, 1, 1]
    clf = LinearSVC(random_state=0)

    gs = dcv.GridSearchCV(clf,
                          param_grid={"C": [0.1, 0.2]},
                          return_train_score=False)
    gs.fit(X, y)
    for key in gs.cv_results_:
        assert not key.endswith("train_score")
Example No. 14
def test_refit():
    # Regression test for bug in refitting
    # Simulates re-fitting a broken estimator; this used to break with
    # sparse SVMs.
    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)

    clf = dcv.GridSearchCV(
        BrokenClassifier(), [{"parameter": [0, 1]}], scoring="accuracy", refit=True
    )
    clf.fit(X, y)
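
BrokenClassifier is not defined on this page either. Given the comment about re-fitting, a hedged guess is an estimator that refuses to be fit twice on the same instance, so the test only passes when refit clones the estimator before the final fit:

from sklearn.base import BaseEstimator
import numpy as np

class BrokenClassifier(BaseEstimator):
    # Assumed sketch: a second fit() on the same object is an error,
    # which exercises the clone-before-refit behaviour of the search.
    def __init__(self, parameter=None):
        self.parameter = parameter

    def fit(self, X, y):
        assert not hasattr(self, "has_been_fit_"), "estimator was fit twice"
        self.has_been_fit_ = True
        return self

    def predict(self, X):
        return np.zeros(X.shape[0])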
Example No. 15
def test_gridsearch_nd():
    # Pass n-dimensional X and y arrays to dcv.GridSearchCV
    X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2)
    y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11)
    clf = CheckingClassifier(
        check_X=lambda x: x.shape[1:] == (5, 3, 2),
        check_y=lambda x: x.shape[1:] == (7, 11),
    )
    grid_search = dcv.GridSearchCV(clf, {"foo_param": [1, 2, 3]})
    grid_search.fit(X_4d, y_3d).score(X, y)
    assert hasattr(grid_search, "cv_results_")
Example No. 16
def test_scheduler_param_distributed(loop):  # noqa
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    with cluster() as (s, [a, b]):
        with Client(s["address"], loop=loop) as client:
            gs = dcv.GridSearchCV(MockClassifier(), {"foo_param": [0, 1, 2]}, cv=3)
            gs.fit(X, y)

            def f(dask_scheduler):
                return len(dask_scheduler.transition_log)

            assert client.run_on_scheduler(f)  # some work happened on cluster
Example No. 17
def test_grid_search_one_grid_point():
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
    param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]}

    clf = SVC()
    cv = dcv.GridSearchCV(clf, param_dict)
    cv.fit(X_, y_)

    clf = SVC(C=1.0, kernel="rbf", gamma=0.1)
    clf.fit(X_, y_)

    assert_array_equal(clf.dual_coef_, cv.best_estimator_.dual_coef_)
Example No. 18
def test_visualize():
    pytest.importorskip('graphviz')

    X, y = make_classification(n_samples=100,
                               n_classes=2,
                               flip_y=.2,
                               random_state=0)
    clf = SVC(random_state=0)
    grid = {'C': [.1, .5, .9]}
    gs = dcv.GridSearchCV(clf, grid).fit(X, y)

    assert hasattr(gs, 'dask_graph_')

    with tmpdir() as d:
        gs.visualize(filename=os.path.join(d, 'mydask'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))

    # Doesn't work if not fitted
    gs = dcv.GridSearchCV(clf, grid)
    with pytest.raises(NotFittedError):
        gs.visualize()
Example No. 19
def test_visualize():
    pytest.importorskip("graphviz")

    X, y = make_classification(n_samples=100,
                               n_classes=2,
                               flip_y=0.2,
                               random_state=0)
    clf = SVC(random_state=0, gamma="auto")
    grid = {"C": [0.1, 0.5, 0.9]}
    gs = dcv.GridSearchCV(clf, grid).fit(X, y)

    assert hasattr(gs, "dask_graph_")

    with tmpdir() as d:
        gs.visualize(filename=os.path.join(d, "mydask"))
        assert os.path.exists(os.path.join(d, "mydask.png"))

    # Doesn't work if not fitted
    gs = dcv.GridSearchCV(clf, grid)
    with pytest.raises(NotFittedError):
        gs.visualize()
Example No. 20
def test_gridsearch_no_predict():
    # test grid-search with an estimator without predict.
    # slight duplication of a test from KDE
    def custom_scoring(estimator, X):
        return 42 if estimator.bandwidth == .1 else 0

    X, _ = make_blobs(cluster_std=.1, random_state=1, centers=[[0, 1], [1, 0], [0, 0]])
    search = dcv.GridSearchCV(
        KernelDensity(), param_grid=dict(bandwidth=[.01, .1, 1]), scoring=custom_scoring
    )
    search.fit(X)
    assert search.best_params_["bandwidth"] == .1
    assert search.best_score_ == 42
Example No. 21
def test_cv_multiplemetrics_requires_refit_metric():
    X, y = make_classification(random_state=0)

    param_grid = {"max_depth": [1, 5]}
    a = dcv.GridSearchCV(
        RandomForestClassifier(n_estimators=10),
        param_grid,
        refit=True,
        scoring={"score1": "accuracy", "score2": "accuracy"},
    )

    with pytest.raises(ValueError):
        a.fit(X, y)
Example No. 22
def test_cv_multiplemetrics_requires_refit_metric():
    X, y = make_classification(random_state=0)

    param_grid = {'max_depth': [1, 5]}
    a = dcv.GridSearchCV(RandomForestClassifier(),
                         param_grid,
                         refit=True,
                         scoring={
                             'score1': 'accuracy',
                             'score2': 'accuracy'
                         })

    with pytest.raises(ValueError):
        a.fit(X, y)
Example No. 23
def test_classes__property():
    # Test that classes_ property matches best_estimator_.classes_
    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)
    Cs = [0.1, 1, 10]

    grid_search = dcv.GridSearchCV(LinearSVC(random_state=0), {"C": Cs})
    grid_search.fit(X, y)
    assert_array_equal(grid_search.best_estimator_.classes_, grid_search.classes_)

    # Test that regressors do not have a classes_ attribute
    grid_search = dcv.GridSearchCV(Ridge(), {"alpha": [1.0, 2.0]})
    grid_search.fit(X, y)
    assert not hasattr(grid_search, "classes_")

    # Test that the grid searcher has no classes_ attribute before it's fit
    grid_search = dcv.GridSearchCV(LinearSVC(random_state=0), {"C": Cs})
    assert not hasattr(grid_search, "classes_")

    # Test that the grid searcher has no classes_ attribute without a refit
    grid_search = dcv.GridSearchCV(LinearSVC(random_state=0), {"C": Cs}, refit=False)
    grid_search.fit(X, y)
    assert not hasattr(grid_search, "classes_")
Example No. 24
def test_grid_search_bad_param_grid():
    param_dict = {"C": 1.0}
    clf = SVC()

    with pytest.raises(ValueError):
        dcv.GridSearchCV(clf, param_dict)

    param_dict = {"C": []}
    clf = SVC()

    with pytest.raises(ValueError):
        dcv.GridSearchCV(clf, param_dict)

    param_dict = {"C": "1,2,3"}
    clf = SVC()

    with pytest.raises(ValueError):
        dcv.GridSearchCV(clf, param_dict)

    param_dict = {"C": np.ones(6).reshape(3, 2)}
    clf = SVC()
    with pytest.raises(ValueError):
        dcv.GridSearchCV(clf, param_dict)
Example No. 25
def test_return_train_score_warn():
    # Test that warnings are raised. Will be removed in sklearn 0.21
    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)
    X = (X - X.mean(0)) / X.std(0)  # help convergence
    grid = {"C": [0.1, 0.5]}

    for val in [True, False]:
        est = dcv.GridSearchCV(LinearSVC(random_state=0, tol=0.5),
                               grid,
                               return_train_score=val)
        with pytest.warns(None) as warns:
            results = est.fit(X, y).cv_results_
        assert not warns
        assert type(results) is dict

    est = dcv.GridSearchCV(LinearSVC(random_state=0), grid)
    with pytest.warns(None) as warns:
        results = est.fit(X, y).cv_results_
    assert not warns

    train_keys = {
        "split0_train_score",
        "split1_train_score",
        "split2_train_score",
        "mean_train_score",
        "std_train_score",
    }

    for key in results:
        if key in train_keys:
            with pytest.warns(FutureWarning):
                results[key]
        else:
            with pytest.warns(None) as warns:
                results[key]
            assert not warns
Example No. 26
def test_grid_search_failing_classifier_raise():
    X, y = make_classification(n_samples=20, n_features=10, random_state=0)
    clf = FailingClassifier()

    # refit=False because we want to test the behaviour of the grid search part
    gs = dcv.GridSearchCV(clf, [{
        'parameter': [0, 1, 2]
    }],
                          scoring='accuracy',
                          refit=False,
                          error_score='raise')

    # FailingClassifier issues a ValueError so this is what we look for.
    with pytest.raises(ValueError):
        gs.fit(X, y)
Example No. 27
def test_cache_cv():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    X2 = X.view(CountTakes)
    gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]},
                          cv=3,
                          cache_cv=False,
                          scheduler='sync')
    gs.fit(X2, y)
    assert X2.count == 2 * 3 * 3  # (1 train + 1 test) * n_params * n_splits

    X2 = X.view(CountTakes)
    assert X2.count == 0
    gs.cache_cv = True
    gs.fit(X2, y)
    assert X2.count == 2 * 3  # (1 test + 1 train) * n_splits
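
CountTakes is a helper numpy.ndarray subclass (not shown on this page) used to count how many times rows are materialized for a CV split. A minimal sketch under that assumption:

import numpy as np

class CountTakes(np.ndarray):
    # Assumed sketch: count calls to .take(), which is how train/test rows
    # are extracted from an ndarray, so the cache_cv assertions can compare
    # how often the data was sliced with and without caching.
    count = 0

    def take(self, *args, **kwargs):
        self.count += 1
        return super().take(*args, **kwargs)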
Example No. 28
def test_gridsearch_with_arraylike_fit_param(cache_cv):
    # https://github.com/dask/dask-ml/issues/319
    X, y = make_classification(random_state=0)
    param_grid = {"foo_param": [0.0001, 0.1]}

    a = dcv.GridSearchCV(
        MockClassifierWithFitParam(),
        param_grid,
        cv=3,
        refit=False,
        cache_cv=cache_cv,
    )
    b = GridSearchCV(MockClassifierWithFitParam(), param_grid, cv=3, refit=False)

    b.fit(X, y, mock_fit_param=[0, 1])
    a.fit(X, y, mock_fit_param=[0, 1])
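
MockClassifierWithFitParam is another helper whose definition is omitted; presumably it is a MockClassifier variant whose fit() requires an array-like keyword argument, which is exactly the fit-parameter routing this test compares against scikit-learn's GridSearchCV. A sketch under that assumption:

from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np

class MockClassifierWithFitParam(BaseEstimator, ClassifierMixin):
    # Assumed sketch: refuse to fit unless mock_fit_param is provided,
    # so the search must forward the fit parameter to every CV fit.
    def __init__(self, foo_param=0):
        self.foo_param = foo_param

    def fit(self, X, y, mock_fit_param=None):
        if mock_fit_param is None:
            raise ValueError("mock_fit_param is required")
        self.classes_ = np.unique(y)
        return self

    def score(self, X=None, y=None):
        return 1.0 if self.foo_param > 1 else 0.0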
Example No. 29
def test_pickle():
    # Test that a fit search can be pickled
    clf = MockClassifier()
    grid_search = dcv.GridSearchCV(clf, {"foo_param": [1, 2, 3]}, refit=True)
    grid_search.fit(X, y)
    grid_search_pickled = pickle.loads(pickle.dumps(grid_search))
    assert_array_almost_equal(grid_search.predict(X),
                              grid_search_pickled.predict(X))

    random_search = dcv.RandomizedSearchCV(clf, {"foo_param": [1, 2, 3]},
                                           refit=True,
                                           n_iter=3)
    random_search.fit(X, y)
    random_search_pickled = pickle.loads(pickle.dumps(random_search))
    assert_array_almost_equal(random_search.predict(X),
                              random_search_pickled.predict(X))
Example No. 30
def test_grid_search_with_multioutput_data():
    # Test search with multi-output estimator

    X, y = make_multilabel_classification(return_indicator=True,
                                          random_state=0)

    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(random_state=0)

    estimators = [
        DecisionTreeRegressor(random_state=0),
        DecisionTreeClassifier(random_state=0)
    ]

    # Test with grid search cv
    for est in estimators:
        grid_search = dcv.GridSearchCV(est, est_parameters, cv=cv)
        grid_search.fit(X, y)
        res_params = grid_search.cv_results_['params']
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])

            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    grid_search.cv_results_['split%d_test_score' % i][cand_i])

    # Test with a randomized search
    for est in estimators:
        random_search = dcv.RandomizedSearchCV(est,
                                               est_parameters,
                                               cv=cv,
                                               n_iter=3)
        random_search.fit(X, y)
        res_params = random_search.cv_results_['params']
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])

            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    random_search.cv_results_['split%d_test_score' %
                                              i][cand_i])