def test_pipeline_raises():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)

    pipe = Pipeline([('step1', MockClassifier()),
                     ('step2', MockClassifier())])

    grid = {'step3__parameter': [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False)
    with pytest.raises(ValueError):
        gs.fit(X, y)

    grid = {'steps': [[('one', MockClassifier()), ('two', MockClassifier())]]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False)
    with pytest.raises(NotImplementedError):
        gs.fit(X, y)

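# ``MockClassifier`` is a tiny test estimator defined elsewhere in this module.
# A rough sketch of the behaviour these tests rely on (hypothetical, for
# reference only -- the real definition may differ):
#
#     class MockClassifier(BaseEstimator):
#         def __init__(self, foo_param=0):
#             self.foo_param = foo_param
#
#         def fit(self, X, y):
#             return self
#
#         def score(self, X=None, y=None):
#             # scores 1.0 once foo_param > 1, so larger grids produce ties
#             return 1.0 if self.foo_param > 1 else 0.0
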
def test_bad_error_score():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]},
                          error_score='badparam')

    with pytest.raises(ValueError):
        gs.fit(X, y)

def test_feature_union_raises():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)

    union = FeatureUnion([('tr0', MockClassifier()),
                          ('tr1', MockClassifier())])
    pipe = Pipeline([('union', union), ('est', MockClassifier())])

    grid = {'union__tr2__parameter': [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False)
    with pytest.raises(ValueError):
        gs.fit(X, y)

    grid = {'union__transformer_list': [[('one', MockClassifier())]]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False)
    with pytest.raises(NotImplementedError):
        gs.fit(X, y)

def test_scheduler_param_distributed(loop):
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop, set_as_default=False) as client:
            # a distributed Client instance can be passed directly as the scheduler
            gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]},
                                  cv=3, scheduler=client)
            gs.fit(X, y)

def test_grid_search_allows_nans():
    # Test dcv.GridSearchCV with Imputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    dcv.GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)

def test_pipeline_fit_failure():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    pipe = Pipeline([('bad', FailingClassifier()),
                     ('good1', MockClassifier()),
                     ('good2', MockClassifier())])

    grid = {'bad__parameter': [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False)

    # Check that failure raises if error_score is `'raise'`
    with pytest.raises(ValueError):
        gs.fit(X, y)

    # Check that grid scores were set to error_score on failure
    gs.error_score = float('nan')
    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)
    check_scores_all_nan(gs, 'bad__parameter')

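# ``check_scores_all_nan`` is a helper defined elsewhere in this module. A
# rough sketch of the check it performs (hypothetical, for reference only --
# the real helper may index cv_results_ differently):
#
#     def check_scores_all_nan(gs, bad_param, score_key='score'):
#         results = gs.cv_results_
#         for i, value in enumerate(results['param_' + bad_param]):
#             if value == FailingClassifier.FAILING_PARAMETER:
#                 # every per-split test score for a failing candidate should
#                 # have been replaced by the NaN error_score
#                 assert all(np.isnan(results['split%d_test_%s' % (s, score_key)][i])
#                            for s in range(gs.n_splits_))
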
def test_trivial_cv_results_attr():
    # Test search over a "grid" with only one point.
    # Non-regression test: grid_scores_ wouldn't be set by dcv.GridSearchCV.
    clf = MockClassifier()
    grid_search = dcv.GridSearchCV(clf, {'foo_param': [1]})
    grid_search.fit(X, y)
    assert hasattr(grid_search, "cv_results_")

    random_search = dcv.RandomizedSearchCV(clf, {'foo_param': [0]}, n_iter=1)
    random_search.fit(X, y)
    assert hasattr(random_search, "cv_results_")

def test_scheduler_param(scheduler, n_jobs, get):
    if scheduler == 'multiprocessing':
        mp = pytest.importorskip('dask.multiprocessing')
        get = mp.get

    assert _normalize_scheduler(scheduler, n_jobs) is get

    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]}, cv=3,
                          scheduler=scheduler, n_jobs=n_jobs)
    gs.fit(X, y)

def test_cache_cv():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    X2 = X.view(CountTakes)
    gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]}, cv=3,
                          cache_cv=False, scheduler='sync')
    gs.fit(X2, y)
    assert X2.count == 2 * 3 * 3  # (1 train + 1 test) * n_params * n_splits

    X2 = X.view(CountTakes)
    assert X2.count == 0
    gs.cache_cv = True
    gs.fit(X2, y)
    assert X2.count == 2 * 3  # (1 test + 1 train) * n_splits

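# ``CountTakes`` is a small ndarray subclass defined elsewhere in this module
# that counts how many times data is extracted from the array. A rough sketch
# (hypothetical, for reference only):
#
#     class CountTakes(np.ndarray):
#         count = 0
#
#         def take(self, *args, **kwargs):
#             self.count += 1
#             return super(CountTakes, self).take(*args, **kwargs)
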
def test_pickle():
    # Test that a fit search can be pickled
    clf = MockClassifier()
    grid_search = dcv.GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=True)
    grid_search.fit(X, y)
    grid_search_pickled = pickle.loads(pickle.dumps(grid_search))
    assert_array_almost_equal(grid_search.predict(X),
                              grid_search_pickled.predict(X))

    random_search = dcv.RandomizedSearchCV(clf, {'foo_param': [1, 2, 3]},
                                           refit=True, n_iter=3)
    random_search.fit(X, y)
    random_search_pickled = pickle.loads(pickle.dumps(random_search))
    assert_array_almost_equal(random_search.predict(X),
                              random_search_pickled.predict(X))

def test_feature_union_fit_failure_multiple_metrics():
    scoring = {"score_1": _passthrough_scorer, "score_2": _passthrough_scorer}
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    pipe = Pipeline([('union', FeatureUnion([('good', MockClassifier()),
                                             ('bad', FailingClassifier())],
                                            transformer_weights={'bad': 0.5})),
                     ('clf', MockClassifier())])

    grid = {'union__bad__parameter': [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False, scoring=scoring)

    # Check that failure raises if error_score is `'raise'`
    with pytest.raises(ValueError):
        gs.fit(X, y)

    # Check that grid scores were set to error_score on failure
    gs.error_score = float('nan')
    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)

    for key in scoring:
        check_scores_all_nan(gs, 'union__bad__parameter', score_key=key)

def test_no_refit():
    # Test that GSCV can be used for model selection alone without refitting
    clf = MockClassifier()
    grid_search = dcv.GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=False)
    grid_search.fit(X, y)
    assert (not hasattr(grid_search, "best_estimator_") and
            hasattr(grid_search, "best_index_") and
            hasattr(grid_search, "best_params_"))

    # Make sure predict/transform etc. raise a meaningful error message
    for fn_name in ('predict', 'predict_proba', 'predict_log_proba',
                    'transform', 'inverse_transform'):
        with pytest.raises(NotFittedError) as exc:
            getattr(grid_search, fn_name)(X)
        assert (('refit=False. %s is available only after refitting on the '
                 'best parameters' % fn_name) in str(exc.value))

def test_grid_search():
    # Test that the best estimator contains the right value for foo_param
    clf = MockClassifier()
    grid_search = dcv.GridSearchCV(clf, {'foo_param': [1, 2, 3]})
    # make sure it selects the smallest parameter in case of ties
    grid_search.fit(X, y)
    assert grid_search.best_estimator_.foo_param == 2

    assert_array_equal(grid_search.cv_results_["param_foo_param"].data,
                       [1, 2, 3])

    # Smoke test the score etc:
    grid_search.score(X, y)
    grid_search.predict_proba(X)
    grid_search.decision_function(X)
    grid_search.transform(X)

    # Test exception handling on scoring
    grid_search.scoring = 'sklearn'
    with pytest.raises(ValueError):
        grid_search.fit(X, y)