def test_feature_union_fit_failure_multiple_metrics():
    scoring = {"score_1": _passthrough_scorer, "score_2": _passthrough_scorer}
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    pipe = Pipeline(
        [
            (
                "union",
                FeatureUnion(
                    [("good", MockClassifier()), ("bad", FailingClassifier())],
                    transformer_weights={"bad": 0.5},
                ),
            ),
            ("clf", MockClassifier()),
        ]
    )
    grid = {"union__bad__parameter": [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False, scoring=scoring)

    # Check that failure raises if error_score is `'raise'`
    with pytest.raises(ValueError):
        gs.fit(X, y)

    # Check that grid scores were set to error_score on failure
    gs.error_score = float("nan")
    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)
    for key in scoring:
        check_scores_all_nan(gs, "union__bad__parameter", score_key=key)

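# For reference, a minimal sketch of the `check_scores_all_nan` helper used by
# the failure tests here. This is an assumption about its shape, not the real
# implementation: it relies only on the standard `cv_results_` layout
# ("param_<name>", "split<i>_test_<key>") and checks the candidates whose fit
# fails; the actual helper may cover the predict/score failures as well.
def check_scores_all_nan(gs, bad_param, score_key="score"):
    params = gs.cv_results_["param_" + bad_param]
    bad = np.asarray(params == FailingClassifier.FAILING_PARAMETER)
    for i in range(gs.n_splits_):
        split_scores = gs.cv_results_["split%d_test_%s" % (i, score_key)]
        assert np.all(np.isnan(split_scores[bad]))
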
def test_pipeline_fit_failure():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    pipe = Pipeline(
        [
            ("bad", FailingClassifier()),
            ("good1", MockClassifier()),
            ("good2", MockClassifier()),
        ]
    )
    grid = {
        "bad__parameter": [
            0,
            FailingClassifier.FAILING_PARAMETER,
            FailingClassifier.FAILING_PREDICT_PARAMETER,
            FailingClassifier.FAILING_SCORE_PARAMETER,
        ]
    }
    gs = dcv.GridSearchCV(pipe, grid, refit=False)

    # Check that failure raises if error_score is `'raise'`
    with pytest.raises(ValueError):
        gs.fit(X, y)

    # Check that grid scores were set to error_score on failure
    gs.error_score = float("nan")
    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)
    check_scores_all_nan(gs, "bad__parameter")

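# For reference, a minimal sketch of the `FailingClassifier` mock used above.
# The class name and the FAILING_* attribute names come from the tests; the
# method bodies and the concrete parameter values are assumptions. Each
# FAILING_* value selects which method raises, letting the tests exercise
# failures in fit, predict, and score separately.
from sklearn.base import BaseEstimator


class FailingClassifier(BaseEstimator):
    # Concrete values are assumed here; any distinct values would do.
    FAILING_PARAMETER = 2
    FAILING_PREDICT_PARAMETER = 3
    FAILING_SCORE_PARAMETER = 4

    def __init__(self, parameter=None):
        self.parameter = parameter

    def fit(self, X, y=None):
        if self.parameter == self.FAILING_PARAMETER:
            raise ValueError("Failing classifier failed as required")
        return self

    def predict(self, X):
        if self.parameter == self.FAILING_PREDICT_PARAMETER:
            raise ValueError("Failing classifier failed predicting as required")
        return np.zeros(len(X))

    def score(self, X=None, y=None):
        if self.parameter == self.FAILING_SCORE_PARAMETER:
            raise ValueError("Failing classifier failed scoring as required")
        return 0.0

    def transform(self, X):
        # Pass-through so the mock can also sit inside a FeatureUnion.
        return X
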
def test_pipeline_raises():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    pipe = Pipeline([("step1", MockClassifier()), ("step2", MockClassifier())])

    grid = {"step3__parameter": [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False)
    with pytest.raises(ValueError):
        gs.fit(X, y)

    grid = {"steps": [[("one", MockClassifier()), ("two", MockClassifier())]]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False)
    with pytest.raises(NotImplementedError):
        gs.fit(X, y)

def test_feature_union_raises():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    union = FeatureUnion([("tr0", MockClassifier()), ("tr1", MockClassifier())])
    pipe = Pipeline([("union", union), ("est", MockClassifier())])

    grid = {"union__tr2__parameter": [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, param_grid=grid, refit=False)
    with pytest.raises(ValueError):
        gs.fit(X, y)

    grid = {"union__transformer_list": [[("one", MockClassifier())]]}
    gs = dcv.GridSearchCV(pipe, param_grid=grid, refit=False)
    with pytest.raises(NotImplementedError):
        gs.fit(X, y)

# `scheduler` and `n_jobs` are pytest parameters; the decorator below restores
# a representative matrix (an assumption; the original combinations may differ).
@pytest.mark.parametrize(
    'scheduler,n_jobs',
    [(None, 4), ('threading', 4), ('threading', 1), ('sync', 1)],
)
def test_scheduler_param(scheduler, n_jobs):
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    gs = dcv.GridSearchCV(
        MockClassifier(),
        {'foo_param': [0, 1, 2]},
        cv=3,
        scheduler=scheduler,
        n_jobs=n_jobs,
    )
    gs.fit(X, y)

def test_bad_error_score():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    gs = dcv.GridSearchCV(
        MockClassifier(), {"foo_param": [0, 1, 2]}, error_score="badparam"
    )
    with pytest.raises(ValueError):
        gs.fit(X, y)

def test_grid_search_allows_nans():
    # Test dcv.GridSearchCV with SimpleImputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]
    imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
    p = Pipeline([("imputer", imputer), ("classifier", MockClassifier())])
    dcv.GridSearchCV(p, {"classifier__foo_param": [1, 2, 3]}, cv=2).fit(X, y)

def test_scheduler_param_distributed(loop):  # noqa
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    with cluster() as (s, [a, b]):
        with Client(s["address"], loop=loop) as client:
            gs = dcv.GridSearchCV(MockClassifier(), {"foo_param": [0, 1, 2]}, cv=3)
            gs.fit(X, y)

            def f(dask_scheduler):
                return len(dask_scheduler.transition_log)

            assert client.run_on_scheduler(f)  # some work happened on cluster

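# The distributed test above relies on the testing utilities shipped with
# `distributed`; the imports are assumed to look something like this at the
# top of the module (`loop` is a pytest fixture provided by
# `distributed.utils_test`):
#
#     from distributed import Client
#     from distributed.utils_test import cluster, loop  # noqa: F401
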
def test_trivial_cv_results_attr():
    # Test search over a "grid" with only one point.
    # Non-regression test: grid_scores_ wouldn't be set by dcv.GridSearchCV.
    clf = MockClassifier()
    grid_search = dcv.GridSearchCV(clf, {"foo_param": [1]})
    grid_search.fit(X, y)
    assert hasattr(grid_search, "cv_results_")

    random_search = dcv.RandomizedSearchCV(clf, {"foo_param": [0]}, n_iter=1)
    random_search.fit(X, y)
    assert hasattr(random_search, "cv_results_")

def test_cache_cv():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    X2 = X.view(CountTakes)
    gs = dcv.GridSearchCV(
        MockClassifier(),
        {'foo_param': [0, 1, 2]},
        cv=3,
        cache_cv=False,
        scheduler='sync',
    )
    gs.fit(X2, y)
    assert X2.count == 2 * 3 * 3  # (1 train + 1 test) * n_params * n_splits

    X2 = X.view(CountTakes)
    assert X2.count == 0
    gs.cache_cv = True
    gs.fit(X2, y)
    assert X2.count == 2 * 3  # (1 train + 1 test) * n_splits

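# For reference, a minimal sketch of the `CountTakes` array subclass used
# above (an assumption about its shape; the real helper lives elsewhere in
# this module). Train/test splits are materialized via `take`, so counting
# `take` calls measures how many times the CV splits were extracted, which
# is what the cache_cv assertions above rely on.
class CountTakes(np.ndarray):
    count = 0

    def take(self, *args, **kwargs):
        self.count += 1
        return super(CountTakes, self).take(*args, **kwargs)
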
def test_pickle():
    # Test that a fit search can be pickled
    clf = MockClassifier()
    grid_search = dcv.GridSearchCV(clf, {"foo_param": [1, 2, 3]}, refit=True)
    grid_search.fit(X, y)
    grid_search_pickled = pickle.loads(pickle.dumps(grid_search))
    assert_array_almost_equal(grid_search.predict(X), grid_search_pickled.predict(X))

    random_search = dcv.RandomizedSearchCV(
        clf, {"foo_param": [1, 2, 3]}, refit=True, n_iter=3
    )
    random_search.fit(X, y)
    random_search_pickled = pickle.loads(pickle.dumps(random_search))
    assert_array_almost_equal(
        random_search.predict(X), random_search_pickled.predict(X)
    )

def test_no_refit():
    # Test that GSCV can be used for model selection alone without refitting
    clf = MockClassifier()
    grid_search = dcv.GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=False)
    grid_search.fit(X, y)
    assert not hasattr(grid_search, "best_estimator_")
    assert not hasattr(grid_search, "best_index_")
    assert not hasattr(grid_search, "best_score_")
    assert not hasattr(grid_search, "best_params_")

    # Make sure the predict/transform etc. functions raise meaningful error messages
    for fn_name in ('predict', 'predict_proba', 'predict_log_proba',
                    'transform', 'inverse_transform'):
        with pytest.raises(NotFittedError) as exc:
            getattr(grid_search, fn_name)(X)
        assert (('refit=False. %s is available only after refitting on the '
                 'best parameters' % fn_name) in str(exc.value))

def test_grid_search():
    # Test that the best estimator contains the right value for foo_param
    clf = MockClassifier()
    grid_search = dcv.GridSearchCV(clf, {"foo_param": [1, 2, 3]})
    # make sure it selects the smallest parameter in case of ties
    grid_search.fit(X, y)
    assert grid_search.best_estimator_.foo_param == 2

    assert_array_equal(grid_search.cv_results_["param_foo_param"].data, [1, 2, 3])

    # Smoke test the score etc:
    grid_search.score(X, y)
    grid_search.predict_proba(X)
    grid_search.decision_function(X)
    grid_search.transform(X)

    # Test exception handling on scoring
    grid_search.scoring = "sklearn"
    with pytest.raises(ValueError):
        grid_search.fit(X, y)

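# For reference, a minimal sketch of the `MockClassifier` used throughout
# these tests (an assumption about its shape; the real mock lives elsewhere
# in this module). Its score depends only on `foo_param` and is 1.0 for
# foo_param > 1, so parameters 2 and 3 tie and the search is expected to
# pick the smallest, which is exactly what `test_grid_search` asserts.
from sklearn.base import BaseEstimator


class MockClassifier(BaseEstimator):
    def __init__(self, foo_param=0):
        self.foo_param = foo_param

    def fit(self, X, y):
        return self

    def predict(self, X):
        return np.zeros(len(X))

    # test_grid_search and test_no_refit smoke-test these methods, so alias
    # them all to `predict`.
    predict_proba = predict
    predict_log_proba = predict
    decision_function = predict
    transform = predict
    inverse_transform = predict

    def score(self, X=None, y=None):
        return 1.0 if self.foo_param > 1 else 0.0
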