Example #1
    def test_grid_search_score_method(self):
        X, y = make_classification(n_samples=100,
                                   n_classes=2,
                                   flip_y=0.2,
                                   random_state=0)
        clf = LinearSVC(random_state=0)
        grid = {"C": [0.1]}

        search_no_scoring = tcv.TuneGridSearchCV(clf, grid,
                                                 scoring=None).fit(X, y)
        search_accuracy = tcv.TuneGridSearchCV(clf, grid,
                                               scoring="accuracy").fit(X, y)
        search_no_score_method_auc = tcv.TuneGridSearchCV(
            LinearSVCNoScore(), grid, scoring="roc_auc").fit(X, y)
        search_auc = tcv.TuneGridSearchCV(clf, grid,
                                          scoring="roc_auc").fit(X, y)

        # When scoring is given it takes precedence over the estimator's own
        # score method; with scoring=None, classifiers default to accuracy.
        score_no_scoring = search_no_scoring.score(X, y)
        score_accuracy = search_accuracy.score(X, y)
        score_no_score_auc = search_no_score_method_auc.score(X, y)
        score_auc = search_auc.score(X, y)

        # ensure the test is sane
        self.assertTrue(score_auc < 1.0)
        self.assertTrue(score_accuracy < 1.0)
        self.assertTrue(score_auc != score_accuracy)

        assert_almost_equal(score_accuracy, score_no_scoring)
        assert_almost_equal(score_auc, score_no_score_auc)
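For reference, the equivalence asserted above can be reproduced with plain scikit-learn, since search.score delegates to the scorer when scoring is set and to estimator.score otherwise. A minimal sketch using sklearn's GridSearchCV (the Tune wrapper is expected to mirror this behavior):

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=100, n_classes=2, flip_y=0.2, random_state=0)
clf = LinearSVC(random_state=0)

# scoring=None falls back to the estimator's own score method,
# which is accuracy for classifiers, so the two results agree.
s_default = GridSearchCV(clf, {"C": [0.1]}).fit(X, y).score(X, y)
s_accuracy = GridSearchCV(clf, {"C": [0.1]}, scoring="accuracy").fit(X, y).score(X, y)
assert abs(s_default - s_accuracy) < 1e-12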
Example #2
    def test_classes__property(self):
        # Test that classes_ property matches best_estimator_.classes_
        X = np.arange(100).reshape(10, 10)
        y = np.array([0] * 5 + [1] * 5)
        Cs = [0.1, 1, 10]

        grid_search = tcv.TuneGridSearchCV(LinearSVC(random_state=0),
                                           {"C": Cs})
        grid_search.fit(X, y)
        assert_array_equal(grid_search.best_estimator_.classes_,
                           grid_search.classes_)

        # Test that regressors do not have a classes_ attribute
        grid_search = tcv.TuneGridSearchCV(Ridge(), {"alpha": [1.0, 2.0]})
        grid_search.fit(X, y)
        self.assertFalse(hasattr(grid_search, "classes_"))

        # Test that the grid searcher has no classes_ attribute before it's fit
        grid_search = tcv.TuneGridSearchCV(LinearSVC(random_state=0),
                                           {"C": Cs})
        self.assertFalse(hasattr(grid_search, "classes_"))

        # Test that the grid searcher has no classes_ attribute without a refit
        grid_search = tcv.TuneGridSearchCV(LinearSVC(random_state=0),
                                           {"C": Cs},
                                           refit=False)
        grid_search.fit(X, y)
        self.assertFalse(hasattr(grid_search, "classes_"))
Example #3
    def test_grid_search_groups(self):
        # Check that the ValueError raised when groups is None propagates to
        # TuneGridSearchCV, and that groups is correctly passed to the cv object
        rng = np.random.RandomState(0)

        X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
        groups = rng.randint(0, 3, 15)

        clf = LinearSVC(random_state=0)
        grid = {"C": [1]}

        group_cvs = [
            LeaveOneGroupOut(),
            LeavePGroupsOut(2),
            GroupKFold(n_splits=3),
            GroupShuffleSplit(n_splits=3),
        ]
        for cv in group_cvs:
            gs = tcv.TuneGridSearchCV(clf, grid, cv=cv)

            with pytest.raises(ValueError) as exc:
                assert gs.fit(X, y)
            self.assertTrue("parameter should not be None" in str(exc.value))

            gs.fit(X, y, groups=groups)

        non_group_cvs = [
            StratifiedKFold(n_splits=3),
            StratifiedShuffleSplit(n_splits=3)
        ]
        for cv in non_group_cvs:
            gs = tcv.TuneGridSearchCV(clf, grid, cv=cv)
            # Should not raise an error
            gs.fit(X, y)
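The group splitters themselves enforce the groups requirement, which is where the ValueError above originates. A small sketch of the underlying behavior:

import numpy as np
from sklearn.model_selection import LeaveOneGroupOut

X = np.arange(30).reshape(15, 2)
y = np.array([0, 1] * 7 + [0])
groups = np.array([0, 1, 2] * 5)

logo = LeaveOneGroupOut()
# Omitting groups raises ValueError("The 'groups' parameter should not be None.")
for train_idx, test_idx in logo.split(X, y, groups):
    # each test fold holds exactly one group, never seen in training
    assert len(np.unique(groups[test_idx])) == 1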
Example #4
    def test_grid_search_bad_param_grid(self):
        param_dict = {"C": 1.0}
        clf = SVC()

        with pytest.raises(ValueError) as exc:
            tcv.TuneGridSearchCV(clf, param_dict)
        self.assertTrue(
            ("Parameter values for parameter (C) need to be a sequence"
             "(but not a string) or np.ndarray.") in str(exc.value))

        param_dict = {"C": []}
        clf = SVC()

        with pytest.raises(ValueError) as exc:
            tcv.TuneGridSearchCV(clf, param_dict)
        self.assertTrue(
            ("Parameter values for parameter (C) need to be a non-empty "
             "sequence.") in str(exc.value))

        param_dict = {"C": "1,2,3"}
        clf = SVC()

        with pytest.raises(ValueError) as exc:
            tcv.TuneGridSearchCV(clf, param_dict)
        self.assertTrue(
            ("Parameter values for parameter (C) need to be a sequence"
             "(but not a string) or np.ndarray.") in str(exc.value))

        param_dict = {"C": np.ones(6).reshape(3, 2)}
        clf = SVC()
        with pytest.raises(ValueError):
            tcv.TuneGridSearchCV(clf, param_dict)
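The same validation can be exercised directly through scikit-learn's ParameterGrid; the exact exception type varies across versions, so the sketch below catches both:

from sklearn.model_selection import ParameterGrid

print(list(ParameterGrid({"C": [0.1, 1.0]})))  # [{'C': 0.1}, {'C': 1.0}]

try:
    list(ParameterGrid({"C": 1.0}))  # scalar: not a sequence of values
except (TypeError, ValueError) as err:
    print(err)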
Example #5
    def test_digits(self):
        # Loading the Digits dataset
        digits = datasets.load_digits()

        # To apply a classifier on this data, we need to flatten the images,
        # turning the data into a (samples, features) matrix:
        n_samples = len(digits.images)
        X = digits.images.reshape((n_samples, -1))
        y = digits.target

        # Split the dataset in two equal parts
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.5,
                                                            random_state=0)

        # Set the parameters by cross-validation
        tuned_parameters = {
            'kernel': ['rbf'],
            'gamma': [1e-3, 1e-4],
            'C': [1, 10, 100, 1000]
        }

        tune_search = tcv.TuneGridSearchCV(SVC(),
                                           tuned_parameters,
                                           scheduler=MedianStoppingRule(),
                                           iters=20)
        tune_search.fit(X_train, y_train)

        pred = tune_search.predict(X_test)
        print(pred)
        accuracy = np.count_nonzero(
            np.array(pred) == np.array(y_test)) / len(pred)
        print(accuracy)
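The manual accuracy computation above is equivalent to scikit-learn's accuracy_score; a shorter sketch, reusing pred and y_test from the test:

from sklearn.metrics import accuracy_score

# same quantity as the np.count_nonzero expression above
accuracy = accuracy_score(y_test, pred)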
Example #6
    def test_pandas_input(self):
        # check that grid search doesn't destroy the pandas dataframe
        types = [(MockDataFrame, MockDataFrame)]
        try:
            from pandas import Series, DataFrame

            types.append((DataFrame, Series))
        except ImportError:
            pass

        X = np.arange(100).reshape(10, 10)
        y = np.array([0] * 5 + [1] * 5)

        for InputFeatureType, TargetType in types:
            # X dataframe, y series
            X_df, y_ser = InputFeatureType(X), TargetType(y)
            clf = CheckingClassifier(
                check_X=lambda x: isinstance(x, InputFeatureType),
                check_y=lambda x: isinstance(x, TargetType),
            )

            grid_search = tcv.TuneGridSearchCV(clf, {"foo_param": [1, 2, 3]})
            grid_search.fit(X_df, y_ser).score(X_df, y_ser)
            grid_search.predict(X_df)
            self.assertTrue(hasattr(grid_search, "cv_results_"))
Example #7
    def test_grid_search_precomputed_kernel(self):
        # Test that grid search works when the input features are given in the
        # form of a precomputed kernel matrix
        X_, y_ = make_classification(n_samples=200,
                                     n_features=100,
                                     random_state=0)

        # compute the training kernel matrix corresponding to the linear kernel
        K_train = np.dot(X_[:180], X_[:180].T)
        y_train = y_[:180]

        clf = SVC(kernel="precomputed")
        cv = tcv.TuneGridSearchCV(clf, {"C": [0.1, 1.0]})
        cv.fit(K_train, y_train)

        self.assertTrue(cv.best_score_ >= 0)

        # compute the test kernel matrix
        K_test = np.dot(X_[180:], X_[:180].T)
        y_test = y_[180:]

        y_pred = cv.predict(K_test)

        self.assertTrue(np.mean(y_pred == y_test) >= 0)

        # test error is raised when the precomputed kernel is not array-like
        # or sparse
        with pytest.raises(ValueError):
            cv.fit(K_train.tolist(), y_train)
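The key detail with precomputed kernels is the shape convention: the training matrix is (n_train, n_train), while the test matrix pairs each test sample with every training sample. A minimal sketch without the grid search:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=40, n_features=10, random_state=0)
X_tr, X_te, y_tr = X[:30], X[30:], y[:30]

clf = SVC(kernel="precomputed")
clf.fit(X_tr @ X_tr.T, y_tr)       # (30, 30) training kernel
pred = clf.predict(X_te @ X_tr.T)  # (10, 30) test-vs-train kernel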
Example #8
    def test_unsupervised_grid_search(self):
        # test grid-search with unsupervised estimator
        X, y = make_blobs(random_state=0)
        km = KMeans(random_state=0)
        grid_search = tcv.TuneGridSearchCV(
            km,
            param_grid=dict(n_clusters=[2, 3, 4]),
            scoring="adjusted_rand_score")
        grid_search.fit(X, y)
        # ARI can find the right number :)
        self.assertEqual(grid_search.best_params_["n_clusters"], 3)

        # Now without a score, and without y
        grid_search = tcv.TuneGridSearchCV(
            km, param_grid=dict(n_clusters=[2, 3, 4]))
        grid_search.fit(X)
        self.assertEqual(grid_search.best_params_["n_clusters"], 4)
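The second assertion follows from how KMeans scores itself: without a scoring parameter (and without y), the search maximizes KMeans.score, which is negative inertia and, in practice, keeps improving as n_clusters grows, so the largest grid value wins. A quick illustration:

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(random_state=0)
for k in (2, 3, 4):
    km = KMeans(n_clusters=k, random_state=0, n_init=10).fit(X)
    print(k, km.score(X))  # negative inertia: less negative as k grows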
Example #9
    def test_grid_search_precomputed_kernel_error_nonsquare(self):
        # Test that grid search returns an error with a non-square precomputed
        # training kernel matrix
        K_train = np.zeros((10, 20))
        y_train = np.ones((10, ))
        clf = SVC(kernel="precomputed")
        cv = tcv.TuneGridSearchCV(clf, {"C": [0.1, 1.0]})
        with pytest.raises(ValueError):
            cv.fit(K_train, y_train)
Example #10
    def test_grid_search_error(self):
        # Test that grid search captures errors when X and y have mismatched lengths
        X_, y_ = make_classification(n_samples=200,
                                     n_features=100,
                                     random_state=0)

        clf = LinearSVC()
        cv = tcv.TuneGridSearchCV(clf, {"C": [0.1, 1.0]})
        with pytest.raises(ValueError):
            cv.fit(X_[:180], y_)
Example #11
    def test_y_as_list(self):
        # Pass y as a list to TuneGridSearchCV
        X = np.arange(100).reshape(10, 10)
        y = np.array([0] * 5 + [1] * 5)

        clf = CheckingClassifier(check_y=lambda x: isinstance(x, list))
        cv = KFold(n_splits=3)
        grid_search = tcv.TuneGridSearchCV(clf, {"foo_param": [1, 2, 3]},
                                           cv=cv)
        grid_search.fit(X, y.tolist()).score(X, y)
        self.assertTrue(hasattr(grid_search, "cv_results_"))
Example #12
    def test_gridsearch_nd(self):
        # Pass X as a 4-d array (and y as a 3-d array) to TuneGridSearchCV
        X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2)
        y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11)
        clf = CheckingClassifier(
            check_X=lambda x: x.shape[1:] == (5, 3, 2),
            check_y=lambda x: x.shape[1:] == (7, 11),
        )
        grid_search = tcv.TuneGridSearchCV(clf, {"foo_param": [1, 2, 3]})
        grid_search.fit(X_4d, y_3d).score(X, y)  # X, y: module-level fixtures
        self.assertTrue(hasattr(grid_search, "cv_results_"))
Example #13
    def test_refit(self):
        # Regression test for bug in refitting
        # Simulates re-fitting a broken estimator; this used to break with
        # sparse SVMs.
        X = np.arange(100).reshape(10, 10)
        y = np.array([0] * 5 + [1] * 5)

        clf = tcv.TuneGridSearchCV(BrokenClassifier(), {"parameter": [0, 1]},
                                   scoring="accuracy",
                                   refit=True)
        clf.fit(X, y)
Example #14
    def test_trivial_cv_results_attr(self):
        # Test search over a "grid" with only one point.
        # Non-regression test: cv_results_ would not be set for a one-point grid.
        clf = MockClassifier()
        grid_search = tcv.TuneGridSearchCV(clf, {"foo_param": [1]})
        grid_search.fit(X, y)
        self.assertTrue(hasattr(grid_search, "cv_results_"))

        random_search = tcv.TuneRandomizedSearchCV(clf, {"foo_param": [0]},
                                                   iters=1)
        random_search.fit(X, y)
        self.assertTrue(hasattr(random_search, "cv_results_"))
Example #15
    def test_grid_search_sparse(self):
        # Test that grid search works with both dense and sparse matrices
        X_, y_ = make_classification(n_samples=200,
                                     n_features=100,
                                     random_state=0)

        clf = LinearSVC()
        cv = tcv.TuneGridSearchCV(clf, {"C": [0.1, 1.0]})
        cv.fit(X_[:180], y_[:180])
        y_pred = cv.predict(X_[180:])
        C = cv.best_estimator_.C

        X_ = sp.csr_matrix(X_)
        clf = LinearSVC()
        cv = tcv.TuneGridSearchCV(clf, {"C": [0.1, 1.0]})
        cv.fit(X_[:180].tocoo(), y_[:180])
        y_pred2 = cv.predict(X_[180:])
        C2 = cv.best_estimator_.C

        self.assertTrue(np.mean(y_pred == y_pred2) >= 0.9)
        self.assertEqual(C, C2)
Example #16
    def test_grid_search_one_grid_point(self):
        X_, y_ = make_classification(n_samples=200,
                                     n_features=100,
                                     random_state=0)
        param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]}

        clf = SVC()
        cv = tcv.TuneGridSearchCV(clf, param_dict)
        cv.fit(X_, y_)

        clf = SVC(C=1.0, kernel="rbf", gamma=0.1)
        clf.fit(X_, y_)

        assert_array_equal(clf.dual_coef_, cv.best_estimator_.dual_coef_)
Example #17
    def test_grid_search_sparse_scoring(self):
        X_, y_ = make_classification(n_samples=200,
                                     n_features=100,
                                     random_state=0)

        clf = LinearSVC()
        cv = tcv.TuneGridSearchCV(clf, {"C": [0.1, 1.0]}, scoring="f1")
        cv.fit(X_[:180], y_[:180])
        y_pred = cv.predict(X_[180:])
        C = cv.best_estimator_.C

        X_ = sp.csr_matrix(X_)
        clf = LinearSVC()
        cv = tcv.TuneGridSearchCV(clf, {"C": [0.1, 1.0]}, scoring="f1")
        cv.fit(X_[:180], y_[:180])
        y_pred2 = cv.predict(X_[180:])
        C2 = cv.best_estimator_.C

        assert_array_equal(y_pred, y_pred2)
        self.assertEqual(C, C2)

        # Smoke test the score
        # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y_[:180]),
        #                            cv.score(X_[:180], y_[:180]))

        # test loss where greater is worse
        def f1_loss(y_true_, y_pred_):
            return -f1_score(y_true_, y_pred_)

        F1Loss = make_scorer(f1_loss, greater_is_better=False)
        cv = tcv.TuneGridSearchCV(clf, {"C": [0.1, 1.0]}, scoring=F1Loss)
        cv.fit(X_[:180], y_[:180])
        y_pred3 = cv.predict(X_[180:])
        C3 = cv.best_estimator_.C

        self.assertEqual(C, C3)
        assert_array_equal(y_pred, y_pred3)
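Note the sign convention at work: f1_loss already returns -f1, and make_scorer with greater_is_better=False multiplies the function's output by -1 again, so the resulting scorer effectively maximizes plain f1, which is why the selected C matches the earlier run. A minimal sketch of the double sign flip:

from sklearn.datasets import make_classification
from sklearn.metrics import f1_score, make_scorer
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=100, random_state=0)
est = LinearSVC(random_state=0).fit(X, y)

def f1_loss(y_true, y_pred):
    return -f1_score(y_true, y_pred)

scorer = make_scorer(f1_loss, greater_is_better=False)
# greater_is_better=False negates the function's output,
# so the two sign flips cancel and the scorer returns plain f1
assert scorer(est, X, y) == f1_score(y, est.predict(X))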
Example #18
    def test_grid_search_no_score(self):
        # Test grid-search on classifier that has no score function.
        clf = LinearSVC(random_state=0)
        X, y = make_blobs(random_state=0, centers=2)
        Cs = [0.1, 1, 10]
        clf_no_score = LinearSVCNoScore(random_state=0)

        # XXX: There seems to be some global shared state in LinearSVC: fitting
        # multiple `SVC` instances in parallel using threads sometimes yields
        # wrong results. This only happens with threads, not with processes or
        # synchronous execution. For now, we fit using the sync scheduler.
        grid_search = tcv.TuneGridSearchCV(clf, {"C": Cs},
                                           scoring="accuracy",
                                           scheduler=MedianStoppingRule())
        grid_search.fit(X, y)

        grid_search_no_score = tcv.TuneGridSearchCV(
            clf_no_score, {"C": Cs},
            scoring="accuracy",
            scheduler=MedianStoppingRule())
        # smoketest grid search
        grid_search_no_score.fit(X, y)

        # check that best params are equal
        self.assertEqual(grid_search_no_score.best_params_,
                         grid_search.best_params_)
        # check that we can call score and that it gives the correct result
        self.assertEqual(grid_search.score(X, y),
                         grid_search_no_score.score(X, y))

        # giving no scoring function raises an error
        grid_search_no_score = tcv.TuneGridSearchCV(clf_no_score, {"C": Cs})
        with self.assertRaises(TypeError) as exc:
            grid_search_no_score.fit([[1]])

        self.assertTrue("no scoring" in str(exc.exception))
Ejemplo n.º 19
0
    def test_gridsearch_no_predict(self):
        # test grid-search with an estimator without predict.
        # slight duplication of a test from KDE
        def custom_scoring(estimator, X):
            return 42 if estimator.bandwidth == 0.1 else 0

        X, _ = make_blobs(cluster_std=0.1,
                          random_state=1,
                          centers=[[0, 1], [1, 0], [0, 0]])
        search = tcv.TuneGridSearchCV(
            KernelDensity(),
            param_grid=dict(bandwidth=[0.01, 0.1, 1]),
            scoring=custom_scoring,
        )
        search.fit(X)
        self.assertEqual(search.best_params_["bandwidth"], 0.1)
        self.assertEqual(search.best_score_, 42)
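Scoring callables receive (estimator, X) for unsupervised data, or (estimator, X, y) otherwise, and return a float; this is what lets estimators without predict participate in the search. A hypothetical alternative scorer for KernelDensity, based on its built-in log-likelihood:

from sklearn.neighbors import KernelDensity

# hypothetical: score by mean log-likelihood instead of the constant above;
# KernelDensity.score(X) returns the *total* log-likelihood of X
def loglik_scoring(estimator, X, y=None):
    return estimator.score(X) / len(X)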
Ejemplo n.º 20
0
    def test_grid_search(self):
        # Test that the best estimator contains the right value for foo_param
        clf = MockClassifier()
        grid_search = tcv.TuneGridSearchCV(clf, {"foo_param": [1, 2, 3]})
        # make sure it selects the smallest parameter in case of ties
        grid_search.fit(X, y)
        self.assertEqual(grid_search.best_estimator_.foo_param, 2)

        assert_array_equal(grid_search.cv_results_["param_foo_param"].data,
                           [1, 2, 3])

        # Smoke test the score etc:
        grid_search.score(X, y)
        grid_search.predict_proba(X)
        grid_search.decision_function(X)
        grid_search.transform(X)

        # Test exception handling on scoring
        grid_search.scoring = "sklearn"
        with pytest.raises(ValueError):
            grid_search.fit(X, y)
Ejemplo n.º 21
0
    def test_diabetes(self):
        # load the diabetes datasets
        dataset = datasets.load_diabetes()
        X = dataset.data
        y = dataset.target
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.5,
                                                            random_state=0)
        # prepare a range of alpha values to test
        alphas = np.array([1, 0.1, 0.01, 0.001, 0.0001, 0])
        param_grid = dict(alpha=alphas)
        # create and fit a ridge regression model, testing each alpha
        model = linear_model.Ridge()

        tune_search = tcv.TuneGridSearchCV(model, param_grid,
                                           scheduler=MedianStoppingRule())
        tune_search.fit(X_train, y_train)

        pred = tune_search.predict(X_test)
        print(pred)
        error = sum(np.array(pred) - np.array(y_test)) / len(pred)
        print(error)
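Note that the printed quantity is the mean signed residual, so positive and negative errors can cancel; an alternative (reusing pred and y_test from the test above) that avoids the cancellation:

# mean absolute error: magnitude of the residuals, no sign cancellation
mae = np.mean(np.abs(np.array(pred) - np.array(y_test)))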
Ejemplo n.º 22
0
    def test_no_refit(self):
        # Test that GSCV can be used for model selection alone without refitting
        clf = MockClassifier()
        grid_search = tcv.TuneGridSearchCV(clf, {"foo_param": [1, 2, 3]},
                                           refit=False)
        grid_search.fit(X, y)
        self.assertFalse(hasattr(grid_search, "best_estimator_"))
        self.assertFalse(hasattr(grid_search, "best_index_"))
        self.assertFalse(hasattr(grid_search, "best_score_"))
        self.assertFalse(hasattr(grid_search, "best_params_"))

        # Make sure predict/transform etc. raise meaningful error messages
        for fn_name in (
                "predict",
                "predict_proba",
                "predict_log_proba",
                "transform",
                "inverse_transform",
        ):
            with pytest.raises(NotFittedError) as exc:
                getattr(grid_search, fn_name)(X)
            self.assertTrue(
                ("refit=False. %s is available only after refitting on the "
                 "best parameters" % fn_name) in str(exc.value))