Example 1
def test_grid_search_no_refit():
    # Test that grid search can be used for model selection only
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=False)
    grid_search.fit(X, y)
    assert hasattr(grid_search, "best_params_")
    assert not hasattr(grid_search, "best_estimator_")
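These tests lean on shared fixtures that are not shown in the listing. The following is a minimal sketch, assuming small toy data and a dummy estimator whose behaviour matches the assertions in Examples 1, 2, and 9; the exact names, shapes, and values are assumptions, not the original fixtures.

import numpy as np
import numpy.testing as tm   # assumption: `tm` aliases numpy.testing

# GridSearchCV follows the legacy scikit-learn interface in these tests
# (grid_scores_, iid, fit_params); the dask examples presumably use a
# dask-aware implementation with the same surface.

X = np.arange(100).reshape(10, 10)   # assumed toy data
y = np.array([0] * 5 + [1] * 5)

class MockClassifier:
    # Dummy estimator: records foo_param and scores 1.0 whenever foo_param > 1.
    def __init__(self, foo_param=0):
        self.foo_param = foo_param

    def get_params(self, deep=False):
        return {'foo_param': self.foo_param}

    def set_params(self, **params):
        self.foo_param = params['foo_param']
        return self

    def fit(self, X, y):
        return self

    def predict(self, X):
        # Example 9 asserts that predictions equal the row sums of X.
        return X.sum(axis=1)

    def score(self, X=None, y=None):
        # foo_param 2 and 3 tie at 1.0; GridSearchCV keeps the first of the
        # tied grid points, which is why the best foo_param is 2.
        return 1.0 if self.foo_param > 1 else 0.0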
Example 2
def test_grid_search_numpy_inputs():
    # Test that the best estimator contains the right value for foo_param
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]})
    # make sure it selects the smallest parameter in case of ties
    grid_search.fit(X, y)
    assert grid_search.best_estimator_.foo_param == 2

    for i, foo_i in enumerate([1, 2, 3]):
        assert grid_search.grid_scores_[i][0] == {'foo_param': foo_i}
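In the legacy API each grid_scores_ entry is a namedtuple of the form (parameters, mean_validation_score, cv_validation_scores), which is why indexing with [0] yields the parameter dict. A quick illustration:

params, mean_score, fold_scores = grid_search.grid_scores_[0]
assert params == {'foo_param': 1}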
Example 3
def test_grid_search_one_grid_point():
    X, y = make_classification(n_samples=200, n_features=100, random_state=0)
    param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]}

    clf = SVC()
    cv = GridSearchCV(clf, param_dict)
    cv.fit(X, y)

    clf = SVC(C=1.0, kernel="rbf", gamma=0.1)
    clf.fit(X, y)

    # refit=True (the default) refits the lone grid point on the full data,
    # so the two fits should be numerically identical
    tm.assert_array_equal(clf.dual_coef_, cv.best_estimator_.dual_coef_)
Example 4
def test_grid_search_score_consistency():
    # test that correct scores are used
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score)
        grid_search.fit(X, y)
        cv = StratifiedKFold(y, n_folds=3)  # legacy pre-0.18 CV API
        for C, scores in zip(Cs, grid_search.grid_scores_):
            clf.set_params(C=C)
            scores = scores[2]  # cv_validation_scores: one score per fold
            for i, (train, test) in enumerate(cv):
                clf.fit(X[train], y[train])
                if score == "f1":
                    correct_score = f1_score(y[test], clf.predict(X[test]))
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                tm.assert_almost_equal(correct_score, scores[i])
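This test iterates folds with the pre-0.18 cross-validation API, where the StratifiedKFold object is built from y and iterated directly. On scikit-learn >= 0.18 the same loop would go through split(); a sketch, not part of the original test:

from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=3)
for train, test in cv.split(X, y):
    clf.fit(X[train], y[train])   # same per-fold refit as above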
Example 5
def test_grid_search_dask_inputs_dk_est():
    X, y = make_classification(n_samples=1000, n_features=100, random_state=0)
    dX = da.from_array(X, chunks=100)
    dy = da.from_array(y, chunks=100)
    grid = {'alpha': [0.1, 0.01, 0.0001]}

    clf = SGDClassifier()
    d_clf = Chained(SGDClassifier())
    grid_search = GridSearchCV(clf, grid)
    d_grid_search = GridSearchCV(d_clf, grid, fit_params={'classes': [0, 1]})

    grid_search.fit(X, y)
    d_grid_search.fit(dX, dy)
    assert (d_grid_search.best_estimator_.alpha ==
            grid_search.best_estimator_.alpha)
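Chained is assumed here to be dask-learn's wrapper that trains an estimator block by block via partial_fit; SGDClassifier.partial_fit requires the full label set on its first call, which is what fit_params={'classes': [0, 1]} forwards. A sketch of the underlying pattern, under that assumption:

sgd = SGDClassifier()
sgd.partial_fit(X[:100], y[:100], classes=[0, 1])   # classes needed up front
sgd.partial_fit(X[100:200], y[100:200])             # later blocks omit it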
Example 6
def test_unsupervised_grid_search():
    # test grid-search with unsupervised estimator
    X, y = make_blobs(random_state=0)
    km = KMeans(random_state=0)
    grid_search = GridSearchCV(km,
                               param_grid=dict(n_clusters=[2, 3, 4]),
                               scoring='adjusted_rand_score')
    grid_search.fit(X, y)
    # ARI can find the right number :)
    assert grid_search.best_params_["n_clusters"] == 3

    # Now without a score, and without y
    grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]))
    grid_search.fit(X)
    assert grid_search.best_params_["n_clusters"] == 4
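The second assertion follows from the fallback scoring: with no scoring argument, GridSearchCV uses KMeans.score, which is the negative inertia, and inertia only shrinks as n_clusters grows, so the largest grid value wins. A quick check of that monotonicity:

km2 = KMeans(n_clusters=2, random_state=0).fit(X)
km4 = KMeans(n_clusters=4, random_state=0).fit(X)
assert km4.score(X) > km2.score(X)   # more clusters -> less negative score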
Example 7
def test_grid_search_sparse():
    # Test that grid search works with both dense and sparse matrices
    X, y = make_classification(n_samples=200, n_features=100, random_state=0)

    cv = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]})
    cv.fit(X[:180], y[:180])
    y_pred = cv.predict(X[180:])
    C = cv.best_estimator_.C

    X = sparse.csr_matrix(X)
    cv.fit(X[:180], y[:180])
    y_pred2 = cv.predict(X[180:])
    C2 = cv.best_estimator_.C

    # dense and sparse code paths may differ slightly, so require only
    # 90% agreement between the two sets of predictions
    assert np.mean(y_pred == y_pred2) >= 0.9
    assert C == C2
Example 8
def test_grid_search_iid():
    # test the iid parameter
    # noise-free simple 2d-data
    X, y = make_blobs(centers=[[0, 0], [1, 0], [0, 1], [1, 1]],
                      random_state=0,
                      cluster_std=0.1,
                      shuffle=False,
                      n_samples=80)
    # split dataset into two folds that are not iid
    # first one contains data of all 4 blobs, second only from two.
    mask = np.ones(X.shape[0], dtype=bool)  # np.bool is a removed numpy alias
    mask[np.where(y == 1)[0][::2]] = 0
    mask[np.where(y == 2)[0][::2]] = 0
    # this leads to perfect classification on one fold and a score of 1/3 on
    # the other
    svm = SVC(kernel='linear')
    # create "cv" for splits
    cv = [[mask, ~mask], [~mask, mask]]
    # once with iid=True (default)
    grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv)
    grid_search.fit(X, y)
    first = grid_search.grid_scores_[0]
    tm.assert_equal(first.parameters['C'], 1)
    tm.assert_array_almost_equal(first.cv_validation_scores, [1, 1. / 3.])
    # for first split, 1/4 of dataset is in test, for second 3/4.
    # take weighted average
    tm.assert_almost_equal(first.mean_validation_score,
                           1 * 1. / 4. + 1. / 3. * 3. / 4.)

    # once with iid=False
    grid_search = GridSearchCV(svm,
                               param_grid={'C': [1, 10]},
                               cv=cv,
                               iid=False)
    grid_search.fit(X, y)
    first = grid_search.grid_scores_[0]
    assert first.parameters['C'] == 1
    # scores are the same as above
    tm.assert_array_almost_equal(first.cv_validation_scores, [1, 1. / 3.])
    # averaged score is just mean of scores
    tm.assert_almost_equal(first.mean_validation_score,
                           np.mean(first.cv_validation_scores))
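The iid=True average can be reproduced by hand: the two test folds hold 20 and 60 of the 80 samples, so the fold scores are weighted 1/4 and 3/4. A sketch of the arithmetic:

fold_scores = np.array([1.0, 1.0 / 3.0])
weights = np.array([20, 60]) / 80.0
assert np.isclose(fold_scores @ weights, 0.5)   # matches mean_validation_score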
Example 9
def test_grid_search_dask_inputs():
    # Same checks as the numpy version, but with dask array inputs; predict
    # should mirror the input type (dask in -> dask out, numpy in -> numpy out)
    dX = da.from_array(X, chunks=2)
    dy = da.from_array(y, chunks=2)
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]})
    # make sure it selects the smallest parameter in case of ties
    grid_search.fit(dX, dy)
    assert grid_search.best_estimator_.foo_param == 2

    for i, foo_i in enumerate([1, 2, 3]):
        assert grid_search.grid_scores_[i][0] == {'foo_param': foo_i}

    y_pred = grid_search.predict(dX)
    assert isinstance(y_pred, da.Array)
    tm.assert_array_equal(y_pred, X.sum(axis=1))

    y_pred = grid_search.predict(X)
    assert isinstance(y_pred, np.ndarray)
    tm.assert_array_equal(y_pred, X.sum(axis=1))
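Note that predict on a dask array is lazy; the comparisons above trigger computation when the dask array is coerced for the assertion. When the result is needed eagerly, compute() makes that explicit:

y_pred = grid_search.predict(dX)
result = y_pred.compute()   # concrete numpy array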