Example #1
def cv(model, param_grid, X, y, n_iter=20):
    """Randomized cross-validated search over param_grid, then refit."""

    print('...performing CV search...')

    # Define jobs
    random_search = dcv.RandomizedSearchCV(
        model,
        param_grid,
        n_iter=n_iter,
        cv=5,
        scoring=['f1_macro'],
        return_train_score=True,
        refit=False).fit(X, y)
    # Gather results
    cv_results = pd.DataFrame(random_search.cv_results_)
    cv_results.sort_values(by=['mean_test_f1_macro'],
                           inplace=True,
                           ascending=False,
                           ignore_index=True)
    print(cv_results.head())

    best_params = cv_results.loc[0, 'params']
    model = model.set_params(**best_params)

    print('Using configuration: {}'.format(best_params))

    with joblib.parallel_backend('dask'):
        model.fit(X, y)

    return model, cv_results
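
A minimal usage sketch for the helper above (the estimator, data, and Dask client here are illustrative assumptions, not part of the original snippet):

from dask.distributed import Client
from scipy.stats import randint
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

client = Client()  # the joblib 'dask' backend used in cv() needs a running client
X, y = make_classification(n_samples=1000, random_state=0)
param_grid = {"n_estimators": randint(50, 500), "max_depth": [3, 5, None]}
model, cv_results = cv(RandomForestClassifier(random_state=0), param_grid, X, y)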
Example #2
def test_random_search_cv_results():
    # Make a dataset with a lot of noise to get various kind of prediction
    # errors across CV folds and parameter settings
    X, y = make_classification(n_samples=200,
                               n_features=100,
                               n_informative=3,
                               random_state=0)

    # scipy.stats distributions now support `seed`, but we still support
    # scipy 0.12, which doesn't. Hence the assertions in this test for
    # random_search should not depend on randomization.
    n_splits = 3
    n_search_iter = 30
    params = dict(C=expon(scale=10), gamma=expon(scale=0.1))
    random_search = dcv.RandomizedSearchCV(SVC(),
                                           n_iter=n_search_iter,
                                           cv=n_splits,
                                           iid=False,
                                           param_distributions=params,
                                           return_train_score=True)
    random_search.fit(X, y)
    random_search_iid = dcv.RandomizedSearchCV(SVC(),
                                               n_iter=n_search_iter,
                                               cv=n_splits,
                                               iid=True,
                                               param_distributions=params,
                                               return_train_score=True)
    random_search_iid.fit(X, y)

    param_keys = ('param_C', 'param_gamma')
    score_keys = ('mean_test_score', 'mean_train_score', 'rank_test_score',
                  'split0_test_score', 'split1_test_score',
                  'split2_test_score', 'split0_train_score',
                  'split1_train_score', 'split2_train_score', 'std_test_score',
                  'std_train_score', 'mean_fit_time', 'std_fit_time',
                  'mean_score_time', 'std_score_time')
    n_cand = n_search_iter

    for search, iid in zip((random_search, random_search_iid), (False, True)):
        assert iid == search.iid
        cv_results = search.cv_results_
        # Check results structure
        check_cv_results_array_types(cv_results, param_keys, score_keys)
        check_cv_results_keys(cv_results, param_keys, score_keys, n_cand)
        # For random_search, all the param array vals should be unmasked
        assert not (any(cv_results['param_C'].mask)
                    or any(cv_results['param_gamma'].mask))
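
The two check_* helpers are defined elsewhere in the test module; a rough sketch of what they assert (an assumption, modeled on the analogous scikit-learn test utilities):

import numpy as np

def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand):
    # every expected key is present, with one entry per candidate
    for key in param_keys + score_keys:
        assert len(cv_results[key]) == n_cand

def check_cv_results_array_types(cv_results, param_keys, score_keys):
    # parameters are masked arrays; scores are float arrays, ranks integer
    for key in param_keys:
        assert isinstance(cv_results[key], np.ma.MaskedArray)
    for key in score_keys:
        assert cv_results[key].dtype == (
            np.int32 if key.startswith('rank') else np.float64)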
Example #3
def test_search_basic(xy_classification):
    X, y = xy_classification
    param_grid = {"class_weight": [None, "balanced"]}

    a = dms.GridSearchCV(SVC(kernel="rbf", gamma=0.1), param_grid)
    a.fit(X, y)

    param_dist = {"C": stats.uniform}
    b = dms.RandomizedSearchCV(SVC(kernel="rbf", gamma=0.1), param_dist)
    b.fit(X, y)
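
The xy_classification argument is a pytest fixture from the dask-ml test suite; a minimal sketch of what it supplies (the exact sizes and chunking are assumptions):

import pytest
from dask_ml.datasets import make_classification

@pytest.fixture
def xy_classification():
    # small classification problem held in chunked Dask arrays
    X, y = make_classification(n_samples=100, n_features=5,
                               chunks=50, random_state=0)
    return X, y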
Example #4
def test_search_basic(xy_classification):
    X, y = xy_classification
    param_grid = {'class_weight': [None, 'balanced']}

    a = dms.GridSearchCV(SVC(kernel='rbf'), param_grid)
    a.fit(X, y)

    param_dist = {'C': stats.uniform}
    b = dms.RandomizedSearchCV(SVC(kernel='rbf'), param_dist)
    b.fit(X, y)
Example #5
def test_grid_search_with_multioutput_data():
    # Test search with multi-output estimator

    X, y = make_multilabel_classification(return_indicator=True,
                                          random_state=0)

    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(random_state=0, n_splits=3, shuffle=True)

    estimators = [
        DecisionTreeRegressor(random_state=0),
        DecisionTreeClassifier(random_state=0),
    ]

    scoring = sklearn.metrics.make_scorer(sklearn.metrics.roc_auc_score,
                                          average="weighted")
    # Test with grid search cv
    for est in estimators:
        grid_search = dcv.GridSearchCV(est,
                                       est_parameters,
                                       cv=cv,
                                       scoring=scoring)
        grid_search.fit(X, y)
        res_params = grid_search.cv_results_["params"]
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])

            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = scoring(est, X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    grid_search.cv_results_["split%d_test_score" % i][cand_i],
                )

    # Test with a randomized search
    for est in estimators:
        random_search = dcv.RandomizedSearchCV(est,
                                               est_parameters,
                                               cv=cv,
                                               n_iter=3,
                                               scoring=scoring)
        random_search.fit(X, y)
        res_params = random_search.cv_results_["params"]
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])

            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = scoring(est, X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    random_search.cv_results_["split%d_test_score" %
                                              i][cand_i],
                )
Example #6
def test_trivial_cv_results_attr():
    # Test search over a "grid" with only one point.
    # Non-regression test: grid_scores_ wouldn't be set by dcv.GridSearchCV.
    clf = MockClassifier()
    grid_search = dcv.GridSearchCV(clf, {"foo_param": [1]})
    grid_search.fit(X, y)
    assert hasattr(grid_search, "cv_results_")

    random_search = dcv.RandomizedSearchCV(clf, {"foo_param": [0]}, n_iter=1)
    random_search.fit(X, y)
    assert hasattr(random_search, "cv_results_")
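
MockClassifier and the module-level X, y used here and in the next example are test doubles defined elsewhere in the file; a minimal sketch (assumed, modeled on scikit-learn's own mock estimator):

import numpy as np
from sklearn.base import BaseEstimator

X = np.arange(100).reshape(10, 10)
y = np.array([0] * 5 + [1] * 5)

class MockClassifier(BaseEstimator):
    """Dummy classifier whose only hyperparameter is foo_param."""

    def __init__(self, foo_param=0):
        self.foo_param = foo_param

    def fit(self, X, y):
        return self

    def predict(self, X):
        return np.zeros(len(X), dtype=int)

    def score(self, X=None, y=None):
        # deterministic score, so searches rank foo_param > 1 highest
        return 1.0 if self.foo_param > 1 else 0.0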
Example #7
def test_pickle():
    # Test that a fit search can be pickled
    clf = MockClassifier()
    grid_search = dcv.GridSearchCV(clf, {"foo_param": [1, 2, 3]}, refit=True)
    grid_search.fit(X, y)
    grid_search_pickled = pickle.loads(pickle.dumps(grid_search))
    assert_array_almost_equal(grid_search.predict(X),
                              grid_search_pickled.predict(X))

    random_search = dcv.RandomizedSearchCV(clf, {"foo_param": [1, 2, 3]},
                                           refit=True,
                                           n_iter=3)
    random_search.fit(X, y)
    random_search_pickled = pickle.loads(pickle.dumps(random_search))
    assert_array_almost_equal(random_search.predict(X),
                              random_search_pickled.predict(X))
Example #8
def test_grid_search_with_multioutput_data():
    # Test search with multi-output estimator

    X, y = make_multilabel_classification(return_indicator=True,
                                          random_state=0)

    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(n_splits=3, shuffle=True, random_state=0)

    estimators = [
        DecisionTreeRegressor(random_state=0),
        DecisionTreeClassifier(random_state=0)
    ]

    # Test with grid search cv
    for est in estimators:
        grid_search = dcv.GridSearchCV(est, est_parameters, cv=cv)
        grid_search.fit(X, y)
        res_params = grid_search.cv_results_['params']
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])

            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    grid_search.cv_results_['split%d_test_score' % i][cand_i])

    # Test with a randomized search
    for est in estimators:
        random_search = dcv.RandomizedSearchCV(est,
                                               est_parameters,
                                               cv=cv,
                                               n_iter=3)
        random_search.fit(X, y)
        res_params = random_search.cv_results_['params']
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])

            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    random_search.cv_results_['split%d_test_score' %
                                              i][cand_i])
Example #9
def fit_knn(X, y, n_iter):
    """Fit a KNN model on geographical coordinates only"""
    columns_tf = make_column_transformer(("passthrough", ["X", "Y"]))
    model = make_pipeline(columns_tf, KNeighborsClassifier())

    param_space = {
        "kneighborsclassifier__n_neighbors": loguniform_int(1, 500),
        "kneighborsclassifier__weights": ["uniform", "distance"],
    }
    model = dcv.RandomizedSearchCV(model,
                                   param_space,
                                   scoring="neg_log_loss",
                                   n_iter=n_iter,
                                   random_state=42,
                                   cv=5)

    model.fit(X, y)
    return model
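
loguniform_int is a project helper rather than a scipy distribution; a plausible sketch (an assumption): sample log-uniformly, then round to an integer, exposing the rvs() interface that RandomizedSearchCV expects from a distribution.

from scipy import stats

class loguniform_int:
    """Log-uniform distribution over integers in [low, high]."""

    def __init__(self, low, high):
        self._dist = stats.loguniform(low, high)

    def rvs(self, *args, **kwargs):
        return self._dist.rvs(*args, **kwargs).astype(int)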
Example #10
def test_search_cv_results_rank_tie_breaking():
    X, y = make_blobs(n_samples=50, random_state=42)

    # The two C values are close enough to give similar models
    # which would result in a tie of their mean cv-scores
    param_grid = {"C": [1, 1.001, 0.001]}

    grid_search = dcv.GridSearchCV(
        SVC(gamma="auto"), param_grid=param_grid, return_train_score=True
    )
    random_search = dcv.RandomizedSearchCV(
        SVC(gamma="auto"),
        n_iter=3,
        param_distributions=param_grid,
        return_train_score=True,
    )

    for search in (grid_search, random_search):
        search.fit(X, y)
        cv_results = search.cv_results_
        # Check the tie-breaking strategy:
        # the first two candidates (C=1 and C=1.001) are close enough
        # that their mean scores should tie
        assert_almost_equal(
            cv_results["mean_test_score"][0], cv_results["mean_test_score"][1]
        )
        assert_almost_equal(
            cv_results["mean_train_score"][0], cv_results["mean_train_score"][1]
        )
        # The third candidate (C=0.001) may or may not tie with the first
        # two, so either outcome of these comparisons is tolerated:
        try:
            assert_almost_equal(
                cv_results["mean_test_score"][1], cv_results["mean_test_score"][2]
            )
        except AssertionError:
            pass
        try:
            assert_almost_equal(
                cv_results["mean_train_score"][1], cv_results["mean_train_score"][2]
            )
        except AssertionError:
            pass
        # 'min' rank should be assigned to the tied candidates
        assert_almost_equal(search.cv_results_["rank_test_score"], [1, 1, 3])
Example #11
def fit_linear(X, y, n_iter):
    """Fit a logistic regression model"""
    model = LogisticRegression(max_iter=500,
                               penalty="elasticnet",
                               solver="saga")
    model = make_pipeline(columns_transform(), model)

    param_space = {
        "logisticregression__l1_ratio": st.uniform(0, 1),
        "logisticregression__C": st.loguniform(1e-4, 1e4),
    }
    model = dcv.RandomizedSearchCV(model,
                                   param_space,
                                   scoring="neg_log_loss",
                                   n_iter=n_iter,
                                   random_state=42,
                                   cv=5)

    model.fit(X, y)
    return model
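
columns_transform() is shared by the three fit_* helpers here but defined elsewhere in the source project; a hypothetical placeholder (the real column lists are unknown) that scales numeric columns and one-hot encodes the rest:

from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def columns_transform():
    return make_column_transformer(
        (StandardScaler(), make_column_selector(dtype_include="number")),
        (OneHotEncoder(handle_unknown="ignore"),
         make_column_selector(dtype_include=object)),
    )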
Example #12
def fit_gbdt(X, y, n_iter):
    """Fit a gradient boosted decision trees model"""
    model = LGBMClassifier(n_estimators=2000, random_state=42)
    model = make_pipeline(columns_transform(), model)

    param_space = {
        "lgbmclassifier__min_data_in_leaf": loguniform_int(5, 500),
        "lgbmclassifier__num_leaves": loguniform_int(31, 500),
        "lgbmclassifier__reg_alpha": st.loguniform(1e-10, 1.0),
        "lgbmclassifier__reg_lambda": st.loguniform(1e-10, 1.0),
        "lgbmclassifier__learning_rate": st.loguniform(1e-4, 1e-1),
    }
    model = dcv.RandomizedSearchCV(model,
                                   param_space,
                                   scoring="neg_log_loss",
                                   n_iter=n_iter,
                                   random_state=42,
                                   cv=5)

    model.fit(X, y)
    return model
Example #13
def fit_mlp(X, y, n_iter):
    """Fit a simple multi-layer perceptron model"""
    model = MLPClassifier(random_state=42, early_stopping=True)
    model = make_pipeline(columns_transform(), model)

    # every combination of 1 or 2 hidden layers with 32-512 units per layer,
    # e.g. [64, 64] for two layers of 64 units
    layers_options = [
        [n_units] * n_layers
        for n_units, n_layers in it.product([32, 64, 128, 256, 512], [1, 2])
    ]
    param_space = {
        "mlpclassifier__hidden_layer_sizes": layers_options,
        "mlpclassifier__alpha": st.loguniform(1e-5, 1e-2),
        "mlpclassifier__learning_rate_init": st.loguniform(1e-4, 1e-1),
    }
    model = dcv.RandomizedSearchCV(model,
                                   param_space,
                                   scoring="neg_log_loss",
                                   n_iter=n_iter,
                                   random_state=42,
                                   cv=5)

    model.fit(X, y)
    return model
Example #14
def test_search_iid_param():
    # Test the IID parameter
    # noise-free simple 2d-data
    X, y = make_blobs(
        centers=[[0, 0], [1, 0], [0, 1], [1, 1]],
        random_state=0,
        cluster_std=0.1,
        shuffle=False,
        n_samples=80,
    )
    # split the dataset into two folds that are not iid:
    # the first contains data from all 4 blobs, the second from only two
    mask = np.ones(X.shape[0], dtype=bool)
    mask[np.where(y == 1)[0][::2]] = 0
    mask[np.where(y == 2)[0][::2]] = 0
    # this leads to perfect classification on one fold and a score of 1/3 on
    # the other
    # create "cv" for splits
    cv = [[mask, ~mask], [~mask, mask]]
    # once with iid=True (default)
    grid_search = dcv.GridSearchCV(SVC(gamma="auto"),
                                   param_grid={"C": [1, 10]},
                                   cv=cv,
                                   return_train_score=True)
    random_search = dcv.RandomizedSearchCV(
        SVC(gamma="auto"),
        n_iter=2,
        param_distributions={"C": [1, 10]},
        return_train_score=True,
        cv=cv,
    )
    for search in (grid_search, random_search):
        search.fit(X, y)
        assert search.iid

        test_cv_scores = np.array(
            [search.cv_results_["split%d_test_score" % s_i][0]
             for s_i in range(search.n_splits_)])
        test_mean = search.cv_results_["mean_test_score"][0]
        test_std = search.cv_results_["std_test_score"][0]

        train_cv_scores = np.array(
            [search.cv_results_["split%d_train_score" % s_i][0]
             for s_i in range(search.n_splits_)])
        train_mean = search.cv_results_["mean_train_score"][0]
        train_std = search.cv_results_["std_train_score"][0]

        # Test the first candidate
        assert search.cv_results_["param_C"][0] == 1
        assert_array_almost_equal(test_cv_scores, [1, 1.0 / 3.0])
        assert_array_almost_equal(train_cv_scores, [1, 1])

        # for first split, 1/4 of dataset is in test, for second 3/4.
        # take weighted average and weighted std
        expected_test_mean = 1 * 1.0 / 4.0 + 1.0 / 3.0 * 3.0 / 4.0
        expected_test_std = np.sqrt(1.0 / 4 * (expected_test_mean - 1)**2 +
                                    3.0 / 4 *
                                    (expected_test_mean - 1.0 / 3.0)**2)
        assert_almost_equal(test_mean, expected_test_mean)
        assert_almost_equal(test_std, expected_test_std)

        # For the train scores, we never take a weighted mean, whether
        # i.i.d. or not
        assert_almost_equal(train_mean, 1)
        assert_almost_equal(train_std, 0)

    # once with iid=False
    grid_search = dcv.GridSearchCV(
        SVC(gamma="auto"),
        param_grid={"C": [1, 10]},
        cv=cv,
        iid=False,
        return_train_score=True,
    )
    random_search = dcv.RandomizedSearchCV(
        SVC(gamma="auto"),
        n_iter=2,
        param_distributions={"C": [1, 10]},
        cv=cv,
        iid=False,
        return_train_score=True,
    )

    for search in (grid_search, random_search):
        search.fit(X, y)
        assert not search.iid

        test_cv_scores = np.array(
            [search.cv_results_["split%d_test_score" % s][0]
             for s in range(search.n_splits_)])
        test_mean = search.cv_results_["mean_test_score"][0]
        test_std = search.cv_results_["std_test_score"][0]

        train_cv_scores = np.array(
            [search.cv_results_["split%d_train_score" % s][0]
             for s in range(search.n_splits_)])
        train_mean = search.cv_results_["mean_train_score"][0]
        train_std = search.cv_results_["std_train_score"][0]

        assert search.cv_results_["param_C"][0] == 1
        # scores are the same as above
        assert_array_almost_equal(test_cv_scores, [1, 1.0 / 3.0])
        # Unweighted mean/std is used
        assert_almost_equal(test_mean, np.mean(test_cv_scores))
        assert_almost_equal(test_std, np.std(test_cv_scores))

        # For the train scores, we never take a weighted mean, whether
        # i.i.d. or not
        assert_almost_equal(train_mean, 1)
        assert_almost_equal(train_std, 0)
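
As a quick check of the arithmetic above: with 80 samples the two test folds hold 20 and 60 points, so the iid-weighted mean is 1 * 20/80 + (1/3) * 60/80 = 0.5, while the unweighted mean in the iid=False case is (1 + 1/3) / 2, roughly 0.667.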
Example #15
    'gbc__min_samples_leaf': min_samples_leafs,
}

# create model
gbc = Pipeline([('vect', CountVectorizer(analyzer=lambda x: x)),
                ('scale', MaxAbsScaler()),
                ('gbc', GradientBoostingClassifier(verbose=1,
                                                   random_state=7))])

print("Tuning Model")

# Cross validation
clf = dcv.RandomizedSearchCV(gbc,
                             tuned_parameters,
                             cv=5,
                             n_iter=n_iter,
                             refit=True,
                             scoring='accuracy',
                             cache_cv=True,
                             scheduler=client)

clf.fit(x_train, y_train)

# Print results

for param, score in zip(clf.cv_results_['params'],
                        clf.cv_results_['mean_test_score']):
    print(param, score)

print("the best model is" + str(clf.best_params_))

score = clf.best_estimator_.score(x_test, y_test)
print("Test set accuracy: " + str(score))
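
The client passed as scheduler= above comes from the truncated top of this script; a minimal sketch of that setup (an assumption):

from dask.distributed import Client

client = Client()  # local cluster; pass an address to use a remote scheduler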