def cv(model, param_grid, X, y, n_iter=20):
    """Cross-validated randomized hyperparameter search."""
    print('..performing cv search...')

    # Define and run the randomized search
    random_search = dcv.RandomizedSearchCV(
        model, param_grid, n_iter=n_iter, cv=5,
        scoring=['f1_macro'],
        return_train_score=True,
        refit=False).fit(X, y)

    # Gather and rank the results
    cv_results = pd.DataFrame(random_search.cv_results_)
    cv_results.sort_values(by=['mean_test_f1_macro'], inplace=True,
                           ascending=False, ignore_index=True)
    print(cv_results.head())

    # Refit the model on the full data with the best configuration
    best_params = cv_results.loc[0, 'params']
    model = model.set_params(**best_params)
    print('Using configuration: {}'.format(best_params))
    with joblib.parallel_backend('dask'):
        model.fit(X, y)

    return model, cv_results
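A minimal usage sketch for the cv() helper above. The estimator, parameter grid, and toy dataset are illustrative assumptions rather than part of the original snippet, and a dask.distributed Client is assumed so the joblib 'dask' backend has somewhere to run.

# Hypothetical usage of cv(); the SVC, grid, and demo data are assumptions,
# not taken from the original code.
from dask.distributed import Client
from sklearn.datasets import make_classification
from sklearn.svm import SVC

client = Client()  # local cluster; needed by joblib.parallel_backend('dask')
X_demo, y_demo = make_classification(n_samples=500, n_features=20, random_state=0)
demo_grid = {"C": [0.1, 1, 10], "gamma": [0.01, 0.1, 1]}

best_model, results = cv(SVC(), demo_grid, X_demo, y_demo, n_iter=5)
print(results.loc[0, "mean_test_f1_macro"])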
def test_random_search_cv_results():
    # Make a dataset with a lot of noise to get various kinds of prediction
    # errors across CV folds and parameter settings
    X, y = make_classification(n_samples=200, n_features=100, n_informative=3,
                               random_state=0)

    # scipy.stats distributions now support `seed`, but we still support
    # scipy 0.12, which does not. Hence the assertions for random_search
    # alone should not depend on randomization.
    n_splits = 3
    n_search_iter = 30
    params = dict(C=expon(scale=10), gamma=expon(scale=0.1))

    random_search = dcv.RandomizedSearchCV(SVC(), n_iter=n_search_iter,
                                           cv=n_splits, iid=False,
                                           param_distributions=params,
                                           return_train_score=True)
    random_search.fit(X, y)

    random_search_iid = dcv.RandomizedSearchCV(SVC(), n_iter=n_search_iter,
                                               cv=n_splits, iid=True,
                                               param_distributions=params,
                                               return_train_score=True)
    random_search_iid.fit(X, y)

    param_keys = ('param_C', 'param_gamma')
    score_keys = ('mean_test_score', 'mean_train_score', 'rank_test_score',
                  'split0_test_score', 'split1_test_score',
                  'split2_test_score', 'split0_train_score',
                  'split1_train_score', 'split2_train_score',
                  'std_test_score', 'std_train_score',
                  'mean_fit_time', 'std_fit_time',
                  'mean_score_time', 'std_score_time')
    n_cand = n_search_iter

    for search, iid in zip((random_search, random_search_iid), (False, True)):
        assert iid == search.iid
        cv_results = search.cv_results_
        # Check results structure
        check_cv_results_array_types(cv_results, param_keys, score_keys)
        check_cv_results_keys(cv_results, param_keys, score_keys, n_cand)
        # For random_search, all the param array values should be unmasked
        assert not (any(cv_results['param_C'].mask) or
                    any(cv_results['param_gamma'].mask))
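The test above calls check_cv_results_array_types and check_cv_results_keys, which are not defined in these snippets. A rough sketch of what such helpers typically assert, inferred from the keys used above; this is an assumption, not the original helpers.

# Hypothetical versions of the two helpers used in the test above; the real
# implementations are not included in these snippets and may check more.
import numpy as np

def check_cv_results_array_types(cv_results, param_keys, score_keys):
    # Parameter columns come back as masked arrays, score columns as ndarrays,
    # and the rank column as an integer array.
    assert all(isinstance(cv_results[key], np.ma.MaskedArray) for key in param_keys)
    assert all(isinstance(cv_results[key], np.ndarray) for key in score_keys)
    assert np.issubdtype(cv_results["rank_test_score"].dtype, np.integer)

def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand):
    # Every expected key is present and each column has one entry per candidate.
    assert set(param_keys + score_keys + ("params",)) <= set(cv_results)
    assert all(cv_results[key].shape == (n_cand,) for key in param_keys + score_keys)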
def test_search_basic(xy_classification):
    X, y = xy_classification
    param_grid = {"class_weight": [None, "balanced"]}
    a = dms.GridSearchCV(SVC(kernel="rbf", gamma=0.1), param_grid)
    a.fit(X, y)

    param_dist = {"C": stats.uniform}
    b = dms.RandomizedSearchCV(SVC(kernel="rbf", gamma=0.1), param_dist)
    b.fit(X, y)
def test_search_basic(xy_classification):
    X, y = xy_classification
    param_grid = {'class_weight': [None, 'balanced']}
    a = dms.GridSearchCV(SVC(kernel='rbf'), param_grid)
    a.fit(X, y)

    param_dist = {'C': stats.uniform}
    b = dms.RandomizedSearchCV(SVC(kernel='rbf'), param_dist)
    b.fit(X, y)
def test_grid_search_with_multioutput_data():
    # Test search with a multi-output estimator
    X, y = make_multilabel_classification(return_indicator=True,
                                          random_state=0)
    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(random_state=0, n_splits=3, shuffle=True)
    estimators = [
        DecisionTreeRegressor(random_state=0),
        DecisionTreeClassifier(random_state=0),
    ]
    scoring = sklearn.metrics.make_scorer(sklearn.metrics.roc_auc_score,
                                          average="weighted")

    # Test with grid search CV
    for est in estimators:
        grid_search = dcv.GridSearchCV(est, est_parameters, cv=cv,
                                       scoring=scoring)
        grid_search.fit(X, y)
        res_params = grid_search.cv_results_["params"]
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])
            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = scoring(est, X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    grid_search.cv_results_["split%d_test_score" % i][cand_i],
                )

    # Test with a randomized search
    for est in estimators:
        random_search = dcv.RandomizedSearchCV(est, est_parameters, cv=cv,
                                               n_iter=3, scoring=scoring)
        random_search.fit(X, y)
        res_params = random_search.cv_results_["params"]
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])
            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = scoring(est, X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    random_search.cv_results_["split%d_test_score" % i][cand_i],
                )
def test_trivial_cv_results_attr():
    # Test search over a "grid" with only one point.
    # Non-regression test: grid_scores_ wouldn't be set by dcv.GridSearchCV.
    clf = MockClassifier()
    grid_search = dcv.GridSearchCV(clf, {"foo_param": [1]})
    grid_search.fit(X, y)
    assert hasattr(grid_search, "cv_results_")

    random_search = dcv.RandomizedSearchCV(clf, {"foo_param": [0]}, n_iter=1)
    random_search.fit(X, y)
    assert hasattr(random_search, "cv_results_")
def test_pickle():
    # Test that a fit search can be pickled
    clf = MockClassifier()
    grid_search = dcv.GridSearchCV(clf, {"foo_param": [1, 2, 3]}, refit=True)
    grid_search.fit(X, y)
    grid_search_pickled = pickle.loads(pickle.dumps(grid_search))
    assert_array_almost_equal(grid_search.predict(X),
                              grid_search_pickled.predict(X))

    random_search = dcv.RandomizedSearchCV(clf, {"foo_param": [1, 2, 3]},
                                           refit=True, n_iter=3)
    random_search.fit(X, y)
    random_search_pickled = pickle.loads(pickle.dumps(random_search))
    assert_array_almost_equal(random_search.predict(X),
                              random_search_pickled.predict(X))
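The two tests above rely on a MockClassifier (and module-level X, y) defined elsewhere in the test module. A hypothetical minimal version that would satisfy how it is used here; the real class may differ.

# Hypothetical minimal MockClassifier matching the usage in the tests above;
# not the original definition.
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin

class MockClassifier(BaseEstimator, ClassifierMixin):
    """Dummy classifier whose only tunable knob is `foo_param`."""

    def __init__(self, foo_param=0):
        self.foo_param = foo_param

    def fit(self, X, y):
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        # Constant predictions are enough for the pickling/attribute tests.
        return np.zeros(len(X), dtype=int)

    def score(self, X=None, y=None):
        # Make the score depend on foo_param so the search has something to rank.
        return 1.0 if self.foo_param > 1 else 0.0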
def test_grid_search_with_multioutput_data():
    # Test search with a multi-output estimator
    X, y = make_multilabel_classification(return_indicator=True,
                                          random_state=0)
    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(n_splits=3, shuffle=True, random_state=0)
    estimators = [
        DecisionTreeRegressor(random_state=0),
        DecisionTreeClassifier(random_state=0),
    ]

    # Test with grid search CV
    for est in estimators:
        grid_search = dcv.GridSearchCV(est, est_parameters, cv=cv)
        grid_search.fit(X, y)
        res_params = grid_search.cv_results_['params']
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])
            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    grid_search.cv_results_['split%d_test_score' % i][cand_i])

    # Test with a randomized search
    for est in estimators:
        random_search = dcv.RandomizedSearchCV(est, est_parameters, cv=cv,
                                               n_iter=3)
        random_search.fit(X, y)
        res_params = random_search.cv_results_['params']
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])
            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    random_search.cv_results_['split%d_test_score' % i][cand_i])
def fit_knn(X, y, n_iter):
    """Fit a KNN model on geographical coordinates only."""
    columns_tf = make_column_transformer(("passthrough", ["X", "Y"]))
    model = make_pipeline(columns_tf, KNeighborsClassifier())
    param_space = {
        "kneighborsclassifier__n_neighbors": loguniform_int(1, 500),
        "kneighborsclassifier__weights": ["uniform", "distance"],
    }
    model = dcv.RandomizedSearchCV(model, param_space, scoring="neg_log_loss",
                                   n_iter=n_iter, random_state=42, cv=5)
    model.fit(X, y)
    return model
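fit_knn (and fit_gbdt further down) samples integer hyperparameters with loguniform_int, which is not defined in these snippets. A minimal sketch of one plausible implementation, assuming it should behave like scipy's log-uniform distribution rounded to integers.

# Hypothetical implementation of loguniform_int; not the original helper.
import numpy as np
import scipy.stats as st

class loguniform_int:
    """Integer-valued log-uniform distribution for RandomizedSearchCV."""

    def __init__(self, low, high):
        self._dist = st.loguniform(low, high)

    def rvs(self, size=None, random_state=None):
        # ParameterSampler only requires an rvs() method on the distribution.
        return np.round(self._dist.rvs(size=size,
                                       random_state=random_state)).astype(int)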
def test_search_cv_results_rank_tie_breaking():
    X, y = make_blobs(n_samples=50, random_state=42)

    # The two C values are close enough to give similar models
    # which would result in a tie of their mean cv-scores
    param_grid = {"C": [1, 1.001, 0.001]}

    grid_search = dcv.GridSearchCV(
        SVC(gamma="auto"), param_grid=param_grid, return_train_score=True
    )
    random_search = dcv.RandomizedSearchCV(
        SVC(gamma="auto"),
        n_iter=3,
        param_distributions=param_grid,
        return_train_score=True,
    )

    for search in (grid_search, random_search):
        search.fit(X, y)
        cv_results = search.cv_results_
        # Check tie breaking strategy -
        # Check that there is a tie in the mean scores between
        # candidates 1 and 2 alone
        assert_almost_equal(
            cv_results["mean_test_score"][0], cv_results["mean_test_score"][1]
        )
        assert_almost_equal(
            cv_results["mean_train_score"][0], cv_results["mean_train_score"][1]
        )
        try:
            assert_almost_equal(
                cv_results["mean_test_score"][1], cv_results["mean_test_score"][2]
            )
        except AssertionError:
            pass
        try:
            assert_almost_equal(
                cv_results["mean_train_score"][1], cv_results["mean_train_score"][2]
            )
        except AssertionError:
            pass
        # 'min' rank should be assigned to the tied candidates
        assert_almost_equal(search.cv_results_["rank_test_score"], [1, 1, 3])
def fit_linear(X, y, n_iter):
    """Fit a logistic regression model."""
    model = LogisticRegression(max_iter=500, penalty="elasticnet",
                               solver="saga")
    model = make_pipeline(columns_transform(), model)
    param_space = {
        "logisticregression__l1_ratio": st.uniform(0, 1),
        "logisticregression__C": st.loguniform(1e-4, 1e4),
    }
    model = dcv.RandomizedSearchCV(model, param_space, scoring="neg_log_loss",
                                   n_iter=n_iter, random_state=42, cv=5)
    model.fit(X, y)
    return model
def fit_gbdt(X, y, n_iter):
    """Fit a gradient boosted decision trees model."""
    model = LGBMClassifier(n_estimators=2000, random_state=42)
    model = make_pipeline(columns_transform(), model)
    param_space = {
        "lgbmclassifier__min_data_in_leaf": loguniform_int(5, 500),
        "lgbmclassifier__num_leaves": loguniform_int(31, 500),
        "lgbmclassifier__reg_alpha": st.loguniform(1e-10, 1.0),
        "lgbmclassifier__reg_lambda": st.loguniform(1e-10, 1.0),
        "lgbmclassifier__learning_rate": st.loguniform(1e-4, 1e-1),
    }
    model = dcv.RandomizedSearchCV(model, param_space, scoring="neg_log_loss",
                                   n_iter=n_iter, random_state=42, cv=5)
    model.fit(X, y)
    return model
def fit_mlp(X, y, n_iter):
    """Fit a simple multi-layer perceptron model."""
    model = MLPClassifier(random_state=42, early_stopping=True)
    model = make_pipeline(columns_transform(), model)
    layers_options = [
        [n_units] * n_layers
        for n_units, n_layers in it.product([32, 64, 128, 256, 512], [1, 2])
    ]
    param_space = {
        "mlpclassifier__hidden_layer_sizes": layers_options,
        "mlpclassifier__alpha": st.loguniform(1e-5, 1e-2),
        "mlpclassifier__learning_rate_init": st.loguniform(1e-4, 1e-1),
    }
    model = dcv.RandomizedSearchCV(model, param_space, scoring="neg_log_loss",
                                   n_iter=n_iter, random_state=42, cv=5)
    model.fit(X, y)
    return model
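fit_linear, fit_gbdt, and fit_mlp all call columns_transform(), which is not shown in these snippets. A hypothetical stand-in under the assumption that it builds a simple ColumnTransformer; the column names and preprocessing steps here are placeholders, and the real function is likely richer.

# Hypothetical stand-in for columns_transform(); not the original definition.
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def columns_transform(numeric_cols=("X", "Y"), categorical_cols=()):
    # Scale numeric columns and one-hot encode any categorical columns.
    transformers = [(StandardScaler(), list(numeric_cols))]
    if categorical_cols:
        transformers.append((OneHotEncoder(handle_unknown="ignore"),
                             list(categorical_cols)))
    return make_column_transformer(*transformers, remainder="drop")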
def test_search_iid_param():
    # Test the IID parameter
    # noise-free simple 2d-data
    X, y = make_blobs(
        centers=[[0, 0], [1, 0], [0, 1], [1, 1]],
        random_state=0,
        cluster_std=0.1,
        shuffle=False,
        n_samples=80,
    )

    # split the dataset into two folds that are not iid:
    # the first fold contains data from all 4 blobs, the second only from two
    mask = np.ones(X.shape[0], dtype=bool)
    mask[np.where(y == 1)[0][::2]] = 0
    mask[np.where(y == 2)[0][::2]] = 0
    # this leads to perfect classification on one fold and a score of 1/3 on
    # the other

    # create "cv" for splits
    cv = [[mask, ~mask], [~mask, mask]]

    # once with iid=True (default)
    grid_search = dcv.GridSearchCV(SVC(gamma="auto"), param_grid={"C": [1, 10]},
                                   cv=cv, return_train_score=True)
    random_search = dcv.RandomizedSearchCV(
        SVC(gamma="auto"),
        n_iter=2,
        param_distributions={"C": [1, 10]},
        return_train_score=True,
        cv=cv,
    )
    for search in (grid_search, random_search):
        search.fit(X, y)
        assert search.iid

        test_cv_scores = np.array(
            [search.cv_results_["split%d_test_score" % s_i][0]
             for s_i in range(search.n_splits_)])
        test_mean = search.cv_results_["mean_test_score"][0]
        test_std = search.cv_results_["std_test_score"][0]

        train_cv_scores = np.array(
            [search.cv_results_["split%d_train_score" % s_i][0]
             for s_i in range(search.n_splits_)])
        train_mean = search.cv_results_["mean_train_score"][0]
        train_std = search.cv_results_["std_train_score"][0]

        # Test the first candidate
        assert search.cv_results_["param_C"][0] == 1
        assert_array_almost_equal(test_cv_scores, [1, 1.0 / 3.0])
        assert_array_almost_equal(train_cv_scores, [1, 1])

        # for the first split, 1/4 of the dataset is in the test set,
        # for the second, 3/4; take the weighted average and weighted std
        expected_test_mean = 1 * 1.0 / 4.0 + 1.0 / 3.0 * 3.0 / 4.0
        expected_test_std = np.sqrt(1.0 / 4 * (expected_test_mean - 1) ** 2 +
                                    3.0 / 4 * (expected_test_mean - 1.0 / 3.0) ** 2)
        assert_almost_equal(test_mean, expected_test_mean)
        assert_almost_equal(test_std, expected_test_std)

        # For the train scores, we do not take a weighted mean irrespective of
        # i.i.d. or not
        assert_almost_equal(train_mean, 1)
        assert_almost_equal(train_std, 0)

    # once with iid=False
    grid_search = dcv.GridSearchCV(
        SVC(gamma="auto"),
        param_grid={"C": [1, 10]},
        cv=cv,
        iid=False,
        return_train_score=True,
    )
    random_search = dcv.RandomizedSearchCV(
        SVC(gamma="auto"),
        n_iter=2,
        param_distributions={"C": [1, 10]},
        cv=cv,
        iid=False,
        return_train_score=True,
    )

    for search in (grid_search, random_search):
        search.fit(X, y)
        assert not search.iid

        test_cv_scores = np.array(
            [search.cv_results_["split%d_test_score" % s][0]
             for s in range(search.n_splits_)])
        test_mean = search.cv_results_["mean_test_score"][0]
        test_std = search.cv_results_["std_test_score"][0]

        train_cv_scores = np.array(
            [search.cv_results_["split%d_train_score" % s][0]
             for s in range(search.n_splits_)])
        train_mean = search.cv_results_["mean_train_score"][0]
        train_std = search.cv_results_["std_train_score"][0]

        assert search.cv_results_["param_C"][0] == 1
        # scores are the same as above
        assert_array_almost_equal(test_cv_scores, [1, 1.0 / 3.0])
        # the unweighted mean/std is used
        assert_almost_equal(test_mean, np.mean(test_cv_scores))
        assert_almost_equal(test_std, np.std(test_cv_scores))

        # For the train scores, we do not take a weighted mean irrespective of
        # i.i.d. or not
        assert_almost_equal(train_mean, 1)
        assert_almost_equal(train_std, 0)
    'gbc__min_samples_leaf': min_samples_leafs,
}

# create model
gbc = Pipeline([('vect', CountVectorizer(analyzer=lambda x: x)),
                ('scale', MaxAbsScaler()),
                ('gbc', GradientBoostingClassifier(verbose=1, random_state=7))])

print("Tuning Model")

# Cross validation
clf = dcv.RandomizedSearchCV(gbc, tuned_parameters, cv=5, n_iter=n_iter,
                             refit=True, scoring='accuracy',
                             cache_cv=True, scheduler=client)
clf.fit(x_train, y_train)

# Print results
for param, score in zip(clf.cv_results_['params'],
                        clf.cv_results_['mean_test_score']):
    print(param, score)
print("the best model is " + str(clf.best_params_))

score = clf.best_estimator_.score(x_test, y_test)
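The fragment above passes a dask distributed client as scheduler=client without showing where it comes from. A minimal sketch of the usual setup; whether the cluster is local or remote, and its address, are assumptions.

# Hypothetical setup for the `client` object used above; the scheduler
# address is an assumption, not taken from the original code.
from dask.distributed import Client

client = Client()  # or Client("tcp://scheduler-host:8786") for a remote cluster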