def test_is_classifier():
    svc = SVC()
    assert is_classifier(svc)
    assert is_classifier(GridSearchCV(svc, {'C': [0.1, 1]}))
    assert is_classifier(Pipeline([('svc', svc)]))
    assert is_classifier(
        Pipeline([('svc_cv', GridSearchCV(svc, {'C': [0.1, 1]}))]))
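# Added note: a minimal sketch of why the assertions above hold, assuming
# mrex mirrors the scikit-learn layout and its ``_estimator_type`` tag,
# which is what ``is_classifier`` inspects; meta-estimators such as
# GridSearchCV and Pipeline report the type of the estimator they wrap.
from mrex.base import is_classifier
from mrex.model_selection import GridSearchCV
from mrex.svm import SVC

svc = SVC()
print(svc._estimator_type)                                  # 'classifier'
print(GridSearchCV(svc, {'C': [0.1, 1]})._estimator_type)   # 'classifier'
print(is_classifier(GridSearchCV(svc, {'C': [0.1, 1]})))    # True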
def test_ovo_gridsearch():
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    Cs = [0.1, 0.5, 0.8]
    cv = GridSearchCV(ovo, {'estimator__C': Cs})
    cv.fit(iris.data, iris.target)
    best_C = cv.best_estimator_.estimators_[0].C
    assert best_C in Cs
def test_ecoc_gridsearch():
    ecoc = OutputCodeClassifier(LinearSVC(random_state=0),
                                random_state=0)
    Cs = [0.1, 0.5, 0.8]
    cv = GridSearchCV(ecoc, {'estimator__C': Cs})
    cv.fit(iris.data, iris.target)
    best_C = cv.best_estimator_.estimators_[0].C
    assert best_C in Cs
def _test_ridge_cv_normalize(filter_):
    ridge_cv = RidgeCV(normalize=True, cv=3)
    ridge_cv.fit(filter_(10. * X_diabetes), y_diabetes)

    gs = GridSearchCV(Ridge(normalize=True, solver='sparse_cg'), cv=3,
                      param_grid={'alpha': ridge_cv.alphas})
    gs.fit(filter_(10. * X_diabetes), y_diabetes)
    assert gs.best_estimator_.alpha == ridge_cv.alpha_
def test_kde_pipeline_gridsearch():
    # Test that KDE plays nicely in pipelines and grid searches.
    X, _ = make_blobs(cluster_std=.1, random_state=1,
                      centers=[[0, 1], [1, 0], [0, 0]])
    pipe1 = make_pipeline(StandardScaler(with_mean=False, with_std=False),
                          KernelDensity(kernel="gaussian"))
    params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10])
    search = GridSearchCV(pipe1, param_grid=params)
    search.fit(X)
    assert search.best_params_['kerneldensity__bandwidth'] == .1
def test_gridsearch_pipeline():
    # Test if we can do a grid-search to find parameters to separate
    # circles with a perceptron model.
    X, y = make_circles(n_samples=400, factor=.3, noise=.05,
                        random_state=0)
    kpca = KernelPCA(kernel="rbf", n_components=2)
    pipeline = Pipeline([("kernel_pca", kpca),
                         ("Perceptron", Perceptron(max_iter=5))])
    param_grid = dict(kernel_pca__gamma=2. ** np.arange(-2, 2))
    grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
    grid_search.fit(X, y)
    assert grid_search.best_score_ == 1
def test_imputation_pipeline_grid_search():
    # Test imputation within a pipeline + grid search.
    X = sparse_random_matrix(100, 100, density=0.10)
    missing_values = X.data[0]

    pipeline = Pipeline([
        ('imputer', SimpleImputer(missing_values=missing_values)),
        ('tree', tree.DecisionTreeRegressor(random_state=0))])

    parameters = {'imputer__strategy': ["mean", "median", "most_frequent"]}

    Y = sparse_random_matrix(100, 1, density=0.10).toarray()
    gs = GridSearchCV(pipeline, parameters)
    gs.fit(X, Y)
def test_gridsearch():
    """Check GridSearch support."""
    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                        ('gnb', clf3)],
                            voting='soft')

    params = {'lr__C': [1.0, 100.0],
              'voting': ['soft', 'hard'],
              'weights': [[0.5, 0.5, 0.5], [1.0, 0.5, 0.5]]}

    grid = GridSearchCV(estimator=eclf, param_grid=params)
    grid.fit(iris.data, iris.target)
def test_multi_output_predict_proba():
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
    param = {'loss': ('hinge', 'log', 'modified_huber')}

    # inner function for custom scoring
    def custom_scorer(estimator, X, y):
        if hasattr(estimator, "predict_proba"):
            return 1.0
        else:
            return 0.0

    grid_clf = GridSearchCV(sgd_linear_clf, param_grid=param,
                            scoring=custom_scorer, cv=3)
    multi_target_linear = MultiOutputClassifier(grid_clf)
    multi_target_linear.fit(X, y)
    multi_target_linear.predict_proba(X)

    # SGDClassifier defaults to loss='hinge', which is not a probabilistic
    # loss function; therefore it does not expose a predict_proba method.
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    multi_target_linear.fit(X, y)
    err_msg = "The base estimator should implement predict_proba method"
    with pytest.raises(ValueError, match=err_msg):
        multi_target_linear.predict_proba(X)
def test_ridgecv_sample_weight():
    rng = np.random.RandomState(0)
    alphas = (0.1, 1.0, 10.0)

    # There are different algorithms for n_samples > n_features
    # and the opposite, so test them both.
    for n_samples, n_features in ((6, 5), (5, 10)):
        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)

        cv = KFold(5)
        ridgecv = RidgeCV(alphas=alphas, cv=cv)
        ridgecv.fit(X, y, sample_weight=sample_weight)

        # Check using GridSearchCV directly
        parameters = {'alpha': alphas}
        gs = GridSearchCV(Ridge(), parameters, cv=cv)
        gs.fit(X, y, sample_weight=sample_weight)

        assert ridgecv.alpha_ == gs.best_estimator_.alpha
        assert_array_almost_equal(ridgecv.coef_, gs.best_estimator_.coef_)
def test_raises_on_score_list():
    # Test that when a list of scores is returned, we raise proper errors.
    X, y = make_blobs(random_state=0)
    f1_scorer_no_average = make_scorer(f1_score, average=None)
    clf = DecisionTreeClassifier()
    assert_raises(ValueError, cross_val_score, clf, X, y,
                  scoring=f1_scorer_no_average)
    grid_search = GridSearchCV(clf, scoring=f1_scorer_no_average,
                               param_grid={'max_depth': [1, 2]})
    assert_raises(ValueError, grid_search.fit, X, y)
def test_gridsearch():
    # Check that base trees can be grid-searched.
    # AdaBoost classification
    boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2),
                  'algorithm': ('SAMME', 'SAMME.R')}
    clf = GridSearchCV(boost, parameters)
    clf.fit(iris.data, iris.target)

    # AdaBoost regression
    boost = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
                              random_state=0)
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2)}
    clf = GridSearchCV(boost, parameters)
    clf.fit(boston.data, boston.target)
def test_set_params_passes_all_parameters():
    # Make sure all parameters are passed together to set_params
    # of the nested estimator. Regression test for #9944.
    class TestDecisionTree(DecisionTreeClassifier):
        def set_params(self, **kwargs):
            super().set_params(**kwargs)
            # expected_kwargs is in test scope
            assert kwargs == expected_kwargs
            return self

    expected_kwargs = {'max_depth': 5, 'min_samples_leaf': 2}
    for est in [Pipeline([('estimator', TestDecisionTree())]),
                GridSearchCV(TestDecisionTree(), {})]:
        est.set_params(estimator__max_depth=5,
                       estimator__min_samples_leaf=2)
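# Added note: a minimal sketch of the behaviour the test above locks in,
# assuming mrex follows scikit-learn-style parameter routing; both nested
# keys reach the inner estimator's set_params in a single call, so
# interdependent parameters can be validated together rather than one at
# a time.
from mrex.model_selection import GridSearchCV
from mrex.tree import DecisionTreeClassifier

gs = GridSearchCV(DecisionTreeClassifier(), {})
gs.set_params(estimator__max_depth=5, estimator__min_samples_leaf=2)
print(gs.estimator.get_params()['max_depth'])         # 5
print(gs.estimator.get_params()['min_samples_leaf'])  # 2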
def test_check_scoring_gridsearchcv():
    # Test that check_scoring works on GridSearchCV and pipeline.
    # Slightly redundant non-regression test.

    grid = GridSearchCV(LinearSVC(), param_grid={'C': [.1, 1]}, cv=3)
    scorer = check_scoring(grid, "f1")
    assert isinstance(scorer, _PredictScorer)

    pipe = make_pipeline(LinearSVC())
    scorer = check_scoring(pipe, "f1")
    assert isinstance(scorer, _PredictScorer)

    # Check that cross_val_score definitely calls the scorer and doesn't
    # make any assumptions about the estimator apart from it having a fit
    # method.
    scores = cross_val_score(EstimatorWithFit(), [[1], [2], [3]], [1, 0, 1],
                             scoring=DummyScorer(), cv=3)
    assert_array_equal(scores, 1)
def test_ovr_multilabel_predict_proba():
    base_clf = MultinomialNB(alpha=1)
    for au in (False, True):
        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=3,
                                                       length=50,
                                                       allow_unlabeled=au,
                                                       random_state=0)
        X_train, Y_train = X[:80], Y[:80]
        X_test = X[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)

        # Estimator with only a decision function.
        decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
        assert not hasattr(decision_only, 'predict_proba')

        # Estimator with predict_proba disabled, depending on parameters.
        decision_only = OneVsRestClassifier(svm.SVC(probability=False))
        assert not hasattr(decision_only, 'predict_proba')
        decision_only.fit(X_train, Y_train)
        assert not hasattr(decision_only, 'predict_proba')
        assert hasattr(decision_only, 'decision_function')

        # Estimator whose predict_proba only becomes available after fitting.
        gs = GridSearchCV(svm.SVC(probability=False),
                          param_grid={'probability': [True]})
        proba_after_fit = OneVsRestClassifier(gs)
        assert not hasattr(proba_after_fit, 'predict_proba')
        proba_after_fit.fit(X_train, Y_train)
        assert hasattr(proba_after_fit, 'predict_proba')

        Y_pred = clf.predict(X_test)
        Y_proba = clf.predict_proba(X_test)

        # predict assigns a label if the probability that the sample has
        # the label is greater than 0.5.
        pred = Y_proba > .5
        assert_array_equal(pred, Y_pred)
    def __init__(self, name, construct, skip_methods=(),
                 fit_args=make_classification()):
        self.name = name
        self.construct = construct
        self.fit_args = fit_args
        self.skip_methods = skip_methods


DELEGATING_METAESTIMATORS = [
    DelegatorData('Pipeline', lambda est: Pipeline([('est', est)])),
    DelegatorData('GridSearchCV',
                  lambda est: GridSearchCV(
                      est, param_grid={'param': [5]}, cv=2),
                  skip_methods=['score']),
    DelegatorData('RandomizedSearchCV',
                  lambda est: RandomizedSearchCV(
                      est, param_distributions={'param': [5]},
                      cv=2, n_iter=1),
                  skip_methods=['score']),
    DelegatorData('RFE', RFE,
                  skip_methods=['transform', 'inverse_transform']),
    DelegatorData('RFECV', RFECV,
                  skip_methods=['transform', 'inverse_transform']),
    DelegatorData('BaggingClassifier', BaggingClassifier,
                  skip_methods=['transform', 'inverse_transform', 'score',
                                'predict_proba', 'predict_log_proba',
                                'predict'])
C_OPTIONS = [1, 10, 100, 1000]
param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=7), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS
    },
    {
        'reduce_dim': [SelectKBest(chi2)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS
    },
]
reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']

grid = GridSearchCV(pipe, n_jobs=1, param_grid=param_grid)
X, y = load_digits(return_X_y=True)
grid.fit(X, y)

mean_scores = np.array(grid.cv_results_['mean_test_score'])
# scores are in the order of param_grid iteration, which is alphabetical
mean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))
# select score for best C
mean_scores = mean_scores.max(axis=0)
bar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *
               (len(reducer_labels) + 1) + .5)

plt.figure()
COLORS = 'bgrcmyk'
for i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):
    plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])
X, y = iris.data, iris.target

# This dataset is way too high-dimensional. Better do PCA:
pca = PCA(n_components=2)

# Maybe some original features were good, too?
selection = SelectKBest(k=1)

# Build an estimator from PCA and univariate selection:
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

# Use combined features to transform the dataset:
X_features = combined_features.fit(X, y).transform(X)
print("Combined space has", X_features.shape[1], "features")

svm = SVC(kernel="linear")

# Do a grid search over k, n_components and C:
pipeline = Pipeline([("features", combined_features), ("svm", svm)])

param_grid = dict(features__pca__n_components=[1, 2, 3],
                  features__univ_select__k=[1, 2],
                  svm__C=[0.1, 1, 10])

grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
grid_search.fit(X, y)
print(grid_search.best_estimator_)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(
        SVC(), tuned_parameters, scoring='%s_macro' % score
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()
"min_samples_split": sp_randint(2, 11), "bootstrap": [True, False], "criterion": ["gini", "entropy"]} # run randomized search n_iter_search = 20 random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search) start = time() random_search.fit(X, y) print("RandomizedSearchCV took %.2f seconds for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) report(random_search.cv_results_) # use a full grid over all parameters param_grid = {"max_depth": [3, None], "max_features": [1, 3, 10], "min_samples_split": [2, 3, 10], "bootstrap": [True, False], "criterion": ["gini", "entropy"]} # run grid search grid_search = GridSearchCV(clf, param_grid=param_grid) start = time() grid_search.fit(X, y) print("GridSearchCV took %.2f seconds for %d candidate parameter settings." % (time() - start, len(grid_search.cv_results_['params']))) report(grid_search.cv_results_)
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))


###############################################################################
# Using the prediction pipeline in a grid search
###############################################################################
# Grid search can also be performed on the different preprocessing steps
# defined in the ``ColumnTransformer`` object, together with the classifier's
# hyperparameters as part of the ``Pipeline``.
# We will search for both the imputer strategy of the numeric preprocessing
# and the regularization parameter of the logistic regression using
# :class:`mrex.model_selection.GridSearchCV`.

param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'classifier__C': [0.1, 1.0, 10, 100],
}

grid_search = GridSearchCV(clf, param_grid, cv=10)
grid_search.fit(X_train, y_train)

print(("best logistic regression from grid search: %.3f"
       % grid_search.score(X_test, y_test)))
def test_set_params_updates_valid_params():
    # Check that set_params tries to set SVC().C, not
    # DecisionTreeClassifier().C
    gscv = GridSearchCV(DecisionTreeClassifier(), {})
    gscv.set_params(estimator=SVC(), estimator__C=42.0)
    assert gscv.estimator.C == 42.0
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__max_iter': (10, 50, 80),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(data.data, data.target)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
def check_gridsearch(name):
    forest = FOREST_CLASSIFIERS[name]()
    clf = GridSearchCV(forest, {'n_estimators': (1, 2), 'max_depth': (1, 2)})
    clf.fit(iris.data, iris.target)
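# Added note: a helper like check_gridsearch is typically driven once per
# registered forest classifier; the pytest parametrization below is an
# illustrative assumption, not taken from the source.
import pytest


@pytest.mark.parametrize('name', FOREST_CLASSIFIERS)
def test_gridsearch_forest(name):
    check_gridsearch(name)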
from mrex.datasets import load_digits
from mrex.neighbors import KernelDensity
from mrex.decomposition import PCA
from mrex.model_selection import GridSearchCV

# load the data
digits = load_digits()

# project the 64-dimensional data to a lower dimension
pca = PCA(n_components=15, whiten=False)
data = pca.fit_transform(digits.data)

# use grid search cross-validation to optimize the bandwidth
params = {'bandwidth': np.logspace(-1, 1, 20)}
grid = GridSearchCV(KernelDensity(), params)
grid.fit(data)

print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

# use the best estimator to compute the kernel density estimate
kde = grid.best_estimator_

# sample 44 new points from the data
new_data = kde.sample(44, random_state=0)
new_data = pca.inverse_transform(new_data)

# turn data into a 4x11 grid
new_data = new_data.reshape((4, 11, -1))
real_data = digits.data[:44].reshape((4, 11, -1))
# #############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(2)  # cross-validation generator for model selection
ridge = BayesianRidge()
cachedir = tempfile.mkdtemp()
mem = Memory(location=cachedir, verbose=1)

# Ward agglomeration followed by BayesianRidge
connectivity = grid_to_graph(n_x=size, n_y=size)
ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
                            memory=mem)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
f_regression = mem.cache(feature_selection.f_regression)  # caching function
anova = feature_selection.SelectPercentile(f_regression)
clf = Pipeline([('anova', anova), ('ridge', ridge)])
# Select the optimal percentage of features with grid search
clf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]}, cv=cv)
clf.fit(X, y)  # set the best parameters
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))
coef_selection_ = coef_.reshape(size, size)
colors = ['navy', 'cyan', 'darkorange']
lw = 2

for clf, cs, X, y in clf_sets:
    # set up the plot for each regressor
    fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10))

    for k, train_size in enumerate(np.linspace(0.3, 0.7, 3)[::-1]):
        param_grid = dict(C=cs)
        # To get a nice curve, we need a large number of iterations to
        # reduce the variance
        grid = GridSearchCV(clf, refit=False, param_grid=param_grid,
                            cv=ShuffleSplit(train_size=train_size,
                                            test_size=.3, n_splits=250,
                                            random_state=1))
        grid.fit(X, y)
        scores = grid.cv_results_['mean_test_score']

        scales = [(1, 'No scaling'),
                  ((n_samples * train_size), '1/n_samples'),
                  ]

        for ax, (scaler, name) in zip(axes, scales):
            ax.set_xlabel('C')
            ax.set_ylabel('CV Score')
            grid_cs = cs * float(scaler)  # scale the C's
from mrex.linear_model import LassoCV
from mrex.linear_model import Lasso
from mrex.model_selection import KFold
from mrex.model_selection import GridSearchCV

X, y = datasets.load_diabetes(return_X_y=True)
X = X[:150]
y = y[:150]

lasso = Lasso(random_state=0, max_iter=10000)
alphas = np.logspace(-4, -0.5, 30)

tuned_parameters = [{'alpha': alphas}]
n_folds = 5

clf = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=False)
clf.fit(X, y)
scores = clf.cv_results_['mean_test_score']
scores_std = clf.cv_results_['std_test_score']

plt.figure().set_size_inches(8, 6)
plt.semilogx(alphas, scores)

# plot error lines showing +/- std. errors of the scores
std_error = scores_std / np.sqrt(n_folds)

plt.semilogx(alphas, scores + std_error, 'b--')
plt.semilogx(alphas, scores - std_error, 'b--')

# alpha=0.2 controls the translucency of the fill color
plt.fill_between(alphas, scores + std_error, scores - std_error, alpha=0.2)
# #############################################################################
# Generate sample data
X = 5 * rng.rand(10000, 1)
y = np.sin(X).ravel()

# Add noise to targets
y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))

X_plot = np.linspace(0, 5, 100000)[:, None]

# #############################################################################
# Fit regression model
train_size = 100
svr = GridSearchCV(SVR(kernel='rbf', gamma=0.1),
                   param_grid={"C": [1e0, 1e1, 1e2, 1e3],
                               "gamma": np.logspace(-2, 2, 5)})

kr = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1),
                  param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
                              "gamma": np.logspace(-2, 2, 5)})

t0 = time.time()
svr.fit(X[:train_size], y[:train_size])
svr_fit = time.time() - t0
print("SVR complexity and bandwidth selected and model fitted in %.3f s"
      % svr_fit)
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
               edgecolors='k')
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
               alpha=0.6, edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())

    # iterate over classifiers
    for est_idx, (name, (estimator, param_grid)) in \
            enumerate(zip(names, classifiers)):
        ax = axes[ds_cnt, est_idx + 1]

        clf = GridSearchCV(estimator=estimator, param_grid=param_grid)
        with ignore_warnings(category=ConvergenceWarning):
            clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print('%s: %.2f' % (name, score))

        # plot the decision boundary. For that, we will assign a color to
        # each point in the mesh [x_min, x_max]*[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)