def test_ovo_gridsearch(): ovo = OneVsOneClassifier(LinearSVC(random_state=0)) Cs = [0.1, 0.5, 0.8] cv = GridSearchCV(ovo, {'estimator__C': Cs}) cv.fit(iris.data, iris.target) best_C = cv.best_estimator_.estimators_[0].C assert best_C in Cs
def test_ecoc_gridsearch(): ecoc = OutputCodeClassifier(LinearSVC(random_state=0), random_state=0) Cs = [0.1, 0.5, 0.8] cv = GridSearchCV(ecoc, {'estimator__C': Cs}) cv.fit(iris.data, iris.target) best_C = cv.best_estimator_.estimators_[0].C assert best_C in Cs
def _test_ridge_cv_normalize(filter_): ridge_cv = RidgeCV(normalize=True, cv=3) ridge_cv.fit(filter_(10. * X_diabetes), y_diabetes) gs = GridSearchCV(Ridge(normalize=True, solver='sparse_cg'), cv=3, param_grid={'alpha': ridge_cv.alphas}) gs.fit(filter_(10. * X_diabetes), y_diabetes) assert gs.best_estimator_.alpha == ridge_cv.alpha_
def test_kde_pipeline_gridsearch(): # test that kde plays nice in pipelines and grid-searches X, _ = make_blobs(cluster_std=.1, random_state=1, centers=[[0, 1], [1, 0], [0, 0]]) pipe1 = make_pipeline(StandardScaler(with_mean=False, with_std=False), KernelDensity(kernel="gaussian")) params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10]) search = GridSearchCV(pipe1, param_grid=params) search.fit(X) assert search.best_params_['kerneldensity__bandwidth'] == .1
def test_gridsearch_pipeline(): # Test if we can do a grid-search to find parameters to separate # circles with a perceptron model. X, y = make_circles(n_samples=400, factor=.3, noise=.05, random_state=0) kpca = KernelPCA(kernel="rbf", n_components=2) pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron(max_iter=5))]) param_grid = dict(kernel_pca__gamma=2.**np.arange(-2, 2)) grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid) grid_search.fit(X, y) assert grid_search.best_score_ == 1
def test_imputation_pipeline_grid_search(): # Test imputation within a pipeline + gridsearch. X = sparse_random_matrix(100, 100, density=0.10) missing_values = X.data[0] pipeline = Pipeline([('imputer', SimpleImputer(missing_values=missing_values)), ('tree', tree.DecisionTreeRegressor(random_state=0))]) parameters = {'imputer__strategy': ["mean", "median", "most_frequent"]} Y = sparse_random_matrix(100, 1, density=0.10).toarray() gs = GridSearchCV(pipeline, parameters) gs.fit(X, Y)
def test_gridsearch(): """Check GridSearch support.""" clf1 = LogisticRegression(random_state=1) clf2 = RandomForestClassifier(random_state=1) clf3 = GaussianNB() eclf = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft') params = {'lr__C': [1.0, 100.0], 'voting': ['soft', 'hard'], 'weights': [[0.5, 0.5, 0.5], [1.0, 0.5, 0.5]]} grid = GridSearchCV(estimator=eclf, param_grid=params) grid.fit(iris.data, iris.target)
def test_gridsearch(): # Check that base trees can be grid-searched. # AdaBoost classification boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier()) parameters = { 'n_estimators': (1, 2), 'base_estimator__max_depth': (1, 2), 'algorithm': ('SAMME', 'SAMME.R') } clf = GridSearchCV(boost, parameters) clf.fit(iris.data, iris.target) # AdaBoost regression boost = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), random_state=0) parameters = {'n_estimators': (1, 2), 'base_estimator__max_depth': (1, 2)} clf = GridSearchCV(boost, parameters) clf.fit(boston.data, boston.target)
def test_ridgecv_sample_weight(): rng = np.random.RandomState(0) alphas = (0.1, 1.0, 10.0) # There are different algorithms for n_samples > n_features # and the opposite, so test them both. for n_samples, n_features in ((6, 5), (5, 10)): y = rng.randn(n_samples) X = rng.randn(n_samples, n_features) sample_weight = 1.0 + rng.rand(n_samples) cv = KFold(5) ridgecv = RidgeCV(alphas=alphas, cv=cv) ridgecv.fit(X, y, sample_weight=sample_weight) # Check using GridSearchCV directly parameters = {'alpha': alphas} gs = GridSearchCV(Ridge(), parameters, cv=cv) gs.fit(X, y, sample_weight=sample_weight) assert ridgecv.alpha_ == gs.best_estimator_.alpha assert_array_almost_equal(ridgecv.coef_, gs.best_estimator_.coef_)
for clf, cs, X, y in clf_sets: # set up the plot for each regressor fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10)) for k, train_size in enumerate(np.linspace(0.3, 0.7, 3)[::-1]): param_grid = dict(C=cs) # To get nice curve, we need a large number of iterations to # reduce the variance grid = GridSearchCV(clf, refit=False, param_grid=param_grid, cv=ShuffleSplit(train_size=train_size, test_size=.3, n_splits=250, random_state=1)) grid.fit(X, y) scores = grid.cv_results_['mean_test_score'] scales = [ (1, 'No scaling'), ((n_samples * train_size), '1/n_samples'), ] for ax, (scaler, name) in zip(axes, scales): ax.set_xlabel('C') ax.set_ylabel('CV Score') grid_cs = cs * float(scaler) # scale the C's ax.semilogx(grid_cs, scores, label="fraction %.2f" % train_size, color=colors[k],
# Append classifier to preprocessing pipeline. # Now we have a full prediction pipeline. clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', LogisticRegression())]) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) clf.fit(X_train, y_train) print("model score: %.3f" % clf.score(X_test, y_test)) ############################################################################### # Using the prediction pipeline in a grid search ############################################################################### # Grid search can also be performed on the different preprocessing steps # defined in the ``ColumnTransformer`` object, together with the classifier's # hyperparameters as part of the ``Pipeline``. # We will search for both the imputer strategy of the numeric preprocessing # and the regularization parameter of the logistic regression using # :class:`mrex.model_selection.GridSearchCV`. param_grid = { 'preprocessor__num__imputer__strategy': ['mean', 'median'], 'classifier__C': [0.1, 1.0, 10, 100], } grid_search = GridSearchCV(clf, param_grid, cv=10) grid_search.fit(X_train, y_train) print(("best logistic regression from grid search: %.3f" % grid_search.score(X_test, y_test)))
from mrex.datasets import load_digits from mrex.neighbors import KernelDensity from mrex.decomposition import PCA from mrex.model_selection import GridSearchCV # load the data digits = load_digits() # project the 64-dimensional data to a lower dimension pca = PCA(n_components=15, whiten=False) data = pca.fit_transform(digits.data) # use grid search cross-validation to optimize the bandwidth params = {'bandwidth': np.logspace(-1, 1, 20)} grid = GridSearchCV(KernelDensity(), params) grid.fit(data) print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth)) # use the best estimator to compute the kernel density estimate kde = grid.best_estimator_ # sample 44 new points from the data new_data = kde.sample(44, random_state=0) new_data = pca.inverse_transform(new_data) # turn data into a 4x11 grid new_data = new_data.reshape((4, 11, -1)) real_data = digits.data[:44].reshape((4, 11, -1)) # plot real digits and resampled digits
def check_gridsearch(name): forest = FOREST_CLASSIFIERS[name]() clf = GridSearchCV(forest, {'n_estimators': (1, 2), 'max_depth': (1, 2)}) clf.fit(iris.data, iris.target)
# TASK: Build a vectorizer / classifier pipeline that filters out tokens # that are too rare or too frequent pipeline = Pipeline([ ('vect', TfidfVectorizer(min_df=3, max_df=0.95)), ('clf', LinearSVC(C=1000)), ]) # TASK: Build a grid search to find out whether unigrams or bigrams are # more useful. # Fit the pipeline on the training set using grid search for the parameters parameters = { 'vect__ngram_range': [(1, 1), (1, 2)], } grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1) grid_search.fit(docs_train, y_train) # TASK: print the mean and std for each candidate along with the parameter # settings for all the candidates explored by grid search. n_candidates = len(grid_search.cv_results_['params']) for i in range(n_candidates): print(i, 'params - %s; mean - %0.2f; std - %0.2f' % (grid_search.cv_results_['params'][i], grid_search.cv_results_['mean_test_score'][i], grid_search.cv_results_['std_test_score'][i])) # TASK: Predict the outcome on the testing set and store it in a variable # named y_predicted y_predicted = grid_search.predict(docs_test) # Print the classification report
from mrex.linear_model import Lasso from mrex.model_selection import KFold from mrex.model_selection import GridSearchCV X, y = datasets.load_diabetes(return_X_y=True) X = X[:150] y = y[:150] lasso = Lasso(random_state=0, max_iter=10000) alphas = np.logspace(-4, -0.5, 30) tuned_parameters = [{'alpha': alphas}] n_folds = 5 clf = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=False) clf.fit(X, y) scores = clf.cv_results_['mean_test_score'] scores_std = clf.cv_results_['std_test_score'] plt.figure().set_size_inches(8, 6) plt.semilogx(alphas, scores) # plot error lines showing +/- std. errors of the scores std_error = scores_std / np.sqrt(n_folds) plt.semilogx(alphas, scores + std_error, 'b--') plt.semilogx(alphas, scores - std_error, 'b--') # alpha=0.2 controls the translucency of the fill color plt.fill_between(alphas, scores + std_error, scores - std_error, alpha=0.2) plt.ylabel('CV score +/- std error')
rng = np.random.RandomState(0) # Generate sample data X = 15 * rng.rand(100, 1) y = np.sin(X).ravel() y += 3 * (0.5 - rng.rand(X.shape[0])) # add noise # Fit KernelRidge with parameter selection based on 5-fold cross validation param_grid = {"alpha": [1e0, 1e-1, 1e-2, 1e-3], "kernel": [ExpSineSquared(l, p) for l in np.logspace(-2, 2, 10) for p in np.logspace(0, 2, 10)]} kr = GridSearchCV(KernelRidge(), param_grid=param_grid) stime = time.time() kr.fit(X, y) print("Time for KRR fitting: %.3f" % (time.time() - stime)) gp_kernel = ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)) \ + WhiteKernel(1e-1) gpr = GaussianProcessRegressor(kernel=gp_kernel) stime = time.time() gpr.fit(X, y) print("Time for GPR fitting: %.3f" % (time.time() - stime)) # Predict using kernel ridge X_plot = np.linspace(0, 20, 10000)[:, None] stime = time.time() y_kr = kr.predict(X_plot) print("Time for KRR prediction: %.3f" % (time.time() - stime))
# Arrays to store scores non_nested_scores = np.zeros(NUM_TRIALS) nested_scores = np.zeros(NUM_TRIALS) # Loop for each trial for i in range(NUM_TRIALS): # Choose cross-validation techniques for the inner and outer loops, # independently of the dataset. # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc. inner_cv = KFold(n_splits=4, shuffle=True, random_state=i) outer_cv = KFold(n_splits=4, shuffle=True, random_state=i) # Non_nested parameter search and scoring clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv) clf.fit(X_iris, y_iris) non_nested_scores[i] = clf.best_score_ # Nested CV with parameter optimization nested_score = cross_val_score(clf, X=X_iris, y=y_iris, cv=outer_cv) nested_scores[i] = nested_score.mean() score_difference = non_nested_scores - nested_scores print("Average difference of {:6f} with std. dev. of {:6f}.".format( score_difference.mean(), score_difference.std())) # Plot scores on each trial for nested and non-nested CV plt.figure() plt.subplot(211) non_nested_scores_line, = plt.plot(non_nested_scores, color='r')
# 'tfidf__use_idf': (True, False), # 'tfidf__norm': ('l1', 'l2'), 'clf__max_iter': (20, ), 'clf__alpha': (0.00001, 0.000001), 'clf__penalty': ('l2', 'elasticnet'), # 'clf__max_iter': (10, 50, 80), } if __name__ == "__main__": # multiprocessing requires the fork to happen in a __main__ protected # block # find the best parameters for both the feature extraction and the # classifier grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1) print("Performing grid search...") print("pipeline:", [name for name, _ in pipeline.steps]) print("parameters:") pprint(parameters) t0 = time() grid_search.fit(data.data, data.target) print("done in %0.3fs" % (time() - t0)) print() print("Best score: %0.3f" % grid_search.best_score_) print("Best parameters set:") best_parameters = grid_search.best_estimator_.get_params() for param_name in sorted(parameters.keys()): print("\t%s: %r" % (param_name, best_parameters[param_name]))
X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) print("done in %0.3fs" % (time() - t0)) # ############################################################################# # Train a SVM classification model print("Fitting the classifier to the training set") t0 = time() param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } clf = GridSearchCV( SVC(kernel='rbf', class_weight='balanced'), param_grid ) clf = clf.fit(X_train_pca, y_train) print("done in %0.3fs" % (time() - t0)) print("Best estimator found by grid search:") print(clf.best_estimator_) # ############################################################################# # Quantitative evaluation of the model quality on the test set print("Predicting people's names on the test set") t0 = time() y_pred = clf.predict(X_test_pca) print("done in %0.3fs" % (time() - t0)) print(classification_report(y_test, y_pred, target_names=target_names)) print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
# The scorers can be either be one of the predefined metric strings or a scorer # callable, like the one returned by make_scorer scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)} # Setting refit='AUC', refits an estimator on the whole dataset with the # parameter setting that has the best cross-validated AUC score. # That estimator is made available at ``gs.best_estimator_`` along with # parameters like ``gs.best_score_``, ``gs.best_params_`` and # ``gs.best_index_`` gs = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid={'min_samples_split': range(2, 403, 10)}, scoring=scoring, refit='AUC', return_train_score=True) gs.fit(X, y) results = gs.cv_results_ ############################################################################### # Plotting the result # ------------------- plt.figure(figsize=(13, 13)) plt.title("GridSearchCV evaluating using multiple scorers simultaneously", fontsize=16) plt.xlabel("min_samples_split") plt.ylabel("Score") ax = plt.gca() ax.set_xlim(0, 402)
"min_samples_split": sp_randint(2, 11), "bootstrap": [True, False], "criterion": ["gini", "entropy"]} # run randomized search n_iter_search = 20 random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search) start = time() random_search.fit(X, y) print("RandomizedSearchCV took %.2f seconds for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) report(random_search.cv_results_) # use a full grid over all parameters param_grid = {"max_depth": [3, None], "max_features": [1, 3, 10], "min_samples_split": [2, 3, 10], "bootstrap": [True, False], "criterion": ["gini", "entropy"]} # run grid search grid_search = GridSearchCV(clf, param_grid=param_grid) start = time() grid_search.fit(X, y) print("GridSearchCV took %.2f seconds for %d candidate parameter settings." % (time() - start, len(grid_search.cv_results_['params']))) report(grid_search.cv_results_)
# Set the parameters by cross-validation tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}] scores = ['precision', 'recall'] for score in scores: print("# Tuning hyper-parameters for %s" % score) print() clf = GridSearchCV( SVC(), tuned_parameters, scoring='%s_macro' % score ) clf.fit(X_train, y_train) print("Best parameters set found on development set:") print() print(clf.best_params_) print() print("Grid scores on development set:") print() means = clf.cv_results_['mean_test_score'] stds = clf.cv_results_['std_test_score'] for mean, std, params in zip(means, stds, clf.cv_results_['params']): print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)) print() print("Detailed classification report:")
# Fit regression model train_size = 100 svr = GridSearchCV(SVR(kernel='rbf', gamma=0.1), param_grid={ "C": [1e0, 1e1, 1e2, 1e3], "gamma": np.logspace(-2, 2, 5) }) kr = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), param_grid={ "alpha": [1e0, 0.1, 1e-2, 1e-3], "gamma": np.logspace(-2, 2, 5) }) t0 = time.time() svr.fit(X[:train_size], y[:train_size]) svr_fit = time.time() - t0 print("SVR complexity and bandwidth selected and model fitted in %.3f s" % svr_fit) t0 = time.time() kr.fit(X[:train_size], y[:train_size]) kr_fit = time.time() - t0 print("KRR complexity and bandwidth selected and model fitted in %.3f s" % kr_fit) sv_ratio = svr.best_estimator_.support_.shape[0] / train_size print("Support vector ratio: %.3f" % sv_ratio) t0 = time.time() y_svr = svr.predict(X_plot)
for s in shrinkages ] # under the ground-truth model, which we would not have access to in real # settings real_cov = np.dot(coloring_matrix.T, coloring_matrix) emp_cov = empirical_covariance(X_train) loglik_real = -log_likelihood(emp_cov, linalg.inv(real_cov)) # ############################################################################# # Compare different approaches to setting the parameter # GridSearch for an optimal shrinkage coefficient tuned_parameters = [{'shrinkage': shrinkages}] cv = GridSearchCV(ShrunkCovariance(), tuned_parameters) cv.fit(X_train) # Ledoit-Wolf optimal shrinkage coefficient estimate lw = LedoitWolf() loglik_lw = lw.fit(X_train).score(X_test) # OAS coefficient estimate oa = OAS() loglik_oa = oa.fit(X_train).score(X_test) # ############################################################################# # Plot results fig = plt.figure() plt.title("Regularized covariance: likelihood and shrinkage coefficient") plt.xlabel('Regularization parameter: shrinkage coefficient') plt.ylabel('Error: negative log-likelihood on test data')