def test_dense_vectorizer_pipeline_grid_selection():
    # raw documents data
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # simulate iterables
    train_data = iter(data[1:-1])
    test_data = iter([data[0], data[-1]])

    # label junk food as -1, the others as +1
    y = np.ones(len(data))
    y[:6] = -1
    y_train = y[1:-1]
    y_test = np.array([y[0], y[-1]])

    pipeline = Pipeline([('vect', CountVectorizer()),
                         ('svc', LinearSVC())])

    parameters = {
        'vect__analyzer__max_n': (1, 2),
        'svc__loss': ('l1', 'l2'),
    }

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # cross-validation doesn't work if the length of the data is not known,
    # hence use lists instead of iterators
    pred = grid_search.fit(list(train_data), y_train).predict(list(test_data))
    assert_array_equal(pred, y_test)

    # on this toy dataset every parameter combination converges to a 100%
    # accuracy model, so ties are broken by keeping the first candidate in
    # the grid: the unigram representation
    assert_equal(grid_search.best_score, 1.0)

    best_vectorizer = grid_search.best_estimator.named_steps['vect']
    assert_equal(best_vectorizer.analyzer.max_n, 1)
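# Hypothetical fixtures for the test above: `y[:6] = -1` implies exactly six
# junk-food documents. Any two small corpora with separable vocabularies
# would do; these stand-ins are only meant to make the snippet self-contained.
JUNK_FOOD_DOCS = (
    "the pizza pizza beer",
    "the pizza burger beer",
    "the the pizza beer beer",
    "the burger beer beer",
    "the coke burger coke",
    "the coke burger burger",
)
NOTJUNK_FOOD_DOCS = (
    "the salad celeri",
    "the salad salad sparkling water",
    "the the celeri celeri",
    "the tomato tomato salad water",
    "the tomato salad water",
)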
def test_grid_search_error():
    """Test that grid search raises a ValueError when X and y have
    inconsistent lengths"""
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    assert_raises(ValueError, cv.fit, X_[:180], y_)
def test_grid_search():
    """Test that the best estimator contains the right value for foo_param"""
    clf = MockClassifier()
    cross_validation = GridSearchCV(clf, {'foo_param': [1, 2, 3]})
    # make sure it selects the smallest parameter in case of ties
    assert_equal(cross_validation.fit(X, y).best_estimator.foo_param, 2)

    for i, foo_i in enumerate([1, 2, 3]):
        assert cross_validation.grid_scores_[i][0] == {'foo_param': foo_i}
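# Hypothetical stand-in for the MockClassifier fixture used above, which is
# defined elsewhere in the test module. Any estimator whose score is maximal
# for foo_param > 1 reproduces the tie between foo_param=2 and foo_param=3
# that the tie-breaking assertion relies on.
from scikits.learn.base import BaseEstimator

class MockClassifier(BaseEstimator):
    def __init__(self, foo_param=0):
        self.foo_param = foo_param

    def fit(self, X, y):
        return self

    def predict(self, T):
        return T.shape[0]

    def score(self, X=None, y=None):
        # foo_param=2 and foo_param=3 tie at 1.0; GridSearchCV keeps the
        # first (smallest) of the tied parameter settings
        return 1. if self.foo_param > 1 else 0.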
def test_grid_search_sparse():
    """Test that grid search works with both dense and sparse matrices"""
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator.C

    X_ = sp.csr_matrix(X_)
    clf = SparseLinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator.C

    assert np.mean(y_pred == y_pred2) >= .9
    assert_equal(C, C2)
def test_grid_search_sparse_score_func():
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    cv.fit(X_[:180], y_[:180], refit=False)
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator.C

    X_ = sp.csr_matrix(X_)
    clf = SparseLinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    cv.fit(X_[:180], y_[:180], refit=False)
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
def do_grid_search(X, Y, gs_params):
    """Given data (X, Y), perform a grid search over gs_params for a
    LogisticRegression pipeline step called 'logreg'"""
    lrpipe = Pipeline([('logreg', LogisticRegression())])
    gs = GridSearchCV(lrpipe, gs_params, n_jobs=-1)
    #print gs
    gs = gs.fit(X, Y)
    best_parameters, score = max(gs.grid_scores_, key=lambda x: x[1])
    logger.info("best_parameters: " + str(best_parameters))
    logger.info("expected score: " + str(score))
    return best_parameters
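# Hypothetical usage sketch for do_grid_search above (assuming `logger` is
# configured and X, Y are already loaded): the grid keys must be prefixed
# with the pipeline step name 'logreg', as required by Pipeline parameter
# routing.
gs_params = {
    'logreg__C': (0.1, 1.0, 10.0),
    'logreg__penalty': ('l1', 'l2'),
}
best_parameters = do_grid_search(X, Y, gs_params)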
def ParameterGridSearch(self, callback=None, nValidation=5):
    '''Grid search for the best C and gamma parameters for the RBF Kernel.
    The efficiency of the parameters is evaluated using nValidation-fold
    cross-validation of the training data.

    As this process is time consuming and parallelizable, a number of
    threads equal to the number of cores in the computer is used for the
    calculations
    '''
    from scikits.learn.grid_search import GridSearchCV
    from scikits.learn.metrics import precision_score
    from scikits.learn.cross_val import StratifiedKFold

    #
    # XXX: program crashes with >1 worker when running cpa.py
    #      No crash when running from classifier.py. Why?
    #
    n_workers = 1
    #try:
    #    from multiprocessing import cpu_count
    #    n_workers = cpu_count()
    #except:
    #    n_workers = 1

    # Define the parameter ranges for C and gamma and perform a grid search
    # for the optimal setting
    parameters = {
        'C': 2 ** np.arange(-5, 11, 2, dtype=float),
        'gamma': 2 ** np.arange(3, -11, -2, dtype=float),
    }
    clf = GridSearchCV(SVC(kernel='rbf'), parameters,
                       n_jobs=n_workers, score_func=precision_score)
    clf.fit(self.svm_train_values, self.svm_train_labels,
            cv=StratifiedKFold(self.svm_train_labels, nValidation))

    # Pick the best parameters as the ones with the maximum
    # cross-validation rate
    bestParameters = max(clf.grid_scores_, key=lambda a: a[1])
    bestC = bestParameters[0]['C']
    bestGamma = bestParameters[0]['gamma']
    logging.info('Optimal values: C=%s g=%s rate=%s' %
                 (bestC, bestGamma, bestParameters[1]))
    return bestC, bestGamma
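# Hypothetical follow-up sketch (assuming `classifier` is an instance of the
# class defining ParameterGridSearch above): feed the tuned values back into
# a final SVC trained on the full training set.
bestC, bestGamma = classifier.ParameterGridSearch(nValidation=5)
final_clf = SVC(kernel='rbf', C=bestC, gamma=bestGamma)
final_clf.fit(classifier.svm_train_values, classifier.svm_train_labels)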
def do_grid_search(X, Y, gs_params=None):
    """Given data (X, Y), perform a grid search over gs_params for an
    RBF-kernel SVC pipeline step called 'rbfsvm'"""
    svpipe = Pipeline([('rbfsvm', SVC())])
    if not gs_params:
        gs_params = {
            'rbfsvm__C': (1.5, 2, 5, 10, 20),
            'rbfsvm__gamma': (0.01, 0.1, 0.3, 0.6, 1, 1.5, 2, 5),
        }
    gs = GridSearchCV(svpipe, gs_params, n_jobs=-1)
    #print gs
    gs = gs.fit(X, Y)
    best_parameters, score = max(gs.grid_scores_, key=lambda x: x[1])
    logger.info("best_parameters: " + str(best_parameters))
    logger.info("expected score: " + str(score))
    return best_parameters
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print "done in %0.3fs" % (time() - t0)


################################################################################
# Train a SVM classification model

print "Fitting the classifier to the training set"
t0 = time()
param_grid = {
    'C': [1, 5, 10, 50, 100],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
                   fit_params={'class_weight': 'auto'})
clf = clf.fit(X_train_pca, y_train)
print "done in %0.3fs" % (time() - t0)
print "Best estimator found by grid search:"
print clf.best_estimator


################################################################################
# Quantitative evaluation of the model quality on the test set

print "Predicting the people names on the testing set"
t0 = time()
y_pred = clf.predict(X_test_pca)
print "done in %0.3fs" % (time() - t0)

print classification_report(y_test, y_pred, target_names=target_names)
y_test = dataset.target[split:]

# Build a vectorizer / classifier pipeline using the previous analyzer
pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=100000)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1000)),
])

parameters = {
    'vect__analyzer__max_n': (1, 2),
    'vect__max_df': (.95,),
}

# Fit the pipeline on the training set using grid search for the parameters
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(docs_train[:200], y_train[:200])

# Refit the best parameter set on the complete training set
clf = grid_search.best_estimator.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print metrics.classification_report(y_test, y_predicted,
                                    class_names=dataset.target_names)

# Plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
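# Hypothetical continuation: the snippet above computes the confusion matrix
# but stops before displaying it; a plausible follow-up in the same pylab
# style (assuming pylab is available, as in the other examples).
import pylab as pl
print cm
pl.matshow(cm)
pl.title('Confusion matrix')
pl.colorbar()
pl.show()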
digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

################################################################################
# Plot the PCA spectrum
pca.fit(X_digits)

pl.figure(1, figsize=(4, 3))
pl.clf()
pl.axes([.2, .2, .7, .7])
pl.plot(pca.explained_variance_, linewidth=2)
pl.axis('tight')
pl.xlabel('n_components')
pl.ylabel('explained_variance_')

################################################################################
# Prediction
scores = cross_val.cross_val_score(pipe, X_digits, y_digits, n_jobs=-1)

from scikits.learn.grid_search import GridSearchCV

n_components = [10, 15, 20, 30, 40, 50, 64]
Cs = np.logspace(-4, 4, 16)
estimator = GridSearchCV(pipe,
                         dict(pca__n_components=n_components,
                              logistic__C=Cs),
                         n_jobs=-1)
estimator.fit(X_digits, y_digits)
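# Hypothetical continuation: mark the number of PCA components retained by
# the best pipeline on the explained-variance spectrum plotted in figure 1
# above (uses the same pre-0.9 `best_estimator` attribute as the rest of
# the code).
pl.axvline(estimator.best_estimator.named_steps['pca'].n_components,
           linestyle=':', label='n_components chosen')
pl.legend(prop=dict(size=12))
pl.show()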
tuned_parameters = [{
    'kernel': ['rbf'],
    'gamma': [1e-3, 1e-4],
    'C': [1, 10, 100, 1000],
}, {
    'kernel': ['linear'],
    'C': [1, 10, 100, 1000],
}]

scores = [
    ('precision', precision_score),
    ('recall', recall_score),
]

for score_name, score_func in scores:
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=score_func)
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    y_true, y_pred = y[test], clf.predict(X[test])

    print "Classification report for the best estimator: "
    print clf.best_estimator
    print "Tuned for '%s' with optimal value: %0.3f" % (
        score_name, score_func(y_true, y_pred))
    print classification_report(y_true, y_pred)
    print "Grid scores:"
    pprint(clf.grid_scores_)
    print

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality
###############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(len(y), 2)  # cross-validation generator for model selection
ridge = BayesianRidge()
mem = Memory(cachedir='.', verbose=1)

# Ward agglomeration followed by BayesianRidge
A = grid_to_graph(n_x=size, n_y=size)
ward = WardAgglomeration(n_clusters=10, connectivity=A, memory=mem,
                         n_components=1)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
parameters = {'ward__n_clusters': [10, 20, 30]}
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, parameters, n_jobs=1)
clf.fit(X, y, cv=cv)  # set the best parameters
coef_ = clf.best_estimator.steps[-1][1].coef_
coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
f_regression = mem.cache(feature_selection.f_regression)  # caching function
anova = feature_selection.SelectPercentile(f_regression)
clf = Pipeline([('anova', anova), ('ridge', ridge)])
parameters = {'anova__percentile': [5, 10, 20]}
# Select the optimal percentage of features with grid search
clf = GridSearchCV(clf, parameters)
clf.fit(X, y, cv=cv)  # set the best parameters
coef_ = clf.best_estimator.steps[-1][1].coef_
coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_)
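# Hypothetical continuation, mirroring the agglomeration branch above:
# reshape the ANOVA-selected coefficients onto the 2D grid and show both
# coefficient maps (assumes pylab is available, as in the other examples).
import pylab as pl
coef_selection_ = coef_.reshape(size, size)
pl.matshow(coef_agglomeration_)
pl.title('Ward agglomeration')
pl.matshow(coef_selection_)
pl.title('Univariate ANOVA selection')
pl.show()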
from preprocess import InfinitivesExtractor, load_data

# Data attributes
targets = [0, 1, 2]
target_names = ["covered", "no alternance", "uncovered"]
target_colors = "rgb"

# Classification settings
pipeline = Pipeline([('extr', InfinitivesExtractor()),
                     ('svc', LinearSVC(multi_class=True))])
parameters = {
    'extr__count': (True, False),
    'extr__n': (3, 4, 5, 6),
    'svc__C': (1e-1, 1e-2, 1e9),
}
grid_search = GridSearchCV(pipeline, parameters)

print "Loading data..."
X, y = load_data()
print "Searching for the best model..."
t0 = time()
grid_search.fit(X, y)
print "Done in %0.3f" % (time() - t0)
print "Best score: %0.3f" % grid_search.best_score
clf = grid_search.best_estimator
print clf
yp = clf.predict(X)
print classification_report(y, yp, targets, target_names)
#pl.figure()
#pl.title("Classification rate for 3-fold stratified CV")