def test2():
    parameters = {'kernel': ['cat', ['rbf', 'poly']],
                  'd': ['int', [1, 3]],
                  'C': ['float', [1, 10]]}

    def scoring_function(x):
        # Constant score: this test only smoke-tests the search loop.
        return 0.5

    # `data` was not defined in the original test; load a small dataset so the
    # smoke test can actually run.
    data = load_iris()

    search = GPSearchCV(parameters, estimator=scoring_function, n_iter=20)
    search.fit(X=data.data, y=data.target)
def test_n_iter_smaller_than_n_init():
    # Failing test that happens when n_iter < n_init.
    iris = load_iris()
    X, y = iris.data, iris.target

    parameters = {"max_depth": ['int', [3, 3]],
                  "max_features": ['int', [1, 4]],
                  "min_samples_split": ['int', [1, 11]],
                  "min_samples_leaf": ['int', [1, 11]],
                  "bootstrap": ['cat', [True, False]],
                  "criterion": ['cat', ["gini", "entropy"]]}

    clf = RandomForestClassifier()
    grid_search = GPSearchCV(clf, parameters, n_iter=5, n_init=20)
    grid_search.fit(X, y)
def test1():
    digits = load_digits()
    X, y = digits.data, digits.target
    clf = RandomForestClassifier(n_estimators=20)

    # Specify parameters and distributions to sample from
    parameters = {"max_depth": ['int', [3, 3]],
                  "max_features": ['int', [1, 11]],
                  "min_samples_split": ['int', [1, 11]],
                  "min_samples_leaf": ['int', [1, 11]],
                  "bootstrap": ['cat', [True, False]],
                  "criterion": ['cat', ["gini", "entropy"]]}

    search = GPSearchCV(clf, parameters, n_iter=20)
    search.fit(X, y)
    print(search)
    print(search.best_params_)
    print(search.best_estimator_)
def test3():
    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    # Load some categories from the training set
    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]
    # Uncomment the following to do the analysis on all the categories
    # categories = None

    print("Loading 20 newsgroups dataset for categories:")
    print(categories)

    data = fetch_20newsgroups(subset='train', categories=categories)
    print("%d documents" % len(data.filenames))
    print("%d categories" % len(data.target_names))
    print()

    # Define a pipeline combining a text feature extractor with a simple
    # classifier
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ])

    # Uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    parameters = {
        'vect__max_df': ['float', [0.5, 1.]],
        # 'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ['cat', [(1, 1), (1, 2)]],  # unigrams or bigrams
        # 'tfidf__use_idf': (True, False),
        # 'tfidf__norm': ('l1', 'l2'),
        'clf__alpha': ['float', [0.000001, 0.00001]],
        'clf__penalty': ['cat', ['l2', 'elasticnet']],
        # 'clf__n_iter': (10, 50, 80),
    }

    search = GPSearchCV(pipeline, parameters, n_iter=20)
    search.fit(X=data.data, y=data.target)
def test_grid_search_no_score():
    # Test grid search on a classifier that has no score function.
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    clf_no_score = LinearSVCNoScore(random_state=0)

    gp_search = GPSearchCV(clf, {'C': Cs}, scoring='accuracy')
    gp_search.fit(X, y)

    grid_search_no_score = GPSearchCV(clf_no_score, {'C': Cs},
                                      scoring='accuracy')
    # Smoke test the grid search
    grid_search_no_score.fit(X, y)

    # Check that best params are equal
    assert_equal(grid_search_no_score.best_params_, gp_search.best_params_)
    # Check that we can call score and that it gives the correct result
    assert_equal(gp_search.score(X, y), grid_search_no_score.score(X, y))

    # Giving no scoring function raises an error
    grid_search_no_score = GPSearchCV(clf_no_score, {'C': Cs})
    assert_raise_message(TypeError, "no scoring", grid_search_no_score.fit,
                         [[1]])
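# LinearSVCNoScore is referenced above but not defined in this section. A
# minimal sketch of what the test assumes -- a LinearSVC whose score attribute
# is unusable, mirroring the helper used in scikit-learn's own grid-search
# tests -- is given below; the repository's actual implementation may differ.
class LinearSVCNoScore(LinearSVC):
    """LinearSVC that deliberately exposes no usable score method."""

    @property
    def score(self):
        # Accessing .score fails, so the search must rely on an explicit scorer.
        raise AttributeError("LinearSVCNoScore has no score method")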
def test_gp_search():
    clf = MockClassifier()
    gp_search = GPSearchCV(clf, {'foo_param': ['int', [1, 3]]}, verbose=3)

    # Make sure it selects the smallest parameter in case of ties
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    gp_search.fit(X, y)
    sys.stdout = old_stdout
    assert_equal(gp_search.best_estimator_.foo_param, 2)

    for i, foo_i in enumerate([1, 2, 3]):
        assert_true(gp_search.scores_[i][0] == {'foo_param': foo_i})

    # Smoke test the score etc:
    gp_search.score(X, y)
    gp_search.predict_proba(X)
    gp_search.decision_function(X)
    gp_search.transform(X)

    # Test exception handling on scoring
    gp_search.scoring = 'sklearn'
    assert_raises(ValueError, gp_search.fit, X, y)
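# MockClassifier is not defined in this section either. The sketch below shows
# the kind of stub these tests assume, modelled on scikit-learn's own
# grid-search test helper: the score depends only on foo_param and is 1.0
# whenever foo_param > 1, so 2 and 3 tie and the search should pick the
# smallest. Names and behaviour here are assumptions, not the repository's
# actual helper.
class MockClassifier(object):
    """Dummy classifier whose CV score depends only on foo_param."""

    def __init__(self, foo_param=0):
        self.foo_param = foo_param

    def fit(self, X, y):
        return self

    def predict(self, X):
        return [0] * len(X)

    # The smoke tests above also call these; trivial aliases are enough here.
    predict_proba = predict
    decision_function = predict
    transform = predict

    def score(self, X=None, y=None):
        # foo_param values 2 and 3 tie at 1.0, so the search should return 2.
        return 1.0 if self.foo_param > 1 else 0.0

    def get_params(self, deep=False):
        return {'foo_param': self.foo_param}

    def set_params(self, **params):
        self.foo_param = params['foo_param']
        return self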
# Split the dataset into two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = {'gamma': ['float', [1e-4, 1e-3]],
                    'C': ['float', [1, 1000]]}

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GPSearchCV(SVC(C=1), tuned_parameters, cv=5, scoring=score,
                     verbose=True)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, cv_scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, cv_scores.std() * 2, params))
    print()
    print("Detailed classification report:")
    print()
def test_gp_search():
    # Test that the best estimator contains the right value for foo_param
    clf = MockDiscreteClassifier()
    gp_search = GPSearchCV(clf, {'foo_param': ['int', [1, 3]]})

    # Make sure it selects the smallest parameter in case of ties
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    gp_search.fit(X, y)
    sys.stdout = old_stdout
    assert_equal(gp_search.best_estimator_.foo_param, 2)

    clf = MockContinuousClassifier()
    gp_search = GPSearchCV(clf, {'foo_param': ['float', [-3, 3]]}, verbose=3)

    # Make sure it converges near the optimum of the continuous score
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    gp_search.fit(X, y)
    sys.stdout = old_stdout
    assert_almost_equal(gp_search.best_estimator_.foo_param, 0, decimal=1)

    # Smoke test the score etc:
    gp_search.score(X, y)
    gp_search.predict_proba(X)
    gp_search.decision_function(X)
    gp_search.transform(X)

    # Test exception handling on scoring
    gp_search.scoring = 'sklearn'
    assert_raises(ValueError, gp_search.fit, X, y)
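# MockDiscreteClassifier and MockContinuousClassifier are likewise not defined
# in this section. The sketches below are plausible stubs consistent with the
# assertions above (best foo_param is 2 in the discrete case, close to 0 in the
# continuous case) and reuse the MockClassifier sketch given earlier; the
# repository's actual helpers may differ.
class MockDiscreteClassifier(MockClassifier):
    """Same behaviour as MockClassifier: score is 1.0 once foo_param > 1."""
    pass


class MockContinuousClassifier(MockClassifier):
    """Score peaks at foo_param == 0, so the search should converge near 0."""

    def score(self, X=None, y=None):
        return -abs(self.foo_param)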
def test_no_refit():
    # Test that grid search can be used for model selection only
    clf = MockDiscreteClassifier()
    grid_search = GPSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=False)
    grid_search.fit(X, y)
    assert_true(hasattr(grid_search, "best_params_"))
def gp_vs_random_search(test_name, n_tests, search_length, save_data=False):
    """Compare GP-based search with a simple random search.

    Choose test_name in {'iris', 'text'}.
    """
    n_iter_search = search_length

    if test_name == 'iris':
        digits = load_digits()
        X, y = digits.data, digits.target
        pipeline = RandomForestClassifier()

        # Specify parameters and distributions to sample from
        parameters = {"max_depth": ['int', [3, 3]],
                      "max_features": ['int', [1, 11]],
                      "min_samples_split": ['int', [1, 11]],
                      "min_samples_leaf": ['int', [1, 11]],
                      "bootstrap": ['cat', [True, False]],
                      "criterion": ['cat', ["gini", "entropy"]]}

    elif test_name == 'text':
        # Display progress logs on stdout
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(levelname)s %(message)s')

        # Load some categories from the training set
        categories = [
            'alt.atheism',
            'talk.religion.misc',
        ]
        # Uncomment the following to do the analysis on all the categories
        # categories = None

        print("Loading 20 newsgroups dataset for categories:")
        print(categories)
        data = fetch_20newsgroups(subset='train', categories=categories)
        print("%d documents" % len(data.filenames))
        print("%d categories" % len(data.target_names))

        X = data.data
        y = data.target

        # Define a pipeline combining a text feature extractor with a simple
        # classifier
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', SGDClassifier()),
        ])

        # Uncommenting more parameters will give better exploring power but
        # will increase processing time in a combinatorial way
        parameters = {
            'vect__max_df': ['float', [0.5, 1.]],
            # 'vect__max_features': (None, 5000, 10000, 50000),
            'vect__ngram_range': ['cat', [(1, 1), (1, 2)]],  # unigrams or bigrams
            # 'tfidf__use_idf': (True, False),
            # 'tfidf__norm': ('l1', 'l2'),
            'clf__alpha': ['float', [0.000001, 0.00001]],
            'clf__penalty': ['cat', ['l2', 'elasticnet']],
            # 'clf__n_iter': (10, 50, 80),
        }

    else:
        raise ValueError("Dataset not available for test: %r" % test_name)

    # GP UCB search
    all_gp_ucb_results = []
    print('GP_ucb search')
    for i in range(n_tests):
        ucb_search = GPSearchCV(pipeline, parameters,
                                acquisition_function='UCB',
                                n_iter=n_iter_search, n_init=20, verbose=False)
        _, scores = ucb_search.fit(X=X, y=y)
        max_scores = [scores[0]]
        print('Test', i, '-', len(scores), 'parameters tested')
        for j in range(1, len(scores)):
            max_scores.append(max(max_scores[j - 1], scores[j]))
        all_gp_ucb_results.append(extend_result(n_iter_search, max_scores))
    all_gp_ucb_results = np.asarray(all_gp_ucb_results)
    print(all_gp_ucb_results.shape)
    if save_data:
        np.savetxt('gp_ucb_scores.csv', all_gp_ucb_results, delimiter=',')

    # # GP EI search
    # all_gp_ei_results = []
    # print('GP_ei search')
    # for i in range(n_tests):
    #     ei_search = GPSearchCV(pipeline, parameters,
    #                            acquisition_function='EI',
    #                            n_iter=n_iter_search, n_init=20, verbose=False)
    #     _, scores = ei_search.fit(X=X, y=y)
    #     max_scores = [scores[0]]
    #     print('Test', i, '-', len(scores), 'parameters tested')
    #     for j in range(1, len(scores)):
    #         max_scores.append(max(max_scores[j - 1], scores[j]))
    #     all_gp_ei_results.append(extend_result(n_iter_search, max_scores))
    # all_gp_ei_results = np.asarray(all_gp_ei_results)
    # print(all_gp_ei_results.shape)
    # if save_data:
    #     np.savetxt('gp_ei_scores.csv', all_gp_ei_results, delimiter=',')

    # Randomized search
    print('Random search')
    all_random_results = []
    for i in range(n_tests):
        random_search = GPSearchCV(pipeline, parameters,
                                   n_iter=n_iter_search, n_init=n_iter_search,
                                   verbose=False)
        _, scores = random_search.fit(X=X, y=y)
        max_scores = [scores[0]]
        print('Test', i, '-', len(scores), 'parameters tested')
        for j in range(1, len(scores)):
            max_scores.append(max(max_scores[j - 1], scores[j]))
        all_random_results.append(extend_result(n_iter_search, max_scores))
    all_random_results = np.asarray(all_random_results)
    if save_data:
        np.savetxt('rand_scores.csv', all_random_results, delimiter=',')

    plt.figure()
    # plt.plot(range(n_iter_search), np.mean(all_gp_ei_results, axis=0),
    #          'r', label='GP-EI')
    plt.plot(range(n_iter_search), np.mean(all_gp_ucb_results, axis=0),
             'b', label='GP-UCB')
    plt.plot(range(n_iter_search), np.mean(all_random_results, axis=0),
             'g', label='Random')
    plt.legend(loc=4)
    plt.title('Test GP vs Random on ' + test_name + ' dataset - Average on '
              + str(n_tests) + ' trials')
    plt.xlabel('Iterations')
    plt.ylabel('Max CV performance')
    plt.show()
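# extend_result() is used above but not defined in this section. A minimal
# sketch of the behaviour the plotting code assumes -- padding each running-max
# score curve with its final value so every trial has exactly n_iter points,
# which lets the curves be stacked and averaged -- is given below; the
# repository's actual helper may differ.
def extend_result(n_iter, max_scores):
    """Pad a running-max score curve to length n_iter with its final value."""
    padded = list(max_scores)
    while len(padded) < n_iter:
        padded.append(padded[-1])
    return padded[:n_iter]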