def test_search(self):
    conn = sigopt.Connection()
    n_iter = 5
    folds = 3
    cv = SigOptSearchCV(
        estimator=GradientBoostingClassifier(),
        param_domains=GradientBoostingClassifier_PARAM_DOMAIN,
        client_token='client_token',
        n_iter=n_iter,
        cv=folds,
    )
    # no API traffic before fit() is called
    assert len(conn.experiments().create.mock_calls) == 0
    assert len(conn.experiments().fetch.mock_calls) == 0
    assert len(conn.experiments().suggestions.create.mock_calls) == 0
    assert len(conn.experiments().observations.create.mock_calls) == 0

    data = sklearn.datasets.load_iris()
    cv.fit(data['data'], data['target'])

    # fit() should create exactly one experiment whose definition
    # matches the expected name and parameter list
    assert len(conn.experiments().create.mock_calls) == 1
    create_definition = conn.experiments().create.call_args[1]
    assert create_definition['name'] == GradientBoostingClassifier_EXPERIMENT_DEF['name']
    assert len(create_definition['parameters']) == len(GradientBoostingClassifier_EXPERIMENT_DEF['parameters'])
    for p in GradientBoostingClassifier_EXPERIMENT_DEF['parameters']:
        assert p in create_definition['parameters']

    # one suggestion and one observation per optimization iteration,
    # and a single best-assignments fetch at the end
    assert len(conn.experiments().best_assignments().fetch.mock_calls) == 1
    assert len(conn.experiments().suggestions().create.mock_calls) == n_iter
    assert len(conn.experiments().observations().create.mock_calls) == n_iter
    assert cv.best_params_ == zero_corner(GradientBoostingClassifier_EXPERIMENT_DEF)
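# NOTE: zero_corner comes from the test helpers and is not shown here.
# Judging from its use above, it maps an experiment definition to the
# assignment at the low corner of each parameter's domain, which is what
# the mocked connection presumably reports as the best assignment. A
# hypothetical sketch (the 'type'/'bounds'/'categorical_values' keys
# follow the SigOpt experiment definition format):
def zero_corner(experiment_def):
    corner = {}
    for param in experiment_def['parameters']:
        if param['type'] == 'categorical':
            # the first category counts as the "zero" value
            corner[param['name']] = param['categorical_values'][0]['name']
        else:
            corner[param['name']] = param['bounds']['min']
    return corner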
def test_bad_param_range_not_iterable(self):
    # a bare scalar is not a valid parameter range
    with pytest.raises(Exception):
        clf = SigOptSearchCV(
            SVC(),
            {'max_iter': 15},
            client_token='client_token',
            n_iter=5,
        )
        clf._transform_param_domains(clf.param_domains)
def test_warn_param_range_list(self):
    # passing a numeric range as a list rather than a tuple
    # should raise a UserWarning
    with pytest.warns(UserWarning):
        clf = SigOptSearchCV(
            SVC(),
            {'max_iter': [5, 10]},
            client_token='client_token',
            n_iter=5,
        )
        clf._transform_param_domains(clf.param_domains)
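# For comparison, a minimal sketch of the tuple form of the same domain,
# which the warning test above implies is the expected way to spell a
# numeric range (an assumption drawn from these tests, not from the
# library docs):
clf = SigOptSearchCV(
    SVC(),
    {'max_iter': (5, 10)},  # tuple rather than list: no UserWarning expected
    client_token='client_token',
    n_iter=5,
)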
def test_bad_param_range2(self):
    # a three-element tuple is not a valid range, even when the other
    # entries in the domain are well formed
    with pytest.raises(Exception):
        clf = SigOptSearchCV(
            SVC(),
            {
                'bad_param_range': (1, 2, 3),
                'hidden_layer_sizes': {'5': (5,), '5,4,3': (5, 4, 3)},
            },
            client_token='client_token',
            n_iter=5,
        )
        clf._transform_param_domains(clf.param_domains)
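# For contrast, a domain sketch that _transform_param_domains should
# accept, based on the conventions these tests exercise (tuples for
# numeric ranges, lists for categorical values, dicts for named
# tuple-valued categories); treat this as an assumption rather than
# documented behavior:
good_param_domains = {
    'C': (0.5, 100.0),                                      # numeric range
    'kernel': ['linear', 'rbf'],                            # categorical
    'hidden_layer_sizes': {'5': (5,), '5,4,3': (5, 4, 3)},  # named categories
}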
# Define domains for the Random Forest parameters
random_forest_parameters = dict(
    max_features=[1, 128],
    n_estimators=[1, 100],
    min_samples_leaf=[1, 10],
)

# define sklearn estimator
random_forest = RandomForestClassifier()

# define SigOptCV search strategy
clf = SigOptSearchCV(
    random_forest,
    random_forest_parameters,
    cv=5,
    client_token=client_token,
    n_iter=60,
)

# time the CV search
time1 = time()
clf.fit(X_train, y_train)
time2 = time()
print('Fit time: %.1f s' % (time2 - time1))

# Prediction on the training set
y_pred_train = clf.predict(X_train)

# Compute the score
score = compute_pred_score(y_train, y_pred_train)
print('Score on the training set: %s' % score)
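# NOTE: compute_pred_score is not defined in this snippet. A minimal
# stand-in using plain accuracy (an assumption; the original code may
# have used a task-specific metric):
from sklearn.metrics import accuracy_score

def compute_pred_score(y_true, y_pred):
    return accuracy_score(y_true, y_pred)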
def main():
    # convert arg structure to a regular dict
    args = vars(parse_args())
    X_path = args['X_file']
    y_path = args['y_file']
    client_token = args['client_token']
    estimator_name = args['estimator']
    output_path = args['output_file']
    opt_timeout = args['opt_timeout']

    with open(X_path, 'rb') as infile:
        X = pickle.load(infile)
    with open(y_path, 'rb') as infile:
        y = pickle.load(infile)

    # define param domains for all estimators
    rf_params = {
        'max_features': ['sqrt', 'log2'],
        'max_depth': [3, 20],
        'criterion': ['gini', 'entropy'],
        'n_estimators': [10, 100],
    }
    svm_params = {
        'degree': [2, 4],
        '__log__C': [math.log(0.00001), math.log(1.0)],
        'gamma': [0.0, 1.0],
    }
    knn_params = {
        'n_neighbors': [2, 10],
        'algorithm': ['ball_tree', 'kd_tree'],
        'leaf_size': [10, 50],
        'p': [1, 3],
    }
    sgd_params = {
        '__log__alpha': [math.log(0.00001), math.log(10.0)],
        'l1_ratio': [0.0, 1.0],
        'loss': ['log', 'modified_huber'],
    }
    xgb_params = {
        '__log__learning_rate': [math.log(0.0001), math.log(0.5)],
        'n_estimators': [10, 100],
        'max_depth': [3, 10],
        'min_child_weight': [6, 12],
        'gamma': [0, 0.5],
        'subsample': [0.6, 1.0],
        'colsample_bytree': [0.6, 1.0],
    }
    lda_params = {'__log__tol': [math.log(0.00001), math.log(0.5)]}
    qda_params = {'__log__tol': [math.log(0.00001), math.log(0.5)]}

    # mapping from classifier name to estimator object and domain;
    # dict stores: (estimator, hyperparams, sparse_support)
    estname_2_args = {
        'GaussianNBClassifier': (GaussianNB(), None, False),
        'SVMClassifier': (SVC(probability=True), svm_params, True),
        'RandomForestClassifier': (RandomForestClassifier(n_jobs=2), rf_params, True),
        'SGDClassifier': (SGDClassifier(penalty='elasticnet'), sgd_params, True),
        'XGBClassifier': (XGBClassifier(nthread=2), xgb_params, True),
        'KNNClassifier': (KNeighborsClassifier(n_jobs=2), knn_params, False),
        'LDAClassifier': (LinearDiscriminantAnalysis(), lda_params, False),
        'QDAClassifier': (QuadraticDiscriminantAnalysis(), qda_params, False),
    }
    est, est_params, est_handle_sparse = estname_2_args[estimator_name]

    # check that the estimator can handle sparse matrices
    if scipy.sparse.issparse(X) and not est_handle_sparse:
        raise Exception('{} does not support sparse matrices.'.format(estimator_name))
    elif est_params is not None:
        # tune the estimator if it has params to search over
        n_iter = max(10 * len(est_params), 20)
        clf = SigOptSearchCV(
            est,
            est_params,
            cv=3,
            opt_timeout=opt_timeout,
            client_token=client_token,
            n_jobs=3,
            n_iter=n_iter,
        )
    else:
        clf = est
    clf.fit(X, y)
    if hasattr(clf, 'best_estimator_'):
        clf = clf.best_estimator_

    # store classifier in the specified output file
    with open(output_path, 'wb') as outfile:
        pickle.dump(clf, outfile, pickle.HIGHEST_PROTOCOL)
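# NOTE: parse_args is not defined in this snippet. A hypothetical
# argparse-based definition consistent with the keys read in main()
# (the flag names are assumptions):
import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--X-file', dest='X_file', required=True)
    parser.add_argument('--y-file', dest='y_file', required=True)
    parser.add_argument('--client-token', dest='client_token', required=True)
    parser.add_argument('--estimator', dest='estimator', required=True)
    parser.add_argument('--output-file', dest='output_file', required=True)
    parser.add_argument('--opt-timeout', dest='opt_timeout', type=float, default=None)
    return parser.parse_args()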
def test_no_token(self):
    # omitting client_token should raise a ValueError
    with pytest.raises(ValueError):
        SigOptSearchCV(
            estimator=GradientBoostingClassifier(),
            param_domains=GradientBoostingClassifier_PARAM_DOMAIN,
        )
def test_create(self):
    SigOptSearchCV(
        estimator=GradientBoostingClassifier(),
        param_domains=GradientBoostingClassifier_PARAM_DOMAIN,
        client_token='client_token',
    )
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Define domains for the Random Forest parameters
random_forest_parameters = dict(
    max_features=(1, X.shape[1]),  # upper bound: the number of features
    n_estimators=(1, 100),
    min_samples_leaf=(1, 10),
)

# define sklearn estimator
random_forest = RandomForestClassifier()

# define SigOptCV search strategy
clf = SigOptSearchCV(
    random_forest,
    random_forest_parameters,
    cv=5,
    client_token=client_token,
    n_iter=60,
)

# perform CV search for best parameters and fit the estimator
# on all data using the best found configuration
clf.fit(X, y)

# clf.predict() now uses the best found estimator
# clf.best_score_ contains the CV score for the best found estimator
# clf.best_params_ contains the best found param configuration
def test_non_string_categorical(self):
    # categorical values that are not strings should work end to end
    data = sklearn.datasets.load_iris()
    clf = SigOptSearchCV(
        SVC(gamma='auto'),
        SVC_PARAM_DOMAIN,
        client_token='client_token',
        n_iter=5,
        cv=3,
    )
    clf.fit(data['data'], data['target'])