def test_searchcv_callback():
    # Test whether callback is used in BayesSearchCV and
    # whether it can be used to interrupt the search loop
    X, y = load_iris(True)
    opt = BayesSearchCV(
        DecisionTreeClassifier(),
        {
            'max_depth': [3],  # additional test for single dimension
            'min_samples_split': Real(0.1, 0.9),
        },
        n_iter=5
    )
    total_iterations = [0]

    def callback(opt_result):
        # this simply counts iterations
        total_iterations[0] += 1

        # break the optimization loop at some point
        if total_iterations[0] > 2:
            return True  # True == stop optimization

        return False

    opt.fit(X, y, callback=callback)

    assert total_iterations[0] == 3

    # test whether final model was fit
    opt.score(X, y)
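For comparison, skopt also ships ready-made stopping callbacks (DeltaXStopper and DeltaYStopper appear later in this collection). A minimal sketch of interrupting the same kind of search with built-in callbacks, assuming the same iris data and imports as above:

# Sketch: stopping the search with skopt's built-in callbacks instead of a
# hand-written counter (assumes the same data and estimator as the test above).
from skopt.callbacks import DeadlineStopper, DeltaYStopper

opt_builtin = BayesSearchCV(
    DecisionTreeClassifier(),
    {'max_depth': [3], 'min_samples_split': Real(0.1, 0.9)},
    n_iter=5)

# stop after 60 seconds of wall-clock time, or once the best objective stops
# improving by more than 0.01
opt_builtin.fit(X, y, callback=[DeadlineStopper(60), DeltaYStopper(0.01)])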
def trainB(self, X_list, y_list, n_points=1, space=spaceB, cv=5):
    """
    BayesSearchCV method

    :param X_list: list of training sets
    :param y_list: list of targets
    :param space: parameter space
    :return: models and metrics
    """
    n_calls = self.n_calls
    scores = []
    val_scores = []
    best_models = []
    for j in range(len(X_list)):
        classifier = RandomForestClassifier(n_jobs=-1)
        y = y_list.copy()
        X = X_list.copy()
        y_test = y.pop(j)
        X_test = X.pop(j)
        y_train = np.concatenate(y, axis=0)
        X_train = np.concatenate(X, axis=0)
        X_train = Features().div_cols(X_train).values
        X_test = Features().div_cols(X_test).values
        start = time()
        opt = BayesSearchCV(classifier, search_spaces=space, scoring=self.scorer,
                            cv=cv, n_points=n_points, n_iter=n_calls, n_jobs=-1)
        opt.fit(X_train, y_train)
        model = opt.best_estimator_
        print('Season', 2019 - j)
        print("Bayes CV search took %.2f seconds for %d candidate"
              " parameter settings." % ((time() - start), n_calls))
        print("val. score:", opt.best_score_)
        print("test score:", opt.score(X_test, y_test))
        # print(model)
        print("")
        best_models.append(model)
        val_scores.append(opt.best_score_)
        scores.append(opt.score(X_test, y_test))
    return scores, val_scores, best_models
def run_optimization_test():
    N_iter = 100

    # log-uniform: understand as search over p = exp(x) by varying x
    opt = BayesSearchCV(
        TemplateClassifier(),
        {
            "deltaEta": Real(0.0, 4.0, prior="uniform"),
            "deltaPhi": Real(0.0, 4.0, prior="uniform"),
            "maxNRegions": Integer(2, 100),
            "maxNVertices": Integer(1, 5),
            "nSigmaZBeamSpot": Real(0.0, 30.0, prior="uniform"),
            "nSigmaZVertex": Real(-1.0, 1.0, prior="uniform"),
            "originRadius": Real(0.0, 1.0, prior="uniform"),
            "ptMin": Real(0.0, 2.0, prior="uniform"),
            "zErrorBeamSpot": Real(0.0, 1.0, prior="uniform"),
            "zErrorVetex": Real(0.0, 1.0, prior="uniform"),
        },
        n_iter=N_iter,
        cv=[(slice(None), slice(None))],  # single split: train == test == full data
        verbose=1,
        # scoring="accuracy"
    )

    X_dummy = np.zeros((100, 1))
    y_dummy = np.zeros(100)
    opt.fit(X_dummy, y_dummy)

    print("After {} iterations:".format(N_iter))
    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(X_dummy, y_dummy))
    print("Final params:")
    # `targets` is assumed to be defined elsewhere in the module
    params = opt.best_estimator_.get_params()
    for i, (param, val) in enumerate(params.items()):
        print("{0}:\t{1:2.2f} vs {2:2.2f}".format(param, val, targets[i]))
def _fit_svc(n_jobs=1, n_points=1, cv=None):
    """
    Utility function to fit a larger classification task with SVC
    """
    X, y = make_classification(n_samples=1000, n_features=20, n_redundant=0,
                               n_informative=18, random_state=1,
                               n_clusters_per_class=1)

    opt = BayesSearchCV(
        SVC(),
        {
            'C': Real(1e-3, 1e+3, prior='log-uniform'),
            'gamma': Real(1e-3, 1e+1, prior='log-uniform'),
            'degree': Integer(1, 3),
        },
        n_jobs=n_jobs,
        n_iter=11,
        n_points=n_points,
        cv=cv,
        random_state=42,
    )

    opt.fit(X, y)
    assert opt.score(X, y) > 0.9
def bayesian_optimization(model, space, scorer, x_train, y_train, x_test, y_test,
                          n_iter=256, cv=4, n_jobs=None):
    global counter
    global opt

    if n_jobs is None:
        n_jobs = cv

    opt = BayesSearchCV(model, space, scoring=scorer, n_iter=n_iter, cv=cv,
                        verbose=10, n_jobs=n_jobs)

    counter = 0
    opt.fit(x_train, y_train, callback=on_step)

    print(opt.best_params_)
    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(x_test, y_test))
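The `on_step` callback referenced above is defined elsewhere in that project and is not shown here. A minimal sketch of what such a callback might look like, assuming it only reports progress through the module-level `counter`:

def on_step(optim_result):
    """Hypothetical progress callback; not part of the original snippet."""
    global counter
    counter += 1
    # optim_result.fun is the best objective value found so far (skopt minimizes,
    # so for score-based objectives this is the negated CV score)
    print("step {}: best objective so far = {}".format(counter, optim_result.fun))
    return False  # returning True would stop the search early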
def testSVMParams(self, pipe):
    print("{} - CALCULATING BEST PARAMETERS... \n".format(datetime.datetime.now()))
    X_train, X_test, y_train, y_test = train_test_split(
        self.df, self.classes.values, train_size=0.75, test_size=0.25, random_state=0)

    listaC = [0.001, 0.01, 0.1, 1, 10, 100]
    listaGamma = [0.001, 0.01, 0.1, 1, 10, 100]
    listaKernels = ['rbf', 'linear', 'poly', 'sigmoid']

    if self.problem == 'classification':
        paramsGrid = dict(clf__C=listaC, clf__gamma=listaGamma, clf__kernel=listaKernels)
        grid = BayesSearchCV(pipe, paramsGrid, scoring='accuracy', n_iter=9)
    elif self.problem == 'regression':
        if isinstance(self.classes, pd.DataFrame):
            paramsGrid = dict(reg__estimator__C=listaC,
                              reg__estimator__gamma=listaGamma,
                              reg__estimator__kernel=listaKernels)
        else:
            paramsGrid = dict(reg__C=listaC, reg__gamma=listaGamma, reg__kernel=listaKernels)
        grid = BayesSearchCV(pipe, paramsGrid, scoring='r2', n_iter=9)

    # print("DF: \n {}".format(self.df))
    # print("CLASSES: \n {}".format(self.classes))
    print("{} - FITTING DATA... \n".format(datetime.datetime.now()))
    grid.fit(X_train, y_train)
    print("{} - BEST RESULTS - {}".format(datetime.datetime.now(), grid.best_score_))
    print("{} - TEST RESULTS: {}".format(datetime.datetime.now(), grid.score(X_test, y_test)))
    return grid.best_params_
def get_bayes_scikit_score_cv(X_train, y_train, X_test, y_test, X_val=None, y_val=None,
                              max_evals=25, folds=5, original=None):
    space = get_baesian_space(dictem=True)

    opt_cat = BayesSearchCV(CatBoostClassifier(logging_level='Silent'), space['CAT'],
                            n_iter=max_evals, random_state=0)
    opt_xgb = BayesSearchCV(XGBClassifier(), space['XGB'], n_iter=max_evals, random_state=0)
    opt_lgbm = BayesSearchCV(LGBMClassifier(), space['LGBM'], n_iter=max_evals, random_state=0)

    _ = opt_cat.fit(X_train, y_train, callback=[DeltaXStopper(0.01), DeltaYStopper(0.01)])
    _ = opt_xgb.fit(X_train, y_train, callback=[DeltaXStopper(0.01), DeltaYStopper(0.01)])
    _ = opt_lgbm.fit(X_train, y_train, callback=[DeltaXStopper(0.01), DeltaYStopper(0.01)])

    scores = [opt_cat.score(X_test, y_test),
              opt_xgb.score(X_test, y_test),
              opt_lgbm.score(X_test, y_test)]
    train_scores = [opt_cat.best_score_, opt_xgb.best_score_, opt_lgbm.best_score_]

    score = max(scores)
    cross_score = max(train_scores)

    neptune.log_metric(f'skopt-{max_evals}-iterations-{folds}-folds', score)
    neptune.log_metric('skopt train holdout score', cross_score)

    return score
def test_searchcv_runs(surrogate, n_jobs):
    """
    Test whether the cross validation search wrapper around sklearn
    models runs properly with available surrogates and with single
    or multiple workers.

    Parameters
    ----------

    * `surrogate` [str or None]:
        A class of the scikit-optimize surrogate used. None means
        to use default surrogate.

    * `n_jobs` [int]:
        Number of parallel processes to use for computations.

    """
    X, y = load_iris(True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, random_state=0
    )

    # None search space is only supported when only `step` function is used
    assert_raises(ValueError, BayesSearchCV(SVC(), None).fit, (X, y))

    # check if invalid dimensions are raising errors
    with pytest.raises(ValueError):
        BayesSearchCV(SVC(), {'C': '1 ... 100.0'})

    with pytest.raises(TypeError):
        BayesSearchCV(SVC(), ['C', (1.0, 1)])

    # create an instance of a surrogate if it is not a string
    if surrogate is not None:
        optimizer_kwargs = {'base_estimator': surrogate}
    else:
        optimizer_kwargs = None

    opt = BayesSearchCV(
        SVC(),
        {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
            'degree': Integer(1, 8),
            'kernel': Categorical(['linear', 'poly', 'rbf']),
        },
        n_jobs=n_jobs, n_iter=11,
        optimizer_kwargs=optimizer_kwargs
    )

    opt.fit(X_train, y_train)

    # this normally does not hold only if something is wrong
    # with the optimization procedure as such
    assert_greater(opt.score(X_test, y_test), 0.9)
def main():
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')
    X_train, y_train = df_train.iloc[:, 2:], df_train.iloc[:, 0]
    X_test, y_test = df_test.iloc[:, 2:], df_test.iloc[:, 0]

    opt = BayesSearchCV(
        estimator=GradientBoostingClassifier(),
        # ref: https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/gradient_boosting.py
        search_spaces={
            'learning_rate': Real(0.01, 1, 'log-uniform'),
            'n_estimators': Integer(50, 2000),
            'subsample': Real(0.01, 1.0, 'uniform'),
            'max_depth': Integer(1, 10),
            'max_features': Real(0.1, 1.0, 'uniform'),
            'min_samples_split': Integer(2, 20),
            'min_samples_leaf': Integer(1, 20),
            'criterion': ['friedman_mse', 'mse', 'mae']
        },
        cv=StratifiedKFold(n_splits=10, shuffle=True),
        n_jobs=3,
        n_iter=100,
        verbose=0,
        refit=True,
        random_state=42)

    def status_print(_):
        """Status callback during Bayesian hyperparameter search"""
        # Get all the models tested so far in DataFrame format
        all_models = pd.DataFrame(opt.cv_results_)

        best_param_copy = copy.deepcopy(opt.best_params_)
        for k, v in opt.best_params_.items():
            best_param_copy[k] = v if isinstance(v, str) or isinstance(v, float) else v.item()

        param_list = []
        for each in json.dumps(best_param_copy)[1:-1].split(', '):
            param_list.append('='.join(each[1:].split('": ')))
        if hasattr(opt.estimator, 'verbose'):
            param_list.append('verbose=True')
        param = opt.estimator.__class__.__name__ + '(' + ', '.join(param_list) + ')'

        # Report the number of models tried so far, the best score and the best parameters
        print('Model #{}\nBest roc_auc: {}\nBest params: {}\n'.format(
            len(all_models), np.round(opt.best_score_, 4), param))

    opt.fit(X_train, y_train, callback=status_print)

    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(X_test, y_test))
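The status_print callback above rebuilds a constructor-style string by JSON round-tripping opt.best_params_. A simpler alternative sketch (an assumption, not taken from the original code) that avoids the string surgery:

def status_print_simple(_):
    """Sketch of an alternative status callback; not from the original project."""
    # number of parameter settings evaluated so far
    n_tried = len(opt.cv_results_['params'])
    best = ', '.join('{}={!r}'.format(k, v) for k, v in sorted(opt.best_params_.items()))
    print('Model #{}\nBest score: {}\nBest params: {}({})\n'.format(
        n_tried, np.round(opt.best_score_, 4), opt.estimator.__class__.__name__, best))

Passing callback=status_print_simple to opt.fit works the same way as status_print above.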
def main():
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')
    X_train, y_train = df_train.iloc[:, 2:].values, df_train.iloc[:, 0].values
    X_test, y_test = df_test.iloc[:, 2:].values, df_test.iloc[:, 0].values

    # log-uniform: understand as search over p = exp(x) by varying x
    opt = BayesSearchCV(
        estimator=SVC(),
        # ref: https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/libsvm_svc.py
        search_spaces={
            'C': Real(1e-6, 1e+6, 'log-uniform'),
            'gamma': Real(3.0517578125e-05, 8, 'log-uniform'),
            'kernel': ['rbf', 'poly', 'sigmoid'],  # categorical parameter
            'decision_function_shape': ['ovo', 'ovr'],
            'degree': Integer(2, 5),
            'coef0': Real(-1, 1, 'uniform'),
            'tol': Real(1e-5, 1e-1, 'log-uniform')
        },
        cv=StratifiedKFold(n_splits=10, shuffle=True),
        n_jobs=3,
        n_iter=100,
        verbose=0,
        refit=True)

    def status_print(_):
        """Status callback during Bayesian hyperparameter search"""
        # Get all the models tested so far in DataFrame format
        all_models = pd.DataFrame(opt.cv_results_)

        best_param_copy = copy.deepcopy(opt.best_params_)
        for k, v in opt.best_params_.items():
            best_param_copy[k] = v if isinstance(v, str) or isinstance(v, float) else v.item()

        param_list = []
        for each in json.dumps(best_param_copy)[1:-1].split(', '):
            param_list.append('='.join(each[1:].split('": ')))
        if hasattr(opt.estimator, 'verbose'):
            param_list.append('verbose=True')
        param = opt.estimator.__class__.__name__ + '(' + ', '.join(param_list) + ')'

        # Report the number of models tried so far, the best score and the best parameters
        print('Model #{}\nBest roc_auc: {}\nBest params: {}\n'.format(
            len(all_models), np.round(opt.best_score_, 4), param))

    opt.fit(X_train, y_train, callback=status_print)

    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(X_test, y_test))
def test_searchcv_runs(surrogate, n_jobs, n_points, cv=None):
    """
    Test whether the cross validation search wrapper around sklearn
    models runs properly with available surrogates and with single
    or multiple workers and different number of parameter settings
    to ask from the optimizer in parallel.

    Parameters
    ----------

    * `surrogate` [str or None]:
        A class of the scikit-optimize surrogate used. None means
        to use default surrogate.

    * `n_jobs` [int]:
        Number of parallel processes to use for computations.

    """
    X, y = load_iris(True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75,
                                                        random_state=0)

    # create an instance of a surrogate if it is not a string
    if surrogate is not None:
        optimizer_kwargs = {'base_estimator': surrogate}
    else:
        optimizer_kwargs = None

    opt = BayesSearchCV(
        SVC(),
        {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
            'degree': Integer(1, 8),
            'kernel': Categorical(['linear', 'poly', 'rbf']),
        },
        n_jobs=n_jobs, n_iter=11,
        n_points=n_points, cv=cv,
        optimizer_kwargs=optimizer_kwargs
    )

    opt.fit(X_train, y_train)

    # this normally does not hold only if something is wrong
    # with the optimization procedure as such
    assert opt.score(X_test, y_test) > 0.9
def hyperparam_search(model_config, train, test):
    """Perform hyperparameter search using Bayesian optimization on a given
    model and dataset.

    Args:
        model_config (dict): the model and the parameter ranges to search in.
            Format: {
                "name": str,
                "class": type,
                "model": sklearn.base.BaseEstimator,
                "params": dict
            }
        train (pandas.DataFrame): training data
        test (pandas.DataFrame): test data
    """
    X_train = train.drop("label", axis=1)
    y_train = train.label
    X_test = test.drop("label", axis=1)
    y_test = test.label

    opt = BayesSearchCV(
        model_config["model"],
        model_config["params"],
        n_jobs=4,
        cv=5,
        random_state=RANDOM_SEED,
    )
    opt.fit(X_train, y_train)
    acc = opt.score(X_test, y_test)

    print(f"{model_config['name']} results:")
    print(f"Best validation accuracy: {opt.best_score_}")
    print(f"Test set accuracy: {acc}")
    print("Best parameters:")
    for param, value in opt.best_params_.items():
        print(f"- {param}: {value}")

    return {
        "name": model_config["name"],
        "class": model_config["class"],
        "model": opt.best_estimator_,
        "params": opt.best_params_,
        "score": acc,
    }
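A hedged usage sketch of hyperparam_search; the estimator, search ranges, and DataFrame names below are illustrative assumptions rather than values from the original project:

# Hypothetical model_config for hyperparam_search; all values are illustrative.
from sklearn.ensemble import RandomForestClassifier
from skopt.space import Integer, Real

rf_config = {
    "name": "random_forest",
    "class": RandomForestClassifier,
    "model": RandomForestClassifier(),
    "params": {
        "n_estimators": Integer(50, 500),
        "max_depth": Integer(2, 20),
        "max_features": Real(0.1, 1.0, prior="uniform"),
    },
}

# train_df / test_df are assumed DataFrames with a "label" column plus features
result = hyperparam_search(rf_config, train_df, test_df)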
def test_searchcv_refit():
    """
    Test whether results of BayesSearchCV can be reproduced with a fixed
    random state.
    """
    X, y = load_iris(True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, random_state=0
    )

    random_state = 42

    opt = BayesSearchCV(
        SVC(random_state=random_state),
        {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
            'degree': Integer(1, 8),
            'kernel': Categorical(['linear', 'poly', 'rbf']),
        },
        n_iter=11, random_state=random_state
    )

    opt2 = BayesSearchCV(
        SVC(random_state=random_state),
        {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
            'degree': Integer(1, 8),
            'kernel': Categorical(['linear', 'poly', 'rbf']),
        },
        n_iter=11, random_state=random_state,
        refit=True
    )

    opt.fit(X_train, y_train)
    opt2.best_estimator_ = opt.best_estimator_
    opt2.fit(X_train, y_train)

    # this normally does not hold only if something is wrong
    # with the optimization procedure as such
    assert opt2.score(X_test, y_test) > 0.9
def set_model(self, model_type, hyper_parameters=None):
    """Create a surrogate model"""
    base_model = self.models[model_type]['model']
    hyper_model = self.models[model_type]['model']

    if hyper_parameters:
        hyper_model = BayesSearchCV(base_model, hyper_parameters, refit=True,
                                    n_jobs=16, cv=self.cv,
                                    n_iter=self.number_iters,
                                    random_state=self.random)
        fit = hyper_model.fit(self.scaled_var_train, self.scaled_obj_train)
        r2_score = hyper_model.score(self.scaled_var_test, self.scaled_obj_test)
        mse_score = self._get_mse(hyper_model)
        self.models[model_type].update({
            'model': hyper_model,
            'fit': fit,
            'score': r2_score,
            'mse_score': mse_score,
            'hyper_parameters': hyper_model.best_params_,
            'cv_results': hyper_model.cv_results_
        })
    else:
        fit = base_model.fit(self.scaled_var_train, self.scaled_obj_train)
        r2_score = base_model.score(self.scaled_var_test, self.scaled_obj_test)
        mse_score = self._get_mse(base_model)
        self.models[model_type].update({
            'fit': fit,
            'score': r2_score,
            'mse_score': mse_score,
            'hyper_parameters': None,
            'cv_results': None
        })
def random_forest(attributes, prediction):
    x_train, x_test, y_train, y_test = train_test_split(attributes, prediction,
                                                        random_state=0)
    rf = RandomForestRegressor(n_estimators=200, random_state=42)
    opt = BayesSearchCV(rf, {
        'n_estimators': Integer(200, 2000),
        'max_features': Categorical(['auto', 'sqrt']),
        'max_depth': Integer(10, 110),
        'min_samples_split': Integer(2, 10),
        'min_samples_leaf': Integer(1, 4),
        'bootstrap': Categorical([True, False]),
    }, n_iter=32, cv=5, n_jobs=-1, verbose=2)
    opt.fit(x_train, y_train)

    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(x_test, y_test))

    params_from_bayes = opt.best_params_
    bayes_rf = RandomForestRegressor(**params_from_bayes)
    # Note: accuracy/precision/recall/roc_auc/f1 are classification metrics; they
    # require discrete class predictions, which a RandomForestRegressor does not produce.
    scoring = ['accuracy', 'precision', 'recall', 'roc_auc', 'f1']
    bayes_scores = cross_validate(bayes_rf, attributes, prediction, scoring=scoring, cv=10)
    print(simplify_scores(bayes_scores))
# log-uniform: understand as search over p = exp(x) by varying x
opt = BayesSearchCV(SVC(), search_space, n_iter=32, cv=3, n_jobs=-1)


@timing
def dummy(func, X_train, y_train):
    func(X_train, y_train)
    return None


dummy(opt.fit, X_train, y_train)

print("val. score: {}".format(opt.best_score_))
print("test score: {}".format(opt.score(X_test, y_test)))

param_grid = {
    'C': [1e-6, 1e-3, 1, 1e3, 1e+6],
    'gamma': [1e-6, 1e-4, 1e-2, 1.0, 1e+1],
    'degree': [1, 4, 8],  # integer valued parameter
    'kernel': ['linear', 'poly', 'rbf'],  # categorical parameter
}
grid_opt = GridSearchCV(SVC(), param_grid, cv=3, n_jobs=-1,
                        return_train_score=True)
dummy(grid_opt.fit, X_train, y_train)
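The `timing` decorator used above is not shown in this excerpt. A minimal sketch of what it might look like, assuming it simply reports wall-clock time for the wrapped call:

# Hypothetical sketch of the `timing` decorator assumed above; not part of the
# original snippet.
import functools
import time


def timing(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        print("{} took {:.2f} s".format(func.__name__, time.time() - start))
        return result
    return wrapper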
def main():
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')
    X_train, y_train = df_train.iloc[:, 2:].values, df_train.iloc[:, 0].values
    X_test, y_test = df_test.iloc[:, 2:].values, df_test.iloc[:, 0].values

    # log-uniform: understand as search over p = exp(x) by varying x
    opt = BayesSearchCV(
        estimator=xgb.XGBClassifier(),
        # ref: https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/xgradient_boosting.py
        search_spaces={
            'learning_rate': Real(0.001, 1.0, 'log-uniform'),
            'min_child_weight': Integer(0, 20),
            'max_depth': Integer(0, 50),
            'max_delta_step': Integer(0, 20),
            'subsample': Real(0.01, 1.0, 'uniform'),
            'colsample_bytree': Real(0.01, 1.0, 'uniform'),
            'colsample_bylevel': Real(0.01, 1.0, 'uniform'),
            'reg_lambda': Real(1e-10, 1e-1, 'log-uniform'),
            'reg_alpha': Real(1e-10, 1e-1, 'log-uniform'),
            'gamma': Real(1e-9, 0.5, 'log-uniform'),
            'n_estimators': Integer(50, 512),
            'scale_pos_weight': Real(1e-6, 500, 'log-uniform'),
            'booster': ["gbtree", "dart"],
            'sample_type': ['uniform', 'weighted'],
            'normalize_type': ['tree', 'forest'],
            'rate_drop': Real(1e-10, 1 - (1e-10), 'uniform')
        },
        cv=StratifiedKFold(n_splits=10, shuffle=True),
        n_jobs=3,
        n_iter=100,
        verbose=0,
        refit=True,
        random_state=42)

    def status_print(_):
        """Status callback during Bayesian hyperparameter search"""
        # Get all the models tested so far in DataFrame format
        all_models = pd.DataFrame(opt.cv_results_)

        best_param_copy = copy.deepcopy(opt.best_params_)
        for k, v in opt.best_params_.items():
            best_param_copy[k] = v if isinstance(v, str) or isinstance(v, float) else v.item()

        param_list = []
        for each in json.dumps(best_param_copy)[1:-1].split(', '):
            param_list.append('='.join(each[1:].split('": ')))
        if hasattr(opt.estimator, 'verbose'):
            param_list.append('verbose=True')
        param = opt.estimator.__class__.__name__ + '(' + ', '.join(param_list) + ')'

        # Report the number of models tried so far, the best score and the best parameters
        print('Model #{}\nBest roc_auc: {}\nBest params: {}\n'.format(
            len(all_models), np.round(opt.best_score_, 4), param))

    opt.fit(X_train, y_train, callback=status_print)

    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(X_test, y_test))
def NestedBayesCV(tups, y_train, cv_outer=StratifiedKFold(10, random_state=42),
                  cv_inner=StratifiedKFold(10, random_state=42), nested=True,
                  n_iter=60, random_state=42, scoring='accuracy'):
    """
    Given a list of tuples of algorithms/different datasets/names/parameters,
    this performs Bayesian hyperparameter search, optionally inside nested CV,
    to choose the best parameters for each combination of model and dataset.

    params:
    - tups : a list of tuples containing algorithm/datasets/names/parameters
    - y_train : train labels
    - cv_outer : type of CV to be used for the outer validation
    - cv_inner : type of CV to be used for the inner validation, i.e.,
      hyperparameter optimization
    - nested : whether to perform nested CV
    - n_iter : number of iterations for the Bayesian search
    - random_state : seed for the search
    - scoring : scoring metric to be used
    """
    results = []
    if nested == True:
        for i in range(0, len(tups)):
            for j in range(0, len(tups[i][1])):
                for train, test in cv_outer.split(tups[i][1][j], y_train):
                    tX_train = tups[i][1][j][train]
                    ty_train = y_train[train]
                    tX_test = tups[i][1][j][test]
                    ty_test = y_train[test]
                    opt = BayesSearchCV(tups[i][0], tups[i][3], cv=cv_inner,
                                        random_state=random_state, scoring=scoring,
                                        n_jobs=-1, n_iter=n_iter)
                    opt.fit(tX_train, ty_train)
                    results.append({'Combo': tups[i][2][j],
                                    'Best_Params': opt.best_params_,
                                    'Outer_CV': opt.score(tX_test, ty_test),
                                    'Inner_CV': opt.best_score_})
                print('Finished with' + ' ' + tups[i][2][j])
        results = pd.DataFrame(results)
        results['Best_Params'] = results['Best_Params'].apply(str)
        if scoring == 'neg_mean_squared_error':
            results['Inner_CV'] = np.sqrt(-results['Inner_CV'])
            results['Outer_CV'] = np.sqrt(-results['Outer_CV'])
        if scoring in ['neg_log_loss', 'neg_mean_absolute_error',
                       'neg_median_absolute_error']:
            results['Inner_CV'] = -results['Inner_CV']
            results['Outer_CV'] = -results['Outer_CV']
        grouped = results.groupby(['Combo', 'Best_Params'])
        to_aggregate = {'Inner_CV': ['mean', 'sem'],
                        'Outer_CV': ['mean', 'sem', 'count']}
        results = grouped.agg(to_aggregate)
    else:
        for i in range(0, len(tups)):
            for j in range(0, len(tups[i][1])):
                opt = BayesSearchCV(tups[i][0], tups[i][3], cv=cv_inner,
                                    random_state=random_state, scoring=scoring,
                                    n_jobs=-1, n_iter=n_iter)
                opt.fit(tups[i][1][j], y_train)
                results.append({'Combo': tups[i][2][j],
                                'Best_Params': opt.best_params_,
                                'Score_CV': opt.best_score_})
                print('Finished with' + ' ' + tups[i][2][j])
        results = pd.DataFrame(results)
        results['Best_Params'] = results['Best_Params'].apply(str)
        if scoring == 'neg_mean_squared_error':
            results['Score_CV'] = np.sqrt(-results['Score_CV'])
        if scoring in ['neg_log_loss', 'neg_mean_absolute_error',
                       'neg_median_absolute_error']:
            results['Score_CV'] = -results['Score_CV']
        grouped = results.groupby(['Combo', 'Best_Params'])
        to_aggregate = {'Score_CV': ['mean', 'sem', 'count']}
        results = grouped.agg(to_aggregate)
    return results
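A hedged usage sketch of the `tups` structure NestedBayesCV expects; the estimator, datasets, and search space below are illustrative assumptions, not values from the original code:

# Hypothetical example of the (algorithm, datasets, names, parameter space) tuples
# expected by NestedBayesCV; all values are illustrative.
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from skopt.space import Real

rng = np.random.RandomState(0)
X_raw = rng.rand(200, 10)
X_scaled = StandardScaler().fit_transform(X_raw)
y = rng.randint(0, 2, size=200)

svc_space = {'C': Real(1e-3, 1e+3, prior='log-uniform'),
             'gamma': Real(1e-4, 1e+1, prior='log-uniform')}

# one tuple: (estimator, list of feature matrices, matching list of names, search space)
tups = [(SVC(), [X_raw, X_scaled], ['SVC_raw', 'SVC_scaled'], svc_space)]

results = NestedBayesCV(tups, y, nested=True, n_iter=10,
                        cv_outer=StratifiedKFold(3, shuffle=True, random_state=42),
                        cv_inner=StratifiedKFold(3, shuffle=True, random_state=42))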
# BayesSearch
lr_param_grid = {
    'C': Real(1e-6, 1e+6, prior='log-uniform'),
    'solver': Categorical(['newton-cg', 'lbfgs', 'liblinear'])
}

lr = LogisticRegression(max_iter=150)
lr_bs = BayesSearchCV(lr, lr_param_grid, scoring='accuracy', cv=strat_k_fold, n_iter=20)
lr_bs.fit(X, y)

print("best BS CV score:", lr_bs.best_score_)
print("best BS score on full data:", lr_bs.score(X, y))
print("best BS params:", lr_bs.best_params_)

#%%
# RF
strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)

# GridSearch
rf_param_grid = {
    'n_estimators': [5, 15, 50, 100],
    'max_depth': [5, 15, 25],
    'min_samples_leaf': [1, 3],
    'max_leaf_nodes': [10, 20, 50, 100]
}
cv_folds = [train_test_split(range(len(X)), train_size=0.666)]

model = BayesSearchCV(estimator=pipe,
                      search_spaces={
                          'model__latent_dim': (2, 20),
                          'model__intermediate_dim': (8, 128),
                          'model__epochs': (8, 128),
                          'model__D': (1e-3, 1e+3, 'log-uniform'),
                          'model__lr': (1e-4, 1e-2, 'log-uniform'),
                      },
                      n_iter=32,
                      cv=cv_folds,
                      refit=False,
                      error_score=-1.0)

model.on_step = lambda x: print((x, model.total_iterations(), model.best_score_))
model.fit(X, Y)
model.refit = True
model._fit_best_model(X, Y)
print(model.best_params_)
print(model.score(X, Y))

"""
model = pipe
model.set_params(**{'model__D': 5.1964624423233898,
                    'model__lr': 0.00010138257365940301,
                    'model__epochs': 26,
                    'model__intermediate_dim': 125,
                    'model__latent_dim': 2})
model.fit(X, Y)
print(model.predict(X, Y))
"""
# log-uniform: understand as search over p = exp(x) by varying x
opt = BayesSearchCV(
    SVC(),
    {
        'C': (1e-6, 1e+6, 'log-uniform'),
        'gamma': (1e-6, 1e+1, 'log-uniform'),
        'degree': (1, 8),  # integer valued parameter
        'kernel': ['linear', 'poly', 'rbf'],  # categorical parameter
    },
    n_iter=32,
    cv=3)

opt.fit(X_train, y_train)

print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_test, y_test))

#############################################################################
# Advanced example
# ================
#
# In practice, one wants to enumerate over multiple predictive model classes,
# with different search spaces and number of evaluations per class. An
# example of such search over parameters of Linear SVM, Kernel SVM, and
# decision trees is given below.

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_objective, plot_histogram

from sklearn.datasets import load_digits
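The advanced example itself is not included in this excerpt. A minimal sketch of the multi-space API it describes, passing a list of (search_space, n_iter) pairs over a single-step Pipeline, with assumed parameter ranges:

# Sketch only: the concrete spaces and ranges are assumptions, not the original example.
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier

X, y = load_digits(n_class=10, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# a single 'model' step whose estimator is itself part of the search space
pipe = Pipeline([('model', SVC())])

linsvc_search = {
    'model': Categorical([LinearSVC(max_iter=1000)]),
    'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
}
svc_search = {
    'model': Categorical([SVC()]),
    'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
    'model__gamma': Real(1e-6, 1e+1, prior='log-uniform'),
    'model__kernel': Categorical(['linear', 'poly', 'rbf']),
}
dtc_search = {
    'model': Categorical([DecisionTreeClassifier()]),
    'model__max_depth': Integer(1, 32),
}

# each entry is (search_space, number of evaluations to spend on that space)
opt = BayesSearchCV(pipe, [(svc_search, 24), (linsvc_search, 8), (dtc_search, 8)], cv=3)
opt.fit(X_train, y_train)
print("best params: %s" % opt.best_params_)
print("test score: %s" % opt.score(X_test, y_test))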
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import timeit


class BayesSearchCV(BayesSearchCV):
    def _run_search(self, x):
        raise BaseException('Use newer skopt')


X_train, y_train = mnist_reader.load_mnist('data/fashion', kind='train')
X_test, y_test = mnist_reader.load_mnist('data/fashion', kind='t10k')
X_train = X_train[:10000]
y_train = y_train[:10000]

start = timeit.default_timer()
opt = BayesSearchCV(SVC(), {
    'C': Real(0.001, 10, prior='log-uniform'),
    'gamma': Real(0.001, 1, prior='log-uniform'),
    'kernel': Categorical(['poly']),
}, n_iter=40)
opt.fit(X_train, y_train)
stop = timeit.default_timer()

print("Bayes on fashion data with kernel poly")
print(opt.score(X_test, y_test))
print(opt.best_params_)
print('Time: ', stop - start)
        'max_depth': (3, 15),
        'gamma': (1e-6, 1e+1, 'log-uniform'),
        'learning_rate': (0.01, 0.4, 'log-uniform'),
        'min_child_weight': (1, 10),
        'subsample': (0.5, 1.0, 'log-uniform'),
        'colsample_bytree': (0.5, 1.0, 'log-uniform'),
        'n_estimators': (100, 1000)
    },
    n_iter=32,
    random_state=42,
    cv=3
)
xgb_opt.fit(X_train, Y_train)
xgb_opt.score(X_train, Y_train)

# Accuracy of the model on the validation set
y_pred = xgb_opt.predict(X_test)
from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(Y_test, y_pred))

# Load the testSpike data
X_test = mat['testSpike']
X_test.shape

# Pre-processing of the testSpike data
l = 2
X = numpy.array([])
X = numpy.mean(X_test[:, 0:1], axis=1)[numpy.newaxis].T
for i in range(2, len(X_test[0]) + 1):
    cb,
    cb_params,
    n_iter=32,
    fit_params=dict(cat_features=cat_feats),
    # n_jobs=2
)

X = app_train.drop(['SK_ID_CURR', "TARGET"], axis=1)
y = app_train["TARGET"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.75)

opt.fit(X_train, y_train)

print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_test, y_test))
print(opt.cv_results_)

with open("wwwww.txt", "w+") as f:
    f.write(str(opt.best_score_))
    f.write("\n")
    f.write(str(opt.score(X_test, y_test)))
    f.write("\n")
    f.write(str(opt.cv_results_))

#
# print("start xgboost")
# xg_params = dict(
#     n_estimators=(100, 400),
#     max_depth=(5, 20),
#     colsample_bytree=(0.6, 1),
#     reg_alpha=(0.01, 10),