def _fit(self, n_calls: int = 100, n_random_starts: int = 10,
         n_restarts_optimizer: int = 50, random_state: int = 42,
         early_stopping_delta: float = 0.001,
         early_stopping_best_models: int = 5, n_jobs: int = -1):
    """Minimize the function score."""
    self._space.rasterize()
    history = History(self._space, self._maximization_problem)
    results = gp_minimize(
        func=self._decorate_score(self._score),
        dimensions=self._space.space,
        n_calls=n_calls,
        n_random_starts=n_random_starts,
        n_jobs=n_jobs,
        n_restarts_optimizer=n_restarts_optimizer,
        callback=[
            TQDMGaussianProcess(n_calls),
            history,
            DeltaYStopper(early_stopping_delta,
                          n_best=early_stopping_best_models),
        ],
        random_state=random_state,
    )
    self._best_parameters = self._space.inflate_results(results)
    return history
def test_deltay_stopper():
    deltay = DeltaYStopper(0.2, 3)
    Result = namedtuple('Result', ['func_vals'])

    # Stop: the three best values (0, 0.1, 0.19) lie within delta=0.2.
    assert deltay(Result([0, 1, 2, 3, 4, 0.1, 0.19]))

    # Keep going: the three best values (0, 0.1, 1) are spread wider than 0.2.
    assert not deltay(Result([0, 1, 2, 3, 4, 0.1]))

    # Undecided: fewer than n_best=3 points, so the callback returns None.
    assert deltay(Result([0, 1])) is None
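# The test above pins down the callback's contract: it returns True (stop)
# once the n_best best func_vals are within delta of each other, False while
# they still disagree, and None before n_best points exist. A minimal sketch
# of the same callback driving gp_minimize, with a toy one-dimensional
# objective that is not from the source:
from skopt import gp_minimize
from skopt.callbacks import DeltaYStopper

def objective(x):
    # Toy objective, for illustration only: minimum at x = 2.
    return (x[0] - 2.0) ** 2

result = gp_minimize(
    objective,
    dimensions=[(-5.0, 5.0)],
    n_calls=50,
    # Stop early once the five best scores agree to within 1e-3.
    callback=[DeltaYStopper(delta=1e-3, n_best=5)],
    random_state=0,
)
print(result.x, result.fun)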
def get_bayes_scikit_score_cv(X_train, y_train, X_test, y_test,
                              X_val=None, y_val=None, max_evals=25,
                              folds=5, original=None):
    space = get_baesian_space(dictem=True)
    opt_cat = BayesSearchCV(CatBoostClassifier(logging_level='Silent'),
                            space['CAT'], n_iter=max_evals, random_state=0)
    opt_xgb = BayesSearchCV(XGBClassifier(), space['XGB'],
                            n_iter=max_evals, random_state=0)
    opt_lgbm = BayesSearchCV(LGBMClassifier(), space['LGBM'],
                             n_iter=max_evals, random_state=0)

    opt_cat.fit(X_train, y_train,
                callback=[DeltaXStopper(0.01), DeltaYStopper(0.01)])
    opt_xgb.fit(X_train, y_train,
                callback=[DeltaXStopper(0.01), DeltaYStopper(0.01)])
    opt_lgbm.fit(X_train, y_train,
                 callback=[DeltaXStopper(0.01), DeltaYStopper(0.01)])

    scores = [opt_cat.score(X_test, y_test),
              opt_xgb.score(X_test, y_test),
              opt_lgbm.score(X_test, y_test)]
    train_scores = [opt_cat.best_score_,
                    opt_xgb.best_score_,
                    opt_lgbm.best_score_]

    score = max(scores)
    cross_score = max(train_scores)
    neptune.log_metric(f'skopt-{max_evals}-iterations-{folds}-folds', score)
    neptune.log_metric('skopt train holdout score', cross_score)
    return score
def optimize(self, loss: BaseLoss, n_calls: int = 100, max_calls: int = 100,
             n_random_starts: int = 30, use_cache=True,
             initial_parameter_points: Optional[np.ndarray] = None,
             delta: float = 0.0, random_state: int = 1, verbose=True,
             **kwargs):
    result = self.load_cached_result(loss)
    x0, y0 = get_initial_points(result)
    # Guard against the default None before taking len().
    if initial_parameter_points is not None \
            and len(initial_parameter_points) > 0 and x0 is not None:
        x0 += initial_parameter_points

    # Ensure max_calls is respected over past calls.
    n_past_calls = 0 if x0 is None else len(x0)
    n_calls = min(n_calls, max_calls - n_past_calls)

    # skopt minimize requires at least n_random_starts calls.
    if n_calls < n_random_starts:
        print("Returning due to few remaining calls (n_calls < n_random_starts)")
        return result

    opts = dict(dimensions=self.dimensions,
                acq_func=self.acq_func,
                n_calls=n_calls,
                n_random_starts=n_random_starts,
                callback=[DeltaYStopper(delta)],
                x0=x0,
                y0=y0,
                n_jobs=-1,
                random_state=random_state,
                verbose=verbose)
    opts.update(kwargs)
    result = self.skopt_minimize(loss, **opts)
    if use_cache:
        self.cache_result(loss, result)
    return result
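# The warm start above follows skopt's standard resume pattern: feed past
# evaluations back in through x0/y0. A hypothetical get_initial_points
# matching the call above (not from the source) could look like this, relying
# on the x_iters and func_vals attributes of skopt's OptimizeResult:
def get_initial_points(result):
    if result is None:
        return None, None
    return list(result.x_iters), list(result.func_vals)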
def _set_callbacks(self, names=['saver', 'stopper', 'timer', 'verb']):
    os.makedirs(self.chkpoint_root, exist_ok=True)
    checkpoint_saver = CheckpointSaver(self.tune_name + '.pkl',
                                       compress=9, store_objective=False)
    stopper = DeltaYStopper(self.params['tune']['stop_margin'],
                            self.params['tune']['stopper_patience'])
    timer = TimerCallback()
    verb = VerboseCallback(n_total=1)
    callbacks = {
        'saver': checkpoint_saver,
        'stopper': stopper,
        'timer': timer,
        'verb': verb,
    }
    self.callbacks = [callbacks[x] for x in names]
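# Because CheckpointSaver pickles the intermediate OptimizeResult, a run cut
# short by the stopper (or a crash) can be inspected or resumed with
# skopt.load. A minimal sketch, assuming the same file name used above:
from skopt import load

previous = load(tune_name + '.pkl')   # the path self.tune_name + '.pkl' above
print(previous.x, previous.fun)       # best point and score so far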
                mode='u')
            if task[0]['name'] != "A-E+A-P":  # TODO: search for a more elegant solution
                X_train_int, y_train_int = filter_by_tasks(X_train_int,
                                                           y_train_int, task)
                X_val, y_val = filter_by_tasks(X_val, y_val, task)
            root_logger.debug("Internal Train size: {}, Validation size: {}".format(
                len(X_train_int), len(X_val)))
            root_logger.debug("BAYESIAN OPTIMIZER - Started to search params")
            delta_stopper = DeltaYStopper(n_best=BOconfig["n_best"],
                                          delta=BOconfig["delta"])
            min_res = gp_minimize(func=fitness,
                                  dimensions=mlp_parameters_space,
                                  acq_func=BOconfig["acq_function"],
                                  callback=[delta_stopper],
                                  n_calls=BOconfig["nBayesianOptCall"])
            root_logger.debug("BAYESIAN OPTIMIZER - Best parameters found: {}".format(
                min_res.x))
    if experiment == "bayesianMLP":
        hidden_layers_comb = get_hidden_layers_combinations(
            BOconfig["hiddenLayers"], 3, BOconfig["allowFirstLevelZero"])
def get_params_SKopt(model, X, Y, space, cv_search, alg='catboost',
                     cat_features=None, eval_dataset=None, UBM=False,
                     opt_method='gbrt_minimize', verbose=True, multi=False,
                     scoring='neg_mean_squared_error', n_best=50,
                     total_time=7200):
    """Perform parameter tuning of an algorithm using the scikit-optimize library.

    Parameters:
        1.
        2.
        3. multi - boolean, used when a multioutput algorithm is tuned

    UPDATES:
        1. This version adds support for the catboost algorithms.
    """
    if alg == 'catboost':
        fitparam = {
            'eval_set': eval_dataset,
            'use_best_model': UBM,
            'cat_features': cat_features,
            'early_stopping_rounds': 20,
        }
    else:
        fitparam = {}

    @use_named_args(space)
    def objective(**params):
        model.set_params(**params)
        return -np.mean(cross_val_score(model, X, Y, cv=cv_search,
                                        scoring=scoring, fit_params=fitparam))

    if opt_method == 'gbrt_minimize':
        HPO_PARAMS = {'n_calls': 1000, 'n_random_starts': 20, 'acq_func': 'EI'}
        reg_gp = gbrt_minimize(objective, space,
                               n_jobs=-1,
                               verbose=verbose,
                               callback=[DeltaYStopper(delta=0.01, n_best=5),
                                         RepeatedMinStopper(n_best=n_best),
                                         DeadlineStopper(total_time=total_time)],
                               random_state=RANDOM_STATE,
                               **HPO_PARAMS)
    elif opt_method == 'forest_minimize':
        HPO_PARAMS = {'n_calls': 1000, 'n_random_starts': 20, 'acq_func': 'EI'}
        reg_gp = forest_minimize(objective, space,
                                 n_jobs=-1,
                                 verbose=verbose,
                                 callback=[RepeatedMinStopper(n_best=n_best),
                                           DeadlineStopper(total_time=total_time)],
                                 random_state=RANDOM_STATE,
                                 **HPO_PARAMS)
    elif opt_method == 'gp_minimize':
        HPO_PARAMS = {'n_calls': 1000, 'n_random_starts': 20,
                      'acq_func': 'gp_hedge'}
        reg_gp = gp_minimize(objective, space,
                             n_jobs=-1,
                             verbose=verbose,
                             callback=[RepeatedMinStopper(n_best=n_best),
                                       DeadlineStopper(total_time=total_time)],
                             random_state=RANDOM_STATE,
                             **HPO_PARAMS)

    TUNED_PARAMS = {}
    for i, item in enumerate(space):
        if multi:
            TUNED_PARAMS[item.name.split('__')[1]] = reg_gp.x[i]
        else:
            TUNED_PARAMS[item.name] = reg_gp.x[i]

    return [TUNED_PARAMS, reg_gp]
    ])
    df = pd.read_csv(filename, index_col=0)
    score = int(-df["mean"].values.flatten()[0])
    return score


space = Space({
    "oblivion": (0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
    "mu_memory": [0, 1000],
    "mu_data": [0, 1000],
})

n_calls = 1000
gp = GaussianProcess(score, space)
with Notipy():
    results = gp.minimize(n_calls=n_calls,
                          n_random_starts=10,
                          callback=[TQDMGaussianProcess(n_calls=n_calls),
                                    DeltaYStopper(delta=1, n_best=100)],
                          random_state=42,
                          n_jobs=cpu_count())
print(gp.best_parameters)
pd.DataFrame(dict(gp.best_parameters),
             index=[0]).to_json("best_parameters.json")
def _check_parameters(self):
    """Check the validity of the input parameters."""
    if self.mapping is None:
        self.mapping = {str(v): v for v in sorted(self.y.unique())}

    if self.scaled is None:
        self.scaled = check_scaling(self.X)

    # Create model subclasses ================================== >>

    models = []
    for m in self._models:
        if isinstance(m, str):
            acronym = get_acronym(m, must_be_equal=False)

            # Check if packages for non-sklearn models are available
            if acronym in OPTIONAL_PACKAGES:
                try:
                    importlib.import_module(OPTIONAL_PACKAGES[acronym])
                except ImportError:
                    raise ValueError(
                        f"Unable to import the {OPTIONAL_PACKAGES[acronym]} "
                        "package. Make sure it is installed."
                    )

            # Check for regression/classification-only models
            if self.goal.startswith("class") and acronym in ONLY_REG:
                raise ValueError(
                    f"The {acronym} model can't perform classification tasks!"
                )
            elif self.goal.startswith("reg") and acronym in ONLY_CLASS:
                raise ValueError(
                    f"The {acronym} model can't perform regression tasks!"
                )

            models.append(MODEL_LIST[acronym](self, acronym + m[len(acronym):]))

        elif not isinstance(m, BaseModel):  # Model is custom estimator
            models.append(CustomModel(self, estimator=m))

        else:  # Model is already a model subclass (can happen with reruns)
            models.append(m)

    self._models = CustomDict({m.name: m for m in models})

    # Check validity metric ==================================== >>

    if None in self._metric:
        self._metric = CustomDict(get_default_metric(self.task))

    # Ignore if it's the same metric as previous call
    elif not all([hasattr(m, "name") for m in self._metric]):
        self._metric = self._prepare_metric(
            metric=self._metric,
            greater_is_better=self.greater_is_better,
            needs_proba=self.needs_proba,
            needs_threshold=self.needs_threshold,
        )

    # Check validity sequential parameters ===================== >>

    for param in ["n_calls", "n_initial_points", "bagging"]:
        p = lst(getattr(self, param))
        if len(p) != 1 and len(p) != len(self._models):
            raise ValueError(
                f"Invalid value for the {param} parameter. Length "
                "should be equal to the number of models, got len"
                f"(models)={len(self._models)} and len({param})={len(p)}."
            )

        for i, model in enumerate(self._models):
            if param in ("n_calls", "bagging") and p[i % len(p)] < 0:
                raise ValueError(
                    f"Invalid value for the {param} parameter. "
                    f"Value should be >=0, got {p[i % len(p)]}."
                )
            elif param == "n_initial_points" and p[i % len(p)] <= 0:
                raise ValueError(
                    f"Invalid value for the {param} parameter. "
                    f"Value should be >0, got {p[i % len(p)]}."
                )

            setattr(model, "_" + param, p[i % len(p)])

    # Prepare bo parameters ==================================== >>

    # Choose a base estimator (GP is chosen as default)
    self._base_estimator = self.bo_params.get("base_estimator", "GP")
    if isinstance(self._base_estimator, str):
        if self._base_estimator.lower() not in ("gp", "et", "rf", "gbrt"):
            raise ValueError(
                f"Invalid value for the base_estimator parameter, got "
                f"{self._base_estimator}. Value should be one of: 'GP', "
                f"'ET', 'RF', 'GBRT'."
            )

    if self.bo_params.get("callbacks"):
        self._callbacks = lst(self.bo_params["callbacks"])

    if "max_time" in self.bo_params:
        if self.bo_params["max_time"] <= 0:
            raise ValueError(
                "Invalid value for the max_time parameter. "
                f"Value should be >0, got {self.bo_params['max_time']}."
            )
        self._callbacks.append(DeadlineStopper(self.bo_params["max_time"]))

    if "delta_x" in self.bo_params:
        if self.bo_params["delta_x"] < 0:
            raise ValueError(
                "Invalid value for the delta_x parameter. "
                f"Value should be >=0, got {self.bo_params['delta_x']}."
            )
        self._callbacks.append(DeltaXStopper(self.bo_params["delta_x"]))

    if "delta_y" in self.bo_params:
        if self.bo_params["delta_y"] < 0:
            raise ValueError(
                "Invalid value for the delta_y parameter. "
                f"Value should be >=0, got {self.bo_params['delta_y']}."
            )
        self._callbacks.append(
            DeltaYStopper(self.bo_params["delta_y"], n_best=5))

    if self.bo_params.get("plot"):
        self._callbacks.append(PlotCallback(self))

    if "cv" in self.bo_params:
        if self.bo_params["cv"] <= 0:
            raise ValueError(
                "Invalid value for the cv parameter. "
                f"Value should be >0, got {self.bo_params['cv']}."
            )
        self._cv = self.bo_params["cv"]

    if "early_stopping" in self.bo_params:
        if self.bo_params["early_stopping"] <= 0:
            raise ValueError(
                "Invalid value for the early_stopping parameter. "
                f"Value should be >0, got {self.bo_params['early_stopping']}."
            )
        self._early_stopping = self.bo_params["early_stopping"]

    # Add custom dimensions to every model subclass
    if self.bo_params.get("dimensions"):
        for name, model in self._models.items():
            # If not dict, the dimensions are for all models
            if not isinstance(self.bo_params["dimensions"], dict):
                model._dimensions = self.bo_params["dimensions"]
            else:  # Dimensions for every specific model
                for key, value in self.bo_params["dimensions"].items():
                    # Parameters for this model only
                    if key.lower() == name:
                        model._dimensions = value
                        break

    kwargs = [
        "base_estimator",
        "max_time",
        "delta_x",
        "delta_y",
        "early_stopping",
        "cv",
        "callbacks",
        "dimensions",
        "plot",
    ]

    # The remaining bo_params are added as kwargs to the optimizer
    self._bo_kwargs = {k: v for k, v in self.bo_params.items() if k not in kwargs}

    # Prepare est_params ======================================= >>

    if self.est_params:
        for name, model in self._models.items():
            params = {}
            for key, value in self.est_params.items():
                # Parameters for this model only
                if key.lower() == name:
                    params.update(value)
                # Parameters for all models
                elif key.lower() not in self._models.keys():
                    params.update({key: value})

            for key, value in params.items():
                if key.endswith("_fit"):
                    model._est_params_fit[key[:-4]] = value
                else:
                    model._est_params[key] = value
def bayesian_cnn_exp(gene, mode):
    BOconfig = Config.get("bayesianOpt")
    mlp_parameters_space = [
        conf_to_params(conf)
        for conf in Config.get("bayesianOpt")["hyperparameters"]
    ]

    @use_named_args(mlp_parameters_space)
    def fitness_mlp(kernel_space_1, units_2, kernel_space_2, dense_1, dense_2):
        es = EarlyStopping(monitor='val_loss',
                           patience=Config.get("ESValPatience"),
                           min_delta=Config.get("ESValMinDelta"),
                           baseline=0.2)
        model, hist = train_bayesian_cnn(X_train_int, y_train_int,
                                         (X_val, y_val), es, kernel_space_1,
                                         units_2, kernel_space_2,
                                         dense_1, dense_2)
        val_auprc = hist.history['val_auprc'][-1]
        print()
        print("Validation AUPRC: {}".format(val_auprc))
        print()
        return -val_auprc

    print()
    print("Importing Epigenetic data...")
    print()
    X, y, features_size = filter_by_tasks(
        *import_sequence_dataset("data", gene),
        Config.get("task"),
        perc=Config.get("samplePerc"))
    print("Datasets length: {}, {}".format(len(X), len(y)))
    print("Features sizes: {}".format(features_size))

    metrics = {'losses': [], 'auprc': [], 'auroc': []}
    delta_stopper = DeltaYStopper(n_best=BOconfig["n_best"],
                                  delta=BOconfig["delta"])

    for ext_holdout in range(Config.get("nExternalHoldout")):
        print()
        print("{}/{} EXTERNAL HOLDOUTS".format(ext_holdout,
                                               Config.get("nExternalHoldout")))
        print()
        X_train, X_test, y_train, y_test = split(X, y, random_state=42,
                                                 proportions=None, mode=mode)
        # Internal holdouts
        X_train_int, X_val, y_train_int, y_val = split(X_train, y_train,
                                                       random_state=42,
                                                       proportions=None,
                                                       mode=mode)

        print("Searching Parameters...")
        print()
        min_res = gp_minimize(func=fitness_mlp,
                              dimensions=mlp_parameters_space,
                              acq_func=BOconfig["acq_function"],
                              callback=[delta_stopper],
                              n_calls=BOconfig["nBayesianOptCall"])

        print()
        print("Training with best parameters found: {}".format(min_res.x))
        print()
        print(X_train)
        es = EarlyStopping(monitor='val_loss',
                           patience=Config.get("ESPatience"),
                           min_delta=Config.get("ESMinDelta"))
        # All five tuned hyperparameters, in search-space order.
        model, _ = train_bayesian_cnn(X_train, y_train, None, es,
                                      min_res.x[0], min_res.x[1],
                                      min_res.x[2], min_res.x[3],
                                      min_res.x[4])
        eval_score = model.evaluate(X_test, y_test)
        # K.clear_session()
        print("Metrics names: ", model.metrics_names)
        print("Final Scores: ", eval_score)
        metrics['losses'].append(eval_score[0])
        metrics['auprc'].append(eval_score[1])
        metrics['auroc'].append(eval_score[2])

    return metrics
def bayesian_mlp_exp(gene, mode):
    BOconfig = Config.get("bayesianOpt")
    hidden_layers_comb = get_hidden_layers_combinations(
        BOconfig["hiddenLayers"], 3, BOconfig["allowFirstLevelZero"])
    mlp_parameters_space = get_parameters_space(hidden_layers_comb)

    @use_named_args(mlp_parameters_space)
    def fitness_mlp(learning_rate, num_hidden_layer, hidden_layer_choice):
        model, hist = train_bayesian_mlp(X_train_int, y_train_int,
                                         (X_val, y_val), features_size,
                                         learning_rate, num_hidden_layer,
                                         hidden_layer_choice,
                                         hidden_layers_comb)
        val_auprc = hist.history['val_auprc'][-1]
        print()
        print("Validation AUPRC: {}".format(val_auprc))
        print()
        return -val_auprc

    print()
    print("Importing Epigenetic data...")
    print()
    X, y, features_size = filter_by_tasks(
        *import_epigenetic_dataset("data", gene),
        Config.get("task"),
        perc=Config.get("samplePerc"))
    print("Datasets length: {}, {}".format(len(X), len(y)))
    print("Features sizes: {}".format(features_size))

    metrics = {'losses': [], 'auprc': [], 'auroc': []}
    delta_stopper = DeltaYStopper(n_best=BOconfig["n_best"],
                                  delta=BOconfig["delta"])

    for ext_holdout in range(Config.get("nExternalHoldout")):
        print()
        print("{}/{} EXTERNAL HOLDOUTS".format(ext_holdout,
                                               Config.get("nExternalHoldout")))
        print()
        X_train, X_test, y_train, y_test = split(X, y, random_state=42,
                                                 proportions=None, mode=mode)
        # Internal holdouts
        X_train_int, X_val, y_train_int, y_val = split(X_train, y_train,
                                                       random_state=42,
                                                       proportions=None,
                                                       mode=mode)

        print("Searching Parameters...")
        print()
        min_res = gp_minimize(func=fitness_mlp,
                              dimensions=mlp_parameters_space,
                              acq_func=BOconfig["acq_function"],
                              callback=[delta_stopper],
                              n_calls=BOconfig["nBayesianOptCall"])

        print()
        print("Training with best parameters found: {}".format(min_res.x))
        print()
        print(X_train)
        model, _ = train_bayesian_mlp(X_train, y_train, None, features_size,
                                      min_res.x[0], min_res.x[1],
                                      min_res.x[2], hidden_layers_comb)
        eval_score = model.evaluate(X_test, y_test)
        # K.clear_session()
        print("Metrics names: ", model.metrics_names)
        print("Final Scores: ", eval_score)
        metrics['losses'].append(eval_score[0])
        metrics['auprc'].append(eval_score[1])
        metrics['auroc'].append(eval_score[2])

    return metrics