Esempio n. 1
0
 def _fit(self,
          n_calls: int = 100,
          n_random_starts: int = 10,
          n_restarts_optimizer: int = 50,
          random_state: int = 42,
          early_stopping_delta: float = 0.001,
          early_stopping_best_models: int = 5,
          n_jobs: int = -1):
     """Run ``gp_minimize`` on the decorated score and record the outcome.

     The search space is rasterized first. The optimizer is given three
     callbacks: a ``TQDMGaussianProcess`` built from ``n_calls``, a
     ``History`` built from the space and the maximization flag, and a
     ``DeltaYStopper`` configured with ``early_stopping_delta`` and
     ``n_best=early_stopping_best_models``.

     The inflated optimizer result is stored in ``self._best_parameters``
     and the ``History`` callback instance is returned.
     """
     self._space.rasterize()
     history = History(self._space, self._maximization_problem)

     objective = self._decorate_score(self._score)
     dimensions = self._space.space
     callbacks = [
         TQDMGaussianProcess(n_calls),
         history,
         DeltaYStopper(early_stopping_delta,
                       n_best=early_stopping_best_models),
     ]

     results = gp_minimize(
         func=objective,
         dimensions=dimensions,
         n_calls=n_calls,
         n_random_starts=n_random_starts,
         n_jobs=n_jobs,
         n_restarts_optimizer=n_restarts_optimizer,
         callback=callbacks,
         random_state=random_state,
     )

     self._best_parameters = self._space.inflate_results(results)
     return history
Esempio n. 2
0
def test_deltay_stopper():
    """Check the verdict of ``DeltaYStopper(0.2, 3)`` on three histories."""
    Result = namedtuple('Result', ['func_vals'])
    stopper = DeltaYStopper(0.2, 3)

    # Seven evaluations: the test expects a truthy (stop) verdict.
    verdict_long = stopper(Result([0, 1, 2, 3, 4, 0.1, 0.19]))
    assert verdict_long

    # Same history without the last value: the test expects a falsy verdict.
    verdict_shorter = stopper(Result([0, 1, 2, 3, 4, 0.1]))
    assert not verdict_shorter

    # Only two evaluations: the test expects no verdict at all (None).
    verdict_too_few = stopper(Result([0, 1]))
    assert verdict_too_few is None
def get_bayes_scikit_score_cv(X_train, y_train, X_test, y_test, X_val=None, y_val=None, max_evals=25, folds=5, original=None):
    """Tune CatBoost, XGBoost and LightGBM with ``BayesSearchCV`` and log the best scores.

    One ``BayesSearchCV`` is built per classifier, using the sub-space stored
    under the keys ``'CAT'``, ``'XGB'`` and ``'LGBM'`` of
    ``get_baesian_space(dictem=True)``. Each search is fitted on the training
    data with a ``DeltaXStopper(0.01)`` and a ``DeltaYStopper(0.01)`` callback.

    Parameters:
        X_train, y_train: data the searches are fitted on.
        X_test, y_test: data passed to each fitted search's ``score``.
        X_val, y_val: accepted but not used by this function.
        max_evals: forwarded as ``n_iter`` to every search.
        folds: forwarded as ``cv`` to every search, so the number of folds
            written into the logged metric name is the one actually used.
        original: accepted but not used by this function.

    Returns:
        The highest ``score(X_test, y_test)`` among the three searches.
        That value and the highest ``best_score_`` are also sent to
        ``neptune.log_metric``.
    """
    space = get_baesian_space(dictem=True)

    # (key into the space dict, estimator to tune); order is kept from the
    # original so any order-dependent logging stays the same.
    candidates = [
        ('CAT', CatBoostClassifier(logging_level='Silent')),
        ('XGB', XGBClassifier()),
        ('LGBM', LGBMClassifier()),
    ]
    searches = [
        BayesSearchCV(estimator, space[key], n_iter=max_evals, cv=folds, random_state=0)
        for key, estimator in candidates
    ]

    for search in searches:
        # Fresh stopper instances per search, as in the original calls.
        search.fit(X_train, y_train, callback=[DeltaXStopper(0.01), DeltaYStopper(0.01)])

    scores = [search.score(X_test, y_test) for search in searches]
    train_scores = [search.best_score_ for search in searches]
    score = max(scores)
    cross_score = max(train_scores)
    neptune.log_metric(f'skopt-{max_evals}-iterations-{folds}-folds', score)
    neptune.log_metric('skopt train holdout score', cross_score)
    return score
Esempio n. 4
0
    def optimize(self,
                 loss: BaseLoss,
                 n_calls: int = 100,
                 max_calls: int = 100,
                 n_random_starts: int = 30,
                 use_cache=True,
                 initial_parameter_points: Optional[np.ndarray] = None,
                 delta: float = 0.0,
                 random_state: int = 1,
                 verbose=True,
                 **kwargs):
        """Continue (or start) the optimisation of ``loss`` from the cached result.

        The cached result for ``loss`` is loaded and its points, as returned
        by ``get_initial_points``, seed the new run. The number of new calls
        is capped so that past calls plus new calls do not exceed
        ``max_calls``.

        Parameters:
            loss: objective handed to ``self.skopt_minimize`` and used as the
                cache key.
            n_calls: requested number of new evaluations.
            max_calls: upper bound on past plus new evaluations.
            n_random_starts: forwarded to the minimizer; if fewer calls than
                this remain, the cached result is returned unchanged.
            use_cache: when true, the new result is stored with
                ``self.cache_result``.
            initial_parameter_points: optional extra points appended to the
                cached ``x0``. They are ignored when there is no cached
                ``x0``. ``None`` (the default) or an empty sequence adds
                nothing.
            delta: threshold passed to ``DeltaYStopper``.
            random_state: forwarded to the minimizer.
            verbose: forwarded to the minimizer.
            **kwargs: override or extend the minimizer options.

        Returns:
            The new optimisation result, or the cached result when too few
            calls remain.
        """
        result = self.load_cached_result(loss)

        x0, y0 = get_initial_points(result)
        # The default is None, so guard before calling len() on it.
        has_extra_points = (initial_parameter_points is not None
                            and len(initial_parameter_points) > 0)
        if has_extra_points and x0 is not None:
            # NOTE(review): y0 is not extended alongside x0 — confirm that
            # skopt_minimize accepts x0 and y0 of different lengths.
            x0 += initial_parameter_points

        # ensure max_calls is respected over past calls
        n_past_calls = 0 if x0 is None else len(x0)
        n_calls = min(n_calls, max_calls - n_past_calls)

        # skopt minimize requires at least the n_random_starts
        if n_calls < n_random_starts:
            print(
                "Returning due to few remaining calls (n_calls < n_random_starts)"
            )
            return result

        opts = dict(dimensions=self.dimensions,
                    acq_func=self.acq_func,
                    n_calls=n_calls,
                    n_random_starts=n_random_starts,
                    callback=[DeltaYStopper(delta)],
                    x0=x0,
                    y0=y0,
                    n_jobs=-1,
                    random_state=random_state,
                    verbose=verbose)
        # Caller-supplied keyword arguments win over the defaults above.
        opts.update(kwargs)

        result = self.skopt_minimize(loss, **opts)

        if use_cache:
            self.cache_result(loss, result)

        return result
    def _set_callbacks(self, names=('saver', 'stopper', 'timer', 'verb')):
        """Build the optimizer callbacks and store the selected ones.

        Creates the checkpoint directory ``self.chkpoint_root`` if needed,
        instantiates all four known callbacks and assigns to
        ``self.callbacks`` the ones listed in ``names``, in that order.

        Parameters:
            names: iterable of callback keys; valid keys are ``'saver'``,
                ``'stopper'``, ``'timer'`` and ``'verb'``. The default is an
                immutable tuple so it cannot be modified between calls.

        Raises:
            KeyError: if ``names`` contains an unknown key.
        """
        os.makedirs(self.chkpoint_root, exist_ok=True)

        # NOTE(review): the checkpoint path is built from self.tune_name only;
        # confirm it is meant to point inside self.chkpoint_root.
        checkpoint_saver = CheckpointSaver(self.tune_name + '.pkl',
                                           compress=9,
                                           store_objective=False)
        stopper = DeltaYStopper(self.params['tune']['stop_margin'],
                                self.params['tune']['stopper_patience'])
        timer = TimerCallback()
        verb = VerboseCallback(n_total=1)

        callbacks = {
            'saver': checkpoint_saver,
            'stopper': stopper,
            'timer': timer,
            'verb': verb,
        }

        self.callbacks = [callbacks[x] for x in names]
Esempio n. 6
0
                            mode='u')

                        if task[0][
                                'name'] != "A-E+A-P":  #TODO: search for a more elegant solution
                            X_train_int, y_train_int = filter_by_tasks(
                                X_train_int, y_train_int, task)
                            X_val, y_val = filter_by_tasks(X_val, y_val, task)

                        root_logger.debug(
                            "Internal Train size: {}, Validation size: {}".
                            format(len(X_train_int), len(X_val)))

                        root_logger.debug(
                            "BAYESIAN OPTIMIZER - Started to search params")

                        delta_stopper = DeltaYStopper(
                            n_best=BOconfig["n_best"], delta=BOconfig["delta"])
                        min_res = gp_minimize(
                            func=fitness,
                            dimensions=mlp_parameters_space,
                            acq_func=BOconfig["acq_function"],
                            callback=[delta_stopper],
                            n_calls=BOconfig["nBayesianOptCall"])

                        root_logger.debug(
                            "BAYESIAN OPTIMIZER - Best parameters found: {}".
                            format(min_res.x))

                        if experiment == "bayesianMLP":
                            hidden_layers_comb = get_hidden_layers_combinations(
                                BOconfig["hiddenLayers"], 3,
                                BOconfig["allowFirstLevelZero"])
Esempio n. 7
0
def get_params_SKopt(model, X, Y, space, cv_search, alg = 'catboost', cat_features = None, eval_dataset = None, UBM = False, opt_method =
                     'gbrt_minimize', verbose = True,  multi = False, scoring = 'neg_mean_squared_error', n_best = 50, total_time = 7200):
    """Tune the parameters of ``model`` with a scikit-optimize minimizer.

    The objective sets the candidate parameters on ``model`` and returns the
    negated mean of ``cross_val_score`` over ``cv_search``.

    Parameters:
        model: estimator exposing ``set_params``; it is mutated during the search.
        X, Y: data passed to ``cross_val_score``.
        space: list of named skopt dimensions; the names are the parameter names.
        cv_search: value forwarded as ``cv`` to ``cross_val_score``.
        alg: when ``'catboost'``, fit parameters (``eval_set``,
            ``use_best_model``, ``cat_features``, ``early_stopping_rounds=20``)
            are forwarded to the estimator's ``fit``; otherwise none are.
        cat_features, eval_dataset, UBM: values used for the catboost fit
            parameters above.
        opt_method: one of ``'gbrt_minimize'``, ``'forest_minimize'`` or
            ``'gp_minimize'``.
        verbose: forwarded to the minimizer.
        multi: when true, the part after the first ``'__'`` of each
            dimension name is used as the key of the returned parameters.
        scoring: forwarded to ``cross_val_score``.
        n_best: forwarded to ``RepeatedMinStopper``.
        total_time: forwarded to ``DeadlineStopper``.

    Returns:
        ``[tuned_params, result]`` where ``tuned_params`` maps parameter
        names to the best values found and ``result`` is the object returned
        by the minimizer.

    Raises:
        ValueError: if ``opt_method`` is not one of the supported names.
    """
    # Resolve the minimizer first so an unknown name fails immediately
    # instead of surfacing later as an unbound ``reg_gp``.
    if opt_method == 'gbrt_minimize':
        minimizer, acq_func = gbrt_minimize, 'EI'
    elif opt_method == 'forest_minimize':
        minimizer, acq_func = forest_minimize, 'EI'
    elif opt_method == 'gp_minimize':
        minimizer, acq_func = gp_minimize, 'gp_hedge'
    else:
        raise ValueError(
            f"Unknown opt_method {opt_method!r}. Expected one of: "
            "'gbrt_minimize', 'forest_minimize', 'gp_minimize'.")

    if alg == 'catboost':
        fitparam = { 'eval_set' : eval_dataset,
                     'use_best_model' : UBM,
                     'cat_features' : cat_features,
                     'early_stopping_rounds': 20 }
    else:
        fitparam = {}

    @use_named_args(space)
    def objective(**params):
        model.set_params(**params)
        # The mean score is negated so that the minimizer maximises it.
        return -np.mean(cross_val_score(model,
                                        X, Y,
                                        cv=cv_search,
                                        scoring= scoring,
                                        fit_params=fitparam))

    callbacks = [RepeatedMinStopper(n_best = n_best), DeadlineStopper(total_time = total_time)]
    if opt_method == 'gbrt_minimize':
        # Only the GBRT search additionally stops on a small spread of the best values.
        callbacks.insert(0, DeltaYStopper(delta = 0.01, n_best = 5))

    HPO_PARAMS = {'n_calls':1000,
                  'n_random_starts':20,
                  'acq_func':acq_func,}

    reg_gp = minimizer(objective,
                       space,
                       n_jobs = -1,
                       verbose = verbose,
                       callback = callbacks,
                       **HPO_PARAMS,
                       random_state = RANDOM_STATE)

    TUNED_PARAMS = {}
    for item, best_value in zip(space, reg_gp.x):
        key = item.name.split('__')[1] if multi else item.name
        TUNED_PARAMS[key] = best_value

    return [TUNED_PARAMS,reg_gp]
Esempio n. 8
0
    ])
    df = pd.read_csv(filename, index_col=0)
    score = int(-df["mean"].values.flatten()[0])
    return score


# Search space: a fixed grid for "oblivion" and a [0, 1000] pair for each
# of the two "mu_*" parameters.
space = Space({
    "oblivion": (0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
    "mu_memory": [0, 1000],
    "mu_data": [0, 1000]
})

n_calls = 1000
gp = GaussianProcess(score, space)

# The whole minimisation runs inside the Notipy context manager.
with Notipy():
    results = gp.minimize(
        n_calls=n_calls,
        n_random_starts=10,
        callback=[
            TQDMGaussianProcess(n_calls=n_calls),
            DeltaYStopper(delta=1, n_best=100),
        ],
        random_state=42,
        n_jobs=cpu_count(),
    )

# Show the best parameters and persist them as a one-row JSON table.
print(gp.best_parameters)
pd.DataFrame(
    dict(gp.best_parameters),
    index=[0],
).to_json("best_parameters.json")
Esempio n. 9
0
    def _check_parameters(self):
        """Check the validity of the input parameters.

        Besides validating, this method fills in derived state:

        - ``self.mapping`` and ``self.scaled`` when they are ``None``.
        - ``self._models``: a ``CustomDict`` of model instances built from
          strings, custom estimators or existing model objects.
        - ``self._metric``: the default metric when ``None`` is present,
          or a prepared metric when the entries lack a ``name`` attribute.
        - ``_n_calls``, ``_n_initial_points`` and ``_bagging`` on every model.
        - ``self._base_estimator``, ``self._callbacks``, ``self._cv``,
          ``self._early_stopping`` and ``self._bo_kwargs`` from ``bo_params``.
        - ``_dimensions``, ``_est_params`` and ``_est_params_fit`` on the
          models from ``bo_params["dimensions"]`` and ``est_params``.

        Raises:
            ValueError: if an optional package cannot be imported, a model
                does not support the current goal, a sequential parameter
                has the wrong length or sign, or one of the ``bo_params``
                values is out of range.
        """
        if self.mapping is None:
            self.mapping = {str(v): v for v in sorted(self.y.unique())}

        if self.scaled is None:
            self.scaled = check_scaling(self.X)

        # Create model subclasses ================================== >>

        models = []
        for m in self._models:
            if isinstance(m, str):
                acronym = get_acronym(m, must_be_equal=False)

                # Check if packages for non-sklearn models are available
                if acronym in OPTIONAL_PACKAGES:
                    try:
                        importlib.import_module(OPTIONAL_PACKAGES[acronym])
                    except ImportError as err:
                        raise ValueError(
                            f"Unable to import the {OPTIONAL_PACKAGES[acronym]} "
                            "package. Make sure it is installed.") from err

                # Check for regression/classification-only models
                if self.goal.startswith("class") and acronym in ONLY_REG:
                    raise ValueError(
                        f"The {acronym} model can't perform classification tasks!"
                    )
                elif self.goal.startswith("reg") and acronym in ONLY_CLASS:
                    raise ValueError(
                        f"The {acronym} model can't perform regression tasks!")

                # Keep whatever the user wrote after the acronym as a suffix
                models.append(MODEL_LIST[acronym](self,
                                                  acronym + m[len(acronym):]))

            elif not isinstance(m, BaseModel):  # Model is custom estimator
                models.append(CustomModel(self, estimator=m))

            else:  # Model is already a model subclass (can happen with reruns)
                models.append(m)

        self._models = CustomDict({m.name: m for m in models})

        # Check validity metric ==================================== >>

        if None in self._metric:
            self._metric = CustomDict(get_default_metric(self.task))

        # Ignore if it's the same metric as previous call
        elif not all(hasattr(m, "name") for m in self._metric):
            self._metric = self._prepare_metric(
                metric=self._metric,
                greater_is_better=self.greater_is_better,
                needs_proba=self.needs_proba,
                needs_threshold=self.needs_threshold,
            )

        # Check validity sequential parameters ===================== >>

        for param in ["n_calls", "n_initial_points", "bagging"]:
            p = lst(getattr(self, param))
            if len(p) != 1 and len(p) != len(self._models):
                raise ValueError(
                    f"Invalid value for the {param} parameter. Length "
                    "should be equal to the number of models, got len"
                    f"(models)={len(self._models)} and len({param})={len(p)}.")

            for i, model in enumerate(self._models):
                # A single value is reused for every model via the modulo
                value = p[i % len(p)]
                if param in ("n_calls", "bagging") and value < 0:
                    raise ValueError(
                        f"Invalid value for the {param} parameter. "
                        f"Value should be >=0, got {value}.")
                elif param == "n_initial_points" and value <= 0:
                    raise ValueError(
                        f"Invalid value for the {param} parameter. "
                        f"Value should be >0, got {value}.")

                setattr(model, "_" + param, value)

        # Prepare bo parameters ===================================== >>

        # Choose a base estimator (GP is chosen as default)
        self._base_estimator = self.bo_params.get("base_estimator", "GP")
        if isinstance(self._base_estimator, str):
            if self._base_estimator.lower() not in ("gp", "et", "rf", "gbrt"):
                raise ValueError(
                    f"Invalid value for the base_estimator parameter, got "
                    f"{self._base_estimator}. Value should be one of: 'GP', "
                    f"'ET', 'RF', 'GBRT'.")

        if self.bo_params.get("callbacks"):
            self._callbacks = lst(self.bo_params["callbacks"])

        if "max_time" in self.bo_params:
            if self.bo_params["max_time"] <= 0:
                raise ValueError(
                    "Invalid value for the max_time parameter. "
                    f"Value should be >0, got {self.bo_params['max_time']}.")
            self._callbacks.append(DeadlineStopper(self.bo_params["max_time"]))

        if "delta_x" in self.bo_params:
            if self.bo_params["delta_x"] < 0:
                raise ValueError(
                    "Invalid value for the delta_x parameter. "
                    f"Value should be >=0, got {self.bo_params['delta_x']}.")
            self._callbacks.append(DeltaXStopper(self.bo_params["delta_x"]))

        if "delta_y" in self.bo_params:
            if self.bo_params["delta_y"] < 0:
                raise ValueError(
                    "Invalid value for the delta_y parameter. "
                    f"Value should be >=0, got {self.bo_params['delta_y']}.")
            self._callbacks.append(
                DeltaYStopper(self.bo_params["delta_y"], n_best=5))

        if self.bo_params.get("plot"):
            self._callbacks.append(PlotCallback(self))

        if "cv" in self.bo_params:
            if self.bo_params["cv"] <= 0:
                # Message names the cv parameter and matches the strict check
                raise ValueError(
                    "Invalid value for the cv parameter. "
                    f"Value should be >0, got {self.bo_params['cv']}.")
            self._cv = self.bo_params["cv"]

        if "early_stopping" in self.bo_params:
            if self.bo_params["early_stopping"] <= 0:
                # Zero is rejected too, so the bound in the message is strict
                raise ValueError(
                    "Invalid value for the early_stopping parameter. "
                    f"Value should be >0, got {self.bo_params['early_stopping']}."
                )
            self._early_stopping = self.bo_params["early_stopping"]

        # Add custom dimensions to every model subclass
        if self.bo_params.get("dimensions"):
            for name, model in self._models.items():
                # If not dict, the dimensions are for all models
                if not isinstance(self.bo_params["dimensions"], dict):
                    model._dimensions = self.bo_params["dimensions"]
                else:
                    # Dimensions for every specific model
                    for key, value in self.bo_params["dimensions"].items():
                        # Parameters for this model only
                        if key.lower() == name:
                            model._dimensions = value
                            break

        # Keys consumed above; they must not reach the optimizer
        kwargs = [
            "base_estimator",
            "max_time",
            "delta_x",
            "delta_y",
            "early_stopping",
            "cv",
            "callbacks",
            "dimensions",
            "plot",
        ]

        # The remaining bo_params are added as kwargs to the optimizer
        self._bo_kwargs = {
            k: v
            for k, v in self.bo_params.items() if k not in kwargs
        }

        # Prepare est_params ======================================= >>

        if self.est_params:
            for name, model in self._models.items():
                params = {}
                for key, value in self.est_params.items():
                    # Parameters for this model only
                    if key.lower() == name:
                        params.update(value)
                    # Parameters for all models
                    elif key.lower() not in self._models.keys():
                        params.update({key: value})

                for key, value in params.items():
                    # A "_fit" suffix routes the parameter to the fit method
                    if key.endswith("_fit"):
                        model._est_params_fit[key[:-4]] = value
                    else:
                        model._est_params[key] = value
def bayesian_cnn_exp(gene, mode):
    """Search CNN hyper-parameters with ``gp_minimize`` and evaluate on holdouts.

    For each of ``Config.get("nExternalHoldout")`` iterations the data is
    split into train/test, the train part is split again into internal
    train/validation, the hyper-parameters are searched on the internal
    split, and a model trained on the full train part with the best
    parameters is evaluated on the test part.

    Parameters:
        gene: forwarded to ``import_sequence_dataset("data", gene)``.
        mode: forwarded as ``mode`` to every ``split`` call.

    Returns:
        A dict with the keys ``'losses'``, ``'auprc'`` and ``'auroc'``, each
        holding one value per external holdout taken from positions 0, 1
        and 2 of ``model.evaluate``'s output.
    """
    BOconfig = Config.get("bayesianOpt")
    mlp_parameters_space = [
        conf_to_params(conf)
        for conf in BOconfig["hyperparameters"]
    ]

    @use_named_args(mlp_parameters_space)
    def fitness_mlp(kernel_space_1, units_2, kernel_space_2, dense_1, dense_2):
        # X_train_int, y_train_int, X_val and y_val are read from the
        # enclosing scope; the holdout loop assigns them before each search.
        es = EarlyStopping(monitor='val_loss',
                           patience=Config.get("ESValPatience"),
                           min_delta=Config.get("ESValMinDelta"),
                           baseline=0.2)

        model, hist = train_bayesian_cnn(X_train_int, y_train_int,
                                         (X_val, y_val), es, kernel_space_1,
                                         units_2, kernel_space_2, dense_1,
                                         dense_2)

        # The printed label says "Loss", but the value is the last
        # validation AUPRC; it is negated because gp_minimize minimises.
        val_auprc = hist.history['val_auprc'][-1]
        print()
        print("Validation Loss: {}".format(val_auprc))
        print()
        return -val_auprc

    print()
    print("Importing Epigenetic data...")
    print()
    X, y, features_size = filter_by_tasks(*import_sequence_dataset(
        "data", gene),
                                          Config.get("task"),
                                          perc=Config.get("samplePerc"))

    print("Datasets length: {}, {}".format(len(X), len(y)))
    print("Features sizes: {}".format(features_size))

    metrics = {'losses': [], 'auprc': [], 'auroc': []}
    delta_stopper = DeltaYStopper(n_best=BOconfig["n_best"],
                                  delta=BOconfig["delta"])

    for ext_holdout in range(Config.get("nExternalHoldout")):
        print()
        print("{}/{} EXTERNAL HOLDOUTS".format(ext_holdout,
                                               Config.get("nExternalHoldout")))
        print()
        # NOTE(review): random_state is the constant 42 for every holdout;
        # confirm that split() still yields different partitions per iteration.
        X_train, X_test, y_train, y_test = split(X,
                                                 y,
                                                 random_state=42,
                                                 proportions=None,
                                                 mode=mode)

        # Internal holdouts
        X_train_int, X_val, y_train_int, y_val = split(X_train,
                                                       y_train,
                                                       random_state=42,
                                                       proportions=None,
                                                       mode=mode)

        print("Searching Parameters...")
        print()

        min_res = gp_minimize(func=fitness_mlp,
                              dimensions=mlp_parameters_space,
                              acq_func=BOconfig["acq_function"],
                              callback=[delta_stopper],
                              n_calls=BOconfig["nBayesianOptCall"])

        print()
        print("Training with best parameters found: {}".format(min_res.x))
        print()
        print(X_train)

        es = EarlyStopping(monitor='val_loss',
                           patience=Config.get("ESPatience"),
                           min_delta=Config.get("ESMinDelta"))
        # Pass every tuned hyper-parameter, in the same order the search
        # objective receives them (the last one, dense_2, included).
        model, _ = train_bayesian_cnn(X_train, y_train, None, es, *min_res.x)

        eval_score = model.evaluate(X_test, y_test)
        # NOTE: the Keras session is not cleared between holdouts.

        print("Metrics names: ", model.metrics_names)
        print("Final Scores: ", eval_score)
        metrics['losses'].append(eval_score[0])
        metrics['auprc'].append(eval_score[1])
        metrics['auroc'].append(eval_score[2])

    return metrics
def bayesian_mlp_exp(gene, mode):
    """Search MLP hyper-parameters with ``gp_minimize`` and evaluate on holdouts.

    Every external holdout splits the data into train/test and the train
    part into internal train/validation. The search runs on the internal
    split; a model trained on the full train part with the best parameters
    is then evaluated on the test part.

    ``gene`` is forwarded to ``import_epigenetic_dataset("data", gene)`` and
    ``mode`` to every ``split`` call. The returned dict has the keys
    ``'losses'``, ``'auprc'`` and ``'auroc'``, each with one entry per
    external holdout.
    """
    bo_settings = Config.get("bayesianOpt")

    layer_combinations = get_hidden_layers_combinations(
        bo_settings["hiddenLayers"], 3, bo_settings["allowFirstLevelZero"])
    search_space = get_parameters_space(layer_combinations)

    @use_named_args(search_space)
    def objective(learning_rate, num_hidden_layer, hidden_layer_choice):
        # The training/validation arrays and n_features come from the
        # enclosing scope; they are assigned before each search starts.
        _, fit_history = train_bayesian_mlp(
            inner_X_train, inner_y_train, (inner_X_val, inner_y_val),
            n_features, learning_rate, num_hidden_layer,
            hidden_layer_choice, layer_combinations)

        last_val_auprc = fit_history.history['val_auprc'][-1]
        print()
        print("Validation Loss: {}".format(last_val_auprc))
        print()
        # Negated so that minimising maximises the validation AUPRC.
        return -last_val_auprc

    print()
    print("Importing Epigenetic data...")
    print()
    raw_dataset = import_epigenetic_dataset("data", gene)
    X, y, n_features = filter_by_tasks(*raw_dataset, Config.get("task"),
                                       perc=Config.get("samplePerc"))

    print("Datasets length: {}, {}".format(len(X), len(y)))
    print("Features sizes: {}".format(n_features))

    collected = {'losses': [], 'auprc': [], 'auroc': []}
    stopper = DeltaYStopper(n_best=bo_settings["n_best"], delta=bo_settings["delta"])

    for holdout_idx in range(Config.get("nExternalHoldout")):
        print()
        print("{}/{} EXTERNAL HOLDOUTS".format(holdout_idx, Config.get("nExternalHoldout")))
        print()
        outer_X_train, outer_X_test, outer_y_train, outer_y_test = split(
            X, y, random_state=42, proportions=None, mode=mode)

        # Internal holdouts
        inner_X_train, inner_X_val, inner_y_train, inner_y_val = split(
            outer_X_train, outer_y_train, random_state=42, proportions=None, mode=mode)

        print("Searching Parameters...")
        print()

        search_result = gp_minimize(
            func=objective,
            dimensions=search_space,
            acq_func=bo_settings["acq_function"],
            callback=[stopper],
            n_calls=bo_settings["nBayesianOptCall"],
        )
        best = search_result.x

        print()
        print("Training with best parameters found: {}".format(best))
        print()
        print(outer_X_train)

        # Retrain on the whole external training part, without validation data.
        final_model, _ = train_bayesian_mlp(
            outer_X_train, outer_y_train, None, n_features,
            best[0], best[1], best[2], layer_combinations)

        eval_score = final_model.evaluate(outer_X_test, outer_y_test)

        print("Metrics names: ", final_model.metrics_names)
        print("Final Scores: ", eval_score)
        # Positions 0, 1 and 2 of the evaluation output, in this key order.
        for position, key in enumerate(('losses', 'auprc', 'auroc')):
            collected[key].append(eval_score[position])

    return collected