def __call__(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
        if trial.state == optuna.trial.TrialState.PRUNED:
            self._consequtive_pruned_count += 1
        else:
            self._consequtive_pruned_count = 0

        if self._consequtive_pruned_count >= self.threshold:
            study.stop()
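
A minimal, self-contained sketch of how such a stopping callback might be wired into study.optimize (the class name, objective, and threshold below are illustrative, not from the original source):

import optuna


class StopWhenTrialKeepBeingPruned:
    """Stop the study after `threshold` consecutive pruned trials (illustrative)."""

    def __init__(self, threshold: int):
        self.threshold = threshold
        self._consequtive_pruned_count = 0

    def __call__(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
        if trial.state == optuna.trial.TrialState.PRUNED:
            self._consequtive_pruned_count += 1
        else:
            self._consequtive_pruned_count = 0
        if self._consequtive_pruned_count >= self.threshold:
            study.stop()


def objective(trial: optuna.trial.Trial) -> float:
    x = trial.suggest_float("x", -10.0, 10.0)
    if x < 0:
        raise optuna.TrialPruned()  # simulate a pruned trial
    return x ** 2


study = optuna.create_study()
study.optimize(objective, n_trials=50,
               callbacks=[StopWhenTrialKeepBeingPruned(threshold=2)])
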
def _log_study_progress_and_save_best_model(
        study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
    # Collect study and trial data
    # trial_index = study.trials.index(trial)
    # best_trial_index = study.trials.index(study.best_trial)
    value = trial.value
    params = trial.params
    duration = (trial.datetime_complete - trial.datetime_start).total_seconds()
    # Log information about this trial
    logger.debug(
        f"Trial {trial.number} finished with value: {value} and parameters: {params}. "
        f"Best trial is {study.best_trial.number}. Iteration took {duration} s"
    )
    # If this trial is the best save the model as a study attribute
    if study.best_trial.number == trial.number:
        study.set_user_attr(key="best_model", value=trial.user_attrs["model"])
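
The callback above expects the objective to have stored its fitted model on the trial; a minimal sketch of that pairing, assuming the helper above and a configured logger are in scope (the objective and the "model" object are placeholders):

import logging

import optuna

logger = logging.getLogger(__name__)


def objective(trial: optuna.trial.Trial) -> float:
    alpha = trial.suggest_float("alpha", 1e-4, 1.0, log=True)
    model = {"alpha": alpha}  # stand-in for a fitted estimator
    trial.set_user_attr("model", model)  # read back in the callback as trial.user_attrs["model"]
    return alpha


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5,
               callbacks=[_log_study_progress_and_save_best_model])
best_model = study.user_attrs["best_model"]  # set by the callback for the best trial
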
def _get_observation_pairs(
    study: optuna.study.Study,
    param_name: str,
) -> Tuple[List[Optional[float]], List[List[float]]]:
    """Get observation pairs from the study.

    This function collects observation pairs from the complete trials of the study.
    Pruning is currently not supported.
    The values for trials that don't contain the parameter named ``param_name`` are set to None.

    Objective values are negated if their directions are maximization and all objectives are
    treated as minimization in the MOTPE algorithm.
    """

    trials = study.get_trials(deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,))
    values = []
    scores = []
    for trial in trials:
        param_value = None  # type: Optional[float]
        if param_name in trial.params:
            distribution = trial.distributions[param_name]
            param_value = distribution.to_internal_repr(trial.params[param_name])

        # Convert all objectives to minimization
        score = [
            cast(float, v) if d == StudyDirection.MINIMIZE else -cast(float, v)
            for d, v in zip(study.directions, trial.values)
        ]

        values.append(param_value)
        scores.append(score)

    return values, scores
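
A small usage sketch of the helper above, assuming it and its imports (Tuple, List, Optional, cast from typing and StudyDirection from optuna.study) are in scope; it shows the sign flip applied to a maximized objective:

import optuna


def objective(trial: optuna.trial.Trial) -> float:
    return trial.suggest_float("x", 0.0, 1.0)


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=3)

values, scores = _get_observation_pairs(study, "x")
# values: internal representation of "x" for each completed trial (None if the trial lacks it)
# scores: objective values negated, because maximization is converted to minimization
print(values, scores)
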
def _should_log_plots(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> bool:
    # Never log before the first completed trial, and never when logging is disabled.
    if not len(study.get_trials(states=(optuna.trial.TrialState.COMPLETE,))):
        return False
    if self._plots_update_freq == 'never':
        return False
    # Otherwise log every `_plots_update_freq`-th trial.
    return trial._trial_id % self._plots_update_freq == 0
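
_should_log_plots is written as a method, so its owner must carry _plots_update_freq; a minimal sketch of such a callback class, assuming the helper above is in scope (the class name and the plotting/output choices are illustrative):

import optuna


class PlotLoggingCallback:
    def __init__(self, plots_update_freq='never'):
        # 'never', or an integer period: log plots every N-th trial
        self._plots_update_freq = plots_update_freq

    def __call__(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
        # Delegate to the module-level helper defined above.
        if _should_log_plots(self, study, trial):
            # plot_optimization_history is part of optuna.visualization (requires plotly).
            fig = optuna.visualization.plot_optimization_history(study)
            fig.write_html(f"history_trial_{trial.number}.html")
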
Example #5
        def update_trial_time(study: optuna.study.Study,
                              trial: optuna.trial.FrozenTrial):
            """Callback for number of iteration with time cut-off.

            Args:
                study: optuna study object.
                trial: optuna trial object.
            """
            ml_algo.mean_trial_time = study.trials_dataframe()['duration'].mean().total_seconds()
            self.estimated_n_trials = min(
                self.n_trials, self.timeout // ml_algo.mean_trial_time)
Example #6
        def update_trial_time(study: optuna.study.Study, trial: optuna.trial.FrozenTrial):
            """Callback for number of iteration with time cut-off.

            Args:
                study: Optuna study object.
                trial: Optuna trial object.

            """
            ml_algo.mean_trial_time = study.trials_dataframe()["duration"].mean().total_seconds()
            self.estimated_n_trials = min(self.n_trials, self.timeout // ml_algo.mean_trial_time)

            logger.info3(
                f"\x1b[1mTrial {len(study.trials)}\x1b[0m with hyperparameters {trial.params} scored {trial.value} in {trial.duration}"
            )
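
Both versions of update_trial_time close over self and ml_algo from LightAutoML's tuner; a standalone sketch of the same idea (all names below are illustrative), shrinking the planned trial count to fit a wall-clock budget based on the observed mean trial duration:

import optuna


class TrialTimeBudget:
    def __init__(self, n_trials: int, timeout: float):
        self.n_trials = n_trials
        self.timeout = timeout
        self.estimated_n_trials = n_trials

    def __call__(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
        # Mean wall-clock time per finished trial so far.
        mean_trial_time = study.trials_dataframe()["duration"].mean().total_seconds()
        # How many trials fit into the remaining budget, capped at the requested count.
        self.estimated_n_trials = min(self.n_trials, int(self.timeout // mean_trial_time))
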
Example #7
def callback(study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
    if trial.number >= 4:
        study.stop()
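
A minimal usage sketch for this callback (the objective is a placeholder): even with n_trials=100, the study halts once trial number 4 completes, because the callback requests study.stop():

import optuna


def objective(trial: optuna.trial.Trial) -> float:
    return trial.suggest_float("x", -1.0, 1.0) ** 2


study = optuna.create_study()
study.optimize(objective, n_trials=100, callbacks=[callback])
print(len(study.trials))  # 5: trials 0..4 ran, then the stop request took effect
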
Example #8
def search_hyperparams_by_optuna(optuna_study: optuna.study.Study,
                                 model: object,
                                 X: np.ndarray,
                                 Y: np.ndarray,
                                 n_trials: int = 100,
                                 iters: Optional[int] = None,
                                 X_test: Optional[np.ndarray] = None,
                                 Y_test: Optional[np.ndarray] = None,
                                 dict_param="auto",
                                 tuning_eval: str = "rmse",
                                 split_params: dict = {
                                     "n_splits": 1,
                                     "y_type": "cls",
                                     "weight": "balance",
                                     "is_bootstrap": False
                                 },
                                 fit_params: dict = {},
                                 eval_params: dict = {},
                                 n_jobs: int = 1) -> Tuple[pd.DataFrame, dict]:
    """
    optuna でパラメータ探索
    Params::
        list_ouptut: 途中のデータも見たいので引数で空リスト
        n_trials: optuna で探索する回数
        iter: 木の深さ固定などで探索したい場合は指定する
        X: 訓練データ
        Y: 訓練正解ラベル
        X_test: テストデータ
        Y_test: テスト正解ラベル
        model: ハイパーパラメータサーチしたいモデル
        dict_param = {"learning_rate":["category",0.05,0.1,0.2,0.3,0.5], "n_estimators":["int",500,1500], "max_depth":["int",3,10], 
                    "gamma":["float",0.,0.3], "min_child_weight":["int",1,20], "subsample":["step", 0.5, 0.9, 0.1], 
                    "colsample_bytree":["step", 0.1, 0.9, 0.1], "random_state":["const",1], "n_jobs":["const", -1]}
        tuning_eval: kkpackage.util.features.evalate にあるtuning_eval
        split_params:
            この設定に従ってsplit_data_balanceでデータを分割もしくはアンダーオーバーサンプリングする。
            X_test, Y_test がNoneでなければ検証データはこちらが優先されるが、Noneの場合
            split_data_balanceのindexを訓練と検証に使用して交差検証を行う
        fit_params: fit 時のパラメータ
        eval_params: 評価時のパラメータ
    """
    if isinstance(dict_param, str) and dict_param == "auto":
        ## Determine automatically whether the target is binary or multiclass
        bool_class_binary = True
        if is_classification_model(model):
            bool_class_binary = np.unique(Y).shape[0] <= 2

        if str(type(model)).find("lightgbm.sklearn.LGBMClassifier") >= 0:
            dict_param = {
                "boosting_type": ["category", "gbdt", "dart"],  #"goss"
                "num_leaves": ["int", 10, 1500],
                "max_depth": ["const", -1],
                "learning_rate": ["const", 0.03],
                "n_estimators": ["const", 5000],
                "subsample_for_bin": ["const", 200000],
                ## Change to 'binary' or 'multiclass' as needed
                "objective": [
                    "const",
                    ("binary" if bool_class_binary else "multiclass")
                ],
                "class_weight": ["const", "balanced"],
                "min_child_weight":
                ["category"] + [0.01 * (2**i) for i in range(23)],
                "min_child_samples": ["int", 1, 1000],
                "subsample": ["step", 0.01, 0.99, 0.01],
                "colsample_bytree": ["step", 0.001, 0.99, 0.001],
                "reg_alpha":
                ["category", 0] + [0.01 * (2**i) for i in range(23)],
                "reg_lambda":
                ["category", 0] + [0.01 * (2**i) for i in range(23)],
                "random_state": ["const", 1],
                "n_jobs": ["const", n_jobs]
            }
        elif str(type(model)).find("LGBMRegressor") >= 0:
            dict_param = {
                "boosting_type": ["const", "gbdt"],
                "num_leaves": ["int", 10, 1000],
                "max_depth": ["const", -1],
                "learning_rate": ["const", 0.3],
                "n_estimators": ["const", 3000],
                "subsample_for_bin": ["const", 200000],
                "objective": ["const", "regression"],
                "class_weight": ["const", None],
                "min_child_weight": ["float", 0, 100],
                "min_child_samples": ["int", 1, 100],
                "subsample": ["step", 0.01, 0.99, 0.01],
                "colsample_bytree": ["step", 0.01, 0.99, 0.01],
                "reg_lambda": ["float", 0, 100],
                "random_state": ["const", 1],
                "n_jobs": ["const", n_jobs]
            }
        elif str(type(model)).find("xgboost.sklearn.XGBClassifier") >= 0:
            dict_param = {
                "n_estimators": ["const", 3000],
                "max_depth": ["int", 3, 15],
                "learning_rate": ["const", 0.1],
                "objective": [
                    "const",
                    ("binary:logistic"
                     if bool_class_binary == True else "multi:softmax")
                ],
                "booster": ["category", "gbtree", "dart"],
                "n_jobs": ["const", n_jobs],
                "gamma": ["category", 0] + [0.01 * (2**i) for i in range(16)],
                "min_child_weight":
                ["category", 0] + [0.01 * (2**i) for i in range(16)],
                "max_delta_step": ["const", 0],
                "subsample": ["step", 0.01, 0.99, 0.01],
                "colsample_bytree": ["step", 0.01, 0.99, 0.01],
                "colsample_bylevel": ["const", 1],
                "colsample_bynode": ["const", 1],
                "reg_alpha":
                ["category", 0] + [0.01 * (2**i) for i in range(16)],
                "reg_lambda":
                ["category", 0] + [0.01 * (2**i) for i in range(16)],
                "random_state": ["const", 1]
            }
        elif str(type(model)).find("HistGradientBoostingClassifier") >= 0:
            dict_param = {
                "loss": [
                    "category", "binary_crossentropy",
                    "categorical_crossentropy"
                ],
                "learning_rate": ["category", 0.05, 0.1, 0.2, 0.3, 0.5],
                "max_iter": ["int", 100, 3000],
                "max_leaf_nodes": ["int", 10, 1000],
                "max_depth": ["const", None],
                "min_samples_leaf": ["int", 1, 100],
                "max_bins": ["const", 256],
                "tol": ["float", 1e-8, 1e-1],
                "random_state": ["const", 1]
            }
        elif str(type(model)).find("RandomForestClassifier") >= 0:
            dict_param = {
                "n_estimators": ["int", 100, 3000],
                "criterion": ["category", "gini", "entropy"],
                "max_depth": ["int", 2, 10],
                "min_samples_split": ["int", 1, 100],
                "min_samples_leaf": ["int", 1, 100],
                "max_features": [
                    "category", "auto", "log2", None, 0.01, 0.02, 0.05, 0.1,
                    0.2, 0.5
                ],
                "max_leaf_nodes": ["const", None],
                "bootstrap": ["const", True],
                "random_state": ["const", 1],
                "n_jobs": ["const", n_jobs]
            }
        elif str(type(model)).find("CatBoostClassifier") >= 0:
            dict_param = {
                "objective": ["const", "MultiClass"],
                "custom_metric": ["const", "MultiClass"],
                "eval_metric": ["const", "MultiClass"],
                "n_estimators": ["const", 5000],
                "learning_rate": ["const", 0.05],
                "random_seed": ["const", 1],
                "reg_lambda": ["float", 0, 100],
                #"subsample"           :["step", 0.1, 1.0, 0.1],
                "max_depth": ["int", 2, 8],
                #"min_child_samples"   :["int", 1, 100],
                "nan_mode": ["const", "Min"],
                "task_type": ["const", "GPU"],
                "devices": ["const", '0']
            }
        else:
            logger.raise_error(
                f"model is not supported !! model: {str(type(model))}")
        # Fix the number of iterations if requested
        if iters is not None:
            for x in ["n_estimators", "max_iter"]:
                if dict_param.get(x) is not None:
                    dict_param[x] = ["const", iters]
                    break
    # Search while varying the amount of data
    df_optuna = pd.DataFrame()
    # Bind the fixed arguments into the objective function
    f = partial(optuna_base_function, X, Y, X_test, Y_test, model, dict_param,
                tuning_eval, split_params, fit_params, eval_params)
    # Run the hyperparameter search
    optuna_study.optimize(f, n_trials=n_trials)

    # Store the result of every trial
    for i_trial in optuna_study.trials:
        sewk = pd.Series(i_trial.params)
        sewk["value"] = i_trial.value
        df_optuna = pd.concat([df_optuna, sewk.to_frame().T], ignore_index=True)
    # Build the final parameter dict: constants first, then the tuned values
    dict_param_ret = {}
    for key, val in dict_param.items():
        if val[0] == "const":
            dict_param_ret[key] = val[-1]
    for key, val in optuna_study.best_params.items():
        dict_param_ret[key] = val

    logger.info("END")
    return df_optuna, dict_param_ret
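
An illustrative call of search_hyperparams_by_optuna, assuming the kkpackage helpers it relies on (optuna_base_function, split_data_balance, is_classification_model and the custom logger) are importable; the data and settings below are placeholders:

import numpy as np
import optuna
from lightgbm import LGBMClassifier

# Toy data for illustration only.
X = np.random.rand(200, 10)
Y = (X[:, 0] > 0.5).astype(int)

study = optuna.create_study(direction="minimize")
df_optuna, best_params = search_hyperparams_by_optuna(
    study, LGBMClassifier(), X, Y,
    n_trials=20, n_jobs=2,
)
# df_optuna holds one row per trial (sampled params plus the objective value);
# best_params merges the "const" entries with the best trial's sampled values.
model = LGBMClassifier(**best_params)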