def __call__(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
    """Track the current run of pruned trials and stop the study when it gets too long.

    The internal counter grows by one when ``trial`` is in the ``PRUNED``
    state and is reset to zero for any other state. Once the counter reaches
    ``self.threshold``, ``study.stop()`` is called.

    Args:
        study: Study the callback is attached to.
        trial: The trial this callback is invoked with.
    """
    was_pruned = trial.state == optuna.trial.TrialState.PRUNED
    # Any non-pruned trial breaks the streak.
    self._consequtive_pruned_count = (
        self._consequtive_pruned_count + 1 if was_pruned else 0
    )

    if self._consequtive_pruned_count >= self.threshold:
        study.stop()
def _log_study_progress_and_save_best_model(
    study: optuna.study.Study, trial: optuna.trial.FrozenTrial
) -> None:
    """Log the outcome of ``trial`` and keep the best model on the study.

    Intended to be used as an Optuna callback. A debug line with the trial's
    value, parameters and duration is logged; if ``trial`` is the study's
    current best trial, ``trial.user_attrs["model"]`` is stored as the study
    user attribute ``"best_model"``.

    Args:
        study: Study the callback is attached to.
        trial: The trial this callback is invoked with.

    Raises:
        KeyError: If ``trial`` is the best trial but has no ``"model"`` user
            attribute.
    """
    value = trial.value
    params = trial.params

    # Either timestamp may be missing; report an unknown duration instead of
    # failing on ``None - datetime``.
    if trial.datetime_start is not None and trial.datetime_complete is not None:
        duration = (trial.datetime_complete - trial.datetime_start).total_seconds()
    else:
        duration = None

    # ``study.best_trial`` raises ValueError while no trial has completed yet
    # (e.g. the first trials were pruned). Look it up once and tolerate that.
    try:
        best_trial_number = study.best_trial.number
    except ValueError:
        best_trial_number = None

    # Log information about this trial
    logger.debug(
        f"Trial {trial.number} finished with value: {value} and parameters: {params}."
        f"Best trial is {best_trial_number}. Iteration took {duration} s"
    )

    # If this trial is the best save the model as a study attribute
    if best_trial_number == trial.number:
        study.set_user_attr(key="best_model", value=trial.user_attrs["model"])
def _get_observation_pairs(
    study: optuna.study.Study,
    param_name: str,
) -> Tuple[List[Optional[float]], List[List[float]]]:
    """Collect (parameter value, objective scores) observations from a study.

    Only trials in the ``COMPLETE`` state are used; pruned trials are not
    supported. For every such trial one entry is appended to each returned
    list:

    * the internal representation of the parameter ``param_name``, or ``None``
      when the trial has no parameter with that name;
    * the trial's objective values, with the sign flipped for every objective
      whose direction is not ``MINIMIZE`` so that all objectives can be
      treated as minimization (as the MOTPE algorithm expects).

    Args:
        study: Study to read the trials from.
        param_name: Name of the parameter to extract.

    Returns:
        A tuple ``(values, scores)`` of two lists with one element per
        complete trial.
    """
    complete_trials = study.get_trials(
        deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,)
    )

    values = []
    scores = []
    for trial in complete_trials:
        if param_name in trial.params:
            internal_value = trial.distributions[param_name].to_internal_repr(
                trial.params[param_name]
            )  # type: Optional[float]
        else:
            internal_value = None

        # Convert all objectives to minimization
        minimized = []
        for direction, objective in zip(study.directions, trial.values):
            if direction == StudyDirection.MINIMIZE:
                minimized.append(cast(float, objective))
            else:
                minimized.append(-cast(float, objective))

        values.append(internal_value)
        scores.append(minimized)

    return values, scores
def _should_log_plots(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> bool:
    """Decide whether plots should be logged for this trial.

    Plots are logged only when all of the following hold:

    * ``self._plots_update_freq`` is not the string ``'never'``;
    * the study has at least one trial in the ``COMPLETE`` state;
    * ``trial._trial_id`` is a multiple of ``self._plots_update_freq``.

    Args:
        study: Study the callback is attached to.
        trial: The trial this callback is invoked with.

    Returns:
        ``True`` if plots should be logged now, ``False`` otherwise.
    """
    # Cheapest check first: no need to touch the study at all when disabled.
    if self._plots_update_freq == 'never':
        return False

    # Only emptiness is tested, so skip the default deep copy of every trial.
    completed_trials = study.get_trials(
        deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,)
    )
    if not completed_trials:
        return False

    return trial._trial_id % self._plots_update_freq == 0
def update_trial_time(study: optuna.study.Study, trial: optuna.trial.FrozenTrial):
    """Refresh the mean trial duration and the estimated number of trials.

    Stores the mean of the study's ``duration`` column (in seconds) on
    ``ml_algo.mean_trial_time`` and sets ``self.estimated_n_trials`` to the
    smaller of ``self.n_trials`` and the number of mean-length trials that
    fit into ``self.timeout``.

    Args:
        study: Optuna study object.
        trial: Optuna trial object (not used).

    """
    durations = study.trials_dataframe()['duration']
    ml_algo.mean_trial_time = durations.mean().total_seconds()

    trials_within_timeout = self.timeout // ml_algo.mean_trial_time
    self.estimated_n_trials = min(self.n_trials, trials_within_timeout)
def update_trial_time(study: optuna.study.Study, trial: optuna.trial.FrozenTrial):
    """Refresh the trial-time estimate and report the trial that was passed in.

    Stores the mean of the study's ``duration`` column (in seconds) on
    ``ml_algo.mean_trial_time``, sets ``self.estimated_n_trials`` to the
    smaller of ``self.n_trials`` and the number of mean-length trials that
    fit into ``self.timeout``, then logs the trial's hyperparameters, value
    and duration.

    Args:
        study: Optuna study object.
        trial: Optuna trial object.

    """
    durations = study.trials_dataframe()["duration"]
    ml_algo.mean_trial_time = durations.mean().total_seconds()

    trials_within_timeout = self.timeout // ml_algo.mean_trial_time
    self.estimated_n_trials = min(self.n_trials, trials_within_timeout)

    logger.info3(
        f"\x1b[1mTrial {len(study.trials)}\x1b[0m with hyperparameters {trial.params} scored {trial.value} in {trial.duration}"
    )
def callback(study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
    """Stop the study when called with a trial whose number is 4 or greater.

    Args:
        study: Study to stop.
        trial: The trial this callback is invoked with.
    """
    stop_from_number = 4
    reached_limit = trial.number >= stop_from_number
    if reached_limit:
        study.stop()
def search_hyperparams_by_optuna(optuna_study: optuna.study.Study,
                                 model: object,
                                 X: np.ndarray,
                                 Y: np.ndarray,
                                 n_trials: int = 100,
                                 iters: int = None,
                                 X_test: np.ndarray = None,
                                 Y_test: np.ndarray = None,
                                 dict_param="auto",
                                 tuning_eval: str = "rmse",
                                 split_params: dict = None,
                                 fit_params: dict = None,
                                 eval_params: dict = None,
                                 n_jobs: int = 1) -> (
                                     pd.DataFrame,
                                     dict,
                                 ):
    """
    Search hyperparameters with optuna.
    Params::
        optuna_study: study on which ``optimize`` is run
        model: model whose hyperparameters are searched
        X: training data
        Y: training labels
        n_trials: number of trials optuna runs
        iters: when given, the first of "n_estimators" / "max_iter" present in
            dict_param is replaced by the constant ["const", iters]
        X_test: test data
        Y_test: test labels
        dict_param: search space, or "auto" to pick a built-in space from the
            model's type. Each value is a list whose first item is the kind of
            parameter, e.g.
            dict_param = {"learning_rate":["category",0.05,0.1,0.2,0.3,0.5],
                          "n_estimators":["int",500,1500],
                          "max_depth":["int",3,10],
                          "gamma":["float",0.,0.3],
                          "min_child_weight":["int",1,20],
                          "subsample":["step", 0.5, 0.9, 0.1],
                          "colsample_bytree":["step", 0.1, 0.9, 0.1],
                          "random_state":["const",1],
                          "n_jobs":["const", -1]}
        tuning_eval: the tuning_eval found in kkpackage.util.features.evalate
        split_params: settings by which split_data_balance splits the data or
            under/over-samples it. If X_test and Y_test are not None they take
            priority as validation data; if they are None, the indices from
            split_data_balance are used for training and validation to run
            cross-validation. None means {"n_splits": 1, "y_type": "cls",
            "weight": "balance", "is_bootstrap": False}.
        fit_params: parameters used when fitting. None means {}.
        eval_params: parameters used when evaluating. None means {}.
        n_jobs: value of the "n_jobs" constant in the built-in search spaces
            that have one; unused when dict_param is given explicitly.
    Return::
        (df_optuna, dict_param_ret): a DataFrame with one row per trial of the
        study (its params plus a "value" column), and a dict made of every
        "const" entry of dict_param updated with the study's best_params.
    """
    # Build dict defaults per call so that they are never shared between calls.
    if split_params is None:
        split_params = {
            "n_splits": 1,
            "y_type": "cls",
            "weight": "balance",
            "is_bootstrap": False
        }
    if fit_params is None:
        fit_params = {}
    if eval_params is None:
        eval_params = {}

    if isinstance(dict_param, str) and dict_param == "auto":
        # Work out up front whether the target is binary or multi-class.
        # Non-classification models are treated as "binary" here.
        bool_class_binary = not (is_classification_model(model)
                                 and np.unique(Y).shape[0] > 2)
        model_type = str(type(model))

        def geometric_grid(n_values):
            # 0.01, 0.02, 0.04, ... (n_values entries)
            return [0.01 * (2**i) for i in range(n_values)]

        if "lightgbm.sklearn.LGBMClassifier" in model_type:
            dict_param = {
                "boosting_type": ["category", "gbdt", "dart"],  # "goss"
                "num_leaves": ["int", 10, 1500],
                "max_depth": ["const", -1],
                "learning_rate": ["const", 0.03],
                "n_estimators": ["const", 5000],
                "subsample_for_bin": ["const", 200000],
                # Change as needed: "binary" or "multiclass"
                "objective": [
                    "const",
                    ("binary" if bool_class_binary else "multiclass")
                ],
                "class_weight": ["const", "balanced"],
                "min_child_weight": ["category"] + geometric_grid(23),
                "min_child_samples": ["int", 1, 1000],
                "subsample": ["step", 0.01, 0.99, 0.01],
                "colsample_bytree": ["step", 0.001, 0.99, 0.001],
                "reg_alpha": ["category", 0] + geometric_grid(23),
                "reg_lambda": ["category", 0] + geometric_grid(23),
                "random_state": ["const", 1],
                "n_jobs": ["const", n_jobs]
            }
        elif "LGBMRegressor" in model_type:
            dict_param = {
                "boosting_type": ["const", "gbdt"],
                "num_leaves": ["int", 10, 1000],
                "max_depth": ["const", -1],
                "learning_rate": ["const", 0.3],
                "n_estimators": ["const", 3000],
                "subsample_for_bin": ["const", 200000],
                "objective": ["const", "regression"],
                "class_weight": ["const", None],
                "min_child_weight": ["float", 0, 100],
                "min_child_samples": ["int", 1, 100],
                "subsample": ["step", 0.01, 0.99, 0.01],
                "colsample_bytree": ["step", 0.01, 0.99, 0.01],
                "reg_lambda": ["float", 0, 100],
                "random_state": ["const", 1],
                "n_jobs": ["const", n_jobs]
            }
        elif "xgboost.sklearn.XGBClassifier" in model_type:
            dict_param = {
                "n_estimators": ["const", 3000],
                "max_depth": ["int", 3, 15],
                "learning_rate": ["const", 0.1],
                "objective": [
                    "const",
                    ("binary:logistic"
                     if bool_class_binary else "multi:softmax")
                ],
                "booster": ["category", "gbtree", "dart"],
                "n_jobs": ["const", n_jobs],
                "gamma": ["category", 0] + geometric_grid(16),
                "min_child_weight": ["category", 0] + geometric_grid(16),
                "max_delta_step": ["const", 0],
                "subsample": ["step", 0.01, 0.99, 0.01],
                "colsample_bytree": ["step", 0.01, 0.99, 0.01],
                "colsample_bylevel": ["const", 1],
                "colsample_bynode": ["const", 1],
                "reg_alpha": ["category", 0] + geometric_grid(16),
                "reg_lambda": ["category", 0] + geometric_grid(16),
                "random_state": ["const", 1]
            }
        elif "HistGradientBoostingClassifier" in model_type:
            dict_param = {
                "loss": [
                    "category", "binary_crossentropy",
                    "categorical_crossentropy"
                ],
                "learning_rate": ["category", 0.05, 0.1, 0.2, 0.3, 0.5],
                "max_iter": ["int", 100, 3000],
                "max_leaf_nodes": ["int", 10, 1000],
                "max_depth": ["const", None],
                "min_samples_leaf": ["int", 1, 100],
                "max_bins": ["const", 256],
                "tol": ["float", 1e-8, 1e-1],
                "random_state": ["const", 1]
            }
        elif "RandomForestClassifier" in model_type:
            dict_param = {
                "n_estimators": ["int", 100, 3000],
                "criterion": ["category", "gini", "entropy"],
                "max_depth": ["int", 2, 10],
                # NOTE(review): scikit-learn rejects an integer
                # min_samples_split below 2 - confirm the lower bound of 1.
                "min_samples_split": ["int", 1, 100],
                "min_samples_leaf": ["int", 1, 100],
                "max_features": [
                    "category", "auto", "log2", None, 0.01, 0.02, 0.05, 0.1,
                    0.2, 0.5
                ],
                "max_leaf_nodes": ["const", None],
                "bootstrap": ["const", True],
                "random_state": ["const", 1],
                "n_jobs": ["const", n_jobs]
            }
        elif "CatBoostClassifier" in model_type:
            dict_param = {
                "objective": ["const", "MultiClass"],
                "custom_metric": ["const", "MultiClass"],
                "eval_metric": ["const", "MultiClass"],
                "n_estimators": ["const", 5000],
                "learning_rate": ["const", 0.05],
                "random_seed": ["const", 1],
                "reg_lambda": ["float", 0, 100],
                "max_depth": ["int", 2, 8],
                "nan_mode": ["const", "Min"],
                "task_type": ["const", "GPU"],
                "devices": ["const", '0']
            }
        else:
            logger.raise_error(
                f"not supported model !! \nmodel:{str(type(model))}")

    # Pin the number of boosting rounds / iterations when requested. Only the
    # first matching key is overridden.
    if iters is not None:
        for name in ["n_estimators", "max_iter"]:
            if dict_param.get(name) is not None:
                dict_param[name] = ["const", iters]
                break

    # Bind everything except the trial itself into the objective.
    objective = partial(optuna_base_function, X, Y, X_test, Y_test, model,
                        dict_param, tuning_eval, split_params, fit_params,
                        eval_params)

    # Run the hyperparameter search.
    optuna_study.optimize(objective, n_trials=n_trials)

    # Collect the results: one row per trial, built in a single constructor
    # call. DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every iteration. Column dtypes are inferred by the constructor.
    records = [{
        **i_trial.params, "value": i_trial.value
    } for i_trial in optuna_study.trials]
    df_optuna = pd.DataFrame(records)

    # Assemble the returned parameters: constants first, then the best
    # sampled values (which win on a name clash).
    dict_param_ret = {
        key: val[-1]
        for key, val in dict_param.items() if val[0] == "const"
    }
    dict_param_ret.update(optuna_study.best_params)

    logger.info("END")
    return df_optuna, dict_param_ret