def test_optuna_callback(self) -> None:
    params = {"verbose": -1}  # type: Dict[str, Any]
    dataset = lgb.Dataset(np.zeros((10, 10)))

    callback_mock = mock.MagicMock()

    study = optuna.create_study()
    tuner = LightGBMTunerCV(params, dataset, study=study, optuna_callbacks=[callback_mock])

    with mock.patch.object(_OptunaObjectiveCV, "_get_cv_scores", return_value=[1.0]):
        tuner._tune_params(["num_leaves"], 10, optuna.samplers.TPESampler(), "num_leaves")

    assert callback_mock.call_count == 10
def test_inconsistent_study_direction(self, metric: str, study_direction: str) -> None:
    params = {}  # type: Dict[str, Any]
    if metric is not None:
        params["metric"] = metric

    train_set = lgb.Dataset(None)
    valid_set = lgb.Dataset(None)
    study = optuna.create_study(direction=study_direction)

    with pytest.raises(ValueError) as excinfo:
        lgb.LightGBMTuner(
            params,
            train_set,
            valid_sets=[train_set, valid_set],
            num_boost_round=5,
            early_stopping_rounds=2,
            study=study,
        )

    assert excinfo.type == ValueError
    assert str(excinfo.value).startswith("Study direction is inconsistent with the metric")
def _learning_race_lgb(self, this_model_name, target):
    # Split the test data into evaluation and validation sets.
    X_eval, X_valid, y_eval, y_valid = train_test_split(
        self.X_test, self.y_test, random_state=42
    )
    X_eval_weight = X_eval["weight"]
    X_eval = X_eval.drop("weight", axis=1)

    # Build the LightGBM datasets.
    lgb_train = lgb.Dataset(
        self.X_train.drop("weight", axis=1),
        self.y_train,
        weight=self.X_train["weight"],
    )  # , categorical_feature=self.categ_columns)
    lgb_eval = lgb.Dataset(
        X_eval, y_eval, reference=lgb_train, weight=X_eval_weight
    )  # , categorical_feature=self.categ_columns)

    if self.test_flag:
        num_boost_round = 5
        early_stopping_rounds = 3
    else:
        num_boost_round = 1000
        early_stopping_rounds = 50

    # Train a model with the parameters above.
    best_params, history = {}, []
    this_param = self.lgbm_params[target]
    model = lgb.train(
        this_param,
        lgb_train,
        valid_sets=lgb_eval,
        verbose_eval=False,
        num_boost_round=num_boost_round,
        early_stopping_rounds=early_stopping_rounds,
        best_params=best_params,
        tuning_history=history,
    )
    print("Best params:", best_params)
    print("Tuning history:", history)
    self._save_learning_model(model, this_model_name)
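# Illustrative sketch (an assumption, not from the original source) of the shape of
# `self.lgbm_params` that `_learning_race_lgb` above indexes by `target`: one LightGBM
# parameter dict per prediction target. The target names here are hypothetical.
example_lgbm_params = {
    "win": {"objective": "binary", "metric": "binary_logloss", "verbosity": -1},
    "place": {"objective": "binary", "metric": "binary_logloss", "verbosity": -1},
}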
def _get_tuner_object(self, params={}, train_set=None, kwargs_options={}, study=None):
    # type: (Dict[str, Any], lgb.Dataset, Dict[str, Any], Optional[Study]) -> lgb.LightGBMTuner

    # Required keyword arguments.
    dummy_dataset = lgb.Dataset(None)

    kwargs = dict(
        num_boost_round=5, early_stopping_rounds=2, valid_sets=dummy_dataset, study=study
    )
    kwargs.update(kwargs_options)

    runner = lgb.LightGBMTuner(params, train_set, **kwargs)
    return runner
def test_tune_best_score_reproducibility(self) -> None:
    california = sklearn.datasets.fetch_california_housing()
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        california.data, california.target, random_state=0
    )

    train = lgb.Dataset(X_trainval, y_trainval)
    valid = lgb.Dataset(X_test, y_test)
    params = {
        "objective": "regression",
        "metric": "rmse",
        "random_seed": 0,
        "deterministic": True,
        "force_col_wise": True,
        "verbosity": -1,
    }

    tuner_first_try = lgb.LightGBMTuner(
        params,
        train,
        valid_sets=valid,
        early_stopping_rounds=3,
        optuna_seed=10,
        callbacks=[log_evaluation(-1)],
    )
    tuner_first_try.run()
    best_score_first_try = tuner_first_try.best_score

    tuner_second_try = lgb.LightGBMTuner(
        params,
        train,
        valid_sets=valid,
        early_stopping_rounds=3,
        optuna_seed=10,
        callbacks=[log_evaluation(-1)],
    )
    tuner_second_try.run()
    best_score_second_try = tuner_second_try.best_score

    assert best_score_second_try == best_score_first_try
def test_tune_num_leaves_negative_max_depth(self) -> None:
    params: Dict[str, Any] = {
        "metric": "binary_logloss",
        "max_depth": -1,
        "verbose": -1,
    }
    X_trn = np.random.uniform(10, size=(10, 5))
    y_trn = np.random.randint(2, size=10)
    train_dataset = lgb.Dataset(X_trn, label=y_trn)
    valid_dataset = lgb.Dataset(X_trn, label=y_trn)

    runner = lgb.LightGBMTuner(
        params,
        train_dataset,
        num_boost_round=3,
        early_stopping_rounds=2,
        valid_sets=valid_dataset,
        callbacks=[log_evaluation(-1)],
    )
    runner.tune_num_leaves()
    assert len(runner.study.trials) == 20
def test_no_eval_set_args(self):
    # type: () -> None

    params = {}  # type: Dict[str, Any]
    train_set = lgb.Dataset(None)
    with pytest.raises(ValueError) as excinfo:
        lgb.LightGBMTuner(params, train_set, num_boost_round=5, early_stopping_rounds=2)

    assert excinfo.type == ValueError
    assert str(excinfo.value) == "`valid_sets` is required."
def test_tune_num_leaves_negative_max_depth(self):
    # type: () -> None

    params = {
        "metric": "binary_logloss",
        "max_depth": -1,
    }  # type: Dict[str, Any]
    X_trn = np.random.uniform(10, size=(10, 5))
    y_trn = np.random.randint(2, size=10)
    train_dataset = lgb.Dataset(X_trn, label=y_trn)
    valid_dataset = lgb.Dataset(X_trn, label=y_trn)

    tuning_history = []  # type: List[Dict[str, float]]
    runner = lgb.LightGBMTuner(
        params,
        train_dataset,
        num_boost_round=3,
        early_stopping_rounds=2,
        valid_sets=valid_dataset,
        tuning_history=tuning_history,
    )
    runner.tune_num_leaves()
    assert len(tuning_history) == 20
def test_deprecated_args(
    self,
    best_params: Optional[Dict[str, Any]],
    tuning_history: Optional[List[Dict[str, Any]]],
) -> None:
    # Required keyword arguments.
    params = {}  # type: Dict[str, Any]
    train_set = lgb.Dataset(None)

    with pytest.warns(DeprecationWarning):
        lgb.LightGBMTuner(
            params,
            train_set,
            valid_sets=[train_set],
            best_params=best_params,
            tuning_history=tuning_history,
        )
def test_call(self):
    # type: () -> None

    target_param_names = ['lambda_l1']
    lgbm_params = {}  # type: Dict[str, Any]
    train_set = lgb.Dataset(None)
    val_set = lgb.Dataset(None)

    lgbm_kwargs = {'valid_sets': val_set}
    best_score = -np.inf

    with turnoff_train():
        objective = OptunaObjective(
            target_param_names,
            lgbm_params,
            train_set,
            lgbm_kwargs,
            best_score,
        )
        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=10)

        assert study.best_value == 0.5
def test_tune_best_score_reproducibility(self) -> None:
    boston = sklearn.datasets.load_boston()
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        boston.data, boston.target, random_state=0
    )

    train = lgb.Dataset(X_trainval, y_trainval)
    valid = lgb.Dataset(X_test, y_test)
    params = {
        "objective": "regression",
        "metric": "rmse",
        "random_seed": 0,
        "deterministic": True,
        "verbosity": -1,
    }

    tuner_first_try = lgb.LightGBMTuner(
        params,
        train,
        valid_sets=valid,
        early_stopping_rounds=3,
        optuna_seed=10,
    )
    tuner_first_try.run()
    best_score_first_try = tuner_first_try.best_score

    tuner_second_try = lgb.LightGBMTuner(
        params,
        train,
        valid_sets=valid,
        early_stopping_rounds=3,
        optuna_seed=10,
    )
    tuner_second_try.run()
    best_score_second_try = tuner_second_try.best_score

    assert best_score_second_try == best_score_first_try
def objective(trial):
    X, y = ember.read_vectorized_features('./sample/merge', 20000, 3154)
    train_x, val_x, train_y, val_y = train_test_split(X, y, test_size=0.4, random_state=777)
    valid_x, test_x, valid_y, test_y = train_test_split(
        val_x, val_y, test_size=0.5, random_state=777
    )

    sc = StandardScaler()
    train_x = sc.fit_transform(train_x)
    valid_x = sc.transform(valid_x)
    test_x = sc.transform(test_x)

    train_data_set = lgb.Dataset(train_x, train_y)
    valid_data_sets = lgb.Dataset(valid_x, valid_y)

    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        # 'num_leaves': 2048,  # total number of leaves per tree; the default is 31
        # 'max_depth': 16,  # maximum tree depth
        # 'min_data_in_leaf': 1000,  # minimum records per leaf; the default of 20 is optimal
        # 'num_iterations': 1000,  # 1000 -> 1500
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    gbm = lgb.train(param, train_data_set, valid_sets=[valid_data_sets], verbose_eval=False)
    pred_y = gbm.predict(test_x)
    y_pred = np.where(np.array(pred_y) > 0.7, 1, 0)
    accuracy = sklearn.metrics.accuracy_score(test_y, y_pred)
    return accuracy
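# A minimal driver sketch for the `objective` above; it is an illustration, not part
# of the original source. Since `objective` returns accuracy, the study should
# maximize it; the `n_trials` value here is an assumption.
import optuna

if __name__ == '__main__':
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50)
    print('Best accuracy:', study.best_value)
    print('Best params:', study.best_params)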
def test_call(self) -> None:
    target_param_names = ["lambda_l1"]
    lgbm_params: Dict[str, Any] = {}
    train_set = lgb.Dataset(None)
    val_set = lgb.Dataset(None)

    lgbm_kwargs = {"valid_sets": val_set}
    best_score = -np.inf

    with turnoff_train():
        objective = _OptunaObjective(
            target_param_names,
            lgbm_params,
            train_set,
            lgbm_kwargs,
            best_score,
            "tune_lambda_l1",
            None,
        )
        study = optuna.create_study(direction="minimize")
        study.optimize(objective, n_trials=10)

        assert study.best_value == 0.5
def test_get_booster_best_score__using_valid_names_as_str(self) -> None:
    expected_value = 1.0

    class DummyBooster(object):
        def __init__(self) -> None:
            self.best_score = {"dev": {"binary_logloss": expected_value}}

    booster = DummyBooster()
    dummy_dataset = lgb.Dataset(None)

    tuner = _BaseTuner(lgbm_kwargs={"valid_names": "dev", "valid_sets": dummy_dataset})
    val_score = tuner._get_booster_best_score(booster)
    assert val_score == expected_value
def test_get_booster_best_score(self) -> None:
    expected_value = 1.0

    class DummyBooster(object):
        def __init__(self) -> None:
            self.best_score = {"valid_0": {"binary_logloss": expected_value}}

    booster = DummyBooster()
    dummy_dataset = lgb.Dataset(None)

    tuner = _BaseTuner(lgbm_kwargs=dict(valid_sets=dummy_dataset))
    val_score = tuner._get_booster_best_score(booster)
    assert val_score == expected_value
def test_run_show_progress_bar(self, show_progress_bar: bool, expected: int) -> None:
    params: Dict = {"verbose": -1}
    dataset = lgb.Dataset(np.zeros((10, 10)))
    study = optuna.create_study()
    tuner = LightGBMTunerCV(
        params, dataset, study=study, time_budget=1, show_progress_bar=show_progress_bar
    )

    with mock.patch.object(
        _OptunaObjectiveCV, "_get_cv_scores", return_value=[1.0]
    ), mock.patch("tqdm.tqdm") as mock_tqdm:
        tuner.run()

    assert mock_tqdm.call_count == expected
def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):
    # Set up the datasets.
    validation = va_x is not None
    lgb_train = optuna_lgb.Dataset(
        tr_x, tr_y, categorical_feature=self.categorical_features, free_raw_data=False
    )
    if validation:
        lgb_eval = optuna_lgb.Dataset(
            va_x,
            va_y,
            reference=lgb_train,
            categorical_feature=self.categorical_features,
            free_raw_data=False,
        )

    # Set up the hyperparameters.
    params = dict(self.params)
    num_round = params.pop('num_boost_round')
    best_params, tuning_history = dict(), list()

    # Training.
    if validation:
        early_stopping_rounds = params.pop('early_stopping_rounds')
        self.model = optuna_lgb.train(
            params,
            lgb_train,
            num_round,
            valid_sets=[lgb_train, lgb_eval],
            verbose_eval=1000,
            early_stopping_rounds=early_stopping_rounds,
            best_params=best_params,
            tuning_history=tuning_history,
        )
    else:
        self.model = optuna_lgb.train(
            params,
            lgb_train,
            num_round,
            valid_sets=[lgb_train],
            verbose_eval=1000,
            best_params=best_params,
            tuning_history=tuning_history,
        )

    print('Best Params:', best_params)
    with open(f'../output/model/{self.run_fold_name}_best_params.json', 'w') as f:
        json.dump(best_params, f, indent=4, separators=(',', ': '))
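# Illustrative sketch (an assumption, not from the original source) of the `self.params`
# dict that `train` above expects: plain LightGBM parameters plus the 'num_boost_round'
# and 'early_stopping_rounds' keys that are popped off before training.
example_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_boost_round': 10000,
    'early_stopping_rounds': 200,
}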
def test_optuna_callback(self) -> None:
    params: Dict[str, Any] = {"verbose": -1}
    dataset = lgb.Dataset(np.zeros((10, 10)))

    callback_mock = mock.MagicMock()

    study = optuna.create_study()
    tuner = LightGBMTuner(
        params, dataset, valid_sets=dataset, study=study, optuna_callbacks=[callback_mock]
    )

    with mock.patch.object(_BaseTuner, "_get_booster_best_score", return_value=1.0):
        tuner._tune_params(["num_leaves"], 10, optuna.samplers.TPESampler(), "num_leaves")

    assert callback_mock.call_count == 10
def _single_train(features, targets, params):
    '''Train a single target column.'''
    trainval = lgb.Dataset(features, targets)
    tuner = lgb.LightGBMTunerCV(
        params,
        trainval,
        verbose_eval=100,
        early_stopping_rounds=100,
        folds=KFold(n_splits=3),
    )
    tuner.run()
    return tuner.best_params, tuner.best_score
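# A minimal, self-contained usage sketch for `_single_train` above; the synthetic data
# and `base_params` below are illustrative assumptions, not from the original source.
import numpy as np

base_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
}
features = np.random.uniform(size=(100, 5))
targets = np.random.uniform(size=100)
best_params, best_score = _single_train(features, targets, base_params)
print('Best score:', best_score)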
def test_sample_train_set(self) -> None:
    sample_size = 3

    X_trn = np.random.uniform(10, size=50).reshape((10, 5))
    y_trn = np.random.randint(2, size=10)
    train_dataset = lgb.Dataset(X_trn, label=y_trn)
    runner = self._get_tuner_object(
        train_set=train_dataset, kwargs_options=dict(sample_size=sample_size)
    )
    runner.sample_train_set()

    # Workaround for mypy.
    if not TYPE_CHECKING:
        runner.train_subset.construct()  # Cannot get label before construct `lgb.Dataset`.
        assert runner.train_subset.get_label().shape[0] == sample_size
def test_resume_run(self) -> None:
    params = {"verbose": -1}  # type: Dict
    dataset = lgb.Dataset(np.zeros((10, 10)))

    study = optuna.create_study()
    tuner = LightGBMTunerCV(params, dataset, study=study)

    with mock.patch.object(OptunaObjectiveCV, "_get_cv_scores", return_value=[1.0]):
        tuner.tune_regularization_factors()

    n_trials = len(study.trials)
    assert n_trials == len(study.trials)

    tuner2 = LightGBMTunerCV(params, dataset, study=study)
    with mock.patch.object(OptunaObjectiveCV, "_get_cv_scores", return_value=[1.0]):
        tuner2.tune_regularization_factors()
    assert n_trials == len(study.trials)
def test_resume_run(self) -> None:
    params: Dict = {"verbose": -1}
    dataset = lgb.Dataset(np.zeros((10, 10)))

    study = optuna.create_study()
    tuner = LightGBMTuner(params, dataset, valid_sets=dataset, study=study)

    with mock.patch.object(_BaseTuner, "_get_booster_best_score", return_value=1.0):
        tuner.tune_regularization_factors()

    n_trials = len(study.trials)
    assert n_trials == len(study.trials)

    tuner2 = LightGBMTuner(params, dataset, valid_sets=dataset, study=study)
    with mock.patch.object(_BaseTuner, "_get_booster_best_score", return_value=1.0):
        tuner2.tune_regularization_factors()
    assert n_trials == len(study.trials)
def test_when_a_step_does_not_improve_best_score(self) -> None:
    params = {}  # type: Dict
    valid_data = np.zeros((10, 10))
    valid_sets = lgb.Dataset(valid_data)

    tuner = LightGBMTuner(params, None, valid_sets=valid_sets)
    assert not tuner.higher_is_better()

    with mock.patch("lightgbm.train"), mock.patch.object(
        _BaseTuner, "_get_booster_best_score", return_value=0.9
    ):
        tuner.tune_feature_fraction()

    assert "feature_fraction" in tuner.best_params
    assert tuner.best_score == 0.9

    # Assume that tuning `num_leaves` doesn't improve the `best_score`.
    with mock.patch("lightgbm.train"), mock.patch.object(
        _BaseTuner, "_get_booster_best_score", return_value=1.1
    ):
        tuner.tune_num_leaves()

    assert "num_leaves" in tuner.best_params
    assert tuner.best_score == 0.9  # The best score should not be updated.
def test_get_booster_best_score__using_valid_names_as_str(self):
    # type: () -> None

    expected_value = 1.0

    class DummyBooster(object):
        def __init__(self):
            # type: () -> None

            self.best_score = {'dev': {'binary_logloss': expected_value}}

    booster = DummyBooster()
    dummy_dataset = lgb.Dataset(None)

    tuner = BaseTuner(lgbm_kwargs={
        'valid_names': 'dev',
        'valid_sets': dummy_dataset,
    })
    val_score = tuner._get_booster_best_score(booster)
    assert val_score == expected_value
def test_run_show_progress_bar(self, show_progress_bar: bool, expected: int) -> None:
    params: Dict = {"verbose": -1}
    dataset = lgb.Dataset(np.zeros((10, 10)))
    study = optuna.create_study()
    tuner = LightGBMTuner(
        params,
        dataset,
        valid_sets=dataset,
        study=study,
        time_budget=1,
        show_progress_bar=show_progress_bar,
    )

    with mock.patch.object(
        _BaseTuner, "_get_booster_best_score", return_value=1.0
    ), mock.patch("tqdm.tqdm") as mock_tqdm:
        tuner.run()

    assert mock_tqdm.call_count == expected
def test_run_verbosity(self, verbosity: int, level: int) -> None:
    # We need to reconstruct our default handler to properly capture stderr.
    optuna.logging._reset_library_root_logger()
    optuna.logging.set_verbosity(optuna.logging.INFO)

    params: Dict = {"verbose": -1}
    dataset = lgb.Dataset(np.zeros((10, 10)))
    study = optuna.create_study()

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        tuner = LightGBMTunerCV(
            params, dataset, study=study, verbosity=verbosity, time_budget=1
        )

    with mock.patch.object(_OptunaObjectiveCV, "_get_cv_scores", return_value=[1.0]):
        tuner.run()

    assert optuna.logging.get_verbosity() == level
    assert tuner.lgbm_params["verbose"] == -1
def test_best_booster_with_model_dir(self) -> None:
    params = {"verbose": -1}  # type: Dict
    dataset = lgb.Dataset(np.zeros((10, 10)))

    study = optuna.create_study()
    with TemporaryDirectory() as tmpdir:
        tuner = LightGBMTuner(
            params, dataset, valid_sets=dataset, study=study, model_dir=tmpdir
        )

        with mock.patch.object(BaseTuner, "_get_booster_best_score", return_value=0.0):
            tuner.tune_regularization_factors()

        best_booster = tuner.get_best_booster()

        tuner2 = LightGBMTuner(
            params, dataset, valid_sets=dataset, study=study, model_dir=tmpdir
        )
        best_booster2 = tuner2.get_best_booster()

        assert best_booster.params == best_booster2.params
def test_create_stepwise_study(self, direction: str, overall_best: int) -> None:
    tuner = LightGBMTuner({}, None, valid_sets=lgb.Dataset(np.zeros((10, 10))))

    def objective(trial: optuna.trial.Trial, value: float) -> float:
        trial.set_system_attr(
            optuna.integration._lightgbm_tuner.optimize._STEP_NAME_KEY,
            "step{:.0f}".format(value),
        )
        return trial.suggest_uniform("x", value, value)

    study = optuna.create_study(direction=direction)
    study_step1 = tuner._create_stepwise_study(study, "step1")

    with pytest.raises(ValueError):
        study_step1.best_trial

    study_step1.optimize(lambda t: objective(t, 1), n_trials=1)

    study_step2 = tuner._create_stepwise_study(study, "step2")

    # `study` has a trial, but `study_step2` has no trials.
    with pytest.raises(ValueError):
        study_step2.best_trial

    study_step2.optimize(lambda t: objective(t, 2), n_trials=2)

    assert len(study_step1.trials) == 1
    assert len(study_step2.trials) == 2
    assert len(study.trials) == 3

    assert study_step1.best_trial.value == 1
    assert study_step2.best_trial.value == 2
    assert study.best_trial.value == overall_best
def test_get_best_booster(self) -> None:
    unexpected_value = 20  # out of scope.

    params: Dict = {"verbose": -1, "lambda_l1": unexpected_value}
    dataset = lgb.Dataset(np.zeros((10, 10)))
    study = optuna.create_study()
    tuner = LightGBMTuner(params, dataset, valid_sets=dataset, study=study)

    with pytest.raises(ValueError):
        tuner.get_best_booster()

    with mock.patch.object(_BaseTuner, "_get_booster_best_score", return_value=0.0):
        tuner.tune_regularization_factors()

    best_booster = tuner.get_best_booster()
    assert best_booster.params["lambda_l1"] != unexpected_value

    tuner2 = LightGBMTuner(params, dataset, valid_sets=dataset, study=study)

    # Resumed study does not have the best booster.
    with pytest.raises(ValueError):
        tuner2.get_best_booster()
def test_get_best_booster_with_error(self) -> None:
    params = {"verbose": -1}  # type: Dict
    dataset = lgb.Dataset(np.zeros((10, 10)))
    study = optuna.create_study()
    tuner = LightGBMTunerCV(
        params, dataset, study=study, model_dir=None, return_cvbooster=True
    )

    # No trial is completed yet.
    with pytest.raises(ValueError):
        tuner.get_best_booster()

    with mock.patch.object(_OptunaObjectiveCV, "_get_cv_scores", return_value=[1.0]):
        tuner.tune_regularization_factors()

    tuner2 = LightGBMTunerCV(
        params, dataset, study=study, model_dir=None, return_cvbooster=True
    )

    # A resumed study does not have the best booster.
    with pytest.raises(ValueError):
        tuner2.get_best_booster()

    with TemporaryDirectory() as tmpdir:
        tuner3 = LightGBMTunerCV(
            params, dataset, study=study, model_dir=tmpdir, return_cvbooster=True
        )

        # The booster was not saved, hence it is not found in the `model_dir`.
        with pytest.raises(ValueError):
            tuner3.get_best_booster()