def train(self, f_in, f_mod=None, init_model=None, handlers=None):
    (atstart, atiter, atfinish) = handlers if handlers else (None, None, None)
    (xs, ys) = trains.load(f_in)
    dtrain = lgb.Dataset(xs, label=ys, free_raw_data=(init_model is None))
    # dtrain.construct()
    pos = sum(ys)
    neg = len(ys) - pos
    self.stats["train.counts"] = (len(ys), int(pos), int(neg))
    # re-weight the positive class according to the observed class imbalance
    self.params["scale_pos_weight"] = (neg / pos)
    # self.params["is_unbalance"] = True
    callbacks = [lambda _: atiter(), lgb.log_evaluation(1)] if atiter else None
    if atstart:
        atstart()
    # eta = self.params["learning_rate"]
    bst = lgb.train(
        self.params,
        dtrain,
        valid_sets=[dtrain],
        init_model=init_model,
        callbacks=callbacks,
        # learning_rates=lambda iter: 0.1 * (0.95 ** iter),
    )
    if atfinish:
        atfinish()
    if f_mod:
        bst.save_model(f_mod)
    bst.free_dataset()
    bst.free_network()
    return bst
def test_best_booster_with_model_dir(self) -> None:
    params: Dict = {"verbose": -1}
    dataset = lgb.Dataset(np.zeros((10, 10)))
    study = optuna.create_study()
    with TemporaryDirectory() as tmpdir:
        tuner = LightGBMTuner(
            params,
            dataset,
            valid_sets=dataset,
            study=study,
            model_dir=tmpdir,
            callbacks=[log_evaluation(-1)],
        )
        with mock.patch.object(_BaseTuner, "_get_booster_best_score", return_value=0.0):
            tuner.tune_regularization_factors()
        best_booster = tuner.get_best_booster()
        tuner2 = LightGBMTuner(params, dataset, valid_sets=dataset, study=study, model_dir=tmpdir)
        best_booster2 = tuner2.get_best_booster()
        assert best_booster.params == best_booster2.params
def finetune(self, dataset: DatasetH, num_boost_round=10, verbose_eval=20):
    """
    finetune model

    Parameters
    ----------
    dataset : DatasetH
        dataset for finetuning
    num_boost_round : int
        number of rounds to finetune the model
    verbose_eval : int
        verbose level
    """
    # Continue training from the existing model for a few more rounds.
    dtrain, _ = self._prepare_data(dataset)
    verbose_eval_callback = lgb.log_evaluation(period=verbose_eval)
    self.model = lgb.train(
        self.params,
        dtrain,
        num_boost_round=num_boost_round,
        init_model=self.model,
        valid_sets=[dtrain],
        valid_names=["train"],
        callbacks=[verbose_eval_callback],
    )
def model(params, dtrain, testd, f_mod, barmsg="lgb"):
    f_log = f_mod + ".log"
    if barmsg:
        ProgressBar.file = None
        bar = ProgressBar(barmsg, max=params["num_round"])
    else:
        bar = None
    logger.debug("- building model %s" % f_mod)
    redir = redirect.start(f_log, bar)
    try:
        if bar:
            bar.start()
        begin = time.time()
        bst = lgb.train(
            params,
            dtrain,
            valid_sets=[dtrain],
            callbacks=[lgb.log_evaluation(1)] + ([lambda _: bar.next()] if bar else []),
        )
        end = time.time()
        bst.save_model(f_mod)
        if bar:
            bar.finish()
            bar.file.flush()
        (xs0, ys0) = testd
        acc = accuracy(bst, xs0, ys0)
        bst.free_dataset()
        bst.free_network()
    except Exception as e:
        redirect.finish(*redir)
        raise e
    redirect.finish(*redir)
    score = POS_ACC_WEIGHT * acc[1] + acc[2]
    return (score, acc, end - begin)
def test_get_best_booster(self) -> None:
    unexpected_value = 20  # out of scope.
    params: Dict = {"verbose": -1, "lambda_l1": unexpected_value}
    dataset = lgb.Dataset(np.zeros((10, 10)))
    study = optuna.create_study()
    tuner = LightGBMTuner(
        params, dataset, valid_sets=dataset, study=study, callbacks=[log_evaluation(-1)]
    )
    with pytest.raises(ValueError):
        tuner.get_best_booster()
    with mock.patch.object(_BaseTuner, "_get_booster_best_score", return_value=0.0):
        tuner.tune_regularization_factors()
    best_booster = tuner.get_best_booster()
    assert best_booster.params["lambda_l1"] != unexpected_value
    tuner2 = LightGBMTuner(params, dataset, valid_sets=dataset, study=study)
    # Resumed study does not have the best booster.
    with pytest.raises(ValueError):
        tuner2.get_best_booster()
def test_run_verbosity(self, verbosity: int, level: int) -> None:
    # We need to reconstruct our default handler to properly capture stderr.
    optuna.logging._reset_library_root_logger()
    optuna.logging.set_verbosity(optuna.logging.INFO)
    params: Dict = {"verbose": -1}
    dataset = lgb.Dataset(np.zeros((10, 10)))
    study = optuna.create_study()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        tuner = LightGBMTuner(
            params,
            dataset,
            valid_sets=dataset,
            study=study,
            verbosity=verbosity,
            callbacks=[log_evaluation(-1)],
            time_budget=1,
        )
    with mock.patch.object(_BaseTuner, "_get_booster_best_score", return_value=1.0):
        tuner.run()
    assert optuna.logging.get_verbosity() == level
    assert tuner.lgbm_params["verbose"] == -1
def test_resume_run(self) -> None:
    params: Dict = {"verbose": -1}
    dataset = lgb.Dataset(np.zeros((10, 10)))
    study = optuna.create_study()
    tuner = LightGBMTuner(
        params, dataset, valid_sets=dataset, study=study, callbacks=[log_evaluation(-1)]
    )
    with mock.patch.object(_BaseTuner, "_get_booster_best_score", return_value=1.0):
        tuner.tune_regularization_factors()
    n_trials = len(study.trials)
    assert n_trials == len(study.trials)
    tuner2 = LightGBMTuner(params, dataset, valid_sets=dataset, study=study)
    with mock.patch.object(_BaseTuner, "_get_booster_best_score", return_value=1.0):
        tuner2.tune_regularization_factors()
    assert n_trials == len(study.trials)
def finetune(self, dataset: DatasetH, num_boost_round=10, verbose_eval=20, reweighter=None):
    """
    finetune model

    Parameters
    ----------
    dataset : DatasetH
        dataset for finetuning
    num_boost_round : int
        number of rounds to finetune the model
    verbose_eval : int
        verbose level
    """
    # Continue training from the existing model for a few more rounds.
    dtrain, _ = self._prepare_data(dataset, reweighter)  # pylint: disable=W0632
    if dtrain.empty:
        raise ValueError("Empty data from dataset, please check your dataset config.")
    verbose_eval_callback = lgb.log_evaluation(period=verbose_eval)
    self.model = lgb.train(
        self.params,
        dtrain,
        num_boost_round=num_boost_round,
        init_model=self.model,
        valid_sets=[dtrain],
        valid_names=["train"],
        callbacks=[verbose_eval_callback],
    )
def fit(
    self,
    dataset: DatasetH,
    num_boost_round=1000,
    early_stopping_rounds=50,
    verbose_eval=20,
    evals_result=None,
):
    if evals_result is None:
        evals_result = dict()
    dtrain, dvalid = self._prepare_data(dataset)
    early_stopping_callback = lgb.early_stopping(early_stopping_rounds)
    verbose_eval_callback = lgb.log_evaluation(period=verbose_eval)
    evals_result_callback = lgb.record_evaluation(evals_result)
    self.model = lgb.train(
        self.params,
        dtrain,
        num_boost_round=num_boost_round,
        valid_sets=[dtrain, dvalid],
        valid_names=["train", "valid"],
        callbacks=[early_stopping_callback, verbose_eval_callback, evals_result_callback],
    )
    evals_result["train"] = list(evals_result["train"].values())[0]
    evals_result["valid"] = list(evals_result["valid"].values())[0]
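For reference, a minimal standalone sketch of the same callback-based pattern with plain lgb.train, outside the model class above. The synthetic data and parameter values (X, y, dtrain, dvalid, the number of rounds) are illustrative assumptions, not part of the original code; the point is only that lgb.early_stopping, lgb.log_evaluation, and lgb.record_evaluation replace the older early_stopping_rounds, verbose_eval, and evals_result keyword arguments.

# Hedged sketch: synthetic data and parameter values are assumptions.
import lightgbm as lgb
import numpy as np

rng = np.random.default_rng(0)
X, y = rng.random((100, 5)), rng.random(100)
dtrain = lgb.Dataset(X, label=y)
dvalid = lgb.Dataset(rng.random((50, 5)), label=rng.random(50), reference=dtrain)

evals_result = {}
booster = lgb.train(
    {"objective": "regression", "metric": "l2", "verbosity": -1},
    dtrain,
    num_boost_round=100,
    valid_sets=[dtrain, dvalid],
    valid_names=["train", "valid"],
    callbacks=[
        lgb.early_stopping(stopping_rounds=20),   # instead of early_stopping_rounds=20
        lgb.log_evaluation(period=10),            # instead of verbose_eval=10
        lgb.record_evaluation(evals_result),      # instead of evals_result=evals_result
    ],
)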
def _train(
    self,
    params: Dict[str, Any],
    lgb_train: lgb.Dataset,
    eval_sets: List[lgb.Dataset],
    eval_names: List[str],
) -> lgb.Booster:
    """Trains a LightGBM model.

    Args:
        params: parameters for LightGBM
        lgb_train: LightGBM dataset for training
        eval_sets: LightGBM datasets for evaluation
        eval_names: names of the evaluation datasets

    Returns:
        LightGBM Booster model
    """
    gbm = lgb.train(
        params,
        lgb_train,
        num_boost_round=self.num_boost_round,
        valid_sets=eval_sets,
        valid_names=eval_names,
        feature_name=list(self.model.input_features.keys()),
        # NOTE: hummingbird does not support categorical features
        # categorical_feature=categorical_features,
        callbacks=[
            lgb.early_stopping(stopping_rounds=self.early_stop),
            lgb.log_evaluation(),
        ],
    )
    return gbm
def test_log_evaluation_callback_is_picklable(serializer):
    periods = 42
    callback = lgb.log_evaluation(period=periods)
    callback_from_disk = pickle_and_unpickle_object(obj=callback, serializer=serializer)
    assert callback_from_disk.order == 10
    assert callback_from_disk.before_iteration is False
    assert callback.period == callback_from_disk.period
    assert callback.period == periods
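A hedged sketch of what this test exercises: the object returned by lgb.log_evaluation carries its configuration as plain attributes, so a pickle round trip preserves it. Plain pickle stands in here for the test's serializer fixture; the attribute values checked are the same ones the test asserts.

# Hedged sketch: pickle replaces the test's serializer fixture.
import pickle
import lightgbm as lgb

cb = lgb.log_evaluation(period=42)
cb_restored = pickle.loads(pickle.dumps(cb))
assert cb_restored.period == 42
assert cb_restored.order == 10            # callbacks run in ascending 'order'
assert cb_restored.before_iteration is False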
def test_tune_best_score_reproducibility(self) -> None:
    california = sklearn.datasets.fetch_california_housing()
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        california.data, california.target, random_state=0
    )
    train = lgb.Dataset(X_trainval, y_trainval)
    valid = lgb.Dataset(X_test, y_test)
    params = {
        "objective": "regression",
        "metric": "rmse",
        "random_seed": 0,
        "deterministic": True,
        "force_col_wise": True,
        "verbosity": -1,
    }
    tuner_first_try = lgb.LightGBMTuner(
        params,
        train,
        valid_sets=valid,
        early_stopping_rounds=3,
        optuna_seed=10,
        callbacks=[log_evaluation(-1)],
    )
    tuner_first_try.run()
    best_score_first_try = tuner_first_try.best_score

    tuner_second_try = lgb.LightGBMTuner(
        params,
        train,
        valid_sets=valid,
        early_stopping_rounds=3,
        optuna_seed=10,
        callbacks=[log_evaluation(-1)],
    )
    tuner_second_try.run()
    best_score_second_try = tuner_second_try.best_score

    assert best_score_second_try == best_score_first_try
def test_tune_num_leaves_negative_max_depth(self) -> None:
    params: Dict[str, Any] = {
        "metric": "binary_logloss",
        "max_depth": -1,
        "verbose": -1,
    }
    X_trn = np.random.uniform(10, size=(10, 5))
    y_trn = np.random.randint(2, size=10)
    train_dataset = lgb.Dataset(X_trn, label=y_trn)
    valid_dataset = lgb.Dataset(X_trn, label=y_trn)
    runner = lgb.LightGBMTuner(
        params,
        train_dataset,
        num_boost_round=3,
        early_stopping_rounds=2,
        valid_sets=valid_dataset,
        callbacks=[log_evaluation(-1)],
    )
    runner.tune_num_leaves()
    assert len(runner.study.trials) == 20
def test_run_show_progress_bar(self, show_progress_bar: bool, expected: int) -> None:
    params: Dict = {"verbose": -1}
    dataset = lgb.Dataset(np.zeros((10, 10)))
    study = optuna.create_study()
    tuner = LightGBMTuner(
        params,
        dataset,
        valid_sets=dataset,
        study=study,
        callbacks=[log_evaluation(-1)],
        time_budget=1,
        show_progress_bar=show_progress_bar,
    )
    with mock.patch.object(
        _BaseTuner, "_get_booster_best_score", return_value=1.0
    ), mock.patch("tqdm.tqdm") as mock_tqdm:
        tuner.run()
    assert mock_tqdm.call_count == expected
def fit(
    self,
    dataset: DatasetH,
    num_boost_round=None,
    early_stopping_rounds=None,
    verbose_eval=20,
    evals_result=None,
    reweighter=None,
    **kwargs,
):
    if evals_result is None:
        evals_result = {}  # avoid a mutable default argument
    ds_l = self._prepare_data(dataset, reweighter)
    ds, names = list(zip(*ds_l))
    early_stopping_callback = lgb.early_stopping(
        self.early_stopping_rounds if early_stopping_rounds is None else early_stopping_rounds
    )
    # NOTE: if you encounter an error here, please upgrade your lightgbm
    verbose_eval_callback = lgb.log_evaluation(period=verbose_eval)
    evals_result_callback = lgb.record_evaluation(evals_result)
    self.model = lgb.train(
        self.params,
        ds[0],  # training dataset
        num_boost_round=self.num_boost_round if num_boost_round is None else num_boost_round,
        valid_sets=ds,
        valid_names=names,
        callbacks=[early_stopping_callback, verbose_eval_callback, evals_result_callback],
        **kwargs,
    )
    for k in names:
        for key, val in evals_result[k].items():
            name = f"{key}.{k}"
            for epoch, m in enumerate(val):
                R.log_metrics(**{name.replace("@", "_"): m}, step=epoch)
def test_optuna_callback(self) -> None:
    params: Dict[str, Any] = {"verbose": -1}
    dataset = lgb.Dataset(np.zeros((10, 10)))
    callback_mock = mock.MagicMock()
    study = optuna.create_study()
    tuner = LightGBMTuner(
        params,
        dataset,
        valid_sets=dataset,
        study=study,
        callbacks=[log_evaluation(-1)],
        optuna_callbacks=[callback_mock],
    )
    with mock.patch.object(_BaseTuner, "_get_booster_best_score", return_value=1.0):
        tuner._tune_params(["num_leaves"], 10, optuna.samplers.TPESampler(), "num_leaves")
    assert callback_mock.call_count == 10
    'verbose': 0
}

evals_result = {}  # to record eval results for plotting

print('Starting training...')
# train
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_train, lgb_test],
    feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])],
    categorical_feature=[21],
    callbacks=[
        lgb.log_evaluation(10),
        lgb.record_evaluation(evals_result)
    ]
)

print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plotting split value histogram...')
ax = lgb.plot_split_value_histogram(gbm, feature='f26', bins='auto')
plt.show()
def test_register_logger(tmp_path):
    logger = logging.getLogger("LightGBM")
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(levelname)s | %(message)s')
    log_filename = tmp_path / "LightGBM_test_logger.log"
    file_handler = logging.FileHandler(log_filename, mode="w", encoding="utf-8")
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    def dummy_metric(_, __):
        logger.debug('In dummy_metric')
        return 'dummy_metric', 1, True

    lgb.register_logger(logger)

    X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32)
    y = np.array([0, 1, 1, 0])
    lgb_data = lgb.Dataset(X, y)

    eval_records = {}
    callbacks = [
        lgb.record_evaluation(eval_records),
        lgb.log_evaluation(2),
        lgb.early_stopping(4)
    ]
    lgb.train(
        {'objective': 'binary', 'metric': ['auc', 'binary_error']},
        lgb_data,
        num_boost_round=10,
        feval=dummy_metric,
        valid_sets=[lgb_data],
        categorical_feature=[1],
        callbacks=callbacks
    )

    lgb.plot_metric(eval_records)

    expected_log = r"""
INFO | [LightGBM] [Warning] There are no meaningful features, as all feature values are constant.
INFO | [LightGBM] [Info] Number of positive: 2, number of negative: 2
INFO | [LightGBM] [Info] Total Bins 0
INFO | [LightGBM] [Info] Number of data points in the train set: 4, number of used features: 0
INFO | [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | Training until validation scores don't improve for 4 rounds
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [2] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [4] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [6] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [8] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [10] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1
INFO | Did not meet early stopping. Best iteration is:
[1] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1
WARNING | More than one metric available, picking one to plot.
""".strip() gpu_lines = [ "INFO | [LightGBM] [Info] This is the GPU trainer", "INFO | [LightGBM] [Info] Using GPU Device:", "INFO | [LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...", "INFO | [LightGBM] [Info] GPU programs have been built", "INFO | [LightGBM] [Warning] GPU acceleration is disabled because no non-trivial dense features can be found", "INFO | [LightGBM] [Warning] Using sparse features with CUDA is currently not supported.", "INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.", "INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!" ] with open(log_filename, "rt", encoding="utf-8") as f: actual_log = f.read().strip() actual_log_wo_gpu_stuff = [] for line in actual_log.split("\n"): if not any(line.startswith(gpu_line) for gpu_line in gpu_lines): actual_log_wo_gpu_stuff.append(line) assert "\n".join(actual_log_wo_gpu_stuff) == expected_log