import os

from pandas import DataFrame, Series

# try_import_lightgbm (used throughout these examples) is assumed to come from
# the surrounding AutoGluon utils module; it raises an informative error when
# lightgbm is not installed.


def construct_dataset(x: DataFrame,
                      y: Series,
                      location=None,
                      reference=None,
                      params=None,
                      save=False,
                      weight=None):
    try_import_lightgbm()
    import lightgbm as lgb

    dataset = lgb.Dataset(data=x,
                          label=y,
                          reference=reference,
                          free_raw_data=True,
                          params=params,
                          weight=weight)

    if save:
        assert location is not None
        saving_path = f'{location}.bin'
        if os.path.exists(saving_path):
            os.remove(saving_path)

        os.makedirs(os.path.dirname(saving_path), exist_ok=True)
        dataset.save_binary(saving_path)
        # dataset_binary = lgb.Dataset(location + '.bin', reference=reference, free_raw_data=False)# .construct()

    return dataset
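
# A minimal usage sketch for construct_dataset (hypothetical paths and toy
# data; assumes pandas and lightgbm are installed):

import pandas as pd

x_demo = pd.DataFrame({'feature': [1.0, 2.0, 3.0, 4.0]})
y_demo = pd.Series([0, 1, 0, 1])
# Writes 'datasets/train.bin' and returns the lgb.Dataset handle.
dataset_demo = construct_dataset(x_demo, y_demo, location='datasets/train', save=True)
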
def lgb_trial(args, reporter):
    """ Training script for hyperparameter evaluation of Gradient Boosting model """
    try:
        model, args, util_args = model_trial.prepare_inputs(args=args)

        try_import_lightgbm()
        import lightgbm as lgb

        dataset_train = lgb.Dataset(util_args.directory + util_args.dataset_train_filename)
        dataset_val = lgb.Dataset(util_args.directory + util_args.dataset_val_filename)
        X_val, y_val = load_pkl.load(util_args.directory + util_args.dataset_val_pkl_filename)

        fit_model_args = dict(dataset_train=dataset_train, dataset_val=dataset_val, **util_args.get('fit_kwargs', dict()))
        predict_proba_args = dict(X=X_val)
        model_trial.fit_and_save_model(model=model, params=args, fit_args=fit_model_args, predict_proba_args=predict_proba_args, y_val=y_val,
                                       time_start=util_args.time_start, time_limit=util_args.get('time_limit', None), reporter=reporter)
    except Exception as e:
        if not isinstance(e, TimeLimitExceeded):
            logger.exception(e)  # exception() already records the traceback; exc_info is implied
        reporter.terminate()
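
# lgb_trial is invoked by an HPO scheduler rather than called directly. A
# hedged sketch of the inputs it consumes (field names inferred from the body
# above; the exact structure of util_args is produced by
# model_trial.prepare_inputs, and the paths here are hypothetical):

import time

util_args_demo = dict(
    directory='models/LightGBM/',              # where the binary datasets live
    dataset_train_filename='dataset_train.bin',
    dataset_val_filename='dataset_val.bin',
    dataset_val_pkl_filename='dataset_val.pkl',
    time_start=time.time(),
    time_limit=3600,
    fit_kwargs=dict(),
)
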
Example #3
    def _fit(self, X_train=None, y_train=None, X_val=None, y_val=None, dataset_train=None, dataset_val=None, time_limit=None, num_gpus=0, **kwargs):
        start_time = time.time()
        params = self.params.copy()

        # TODO: kwargs can have num_cpu, num_gpu. Currently these are ignored.
        verbosity = kwargs.get('verbosity', 2)
        params = fixedvals_from_searchspaces(params)

        if verbosity <= 1:
            verbose_eval = False
        elif verbosity == 2:
            verbose_eval = 1000
        elif verbosity == 3:
            verbose_eval = 50
        else:
            verbose_eval = 1

        eval_metric, eval_metric_name = self.get_eval_metric()
        dataset_train, dataset_val = self.generate_datasets(X_train=X_train, y_train=y_train, params=params, X_val=X_val, y_val=y_val, dataset_train=dataset_train, dataset_val=dataset_val)
        gc.collect()

        num_boost_round = params.pop('num_boost_round', 1000)
        dart_retrain = params.pop('dart_retrain', False)  # Whether to retrain the model to get optimal iteration if model is trained in 'dart' mode.
        if num_gpus != 0:
            if 'device' not in params:
                # TODO: lightgbm must have a special install to support GPU: https://github.com/Microsoft/LightGBM/tree/master/python-package#build-gpu-version
                #  Before enabling GPU, we should add code to detect that GPU-enabled version is installed and that a valid GPU exists.
                #  GPU training heavily alters accuracy, often in a negative manner. We will have to be careful about when to use GPU.
                # TODO: Confirm if GPU is used in HPO (Probably not)
                # params['device'] = 'gpu'
                pass
        logger.log(15, f'Training Gradient Boosting Model for {num_boost_round} rounds...')
        logger.log(15, "with the following hyperparameter settings:")
        logger.log(15, params)

        num_rows_train = len(dataset_train.data)
        if 'min_data_in_leaf' in params:
            if params['min_data_in_leaf'] > num_rows_train:  # TODO: may not be necessary
                params['min_data_in_leaf'] = max(1, int(num_rows_train / 5.0))

        # TODO: Better solution: Track trend to early stop when score is far worse than best score, or score is trending worse over time
        if (dataset_val is not None) and (dataset_train is not None):
            modifier = 1 if num_rows_train <= 10000 else 10000 / num_rows_train
            early_stopping_rounds = max(round(modifier * 150), 10)
        else:
            early_stopping_rounds = 150

        callbacks = []
        valid_names = ['train_set']
        valid_sets = [dataset_train]
        if dataset_val is not None:
            reporter = kwargs.get('reporter', None)
            train_loss_name = self._get_train_loss_name() if reporter is not None else None
            if train_loss_name is not None:
                if 'metric' not in params or params['metric'] == '':
                    params['metric'] = train_loss_name
                elif train_loss_name not in params['metric']:
                    params['metric'] = f'{params["metric"]},{train_loss_name}'
            callbacks += [
                # Note: Don't use self.params_aux['max_memory_usage_ratio'] here as LightGBM handles memory per iteration optimally.  # TODO: Consider using when ratio < 1.
                early_stopping_custom(early_stopping_rounds, metrics_to_use=[('valid_set', eval_metric_name)], max_diff=None, start_time=start_time, time_limit=time_limit,
                                      ignore_dart_warning=True, verbose=False, manual_stop_file=False, reporter=reporter, train_loss_name=train_loss_name),
            ]
            valid_names = ['valid_set'] + valid_names
            valid_sets = [dataset_val] + valid_sets

        seed_val = params.pop('seed_value', 0)
        train_params = {
            'params': params,
            'train_set': dataset_train,
            'num_boost_round': num_boost_round,
            'valid_sets': valid_sets,
            'valid_names': valid_names,
            'callbacks': callbacks,
            'verbose_eval': verbose_eval,
        }
        if not isinstance(eval_metric, str):
            train_params['feval'] = eval_metric
        else:
            if 'metric' not in train_params['params'] or train_params['params']['metric'] == '':
                train_params['params']['metric'] = eval_metric
            elif eval_metric not in train_params['params']['metric']:
                train_params['params']['metric'] = f'{train_params["params"]["metric"]},{eval_metric}'
        if self.problem_type == SOFTCLASS:
            train_params['fobj'] = lgb_utils.softclass_lgbobj
        if seed_val is not None:
            train_params['params']['seed'] = seed_val
            random.seed(seed_val)
            np.random.seed(seed_val)

        # Train LightGBM model:
        try_import_lightgbm()
        import lightgbm as lgb
        with warnings.catch_warnings():
            # Filter harmless warnings introduced in lightgbm 3.0, future versions plan to remove: https://github.com/microsoft/LightGBM/issues/3379
            warnings.filterwarnings('ignore', message='Overriding the parameters from Reference Dataset.')
            warnings.filterwarnings('ignore', message='categorical_column in param dict is overridden.')
            self.model = lgb.train(**train_params)
            retrain = False
            if train_params['params'].get('boosting_type', '') == 'dart':
                if dataset_val is not None and dart_retrain and (self.model.best_iteration != num_boost_round):
                    retrain = True
                    if time_limit is not None:
                        time_left = time_limit + start_time - time.time()
                        if time_left < 0.5 * time_limit:
                            retrain = False
                    if retrain:
                        logger.log(15, f"Retraining LGB model to optimal iterations ('dart' mode).")
                        train_params.pop('callbacks')
                        train_params['num_boost_round'] = self.model.best_iteration
                        self.model = lgb.train(**train_params)
                    else:
                        logger.log(15, f"Not enough time to retrain LGB model ('dart' mode)...")

        if dataset_val is not None and not retrain:
            self.params_trained['num_boost_round'] = self.model.best_iteration
        else:
            self.params_trained['num_boost_round'] = self.model.current_iteration()
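
# The early-stopping patience in _fit above shrinks as the training set grows:
# up to 10k rows the full 150 rounds are used; beyond that, patience scales as
# 10000 / num_rows_train, floored at 10 rounds. A standalone sketch:

def early_stopping_patience(num_rows_train):
    modifier = 1 if num_rows_train <= 10000 else 10000 / num_rows_train
    return max(round(modifier * 150), 10)

assert early_stopping_patience(5_000) == 150
assert early_stopping_patience(100_000) == 15
assert early_stopping_patience(1_000_000) == 10
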
Example #4
    def _fit(self,
             X=None,
             y=None,
             X_val=None,
             y_val=None,
             dataset_train=None,
             dataset_val=None,
             time_limit=None,
             num_gpus=0,
             sample_weight=None,
             sample_weight_val=None,
             verbosity=2,
             **kwargs):
        start_time = time.time()
        ag_params = self._get_ag_params()
        params = self._get_model_params()
        params = fixedvals_from_searchspaces(params)

        if verbosity <= 1:
            verbose_eval = False
        elif verbosity == 2:
            verbose_eval = 1000
        elif verbosity == 3:
            verbose_eval = 50
        else:
            verbose_eval = 1

        stopping_metric, stopping_metric_name = self._get_stopping_metric_internal()
        dataset_train, dataset_val = self.generate_datasets(
            X=X,
            y=y,
            params=params,
            X_val=X_val,
            y_val=y_val,
            sample_weight=sample_weight,
            sample_weight_val=sample_weight_val,
            dataset_train=dataset_train,
            dataset_val=dataset_val)
        gc.collect()

        num_boost_round = params.pop('num_boost_round', 1000)
        dart_retrain = params.pop('dart_retrain', False)  # Whether to retrain the model to get optimal iteration if model is trained in 'dart' mode.
        if num_gpus != 0:
            if 'device' not in params:
                # TODO: lightgbm must have a special install to support GPU: https://github.com/Microsoft/LightGBM/tree/master/python-package#build-gpu-version
                #  Before enabling GPU, we should add code to detect that GPU-enabled version is installed and that a valid GPU exists.
                #  GPU training heavily alters accuracy, often in a negative manner. We will have to be careful about when to use GPU.
                params['device'] = 'gpu'
                logger.log(20, f'\tTraining {self.name} with GPU, note that this may negatively impact model quality compared to CPU training.')
        logger.log(15, f'Training Gradient Boosting Model for {num_boost_round} rounds...')
        logger.log(15, "with the following hyperparameter settings:")
        logger.log(15, params)

        num_rows_train = len(dataset_train.data)
        if 'min_data_in_leaf' in params:
            if params['min_data_in_leaf'] > num_rows_train:  # TODO: may not be necessary
                params['min_data_in_leaf'] = max(1, int(num_rows_train / 5.0))

        callbacks = []
        valid_names = ['train_set']
        valid_sets = [dataset_train]
        if dataset_val is not None:
            # TODO: Better solution: Track trend to early stop when score is far worse than best score, or score is trending worse over time
            early_stopping_rounds = ag_params.get('ag.es', 'auto')
            if isinstance(early_stopping_rounds, str):
                early_stopping_rounds = self._get_early_stopping_rounds(num_rows_train=num_rows_train, strategy=early_stopping_rounds)
            if early_stopping_rounds is None:
                early_stopping_rounds = 999999
            reporter = kwargs.get('reporter', None)
            train_loss_name = self._get_train_loss_name() if reporter is not None else None
            if train_loss_name is not None:
                if 'metric' not in params or params['metric'] == '':
                    params['metric'] = train_loss_name
                elif train_loss_name not in params['metric']:
                    params['metric'] = f'{params["metric"]},{train_loss_name}'
            callbacks += [
                # Note: Don't use self.params_aux['max_memory_usage_ratio'] here as LightGBM handles memory per iteration optimally.  # TODO: Consider using when ratio < 1.
                early_stopping_custom(early_stopping_rounds, metrics_to_use=[('valid_set', stopping_metric_name)], max_diff=None, start_time=start_time, time_limit=time_limit,
                                      ignore_dart_warning=True, verbose=False, manual_stop_file=False, reporter=reporter, train_loss_name=train_loss_name),
            ]
            valid_names = ['valid_set'] + valid_names
            valid_sets = [dataset_val] + valid_sets

        seed_val = params.pop('seed_value', 0)
        train_params = {
            'params': params,
            'train_set': dataset_train,
            'num_boost_round': num_boost_round,
            'valid_sets': valid_sets,
            'valid_names': valid_names,
            'callbacks': callbacks,
            'verbose_eval': verbose_eval,
        }
        if not isinstance(stopping_metric, str):
            train_params['feval'] = stopping_metric
        else:
            if 'metric' not in train_params['params'] or train_params['params']['metric'] == '':
                train_params['params']['metric'] = stopping_metric
            elif stopping_metric not in train_params['params']['metric']:
                train_params['params']['metric'] = f'{train_params["params"]["metric"]},{stopping_metric}'
        if self.problem_type == SOFTCLASS:
            train_params['fobj'] = lgb_utils.softclass_lgbobj
        if seed_val is not None:
            train_params['params']['seed'] = seed_val
            random.seed(seed_val)
            np.random.seed(seed_val)

        # Train LightGBM model:
        try_import_lightgbm()
        import lightgbm as lgb
        from lightgbm.basic import LightGBMError
        with warnings.catch_warnings():
            # Filter harmless warnings introduced in lightgbm 3.0, future versions plan to remove: https://github.com/microsoft/LightGBM/issues/3379
            warnings.filterwarnings('ignore', message='Overriding the parameters from Reference Dataset.')
            warnings.filterwarnings('ignore', message='categorical_column in param dict is overridden.')
            try:
                self.model = lgb.train(**train_params)
            except LightGBMError:
                if train_params['params'].get('device', 'cpu') != 'gpu':
                    raise
                else:
                    logger.warning('Warning: GPU mode might not be installed for LightGBM, GPU training raised an exception. Falling back to CPU training...\n'
                                   'Refer to LightGBM GPU documentation: https://github.com/Microsoft/LightGBM/tree/master/python-package#build-gpu-version\n'
                                   'One possible method is:\n'
                                   '\tpip uninstall lightgbm -y\n'
                                   '\tpip install lightgbm --install-option=--gpu')
                    train_params['params']['device'] = 'cpu'
                    self.model = lgb.train(**train_params)
            retrain = False
            if train_params['params'].get('boosting_type', '') == 'dart':
                if dataset_val is not None and dart_retrain and (self.model.best_iteration != num_boost_round):
                    retrain = True
                    if time_limit is not None:
                        time_left = time_limit + start_time - time.time()
                        if time_left < 0.5 * time_limit:
                            retrain = False
                    if retrain:
                        logger.log(15, "Retraining LGB model to optimal iterations ('dart' mode).")
                        train_params.pop('callbacks')
                        train_params['num_boost_round'] = self.model.best_iteration
                        self.model = lgb.train(**train_params)
                    else:
                        logger.log(15, "Not enough time to retrain LGB model ('dart' mode)...")

        if dataset_val is not None and not retrain:
            self.params_trained['num_boost_round'] = self.model.best_iteration
        else:
            self.params_trained['num_boost_round'] = self.model.current_iteration()
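
# Both _fit variants merge the stopping/eval metric into LightGBM's 'metric'
# parameter as a comma-separated string. A standalone sketch of that rule:

def merge_metric(params, metric_name):
    if 'metric' not in params or params['metric'] == '':
        params['metric'] = metric_name
    elif metric_name not in params['metric']:
        params['metric'] = f"{params['metric']},{metric_name}"
    return params

assert merge_metric({}, 'binary_logloss') == {'metric': 'binary_logloss'}
assert merge_metric({'metric': 'auc'}, 'binary_logloss') == {'metric': 'auc,binary_logloss'}
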
    def _callback(env):
        try_import_lightgbm()
        from lightgbm.callback import _format_eval_result, EarlyStopException
        if not cmp_op:
            _init(env)
        if not enabled[0]:
            return
        if train_loss_name is not None:
            train_loss_evals = [res for res in env.evaluation_result_list
                                if res[0] == 'train_set' and res[1] == train_loss_name]
            train_loss_val = train_loss_evals[0][2]
        else:
            train_loss_val = 0.0
        for i in indices_to_check:
            eval_result = env.evaluation_result_list[i]
            _, eval_metric, score, greater_is_better = eval_result
            if best_score_list[i] is None or cmp_op[i](score, best_score[i]):
                best_score[i] = score
                best_iter[i] = env.iteration
                best_score_list[i] = env.evaluation_result_list
                best_trainloss[i] = train_loss_val
            if reporter is not None:  # Report current best scores for iteration, used in HPO
                # TODO: documentation needs to note that we assume 0th index is the 'official' validation performance metric.
                if i == indices_to_check[0]:
                    if cmp_op[i] == gt:
                        validation_perf = score
                    else:
                        validation_perf = -score
                    reporter(
                        epoch=env.iteration + 1,
                        validation_performance=validation_perf,
                        train_loss=best_trainloss[i],
                        best_iter_sofar=best_iter[i] + 1,
                        best_valperf_sofar=best_score[i],
                        eval_metric=eval_metric,  # eval_metric here is the stopping_metric from LGBModel
                        greater_is_better=greater_is_better,
                    )
            if env.iteration - best_iter[i] >= stopping_rounds:
                if verbose:
                    logger.log(15, 'Early stopping, best iteration is:\n[%d]\t%s'
                               % (best_iter[i] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[i]])))
                raise EarlyStopException(best_iter[i], best_score_list[i])
            elif (max_diff is not None) and (abs(score - best_score[i]) > max_diff):
                if verbose:
                    logger.debug('max_diff breached!')
                    logger.debug(abs(score - best_score[i]))
                    logger.log(15, 'Early stopping, best iteration is:\n[%d]\t%s'
                               % (best_iter[i] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[i]])))
                raise EarlyStopException(best_iter[i], best_score_list[i])
            if env.iteration == env.end_iteration - 1:
                if verbose:
                    logger.log(15, 'Did not meet early stopping criterion. Best iteration is:\n[%d]\t%s'
                               % (best_iter[i] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[i]])))
                raise EarlyStopException(best_iter[i], best_score_list[i])
            if verbose:
                logger.debug((env.iteration - best_iter[i], eval_result))
        if manual_stop_file:
            if os.path.exists(manual_stop_file):
                i = indices_to_check[0]
                logger.log(20, 'Found manual stop file, early stopping. Best iteration is:\n[%d]\t%s'
                           % (best_iter[i] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[i]])))
                raise EarlyStopException(best_iter[i], best_score_list[i])
        if time_limit:
            time_elapsed = time.time() - start_time
            time_left = time_limit - time_elapsed
            if time_left <= 0:
                i = indices_to_check[0]
                logger.log(20, '\tRan out of time, early stopping on iteration %d. Best iteration is:\n\t[%d]\t%s'
                           % (env.iteration + 1, best_iter[i] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[i]])))
                raise EarlyStopException(best_iter[i], best_score_list[i])

        # TODO: Add toggle parameter to early_stopping to disable this
        # TODO: Identify optimal threshold values for early_stopping based on lack of memory
        if env.iteration % 10 == 0:
            available = psutil.virtual_memory().available
            cur_rss = mem_status.memory_info().rss

            if cur_rss < init_mem_rss[0]:
                init_mem_rss[0] = cur_rss
            estimated_model_size_mb = (cur_rss - init_mem_rss[0]) >> 20
            available_mb = available >> 20

            model_size_memory_ratio = estimated_model_size_mb / available_mb
            if verbose or (model_size_memory_ratio > 0.25):
                logger.debug('Available Memory: ' + str(available_mb) + ' MB')
                logger.debug('Estimated Model Size: ' + str(estimated_model_size_mb) + ' MB')

            early_stop = False
            if model_size_memory_ratio > 1.0:
                logger.warning('Warning: Large GBM model size may cause OOM error if training continues')
                logger.warning('Available Memory: ' + str(available_mb) + ' MB')
                logger.warning('Estimated GBM model size: ' + str(estimated_model_size_mb) + ' MB')
                early_stop = True

            # TODO: We will want to track size of model as well, even if we early stop before OOM, we will still crash when saving if the model is large enough
            if available_mb < 512:  # Less than 512 MB
                logger.warning('Warning: Low available memory may cause OOM error if training continues')
                logger.warning('Available Memory: ' + str(available_mb) + ' MB')
                logger.warning('Estimated GBM model size: ' + str(estimated_model_size_mb) + ' MB')
                early_stop = True

            if early_stop:
                logger.warning('Warning: Early stopped GBM model prior to optimal result to avoid OOM error. Please increase available memory to avoid subpar model quality.')
                logger.log(15, 'Early stopping, best iteration is:\n[%d]\t%s'
                           % (best_iter[0] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[0]])))
                raise EarlyStopException(best_iter[0], best_score_list[0])
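
# The memory guard in _callback estimates model size as the growth of this
# process's resident set since training began and compares it to available
# system memory. A standalone sketch of the same check (assumes psutil):

import psutil

mem_status = psutil.Process()
init_mem_rss = mem_status.memory_info().rss

def should_early_stop_for_memory():
    available_mb = psutil.virtual_memory().available >> 20
    estimated_model_size_mb = (mem_status.memory_info().rss - init_mem_rss) >> 20
    # Stop if the model's estimated footprint exceeds what is left, or if
    # under 512 MB remains overall.
    return (estimated_model_size_mb / available_mb > 1.0) or (available_mb < 512)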