예제 #1
0
class SoftclassObjective(object):
    """Wrapper that owns the custom soft-label log-loss objective for catboost_dev.

    The nested ``SoftLogLossObjective`` is the object handed to CatBoost;
    an instance of it is stored on ``self.softlogloss``.
    """
    try_import_catboost(
    )  # Need to first import catboost then catboost_dev not vice-versa.
    try_import_catboostdev()
    from catboost_dev import MultiRegressionCustomObjective

    def __init__(self):
        try_import_catboostdev()
        self.softlogloss = self.SoftLogLossObjective(
        )  # the objective object to pass to CatBoostRegressor

    class SoftLogLossObjective(MultiRegressionCustomObjective):
        """Softmax cross-entropy derivatives against soft (probabilistic) targets."""

        # TODO: Consider replacing with C++ implementation (but requires building catboost from source).
        # This pure Python is 3x faster than optimized Numpy implementation. Tested C++ implementation was 3x faster than this one.
        def calc_ders_multi(self, approxes, targets, weight):
            """Return first and second derivatives for a single object.

            Args:
                approxes: Iterable of raw (pre-softmax) scores, one per class.
                targets: Indexable sequence of target values, one per class.
                weight: Scalar multiplier applied to every derivative entry.

            Returns:
                Tuple ``(grad, hess)`` where ``grad[j]`` is
                ``(targets[j] - p[j]) * weight`` and ``hess[j2][j]`` is
                ``(p[j] * p[j2] - (j == j2) * p[j]) * weight``, with ``p``
                the softmax of ``approxes``.
            """
            # Softmax is invariant to a constant shift; subtracting the largest
            # score keeps math.exp() from raising OverflowError on big inputs.
            shift = max(approxes, default=0.0)
            exp_approx = [math.exp(val - shift) for val in approxes]
            exp_sum = sum(exp_approx)
            probs = [val / exp_sum for val in exp_approx]

            num_targets = len(targets)
            grad = [(targets[j] - probs[j]) * weight
                    for j in range(num_targets)]
            hess = [[(probs[j] * probs[j2] - (j == j2) * probs[j]) * weight
                     for j in range(num_targets)]
                    for j2 in range(num_targets)]
            return (grad, hess)
예제 #2
0
class SoftclassCustomMetric(CustomMetric):
    """Custom metric wrapper that scores soft-label predictions via ``soft_log_loss``.

    The nested ``SoftLogLossMetric`` is the object handed to CatBoost; an
    instance of it is stored on ``self.softlogloss`` and ``evaluate`` simply
    delegates to it.
    """
    try_import_catboost(
    )  # Need to first import catboost then catboost_dev not vice-versa.
    try_import_catboostdev()
    from catboost_dev import MultiRegressionCustomMetric

    def __init__(self, metric, is_higher_better,
                 needs_pred_proba):  # metric is ignored
        super().__init__(metric, is_higher_better, needs_pred_proba)
        try_import_catboostdev()
        self.softlogloss = self.SoftLogLossMetric(
        )  # the metric object to pass to CatBoostRegressor

    def evaluate(self, approxes, target, weight):
        """Delegate to the wrapped ``SoftLogLossMetric`` instance."""
        return self.softlogloss.evaluate(approxes, target, weight)

    class SoftLogLossMetric(MultiRegressionCustomMetric):
        """Metric object implementing the methods CatBoost calls on a custom metric."""

        def get_final_error(self, error, weight):
            """Return the accumulated error unchanged (``weight`` is not used)."""
            return error

        def is_max_optimal(self):
            """Report that larger metric values are better."""
            return True

        def evaluate(self, approxes, target, weight):
            """Compute ``soft_log_loss`` between targets and softmax(approxes).

            Args:
                approxes: 2-D array-like of raw scores; softmax is taken
                    along axis 1.
                target: 2-D array-like with the same outer and inner lengths
                    as ``approxes``.
                weight: Accepted for interface compatibility; not used.

            Returns:
                Tuple ``(error_sum, weight_sum)`` where ``weight_sum`` is
                ``len(target)``.
            """
            assert len(target) == len(approxes)
            assert len(target[0]) == len(approxes[0])
            weight_sum = len(target)
            scores = np.array(approxes)
            # Softmax is shift-invariant per row; subtracting the row maximum
            # prevents np.exp() overflowing to inf (which would yield NaN).
            scores = scores - np.max(scores, axis=1, keepdims=True)
            probs = np.exp(scores)
            probs = probs / np.sum(probs, axis=1, keepdims=True)
            error_sum = soft_log_loss(np.array(target), probs)
            return error_sum, weight_sum
예제 #3
0
 def __init__(self):
     """Run the catboost_dev import check, then build the objective instance."""
     try_import_catboostdev()
     # The objective object to pass to CatBoostRegressor.
     objective = self.SoftLogLossObjective()
     self.softlogloss = objective
예제 #4
0
 def __init__(self, metric, is_higher_better, needs_pred_proba):
     """Initialise the base metric, then build the wrapped metric instance.

     All three arguments are forwarded unchanged to the parent constructor
     (the original author notes that ``metric`` is ignored).
     """
     super().__init__(metric, is_higher_better, needs_pred_proba)
     try_import_catboostdev()
     # The metric object to pass to CatBoostRegressor.
     wrapped_metric = self.SoftLogLossMetric()
     self.softlogloss = wrapped_metric
예제 #5
0
    def _fit(self,
             X_train,
             y_train,
             X_val=None,
             y_val=None,
             time_limit=None,
             num_gpus=0,
             **kwargs):
        """Train a CatBoost model and store it on ``self.model``.

        When ``time_limit`` is given, a short "sample" fit is run first to
        measure the time and pickled size per iteration; the remaining
        iteration budget is then capped by both the estimated time left and
        available memory, and a second fit continues from the sample model.

        Args:
            X_train: Training features; must expose ``.columns`` and
                ``select_dtypes`` after/before preprocessing.
            y_train: Training labels (for SOFTCLASS, ``y_train.shape[1]`` is
                taken as the number of classes).
            X_val: Optional validation features; enables early stopping.
            y_val: Optional validation labels.
            time_limit: Optional training budget in seconds.
            num_gpus: Requested GPU count; any non-zero value is reset to 0
                with a warning.
            **kwargs: Only ``verbosity`` (default 2) is read.

        Raises:
            NotEnoughMemoryError: If the crude memory estimate exceeds the
                allowed share of available memory.
            TimeLimitExceeded: If preprocessing consumed at least 60% of
                ``time_limit``.

        Side effects:
            Mutates ``self.params`` (drops ``num_threads``/``num_gpus``, sets
            SOFTCLASS loss/metric), sets ``self.num_classes`` for SOFTCLASS,
            ``self.model`` and ``self.params_trained['iterations']``.
        """
        try_import_catboost()
        from catboost import CatBoostClassifier, CatBoostRegressor, Pool
        if self.problem_type == SOFTCLASS:
            try_import_catboostdev(
            )  # Need to first import catboost then catboost_dev not vice-versa.
            from catboost_dev import CatBoostClassifier, CatBoostRegressor, Pool
            from .catboost_softclass_utils import SoftclassCustomMetric, SoftclassObjective
            self._set_default_param_value(
                'eval_metric',
                construct_custom_catboost_metric(
                    self.stopping_metric, True,
                    not self.stopping_metric.needs_pred, self.problem_type))
            self.params[
                'loss_function'] = SoftclassObjective.SoftLogLossObjective()
            self.params[
                'eval_metric'] = SoftclassCustomMetric.SoftLogLossMetric()
            self._set_default_param_value(
                'early_stopping_rounds',
                50)  # Speeds up training with custom (non-C++) losses

        model_type = CatBoostClassifier if self.problem_type in PROBLEM_TYPES_CLASSIFICATION else CatBoostRegressor
        # Name under which the validation score is looked up in get_best_score().
        if isinstance(self.params['eval_metric'], str):
            metric_name = self.params['eval_metric']
        else:
            metric_name = type(self.params['eval_metric']).__name__
        num_rows_train = len(X_train)
        num_cols_train = len(X_train.columns)
        if self.problem_type == MULTICLASS:
            if self.num_classes is not None:
                num_classes = self.num_classes
            else:
                num_classes = 10  # Guess if not given, can do better by looking at y_train
        elif self.problem_type == SOFTCLASS:  # TODO: delete this elif if it's unnecessary.
            num_classes = y_train.shape[1]
            self.num_classes = num_classes
        else:
            num_classes = 1

        # TODO: Add ignore_memory_limits param to disable NotEnoughMemoryError Exceptions
        max_memory_usage_ratio = self.params_aux['max_memory_usage_ratio']
        approx_mem_size_req = num_rows_train * num_cols_train * num_classes / 2  # TODO: Extremely crude approximation, can be vastly improved
        if approx_mem_size_req > 1e9:  # > 1 GB
            available_mem = psutil.virtual_memory().available
            ratio = approx_mem_size_req / available_mem
            if ratio > (1 * max_memory_usage_ratio):
                logger.warning(
                    '\tWarning: Not enough memory to safely train CatBoost model, roughly requires: %s GB, but only %s GB is available...'
                    % (round(approx_mem_size_req / 1e9,
                             3), round(available_mem / 1e9, 3)))
                raise NotEnoughMemoryError
            elif ratio > (0.2 * max_memory_usage_ratio):
                logger.warning(
                    '\tWarning: Potentially not enough memory to safely train CatBoost model, roughly requires: %s GB, but only %s GB is available...'
                    % (round(approx_mem_size_req / 1e9,
                             3), round(available_mem / 1e9, 3)))

        # The clock starts before preprocessing so that its cost counts
        # against time_limit.
        start_time = time.time()
        X_train = self.preprocess(X_train)
        cat_features = list(X_train.select_dtypes(include='category').columns)
        X_train = Pool(data=X_train, label=y_train, cat_features=cat_features)

        if X_val is not None:
            X_val = self.preprocess(X_val)
            X_val = Pool(data=X_val, label=y_val, cat_features=cat_features)
            eval_set = X_val
            # Scale patience and sample-fit length down for datasets larger
            # than 10000 rows.
            if num_rows_train <= 10000:
                modifier = 1
            else:
                modifier = 10000 / num_rows_train
            early_stopping_rounds = max(round(modifier * 150), 10)
            num_sample_iter_max = max(round(modifier * 50), 2)
        else:
            eval_set = None
            early_stopping_rounds = None
            num_sample_iter_max = 50

        # These keys are stripped before the params are passed to the
        # CatBoost constructor.
        invalid_params = ['num_threads', 'num_gpus']
        for invalid in invalid_params:
            self.params.pop(invalid, None)
        train_dir = None
        if 'allow_writing_files' in self.params and self.params[
                'allow_writing_files']:
            if 'train_dir' not in self.params:
                try:
                    # TODO: What if path is in S3?
                    os.makedirs(os.path.dirname(self.path), exist_ok=True)
                except Exception as err:
                    # Best-effort: on failure train_dir simply stays None.
                    # Catch Exception (not a bare except) so KeyboardInterrupt
                    # and SystemExit still propagate, and record the reason.
                    logger.log(
                        15,
                        f'\tCould not create directory for CatBoost train_dir, leaving it unset: {err}'
                    )
                else:
                    train_dir = self.path + 'catboost_info'
        logger.log(15, f'\tCatboost model hyperparameters: {self.params}')

        # TODO: Add more control over these params (specifically early_stopping_rounds)
        verbosity = kwargs.get('verbosity', 2)
        if verbosity <= 1:
            verbose = False
        elif verbosity == 2:
            verbose = False
        elif verbosity == 3:
            verbose = 20
        else:
            verbose = True

        init_model = None
        init_model_tree_count = None
        init_model_best_iteration = None
        init_model_best_score = None

        params = self.params.copy()
        num_features = len(self.features)

        # FIXME: v0.1 add default as CPU and GPU only if explicitly specified in ag_args_fit.
        #  Fix num_gpus > 0 and time_limit is not None : Currently crashes.
        if num_gpus != 0:
            logger.warning(
                'WARNING: Attempted to run CatBoost with GPU. CatBoost GPU implementation is not yet implemented, using CPU instead.'
            )
            num_gpus = 0

        # NOTE(review): unreachable while num_gpus is forced to 0 above; kept
        # as the placeholder for the GPU support described in the FIXME.
        if num_gpus != 0:
            if 'task_type' not in params:
                params['task_type'] = 'GPU'
                # TODO: Confirm if GPU is used in HPO (Probably not)
                # TODO: Adjust max_bins to 254?

        if params.get('task_type', None) == 'GPU':
            if 'colsample_bylevel' in params:
                params.pop('colsample_bylevel')
                logger.log(
                    30,
                    '\t\'colsample_bylevel\' is not supported on GPU, using default value (Default = 1).'
                )
            if 'rsm' in params:
                params.pop('rsm')
                logger.log(
                    30,
                    '\t\'rsm\' is not supported on GPU, using default value (Default = 1).'
                )

        if self.problem_type == MULTICLASS and 'rsm' not in params and 'colsample_bylevel' not in params and num_features > 1000:
            if time_limit:
                # Reduce sample iterations to avoid taking unreasonable amounts of time
                num_sample_iter_max = max(round(num_sample_iter_max / 2), 2)
            # Subsample columns to speed up training
            if params.get('task_type',
                          None) != 'GPU':  # RSM does not work on GPU
                params['colsample_bylevel'] = max(
                    min(1.0, 1000 / num_features), 0.05)
                logger.log(
                    30,
                    f'\tMany features detected ({num_features}), dynamically setting \'colsample_bylevel\' to {params["colsample_bylevel"]} to speed up training (Default = 1).'
                )
                logger.log(
                    30,
                    '\tTo disable this functionality, explicitly specify \'colsample_bylevel\' in the model hyperparameters.'
                )
            else:
                params['colsample_bylevel'] = 1.0
                logger.log(
                    30,
                    '\t\'colsample_bylevel\' is not supported on GPU, using default value (Default = 1).'
                )

        if time_limit:
            time_left_start = time_limit - (time.time() - start_time)
            if time_left_start <= time_limit * 0.4:  # if 60% of time was spent preprocessing, likely not enough time to train model
                raise TimeLimitExceeded
            # Phase 1: short sample fit used to measure per-iteration cost.
            params_init = params.copy()
            num_sample_iter = min(num_sample_iter_max,
                                  params_init['iterations'])
            params_init['iterations'] = num_sample_iter
            if train_dir is not None:
                params_init['train_dir'] = train_dir
            self.model = model_type(**params_init, )
            self.model.fit(
                X_train,
                eval_set=eval_set,
                use_best_model=True,
                verbose=verbose,
                # early_stopping_rounds=early_stopping_rounds,
            )

            init_model_tree_count = self.model.tree_count_
            init_model_best_iteration = self.model.get_best_iteration()
            # NOTE(review): this lookup presumes a validation set was given;
            # confirm behaviour when X_val is None and time_limit is set.
            init_model_best_score = self.model.get_best_score(
            )['validation'][metric_name]

            # Extrapolate how many more iterations fit in the remaining time.
            time_left_end = time_limit - (time.time() - start_time)
            time_taken_per_iter = (time_left_start -
                                   time_left_end) / num_sample_iter
            estimated_iters_in_time = round(time_left_end /
                                            time_taken_per_iter)
            init_model = self.model

            params_final = params.copy()

            # TODO: This only handles memory with time_limit specified, but not with time_limit=None, handle when time_limit=None
            available_mem = psutil.virtual_memory().available
            if self.problem_type == SOFTCLASS:  # TODO: remove this once catboost-dev is no longer necessary and SOFTCLASS objectives can be pickled.
                model_size_bytes = 1  # skip memory check
            else:
                model_size_bytes = sys.getsizeof(pickle.dumps(self.model))

            # Cap total iterations so the projected model size stays within
            # 30% (scaled by max_memory_usage_ratio) of available memory.
            max_memory_proportion = 0.3 * max_memory_usage_ratio
            mem_usage_per_iter = model_size_bytes / num_sample_iter
            max_memory_iters = math.floor(
                available_mem * max_memory_proportion / mem_usage_per_iter)

            params_final['iterations'] = min(
                params['iterations'] - num_sample_iter,
                estimated_iters_in_time)
            if params_final['iterations'] > max_memory_iters - num_sample_iter:
                if max_memory_iters - num_sample_iter <= 500:
                    logger.warning(
                        '\tWarning: CatBoost will be early stopped due to lack of memory, increase memory to enable full quality models, max training iterations changed to %s from %s'
                        % (max_memory_iters,
                           params_final['iterations'] + num_sample_iter))
                params_final['iterations'] = max_memory_iters - num_sample_iter
        else:
            params_final = params.copy()

        if train_dir is not None:
            params_final['train_dir'] = train_dir
        if params_final['iterations'] > 0:
            # Phase 2: main fit, continuing from the sample model if any.
            self.model = model_type(**params_final, )

            # TODO: Strangely, this performs different if clone init_model is sent in than if trained for same total number of iterations. May be able to optimize catboost models further with this
            self.model.fit(
                X_train,
                eval_set=eval_set,
                verbose=verbose,
                early_stopping_rounds=early_stopping_rounds,
                # use_best_model=True,
                init_model=init_model,
            )

            if init_model is not None:
                final_model_best_score = self.model.get_best_score(
                )['validation'][metric_name]
                # A score below the metric's optimum is treated as
                # higher-is-better; otherwise lower-is-better. Keep whichever
                # phase produced the better validation score.
                if self.stopping_metric._optimum > final_model_best_score:
                    if final_model_best_score > init_model_best_score:
                        best_iteration = init_model_tree_count + self.model.get_best_iteration(
                        )
                    else:
                        best_iteration = init_model_best_iteration
                else:
                    if final_model_best_score < init_model_best_score:
                        best_iteration = init_model_tree_count + self.model.get_best_iteration(
                        )
                    else:
                        best_iteration = init_model_best_iteration

                # Truncate the combined model at the chosen best iteration.
                self.model.shrink(ntree_start=0, ntree_end=best_iteration + 1)

        self.params_trained['iterations'] = self.model.tree_count_