def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    """Fit a CatBoostClassifier on label-encoded targets and record the model properties.

    Args:
        X: training features; a ``dt.Frame`` is converted to a numpy array,
            anything else must expose ``.columns``.
        y: training target, encoded with a ``LabelEncoder`` fitted on ``self.labels``.
        sample_weight: optional per-row weights forwarded to ``model.fit``.
        eval_set: optional ``[(valid_X, valid_y)]``; only the first pair is used.
        sample_weight_eval_set: accepted for interface compatibility, unused here.
        **kwargs: ``early_stopping_rounds`` is read from here when present.
    """
    from catboost import CatBoostClassifier

    lb = LabelEncoder()
    lb.fit(self.labels)
    y = lb.transform(y)

    if isinstance(X, dt.Frame):
        orig_cols = list(X.names)
        X = X.to_numpy()
    else:
        # Same fallback as CatBoostModel.fit: previously orig_cols was undefined on this path.
        orig_cols = list(X.columns)

    if eval_set is not None:
        valid_X, valid_y = eval_set[0][0], eval_set[0][1]
        if hasattr(valid_X, 'to_numpy'):
            valid_X = valid_X.to_numpy()
        # Build a new list instead of writing into the caller's eval_set.
        eval_set = [(valid_X, lb.transform(valid_y))]

    model = CatBoostClassifier(**self.params)
    model.fit(X, y=y,
              sample_weight=sample_weight,
              baseline=None,
              eval_set=eval_set,
              early_stopping_rounds=kwargs.get('early_stopping_rounds', None),
              verbose=self.params.get('verbose', False))

    # need to move to wrapper
    best_iteration = model.get_best_iteration()
    if best_iteration is not None:
        # best iteration is a 0-based index, so the tree count is index + 1
        iterations = best_iteration + 1
    else:
        # no best iteration reported: use the configured count as-is (no + 1)
        iterations = self.params['iterations']

    # must always set best_iterations
    self.set_model_properties(model=model,
                              features=orig_cols,
                              importances=model.feature_importances_,
                              iterations=iterations)
class CatBoost(BaseModel):
    """Wrapper class of CatBoost.

    ``self.core`` contains the fitted ``CatBoostClassifier`` once
    :meth:`train` has been called.
    """

    @timer
    def __init__(self, config):
        self.config = config

    @timer
    def train(self,
              X_train,
              y_train,
              X_val=None,
              y_val=None,
              params=None,
              num_boost_round=100,
              early_stopping_rounds=None,
              fold=0):
        """Fit a CatBoostClassifier and store it in ``self.core``.

        Args:
            X_train, y_train: training features and target.
            X_val, y_val: optional validation data; an eval set is passed to
                CatBoost only when both are given.
            params: optional dict of extra constructor arguments.
            num_boost_round: forwarded to the classifier constructor.
            early_stopping_rounds: forwarded to ``fit``.
            fold: unused; kept for interface compatibility.

        Returns:
            ``self``, so calls can be chained.
        """
        self.core = CatBoostClassifier(
            # **self.config.params,
            **(params or {}),  # params defaults to None, which cannot be unpacked
            num_boost_round=num_boost_round)
        # Do not hand CatBoost a (None, None) pair when no validation data was supplied.
        eval_set = None if X_val is None or y_val is None else (X_val, y_val)
        self.core.fit(
            X=X_train,
            y=y_train,
            eval_set=eval_set,
            # verbose=True,
            early_stopping_rounds=early_stopping_rounds)
        return self

    @timer
    def predict(self, X_test):
        """Return column 1 of ``predict_proba`` for ``X_test``."""
        return self.core.predict_proba(X_test)[:, 1]

    @property
    def feature_importance(self):
        return self.core.get_feature_importance()

    @property
    def best_iteration(self):
        return self.core.get_best_iteration()

    @property
    def evals_result(self):
        return self.core.get_evals_result()
def fit_catboost(self, X, y, X_val, y_val):
    """Train an AUC-monitored CatBoost classifier and append it to ``self.models``.

    The stored entry is a dict with the fitted model, its best validation AUC
    and the iteration at which that score was reached.
    """
    logging.info('- Fit catboost model')

    classifier = CatBoostClassifier(iterations=10000, eval_metric='AUC')
    classifier.fit(X,
                   y,
                   eval_set=(X_val, y_val),
                   early_stopping_rounds=50,
                   verbose=100)

    validation_auc = classifier.get_best_score()['validation']['AUC']
    stopped_at = classifier.get_best_iteration()

    entry = {
        'model': classifier,
        'best_score': validation_auc,
        'best_iteration': stopped_at
    }
    self.models.append(entry)

    summary = 'Best score = {:.2%}, in {} iterations'.format(
        validation_auc, stopped_at)
    logging.info(summary)
class CatBoostModel(CustomModel):
    """Custom model recipe that fits CatBoost regressors/classifiers.

    Parameter defaults and mutations are borrowed from ``LightGBMModel`` and
    then translated to CatBoost names in :meth:`transcribe_params`.
    """
    _regression = True
    _binary = True
    _multiclass = True
    _display_name = "CatBoost"
    _description = "Yandex CatBoost GBM"
    _can_use_multi_gpu = False  # Can enable, but consumes too much memory
    # WIP: leakage can't find _catboost module, unsure what special.  Probably shift would fail too if used catboost.
    _can_use_gpu = True
    _force_gpu = False  # force use of GPU regardless of what DAI says
    _can_handle_categorical = True
    _can_handle_non_numeric = True
    _can_handle_text = False  # catboost has issues when text is arbitrary and entirely unique across all rows
    _used_return_params = True
    _average_return_params = True
    _fit_by_iteration = True
    _fit_iteration_name = 'n_estimators'
    _is_gbm = True  # ensure final model changes n_estimators and learning_rate and complain if early stopping didn't work.
    _predict_by_iteration = True
    _predict_iteration_name = 'ntree_end'
    _save_by_pickle = True  # if False, use catboost save/load model as intermediate binary file
    _testing_can_skip_failure = False  # ensure tested as if shouldn't fail
    # Increase gpu_ram_part if know system is isolated
    _make_logger = True  # set to True to make logger
    _show_logger_test = False  # set to True to see how to send information to experiment logger
    _show_task_test = False  # set to True to see how task is used to send message to GUI

    _min_one_hot_max_size = 4
    _min_learning_rate_catboost = 0.005  # for catboost often for same low learning rate as xgb/lgb, too many trees

    def __init__(self, context=None,
                 unfitted_pipeline_path=None,
                 transformed_features=None,
                 original_user_cols=None,
                 date_format_strings=None,
                 **kwargs):
        """Forward all arguments to the base class and keep a copy in ``self.input_dict``.

        ``self.input_dict`` is reused by :meth:`mutate_params` to build a ``LightGBMModel``.
        """
        super().__init__(context=context,
                         unfitted_pipeline_path=unfitted_pipeline_path,
                         transformed_features=transformed_features,
                         original_user_cols=original_user_cols,
                         date_format_strings=date_format_strings,
                         **kwargs)

        self.input_dict = dict(context=context,
                               unfitted_pipeline_path=unfitted_pipeline_path,
                               transformed_features=transformed_features,
                               original_user_cols=original_user_cols,
                               date_format_strings=date_format_strings,
                               **kwargs)

    @staticmethod
    def is_enabled():
        return not (arch_type == "ppc64le")

    @staticmethod
    def do_acceptance_test():
        return True

    @staticmethod
    def acceptance_test_timeout():
        return 20.0

    @property
    def has_pred_contribs(self):
        return True

    @property
    def has_output_margin(self):
        return True

    _modules_needed_by_name = ['catboost==0.26.1']

    def set_default_params(self, accuracy=10, time_tolerance=10,
                           interpretability=1, **kwargs):
        """Populate ``self.params`` with defaults, then fill gaps from :meth:`mutate_params`.

        Values already chosen here win over values produced by the mutation pass;
        the mutation pass only contributes keys that are still missing.
        """
        # https://catboost.ai/docs/concepts/python-reference_parameters-list.html
        # https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html
        # optimize for final model as transcribed from best lightgbm model
        n_estimators = self.params_base.get('n_estimators', 100)
        learning_rate = self.params_base.get('learning_rate', config.min_learning_rate)
        early_stopping_rounds_default = min(500, max(1, int(n_estimators / 4)))
        early_stopping_rounds = self.params_base.get('early_stopping_rounds', early_stopping_rounds_default)
        self.params = {'bootstrap_type': 'Bayesian',
                       'n_estimators': n_estimators,
                       'learning_rate': learning_rate,
                       'early_stopping_rounds': early_stopping_rounds,
                       'max_depth': 8,
                       'grow_policy': 'depthwise',
                       }

        dummy = kwargs.get('dummy', False)
        ensemble_level = kwargs.get('ensemble_level', 0)
        train_shape = kwargs.get('train_shape', (1, 1))
        valid_shape = kwargs.get('valid_shape', (1, 1))
        self.get_gbm_main_params_evolution(params=self.params,
                                           dummy=dummy,
                                           accuracy=accuracy,
                                           num_classes=self.num_classes,
                                           ensemble_level=ensemble_level,
                                           train_shape=train_shape,
                                           valid_shape=valid_shape)

        for k in kwargs:
            if k in self.params:
                self.params[k] = copy.deepcopy(kwargs[k])

        # self.params['has_time'] # should use this if TS problem

        if self._can_handle_categorical:
            # less than 2 is risky, can get stuck in learning
            max_cat_to_onehot_list = [4, 10, 20, 40, config.max_int_as_cat_uniques]
            self.params['one_hot_max_size'] = MainModel.get_one(max_cat_to_onehot_list, get_best=True)
            uses_gpus, n_gpus = self.get_uses_gpus(self.params)
            if uses_gpus:
                self.params['one_hot_max_size'] = min(self.params['one_hot_max_size'], 255)
            else:
                self.params['one_hot_max_size'] = min(self.params['one_hot_max_size'], 65535)

        self.params['learning_rate'] = max(self._min_learning_rate_catboost, self.params['learning_rate'])

        # fill mutatable params with best for left over if default didn't fill
        params = copy.deepcopy(self.params)
        self.mutate_params(accuracy=accuracy, time_tolerance=time_tolerance, interpretability=interpretability,
                           get_best=True, **kwargs)
        params_from_mutate = copy.deepcopy(self.params)
        for k in params_from_mutate:
            if k not in params:
                params[k] = params_from_mutate[k]
        self.params = copy.deepcopy(params)

    def mutate_params(self, **kwargs):
        """Mutate ``self.params`` in place, first via a throw-away ``LightGBMModel``, then CatBoost-specific choices.

        ``get_best`` and ``trial`` are read from ``kwargs`` (``None`` falls back to
        ``True`` / ``False`` respectively) and passed to ``MainModel.get_one``.
        """
        fake_lgbm_model = LightGBMModel(**self.input_dict)
        fake_lgbm_model.params = self.params.copy()
        fake_lgbm_model.params_base = self.params_base.copy()
        for k, v in fake_lgbm_model.params_base.items():
            if k in fake_lgbm_model.params:
                fake_lgbm_model.params[k] = fake_lgbm_model.params_base[k]
        kwargs['train_shape'] = kwargs.get('train_shape', (10000, 500))
        kwargs['from_catboost'] = True
        fake_lgbm_model.mutate_params(**kwargs)
        self.params.update(fake_lgbm_model.params)
        fake_lgbm_model.transcribe_params(params=self.params, **kwargs)
        self.params.update(fake_lgbm_model.lightgbm_params)

        get_best = kwargs.get('get_best', True)
        if get_best is None:
            get_best = True
        trial = kwargs.get('trial', False)
        if trial is None:
            trial = False

        # see what else can mutate, need to know things don't want to preserve
        uses_gpus, n_gpus = self.get_uses_gpus(self.params)
        if not uses_gpus:
            colsample_bylevel_list = [0.3, 0.5, 0.9, 1.0]
            self.params['colsample_bylevel'] = MainModel.get_one(colsample_bylevel_list, get_best=get_best,
                                                                 best_type="first", name="colsample_bylevel",
                                                                 trial=trial)

        if not (uses_gpus and self.num_classes > 2):
            boosting_type_list = ['Plain', 'Ordered']
            self.params['boosting_type'] = MainModel.get_one(boosting_type_list, get_best=get_best,
                                                             best_type="first", name="boosting_type",
                                                             trial=trial)

        if self._can_handle_categorical:
            max_cat_to_onehot_list = [4, 10, 20, 40, config.max_int_as_cat_uniques]
            if uses_gpus:
                max_one_hot_max_size = 255
            else:
                max_one_hot_max_size = 65535
            max_cat_to_onehot_list = sorted(set([min(x, max_one_hot_max_size) for x in max_cat_to_onehot_list]))
            log = True if max(max_cat_to_onehot_list) > 1000 else False
            self.params['one_hot_max_size'] = MainModel.get_one(max_cat_to_onehot_list, get_best=get_best,
                                                                best_type="max", name="one_hot_max_size",
                                                                trial=trial, log=log)

        if not uses_gpus:
            sampling_frequency_list = ['PerTree', 'PerTreeLevel', 'PerTreeLevel', 'PerTreeLevel']
            self.params['sampling_frequency'] = MainModel.get_one(sampling_frequency_list, get_best=get_best,
                                                                  best_type="first", name="sampling_frequency",
                                                                  trial=trial)

        bootstrap_type_list = ['Bayesian', 'Bayesian', 'Bayesian', 'Bayesian', 'Bernoulli', 'MVS', 'Poisson', 'No']
        if not uses_gpus:
            bootstrap_type_list.remove('Poisson')
        if uses_gpus:
            bootstrap_type_list.remove('MVS')  # undocumented CPU only
        self.params['bootstrap_type'] = MainModel.get_one(bootstrap_type_list, get_best=get_best,
                                                          best_type="first", name="bootstrap_type",
                                                          trial=trial)

        # lgbm usage already sets subsample
        # if self.params['bootstrap_type'] in ['Poisson', 'Bernoulli']:
        #    subsample_list = [0.5, 0.66, 0.66, 0.9]
        #    # will get pop'ed if not Poisson/Bernoulli
        #    self.params['subsample'] = MainModel.get_one(subsample_list, get_best=get_best, best_type="first", name="subsample", trial=trial)

        if self.params['bootstrap_type'] in ['Bayesian']:
            bagging_temperature_list = [0.0, 0.1, 0.5, 0.9, 1.0]
            self.params['bagging_temperature'] = MainModel.get_one(bagging_temperature_list, get_best=get_best,
                                                                   best_type="first", name="bagging_temperature",
                                                                   trial=trial)

        # overfit protection different sometimes compared to early_stopping_rounds
        # self.params['od_type']
        # self.params['od_pval']
        # self.params['od_wait']
        self.params['learning_rate'] = max(config.min_learning_rate,
                                           max(self._min_learning_rate_catboost, self.params['learning_rate']))

    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        """Fit a CatBoost regressor (``num_classes == 1``) or classifier and store the model properties.

        Args:
            X: ``dt.Frame`` or an object with ``.columns`` / ``.select_dtypes``.
            y: target; label-encoded against ``self.labels`` for classification.
            sample_weight: optional row weights passed to the fit call.
            eval_set: optional ``[(valid_X, valid_y)]``; only the first pair is used.
            sample_weight_eval_set: accepted for interface compatibility, unused here.
            **kwargs: forwarded to :meth:`transcribe_params`; ``task`` is read in the GUI test path.

        Raises:
            IgnoreEntirelyError: when CatBoost reports that all features are constant or ignored.
        """
        logger = None
        if self._make_logger:
            # Example use of logger, with required import of:
            #  from h2oaicore.systemutils import make_experiment_logger, loggerinfo
            # Can use loggerwarning, loggererror, etc. for different levels
            if self.context and self.context.experiment_id:
                logger = make_experiment_logger(experiment_id=self.context.experiment_id,
                                                tmp_dir=self.context.tmp_dir,
                                                experiment_tmp_dir=self.context.experiment_tmp_dir)

        if self._show_logger_test:
            loggerinfo(logger, "TestLOGGER: Fit CatBoost")

        if self._show_task_test:
            # Example task sync operations
            # Attribute name must match the one assigned below, else the counter resets on every fit.
            if hasattr(self, 'test_count'):
                self.test_count += 1
            else:
                self.test_count = 0

            # The below generates a message in the GUI notifications panel
            if self.test_count == 0 and self.context and self.context.experiment_id:
                warning = "TestWarning: First CatBoost fit for this model instance"
                loggerwarning(logger, warning)
                task = kwargs.get('task')
                if task:
                    task.sync(key=self.context.experiment_id, progress=dict(type='warning', data=warning))
                    task.flush()

            # The below generates a message in the GUI top-middle panel above the progress wheel
            if self.test_count == 0 and self.context and self.context.experiment_id:
                message = "Tuning CatBoost"
                loggerinfo(logger, message)
                task = kwargs.get('task')
                if task:
                    task.sync(key=self.context.experiment_id, progress=dict(type='update', message=message))
                    task.flush()

        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType

        # label encode target and setup type of problem
        lb = LabelEncoder()
        if self.num_classes >= 2:
            lb.fit(self.labels)
            y = lb.transform(y)
            if eval_set is not None:
                valid_X = eval_set[0][0]
                valid_y = eval_set[0][1]
                valid_y = lb.transform(valid_y)
                eval_set = [(valid_X, valid_y)]
            self.params.update({'objective': 'Logloss'})
        if self.num_classes > 2:
            self.params.update({'objective': 'MultiClass'})

        if isinstance(X, dt.Frame):
            orig_cols = list(X.names)
            numeric_cols = list(X[:, [bool, int, float]].names)
        else:
            orig_cols = list(X.columns)
            numeric_cols = list(X.select_dtypes([np.number]).columns)

        # unlike lightgbm that needs label encoded categoricals, catboost can take raw strings etc.
        self.params['cat_features'] = [i for i, x in enumerate(orig_cols) if
                                       'CatOrig:' in x or 'Cat:' in x or x not in numeric_cols]

        # NOTE(review): get_uses_gpus() returns a (uses_gpus, n_gpus) tuple, which is always truthy,
        # so this branch never executes as written.  It is deliberately left unreachable: the line
        # marked below is a bare tuple expression rather than an assignment, so `constraints` would be
        # undefined if the guard were corrected.  TODO: confirm what set_monotone_constraints() returns,
        # then unpack uses_gpus from the call and assign constraints properly.
        if not self.get_uses_gpus(self.params):
            # monotonicity constraints not available for GPU for catboost
            # get names of columns in same order
            X_names = list(dt.Frame(X).names)
            X_numeric = self.get_X_ordered_numerics(X)
            X_numeric_names = list(X_numeric.names)
            _, _, constraints, self.set_monotone_constraints(X=X_numeric, y=y)  # NOTE(review): not an assignment
            # if non-numerics, then fix those to have 0 constraint
            self.params['monotone_constraints'] = [0] * len(X_names)
            colnumi = 0
            for coli, coln in enumerate(X_names):
                if coln in X_numeric_names:
                    self.params['monotone_constraints'][coli] = constraints[colnumi]
                    colnumi += 1

        if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
            # dt -> catboost internally using buffer leaks, so convert here
            # assume predict is after pipeline collection or in subprocess so needs no protection
            X = X.to_numpy()  # don't assign back to X so don't damage during predict
            X = np.ascontiguousarray(X, dtype=np.float32 if config.data_precision == "float32" else np.float64)
            if eval_set is not None:
                valid_X = eval_set[0][0].to_numpy()  # don't assign back to X so don't damage during predict
                valid_X = np.ascontiguousarray(valid_X,
                                               dtype=np.float32 if config.data_precision == "float32" else np.float64)
                valid_y = eval_set[0][1]
                eval_set = [(valid_X, valid_y)]

        if eval_set is not None:
            valid_X_shape = eval_set[0][0].shape
        else:
            valid_X_shape = None

        X, eval_set = self.process_cats(X, eval_set, orig_cols)

        # modify self.params_base['gpu_id'] based upon actually-available GPU based upon training and valid shapes
        self.acquire_gpus_function(train_shape=X.shape, valid_shape=valid_X_shape)

        params = copy.deepcopy(self.params)  # keep separate, since then can be pulled from lightgbm params
        params = self.transcribe_params(params=params, **kwargs)

        if logger is not None:
            loggerdata(logger,
                       "CatBoost parameters: params_base : %s params: %s catboost_params: %s" % (
                           str(self.params_base), str(self.params), str(params)))

        if self.num_classes == 1:
            self.model = CatBoostRegressor(**params)
        else:
            self.model = CatBoostClassifier(**params)
        # Hit sometimes: Exception: catboost/libs/data_new/quantization.cpp:779: All features are either constant or ignored.
        if self.num_classes == 1:
            # assume not mae, which would use median
            # baseline = [np.mean(y)] * len(y)
            baseline = None
        else:
            baseline = None

        kwargs_fit = dict(baseline=baseline, eval_set=eval_set)
        pickle_path = None
        if config.debug_daimodel_level >= 2:
            self.uuid = str(uuid.uuid4())[:6]
            pickle_path = os.path.join(exp_dir(), "catboost%s.tmp.pickle" % self.uuid)
            save_obj((self.model, X, y, sample_weight, kwargs_fit), pickle_path)

        # FIT (with migration safety before hyperopt/Optuna function added)
        try:
            if hasattr(self, 'dask_or_hyper_or_normal_fit'):
                self.dask_or_hyper_or_normal_fit(X, y, sample_weight=sample_weight, kwargs=kwargs, **kwargs_fit)
            else:
                self.model.fit(X, y, sample_weight=sample_weight, **kwargs_fit)
        except Exception as e:
            if "All features are either constant or ignored" in str(e):
                raise IgnoreEntirelyError(str(e)) from e
            raise

        # only clean up when a debug pickle was actually written
        if pickle_path is not None and config.debug_daimodel_level <= 2:
            remove(pickle_path)

        # https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html
        # need to move to wrapper
        if self.model.get_best_iteration() is not None:
            iterations = self.model.get_best_iteration() + 1
        else:
            iterations = self.params['n_estimators']
        # must always set best_iterations
        self.model_path = None
        importances = copy.deepcopy(self.model.feature_importances_)
        if not self._save_by_pickle:
            self.uuid = str(uuid.uuid4())[:6]
            model_file = "catboost_%s.bin" % str(self.uuid)
            self.model_path = os.path.join(self.context.experiment_tmp_dir, model_file)
            self.model.save_model(self.model_path)
            with open(self.model_path, mode='rb') as f:
                model = f.read()
        else:
            model = self.model
        self.set_model_properties(model=model,  # overwrites self.model object with bytes if not using pickle
                                  features=orig_cols,
                                  importances=importances,
                                  iterations=iterations)

    def process_cats(self, X, eval_set, orig_cols):
        """Convert categorical columns to the dtype CatBoost needs and return ``(X, eval_set)``.

        When ``self.params['cat_features']`` is non-empty, ``X`` (and the first
        eval pair, if any) is converted with ``to_pandas()``; columns whose name
        contains ``'Cat:'`` (but not ``'CatOrig:'``) become ``int`` with
        non-finite values replaced by 0, all other categorical columns become ``str``.
        """
        # ensure catboost treats as cat by making str
        if len(self.params['cat_features']) > 0:
            X = X.to_pandas()
            if eval_set is not None:
                valid_X = eval_set[0][0]
                valid_y = eval_set[0][1]
                valid_X = valid_X.to_pandas()
                eval_set = [(valid_X, valid_y)]
            for coli in self.params['cat_features']:
                col = orig_cols[coli]
                if 'CatOrig:' in col:
                    cattype = str
                    # must be string for catboost
                elif 'Cat:' in col:
                    cattype = int
                else:
                    cattype = str  # if was marked as non-numeric, must become string (e.g. for leakage/shift)
                if cattype is not None:
                    if cattype == int:
                        # otherwise would hit: ValueError: Cannot convert non-finite values (NA or inf) to integer
                        X[col] = X[col].replace([np.inf, -np.inf], np.nan)
                        X[col] = X[col].fillna(value=0)
                    X[col] = X[col].astype(cattype)
                    if eval_set is not None:
                        valid_X = eval_set[0][0]
                        valid_y = eval_set[0][1]
                        if cattype == int:
                            # otherwise would hit: ValueError: Cannot convert non-finite values (NA or inf) to integer
                            valid_X[col] = valid_X[col].replace([np.inf, -np.inf], np.nan)
                            valid_X[col] = valid_X[col].fillna(value=0)
                        valid_X[col] = valid_X[col].astype(cattype)
                        eval_set = [(valid_X, valid_y)]
        return X, eval_set

    def predict(self, X, y=None, **kwargs):
        """Predict with the fitted model.

        Keyword arguments read from ``kwargs``:
            pred_contribs: return Shapley values rescaled to sum to the raw prediction.
            output_margin: return ``RawFormulaVal`` predictions.
            fast_approx: cap the number of trees at ``config.fast_approx_num_trees``
                and use the "Approximate" Shapley calculation.

        For binary problems only column 1 of the prediction is returned.  Multiclass
        Shapley values are flattened to ``(rows, classes * (features + 1))``.

        Raises:
            RuntimeError: if no prediction mode matches (should be unreachable).
        """
        model, features, importances, iterations = self.get_model_properties()
        if not self._save_by_pickle:
            from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType
            if self.num_classes >= 2:
                from_file = CatBoostClassifier()
            else:
                from_file = CatBoostRegressor()
            with open(self.model_path, mode='wb') as f:
                f.write(model)
            model = from_file.load_model(self.model_path)

        # FIXME: Do equivalent throttling of predict size like def _predict_internal(self, X, **kwargs), wrap-up.
        if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
            # dt -> lightgbm internally using buffer leaks, so convert here
            # assume predict is after pipeline collection or in subprocess so needs no protection
            X = X.to_numpy()  # don't assign back to X so don't damage during predict
            X = np.ascontiguousarray(X, dtype=np.float32 if config.data_precision == "float32" else np.float64)

        X, _ = self.process_cats(X, None, self.feature_names_fitted)

        pred_contribs = kwargs.get('pred_contribs', False)
        output_margin = kwargs.get('output_margin', False)
        fast_approx = kwargs.pop('fast_approx', False)
        if fast_approx:
            iterations = min(config.fast_approx_num_trees, iterations)

        # implicit import
        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType, Pool
        n_jobs = max(1, physical_cores_count)
        if not pred_contribs and not output_margin:
            if self.num_classes >= 2:
                preds = model.predict_proba(
                    X,
                    ntree_start=0,
                    ntree_end=iterations,  # index of first tree *not* to be used
                    thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
                )

                if preds.shape[1] == 2:
                    return preds[:, 1]
                else:
                    return preds
            else:
                return model.predict(
                    X,
                    ntree_start=0,
                    ntree_end=iterations,  # index of first tree *not* to be used
                    thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
                )
        elif output_margin:
            # uses "predict" for raw for any class
            preds = model.predict(
                X,
                prediction_type="RawFormulaVal",
                ntree_start=0,
                ntree_end=iterations,  # index of first tree *not* to be used
                thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
            )
            if len(preds.shape) > 1 and preds.shape[1] == 2 and self.num_classes == 2:
                return preds[:, 1]
            else:
                return preds
        elif pred_contribs:
            # For Shapley, doesn't come from predict
            # For regression/binary, shap is shape of (rows, features + bias)
            # for multiclass, shap is shape of (rows, classes, features + bias)
            data = Pool(X, label=y, cat_features=self.params['cat_features'])
            if fast_approx:
                # https://github.com/catboost/catboost/issues/1146
                # https://github.com/catboost/catboost/issues/1535
                # can't specify trees, but they have approx version
                # Regular, Exact, or Approximate
                shap_calc_type = "Approximate"
            else:
                shap_calc_type = "Regular"
            # See also shap_mode
            # help(CatBoostClassifier.get_feature_importance)
            print_debug("shap_calc_type: %s" % shap_calc_type)

            pickle_path = None
            if config.debug_daimodel_level >= 2:
                self.uuid = str(uuid.uuid4())[:6]
                pickle_path = os.path.join(exp_dir(), "catboost_shappredict%s.tmp.pickle" % self.uuid)
                model.save_model(os.path.join(exp_dir(), "catshapproblem%s.catboost.model" % self.uuid))
                # save_obj((self, self.model, model, X, y, kwargs, shap_calc_type, self.params['cat_features']), pickle_path)
                save_obj((model, X, y, kwargs, shap_calc_type, self.params['cat_features']), pickle_path)

            preds_shap = model.get_feature_importance(
                data=data,
                thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported,
                type=EFstrType.ShapValues,
                shap_calc_type=shap_calc_type,
            )
            # repair broken shap sum: https://github.com/catboost/catboost/issues/1125
            print_debug("shap_fix")
            preds_raw = model.predict(
                X,
                prediction_type="RawFormulaVal",
                ntree_start=0,
                ntree_end=iterations,  # index of first tree *not* to be used
                thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
            )
            if self.num_classes <= 2:
                axis = 1
            else:
                axis = 2
            orig_sum = np.sum(preds_shap, axis=axis)
            print_debug("shap_fix2")
            # avoid division by 0, need different trick, e.g. change baseline, to fix that case
            if axis == 1:
                orig_sum[orig_sum[:] == 0.0] = 1.0
                preds_shap = preds_shap * preds_raw[:, None] / orig_sum[:, None]
            else:
                # each feature and each class must sum up
                orig_sum[orig_sum[:, :] == 0.0] = 1.0
                preds_shap = preds_shap * preds_raw[:, :, None] / orig_sum[:, :, None]

            if config.hard_asserts and config.debug_daimodel_level >= 2:
                print_debug("shap_check")
                model.save_model(os.path.join(exp_dir(), "catshapproblem"))
                # context manager so the debug dump does not leak a file handle
                with open(os.path.join(exp_dir(), "catshapproblem.pkl"), "wb") as f:
                    pickle.dump((X, y, self.params['cat_features']), f)
                preds_raw = model.predict(
                    X,
                    prediction_type="RawFormulaVal",
                    ntree_start=0,
                    ntree_end=iterations,  # index of first tree *not* to be used
                    thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
                )

                assert np.isclose(preds_raw, np.sum(preds_shap, axis=axis)).all(), \
                    "catboost shapley does not sum up correctly"

            # only clean up when a debug pickle was actually written
            if pickle_path is not None and config.debug_daimodel_level <= 2:
                remove(pickle_path)

            if axis == 1:
                return preds_shap
            else:
                # DAI expects (shape rows) * (classes x (features + 1)) with "columns" as blocks of
                # feature_0_class_0 feature_0_class_0 ... feature_0_class_1 feature_1_class_1 ...
                return preds_shap.reshape(preds_shap.shape[0], preds_shap.shape[1] * preds_shap.shape[2])
        else:
            raise RuntimeError("No such case")

    def transcribe_params(self, params=None, **kwargs):
        """Translate LightGBM/XGBoost-style parameters into a CatBoost constructor dict.

        Works on a shallow copy of ``params`` (``self.params`` when ``None``), drops
        every key that is not a constructor argument of the relevant CatBoost
        estimator, maps objective / metric names, applies CPU/GPU-specific limits
        and returns the new dict.  ``kwargs`` is only inspected through
        ``self.have_eval_set``.
        """
        if params is None:
            params = self.params  # reference
        params = params.copy()  # don't contaminate DAI params, since we know we use lgbm-xgb as base

        has_eval_set = self.have_eval_set(kwargs)  # only needs (and does) operate at fit-time
        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType

        estimator_cls = CatBoostRegressor if self.num_classes == 1 else CatBoostClassifier
        # Every named constructor argument is allowed.  (Zipping args with defaults, as done
        # previously, paired 'self' with the first default and dropped the last argument.)
        allowed_params = set(inspect.getfullargspec(estimator_cls).args) - {'self'}

        for k in list(params):
            if k not in allowed_params:
                del params[k]

        # now transcribe
        k = 'boosting_type'
        if k in params:
            params[k] = 'Plain'

        k = 'grow_policy'
        if k in params:
            params[k] = 'Depthwise' if params[k] == 'depthwise' else 'Lossguide'

        k = 'eval_metric'
        if k in params and params[k] is not None and params[k].upper() == 'AUC':
            params[k] = 'AUC'

        # lightgbm name -> catboost name
        name_map = {'regression': 'RMSE',
                    'mse': 'RMSE',
                    'mae': 'MAE',
                    "mape": 'MAPE',
                    "huber": 'Huber',
                    "fair": 'FairLoss',
                    "rmse": "RMSE",
                    "gamma": "RMSE",  # unsupported by catboost
                    "tweedie": "Tweedie",
                    "poisson": "Poisson",
                    "quantile": "Quantile",
                    'binary': 'Logloss',
                    'auc': 'AUC',
                    "xentropy": 'CrossEntropy',
                    'multiclass': 'MultiClass'}

        k = 'objective'
        if k in params and params[k] in name_map:
            params[k] = name_map[params[k]]

        k = 'eval_metric'
        if k in params and params[k] is not None and params[k] in name_map:
            params[k] = name_map[params[k]]

        if 'objective' in params:
            # don't randomly choose these since then model not stable GA -> final
            # but backup shouldn't really be used AFAIK
            if params['objective'] == 'Huber':
                backup = float(config.huber_alpha_list[0])
                params['delta'] = params.pop('alpha', backup)
            if params['objective'] == 'Quantile':
                backup = float(config.quantile_alpha[0])
                params['delta'] = params.pop('alpha', backup)
            if params['objective'] == 'Tweedie':
                backup = float(config.tweedie_variance_power_list[0])
                params['tweedie_variance_power'] = params.pop('tweedie_variance_power', backup)
            if params['objective'] == 'FairLoss':
                backup = float(config.fair_c_list[0])
                params['smoothness'] = params.pop('fair_c', backup)

        params.pop('verbose', None)
        params.pop('verbose_eval', None)
        params.pop('logging_level', None)

        if 'grow_policy' in params:
            if params['grow_policy'] == 'Lossguide':
                params.pop('max_depth', None)
            if params['grow_policy'] == 'Depthwise':
                params.pop('num_leaves', None)
        else:
            params['grow_policy'] = 'SymmetricTree'

        uses_gpus, n_gpus = self.get_uses_gpus(params)

        if params['task_type'] == 'CPU':
            params.pop('grow_policy', None)
            params.pop('num_leaves', None)
            params.pop('max_leaves', None)
            params.pop('min_data_in_leaf', None)
            params.pop('min_child_samples', None)

        if params['task_type'] == 'GPU':
            params.pop('colsample_bylevel', None)  # : 0.35

        if 'grow_policy' in params and params['grow_policy'] in ['Depthwise', 'SymmetricTree']:
            if 'max_depth' in params and params['max_depth'] in [0, -1]:
                params['max_depth'] = max(2, int(np.log(params.get('num_leaves', 2 ** 6))))
        else:
            params.pop('max_depth', None)
            params.pop('depth', None)
        if 'grow_policy' in params and params['grow_policy'] == 'Lossguide':
            # if 'num_leaves' in params and params['num_leaves'] == -1:
            #    params['num_leaves'] = 2 ** params.get('max_depth', 6)
            if 'max_leaves' in params and params['max_leaves'] in [0, -1]:
                params['max_leaves'] = 2 ** params.get('max_depth', 6)
        else:
            params.pop('max_leaves', None)
        if 'num_leaves' in params and 'max_leaves' in params:
            params.pop('num_leaves', None)
        # apply limits
        if 'max_leaves' in params:
            params['max_leaves'] = min(params['max_leaves'], 65536)
        if 'max_depth' in params:
            params['max_depth'] = min(params['max_depth'], 16)

        params.update({'train_dir': user_dir(),
                       'allow_writing_files': False,
                       'thread_count': self.params_base.get('n_jobs', 4)})

        if 'reg_lambda' in params and params['reg_lambda'] <= 0.0:
            params['reg_lambda'] = 3.0  # assume meant unset

        if self._can_handle_categorical:
            if 'max_cat_to_onehot' in params:
                params['one_hot_max_size'] = params['max_cat_to_onehot']
                params.pop('max_cat_to_onehot', None)
            if uses_gpus:
                params['one_hot_max_size'] = min(params.get('one_hot_max_size', 255), 255)
            else:
                params['one_hot_max_size'] = min(params.get('one_hot_max_size', 65535), 65535)

        if 'one_hot_max_size' in params:
            params['one_hot_max_size'] = max(self._min_one_hot_max_size, params['one_hot_max_size'])

        params['max_bin'] = params.get('max_bin', 254)
        if params['task_type'] == 'CPU':
            params['max_bin'] = min(params['max_bin'], 254)  # https://github.com/catboost/catboost/issues/1010
        if params['task_type'] == 'GPU':
            params['max_bin'] = min(params['max_bin'], 127)  # https://github.com/catboost/catboost/issues/1010

        if uses_gpus:
            # https://catboost.ai/docs/features/training-on-gpu.html
            params['devices'] = "%d-%d" % (
                self.params_base.get('gpu_id', 0), self.params_base.get('gpu_id', 0) + n_gpus - 1)
            # params['gpu_ram_part'] = 0.3  # per-GPU, assumes GPU locking or no other experiments running

        if self.num_classes > 2:
            params.pop("eval_metric", None)

        params['train_dir'] = self.context.experiment_tmp_dir
        params['allow_writing_files'] = False

        # assume during fit self.params_base could have been updated
        assert 'n_estimators' in params
        assert 'learning_rate' in params
        params['n_estimators'] = self.params_base.get('n_estimators', 100)
        params['learning_rate'] = self.params_base.get('learning_rate', config.min_learning_rate)
        params['learning_rate'] = min(params['learning_rate'], 0.5)  # 1.0 leads to illegal access on GPUs
        params['learning_rate'] = max(config.min_learning_rate,
                                      max(self._min_learning_rate_catboost, params['learning_rate']))
        if 'early_stopping_rounds' not in params and has_eval_set:
            params['early_stopping_rounds'] = 150  # temp fix
            # assert 'early_stopping_rounds' in params

        if uses_gpus:
            params.pop('sampling_frequency', None)

        # bootstrap_type is treated as optional (see the subsample check), so read it with .get()
        # throughout instead of risking a KeyError on the direct lookups.
        if not uses_gpus and params.get('bootstrap_type') == 'Poisson':
            params['bootstrap_type'] = 'Bayesian'  # revert to default
        if uses_gpus and params.get('bootstrap_type') == 'MVS':
            params['bootstrap_type'] = 'Bayesian'  # revert to default

        if params.get('bootstrap_type') not in ['Poisson', 'Bernoulli']:
            params.pop('subsample', None)  # only allowed for those 2 bootstrap_type settings

        if params.get('bootstrap_type') not in ['Bayesian']:
            params.pop('bagging_temperature', None)

        if not (self.num_classes == 2 and params.get('objective') == 'Logloss'):
            params.pop('scale_pos_weight', None)

        # go back to some default eval_metric
        if self.num_classes == 1:
            if 'eval_metric' not in params or params['eval_metric'] not in ['MAE', 'MAPE', 'Poisson', 'Quantile',
                                                                            'RMSE', 'LogLinQuantile', 'Lq',
                                                                            'Huber', 'Expectile', 'FairLoss',
                                                                            'NumErrors', 'SMAPE', 'R2', 'MSLE',
                                                                            'MedianAbsoluteError']:
                params['eval_metric'] = 'RMSE'
        elif self.num_classes == 2:
            if 'eval_metric' not in params or params['eval_metric'] not in ['Logloss', 'CrossEntropy', 'Precision',
                                                                            'Recall', 'F1', 'BalancedAccuracy',
                                                                            'BalancedErrorRate', 'MCC', 'Accuracy',
                                                                            'CtrFactor', 'AUC',
                                                                            'NormalizedGini', 'BrierScore',
                                                                            'HingeLoss',
                                                                            'HammingLoss', 'ZeroOneLoss',
                                                                            'Kappa', 'WKappa',
                                                                            'LogLikelihoodOfPrediction']:
                params['eval_metric'] = 'Logloss'
        else:
            if 'eval_metric' not in params or params['eval_metric'] not in ['MultiClass', 'MultiClassOneVsAll',
                                                                            'Precision', 'Recall', 'F1', 'TotalF1',
                                                                            'MCC', 'Accuracy', 'HingeLoss',
                                                                            'HammingLoss', 'ZeroOneLoss', 'Kappa',
                                                                            'WKappa', 'AUC']:
                params['eval_metric'] = 'MultiClass'

        # set system stuff here
        params['silent'] = self.params_base.get('silent', True)
        if config.debug_daimodel_level >= 1:
            params['silent'] = False  # Can enable for tracking improvement in console/dai.log if have access
        params['random_state'] = self.params_base.get('random_state', 1234)
        params['thread_count'] = self.params_base.get('n_jobs', max(1, physical_cores_count))  # -1 is not supported

        return params

    def get_uses_gpus(self, params):
        """Set ``params['task_type']`` and return ``(uses_gpus, n_gpus)``.

        ``n_gpus`` comes from ``self.params_base['n_gpus']`` (default 0); ``-1`` is
        replaced by ``ngpus_vis`` and ``_force_gpu`` forces one GPU.  Note that this
        method mutates ``params`` and returns a tuple, so it must be unpacked
        rather than used directly in a boolean test.
        """
        params['task_type'] = 'CPU' if self.params_base.get('n_gpus', 0) == 0 else 'GPU'
        if self._force_gpu:
            params['task_type'] = 'GPU'

        n_gpus = self.params_base.get('n_gpus', 0)
        if self._force_gpu:
            n_gpus = 1
        if n_gpus == -1:
            n_gpus = ngpus_vis
        uses_gpus = params['task_type'] == 'GPU' and n_gpus > 0
        return uses_gpus, n_gpus
logging_level='Verbose') model.fit(X_train, y_train, eval_set=(X_eval, y_eval), cat_features=categorical_features_indices, plot=True) # 输出各特征重要度 feature_names = X_train.columns print( pd.DataFrame({ 'column': feature_names, 'importance': model.get_feature_importance(), }).sort_values(by='importance', ascending=False)) print(model.get_best_iteration()) # 训练集和验证集的预测 y_train_prob = model.predict_proba(X_train, ntree_end=model.get_best_iteration())[:, 1] y_eval_prob = model.predict_proba(X_eval, ntree_end=model.get_best_iteration())[:, 1] # AUC指标 print('AUC') print(roc_auc_score(y_train, y_train_prob)) print(roc_auc_score(y_eval, y_eval_prob)) # 本题采用的指标 print('本题采用的指标') print(cal_metric(y_train, y_train_prob))
# Precision-monitored binary classifier; the best model seen on the eval set is kept.
model = CatBoostClassifier(
    iterations=1000000,
    depth=10,
    # l2_leaf_reg = 10,
    # border_count=254,
    verbose=True,
    use_best_model=True,
    scale_pos_weight=scale_pos_weight,
    eval_metric='Precision',
    thread_count=int(cpus / 2),
    loss_function='Logloss')

# Train the model on training data
model.fit(cattrain, eval_set=cattest, plot=False)

# The best iteration is fixed once training is done, so look it up a single time.
best_iteration = model.get_best_iteration()
print(best_iteration)

# Precision metrics, each read at the best iteration
train_precision = model.eval_metrics(cattrain, "Precision")
ptrain = train_precision.get("Precision")[best_iteration]
print("train precision", ptrain)

test_precision = model.eval_metrics(cattest, "Precision")
ptest = test_precision.get("Precision")[best_iteration]
print("test precision", ptest)

valid_precision = model.eval_metrics(catvalid, "Precision")
print("valid precision", valid_precision.get("Precision")[best_iteration])
def search_CatBoost_parameters(config: dict,
                               train_dataset: MusicDataset,
                               val_dataset: MusicDataset = None,
                               internal_cv=False):
    """Grid-search CatBoost iteration counts and learning rates.

    Each of ``config[_n_iterations_key]`` and ``config[_learning_rate_key]``
    may be either a ``[start, stop, step]`` list (expanded with ``np.arange``)
    or a single value (treated as a one-element grid).

    Args:
        config: Search configuration; also read for the optional
            ``"loss_function"`` entry (default ``"CrossEntropy"``).
        train_dataset: Dataset providing the training features and labels.
        val_dataset: Dataset providing the validation features and labels.
            Required when ``internal_cv`` is False.
        internal_cv: When False, fit one model per (iterations, learning
            rate) pair and score it on ``val_dataset``. When True, run
            CatBoost's 5-fold ``cv`` once per learning rate, using the
            largest iteration count of the grid as the budget.

    Returns:
        A tuple of
        - a list of the names of the parameters
        - a list of tried parameter configurations
        - a list of corresponding results (validation accuracy)

    Raises:
        ValueError: If ``internal_cv`` is False and ``val_dataset`` is None.
    """
    def _as_grid(spec):
        # A list is a [start, stop, step] range; anything else is one value.
        if isinstance(spec, list):
            return np.arange(spec[0], spec[1], spec[2])
        return np.atleast_1d(spec)

    if not internal_cv and val_dataset is None:
        # Previously this surfaced later as a NameError on X_val.
        raise ValueError(
            "val_dataset is required when internal_cv is False")

    # Get parameters
    iterations = _as_grid(config[_n_iterations_key])
    learning_rates = _as_grid(config[_learning_rate_key])
    loss_function = config.get("loss_function", "CrossEntropy")

    parameter_names = []
    parameter_sets = []
    results = []

    # Get data
    _, X_train, y_train = train_dataset.get_whole_dataset_as_pd()

    # GPU
    if torch.cuda.is_available():
        task_type = 'GPU'
        devices = str(torch.cuda.current_device())
    else:
        task_type = 'CPU'
        devices = None

    if not internal_cv:
        # No internal cross validation during training
        _, X_val, y_val = val_dataset.get_whole_dataset_as_pd()
        for it in iterations:
            for lr in learning_rates:
                model = CatBoostClassifier(iterations=it,
                                           learning_rate=lr,
                                           loss_function=loss_function,
                                           task_type=task_type,
                                           devices=devices,
                                           custom_metric=['Accuracy'])
                model.fit(X_train, y_train, eval_set=(X_val, y_val),
                          verbose=10)

                params = model.get_params()
                parameter_names = list(params.keys())
                parameter_sets.append(list(params.values()))

                best_score = model.get_best_score()
                results.append(best_score['validation']['Accuracy'])
                best_iter = model.get_best_iteration()
                print("Best iteration: " + str(best_iter))
    else:
        # Use catboost cross validation procedure. cv() reports the metric
        # for every iteration, so one run with the largest budget covers the
        # whole iteration grid; it needs a scalar, not an array.
        base_params = {
            'loss_function': loss_function,
            'iterations': int(np.max(iterations)),
            'custom_metric': 'Accuracy',
            'task_type': task_type,
            'devices': devices,
        }
        train_pool = Pool(X_train, label=y_train)

        for lr in learning_rates:
            cv_params = {**base_params, 'learning_rate': lr}
            cv_data = cv(params=cv_params,
                         pool=train_pool,
                         fold_count=5,
                         shuffle=True,
                         partition_random_seed=0,
                         plot=True,
                         stratified=False,
                         verbose=50)

            accuracy_curve = cv_data['test-Accuracy-mean']
            res_value = np.max(accuracy_curve)
            res_iter = np.argmax(accuracy_curve)
            print(
                f"Best iteration for lr {lr}: {res_iter} with val accuracy {res_value}"
            )

            # Record the best iteration next to the inputs without putting
            # it into the dict that is handed to cv().
            record = {**cv_params, 'best_iteration': res_iter}
            results.append(res_value)
            parameter_sets.append(list(record.values()))
            parameter_names = list(record.keys())

    return parameter_names, parameter_sets, results
if 'time' in input_columns_classify:
    cat_features.append('time')

# Fit the classifier on the training split, validating on the test split.
train_inputs = select_input_columns_classify(tr)
train_targets = select_output_columns_as_row_classify(tr)
validation_pair = (select_input_columns_classify(ts),
                   select_output_columns_as_row_classify(ts))
clas.fit(train_inputs,
         train_targets,
         eval_set=validation_pair,
         use_best_model=True,
         verbose=True,
         early_stopping_rounds=2000,
         cat_features=cat_features)

# Attach predictions for the test split and the out-of-sample frame.
ts['plabel'] = pd.Series(clas.predict(ts_cl_in))
oo['prediction'] = pd.Series(clas.predict(oo_cl_in))

# Run name: validation F1, a millisecond timestamp plus one random digit,
# and the best iterations of the regressor and the classifier.
f1_text = str(clas.best_score_['validation']['F1'])
stamp = str(int(time.time() * 1000)) + str(np.random.randint(0, 9))
iteration_tag = "_" + str(reg.get_best_iteration()) + "_" + str(
    clas.get_best_iteration())
fname = f1_text + "_" + stamp + iteration_tag

# Persist predictions, the evaluated test frame and the model under that name.
oo[['tripid', 'prediction']].to_csv("./results/" + fname + ".csv",
                                    index=False)
ts.to_csv("./tss/" + fname + ".csv", index=False)
clas.save_model("./models/" + fname)

# Append one line describing this run to the log file.
logged = fname + " " + "-".join(input_columns_classify) + iteration_tag + "\n"
with open("log.txt", "a") as log:
    log.write(logged)
print(fname)

# id = str(int(time.time() * 1000))
# def par():
#     global id
def train_1fold(self, fold, params, params_custom):
    """Train a CatBoost classifier on one fold and record its outputs.

    Fits the model on the fold's training split with the validation split as
    eval set, saves the model and its parameters under ``self.models_path``,
    and stores the evaluation curves, the predictions for the train / valid /
    test sets, the fold metrics and the feature importances on ``self``.

    Args:
        fold: Fold number; also added to ``params["random_seed"]`` when that
            seed is not None.
        params: Keyword arguments for ``CatBoostClassifier`` (deep-copied
            before modification).
        params_custom: Not used by this method.
    """
    X_train, X_valid, y_train, y_valid, X_test, vdx, tdx = self.get_fold_data(fold)

    # Positions of the non-numeric columns, passed to CatBoost as categorical.
    cat_feature_idx = [
        position for position, column in enumerate(X_train)
        if not is_numeric_dtype(X_train[column])
    ]

    if fold == 0:
        X_train.dtypes.to_csv(self.models_path + "/dtypes.csv")
    logger.info(f"X_train.shape = {X_train.shape}")

    # Offset the seed by the fold number so each fold gets its own seed.
    fold_params = copy.deepcopy(params)
    base_seed = fold_params["random_seed"]
    if base_seed is not None:
        fold_params["random_seed"] = base_seed + fold
        logger.info(f"Set catboost train random_seed = {fold_params['random_seed']}")

    model = CatBoostClassifier(**fold_params)
    model.fit(X_train, y_train,
              cat_features=cat_feature_idx,
              eval_set=(X_valid, y_valid))
    model.save_model(self.models_path + f'/model-catboost-f{fold:02d}.bin')
    util.dump_json(model.get_all_params(), self.models_path + "/params.json")

    # Per-iteration learning curves for this fold.
    evals = model.get_evals_result()
    learn_curves = evals["learn"]
    valid_curves = evals['validation']
    self.evals_df.append(pd.DataFrame({
        f"logloss_train_f{fold:02d}": learn_curves['Logloss'],
        f"accuracy_train_f{fold:02d}": learn_curves['Accuracy'],
        f"logloss_valid_f{fold:02d}": valid_curves['Logloss'],
        f"accuracy_valid_f{fold:02d}": valid_curves['Accuracy'],
    }))

    # Second-column probabilities for the validation, train and test sets.
    preds_valid = model.predict_proba(X_valid)[:, 1]
    logger.info(f"len(vdx)={len(vdx)} len(preds_valid)={len(preds_valid)}")
    self.preds_valid_all.loc[vdx, "pred"] = preds_valid

    preds_train = model.predict_proba(X_train)[:, 1]
    self.preds_train_all.append(pd.DataFrame({fold: preds_train}, index=tdx))

    preds_test = model.predict_proba(X_test)[:, 1]
    self.preds_test_all.append(preds_test)

    # Fold metrics: accuracy on rounded probabilities, log loss on raw ones.
    ms = [
        fold,
        accuracy_score(y_train, np.round(preds_train)),
        accuracy_score(y_valid, np.round(preds_valid)),
        log_loss(y_train, preds_train),
        log_loss(y_valid, preds_valid),
        model.get_best_iteration(),
    ]
    self.mets.append(ms)
    show_mets(*ms)

    for importance_type in ["FeatureImportance"]:
        imp = pd.Series(model.get_feature_importance(type=importance_type),
                        index=X_train.columns)
        imp.name = fold
        imp.index.name = "feature"
        self.importance[importance_type].append(imp)
def main(data_folder):
    """Train the second-level CatBoost model fold by fold and save predictions.

    Loads the first-level predictions and shapefiles from ``data_folder``,
    adds the extra features, then for every fold in the folds file trains a
    CatBoost classifier on the remaining folds, predicts the held-out fold
    and the test set, and finally reports the local loss and writes the
    final predictions.

    Args:
        data_folder: Directory that holds the input data, including
            ``8Kfolds_201910302225.csv``.
    """
    print('Models: ', MODELS_LIST)
    train_shapefile, test_shapefile, train_features_df, test_features_df = load_data(
        data_folder, MODELS_LIST)
    train_shapefile['roof_type_id'] = train_shapefile.roof_material.astype("category").cat.codes

    folds = pd.read_csv(f'{data_folder}/8Kfolds_201910302225.csv')
    folds = folds[folds.id.isin(train_shapefile.index.tolist())]
    additional_train_features_df, additional_test_features_df, train_shapefile, test_shapefile = \
        create_additional_features(folds, train_shapefile, test_shapefile)

    # Combine field level additional features and predictions from 1st level
    train_val_data = pd.concat((additional_train_features_df, train_features_df), axis=1)
    # Capture the feature columns before the target column is added.
    columns = train_val_data.columns
    train_val_data['roof_type_id'] = train_shapefile.roof_type_id
    print(train_val_data.head(3))

    test_data = pd.concat((additional_test_features_df, test_features_df), axis=1)
    print(test_data.head(3))

    all_valid_y = []
    all_valid_field_ids = []
    all_predicts = []
    all_valid_predicts = []
    for fold in sorted(folds.fold.unique()):
        train_field_ids_i = folds.loc[folds.fold != fold, 'id'].values
        val_field_ids_i = folds.loc[folds.fold == fold, 'id'].values

        X_train = train_val_data.loc[train_field_ids_i, columns].values
        X_valid = train_val_data.loc[val_field_ids_i, columns].values
        y_train = train_val_data.loc[train_field_ids_i, 'roof_type_id'].values
        y_valid = train_val_data.loc[val_field_ids_i, 'roof_type_id'].values

        model_cat = CatBoostClassifier(
            iterations=10000,
            random_seed=111,
            learning_rate=0.01,
            logging_level='Silent',
            task_type="GPU",
            devices='0',
            max_depth=5,
            l2_leaf_reg=3.0,
            bagging_temperature=1,
        )
        model_cat.fit(X_train, y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[0])

        best_iteration = model_cat.get_best_iteration()
        print(f"Fold - {fold} - {model_cat.get_best_score()['validation']['MultiClass']:.4}@{best_iteration}")

        # get_best_iteration() is a 0-based index and ntree_end is exclusive,
        # so best_iteration + 1 trees are needed to include the best tree.
        best_tree_count = best_iteration + 1
        ypred = model_cat.predict_proba(test_data, ntree_end=best_tree_count)
        y_valid_pred = model_cat.predict_proba(X_valid, ntree_end=best_tree_count)

        all_valid_field_ids.extend(val_field_ids_i)
        all_predicts.append(ypred)
        all_valid_y.extend(y_valid)
        all_valid_predicts.append(y_valid_pred)

    calc_local_loss(all_valid_predicts, all_valid_y)
    save_final_predictions(OUTPUT_NAME, all_predicts, test_shapefile,
                           all_valid_predicts, all_valid_field_ids)
def gridsearch_early_stopping(cv, X, y, folds, grid, cat_features=None, save=None):
    '''
    Perform grid search with early stopping across folds specified by index

    For every requested fold, one CatBoost classifier is fitted per parameter
    combination in ``grid``; the loss, accuracy and best iteration on the
    fold's test pool are collected into a results dataframe, which is
    displayed and optionally saved.

    Parameters
    -----------
    cv: cross validation
    X: DataFrame or Numpy array
    y: DataFrame or Numpy array
    folds: list of fold indexes
    grid: parameter grid
    cat_features: categorical feature spec passed to both the pools and the
        classifier (default=None)
    save: string, excluding file extension (default=None)
        when given, saves results_df for each fold as
        ``save + str(fold) + '.joblib'`` to folder '../../models';
        when None, nothing is saved
    '''
    if np.unique(y).size <= 2:
        loss_function = 'Logloss'
    else:
        loss_function = 'MultiClass'

    # generate data folds
    train_X, train_y, test_X, test_y = generate_folds(cv, X, y)

    # iterate through specified folds
    for fold in folds:
        # assign train and test pools
        test_pool = Pool(data=test_X[fold],
                         label=test_y[fold],
                         cat_features=cat_features)
        train_pool = Pool(data=train_X[fold],
                          label=train_y[fold],
                          cat_features=cat_features)

        result_columns = [
            'params' + str(fold), loss_function + str(fold),
            'Accuracy' + str(fold), 'iteration' + str(fold)
        ]
        # Rows are collected in a list and turned into a dataframe once,
        # because DataFrame.append no longer exists in pandas >= 2.0.
        result_rows = []
        best_score = float('inf')
        best_grid = None  # stays None when the grid is empty

        # iterate through parameter grid
        for params in ParameterGrid(grid):
            # create catboost classifer with parameter params
            model = CatBoostClassifier(
                cat_features=cat_features,
                early_stopping_rounds=50,
                task_type='GPU',
                custom_loss=['Accuracy'],
                iterations=3000,
                # class_weights=weights,
                **params)

            # fit model
            model.fit(train_pool, eval_set=test_pool, verbose=400)

            # record this run's validation scores
            validation_scores = model.get_best_score()['validation']
            print(validation_scores)
            current_loss = validation_scores[loss_function]
            result_rows.append([
                params,
                current_loss,
                validation_scores['Accuracy'],
                model.get_best_iteration(),
            ])

            # save best score and parameters
            if current_loss < best_score:
                best_score = current_loss
                best_grid = params

        results_df = pd.DataFrame(result_rows, columns=result_columns)

        print("Best logloss: ", best_score)
        print("Grid:", best_grid)

        # save=None is the documented default and means "do not save".
        if save is not None:
            save_file(results_df,
                      save + str(fold) + '.joblib',
                      dirName='../../models')
        display(results_df)