def fit_predict(self, X_train, y_train, X_valid, y_valid, X_test, **kwargs):
    """Fit a CatBoost classifier and return validation/test probabilities.

    A fresh ``CatBoostClassifier`` is configured from ``self.params`` (when
    it is not ``None``), trained on the training split with the validation
    split as its single eval set, and stored on ``self.clf``.

    Args:
        X_train, y_train: training features and labels.
        X_valid, y_valid: validation split, passed to ``fit`` as ``eval_set``.
        X_test: features scored after training.
        **kwargs: extra keyword arguments forwarded to ``fit``. They take
            precedence over the defaults used here (``verbose=100``,
            ``early_stopping_rounds=100``, ``use_best_model=True``,
            ``plot=True``).

    Returns:
        Tuple ``(valid_predict, test_predict)`` holding the
        ``predict_proba`` output for ``X_valid`` and ``X_test``.
    """
    clf = CatBoostClassifier(thread_count=30)  # TODO: embedding_features
    if self.params is not None:
        clf.set_params(**self.params)

    fit_params = dict(
        verbose=100,
        early_stopping_rounds=100,
        use_best_model=True,
        plot=True,
    )
    # CatBoost: "Only one of parameters ['verbose', 'logging_level',
    # 'verbose_eval', 'silent'] should be set" -- so the default verbosity is
    # dropped as soon as the caller picks any of these switches.
    logging_switches = ('verbose', 'logging_level', 'verbose_eval', 'silent')
    if any(switch in kwargs for switch in logging_switches):
        del fit_params['verbose']
    # Caller-supplied values win; previously a duplicate keyword raised
    # TypeError ("got multiple values for keyword argument").
    fit_params.update(kwargs)

    # A single (X, y) tuple is used on purpose:
    # CatBoostError: Multiple eval sets are not supported on GPU
    self.clf = clf.fit(
        X_train, y_train,
        eval_set=(X_valid, y_valid),
        **fit_params)

    valid_predict = clf.predict_proba(X_valid)
    test_predict = clf.predict_proba(X_test)
    return valid_predict, test_predict
def _test_prediction_consistency(leaf_method):
    """Check that CBLeafInfluenceEnsemble reproduces CatBoost's raw predictions.

    Trains a ``CatBoostClassifier`` (``boosting_type='Plain'``) on the adult
    training file, exports it to JSON, rebuilds it as a
    ``CBLeafInfluenceEnsemble`` and asserts that both models give the same
    ``RawFormulaVal`` predictions (``rtol=0.001``) on the train and on the
    test documents.

    Args:
        leaf_method: value used for CatBoost's ``leaf_estimation_method`` and
            for the ensemble's ``leaf_method``.

    Raises:
        AssertionError: if the predictions disagree; for the training set the
            message lists the mismatching ``(ours, catboost)`` pairs.
    """
    base_dir = 'data/adult/'
    train_documents, train_targets = read_train_documents_and_one_hot_targets(
        base_dir + 'train_data_catboost_format.tsv'
    )
    train_targets = np.argmax(train_targets, axis=1)
    # Fix: the held-out documents used to be read from the *training* file,
    # which made the second assertion a repeat of the first one. The test
    # targets are not needed for a prediction-consistency check.
    test_documents, _ = read_train_documents_and_one_hot_targets(
        base_dir + 'test_data_catboost_format.tsv'
    )

    train_dir = base_dir + 'ut_tmp/'
    if not isdir(train_dir):
        mkdir(train_dir)

    cbc_params = read_json_params(base_dir + 'catboost_params.json')
    cbc_params['leaf_estimation_method'] = leaf_method
    cbc_params['random_seed'] = 10
    cbc_params['train_dir'] = train_dir
    cbc = CatBoostClassifier(**cbc_params)
    cbc.set_params(boosting_type='Plain')
    cbc.fit(train_documents, train_targets)
    cbc.save_model(train_dir + 'model.bin', format='cbm')
    export_catboost_to_json(train_dir + 'model.bin', train_dir + 'model.json')

    full_model = CBLeafInfluenceEnsemble(train_dir + 'model.json', train_documents, train_targets,
                                         learning_rate=cbc_params['learning_rate'],
                                         loss_function=BinaryCrossEntropyLoss(),
                                         leaf_method=leaf_method,
                                         update_set='AllPoints')

    # Compute each side once and reuse it for the failure message.
    train_pred_ours = full_model(train_documents)
    train_pred_cbc = cbc.predict(train_documents, prediction_type='RawFormulaVal')
    assert np.allclose(train_pred_ours, train_pred_cbc, rtol=0.001), \
        [(a, b) for a, b in zip(train_pred_ours, train_pred_cbc) if not np.allclose(a, b)]

    assert np.allclose(full_model(test_documents),
                       cbc.predict(test_documents, prediction_type='RawFormulaVal'),
                       rtol=0.001)
def catboost_fit_predict(train_documents, train_targets, test_documents, prediction_type='RawFormulaVal',
                         **catboost_params):
    """Train a CatBoostClassifier and predict on ``test_documents``.

    Args:
        train_documents, train_targets: data passed to ``fit``.
        test_documents: data passed to ``predict``.
        prediction_type: forwarded positionally to ``predict``.
        **catboost_params: constructor parameters. ``gpu_ram_part`` is taken
            out of them and, when not ``None``, applied through
            ``set_params`` after construction instead.

    Returns:
        Whatever ``CatBoostClassifier.predict`` returns for the test data.
    """
    # Keep gpu_ram_part away from the constructor; it is applied separately.
    ram_fraction = catboost_params.pop('gpu_ram_part', None)

    classifier = CatBoostClassifier(**catboost_params)
    if ram_fraction is not None:
        classifier.set_params(gpu_ram_part=ram_fraction)

    classifier.fit(train_documents, train_targets)
    return classifier.predict(test_documents, prediction_type)
def _test_influence_vs_tf_derivative(leaf_method):
    """Compare leaf-influence derivatives with the TFGBApplier reference.

    A two-iteration CatBoost model is trained on the first 100 adult training
    documents, exported to JSON and wrapped in ``CBLeafInfluenceEnsemble``.
    For 30 randomly drawn training indices the test asserts that

    * the ensemble's predictions match both ``TFGBApplier`` and CatBoost
      (``rtol=1e-3``),
    * the influence trees' leaf values match ``TFGBApplier.get_derivs``
      (``rtol=1e-2``),
    * the loss derivative at one random training point matches
      ``TFGBApplier.get_train_prediction_deriv`` (``rtol=1e-2``).

    Args:
        leaf_method: value used for CatBoost's ``leaf_estimation_method`` and
            for the ensemble / checker ``leaf_method``.
    """
    data_root = 'data/adult/'

    train_documents, train_targets = read_train_documents_and_one_hot_targets(
        data_root + 'train_data_catboost_format.tsv'
    )
    train_documents, train_targets = train_documents[:100], train_targets[:100]
    train_targets = np.argmax(train_targets, axis=1)

    test_documents, test_targets = read_train_documents_and_one_hot_targets(
        data_root + 'test_data_catboost_format.tsv'
    )
    test_targets = np.argmax(test_targets, axis=1)

    train_dir = data_root + 'ut_tmp/'
    if not isdir(train_dir):
        mkdir(train_dir)

    cbc_params = read_json_params(data_root + 'catboost_params.json')
    cbc_params['iterations'] = 2
    cbc_params['leaf_estimation_method'] = leaf_method
    cbc_params['random_seed'] = 10
    cbc_params['train_dir'] = train_dir

    cbc = CatBoostClassifier(**cbc_params)
    cbc.set_params(boosting_type='Plain')
    cbc.fit(train_documents, train_targets)

    binary_path = train_dir + 'model.bin'
    json_path = train_dir + 'model.json'
    cbc.save_model(binary_path, format='cbm')
    export_catboost_to_json(binary_path, json_path)

    full_model = CBLeafInfluenceEnsemble(json_path, train_documents, train_targets,
                                         leaf_method=leaf_method,
                                         learning_rate=cbc_params['learning_rate'],
                                         loss_function=BinaryCrossEntropyLoss(),
                                         update_set='AllPoints')
    retrained_model_our = deepcopy(full_model)
    tf_checker = TFGBApplier(full_model, train_documents, train_targets, leaf_method)

    # NOTE(review): every statement below uses `remove_idx`, so all of them
    # are taken to belong to the loop body -- confirm against the original
    # layout.
    removal_candidates = np.random.randint(len(train_targets), size=30)
    for remove_idx in removal_candidates:
        full_model.fit(remove_idx, retrained_model_our)

        # Predictions of the three implementations must agree.
        pred_ours = full_model(train_documents)
        pred_theirs = tf_checker.get_predicts()
        pred_cbc = cbc.predict(train_documents, prediction_type='RawFormulaVal')
        assert (np.allclose(pred_ours, pred_theirs, rtol=1e-3)
                and np.allclose(pred_ours, pred_cbc, rtol=1e-3)), (pred_ours, pred_theirs)

        # Leaf-value derivatives, tree by tree.
        der_ours = [tree.leaf_values for tree in retrained_model_our.influence_trees]
        der_theirs = tf_checker.get_derivs(remove_idx)
        assert all(np.allclose(ours, theirs, rtol=1e-2)
                   for ours, theirs in zip(der_ours, der_theirs)), (der_ours, der_theirs)

        # Prediction derivative at one randomly chosen training point.
        random_train_idx = np.random.randint(len(train_targets))
        der_pred_ours = retrained_model_our.loss_derivative(
            train_documents[[random_train_idx]],
            train_targets[[random_train_idx]]
        )[0]
        der_pred_theirs = tf_checker.get_train_prediction_deriv(remove_idx, random_train_idx)
        assert np.isclose(der_pred_ours, der_pred_theirs, rtol=1e-2), (der_pred_ours, der_pred_theirs)
class CatboostEnsemble(Ensemble):
    """Ensemble that trains CatBoost and re-implements its trees as CatboostTree.

    After fitting, the model is dumped as JSON to ``tmp_json_path`` and every
    entry of its ``oblivious_trees`` list is parsed into a ``CatboostTree``;
    predictions are then computed from those parsed trees.
    """

    def __init__(self, params: dict, dataset: Dataset = None):
        """Create the underlying classifier from ``params``."""
        super().__init__(params, dataset, name='CatboostEnsemble')
        self.clf = CatBoostClassifier(**params)
        self.tmp_json_path = '/tmp/catboost.model.json'

    def fit(self, dataset: Dataset):
        """Train CatBoost on ``dataset`` and parse its trees."""
        self.set_dataset(dataset)

        # More than two classes -> multiclass objective, otherwise log-loss.
        if self.dataset.num_classes() > 2:
            objective = 'MultiClass'
        else:
            objective = 'Logloss'
        self.clf.set_params(loss_function=objective, verbose=False)
        self.clf.fit(self.dataset.X, self.dataset.y)

        # Round-trip through the JSON dump to get at the tree structure.
        self.clf.save_model(self.tmp_json_path, format='json')
        with open(self.tmp_json_path, 'r') as json_file:
            dumped_model = json.load(json_file)

        parsed_trees = []
        for raw_tree in dumped_model['oblivious_trees']:
            parsed_trees.append(CatboostTree.parse(raw_tree, self.dataset))
        self.trees = parsed_trees

    def predict_proba(self, dataset: Dataset) -> np.ndarray:
        """Return ``[1 - p, p]`` rows computed from the parsed trees.

        Raises:
            ValueError: when no trees are available.
            NotImplementedError: when the classifier has more than two classes.
        """
        if len(self.trees) == 0:
            raise ValueError('There are no trees available')

        encoded_dataset = self.encode_dataset(dataset)
        n_classes = len(self.clf.classes_)  # pylint: disable=no-member

        # TODO: For single tree this is just [tree.predict(...)]
        per_tree = np.array(
            [tree.predict(encoded_dataset.X) for tree in self.trees])
        raw_scores = np.sum(per_tree, axis=0)

        if n_classes > 2:
            # https://catboost.ai/docs/concepts/loss-functions-multiclassification.html
            # Link above suggests different equation for this
            # results_proba = softmax(preds, axis=1)
            raise NotImplementedError('Only binary problems are implemented.')

        # Sigmoid of the summed tree outputs is the positive-class probability.
        return np.array([[1 - p, p] for p in expit(raw_scores)])

    def predict(self, dataset: Dataset) -> np.ndarray:
        """Return the index of the most probable class for every row."""
        return np.argmax(self.predict_proba(dataset), axis=1)
class CatBoost:
    """Multi-class CatBoostClassifier wrapper with save/load and explanation.

    Class-level settings are evaluated once, when the class is created:
    the GPU count decides whether ``task_type`` / ``devices`` are ``"GPU"``
    or ``None``.
    """

    _verbose = 200
    _train_dir = DATA_CACHE_DIR
    _is_gpu_available = get_gpu_device_count()
    # NOTE(review): `devices` receives the literal "GPU" here; confirm this is
    # a value CatBoost accepts for that parameter.
    if _is_gpu_available > 0:
        _task_type = "GPU"
        _devices = "GPU"
    else:
        _task_type = None
        _devices = None

    def __init__(self, model_id, num_input_features, num_output_classes,
                 model_save_path, **aux_params):
        """Build the classifier and prepare ``<model_save_path>/<model_id>``.

        ``num_input_features`` and ``num_output_classes`` are accepted but
        not used by this class. ``aux_params`` are applied via ``set_params``.
        """
        base_params = dict(
            loss_function="MultiClass",
            task_type=self._task_type,
            devices=self._devices,
            train_dir=self._train_dir,
            random_seed=SEED,
        )
        self.model = CatBoostClassifier(**base_params)
        self.model.set_params(**aux_params)
        self.model_id = model_id

        model_dir = f"{model_save_path}/{model_id}"
        os.makedirs(model_dir, exist_ok=True)
        self.model_path = model_dir
        self.modelfile_save_path = os.path.join(model_dir, STANDARD_MODEL_NAME)

    def load(self):
        """Load the model from ``modelfile_save_path``."""
        self.model.load_model(self.modelfile_save_path)

    def save(self):
        """Save the model to ``modelfile_save_path``."""
        self.model.save_model(self.modelfile_save_path)

    def fit(self, X_train, y_train, X_valid, y_valid):
        """Train with the validation split as eval set, then save the model."""
        train_pool = Pool(X_train, y_train)
        self.model.fit(train_pool,
                       eval_set=(X_valid, y_valid),
                       use_best_model=True,
                       verbose=self._verbose)
        self.save()

    def predict(self, X, load=False):
        """Return ``predict_proba(X)``, optionally reloading the saved model first."""
        if load:
            self.load()
        return self.model.predict_proba(X)

    def explain(self, X_train, y_train, features, classes):
        """Plot feature importances computed on the training pool.

        ``classes`` is accepted but not used here.
        """
        importance_pool = Pool(X_train, y_train)
        importances = self.model.get_feature_importance(data=importance_pool)
        plot_importance(importances, features, self.model_path, self.model_id)
def modelCatBoostClassifier(self, trial: optuna.trial.Trial):
    """Build a CatBoostClassifier from hyper-parameters suggested by ``trial``.

    The suggested values are merged with ``self.params``; entries in
    ``self.params`` override the suggestions with the same name.

    Args:
        trial: Optuna trial used to sample the search space.

    Returns:
        An unfitted ``CatBoostClassifier`` with the merged parameters set.
    """
    # `suggest_float` replaces the deprecated `suggest_uniform`,
    # `suggest_loguniform` (log=True) and `suggest_discrete_uniform` (step=q);
    # parameter names and ranges are unchanged.
    # NOTE(review): several names below (e.g. min_split_gain, subsample_freq,
    # colsample_bytree, reg_alpha) look like LightGBM options -- confirm that
    # CatBoost accepts them before fitting.
    opt_params = dict(
        num_leaves=trial.suggest_int("num_leaves", 2, 2**8),
        learning_rate=trial.suggest_float(
            'learning_rate', 0.001, 1, step=0.001),
        n_estimators=trial.suggest_int("n_estimators", 2, 2**10, log=True),
        min_child_samples=trial.suggest_int('min_child_samples', 2, 2**8),
        min_child_weight=trial.suggest_float('min_child_weight', 1e-8, 1,
                                             log=True),
        min_split_gain=trial.suggest_float('min_split_gain', 1e-8, 1,
                                           log=True),
        subsample=trial.suggest_float('subsample', 0.4, 1),
        subsample_freq=trial.suggest_int("subsample_freq", 0, 2**4),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.4, 1),
        reg_alpha=trial.suggest_float('reg_alpha', 1e-8, 10, log=True),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-8, 10, log=True),
    )
    # Fixed parameters win over sampled ones; a missing (None) `self.params`
    # simply contributes nothing instead of raising TypeError.
    fixed_params = self.params or {}
    clf = CatBoostClassifier()
    clf.set_params(**{**opt_params, **fixed_params})
    return clf
class CatBoostClassifierModel(BaseModel):
    """CatBoostClassifier model with categorical-feature support and SHAP values."""

    def __init__(self, categorical_features_indices, params):
        """Store the categorical column indices and set up CV and metrics."""
        super().__init__(params)
        self.categorical_features_indices = categorical_features_indices
        self.name = 'CatBoostClassifier'
        # 5-fold stratified split, shuffled with a fixed seed.
        self.cv = StratifiedKFold(5, shuffle=True, random_state=1)
        self.metrics = {'QWK': qwk_score}

    def fit(self, X, y):
        """Train a fresh classifier on ``(X, y)`` and return it."""
        self.model = CatBoostClassifier()
        self.model.set_params(**self.params)
        self.model.fit(
            Pool(X, y, cat_features=self.categorical_features_indices))
        return self.model

    def extract_shap_values(self, X):
        """Return SHAP values of the fitted model for ``X``."""
        explain_pool = Pool(X, cat_features=self.categorical_features_indices)
        return shap.TreeExplainer(self.model).shap_values(explain_pool)
model_catboost_val = CatBoostClassifier( eval_metric='AUC', iterations=20000, # Very high value, to find the optimum od_type='Iter', # Overfitting detector set to "iterations" or number of trees random_seed=RS, # Random seed for reproducibility verbose=100) # Shows train/test metric every "verbose" trees # "Technical" parameters of the model: params = {'objective': 'Logloss', 'learning_rate': 0.01, # learning rate, lower -> slower but better prediction 'depth': 5, # Depth of the trees (values betwwen 5 and 10, higher -> more overfitting) 'l2_leaf_reg': 10, # L2 regularization (between 3 and 20, higher -> less overfitting) 'rsm': 0.7, # % of features to consider in each split (lower -> faster and reduces overfitting) 'bootstrap_type': 'Bayesian'} # For categorical variables model_catboost_val.set_params(**params) print('\nCatboost Fit (Validation)...\n') model_catboost_val.fit(X=pool_tr, eval_set=pool_val, early_stopping_rounds=esr) # 2) Cross-Validation (Catboost) ################################################################################ # 2.1) k-Fold Cross-Validation Function ################################################################################ from sklearn.model_selection import StratifiedKFold def Model_cv(MODEL, k, X_train, X_test, y, RE, makepred=True, CatPos=None): # Create the k folds
class CatBoostWrapper:
    """Holds CatBoost parameters/search spaces and wraps the estimator.

    ``get_regressor`` / ``get_classifier`` create the estimator from the
    current ``params`` and return a ``Model`` built around this wrapper.
    """

    params: Dict[Union[str, Any], Union[Union[str, float, int], Any]]

    def __init__(self):
        self.base_params = dict()
        self.base_params["iterations"] = 400
        # self.base_params["used_ram_limit"] = '512mb'
        self.base_params["one_hot_max_size"] = 10
        self.base_params["nan_mode"] = 'Min'
        self.base_params["depth"] = 5
        self.base_params["learning_rate"] = 0.01
        self.base_params["random_strength"] = 1.5
        self.base_params["bagging_temperature"] = 1.5

        # Working copy; base_params stays untouched by later updates.
        self.params = deepcopy(self.base_params)

        max_depth_list = (list(range(2, 11)))

        self.param_space = {
            'depth': max_depth_list,
            'learning_rate': (0.005, 0.01, 0.05, 0.1, 0.3)
        }
        self.param_rules = {}
        self.hyperopt_param_space = {
            # 'num_leaves': hp.choice('num_leaves', [5,10,20,30,50,70,100]),
            # 'subsample': hp.choice('subsample', [0.7,0.8,0.9,1]),
            # 'colsample_bytree': hp.choice('colsample_bytree', [0.5,0.6,0.7,0.8,0.9,1]),
            # 'min_child_weight': hp.choice('min_child_weight', [5,10,15,20,30,50]),
            # 'learning_rate': hp.choice('learning_rate', [0.02,0.03,0.05,0.07,0.1,0.2])
            'depth': hp.choice('depth', max_depth_list),
            'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.3))
        }

        self.estimator = None
        self.mode = None
        self.category_indices = None

    def get_regressor(self, category_indices):
        """Create a CatBoostRegressor (RMSE loss) and return its ``Model``."""
        self.params["loss_function"] = "RMSE"
        self.mode = 'regression'
        self.estimator = CatBoostRegressor(**self.params)
        return self.get_model(category_indices)

    def get_classifier(self, category_indices):
        """Create a CatBoostClassifier and return its ``Model``."""
        self.mode = 'classification'
        self.estimator = CatBoostClassifier(**self.params)
        return self.get_model(category_indices)

    def get_model(self, category_indices):
        """Remember ``category_indices`` and wrap this object in a ``Model``."""
        # self.estimator.set_params(**params)
        self.category_indices = category_indices
        return Model(self, self.param_space, self.param_rules)

    def set_params(self, params):
        """Merge ``params`` into ``self.params`` and push them to the estimator.

        When no estimator exists yet the values are only recorded; they are
        picked up when ``get_regressor`` / ``get_classifier`` builds one
        (previously this raised AttributeError on ``None``).
        """
        self.params.update(params)
        if self.estimator is not None:
            self.estimator.set_params(**self.params)

    def set_final_params(self):
        """Hook for final-fit parameter changes; currently does nothing."""
        # NOTE(review): the trailing `self.set_params({'learning_rate': 0.01})`
        # is read as part of the commented-out block below -- confirm.
        # self.set_params({'learning_rate': 0.001})
        # self.num_iterations = 600
        # self.set_params({'learning_rate': 0.01})

    def fit(self, x, y=None):
        """Fit the estimator; classifiers get ``scale_pos_weight`` from the labels.

        For classification the weight is (#rows with y < 0.5) /
        (#rows with y > 0.5). If there are no rows with ``y > 0.5`` the weight
        is left unset instead of failing with ZeroDivisionError.
        """
        if self.mode == 'classification':
            negative_count = x[y < 0.5].shape[0]
            positive_count = x[y > 0.5].shape[0]
            if positive_count > 0:
                self.set_params(
                    {"scale_pos_weight": negative_count / positive_count})
        self.estimator.fit(x, y, logging_level='Silent', use_best_model=True)

    def predict(self, x):
        """Return the estimator's predictions for ``x``."""
        return self.estimator.predict(x)
class CatBoostClassifierCV(object):
    """cross_val_predict-style out-of-fold training of a CatBoostClassifier.

    After ``fit`` the instance exposes ``oof_train`` (out-of-fold
    positive-class probabilities for the training rows), ``oof_test``
    (test probabilities averaged over all fitted folds) and
    ``oof_test_rank`` (mean normalised rank of the test predictions).
    """

    def __init__(self, params=None, cv=5, random_state=None, n_repeats=None):
        """Create the classifier and the stratified splitter.

        Args:
            params: optional dict applied to the classifier via ``set_params``.
            cv: number of folds.
            random_state: seed forwarded to the splitter.
            n_repeats: when truthy, use ``RepeatedStratifiedKFold`` with this
                many repetitions; otherwise a shuffled ``StratifiedKFold``.
        """
        self.clf = CatBoostClassifier()
        if params:
            self.clf.set_params(**params)

        # Keyword arguments on purpose: the previous positional call passed
        # `True` as RepeatedStratifiedKFold's `n_repeats`, so the requested
        # number of repeats was ignored while `cv * n_repeats` prediction
        # columns were still allocated (and averaged in as zeros).
        if n_repeats:
            self._kf = RepeatedStratifiedKFold(n_splits=cv,
                                               n_repeats=n_repeats,
                                               random_state=random_state)
            self._n_repeats = n_repeats
        else:
            self._kf = StratifiedKFold(n_splits=cv,
                                       shuffle=True,
                                       random_state=random_state)
            self._n_repeats = 1
        self._num_preds = cv * self._n_repeats

    def fit(self,
            X,
            y,
            X_test,
            feval=roc_auc_score,
            cat_features=None,
            sample_weight=None,
            verbose=100,
            early_stopping_rounds=100,
            plot=False,
            silent=None,
            logging_level=None,
            column_description=None,
            save_snapshot=None,
            snapshot_file='/fds/data' if cloudml else None,
            snapshot_interval=None,
            init_model=None):
        """Train one model per fold and collect out-of-fold predictions.

        Inputs are arrays: ``X``, ``y`` (and ``sample_weight`` when given) are
        indexed with the integer index arrays produced by the splitter.

        Args:
            X, y: training features and labels.
            X_test: features scored by every fold model.
            feval: metric called as ``feval(y, oof_train)``; falsy to skip.
            sample_weight: optional per-row weights for ``X``.
            verbose: CatBoost verbosity; when truthy, fold start is printed.
            Remaining arguments are forwarded unchanged to
            ``CatBoostClassifier.fit``.

        Returns:
            The ``feval`` score, or ``None`` when ``feval`` is falsy.
        """
        self.oof_train = np.zeros(len(X))
        self.oof_test = np.zeros((len(X_test), self._num_preds))
        weights = None if sample_weight is None else np.asarray(sample_weight)

        for n_fold, (train_index, valid_index) in enumerate(self._kf.split(X, y)):
            if verbose:
                print("\033[94mFold %s started at %s\033[0m" %
                      (n_fold + 1, time.ctime()))
            X_train, y_train = X[train_index], y[train_index]
            X_valid, y_valid = X[valid_index], y[valid_index]
            # Weights must follow the fold's rows; passing the full-length
            # vector alongside the subset X_train does not line up.
            fold_weight = None if weights is None else weights[train_index]

            self.clf.fit(X_train,
                         y_train,
                         cat_features=cat_features,
                         sample_weight=fold_weight,
                         use_best_model=True,
                         eval_set=(X_valid, y_valid),
                         verbose=verbose,
                         logging_level=logging_level,
                         plot=plot,
                         column_description=column_description,
                         silent=silent,
                         early_stopping_rounds=early_stopping_rounds,
                         save_snapshot=save_snapshot,
                         snapshot_file=snapshot_file,
                         snapshot_interval=snapshot_interval,
                         init_model=init_model)

            # With repeated CV every row is validated once per repeat, so the
            # out-of-fold value is the mean over repeats (plain assignment
            # when there is a single repeat).
            valid_proba = self.clf.predict_proba(X_valid)[:, 1]
            self.oof_train[valid_index] += valid_proba / self._n_repeats
            self.oof_test[:, n_fold] = self.clf.predict_proba(X_test)[:, 1]

        # Test-set OOF output: mean normalised rank and mean probability.
        self.oof_test_rank = pd.DataFrame(self.oof_test).rank().mean(1) / len(
            self.oof_test)
        self.oof_test = self.oof_test.mean(1)

        # Score the training-set OOF predictions.
        if feval:
            score = feval(y, self.oof_train)
            print(
                f"\n\033[94mCV Score: {score} ended at {time.ctime()}\033[0m")
            return score

    def oof_save(self, file='./oof_train_and_test.csv'):
        """Write ``oof_train`` followed by ``oof_test`` as one CSV column."""
        assert isinstance(file, str)
        stacked = np.append(self.oof_train, self.oof_test)
        # `columns` must be list-like; a bare string is rejected by pandas.
        pd.DataFrame(stacked, columns=['oof_train_and_test']).to_csv(
            file, index=False)
study.optimize(objective, timeout=60 * 60 * 12) joblib.dump(study, 'study_{}.pkl'.format(BOOSTING)) best_params = study.best_params else: best_params = { 'loss_function': 'Logloss', 'custom_loss': ['AUC'], 'logging_level': 'Silent', 'early_stopping_rounds': 100 } # %% model.set_params(**best_params) # %% seed_everything(RANDOM_STATE) xx = cross_val_score_auc(model, X_train, y_train, n_fold=N_FOLD, random_state=RANDOM_STATE, predict=True, X_test=X_test, shuffle=True, split_type='stratifiedkfold', return_to_stack=True, submission=sample_submission)
class CatBoost:
    """DataFrame-oriented wrapper around CatBoostRegressor / CatBoostClassifier.

    ``mode`` selects the estimator (``'Regressor'`` or ``'Classifier'``);
    ``target``, ``features`` and the optional ``weight`` are column names
    used to slice the frames handed to ``train_with_valid`` / ``predict``.
    """

    def __init__(self,
                 target,
                 features,
                 weight=None,
                 mode='Regressor',
                 objective='RMSE',
                 logs=True):
        self.model = None
        self.target = target
        self.features = features
        self.mode = mode
        self.weight = weight
        self.logs = logs

        # Constructor-level settings for the estimator.
        self.model_params = {
            'thread_count': 8,
            'iterations': 2000,
            'loss_function': objective,
            # learning_rate=0.05
        }
        # Settings passed to fit().
        self.training_params = {
            'use_best_model': True,
            'early_stopping_rounds': 100,
            'verbose': 100,
        }

    def _set_model_(self):
        """Instantiate the estimator for ``self.mode`` and apply ``model_params``.

        Raises:
            Exception: for any mode other than 'Regressor' / 'Classifier'.
        """
        if self.mode == 'Regressor':
            estimator = CatBoostRegressor()
        elif self.mode == 'Classifier':
            estimator = CatBoostClassifier()
        else:
            raise Exception('Unknown mode %s' % self.mode)
        self.model = estimator
        self.model.set_params(**self.model_params)

    def train_with_valid(self, XY):
        """Train on ``XY.train`` using ``XY.valid`` as the eval set."""
        X_train, Y_train = XY.train[self.features], XY.train[self.target]
        X_valid, Y_valid = XY.valid[self.features], XY.valid[self.target]

        train_kwargs = dict(data=X_train, label=Y_train)
        valid_kwargs = dict(data=X_valid, label=Y_valid)
        if self.weight is not None:
            train_kwargs['weight'] = XY.train[self.weight]
            valid_kwargs['weight'] = XY.valid[self.weight]
        train_pool = Pool(**train_kwargs)
        val_pool = Pool(**valid_kwargs)

        # logging
        print('Training Model CatBoost with validation')
        print('X_train = %s Y_train = %s' % (X_train.shape, Y_train.shape))
        print('X_valid = %s Y_valid = %s' % (X_valid.shape, Y_valid.shape))
        print()

        # training
        self._set_model_()
        self.model = self.model.fit(train_pool,
                                    eval_set=val_pool,
                                    **self.training_params)

        # feature importances
        if self.logs:
            self._logging_feature_importance_(train_pool)

    def predict(self, X):
        """Return predictions as a one-column DataFrame named after the target.

        In 'Classifier' mode the column holds column 1 of the 'Probability'
        prediction.

        Raises:
            Exception: when no model has been trained yet.
        """
        X = X[self.features]
        if self.model is None:
            raise Exception('Train your model before')

        print('Predicting Model CatBoost')
        print('X = %s' % (X.shape, ))
        print()

        data_pool = Pool(data=X)

        # predict
        if self.mode == 'Regressor':
            prediction = self.model.predict(data_pool)
        elif self.mode == 'Classifier':
            probabilities = self.model.predict(data_pool,
                                               prediction_type='Probability')
            prediction = probabilities[:, 1]

        return pd.DataFrame(prediction, index=X.index, columns=[self.target])

    def _logging_feature_importance_(self, train_pool):
        """Print every feature with its importance, highest score first."""
        if self.model is None:
            raise Exception('Train your model before')

        print('Top features')
        scores = self.model.get_feature_importance(train_pool)
        names = train_pool.get_feature_names()
        ranked = sorted(zip(scores, names), reverse=True)
        for score, name in ranked:
            print('{}: {}'.format(name, score))