def model_fit_with(train_set, test_sets, cd_file):
    """Fit a small fixed-seed RMSE CatBoost model and return it.

    ``test_sets`` is forwarded to ``CatBoost.fit`` as ``eval_set`` and
    ``cd_file`` as ``column_description``.
    """
    training_options = {
        'use_best_model': False,
        'loss_function': 'RMSE',
        'iterations': 12,
        'random_seed': 0,
    }
    fitted = CatBoost(training_options)
    fitted.fit(train_set, eval_set=test_sets, column_description=cd_file)
    return fitted
def fit_catboost(X, y, cv=None, params: dict = None, verbose=500):
    """Train one CatBoost model per CV fold and collect out-of-fold predictions.

    :param X: feature matrix; indexed positionally with the fold index arrays.
    :param y: target array; indexed the same way as ``X``.
    :param cv: splitter exposing ``split(X, y)``. Defaults to a shuffled
        2-fold ``StratifiedKFold``.
    :param params: CatBoost parameters. Defaults to a deep copy of
        ``CAT_DEFAULT_PARAMS``.
    :param verbose: not used by this function; kept so existing callers that
        pass it keep working.
    :return: ``(oof_pred, models, score)`` where ``oof_pred`` holds the
        class-1 probability of every sample predicted by the fold in which it
        was held out, ``models`` is the list of fitted fold models and
        ``score`` is the ROC AUC of ``oof_pred`` against ``y``.
    """
    if params is None:
        params = deepcopy(CAT_DEFAULT_PARAMS)
    if cv is None:
        cv = StratifiedKFold(n_splits=2, shuffle=True)

    models = []
    # One zero-initialised slot per training target.  The buffer must be
    # float: with an integer ``y`` the probabilities written below would be
    # truncated.  ``np.float`` was removed in NumPy 1.24, so the builtin
    # ``float`` (which it aliased) is used instead.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv.split(X, y)):
        # Cross-validation step: the cv instance splits the training data
        # into a train part and a validation part for this fold.
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = CatBoost(params=params)
        with timer(prefix='fit fold={} '.format(i + 1)):
            clf_train = Pool(x_train, y_train)
            clf_val = Pool(x_valid, y_valid)
            clf.fit(clf_train, eval_set=[clf_val])

        # Column 1 of the probability output is the positive class.
        pred_i = clf.predict(x_valid, prediction_type='Probability')[:, 1]
        oof_pred[idx_valid] = pred_i
        models.append(clf)
        print(f'Fold {i} AUC: {roc_auc_score(y_valid, pred_i):.4f}')

    score = roc_auc_score(y, oof_pred)
    # The backslash is part of the message; it is escaped so the literal no
    # longer relies on an invalid escape sequence.
    print('FINISHED \\ whole score: {:.4f}'.format(score))
    return oof_pred, models, score
def show_shap_summary(dataset, labels):
    """Embed ``dataset`` with SNoRe, fit a CatBoost model and plot SHAP summaries.

    One summary plot is shown per label column (titled ``Class<i>``), followed
    by a summary plot over all SHAP values. ``labels`` must expose
    ``toarray()`` and ``shape``.
    """
    # Build the embedding; the selected features name the columns.
    snore = SNoRe()
    embedding = snore.embed(dataset)
    column_names = ["node " + str(node) for node in snore.selected_features]

    # Classification (regression) model.  CatBoost is used instead of the
    # XGBoost from the paper because it is simpler to set up.
    regressor = CatBoost(params={'loss_function': 'MultiRMSE', "iterations": 250})

    if sparse.issparse(embedding):
        df = pd.DataFrame.sparse.from_spmatrix(embedding, columns=column_names)
    else:
        df = pd.DataFrame(data=embedding, columns=column_names)
    regressor.fit(df, labels.toarray())

    # Explain the fitted model.
    shap_values = shap.TreeExplainer(regressor).shap_values(df)
    for class_no in range(labels.shape[1]):
        shap.summary_plot(shap_values[class_no], df, show=False, plot_size=None)
        plt.title("Class" + str(class_no))
        plt.show()

    shap.summary_plot(shap_values, df, show=False)
    plt.show()
def load_scoring_model(name: Optional[str]) -> ScoringModel:
    """Return the scoring model registered under ``name``.

    ``None`` yields an ``MsmScoringModel``. Otherwise the model row is read
    from the ``scoring_model`` table; a ``catboost`` row is downloaded from
    the S3 path in its params and wrapped in ``CatBoostScoringModel``.

    Raises ``AssertionError`` when no row exists for ``name`` and
    ``ValueError`` for any model type other than ``catboost``.
    """
    # Import DB locally so that Lithops doesn't try to pickle it & fail due to psycopg2
    # pylint: disable=import-outside-toplevel  # circular import
    from sm.engine.db import DB

    if name is None:
        return MsmScoringModel()

    row = DB().select_one(
        "SELECT type, params FROM scoring_model WHERE name = %s", (name,)
    )
    assert row, f'Scoring model {name} not found'
    type_, params = row

    if type_ != 'catboost':
        raise ValueError(f'Unsupported scoring model type: {type_}')

    bucket, key = split_s3_path(params['s3_path'])
    with TemporaryDirectory() as tmpdir:
        # The model is staged in a temporary file and loaded from there.
        model_file = Path(tmpdir) / 'model.cbm'
        s3_object = get_s3_client().get_object(Bucket=bucket, Key=key)
        with model_file.open('wb') as sink:
            sink.write(s3_object['Body'].read())
        model = CatBoost()
        model.load_model(str(model_file), 'cbm')
    return CatBoostScoringModel(name, model, params)
def fit_regressor(self, X, y, features=None, display=False) -> None:
    """
    Grid-search CatBoost hyper-parameters on ``X``/``y`` and fit the model.

    :param X: X np.array with shape (number of snippets, number of patterns) or
        (number of patterns, ).
        NOTE(review): the body reads ``X.columns`` and indexes ``X[features]``,
        which looks like it expects a DataFrame - confirm against callers.
    :param y: np.array with shape (number of snippets,), array of snippets'
        complexity metric values
    :param features: set of features to train
    :param display: show additional output
    :return: None
    """
    search_space = {
        'learning_rate': [0.03, 0.1],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5, 7, 9],
    }
    estimator = CatBoost()

    if features:
        X = X[features]
    # Remember the column order the model is trained with.
    self.features_conf = {'features_order': X.columns}

    estimator.grid_search(search_space, X=X, y=y, verbose=display)

    self.model = estimator
    self.model.fit(X, y.ravel(), logging_level='Silent')
def test_multiple_eval_sets_no_empty():
    """Empty or ``None`` members are rejected in a multi-element ``eval_set``.

    A single ``None`` eval set, however, must be accepted by ``fit``.
    """
    cat_features = [0, 3, 2]
    cd_file = yatest.common.test_output_path('cd.txt')
    with open(cd_file, 'wt') as cd:
        cd.write('0\tTarget\n')
        for feature_no in sorted(cat_features):
            cd.write('{}\tCateg\n'.format(1 + feature_no))

    x, y = random_xy(6, 4)
    train_pool = Pool(x, y, cat_features=cat_features)

    x0, y0 = random_xy(0, 4)  # empty tuple eval set
    x1, y1 = random_xy(3, 4)
    test0_file = save_and_give_path(y0, x0, 'test0.txt')  # empty file eval set

    try:
        Pool(x0, y0, cat_features=cat_features)
    except CatboostError:
        pass
    else:
        assert False, "Do not create Pool for empty data"

    model = CatBoost({
        'learning_rate': 1,
        'loss_function': 'RMSE',
        'iterations': 2,
        'random_seed': 0,
    })

    # Every eval_set below must make fit() raise CatboostError.
    rejected_cases = [
        ([(x1, y1), (x0, y0)], "Do not fit with empty tuple in multiple eval sets"),
        ([(x1, y1), test0_file], "Do not fit with empty file in multiple eval sets"),
        ([(x1, y1), None], "Do not fit with None in multiple eval sets"),
    ]
    for eval_sets, failure_message in rejected_cases:
        try:
            model.fit(train_pool, eval_set=eval_sets, column_description=cd_file)
        except CatboostError:
            continue
        assert False, failure_message

    try:
        model.fit(train_pool, eval_set=[None], column_description=cd_file)
    except CatboostError:
        assert False, "Ok to have one eval set None"
def test_pairwise():
    """PairLogit gives the same predictions from file pools and from frames.

    The model is fitted three times: from file-backed pools, from data frames
    with explicit pairs, and from data frames with explicit unit pair weights.
    """
    train_pool = Pool(ZEN_TRAIN_FILE, column_description=ZEN_CD_FILE,
                      pairs=ZEN_TRAIN_PAIRS_FILE)
    test_pool = Pool(ZEN_TEST_FILE, column_description=ZEN_CD_FILE,
                     pairs=ZEN_TEST_PAIRS_FILE)
    model = CatBoost(params={
        'loss_function': 'PairLogit',
        'random_seed': 0,
        'iterations': 2,
        'thread_count': 8,
    })

    model.fit(train_pool)
    pred1 = model.predict(test_pool)

    df = read_table(ZEN_TRAIN_FILE, delimiter='\t', header=None, dtype={12: str})
    train_target = df.loc[:, 1]
    cat_features = range(13)
    train_data = df.drop([0, 1, 15], axis=1).astype(str)
    train_pairs = read_table(ZEN_TRAIN_PAIRS_FILE, delimiter='\t', header=None)

    df = read_table(ZEN_TEST_FILE, delimiter='\t', header=None, dtype={12: str})
    test_data = df.drop([0, 1, 15], axis=1).astype(str)

    model.fit(train_data, train_target, cat_features, pairs=train_pairs)
    pred2 = model.predict(test_data)

    # One unit weight per training pair.
    pairs_weight = np.ones(train_pairs.shape[0])
    model.fit(train_data, train_target, cat_features,
              pairs=train_pairs, pairs_weight=pairs_weight)
    pred3 = model.predict(test_data)

    assert _check_data(pred1, pred2)
    assert _check_data(pred1, pred3)
def fit_cv(self, x, y, groups, train_indices, test_indices, **fit_params):
    """Fit a CatBoost model on the ``train_indices`` rows of ``x``.

    ``x`` must contain ``label`` and ``id`` columns; they are used as label
    and group id and are removed from the feature matrix. ``fit_params``
    override ``self.default_parameters``. ``y``, ``groups`` and
    ``test_indices`` are not used by this method.

    Sets ``self.ctb`` and ``self.categorical_features`` and returns the
    fitted model.
    """
    ctb_params = deepcopy(self.default_parameters)
    if fit_params is not None:
        ctb_params.update(fit_params)
    self.ctb = CatBoost(ctb_params)

    train_values = x.drop(['label', 'id'], axis=1)
    features = list(train_values.columns.values)

    # Columns treated as categorical regardless of their value type.
    forced_categorical = (
        'day',
        'past_closest_action_involving_impression',
        'future_closest_action_involving_impression',
    )
    self.categorical_features = []
    for f in features:
        # A column is also categorical when its first value is a string.
        first_value = train_values.head(1)[f].values[0]
        if isinstance(first_value, str) or f in forced_categorical:
            self.categorical_features.append(features.index(f))
            print(f + ' is categorical!')
    if not self.categorical_features:
        self.categorical_features = None

    train_with_weights = Pool(
        data=train_values.values[train_indices, :],
        label=x['label'].values[train_indices],
        group_id=x['id'].values[train_indices],
        cat_features=self.categorical_features,
    )
    self.ctb.fit(train_with_weights, plot=False)
    return self.ctb
def test_eval_metrics_batch_calcer(loss_function):
    """The batch metric calcer reproduces the metric values logged by ``fit``.

    For ``QueryRMSE`` the querywise data set is used and ``PFound`` is the
    evaluated metric; otherwise the metric equals the loss function.
    """
    querywise = loss_function == 'QueryRMSE'
    if querywise:
        train, test, cd = QUERYWISE_TRAIN_FILE, QUERYWISE_TEST_FILE, QUERYWISE_CD_FILE
    else:
        train, test, cd = TRAIN_FILE, TEST_FILE, CD_FILE
    metric = 'PFound' if querywise else loss_function

    train_pool = Pool(train, column_description=cd)
    test_pool = Pool(test, column_description=cd)
    model = CatBoost(params={
        'loss_function': loss_function,
        'random_seed': 0,
        'iterations': 100,
        'thread_count': 8,
        'eval_metric': metric,
    })
    model.fit(train_pool, eval_set=test_pool, use_best_model=False)

    # Column 1 of the training log holds the metric value per iteration.
    logged = np.loadtxt('catboost_info/test_error.tsv', skiprows=1)[:, 1]
    first_metrics = np.round(logged, 10)

    calcer = model.create_metric_calcer([metric])
    calcer.add(test_pool)
    second_metrics = np.round(calcer.eval_metrics().get_result(metric), 10)

    assert np.all(first_metrics == second_metrics)
class CatBoostModel(Model):
    """CatBoost Model"""

    def __init__(self, loss="RMSE", **kwargs):
        """Store the CatBoost parameters.

        :param loss: loss function name; only ``"RMSE"`` and ``"Logloss"``
            are accepted.
        :param kwargs: extra CatBoost parameters merged into the parameter
            dict.
        :raises NotImplementedError: for any other ``loss``.
        """
        # There are more options
        if loss not in {"RMSE", "Logloss"}:
            raise NotImplementedError
        self._params = {"loss_function": loss}
        self._params.update(kwargs)
        self.model = None

    def fit(
        self,
        dataset: DatasetH,
        num_boost_round=1000,
        early_stopping_rounds=50,
        verbose_eval=20,
        evals_result=None,
        **kwargs
    ):
        """Train on the ``train`` segment and validate on the ``valid`` segment.

        :param dataset: source of the ``train``/``valid`` feature and label
            frames.
        :param num_boost_round: stored as the ``iterations`` parameter.
        :param early_stopping_rounds: stored as ``early_stopping_rounds``.
        :param verbose_eval: stored as ``verbose_eval``.
        :param evals_result: optional dict that receives the metric history
            under the keys ``"train"`` and ``"valid"``. It used to default to
            a shared ``dict()`` that was rebound locally, so a dict passed by
            the caller was never filled; it is now updated in place.
        :param kwargs: forwarded to both the ``CatBoost`` constructor and
            ``CatBoost.fit``.
        :raises ValueError: when the label frame is not a single column.
        """
        df_train, df_valid = dataset.prepare(
            ["train", "valid"],
            col_set=["feature", "label"],
            data_key=DataHandlerLP.DK_L,
        )
        x_train, y_train = df_train["feature"], df_train["label"]
        x_valid, y_valid = df_valid["feature"], df_valid["label"]

        # CatBoost needs 1D array as its label
        if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
            y_train_1d, y_valid_1d = np.squeeze(y_train.values), np.squeeze(y_valid.values)
        else:
            raise ValueError("CatBoost doesn't support multi-label training")

        train_pool = Pool(data=x_train, label=y_train_1d)
        valid_pool = Pool(data=x_valid, label=y_valid_1d)

        # Initialize the catboost model
        self._params["iterations"] = num_boost_round
        self._params["early_stopping_rounds"] = early_stopping_rounds
        self._params["verbose_eval"] = verbose_eval
        self._params["task_type"] = "GPU" if get_gpu_device_count() > 0 else "CPU"
        self.model = CatBoost(self._params, **kwargs)

        # train the model
        self.model.fit(train_pool, eval_set=valid_pool, use_best_model=True, **kwargs)

        history = self.model.get_evals_result()
        if evals_result is not None:
            # The first metric recorded for each data set is reported.
            evals_result["train"] = list(history["learn"].values())[0]
            evals_result["valid"] = list(history["validation"].values())[0]

    def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
        """Predict for ``segment`` of ``dataset``.

        :return: ``pd.Series`` of predictions indexed like the prepared
            feature frame.
        :raises ValueError: when ``fit`` has not been called yet.
        """
        if self.model is None:
            raise ValueError("model is not fitted yet!")
        x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
        return pd.Series(self.model.predict(x_test.values), index=x_test.index)
def _get_cv_model(self, tr_X, val_X, tr_y, val_y, val_idx):
    """Fit the configured model on one fold and accumulate its outputs.

    Depending on ``self.clf_type`` (``'cat'``, ``'lgb'``, ``'xgb'`` or
    ``'sklearn'``) a model is trained on ``tr_X``/``tr_y``. Afterwards:

    * ``self.oof[val_idx]`` receives the predictions for ``val_X``;
    * ``self.preds`` is increased by the test predictions divided by
      ``self.kf.n_splits``;
    * ``self.FIs`` accumulates the model's feature importances.

    :raises ValueError: for an unknown ``self.clf_type``.
    """
    if self.clf_type == 'cat':
        clf_train = Pool(tr_X, tr_y)
        clf_val = Pool(val_X, val_y)
        clf_test = Pool(self.test_X)
        self.model = CatBoost(params=self.params)
        self.model.fit(clf_train, eval_set=[clf_val])
        self.oof[val_idx] = self.model.predict(clf_val)
        self.preds += self.model.predict(clf_test) / self.kf.n_splits
        self.FIs += self.model.get_feature_importance()
    elif self.clf_type == 'lgb':
        clf_train = lgb.Dataset(tr_X, tr_y)
        # The validation set must reference the training Dataset; the
        # previous code passed the ``lgb.train`` function here by mistake.
        clf_val = lgb.Dataset(val_X, val_y, reference=clf_train)
        self.model = lgb.train(self.params,
                               clf_train,
                               valid_sets=[clf_train, clf_val],
                               verbose_eval=self.verbose_eval)
        self.oof[val_idx] = self.model.predict(
            val_X, num_iteration=self.model.best_iteration)
        self.preds += self.model.predict(
            self.test_X,
            num_iteration=self.model.best_iteration) / self.kf.n_splits
        self.FIs += self.model.feature_importance(importance_type='gain')
    elif self.clf_type == 'xgb':
        clf_train = xgb.DMatrix(tr_X, label=tr_y, feature_names=self.columns)
        clf_val = xgb.DMatrix(val_X, label=val_y, feature_names=self.columns)
        clf_test = xgb.DMatrix(self.test_X, feature_names=self.columns)
        evals = [(clf_train, 'train'), (clf_val, 'eval')]
        evals_result = {}
        nround, early_stop_rounds = self._get_xgb_callbacks()
        self.model = xgb.train(self.params,
                               clf_train,
                               num_boost_round=nround,
                               early_stopping_rounds=early_stop_rounds,
                               verbose_eval=self.verbose_eval,
                               evals=evals,
                               evals_result=evals_result)
        self.oof[val_idx] = self.model.predict(clf_val)
        self.preds += self.model.predict(clf_test) / self.kf.n_splits
        # get_fscore() returns a dict, so it is merged rather than added.
        self.FIs = self.merge_dict_add_values(self.FIs, self.model.get_fscore())
    elif self.clf_type == 'sklearn':
        self.model = self.sk_model
        self.model.fit(tr_X, tr_y)
        self.oof[val_idx] = self.model.predict(val_X)
        self.preds += self.model.predict(self.test_X) / self.kf.n_splits
        self.FIs += self.model.feature_importances_
    else:
        raise ValueError('clf_type is wrong.')
def create_metrics_calcer(self, metrics, thread_count, eval_step=1):
    """Load the model stored at ``self._model_path`` and build a metric calcer.

    ``eval_step`` is passed on as ``eval_period``. Raises ``CatboostError``
    when the model file no longer exists.
    """
    model_path = self._model_path
    if not os.path.exists(model_path):
        raise CatboostError("Model was deleted. Can't create calcer now")

    loaded = CatBoost()
    loaded.load_model(model_path)
    return loaded.create_metric_calcer(
        metrics,
        thread_count=thread_count,
        eval_period=eval_step,
    )
def model_fit_with(train_set, test_sets, cd_file):
    """Return a CatBoost model fitted on ``train_set``.

    The model uses RMSE loss, 12 iterations, seed 0 and no best-model
    selection; ``test_sets`` is the ``eval_set`` and ``cd_file`` the column
    description given to ``fit``.
    """
    regressor = CatBoost(dict(
        use_best_model=False,
        loss_function='RMSE',
        iterations=12,
        random_seed=0,
    ))
    regressor.fit(train_set, eval_set=test_sets, column_description=cd_file)
    return regressor
def test_object_importances():
    """Canonize the top-10 object importance scores of a small RMSE model."""
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    pool = Pool(TEST_FILE, column_description=CD_FILE)

    options = {'loss_function': 'RMSE', 'iterations': 10, 'random_seed': 0}
    model = CatBoost(options)
    model.fit(train_pool)

    # Only the scores are written out; the indices are not part of the
    # canonical file.
    importance = model.get_object_importance(pool, train_pool, top_size=10)
    indices, scores = importance
    np.savetxt(OIMP_PATH, scores)
    return local_canonical_file(OIMP_PATH)
def get_feature_importance(self, additional_params=None, train_pool=None):
    """Return the feature importances CatBoost reports for ``train_pool``.

    The model is built from a copy of ``self.default_parameters`` updated
    with ``additional_params``.
    """
    effective_params = deepcopy(self.default_parameters)
    if additional_params is not None:
        effective_params.update(additional_params)

    # NOTE(review): the model is constructed here but never fitted before
    # the importances are requested - confirm this is intended.
    return CatBoost(effective_params).get_feature_importance(train_pool)
def main():
    """Load the saved ranking model and run ``test_model`` on the biggest data."""
    # Alternative model paths, kept for reference:
    # model_path = f'{PRIVATE_DIR}ranking/model_{LOSS_FUNCTION}_{MIN_APPTS_MODEL}.cbm'
    # model_path = f'{PRIVATE_DIR}ranking/top_one_model_RMSE_7.cbm'
    model_path = f'{PRIVATE_DIR}ranking/top_one_model_QuerySoftMax_3.cbm'

    ranker = CatBoost()
    ranker.load_model(model_path)

    test_model(get_biggest_data(), ranker, sort, max_sort_limit=SORT_LIMIT)
def test_save_model():
    """A model reloaded from its saved file predicts like the original."""
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)

    original = CatBoost()
    original.fit(train_pool)
    original.save_model(OUTPUT_MODEL_PATH)

    restored = CatBoost(model_file=OUTPUT_MODEL_PATH)

    expected = original.predict(test_pool)
    actual = restored.predict(test_pool)
    assert _check_data(expected, actual)
def test_different_cat_features_order():
    """Predictions do not depend on the order of ``cat_features`` indices."""
    dataset = np.array([
        [2, 2, 3, 4],
        [1, 2, 3, 4],
        [1, 2, 3, 4],
        [1, 2, 3, 4],
    ])
    labels = [1.2, 3.4, 9.5, 24.5]

    pool1 = Pool(dataset, labels, cat_features=[0, 1])
    pool2 = Pool(dataset, labels, cat_features=[1, 0])

    model = CatBoost({
        'learning_rate': 1,
        'loss_function': 'RMSE',
        'iterations': 2,
        'random_seed': 42,
    })
    model.fit(pool1)

    predictions_in_order = model.predict(pool1)
    predictions_swapped = model.predict(pool2)
    assert (predictions_in_order == predictions_swapped).all()
def test_object_importances():
    """Write the top-10 object importance scores and return the canonical file."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    eval_pool = Pool(TEST_FILE, column_description=CD_FILE)

    model = CatBoost({
        'loss_function': 'RMSE',
        'iterations': 10,
        'random_seed': 0,
    })
    model.fit(learn_pool)

    # get_object_importance yields (indices, scores); only scores are saved.
    _, scores = model.get_object_importance(eval_pool, learn_pool, top_size=10)
    np.savetxt(OIMP_PATH, scores)
    return local_canonical_file(OIMP_PATH)
def fit(self, params, dtrain, dtest, n_estimators, seed=0):
    """Train CatBoost and return the model, its test errors and the fit time.

    :param params: CatBoost parameter dict. It is updated in place with
        ``iterations`` and ``random_seed``.
    :param dtrain: training data passed to ``CatBoost.fit``.
    :param dtest: evaluation data passed as ``eval_set``.
    :param n_estimators: number of boosting iterations.
    :param seed: random seed.
    :return: ``(bst, results, eval_time)`` where ``results`` is a float array
        built from the last column of every data row of ``test_error.tsv``
        (read from the current working directory) and ``eval_time`` is the
        wall-clock duration of ``fit`` in seconds.
    """
    params.update({"iterations": n_estimators})
    params.update({"random_seed": seed})

    bst = CatBoost(params)
    start_time = time.time()
    bst.fit(dtrain, eval_set=dtest)
    eval_time = time.time() - start_time

    with open("test_error.tsv", "r") as f:
        # The first line is the header.  A list is built explicitly because
        # ``np.array(map(...))`` wraps the lazy iterator on Python 3 and
        # produces a 0-d object array instead of the numbers.
        results = np.array(
            [float(line.strip().split()[-1]) for line in f.readlines()[1:]]
        )

    return bst, results, eval_time
def catboost_train(X_train, y_train, X_valid, y_valid):
    """Fit CatBoost with the module-level ``ctb_params`` and score it.

    Returns ``(model, rmse)`` where ``rmse`` is the root of the mean squared
    error of the predictions for ``X_valid`` against ``y_valid``.
    """
    learn_pool = Pool(X_train, label=y_train)
    holdout_pool = Pool(X_valid, label=y_valid)

    ctb_model = CatBoost(ctb_params)
    ctb_model.fit(
        learn_pool,
        eval_set=[holdout_pool],
        use_best_model=True,
        verbose=500,
    )

    holdout_pred = ctb_model.predict(X_valid)
    holdout_mse = mean_squared_error(y_valid, holdout_pred)
    return ctb_model, np.sqrt(holdout_mse)
class CatBoostRanker(Ranker):
    """Ranker backed by a ``CatBoost`` model built from ``params``."""

    def __init__(self, params):
        """Keep ``params`` and create the model.

        Raises ``Exception`` when the loss is ``PairLogitPairwise`` and
        ``max_depth`` is 8 or more.
        """
        self.params = params

        uses_pairwise_loss = params['loss_function'] == 'PairLogitPairwise'
        # max_depth is only looked up for the pairwise loss.
        if uses_pairwise_loss and params['max_depth'] >= 8:
            raise Exception('max_depth for pair-logit-pairwise should be < 8')

        self.model = CatBoost(params)

    def fit(self, data):
        """Fit the model on ``data.train_pool``."""
        self.model.fit(X=data.train_pool)

    def staged_predict(self, data, eval_period):
        """Return the staged predictions for ``data.test_pool`` as a list."""
        stages = self.model.staged_predict(data.test_pool, eval_period=eval_period)
        return list(stages)
def test_option_used_ram_limit():
    """``used_ram_limit`` accepts valid limits and rejects invalid ones.

    Every value in the first list must construct a ``CatBoost`` object; every
    value in the second list must make the constructor raise.
    """
    accepted_limits = [
        1000, 1234.56, 0, 0.0, 0.5,
        '100', '34.56', '0', '0.0', '0.5',
        '1.2mB', '1000b', '', None, 'none', 'inf',
    ]
    for limit in accepted_limits:
        CatBoost({'used_ram_limit': limit})

    for limit in [-1000, 'any', '-0.5', 'nolimit', 'oo']:
        try:
            CatBoost({'used_ram_limit': limit})
        except Exception:
            # Any constructor error counts as a rejection of the value.
            continue
        # The assertion sits outside the ``try``: the previous bare
        # ``except`` swallowed its own ``assert False``, so the check could
        # never fail.
        assert False, "Shall not allow used_ram_limit={!r}".format(limit)
def _predict(self, model: cb.CatBoost, pool: cb.Pool, params):
    """Predict for ``pool`` according to ``self.task.name``.

    ``multiclass`` returns the probability output, ``binary`` its last-axis
    entry 1, and ``reg`` the plain prediction. For any other task name the
    prediction stays ``None``. In every case the value is passed through
    ``self.task.losses["cb"].bw_func`` before it is returned.
    """
    task_name = self.task.name
    pred = None

    if task_name in ("multiclass", "binary"):
        pred = model.predict(
            pool,
            prediction_type="Probability",
            thread_count=params["thread_count"],
        )
        if task_name == "binary":
            # Keep entry 1 along the last axis.
            pred = pred[..., 1]
    elif task_name == "reg":
        pred = model.predict(pool, thread_count=params["thread_count"])

    return self.task.losses["cb"].bw_func(pred)
def fit_model(self, additional_params=None, train_pool=None, test_pool=None):
    """Fit and return a CatBoost model.

    The parameters are a copy of ``self.default_parameters`` updated with
    ``additional_params``; ``test_pool`` is used as the eval set.
    """
    effective_params = deepcopy(self.default_parameters)
    if additional_params is not None:
        effective_params.update(additional_params)

    fitted = CatBoost(effective_params)
    fitted.fit(train_pool, eval_set=test_pool, plot=False)
    return fitted
def test_eval_metrics(loss_function):
    """``eval_metrics`` matches the per-iteration values logged by ``fit``.

    ``QueryRMSE`` uses the querywise data set with the ``PFound`` metric;
    any other loss is evaluated with itself as metric.
    """
    if loss_function == 'QueryRMSE':
        train, test, cd = QUERYWISE_TRAIN_FILE, QUERYWISE_TEST_FILE, QUERYWISE_CD_FILE
        metric = 'PFound'
    else:
        train, test, cd = TRAIN_FILE, TEST_FILE, CD_FILE
        metric = loss_function

    train_pool = Pool(train, column_description=cd)
    test_pool = Pool(test, column_description=cd)
    model = CatBoost(params={
        'loss_function': loss_function,
        'random_seed': 0,
        'iterations': 20,
        'thread_count': 8,
        'eval_metric': metric,
    })
    model.fit(train_pool, eval_set=test_pool, use_best_model=False)

    logged = np.loadtxt('./test_error.tsv', skiprows=1)[:, 1]
    first_metrics = np.round(logged, 10)

    # The first value returned by eval_metrics is dropped before comparing.
    evaluated = model.eval_metrics(test_pool, [metric])[metric][1:]
    second_metrics = np.round(evaluated, 10)

    assert np.all(first_metrics == second_metrics)
def load(self, model_file_path):
    """Load a CatBoost model from ``model_file_path`` into ``self.model``."""
    logger.debug("CatBoostLearner load model from %s" % model_file_path)

    # waiting for fix https://github.com/catboost/catboost/issues/696
    #
    # Algo = CatBoostClassifier
    # loss_function = "Logloss"
    # if self.params["ml_task"] == MULTICLASS_CLASSIFICATION:
    #     loss_function = "MultiClass"
    # elif self.params["ml_task"] == REGRESSION:
    #     loss_function = self.params.get("loss_function", "RMSE")
    #     Algo = CatBoostRegressor

    # The value returned by load_model is what gets stored.
    loaded = CatBoost().load_model(model_file_path)
    self.model = loaded
def test_python_export_from_app(dataset):
    """The exported Python model predicts like the binary model.

    The ``.cbm`` model is loaded with CatBoost, the exported Python source is
    executed in a fresh namespace, and the raw formula values of both are
    compared on the test pool.
    """
    _, test_pool = _get_train_test_pool(dataset)
    _, model_py, model_cbm = _get_cpp_py_cbm_model(dataset)

    model = CatBoost()
    model.load_model(model_cbm)
    pred_model = model.predict(test_pool, prediction_type='RawFormulaVal')

    scope = {}
    # ``execfile`` does not exist on Python 3; compiling the file contents
    # with the file name keeps tracebacks pointing at the exported source.
    with open(model_py) as source_file:
        exec(compile(source_file.read(), model_py, 'exec'), scope)
    pred_python = _predict_python(test_pool, scope['apply_catboost_model'])

    assert _check_data(pred_model, pred_python)
def fit(
    self,
    dataset: DatasetH,
    num_boost_round=1000,
    early_stopping_rounds=50,
    verbose_eval=20,
    evals_result=None,
    reweighter=None,
    **kwargs
):
    """Train on the ``train`` segment and validate on the ``valid`` segment.

    :param dataset: source of the ``train``/``valid`` feature and label
        frames.
    :param num_boost_round: stored as the ``iterations`` parameter.
    :param early_stopping_rounds: stored as ``early_stopping_rounds``.
    :param verbose_eval: stored as ``verbose_eval``.
    :param evals_result: optional dict that receives the metric history under
        the keys ``"train"`` and ``"valid"``. It used to default to a shared
        ``dict()`` that was rebound locally, so a dict passed by the caller
        was never filled; it is now updated in place.
    :param reweighter: ``None`` or a ``Reweighter`` that supplies sample
        weights for both segments.
    :param kwargs: forwarded to both the ``CatBoost`` constructor and
        ``CatBoost.fit``.
    :raises ValueError: when either segment is empty, the label frame is not
        a single column, or ``reweighter`` has an unsupported type.
    """
    df_train, df_valid = dataset.prepare(
        ["train", "valid"],
        col_set=["feature", "label"],
        data_key=DataHandlerLP.DK_L,
    )
    if df_train.empty or df_valid.empty:
        raise ValueError("Empty data from dataset, please check your dataset config.")
    x_train, y_train = df_train["feature"], df_train["label"]
    x_valid, y_valid = df_valid["feature"], df_valid["label"]

    # CatBoost needs 1D array as its label
    if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
        y_train_1d, y_valid_1d = np.squeeze(y_train.values), np.squeeze(y_valid.values)
    else:
        raise ValueError("CatBoost doesn't support multi-label training")

    if reweighter is None:
        w_train = None
        w_valid = None
    elif isinstance(reweighter, Reweighter):
        w_train = reweighter.reweight(df_train).values
        w_valid = reweighter.reweight(df_valid).values
    else:
        raise ValueError("Unsupported reweighter type.")

    train_pool = Pool(data=x_train, label=y_train_1d, weight=w_train)
    valid_pool = Pool(data=x_valid, label=y_valid_1d, weight=w_valid)

    # Initialize the catboost model
    self._params["iterations"] = num_boost_round
    self._params["early_stopping_rounds"] = early_stopping_rounds
    self._params["verbose_eval"] = verbose_eval
    self._params["task_type"] = "GPU" if get_gpu_device_count() > 0 else "CPU"
    self.model = CatBoost(self._params, **kwargs)

    # train the model
    self.model.fit(train_pool, eval_set=valid_pool, use_best_model=True, **kwargs)

    history = self.model.get_evals_result()
    if evals_result is not None:
        # The first metric recorded for each data set is reported.
        evals_result["train"] = list(history["learn"].values())[0]
        evals_result["valid"] = list(history["validation"].values())[0]
def test_querywise():
    """QueryRMSE predicts the same from file pools and from frames + ``group_id``."""
    train_pool = Pool(QUERYWISE_TRAIN_FILE, column_description=QUERYWISE_CD_FILE)
    test_pool = Pool(QUERYWISE_TEST_FILE, column_description=QUERYWISE_CD_FILE)
    model = CatBoost(params={
        'loss_function': 'QueryRMSE',
        'random_seed': 0,
        'iterations': 2,
        'thread_count': 8,
    })
    model.fit(train_pool)
    pred1 = model.predict(test_pool)

    # Column 0 is the query id, column 1 the target; columns 0-3 are not
    # features.
    non_feature_columns = [0, 1, 2, 3]

    df = read_table(QUERYWISE_TRAIN_FILE, delimiter='\t', header=None)
    train_query_id = df.loc[:, 0]
    train_target = df.loc[:, 1]
    train_data = df.drop(non_feature_columns, axis=1).astype(str)

    df = read_table(QUERYWISE_TEST_FILE, delimiter='\t', header=None)
    test_data = df.drop(non_feature_columns, axis=1).astype(str)

    model.fit(train_data, train_target, group_id=train_query_id)
    pred2 = model.predict(test_data)

    assert _check_data(pred1, pred2)
def train_and_predict(self, train, valid, test, param, colum):
    """Fit CatBoost with early stopping and predict for ``test``.

    :param train: ``(features, labels)`` pair used for training.
    :param valid: ``(features, labels)`` pair used for validation.
    :param test: features to predict for.
    :param param: CatBoost parameters.
    :param colum: not used by this method.
    :return: ``(model, pred, evals_result)`` where ``evals_result`` is the
        metric history reported by the fitted model.
    """
    cat_tr = Pool(train[0], label=train[1])
    cat_val = Pool(valid[0], label=valid[1])
    cat_test = Pool(test)

    model = CatBoost(param)
    model.fit(
        cat_tr,
        eval_set=[cat_tr, cat_val],
        early_stopping_rounds=100,
        verbose_eval=150,
    )
    pred = model.predict(cat_test)

    # The history belongs to the fitted model; the previous code called a
    # bare ``get_evals_result()`` name.  It is fetched once and reused.
    evals_result = model.get_evals_result()
    print(evals_result)
    return model, pred, evals_result
def test_eval_metrics(loss_function):
    """``eval_metrics`` matches the per-iteration values logged by ``fit``.

    ``QueryRMSE`` uses the querywise data set with ``PFound``, ``Logloss`` is
    evaluated with ``AUC``, and any other loss with itself as metric.
    """
    train, test, cd = TRAIN_FILE, TEST_FILE, CD_FILE
    metric = loss_function
    if loss_function == 'QueryRMSE':
        train, test, cd = QUERYWISE_TRAIN_FILE, QUERYWISE_TEST_FILE, QUERYWISE_CD_FILE
        metric = 'PFound'
    elif loss_function == 'Logloss':
        metric = 'AUC'

    train_pool = Pool(train, column_description=cd)
    test_pool = Pool(test, column_description=cd)
    model = CatBoost(params={
        'loss_function': loss_function,
        'random_seed': 0,
        'iterations': 20,
        'thread_count': 8,
        'eval_metric': metric,
    })
    model.fit(train_pool, eval_set=test_pool, use_best_model=False)

    logged = np.loadtxt('catboost_info/test_error.tsv', skiprows=1)[:, 1]
    first_metrics = np.round(logged, 10)

    evaluated = model.eval_metrics(test_pool, [metric])[metric]
    second_metrics = np.round(evaluated, 10)

    assert np.all(first_metrics == second_metrics)
def test_export_model_with_cat_features_to_python_from_app():
    """The app-exported Python model predicts like the binary model.

    The binary model resource ``cb_adult_model_bin`` is written to
    ``OUTPUT_MODEL_PATH`` and loaded with CatBoost; its raw formula values are
    compared with the output of ``adult_model.apply_catboost_model`` for
    every test row.
    """
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoost()

    # The resource is a binary model file, so it is written in binary mode:
    # text mode rejects bytes on Python 3 and may translate newlines.
    with open(OUTPUT_MODEL_PATH, "wb") as model_file:
        model_file.write(resource.find("cb_adult_model_bin"))
    model.load_model(OUTPUT_MODEL_PATH)
    pred_model = model.predict(test_pool, prediction_type='RawFormulaVal')

    from adult_model import apply_catboost_model as apply_catboost_model_from_app

    pred_python = []
    for test_line in test_pool.get_features():
        float_features, cat_features = _split_features(
            test_line,
            test_pool.get_cat_feature_indices(),
            test_pool.get_cat_feature_hash_to_string())
        pred_python.append(
            apply_catboost_model_from_app(float_features, cat_features))

    assert _check_data(pred_model, pred_python)
def s3_catboost_scoring_model(test_db):
    """Train a dummy CatBoost scoring model, upload it and register it.

    Returns the name under which the model was saved to the database.
    """
    name = 'test_scoring_model'
    features = ['chaos', 'chaos_fdr', 'mz_err_abs_fdr']

    # Train a model that just predicts the chaos metric and ignores the
    # other features.
    dummy_X = pd.DataFrame({
        'chaos': np.linspace(0, 1, 101),
        'chaos_fdr': 0,
        'mz_err_abs_fdr': 0,
    })
    training_options = {
        'iterations': 10,
        'feature_weights': {0: 1, 1: 0, 2: 0},
        'verbose': False,
    }
    regressor = CatBoost(training_options)
    model = regressor.fit(Pool(dummy_X, dummy_X.chaos.values))

    # Upload the model to S3
    params = upload_catboost_scoring_model(model, BUCKET_NAME, name, False, dummy_X)
    save_scoring_model_to_db(name, 'catboost', params)

    return name
def _predict(self, model: cb.CatBoost, pool: cb.Pool, params):
    """Return the back-transformed prediction for ``pool``.

    The raw prediction depends on ``self.task.name``: probabilities for
    ``multiclass``, entry 1 along the last axis of the probabilities for
    ``binary``, the plain prediction for ``reg`` and ``None`` otherwise.
    The raw value always goes through ``self.task.losses['cb'].bw_func``.
    """
    task_name = self.task.name

    if task_name == 'multiclass' or task_name == 'binary':
        raw = model.predict(
            pool,
            prediction_type='Probability',
            thread_count=params['thread_count'],
        )
        if task_name == 'binary':
            raw = raw[..., 1]
    elif task_name == 'reg':
        raw = model.predict(pool, thread_count=params['thread_count'])
    else:
        raw = None

    pred = self.task.losses['cb'].bw_func(raw)
    return pred
def test_save_model():
    """Saving a fitted model and loading it via ``model_file`` keeps predictions."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    eval_pool = Pool(TEST_FILE, column_description=CD_FILE)

    model = CatBoost()
    model.fit(learn_pool)
    model.save_model(OUTPUT_MODEL_PATH)

    # The second model is built directly from the file written above.
    model2 = CatBoost(model_file=OUTPUT_MODEL_PATH)

    pred_before_save = model.predict(eval_pool)
    pred_after_load = model2.predict(eval_pool)
    assert _check_data(pred_before_save, pred_after_load)
def test_eval_set():
    """``fit`` accepts an eval set as a tuple and as a one-element list.

    The JSON training log, with timing removed, is returned as the canonical
    file.
    """
    dataset = [(1, 2, 3, 4), (2, 2, 3, 4), (3, 2, 3, 4), (4, 2, 3, 4)]
    labels = [1, 2, 3, 4]
    train_pool = Pool(dataset, labels, cat_features=[0, 3, 2])

    model = CatBoost({
        'learning_rate': 1,
        'loss_function': 'RMSE',
        'iterations': 2,
        'random_seed': 0,
    })

    eval_dataset = [(5, 6, 6, 6), (6, 6, 6, 6)]
    eval_labels = [5, 6]
    eval_pool = (eval_dataset, eval_labels)

    # First as a bare (data, labels) tuple, then wrapped in a list.
    for candidate in (eval_pool, [eval_pool]):
        model.fit(train_pool, eval_set=candidate)

    return local_canonical_file(remove_time_from_json(JSON_LOG_PATH))
def test_multiple_eval_sets_no_empty():
    """``fit`` refuses empty or ``None`` entries among several eval sets.

    An empty ``Pool`` cannot be created either, while a lone ``None`` eval
    set is fine.
    """
    cat_features = [0, 3, 2]
    cd_file = yatest.common.test_output_path('cd.txt')
    with open(cd_file, 'wt') as cd:
        cd.write('0\tTarget\n')
        for feature_no in sorted(cat_features):
            cd.write('{}\tCateg\n'.format(1 + feature_no))

    x, y = random_xy(6, 4)
    train_pool = Pool(x, y, cat_features=cat_features)

    x0, y0 = random_xy(0, 4)  # empty tuple eval set
    x1, y1 = random_xy(3, 4)
    test0_file = save_and_give_path(y0, x0, 'test0.txt')  # empty file eval set

    empty_pool_created = True
    try:
        Pool(x0, y0, cat_features=cat_features)
    except CatboostError:
        empty_pool_created = False
    assert not empty_pool_created, "Do not create Pool for empty data"

    model = CatBoost({
        'learning_rate': 1,
        'loss_function': 'RMSE',
        'iterations': 2,
        'random_seed': 0,
    })

    def fit_is_rejected(eval_sets):
        # True when fit() raises CatboostError for the given eval sets.
        try:
            model.fit(train_pool, eval_set=eval_sets, column_description=cd_file)
        except CatboostError:
            return True
        return False

    assert fit_is_rejected([(x1, y1), (x0, y0)]), \
        "Do not fit with empty tuple in multiple eval sets"
    assert fit_is_rejected([(x1, y1), test0_file]), \
        "Do not fit with empty file in multiple eval sets"
    assert fit_is_rejected([(x1, y1), None]), \
        "Do not fit with None in multiple eval sets"
    assert not fit_is_rejected([None]), "Ok to have one eval set None"
def test_fit_from_file():
    """Fitting from a ``Pool`` and from the file path gives equal predictions."""
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoost({
        'iterations': 2,
        'random_seed': 0,
        'loss_function': 'RMSE',
    })

    model.fit(train_pool)
    predictions_from_pool = model.predict(train_pool)

    # Same data, this time handed over as a path plus column description.
    model.fit(TRAIN_FILE, column_description=CD_FILE)
    predictions_from_file = model.predict(train_pool)

    assert all(predictions_from_pool == predictions_from_file)
def _fit_model(pool, case, fold_id, model_path):
    """Train the model of ``case`` on ``pool`` and save it to ``model_path``.

    Raises ``CatboostError`` when the case ignores a feature index that is
    not smaller than the number of pool columns. Returns a ``FoldModel`` for
    the saved file.
    """
    from catboost import CatBoost

    # Learn model
    make_dirs_if_not_exists(FoldModelsHandler.__MODEL_DIR)

    feature_count = pool.num_col()
    case_params = case.get_params()
    if "ignored_features" in case_params:
        ignored_features = case.get_params()["ignored_features"]
        out_of_range = (
            len(ignored_features) and max(ignored_features) >= feature_count
        )
        if out_of_range:
            raise CatboostError("Error: input parameter contains feature indices wich are not available in pool: "
                                "{}\n "
                                "Check eval_feature set and ignored features options".format(ignored_features))

    logger = get_eval_logger()
    logger.debug('Learn model {} on fold #{}'.format(str(case), fold_id))

    started_at = time.time()
    instance = CatBoost(params=case.get_params())
    instance.fit(pool)
    instance.save_model(fname=model_path)
    get_eval_logger().debug(
        'Operation was done in {} seconds'.format(time.time() - started_at))

    return FoldModel(case, model_path, fold_id)
def test_eval_metrics_batch_calcer(loss_function):
    """Metric values from the batch calcer equal those logged during ``fit``.

    ``QueryRMSE`` is checked on the querywise files with ``PFound``; every
    other loss on the default files with the loss itself as metric.
    """
    if loss_function == 'QueryRMSE':
        files = (QUERYWISE_TRAIN_FILE, QUERYWISE_TEST_FILE, QUERYWISE_CD_FILE)
        metric = 'PFound'
    else:
        files = (TRAIN_FILE, TEST_FILE, CD_FILE)
        metric = loss_function
    train, test, cd = files

    train_pool = Pool(train, column_description=cd)
    test_pool = Pool(test, column_description=cd)

    model = CatBoost(params={
        'loss_function': loss_function,
        'random_seed': 0,
        'iterations': 100,
        'thread_count': 8,
        'eval_metric': metric,
    })
    model.fit(train_pool, eval_set=test_pool, use_best_model=False)

    log_table = np.loadtxt('catboost_info/test_error.tsv', skiprows=1)
    first_metrics = np.round(log_table[:, 1], 10)

    calcer = model.create_metric_calcer([metric])
    calcer.add(test_pool)
    recalculated = calcer.eval_metrics().get_result(metric)
    second_metrics = np.round(recalculated, 10)

    assert np.all(first_metrics == second_metrics)
def test_coreml_import_export():
    """A model saved as CoreML and reloaded predicts exactly like the original.

    The written CoreML file is returned as the canonical file.
    """
    train_pool = Pool(QUERYWISE_TRAIN_FILE, column_description=QUERYWISE_CD_FILE)
    test_pool = Pool(QUERYWISE_TEST_FILE, column_description=QUERYWISE_CD_FILE)

    model = CatBoost(params={
        'loss_function': 'QueryRMSE',
        'random_seed': 0,
        'iterations': 20,
        'thread_count': 8,
    })
    model.fit(train_pool)
    model.save_model(OUTPUT_COREML_MODEL_PATH, format="coreml")
    canon_pred = model.predict(test_pool)

    coreml_loaded_model = CatBoostRegressor()
    coreml_loaded_model.load_model(OUTPUT_COREML_MODEL_PATH, format="coreml")
    reloaded_pred = coreml_loaded_model.predict(test_pool)

    assert all(canon_pred == reloaded_pred)
    return local_canonical_file(OUTPUT_COREML_MODEL_PATH)
def test_querywise():
    """QueryRMSE predicts the same from file pools and from frames + ``query_id``."""
    train_pool = Pool(QUERY_TRAIN_FILE, column_description=QUERY_CD_FILE)
    test_pool = Pool(QUERY_TEST_FILE, column_description=QUERY_CD_FILE)

    model = CatBoost(params={
        'loss_function': 'QueryRMSE',
        'random_seed': 0,
        'iterations': 2,
        'thread_count': 8,
    })
    model.fit(train_pool)
    pool_predictions = model.predict(test_pool)

    train_frame = read_table(QUERY_TRAIN_FILE, delimiter='\t', header=None)
    train_query_id = train_frame.loc[:, 0]
    train_target = train_frame.loc[:, 1]
    # Columns 0-3 are dropped; the remaining columns are cast to str.
    train_data = train_frame.drop([0, 1, 2, 3], axis=1).astype(str)

    test_frame = read_table(QUERY_TEST_FILE, delimiter='\t', header=None)
    test_data = test_frame.drop([0, 1, 2, 3], axis=1).astype(str)

    model.fit(train_data, train_target, query_id=train_query_id)
    frame_predictions = model.predict(test_data)

    assert _check_data(pool_predictions, frame_predictions)
def test_predict_regress():
    """Fit a two-iteration RMSE model, save it and compare with the canonical model."""
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    options = {'iterations': 2, 'random_seed': 0, 'loss_function': 'RMSE'}
    regressor = CatBoost(options)
    regressor.fit(train_pool)
    regressor.save_model(OUTPUT_MODEL_PATH)

    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_invalid_loss_base():
    """An unknown loss function name leads to ``CatboostError``."""
    invalid_options = {"loss_function": "abcdef"}
    # Pool creation, model construction and fit all run inside the
    # expectation block.
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoost(invalid_options)
        model.fit(pool)
def create_metrics_calcer(self, metrics, thread_count, eval_step=1):
    """Build a metric calcer from the model file at ``self._model_path``.

    The calcer is created with ``thread_count`` threads and ``eval_step`` as
    its evaluation period. ``CatboostError`` is raised when the model file is
    missing.
    """
    if os.path.exists(self._model_path):
        model = CatBoost()
        model.load_model(self._model_path)
        calcer_options = {'thread_count': thread_count, 'eval_period': eval_step}
        return model.create_metric_calcer(metrics, **calcer_options)

    raise CatboostError("Model was deleted. Can't create calcer now")
def test_python_export_no_cat_features():
    """Export a model trained on the querywise data set as Python code.

    The exported file is returned as the canonical file.
    """
    train_pool = Pool(QUERYWISE_TRAIN_FILE, column_description=QUERYWISE_CD_FILE)

    options = {'iterations': 2, 'random_seed': 0, 'loss_function': 'RMSE'}
    exported = CatBoost(options)
    exported.fit(train_pool)
    exported.save_model(OUTPUT_PYTHON_MODEL_PATH, format="python")

    return local_canonical_file(OUTPUT_PYTHON_MODEL_PATH)
def test_python_export_with_cat_features():
    """Export a 20-iteration model trained on ``TRAIN_FILE`` as Python code.

    The exported file is returned as the canonical file.
    """
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    exported = CatBoost(dict(iterations=20, random_seed=0))
    exported.fit(train_pool)
    exported.save_model(OUTPUT_PYTHON_MODEL_PATH, format="python")

    return local_canonical_file(OUTPUT_PYTHON_MODEL_PATH)