예제 #1
1
 def model_fit_with(train_set, test_sets, cd_file):
     """Fit a 12-iteration RMSE CatBoost model and return it.

     ``test_sets`` is forwarded to ``fit`` as ``eval_set`` and ``cd_file``
     as ``column_description``.
     """
     settings = dict(use_best_model=False, loss_function='RMSE', iterations=12, random_seed=0)
     booster = CatBoost(settings)
     booster.fit(train_set, eval_set=test_sets, column_description=cd_file)
     return booster
예제 #2
0
def fit_catboost(X, y, cv=None, params: dict = None, verbose=500):
    """Train CatBoost fold by fold and collect out-of-fold predictions.

    :param X: feature array; it is indexed with the index arrays yielded by
        ``cv.split``.
    :param y: target array, indexed the same way as ``X``.
    :param cv: splitter exposing ``split(X, y)``; defaults to a shuffled
        2-fold ``StratifiedKFold``.
    :param params: CatBoost parameters; defaults to a deep copy of
        ``CAT_DEFAULT_PARAMS``.
    :param verbose: kept for interface compatibility; not used by this function.
    :return: ``(oof_pred, models, score)`` -- the out-of-fold positive-class
        probabilities, the list of per-fold models, and the ROC AUC of
        ``oof_pred`` against ``y``.
    """
    if params is None:
        params = deepcopy(CAT_DEFAULT_PARAMS)

    if cv is None:
        cv = StratifiedKFold(n_splits=2, shuffle=True)
    models = []
    # Allocate a zero array with the same shape as the training target.
    # The dtype must be float: with an integer target the probabilities
    # written below would otherwise be truncated. The builtin ``float`` is
    # used because the ``np.float`` alias was removed from NumPy.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv.split(X, y)):
        # Cross-validation step: the cv instance splits the training data
        # into a train part and a validation part.
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = CatBoost(params=params)

        with timer(prefix='fit fold={} '.format(i + 1)):
            clf_train = Pool(x_train, y_train)
            clf_val = Pool(x_valid, y_valid)
            clf.fit(clf_train, eval_set=[clf_val])

        # Column 1 of the probability output is the positive class.
        pred_i = clf.predict(x_valid, prediction_type='Probability')[:, 1]
        oof_pred[idx_valid] = pred_i
        models.append(clf)

        print(f'Fold {i} AUC: {roc_auc_score(y_valid, pred_i):.4f}')

    score = roc_auc_score(y, oof_pred)
    # The backslash is escaped explicitly; the printed text is unchanged.
    print('FINISHED \\ whole score: {:.4f}'.format(score))
    return oof_pred, models, score
예제 #3
0
def show_shap_summary(dataset, labels):
    """Embed ``dataset`` with SNoRe, fit a CatBoost model and plot SHAP summaries.

    :param dataset: input passed to ``SNoRe().embed``.
    :param labels: label matrix; it must provide ``toarray()`` and a
        two-dimensional ``shape`` (one column per class).
    :return: None; one summary plot per class and one combined plot are shown.
    """
    # Create the embedding
    embedder = SNoRe()
    embedding = embedder.embed(dataset)
    node_indexes = embedder.selected_features

    # Create the classification (regression) model
    # We used CatBoost instead of XGBoost (used in the paper), because is simpler to setup since XGBoost has some bugs
    classifier = CatBoost(params={'loss_function': 'MultiRMSE', "iterations": 250})

    # Both the sparse and the dense frame use the same column labels.
    column_names = ["node " + str(node) for node in node_indexes]
    if sparse.issparse(embedding):
        df = pd.DataFrame.sparse.from_spmatrix(embedding, columns=column_names)
    else:
        df = pd.DataFrame(data=embedding, columns=column_names)
    classifier.fit(df, labels.toarray())

    # Explain the classification (regression) model
    explainer = shap.TreeExplainer(classifier)
    shap_values = explainer.shap_values(df)

    for i in range(labels.shape[1]):
        # show=False defers rendering so the title can be set first.
        shap.summary_plot(shap_values[i],
                          df,
                          show=False,
                          plot_size=None)
        plt.title("Class" + str(i))
        plt.show()
    shap.summary_plot(shap_values, df, show=False)
    plt.show()
예제 #4
0
def load_scoring_model(name: Optional[str]) -> ScoringModel:
    """Return the scoring model registered under ``name``.

    ``None`` selects the default ``MsmScoringModel``. Otherwise the row is
    looked up in the ``scoring_model`` table; a ``catboost`` row has its
    model file downloaded from S3 into a temporary directory and loaded.

    Raises ``AssertionError`` when no row is found and ``ValueError`` for a
    model type other than ``catboost``.
    """
    # Import DB locally so that Lithops doesn't try to pickle it & fail due to psycopg2
    # pylint: disable=import-outside-toplevel  # circular import
    from sm.engine.db import DB

    if name is None:
        return MsmScoringModel()

    query = "SELECT type, params FROM scoring_model WHERE name = %s"
    row = DB().select_one(query, (name, ))
    assert row, f'Scoring model {name} not found'
    type_, params = row

    # Guard clause: only CatBoost models are handled below.
    if type_ != 'catboost':
        raise ValueError(f'Unsupported scoring model type: {type_}')

    bucket, key = split_s3_path(params['s3_path'])
    with TemporaryDirectory() as tmpdir:
        model_file = Path(tmpdir) / 'model.cbm'
        with model_file.open('wb') as f:
            s3_object = get_s3_client().get_object(Bucket=bucket, Key=key)
            f.write(s3_object['Body'].read())
        # Load while the temporary directory still exists.
        model = CatBoost()
        model.load_model(str(model_file), 'cbm')

    return CatBoostScoringModel(name, model, params)
예제 #5
0
    def fit_regressor(self, X, y, features=None, display=False) -> None:
        """
        Run a CatBoost grid search, then fit the model and keep it on ``self``.

        :param X: feature table; the code reads ``X.columns`` and, when
                ``features`` is given, selects ``X[features]``.
        :param y: array of snippets' complexity metric values; flattened
                with ``ravel()`` for the final fit.
        :param features: set of features to train
        :param display: show additional output
        :return: None
        """
        regressor = CatBoost()

        search_space = dict(
            learning_rate=[0.03, 0.1],
            depth=[4, 6, 10],
            l2_leaf_reg=[1, 3, 5, 7, 9],
        )
        if features:
            X = X[features]

        # Remember the column order used for training.
        self.features_conf = {'features_order': X.columns}

        regressor.grid_search(search_space, X=X, y=y, verbose=display)

        self.model = regressor
        self.model.fit(X, y.ravel(), logging_level='Silent')
예제 #6
0
def test_multiple_eval_sets_no_empty():
    """Empty or ``None`` entries among several eval sets must be rejected.

    A lone ``None`` eval set, by contrast, must be accepted.
    """
    cat_features = [0, 3, 2]
    cd_file = yatest.common.test_output_path('cd.txt')
    with open(cd_file, 'wt') as cd:
        cd.write('0\tTarget\n')
        # Column 0 is the target, so feature i lives in column i + 1.
        cd.writelines('{}\tCateg\n'.format(1 + feature_no) for feature_no in sorted(cat_features))

    x, y = random_xy(6, 4)
    train_pool = Pool(x, y, cat_features=cat_features)

    x0, y0 = random_xy(0, 4)  # empty tuple eval set
    x1, y1 = random_xy(3, 4)
    test0_file = save_and_give_path(y0, x0, 'test0.txt')  # empty file eval set

    empty_pool_rejected = False
    try:
        Pool(x0, y0, cat_features=cat_features)
    except CatboostError:
        empty_pool_rejected = True
    assert empty_pool_rejected, "Do not create Pool for empty data"

    model = CatBoost(dict(learning_rate=1, loss_function='RMSE', iterations=2, random_seed=0))

    def fit_is_rejected(eval_sets):
        """Return True when fitting with ``eval_sets`` raises CatboostError."""
        try:
            model.fit(train_pool, eval_set=eval_sets, column_description=cd_file)
        except CatboostError:
            return True
        return False

    rejected = fit_is_rejected([(x1, y1), (x0, y0)])
    assert rejected, "Do not fit with empty tuple in multiple eval sets"

    rejected = fit_is_rejected([(x1, y1), test0_file])
    assert rejected, "Do not fit with empty file in multiple eval sets"

    rejected = fit_is_rejected([(x1, y1), None])
    assert rejected, "Do not fit with None in multiple eval sets"

    try:
        model.fit(train_pool, eval_set=[None], column_description=cd_file)
    except CatboostError:
        assert False, "Ok to have one eval set None"
예제 #7
0
파일: test.py 프로젝트: iamnik13/catboost
def test_pairwise():
    """PairLogit predictions must match for file pools, in-memory data and unit pair weights."""
    def feature_frame(frame):
        # Drop columns 0, 1 and 15 and treat every remaining value as a string.
        return frame.drop([0, 1, 15], axis=1).astype(str)

    train_pool = Pool(ZEN_TRAIN_FILE, column_description=ZEN_CD_FILE, pairs=ZEN_TRAIN_PAIRS_FILE)
    test_pool = Pool(ZEN_TEST_FILE, column_description=ZEN_CD_FILE, pairs=ZEN_TEST_PAIRS_FILE)
    settings = {'loss_function': 'PairLogit', 'random_seed': 0, 'iterations': 2, 'thread_count': 8}
    model = CatBoost(params=settings)
    model.fit(train_pool)
    pred1 = model.predict(test_pool)

    train_frame = read_table(ZEN_TRAIN_FILE, delimiter='\t', header=None, dtype={12: str})
    train_target = train_frame.loc[:, 1]
    cat_features = range(13)
    train_data = feature_frame(train_frame)
    train_pairs = read_table(ZEN_TRAIN_PAIRS_FILE, delimiter='\t', header=None)

    test_frame = read_table(ZEN_TEST_FILE, delimiter='\t', header=None, dtype={12: str})
    test_data = feature_frame(test_frame)

    # Same training data, supplied in memory instead of through files.
    model.fit(train_data, train_target, cat_features, pairs=train_pairs)
    pred2 = model.predict(test_data)

    # One weight of 1 per pair.
    unit_weights = np.ones(train_pairs.shape[0])
    model.fit(train_data, train_target, cat_features, pairs=train_pairs, pairs_weight=unit_weights)
    pred3 = model.predict(test_data)

    assert _check_data(pred1, pred2)
    assert _check_data(pred1, pred3)
예제 #8
0
    def fit_cv(self, x, y, groups, train_indices, test_indices, **fit_params):
        """Fit a CatBoost model on the training rows of ``x`` and return it.

        :param x: DataFrame holding the features plus the ``label`` and ``id``
            columns; ``label`` is used as the target and ``id`` as the group id.
        :param y: not used by this method.
        :param groups: not used by this method.
        :param train_indices: positional row indices used for training.
        :param test_indices: not used by this method.
        :param fit_params: overrides merged into a copy of
            ``self.default_parameters``.
        :return: the fitted model, also stored in ``self.ctb``.
        """
        parameters = deepcopy(self.default_parameters)
        # **fit_params is always a dict (possibly empty), so it can be merged directly.
        parameters.update(fit_params)

        self.ctb = CatBoost(parameters)

        train_values = x.drop(['label', 'id'], axis=1)

        # Columns treated as categorical regardless of their value type.
        forced_categorical = (
            'day',
            'past_closest_action_involving_impression',
            'future_closest_action_involving_impression',
        )
        first_row = train_values.head(1)
        self.categorical_features = []
        # enumerate gives each column's position directly, avoiding a list
        # search per column (which would also return the first match for
        # duplicated column names).
        for position, f in enumerate(train_values.columns.values):
            if isinstance(first_row[f].values[0], str) or f in forced_categorical:
                self.categorical_features.append(position)
                print(f + ' is categorical!')

        if not self.categorical_features:
            self.categorical_features = None

        train_with_weights = Pool(data=train_values.values[train_indices, :],
                                  label=x['label'].values[train_indices],
                                  group_id=x['id'].values[train_indices],
                                  cat_features=self.categorical_features)

        self.ctb.fit(train_with_weights, plot=False)

        return self.ctb
예제 #9
0
def test_eval_metrics_batch_calcer(loss_function):
    """The batch metric calcer must reproduce the test metric logged during training.

    ``'QueryRMSE'`` uses the querywise files and evaluates ``'PFound'``; any
    other loss uses the default files with the loss itself as the metric.
    """
    if loss_function == 'QueryRMSE':
        train, test, cd = QUERYWISE_TRAIN_FILE, QUERYWISE_TEST_FILE, QUERYWISE_CD_FILE
        metric = 'PFound'
    else:
        train, test, cd = TRAIN_FILE, TEST_FILE, CD_FILE
        metric = loss_function

    train_pool = Pool(train, column_description=cd)
    test_pool = Pool(test, column_description=cd)

    settings = {
        'loss_function': loss_function,
        'random_seed': 0,
        'iterations': 100,
        'thread_count': 8,
        'eval_metric': metric,
    }
    model = CatBoost(params=settings)
    model.fit(train_pool, eval_set=test_pool, use_best_model=False)

    # Column 1 of the training log is compared after skipping the header row.
    logged = np.loadtxt('catboost_info/test_error.tsv', skiprows=1)
    first_metrics = np.round(logged[:, 1], 10)

    calcer = model.create_metric_calcer([metric])
    calcer.add(test_pool)
    recomputed = calcer.eval_metrics().get_result(metric)
    second_metrics = np.round(recomputed, 10)

    assert np.all(first_metrics == second_metrics)
예제 #10
0
class CatBoostModel(Model):
    """CatBoost Model"""
    def __init__(self, loss="RMSE", **kwargs):
        """Store the CatBoost parameters.

        :param loss: loss function name; only ``"RMSE"`` and ``"Logloss"``
            are accepted, anything else raises ``NotImplementedError``.
        :param kwargs: extra CatBoost parameters merged into the parameter dict.
        """
        # There are more options
        if loss not in {"RMSE", "Logloss"}:
            raise NotImplementedError
        self._params = {"loss_function": loss}
        self._params.update(kwargs)
        self.model = None

    def fit(self,
            dataset: DatasetH,
            num_boost_round=1000,
            early_stopping_rounds=50,
            verbose_eval=20,
            evals_result=None,
            **kwargs):
        """Train on the ``train`` segment and validate on the ``valid`` segment.

        :param dataset: dataset providing the ``train`` and ``valid`` segments.
        :param num_boost_round: stored as the ``iterations`` parameter.
        :param early_stopping_rounds: stored as ``early_stopping_rounds``.
        :param verbose_eval: stored as ``verbose_eval``.
        :param evals_result: optional dict that receives the ``"train"`` and
            ``"valid"`` metric histories. ``None`` (the default) records nothing.
        :param kwargs: forwarded both to the ``CatBoost`` constructor and to
            ``CatBoost.fit``.
        :raises ValueError: if the label is not a single column.
        """
        df_train, df_valid = dataset.prepare(
            ["train", "valid"],
            col_set=["feature", "label"],
            data_key=DataHandlerLP.DK_L,
        )
        x_train, y_train = df_train["feature"], df_train["label"]
        x_valid, y_valid = df_valid["feature"], df_valid["label"]

        # CatBoost needs 1D array as its label
        if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
            y_train_1d, y_valid_1d = np.squeeze(y_train.values), np.squeeze(
                y_valid.values)
        else:
            raise ValueError("CatBoost doesn't support multi-label training")

        train_pool = Pool(data=x_train, label=y_train_1d)
        valid_pool = Pool(data=x_valid, label=y_valid_1d)

        # Initialize the catboost model
        self._params["iterations"] = num_boost_round
        self._params["early_stopping_rounds"] = early_stopping_rounds
        self._params["verbose_eval"] = verbose_eval
        self._params["task_type"] = "GPU" if get_gpu_device_count(
        ) > 0 else "CPU"
        self.model = CatBoost(self._params, **kwargs)

        # train the model
        self.model.fit(train_pool,
                       eval_set=valid_pool,
                       use_best_model=True,
                       **kwargs)

        # Write into the caller's dict instead of rebinding the local name,
        # otherwise the histories never reach the caller.
        if evals_result is not None:
            history = self.model.get_evals_result()
            evals_result["train"] = list(history["learn"].values())[0]
            evals_result["valid"] = list(history["validation"].values())[0]

    def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
        """Predict for ``segment`` and return a Series indexed like the features.

        :raises ValueError: if ``fit`` has not been called yet.
        """
        if self.model is None:
            raise ValueError("model is not fitted yet!")
        x_test = dataset.prepare(segment,
                                 col_set="feature",
                                 data_key=DataHandlerLP.DK_I)
        return pd.Series(self.model.predict(x_test.values), index=x_test.index)
예제 #11
0
    def _get_cv_model(self, tr_X, val_X, tr_y, val_y, val_idx):
        """Fit one cross-validation fold with the backend named by ``self.clf_type``.

        The fitted model is stored in ``self.model``. The fold's validation
        predictions are written to ``self.oof[val_idx]``, the test
        predictions divided by ``self.kf.n_splits`` are added to
        ``self.preds``, and the fold's feature importances are accumulated
        in ``self.FIs``.

        :raises ValueError: if ``self.clf_type`` is not one of ``'cat'``,
            ``'lgb'``, ``'xgb'`` or ``'sklearn'``.
        """
        if self.clf_type == 'cat':
            clf_train = Pool(tr_X, tr_y)
            clf_val = Pool(val_X, val_y)
            clf_test = Pool(self.test_X)
            self.model = CatBoost(params=self.params)
            self.model.fit(clf_train, eval_set=[clf_val])
            self.oof[val_idx] = self.model.predict(clf_val)
            self.preds += self.model.predict(clf_test) / self.kf.n_splits
            self.FIs += self.model.get_feature_importance()

        elif self.clf_type == 'lgb':
            clf_train = lgb.Dataset(tr_X, tr_y)
            # The validation set must reference the training Dataset, not
            # the lgb.train function.
            clf_val = lgb.Dataset(val_X, val_y, reference=clf_train)
            self.model = lgb.train(self.params,
                                   clf_train,
                                   valid_sets=[clf_train, clf_val],
                                   verbose_eval=self.verbose_eval)
            self.oof[val_idx] = self.model.predict(
                val_X, num_iteration=self.model.best_iteration)
            self.preds += self.model.predict(
                self.test_X,
                num_iteration=self.model.best_iteration) / self.kf.n_splits
            self.FIs += self.model.feature_importance(importance_type='gain')

        elif self.clf_type == 'xgb':
            clf_train = xgb.DMatrix(tr_X,
                                    label=tr_y,
                                    feature_names=self.columns)
            clf_val = xgb.DMatrix(val_X,
                                  label=val_y,
                                  feature_names=self.columns)
            clf_test = xgb.DMatrix(self.test_X, feature_names=self.columns)
            evals = [(clf_train, 'train'), (clf_val, 'eval')]
            evals_result = {}

            nround, early_stop_rounds = self._get_xgb_callbacks()
            self.model = xgb.train(self.params,
                                   clf_train,
                                   num_boost_round=nround,
                                   early_stopping_rounds=early_stop_rounds,
                                   verbose_eval=self.verbose_eval,
                                   evals=evals,
                                   evals_result=evals_result)

            self.oof[val_idx] = self.model.predict(clf_val)
            self.preds += self.model.predict(clf_test) / self.kf.n_splits
            # get_fscore() returns a dict, so importances are merged by key.
            self.FIs = self.merge_dict_add_values(self.FIs,
                                                  self.model.get_fscore())

        elif self.clf_type == 'sklearn':
            self.model = self.sk_model
            self.model.fit(tr_X, tr_y)
            self.oof[val_idx] = self.model.predict(val_X)
            self.preds += self.model.predict(self.test_X) / self.kf.n_splits
            self.FIs += self.model.feature_importances_
        else:
            raise ValueError('clf_type is wrong.')
예제 #12
0
 def create_metrics_calcer(self, metrics, thread_count, eval_step=1):
     """Load the model saved at ``self._model_path`` and build a metric calcer from it.

     ``eval_step`` is passed on as ``eval_period``. Raises ``CatboostError``
     when the model file no longer exists.
     """
     model_file_missing = not os.path.exists(self._model_path)
     if model_file_missing:
         raise CatboostError("Model was deleted. Can't create calcer now")
     restored = CatBoost()
     restored.load_model(self._model_path)
     calcer = restored.create_metric_calcer(metrics, thread_count=thread_count, eval_period=eval_step)
     return calcer
예제 #13
0
 def model_fit_with(train_set, test_sets, cd_file):
     """Return a CatBoost model (RMSE, 12 iterations, seed 0) fitted on ``train_set``.

     ``test_sets`` becomes the ``eval_set`` and ``cd_file`` the
     ``column_description`` of the fit call.
     """
     fit_options = {'eval_set': test_sets, 'column_description': cd_file}
     regressor = CatBoost(dict(use_best_model=False, loss_function='RMSE', iterations=12, random_seed=0))
     regressor.fit(train_set, **fit_options)
     return regressor
예제 #14
0
def test_object_importances():
    """Save the object-importance scores of a small RMSE model for canonical comparison."""
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    pool = Pool(TEST_FILE, column_description=CD_FILE)

    settings = dict(loss_function='RMSE', iterations=10, random_seed=0)
    model = CatBoost(settings)
    model.fit(train_pool)

    # Only the scores are saved; the returned indices are not used.
    _, scores = model.get_object_importance(pool, train_pool, top_size=10)
    np.savetxt(OIMP_PATH, scores)

    return local_canonical_file(OIMP_PATH)
예제 #15
0
    def get_feature_importance(self, additional_params=None, train_pool=None):
        """Build a CatBoost model from the merged parameters and return its feature importances.

        ``additional_params``, when given, overrides entries of a copy of
        ``self.default_parameters``. ``train_pool`` is passed to
        ``CatBoost.get_feature_importance``.

        NOTE(review): the model is constructed here and not fitted before
        importances are requested -- confirm this is intended.
        """
        parameters = deepcopy(self.default_parameters)
        if additional_params is not None:
            parameters.update(additional_params)

        return CatBoost(parameters).get_feature_importance(train_pool)
예제 #16
0
def main():
    """Load the saved ranking model and run ``test_model`` on the biggest data set."""
    # Alternative model files:
    # model_path = f'{PRIVATE_DIR}ranking/model_{LOSS_FUNCTION}_{MIN_APPTS_MODEL}.cbm'
    # model_path = f'{PRIVATE_DIR}ranking/top_one_model_RMSE_7.cbm'
    model_path = f'{PRIVATE_DIR}ranking/top_one_model_QuerySoftMax_3.cbm'

    ranker = CatBoost()
    ranker.load_model(model_path)

    dataset = get_biggest_data()
    test_model(dataset, ranker, sort, max_sort_limit=SORT_LIMIT)
예제 #17
0
파일: test.py 프로젝트: shodanium/catboost
def test_save_model():
    """A model reloaded from disk must predict the same values as the original."""
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)

    original = CatBoost()
    original.fit(train_pool)
    original.save_model(OUTPUT_MODEL_PATH)

    reloaded = CatBoost(model_file=OUTPUT_MODEL_PATH)

    expected = original.predict(test_pool)
    actual = reloaded.predict(test_pool)
    assert _check_data(expected, actual)
예제 #18
0
파일: test.py 프로젝트: iamnik13/catboost
def test_different_cat_features_order():
    """Predictions must not depend on the order in which cat feature indices are listed."""
    rows = [[2, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]
    dataset = np.array(rows)
    labels = [1.2, 3.4, 9.5, 24.5]

    pool1 = Pool(dataset, labels, cat_features=[0, 1])
    pool2 = Pool(dataset, labels, cat_features=[1, 0])

    settings = dict(learning_rate=1, loss_function='RMSE', iterations=2, random_seed=42)
    model = CatBoost(settings)
    model.fit(pool1)

    first = model.predict(pool1)
    second = model.predict(pool2)
    assert (first == second).all()
예제 #19
0
def test_object_importances():
    """Fit a 10-iteration RMSE model and store its object-importance scores as a canonical file."""
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    pool = Pool(TEST_FILE, column_description=CD_FILE)

    model = CatBoost(dict(loss_function='RMSE', iterations=10, random_seed=0))
    model.fit(train_pool)

    importance = model.get_object_importance(pool, train_pool, top_size=10)
    # The call yields (indices, scores); only the scores are written out.
    _indices, scores = importance
    np.savetxt(OIMP_PATH, scores)

    return local_canonical_file(OIMP_PATH)
예제 #20
0
 def fit(self, params, dtrain, dtest, n_estimators, seed=0):
     """Train a CatBoost model and read back the logged test errors.

     :param params: CatBoost parameter dict; it is updated in place with
         ``iterations`` and ``random_seed``.
     :param dtrain: training data passed to ``CatBoost.fit``.
     :param dtest: evaluation data passed as ``eval_set``.
     :param n_estimators: number of boosting iterations.
     :param seed: random seed.
     :return: ``(model, results, elapsed)`` where ``results`` is a float array
         built from the last column of ``test_error.tsv`` (header row
         skipped) and ``elapsed`` is the wall-clock fit time in seconds.
     """
     params.update({"iterations": n_estimators})
     params.update({"random_seed": seed})
     bst = CatBoost(params)
     start_time = time.time()
     bst.fit(dtrain, eval_set=dtest)
     eval_time = time.time() - start_time
     with open("test_error.tsv", "r") as f:
         # A list is built explicitly: on Python 3 ``map`` returns an iterator,
         # which np.array would wrap as a single object instead of a float array.
         results = np.array([float(line.strip().split()[-1]) for line in f.readlines()[1:]])

     return bst, results, eval_time
예제 #21
0
def catboost_train(X_train, y_train, X_valid, y_valid):
    """Fit CatBoost with the module-level ``ctb_params`` and score it on the validation set.

    :return: ``(model, rmse)`` where ``rmse`` is the square root of the mean
        squared error of the validation predictions.
    """
    train_pool = Pool(X_train, label=y_train)
    valid_pool = Pool(X_valid, label=y_valid)

    booster = CatBoost(ctb_params)
    booster.fit(train_pool, eval_set=[valid_pool], use_best_model=True, verbose=500)

    valid_pred = booster.predict(X_valid)
    mse = mean_squared_error(y_valid, valid_pred)
    return booster, np.sqrt(mse)
예제 #22
0
class CatBoostRanker(Ranker):
    """Ranker backed by a ``CatBoost`` model built from a parameter dict."""

    def __init__(self, params):
        """Keep ``params`` and create the model.

        Raises ``Exception`` when the loss is ``PairLogitPairwise`` and
        ``max_depth`` is 8 or more.
        """
        self.params = params
        is_pairwise = params['loss_function'] == 'PairLogitPairwise'
        # max_depth is only read for the pairwise loss.
        if is_pairwise and params['max_depth'] >= 8:
            raise Exception('max_depth for pair-logit-pairwise should be < 8')
        self.model = CatBoost(params)

    def fit(self, data):
        """Fit the model on ``data.train_pool``."""
        self.model.fit(X=data.train_pool)

    def staged_predict(self, data, eval_period):
        """Return the staged predictions for ``data.test_pool`` as a list."""
        stages = self.model.staged_predict(data.test_pool, eval_period=eval_period)
        return list(stages)
예제 #23
0
def test_option_used_ram_limit():
    """Valid ``used_ram_limit`` values must be accepted and invalid ones rejected."""
    for limit in [1000, 1234.56, 0, 0.0, 0.5,
                  '100', '34.56', '0', '0.0', '0.5',
                  '1.2mB', '1000b', '', None, 'none', 'inf']:
        CatBoost({'used_ram_limit': limit})

    for limit in [-1000, 'any', '-0.5', 'nolimit', 'oo']:
        try:
            CatBoost({'used_ram_limit': limit})
        except Exception:
            # The invalid value was rejected, as required.
            continue
        # The failure assertion sits outside the try block so that it cannot
        # be swallowed by the exception handler.
        assert False, "Shall not allow used_ram_limit={!r}".format(limit)
예제 #24
0
    def _predict(self, model: cb.CatBoost, pool: cb.Pool, params):
        """Predict on ``pool`` according to the task type and apply the loss's ``bw_func``.

        ``multiclass`` returns the full probability output, ``binary`` the
        last-axis index 1 of it, and ``reg`` the plain prediction. For any
        other task name ``None`` is handed to ``bw_func``.
        """
        task_name = self.task.name
        if task_name in ("multiclass", "binary"):
            pred = model.predict(pool, prediction_type="Probability", thread_count=params["thread_count"])
            if task_name == "binary":
                pred = pred[..., 1]
        elif task_name == "reg":
            pred = model.predict(pool, thread_count=params["thread_count"])
        else:
            pred = None

        return self.task.losses["cb"].bw_func(pred)
예제 #25
0
    def fit_model(self,
                  additional_params=None,
                  train_pool=None,
                  test_pool=None):
        """Fit and return a CatBoost model.

        The parameters are a copy of ``self.default_parameters`` updated with
        ``additional_params`` when given. ``test_pool`` is used as the eval set.
        """
        parameters = deepcopy(self.default_parameters)
        overrides = {} if additional_params is None else additional_params
        parameters.update(overrides)

        fitted = CatBoost(parameters)
        fitted.fit(train_pool, eval_set=test_pool, plot=False)
        return fitted
예제 #26
0
def test_eval_metrics(loss_function):
    """``eval_metrics`` must agree with the test metric logged during training.

    ``'QueryRMSE'`` switches to the querywise files and the ``'PFound'`` metric.
    """
    train, test, cd, metric = TRAIN_FILE, TEST_FILE, CD_FILE, loss_function
    if loss_function == 'QueryRMSE':
        train, test, cd = QUERYWISE_TRAIN_FILE, QUERYWISE_TEST_FILE, QUERYWISE_CD_FILE
        metric = 'PFound'

    train_pool = Pool(train, column_description=cd)
    test_pool = Pool(test, column_description=cd)
    settings = {'loss_function': loss_function, 'random_seed': 0, 'iterations': 20, 'thread_count': 8, 'eval_metric': metric}
    model = CatBoost(params=settings)

    model.fit(train_pool, eval_set=test_pool, use_best_model=False)

    logged = np.loadtxt('./test_error.tsv', skiprows=1)
    first_metrics = np.round(logged[:, 1], 10)
    # The first recomputed value is skipped before comparing.
    recomputed = model.eval_metrics(test_pool, [metric])[metric]
    second_metrics = np.round(recomputed[1:], 10)
    assert np.all(first_metrics == second_metrics)
예제 #27
0
 def load(self, model_file_path):
     """Load a CatBoost model from ``model_file_path`` into ``self.model``."""
     message = "CatBoostLearner load model from %s" % model_file_path
     logger.debug(message)
     # Waiting for a fix of https://github.com/catboost/catboost/issues/696
     # before a task-specific class can be selected:
     #     Algo = CatBoostClassifier
     #     loss_function = "Logloss"
     #     if self.params["ml_task"] == MULTICLASS_CLASSIFICATION:
     #         loss_function = "MultiClass"
     #     elif self.params["ml_task"] == REGRESSION:
     #         loss_function = self.params.get("loss_function", "RMSE")
     #         Algo = CatBoostRegressor
     blank_model = CatBoost()
     self.model = blank_model.load_model(model_file_path)
예제 #28
0
def test_python_export_from_app(dataset):
    """The exported Python model must predict the same raw values as the ``.cbm`` model."""
    _, test_pool = _get_train_test_pool(dataset)
    _, model_py, model_cbm = _get_cpp_py_cbm_model(dataset)

    model = CatBoost()
    model.load_model(model_cbm)
    pred_model = model.predict(test_pool, prediction_type='RawFormulaVal')

    scope = {}
    # execfile() exists only on Python 2; compile + exec runs the exported
    # module on both Python 2 and Python 3.
    with open(model_py) as source_file:
        code = compile(source_file.read(), model_py, 'exec')
    exec(code, scope)
    pred_python = _predict_python(test_pool, scope['apply_catboost_model'])

    assert _check_data(pred_model, pred_python)
예제 #29
0
    def fit(
        self,
        dataset: DatasetH,
        num_boost_round=1000,
        early_stopping_rounds=50,
        verbose_eval=20,
        evals_result=None,
        reweighter=None,
        **kwargs
    ):
        """Train on the ``train`` segment and validate on the ``valid`` segment.

        :param dataset: dataset providing the ``train`` and ``valid`` segments.
        :param num_boost_round: stored as the ``iterations`` parameter.
        :param early_stopping_rounds: stored as ``early_stopping_rounds``.
        :param verbose_eval: stored as ``verbose_eval``.
        :param evals_result: optional dict that receives the ``"train"`` and
            ``"valid"`` metric histories. ``None`` (the default) records nothing.
        :param reweighter: optional ``Reweighter`` producing sample weights
            for both segments.
        :param kwargs: forwarded both to the ``CatBoost`` constructor and to
            ``CatBoost.fit``.
        :raises ValueError: on empty data, a label that is not a single
            column, or an unsupported ``reweighter`` type.
        """
        df_train, df_valid = dataset.prepare(
            ["train", "valid"],
            col_set=["feature", "label"],
            data_key=DataHandlerLP.DK_L,
        )
        if df_train.empty or df_valid.empty:
            raise ValueError("Empty data from dataset, please check your dataset config.")
        x_train, y_train = df_train["feature"], df_train["label"]
        x_valid, y_valid = df_valid["feature"], df_valid["label"]

        # CatBoost needs 1D array as its label
        if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
            y_train_1d, y_valid_1d = np.squeeze(y_train.values), np.squeeze(y_valid.values)
        else:
            raise ValueError("CatBoost doesn't support multi-label training")

        if reweighter is None:
            w_train = None
            w_valid = None
        elif isinstance(reweighter, Reweighter):
            w_train = reweighter.reweight(df_train).values
            w_valid = reweighter.reweight(df_valid).values
        else:
            raise ValueError("Unsupported reweighter type.")

        train_pool = Pool(data=x_train, label=y_train_1d, weight=w_train)
        valid_pool = Pool(data=x_valid, label=y_valid_1d, weight=w_valid)

        # Initialize the catboost model
        self._params["iterations"] = num_boost_round
        self._params["early_stopping_rounds"] = early_stopping_rounds
        self._params["verbose_eval"] = verbose_eval
        self._params["task_type"] = "GPU" if get_gpu_device_count() > 0 else "CPU"
        self.model = CatBoost(self._params, **kwargs)

        # train the model
        self.model.fit(train_pool, eval_set=valid_pool, use_best_model=True, **kwargs)

        # Write into the caller's dict instead of rebinding the local name,
        # otherwise the histories never reach the caller.
        if evals_result is not None:
            history = self.model.get_evals_result()
            evals_result["train"] = list(history["learn"].values())[0]
            evals_result["valid"] = list(history["validation"].values())[0]
예제 #30
0
def test_querywise():
    """QueryRMSE predictions must match between file pools and in-memory data with ``group_id``."""
    def feature_frame(frame):
        # Columns 0-3 are dropped; the remaining values are treated as strings.
        return frame.drop([0, 1, 2, 3], axis=1).astype(str)

    train_pool = Pool(QUERYWISE_TRAIN_FILE, column_description=QUERYWISE_CD_FILE)
    test_pool = Pool(QUERYWISE_TEST_FILE, column_description=QUERYWISE_CD_FILE)
    settings = dict(loss_function='QueryRMSE', random_seed=0, iterations=2, thread_count=8)
    model = CatBoost(params=settings)
    model.fit(train_pool)
    pred1 = model.predict(test_pool)

    train_frame = read_table(QUERYWISE_TRAIN_FILE, delimiter='\t', header=None)
    train_query_id = train_frame.loc[:, 0]
    train_target = train_frame.loc[:, 1]
    train_data = feature_frame(train_frame)

    test_frame = read_table(QUERYWISE_TEST_FILE, delimiter='\t', header=None)
    test_data = feature_frame(test_frame)

    model.fit(train_data, train_target, group_id=train_query_id)
    pred2 = model.predict(test_data)
    assert _check_data(pred1, pred2)
예제 #31
0
 def train_and_predict(self, train, valid, test, param, colum):
     """Fit CatBoost with early stopping and predict on ``test``.

     :param train: pair ``(features, labels)`` for training.
     :param valid: pair ``(features, labels)`` for validation.
     :param test: test features.
     :param param: CatBoost parameters.
     :param colum: not used by this method.
     :return: ``(model, pred, evals_result)`` where ``evals_result`` is the
         model's recorded evaluation history.
     """
     cat_tr = Pool(train[0], label=train[1])
     cat_val = Pool(valid[0], label=valid[1])
     cat_test = Pool(test)
     model = CatBoost(param)
     model.fit(
         cat_tr,
         eval_set=[cat_tr, cat_val],
         early_stopping_rounds=100,
         verbose_eval=150,
     )
     pred = model.predict(cat_test)
     # get_evals_result is a method of the fitted model, not a free function;
     # it is read once and reused for both the print and the return value.
     evals_result = model.get_evals_result()
     print(evals_result)
     return model, pred, evals_result
예제 #32
0
def test_eval_metrics(loss_function):
    """``eval_metrics`` must agree with the test metric logged during training.

    ``'QueryRMSE'`` uses the querywise files with ``'PFound'``; ``'Logloss'``
    is evaluated with ``'AUC'``.
    """
    train, test, cd, metric = TRAIN_FILE, TEST_FILE, CD_FILE, loss_function
    if loss_function == 'QueryRMSE':
        train, test, cd = QUERYWISE_TRAIN_FILE, QUERYWISE_TEST_FILE, QUERYWISE_CD_FILE
        metric = 'PFound'
    if loss_function == 'Logloss':
        metric = 'AUC'

    train_pool = Pool(train, column_description=cd)
    test_pool = Pool(test, column_description=cd)
    settings = {'loss_function': loss_function, 'random_seed': 0, 'iterations': 20, 'thread_count': 8, 'eval_metric': metric}
    model = CatBoost(params=settings)

    model.fit(train_pool, eval_set=test_pool, use_best_model=False)

    logged = np.loadtxt('catboost_info/test_error.tsv', skiprows=1)
    first_metrics = np.round(logged[:, 1], 10)
    recomputed = model.eval_metrics(test_pool, [metric])[metric]
    second_metrics = np.round(recomputed, 10)
    assert np.all(first_metrics == second_metrics)
예제 #33
0
def test_export_model_with_cat_features_to_python_from_app():
    """The Python model exported by the app must match the binary model's raw predictions."""
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoost()
    # The resource is a binary model file, so it is written in binary mode:
    # text mode rejects bytes on Python 3 and may translate newlines.
    with open(OUTPUT_MODEL_PATH, "wb") as model_file:
        model_file.write(resource.find("cb_adult_model_bin"))
    model.load_model(OUTPUT_MODEL_PATH)
    pred_model = model.predict(test_pool, prediction_type='RawFormulaVal')
    from adult_model import apply_catboost_model as apply_catboost_model_from_app

    # Hoisted out of the loop: both come from the same pool for every row.
    cat_feature_indices = test_pool.get_cat_feature_indices()
    hash_to_string = test_pool.get_cat_feature_hash_to_string()
    pred_python = []
    for test_line in test_pool.get_features():
        float_features, cat_features = _split_features(
            test_line, cat_feature_indices, hash_to_string)
        pred_python.append(
            apply_catboost_model_from_app(float_features, cat_features))
    assert _check_data(pred_model, pred_python)
예제 #34
0
def s3_catboost_scoring_model(test_db):
    """Train a dummy CatBoost scoring model, upload it to S3, register it and return its name."""
    name = 'test_scoring_model'
    # Same names as the columns of the dummy frame below.
    features = ['chaos', 'chaos_fdr', 'mz_err_abs_fdr']

    # Train a model that just predicts the chaos metric and ignores the other features
    dummy_X = pd.DataFrame({
        'chaos': np.linspace(0, 1, 101),
        'chaos_fdr': 0,
        'mz_err_abs_fdr': 0,
    })
    options = {
        'iterations': 10,
        # Only feature 0 (chaos) gets a non-zero weight.
        'feature_weights': {0: 1, 1: 0, 2: 0},
        'verbose': False,
    }
    training_pool = Pool(dummy_X, dummy_X.chaos.values)
    model = CatBoost(options).fit(training_pool)

    # Upload the model to S3
    params = upload_catboost_scoring_model(model, BUCKET_NAME, name, False, dummy_X)
    save_scoring_model_to_db(name, 'catboost', params)

    return name
예제 #35
0
    def _predict(self, model: cb.CatBoost, pool: cb.Pool, params):
        """Run the task-specific prediction on ``pool`` and pass it through ``bw_func``.

        ``multiclass`` keeps the whole probability output, ``binary`` keeps
        index 1 of its last axis, ``reg`` uses the default prediction. Any
        other task name leaves the prediction as ``None``.
        """
        kind = self.task.name
        pred = None
        if kind == 'reg':
            pred = model.predict(pool, thread_count=params['thread_count'])
        elif kind in ('multiclass', 'binary'):
            probabilities = model.predict(pool,
                                          prediction_type='Probability',
                                          thread_count=params['thread_count'])
            pred = probabilities if kind == 'multiclass' else probabilities[..., 1]

        backward = self.task.losses['cb'].bw_func
        return backward(pred)
예제 #36
0
파일: test.py 프로젝트: iamnik13/catboost
def test_save_model():
    """Saving a fitted model and loading it via ``model_file`` must preserve its predictions."""
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)

    trained = CatBoost()
    trained.fit(train_pool)
    trained.save_model(OUTPUT_MODEL_PATH)

    restored = CatBoost(model_file=OUTPUT_MODEL_PATH)

    predictions = [booster.predict(test_pool) for booster in (trained, restored)]
    assert _check_data(predictions[0], predictions[1])
예제 #37
0
def test_eval_set():
    dataset = [(1, 2, 3, 4), (2, 2, 3, 4), (3, 2, 3, 4), (4, 2, 3, 4)]
    labels = [1, 2, 3, 4]
    train_pool = Pool(dataset, labels, cat_features=[0, 3, 2])

    model = CatBoost({'learning_rate': 1, 'loss_function': 'RMSE', 'iterations': 2, 'random_seed': 0})

    eval_dataset = [(5, 6, 6, 6), (6, 6, 6, 6)]
    eval_labels = [5, 6]
    eval_pool = (eval_dataset, eval_labels)

    model.fit(train_pool, eval_set=eval_pool)

    eval_pools = [eval_pool]

    model.fit(train_pool, eval_set=eval_pools)

    return local_canonical_file(remove_time_from_json(JSON_LOG_PATH))
예제 #38
0
def test_multiple_eval_sets_no_empty():
    """Check which empty / ``None`` eval sets are rejected with ``CatboostError``.

    Expected to raise ``CatboostError``:

    * building a ``Pool`` from the data returned by ``random_xy(0, 4)``;
    * fitting with two eval sets where the second one is that empty tuple,
      the file written from it, or ``None``.

    Expected NOT to raise ``CatboostError``: fitting with ``eval_set=[None]``.
    """
    def expect_catboost_error(action, failure_message):
        # Run ``action``; the test fails unless it raises CatboostError.
        try:
            action()
        except CatboostError:
            return
        assert False, failure_message

    cat_features = [0, 3, 2]

    # Column description: column 0 is the target, each categorical feature is
    # shifted by one because of that leading target column.
    cd_file = yatest.common.test_output_path('cd.txt')
    cd_lines = ['0\tTarget\n']
    cd_lines.extend('{}\tCateg\n'.format(1 + feature_no) for feature_no in sorted(cat_features))
    with open(cd_file, 'wt') as cd:
        cd.writelines(cd_lines)

    train_x, train_y = random_xy(6, 4)
    train_pool = Pool(train_x, train_y, cat_features=cat_features)

    empty_x, empty_y = random_xy(0, 4)  # empty tuple eval set
    valid_x, valid_y = random_xy(3, 4)
    empty_file = save_and_give_path(empty_y, empty_x, 'test0.txt')  # empty file eval set

    expect_catboost_error(
        lambda: Pool(empty_x, empty_y, cat_features=cat_features),
        "Do not create Pool for empty data",
    )

    model = CatBoost({'learning_rate': 1, 'loss_function': 'RMSE', 'iterations': 2, 'random_seed': 0})

    rejected_cases = [
        ([(valid_x, valid_y), (empty_x, empty_y)], "Do not fit with empty tuple in multiple eval sets"),
        ([(valid_x, valid_y), empty_file], "Do not fit with empty file in multiple eval sets"),
        ([(valid_x, valid_y), None], "Do not fit with None in multiple eval sets"),
    ]
    for eval_sets, failure_message in rejected_cases:
        expect_catboost_error(
            lambda eval_sets=eval_sets: model.fit(train_pool, eval_set=eval_sets, column_description=cd_file),
            failure_message,
        )

    # A single ``None`` eval set is the one case that must be accepted.
    try:
        model.fit(train_pool, eval_set=[None], column_description=cd_file)
    except CatboostError:
        assert False, "Ok to have one eval set None"
예제 #39
0
def test_fit_from_file():
    """Fitting from a ``Pool`` and from the file path must give equal predictions.

    The same model object is fitted twice: first on a ``Pool`` built from
    TRAIN_FILE, then directly on TRAIN_FILE with the column description.
    Predictions on the pool after each fit are compared element-wise.
    """
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoost(dict(iterations=2, random_seed=0, loss_function='RMSE'))

    model.fit(train_pool)
    from_pool = model.predict(train_pool)

    model.fit(TRAIN_FILE, column_description=CD_FILE)
    from_file = model.predict(train_pool)

    assert all(from_pool == from_file)
    def _fit_model(pool, case, fold_id, model_path):
        """Train a CatBoost model for ``case`` on ``pool`` and save it to ``model_path``.

        Args:
            pool: Training data; ``pool.num_col()`` is used as the feature count.
            case: Object exposing ``get_params()`` with the training parameters.
            fold_id: Fold identifier, used for logging and in the result.
            model_path: File name the trained model is saved under.

        Returns:
            ``FoldModel(case, model_path, fold_id)``.

        Raises:
            CatboostError: If ``ignored_features`` is non-empty and its largest
                value is not below the pool's column count.
        """
        from catboost import CatBoost

        # Learn model
        make_dirs_if_not_exists(FoldModelsHandler.__MODEL_DIR)

        feature_count = pool.num_col()
        params = case.get_params()
        if "ignored_features" in params:
            ignored_features = params["ignored_features"]
            has_unknown_index = len(ignored_features) > 0 and max(ignored_features) >= feature_count
            if has_unknown_index:
                raise CatboostError("Error: input parameter contains feature indices wich are not available in pool: "
                                    "{}\n "
                                    "Check eval_feature set and ignored features options".format(ignored_features))

        get_eval_logger().debug('Learn model {} on fold #{}'.format(str(case), fold_id))

        started_at = time.time()
        learner = CatBoost(params=case.get_params())
        learner.fit(pool)
        learner.save_model(fname=model_path)
        elapsed = time.time() - started_at

        get_eval_logger().debug('Operation was done in {} seconds'.format(elapsed))
        return FoldModel(case, model_path, fold_id)
예제 #41
0
def test_eval_metrics_batch_calcer(loss_function):
    """Compare metrics logged during training with those from a metric calcer.

    For ``'QueryRMSE'`` the querywise files are used and the metric is
    ``'PFound'``; otherwise the default files are used and the metric is the
    loss function itself. Values read from ``catboost_info/test_error.tsv``
    (second column, header skipped) must match the calcer's result after both
    are rounded to 10 decimals.
    """
    if loss_function == 'QueryRMSE':
        metric = 'PFound'
        train, test, cd = QUERYWISE_TRAIN_FILE, QUERYWISE_TEST_FILE, QUERYWISE_CD_FILE
    else:
        metric = loss_function
        train, test, cd = TRAIN_FILE, TEST_FILE, CD_FILE

    train_pool = Pool(train, column_description=cd)
    test_pool = Pool(test, column_description=cd)

    train_params = {
        'loss_function': loss_function,
        'random_seed': 0,
        'iterations': 100,
        'thread_count': 8,
        'eval_metric': metric,
    }
    model = CatBoost(params=train_params)
    model.fit(train_pool, eval_set=test_pool, use_best_model=False)

    logged = np.loadtxt('catboost_info/test_error.tsv', skiprows=1)
    first_metrics = np.round(logged[:, 1], 10)

    calcer = model.create_metric_calcer([metric])
    calcer.add(test_pool)
    recomputed = calcer.eval_metrics().get_result(metric)
    second_metrics = np.round(recomputed, 10)

    assert np.all(first_metrics == second_metrics)
예제 #42
0
def test_coreml_import_export():
    """Round-trip a QueryRMSE model through the ``"coreml"`` format.

    The model is saved to OUTPUT_COREML_MODEL_PATH, loaded back into a
    ``CatBoostRegressor`` and both must predict identically on the test pool.

    Returns:
        ``local_canonical_file(OUTPUT_COREML_MODEL_PATH)``.
    """
    learn_pool = Pool(QUERYWISE_TRAIN_FILE, column_description=QUERYWISE_CD_FILE)
    holdout_pool = Pool(QUERYWISE_TEST_FILE, column_description=QUERYWISE_CD_FILE)

    trained = CatBoost(params={'loss_function': 'QueryRMSE', 'random_seed': 0, 'iterations': 20, 'thread_count': 8})
    trained.fit(learn_pool)
    trained.save_model(OUTPUT_COREML_MODEL_PATH, format="coreml")
    canon_pred = trained.predict(holdout_pool)

    reloaded = CatBoostRegressor()
    reloaded.load_model(OUTPUT_COREML_MODEL_PATH, format="coreml")
    reloaded_pred = reloaded.predict(holdout_pool)

    assert all(canon_pred == reloaded_pred)
    return local_canonical_file(OUTPUT_COREML_MODEL_PATH)
예제 #43
0
파일: test.py 프로젝트: iamnik13/catboost
def test_querywise():
    """QueryRMSE fit from pools and from in-memory tables must predict alike.

    First the model is fitted on a ``Pool`` built from the query files. Then
    the same files are read as tables: column 0 is passed as ``query_id``,
    column 1 as the target, and columns 0-3 are dropped from the features,
    which are cast to ``str``. Predictions of both fits are compared with
    ``_check_data``.
    """
    dropped_columns = [0, 1, 2, 3]

    learn_pool = Pool(QUERY_TRAIN_FILE, column_description=QUERY_CD_FILE)
    holdout_pool = Pool(QUERY_TEST_FILE, column_description=QUERY_CD_FILE)
    model = CatBoost(params={'loss_function': 'QueryRMSE', 'random_seed': 0, 'iterations': 2, 'thread_count': 8})
    model.fit(learn_pool)
    pool_pred = model.predict(holdout_pool)

    train_table = read_table(QUERY_TRAIN_FILE, delimiter='\t', header=None)
    query_ids = train_table.loc[:, 0]
    targets = train_table.loc[:, 1]
    train_features = train_table.drop(dropped_columns, axis=1).astype(str)

    test_table = read_table(QUERY_TEST_FILE, delimiter='\t', header=None)
    test_features = test_table.drop(dropped_columns, axis=1).astype(str)

    model.fit(train_features, targets, query_id=query_ids)
    table_pred = model.predict(test_features)

    assert _check_data(pool_pred, table_pred)
예제 #44
0
파일: test.py 프로젝트: iamnik13/catboost
def test_predict_regress():
    """Train a 2-iteration RMSE model, save it and compare with the canonical model.

    Returns:
        ``compare_canonical_models(OUTPUT_MODEL_PATH)``.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    regressor = CatBoost(dict(iterations=2, random_seed=0, loss_function='RMSE'))
    regressor.fit(learn_pool)
    regressor.save_model(OUTPUT_MODEL_PATH)

    return compare_canonical_models(OUTPUT_MODEL_PATH)
예제 #45
0
파일: test.py 프로젝트: iamnik13/catboost
def test_invalid_loss_base():
    """An unknown ``loss_function`` must be rejected with ``CatboostError``.

    The training pool is built outside the ``pytest.raises`` block so that a
    ``CatboostError`` raised while loading the data cannot make this test
    pass for the wrong reason.
    """
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    with pytest.raises(CatboostError):
        # The constructor stays inside the block because the invalid name may
        # be rejected either there or in fit().
        model = CatBoost({"loss_function": "abcdef"})
        model.fit(pool)
예제 #46
0
 def create_metrics_calcer(self, metrics, thread_count, eval_step=1):
     """Load the model stored at ``self._model_path`` and build a metric calcer.

     Args:
         metrics: Metrics passed to ``create_metric_calcer``.
         thread_count: Forwarded as ``thread_count``.
         eval_step: Forwarded as ``eval_period``.

     Returns:
         The calcer created by the loaded model.

     Raises:
         CatboostError: If the model file no longer exists.
     """
     model_file = self._model_path
     if os.path.exists(model_file):
         loaded = CatBoost()
         loaded.load_model(model_file)
         return loaded.create_metric_calcer(metrics, thread_count=thread_count, eval_period=eval_step)
     raise CatboostError("Model was deleted. Can't create calcer now")
예제 #47
0
def test_python_export_no_cat_features():
    """Export a model trained on the querywise pool in ``"python"`` format.

    Returns:
        ``local_canonical_file(OUTPUT_PYTHON_MODEL_PATH)``.
    """
    learn_pool = Pool(QUERYWISE_TRAIN_FILE, column_description=QUERYWISE_CD_FILE)

    exported = CatBoost(dict(iterations=2, random_seed=0, loss_function='RMSE'))
    exported.fit(learn_pool)
    exported.save_model(OUTPUT_PYTHON_MODEL_PATH, format="python")

    return local_canonical_file(OUTPUT_PYTHON_MODEL_PATH)
예제 #48
0
def test_python_export_with_cat_features():
    """Export a model trained on the default train pool in ``"python"`` format.

    Returns:
        ``local_canonical_file(OUTPUT_PYTHON_MODEL_PATH)``.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    exported = CatBoost(dict(iterations=20, random_seed=0))
    exported.fit(learn_pool)
    exported.save_model(OUTPUT_PYTHON_MODEL_PATH, format="python")

    return local_canonical_file(OUTPUT_PYTHON_MODEL_PATH)