Example 1
    def generate_submit(self, num_boost_round=None, from_model_saved=False):
        assert num_boost_round is not None

        if not from_model_saved:
            dtrain = self.get_train_set(as_cgb_pool=True)

            booster = cgb.train(
                dtrain=dtrain,
                params=self.params_best_fit,
                num_boost_round=num_boost_round)

            self.save_model(booster)

        else:
            booster = cgb.CatBoost(model_file=from_model_saved)

        dftest = self.get_test_set(as_cgb_pool=True)

        with Timer("Predicting"):
            probas = booster.predict(dftest, prediction_type="Probability")
            dfpred = pd.DataFrame(probas)[[1]]  # keep the probability of class one
            dfpred = dfpred.rename(columns={1: 'target'})

        now = pd.Timestamp.now(tz='CET').strftime("%d-%Hh-%Mm")

        fpath = RESULT_DIR / "catboost_submit_{}.csv".format(now)

        with Timer('Storing in {}'.format(fpath)):
            dfpred.to_csv(fpath, index=False)
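A note on the probability handling above: for a binary Logloss model, predict with prediction_type="Probability" returns an (n, 2) array, so the class-one column can also be taken directly from that array. A minimal sketch, assuming the same booster and dftest as in the example:

    # Sketch: equivalent way to build the submission column (binary Logloss assumed).
    probas = booster.predict(dftest, prediction_type="Probability")  # shape (n, 2)
    dfpred = pd.DataFrame({'target': probas[:, 1]})                  # probability of class 1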
Example 2
 def load(cls, path):
     try:
         model = catboost.CatBoost().load_model(
             os.path.join(path, 'cat_ranking.cbm'))
         return cls(model=model)
     except NameError:  # CatBoost is unavailable. Try to load Python model.
         pass
Example 3
def main():
    # load dataset
    df = pd.read_csv(DATA_PATH)

    target = 'ACTION'
    features = [column for column in df.columns if column != target]

    # fit model
    model = catboost.CatBoost({
        'loss_function': 'Logloss',
        'verbose': False,
        'random_seed': 0
    })
    model.fit(df[features], df[target])
    model.save_model(MODEL_PATH)

    # predict on sample
    df_sample = pd.read_csv(DATA_SAMPLE_PATH)
    predicts = model.predict(df_sample)
    predicts = np.power(1 + np.exp(-predicts), -1)
    pd.DataFrame({
        'x': predicts
    }).to_csv(DATA_SAMPLE_PREDICT_PATH, index=False, header=False)

    # predict on one sample
    print('Parameters:')
    r = df_sample[:1].to_dict('records')
    for k, v in r[0].items():
        print(f'input.put("{k}", {v}.0);')

    print('Expected predict:')
    print(np.power(1 + np.exp(-model.predict(df_sample[:1])[0]), -1))
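The np.power(1 + np.exp(-predicts), -1) step is simply the logistic function applied to the raw Logloss scores. A minimal sketch of the same result using CatBoost's built-in probability output, assuming the model and df_sample defined above:

    # Sketch: let CatBoost apply the sigmoid itself (binary Logloss assumed).
    proba = model.predict(df_sample, prediction_type='Probability')  # shape (n, 2)
    predicts = proba[:, 1]  # probability of the positive class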
Example 4
 def fit(self,
         train_df,
         valid_df=None,
         train_dir='.',
         niter=10000,
         seed=123):
     if self.model is None:
         params = {
             'loss_function': 'RMSE',
             'task_type': 'GPU',
             'iterations': niter,
             'verbose': True,
             'train_dir': train_dir,
             'random_seed': seed
         }
         self.model = catboost.CatBoost(params)
         init_model = None
     else:
         init_model = self.model
     train_features, train_labels = get_feature_label(train_df)
     train_pool = catboost.Pool(data=train_features, label=train_labels)
     if valid_df is not None:
         valid_features, valid_labels = get_feature_label(valid_df)
         dev_pool = catboost.Pool(data=valid_features, label=valid_labels)
     else:
         dev_pool = None
     self.model.fit(train_pool, eval_set=dev_pool, init_model=init_model)
Example 5
    def fit_predict_single_fold(
            self, train: TabularDataset,
            valid: TabularDataset) -> Tuple[cb.CatBoost, np.ndarray]:
        """Implements training and prediction on single fold.

        Args:
            train: Train Dataset.
            valid: Validation Dataset.

        Returns:
            Tuple (model, predicted_values).

        """
        params, num_trees, early_stopping_rounds, fobj, feval = \
            self._infer_params()

        cb_train = self._get_pool(train)
        cb_valid = self._get_pool(valid)

        model = cb.CatBoost({
            **params,
            'num_trees': num_trees,
            'objective': fobj,
            'eval_metric': feval,
            'od_wait': early_stopping_rounds,
        })

        model.fit(cb_train, eval_set=cb_valid)

        val_pred = self._predict(model, cb_valid, params)

        return model, val_pred
Example 6
 def state_set(self, state, trusted=True):
     super(CatBoostModel, self).state_set(state['substate'])
     data = base64.decodebytes(state['tree_state'].encode('ascii'))
     filename = tempfile.mktemp()
     with open(filename, 'wb') as f:
         f.write(data)
     self.booster = catboost.CatBoost().load_model(fname=filename)
Example 7
    def _fit(self, tunable_params):
        params = Learner._fit(self, tunable_params)
        if 'nthread' in params:
            params['thread_count'] = params['nthread']
            del params['nthread']

        self.model = cat.CatBoost(params)
        self.model.fit(self.train, eval_set=self.test, verbose_eval=True)
Example 8
 def fit(self, data, args):
     dtrain = cat.Pool(data.X_train, data.y_train)
     params = self.configure(data, args)
     params["iterations"] = args.ntrees
     self.model = cat.CatBoost(params)
     with Timer() as t:
         self.model.fit(dtrain)
     return t.interval
Example 9
 def load(self, filename: str) -> None:
     if self._is_lightgbm:
         self._model = lgb.Booster(model_file=filename)
     elif self._is_xgboost:
         self._model = xgb.Booster()
         self._model.load_model(filename)
     elif self._is_catboost:
         self._model = cat.CatBoost()
         self._model.load_model(filename)
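For symmetry, a hedged sketch of the matching save path under the same _is_* flags (the method name save is an assumption, not part of the original class):

    def save(self, filename: str) -> None:
        # All three libraries expose save_model on the fitted booster.
        if self._is_lightgbm:
            self._model.save_model(filename)
        elif self._is_xgboost:
            self._model.save_model(filename)
        elif self._is_catboost:
            self._model.save_model(filename)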
Example 10
def full_predict(df, model_params, general_params):
    pool = catboost.Pool(df["text"].values, text_features=[0])

    model_params = copy.deepcopy(model_params)
    model_params.update({"train_dir": general_params["logdir"]})

    model = catboost.CatBoost(model_params)
    model.load_model(os.path.join(general_params["logdir"], "model.cbm"))

    return model.predict(pool, prediction_type="Probability")
Example 11
    def train(self,
              x_train,
              y_train,
              x_cross=None,
              y_cross=None,
              cat_features=None):
        """
        x_cross or y_cross is None
        -> model train limted num_rounds
        
        x_cross and y_cross is Not None
        -> model train using validation set
        """
        if isinstance(y_train, pd.DataFrame) is True:
            y_train = y_train[y_train.columns[0]]
            if y_cross is not None:
                y_cross = y_cross[y_cross.columns[0]]

        if x_cross is None:
            dtrain = cat.Pool(x_train, y_train, cat_features=cat_features)
            train_round = self.best_round
            if self.best_round == 0:
                train_round = self.num_rounds

            self.clf = cat.CatBoost(params=self.param)
            self.clf.fit(dtrain, verbose_eval=self.verbose_eval)
            del dtrain
        else:
            dtrain = cat.Pool(x_train, y_train, cat_features=cat_features)
            dvalid = cat.Pool(x_cross, y_cross, cat_features=cat_features)

            self.clf = cat.CatBoost(params=self.param)
            self.clf.fit(dtrain,
                         eval_set=[dvalid],
                         early_stopping_rounds=self.early_stopping,
                         verbose_eval=self.verbose_eval)
            self.best_round = max(self.best_round, self.clf.best_iteration_)
            del dtrain, dvalid

        gc.collect()
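Note that train_round is computed in the no-validation branch but never applied. A hedged sketch of how it could be wired into the parameters (an assumption about the intended behaviour, not part of the original code):

    # Sketch: cap the number of boosting rounds at the previously found best round.
    param = dict(self.param, iterations=train_round)
    self.clf = cat.CatBoost(params=param)
    self.clf.fit(dtrain, verbose_eval=self.verbose_eval)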
Example 12
def cat_single(
    train,
    test,
    target_name,
    features,
    params,
    metric,
    other_params,
    fold_num=None,
    plot_importance=True,
    print_result=True,
    predict_train=True,
):

    dtrain, dval, X_train, X_val = get_data_set(
        train, target_name, features, other_params["cat_features"],
        other_params["weight"], other_params, fold_num
    )
    dtest = cat.Pool(
        data=test[features],
        feature_names=features,
        cat_features=other_params["cat_features"],
    )

    model = cat.CatBoost(params)
    model.fit(dtrain, eval_set=dval, use_best_model=True)
    train_pred = (
        model.predict(dtrain, other_params["prediction_type"])
        if predict_train
        else None
    )
    val_pred = model.predict(dval, prediction_type=other_params["prediction_type"])
    test_pred = model.predict(dtest, prediction_type=other_params["prediction_type"])
    if other_params["prediction_type"] == "Probability" and other_params["num_class"] == 1:
        if predict_train:
            train_pred = train_pred[:, 1]
        val_pred = val_pred[:, 1]
        test_pred = test_pred[:, 1]

    val_score = metric(X_val[target_name], val_pred)
    train_score = metric(X_train[target_name], train_pred) if predict_train else -1

    if print_result:
        print(
            "final train score: {} - validation score: {}".format(
                str(round(train_score, 5)), str(round(val_score, 5))
            )
        )
    importances = [model.get_feature_importance(prettified=True)]

    if plot_importance:
        _plot_importance_(importances, features)
    return train_pred, val_pred, test_pred, train_score, val_score, importances, model
Example 13
    def __init__(self, config):
        self.log = logging.getLogger("CatboostModel")
        self.log.info("model config: {0}".format(config))
        self.config = config
        catboost_parameters = {
            'iterations': config["iterations"],
            'custom_metric': ['NDCG', 'PFound', 'AverageGain:top=5'],
            'random_seed': 0,
            'loss_function': config["loss_function"]
        }

        self.model = cb.CatBoost(catboost_parameters)
        self.log.info("inited")
Example 14
    def __init__(self, **kwargs):

        self.params = {}
        self.is_fitted = False
        for key, value in kwargs.items():
            self.params[key] = value
        self.model = cb.CatBoost(self.params)

        if self.params['objective'] in [
                'auc', 'binary_logloss', 'multi_logloss'
        ]:
            self.get_best_metric = max
        else:
            self.get_best_metric = min
Example 15
def main(readcsv=pd_read_csv, method='defaultDense'):
    # Path to data
    train_file = "./data/batch/df_classification_train.csv"
    test_file = "./data/batch/df_classification_test.csv"

    # Data reading
    X_train = readcsv(train_file, range(3), t=np.float32)
    y_train = readcsv(train_file, range(3, 4), t=np.float32)
    X_test = readcsv(test_file, range(3), t=np.float32)
    y_test = readcsv(test_file, range(3, 4), t=np.float32)

    # Datasets creation
    cb_train = cb.Pool(X_train, label=np.array(y_train))
    cb_test = cb.Pool(X_test, label=np.array(y_test))

    # training parameters setting
    params = {
        'reg_lambda': 1,
        'max_depth': 8,
        'num_leaves': 2**8,
        'verbose': 0,
        'objective': 'MultiClass',
        'learning_rate': 0.3,
        'n_estimators': 100,
        'classes_count': 5,
    }

    # Training
    cb_model = cb.CatBoost(params)
    cb_model.fit(cb_train)

    # Catboost prediction
    cb_prediction = cb_model.predict(cb_test, prediction_type='Class').T[0]
    cb_errors_count = np.count_nonzero(cb_prediction - np.ravel(y_test))

    # Conversion to daal4py
    daal_model = d4p.get_gbt_model_from_catboost(cb_model)

    # daal4py prediction
    daal_predict_algo = d4p.gbt_classification_prediction(
        nClasses=params['classes_count'],
        resultsToEvaluate="computeClassLabels",
        fptype='float')
    daal_prediction = daal_predict_algo.compute(X_test, daal_model)
    daal_errors_count = np.count_nonzero(daal_prediction.prediction - y_test)
    assert np.absolute(cb_errors_count - daal_errors_count) == 0

    return (cb_prediction, cb_errors_count,
            np.ravel(daal_prediction.prediction), daal_errors_count,
            np.ravel(y_test))
Example 16
def gen_test(fnm, dataset, iterations=10, learning_rate=0.1, loss="RMSE"):
    features, labels = dataset()
    model = cb.CatBoost({
        "learning_rate": learning_rate,
        "iterations": iterations,
        "loss_function": loss
    })
    model.fit(features, y=labels)
    model.save_model(fnm + "-model.json", format="json")
    width = features.shape[1]
    x = np.random.rand(10, width)
    y = model.predict(x)
    with open(fnm + ".json", "wt") as f:
        json.dump({"x": x.tolist(), "y": y.tolist()}, f)
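The model saved with format="json" can be loaded back the same way; a small round-trip sketch, assuming the fnm, x, and y from the function above and that the installed catboost version supports loading the JSON format:

    # Sketch: reload the JSON model and check it reproduces the stored predictions.
    reloaded = cb.CatBoost()
    reloaded.load_model(fnm + "-model.json", format="json")
    assert np.allclose(reloaded.predict(x), y)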
Example 17
    def __init__(self,
                 products: pd.DataFrame,
                 params_rec: dict,
                 params_catboost: dict,
                 catboost_features=CB_FEATURES):
        self.ranker = catboost.CatBoost(params_catboost)
        self._catboost_features = catboost_features
        self._nan_fill_dict = dict()

        self.recommender = CosineRecommender(**params_rec)
        self._product_idx = dict(zip(products.product_id,
                                     range(len(products))))

        self._idx_product = products.product_id.tolist()
        self._product_features = {
            row['product_id']: dict(row.drop(index='product_id'))
            for (i, row) in products.iterrows()
        }
Example 18
    def fit(self, params, dtrain, dtest, max_n_estimators, n_estimators=None, early_stopping=False, seed=0):
        if self.gpu_id:
            params.update({"task_type": 'GPU'})
            params.update({'devices': self.gpu_id})
        if early_stopping:
            params.update({"od_wait": 100})
        params.update({"iterations": n_estimators if n_estimators else max_n_estimators})
        params.update({"random_seed": seed})
        bst = cb.CatBoost(params)
        bst.fit(dtrain, eval_set=dtest)

        results = bst.evals_result_['validation_0']['RMSE'] if self.learning_task == 'regression' \
                  else bst.evals_result_['validation_0']['Logloss']

        return (
            bst.best_score_['validation_0']['RMSE'] if early_stopping else results[-1],
            bst.best_iteration_ if early_stopping else (len(results) - 1),
            results
        )
Example 19
 def train(
     self,
     x_train: Union[pd.DataFrame, np.ndarray],
     y_train: Union[pd.DataFrame, np.ndarray],
     x_valid: Union[pd.DataFrame, np.ndarray],
     y_valid: Union[pd.DataFrame, np.ndarray],
     cat_features: Optional[List[Union[str, int]]] = None,
     feature_names: Optional[List[str]] = None
 ) -> None:
     if not isinstance(x_train, pd.DataFrame) and feature_names is None:
         raise ValueError('Feature names are not specified. Use pd.DataFrame for inputs or pass feature_names')
     if self._is_lightgbm:
         train_data = lgb.Dataset(x_train, label=y_train)
         valid_data = lgb.Dataset(x_valid, label=y_valid)
         self._model = lgb.train(
             self._params_as_params(self._params),
             train_data,
             valid_names=['train', 'valid'],
             valid_sets=[train_data, valid_data],
             **self._params_as_kwargs(self._params)
         )
     elif self._is_xgboost:
         feature_names = feature_names or x_train.columns
         train_data = xgb.DMatrix(x_train, label=y_train, feature_names=feature_names)
         valid_data = xgb.DMatrix(x_valid, label=y_valid, feature_names=feature_names)
         self._model = xgb.train(
             self._params_as_params(self._params),
             train_data,
             evals=[(train_data, 'train'), (valid_data, 'valid')],
             **self._params_as_kwargs(self._params)
         )
     elif self._is_catboost:
         train_data = cat.Pool(x_train, y_train, cat_features=cat_features)
         valid_data = cat.Pool(x_valid, y_valid, cat_features=cat_features)
         self._model = cat.CatBoost(
             self._params_as_params(self._params),
         )
         self._model.fit(
             train_data,
             eval_set=valid_data,
             use_best_model=True,
             **self._params_as_kwargs(self._params)
         )
Example 20
 def fit(self,
         train_df,
         step_sample_num=204800,
         group_size=40,
         niter=5000,
         train_dir='.',
         seed=123):
     if self.model is not None:
         init_model = self.model
     else:
         init_model = None
     params = {
         'loss_function': 'YetiRank',
         'task_type': 'GPU',
         'iterations': niter,
         'verbose': True,
         'train_dir': train_dir,
         'random_seed': seed
     }
     num_fit_calls = 1
     step_sample_num = min(step_sample_num, len(train_df) * 5)
     self.model = catboost.CatBoost(params)
     features, thrpt = get_feature_label(train_df)
     sampler = CatBoostPoolIndicesGenerator(thrpt,
                                            sample_num=step_sample_num,
                                            group_size=group_size)
     for i in range(num_fit_calls):
         indices = sampler()
         step_features = np.take(features, indices, axis=0)
         step_thrpt = np.take(thrpt, indices, axis=0)
         if self.normalize_relevance:
             step_thrpt = step_thrpt / (
                 np.max(step_thrpt, axis=-1, keepdims=True) + 1E-6)
         step_groups = np.broadcast_to(
             np.arange(step_thrpt.shape[0]).reshape((-1, 1)),
             step_thrpt.shape)
         train_pool = catboost.Pool(data=step_features.reshape(
             (-1, step_features.shape[-1])),
                                    label=step_thrpt.reshape((-1, )),
                                    group_id=step_groups.reshape((-1, )))
         self.model.fit(train_pool, init_model=init_model)
Example 21
    def train(self, X_train, y_train, X_valid=None, y_valid=None, params=None, **kwargs):

        train_data = cat.Pool(X_train, y_train, cat_features=None)

        watchlist = [train_data]

        if X_valid is not None and y_valid is not None:
            valid_data = cat.Pool(X_valid, y_valid, cat_features=None)
            watchlist.append(valid_data)
            stopping_rounds = 10
        else:
            stopping_rounds = None

        self.mod = cat.CatBoost(params=params)
        self.mod.fit(X=train_data,
                     eval_set=watchlist,
                     early_stopping_rounds=stopping_rounds,
                     verbose=False,
                     **kwargs)

        return self.mod
Example 22
    def load_models(self, model_path):
        """Load models."""
        valid_net_file = '{}/valid_model.cbm'.format(model_path)
        rank_net_cbm = '{}/list_rank_net.cbm'.format(model_path)
        rank_net_py = '{}/list_rank_net.py'.format(model_path)
        self.path = model_path
        self.is_cbm_model = True

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            if os.path.exists(valid_net_file):
                self.valid_net = catboost.CatBoostClassifier().load_model(valid_net_file)
            else:
                self.valid_net = None

            try:
                self.rank_net = catboost.CatBoost().load_model(rank_net_cbm)
            except NameError: # CatBoost is unavailable. Try to load Python model.
                self.is_cbm_model = False
                with open(rank_net_py, 'rb') as filep:
                    self.rank_net = imp.load_module(
                        model_path.replace('/', '_').replace('.', '_'), filep,
                        'list_rank_net.py', ('.py', 'rb', imp.PY_SOURCE))
Example 23
 def eval(self, params, train_file, test_file, seed=0, train_query_fname=None, test_query_fname=None,\
     early_stopping_rounds=None, num_rounds=None):
     prefix = "libsvm://"
     train_group_id = self._get_group_id_from_file(train_query_fname)
     test_group_id = self._get_group_id_from_file(test_query_fname)
     train_data = cat.Pool(prefix + train_file,
                           column_description=self.column_description_fname)
     test_data = cat.Pool(prefix + test_file,
                          column_description=self.column_description_fname)
     if train_group_id is not None:
         assert test_group_id is not None
         train_data.set_group_id(train_group_id)
         test_data.set_group_id(test_group_id)
     self.fullfill_parameters(params, seed)
     if num_rounds is not None:
         params["iterations"] = num_rounds
     print("eval with params " + str(params))
     cat_booster = cat.CatBoost(params=params)
     try:
         cat_booster.fit(train_data,
                         eval_set=[test_data],
                         verbose=1,
                         early_stopping_rounds=early_stopping_rounds)
     except Exception as err:
         print("error message: ", err)
     eval_results = cat_booster.get_evals_result()
     results = eval_results["validation"][self.metric]
     if num_rounds is not None and len(results) < num_rounds:
         # Pad the curve with its last value so it always has num_rounds entries.
         results = np.array(list(results) + [results[-1]] * (num_rounds - len(results)))
     train_data = None
     test_data = None
     return cat_booster, results
Example 24
    y = pd.Series(iris.target)
    return X, y


def read_yaml(path):
    with open(path, "r") as f:
        return yaml.safe_load(f)


MODEL_PARAMS = {"allow_writing_files": False, "iterations": 10}


@pytest.fixture(
    scope="module",
    params=[
        cb.CatBoost(MODEL_PARAMS),
        cb.CatBoostClassifier(**MODEL_PARAMS),
        cb.CatBoostRegressor(**MODEL_PARAMS),
    ],
    ids=["CatBoost", "CatBoostClassifier", "CatBoostRegressor"],
)
def cb_model(request):
    model = request.param
    X, y = get_iris()
    return ModelWithData(model=model.fit(X, y), inference_dataframe=X)


@pytest.fixture
def reg_model():
    model = cb.CatBoostRegressor(**MODEL_PARAMS)
    X, y = get_iris()
Example 25
def train_ranking_catboost(args, train_df, test_df):
    import catboost
    params = {
        'loss_function': args.rank_loss_function,
        'custom_metric': ['NDCG', 'AverageGain:top=10'],
        'task_type': 'GPU',
        'iterations': args.niter,
        'verbose': True,
        'train_dir': args.out_dir,
        'random_seed': args.seed
    }
    train_dev_features, train_dev_labels = get_feature_label(train_df)
    test_features, test_labels = get_feature_label(test_df)

    # Split Train/Dev
    shuffle_idx = np.random.permutation(train_dev_features.shape[0])
    train_dev_features, train_dev_labels = \
        train_dev_features[shuffle_idx], train_dev_labels[shuffle_idx]
    num_train = train_dev_features.shape[0] - int(
        args.dev_ratio * train_dev_features.shape[0])
    train_features, train_labels = \
        train_dev_features[:num_train], train_dev_labels[:num_train]
    dev_features, dev_labels = \
        train_dev_features[num_train:], train_dev_labels[num_train:]

    total_data_size = len(train_df) + len(test_df)
    get_sample_size = lambda ratio: \
        int(min(total_data_size, args.max_data_size) * ratio * args.sample_mult)
    dev_sample_size = get_sample_size(args.dev_ratio * (1 - args.test_ratio))
    train_sample_size = get_sample_size(
        (1 - args.dev_ratio) * (1 - args.test_ratio))
    test_sample_size = get_sample_size(args.test_ratio)

    # Generate the training/testing samples for ranking.
    # We divide the samples into multiple bins and will do stratified sampling within each bin.
    sorted_train_ids = np.argsort(train_labels)
    train_group_ids_list = np.array_split(sorted_train_ids,
                                          args.num_threshold_bins)

    sorted_dev_ids = np.argsort(dev_labels)
    dev_group_ids_list = np.array_split(sorted_dev_ids,
                                        args.num_threshold_bins)

    sorted_test_ids = np.argsort(test_labels)
    test_group_ids_list = np.array_split(sorted_test_ids,
                                         args.num_threshold_bins)

    train_rank_features = []
    train_rank_labels = []
    train_groups = []

    dev_rank_features = []
    dev_rank_labels = []
    dev_groups = []

    test_rank_features = []
    test_rank_labels = []
    test_groups = []

    train_npz_file = os.path.join(args.out_dir, 'train_rank_features.npz')
    dev_npz_file = os.path.join(args.out_dir, 'dev_rank_features.npz')
    test_npz_file = os.path.join(args.out_dir, 'test_rank_features.npz')
    if os.path.exists(test_npz_file):
        print('Loading features from npz')
        assert os.path.exists(train_npz_file)
        assert os.path.exists(dev_npz_file)

        npzfile = np.load(train_npz_file)
        train_rank_features = npzfile['train_rank_features']
        train_rank_labels = npzfile['train_rank_labels']
        train_groups = npzfile['train_groups']

        npzfile = np.load(dev_npz_file)
        dev_rank_features = npzfile['dev_rank_features']
        dev_rank_labels = npzfile['dev_rank_labels']
        dev_groups = npzfile['dev_groups']

        npzfile = np.load(test_npz_file)
        test_rank_features = npzfile['test_rank_features']
        test_rank_labels = npzfile['test_rank_labels']
        test_groups = npzfile['test_groups']
    else:

        print('Generate Dev Ranking Groups:')
        for i in tqdm.tqdm(range(dev_sample_size)):
            for group_ids in dev_group_ids_list:
                chosen_ids = np.random.choice(group_ids,
                                              args.group_size //
                                              args.num_threshold_bins,
                                              replace=False)
                dev_rank_features.append(dev_features[chosen_ids, :])
                dev_rank_labels.append(dev_labels[chosen_ids])
                dev_groups.append(np.ones_like(chosen_ids) * i)
        dev_rank_features = np.concatenate(dev_rank_features, axis=0)
        dev_rank_labels = np.concatenate(dev_rank_labels, axis=0)
        dev_groups = np.concatenate(dev_groups, axis=0)

        np.savez(os.path.join(args.out_dir, 'dev_rank_features.npz'),
                 dev_rank_features=dev_rank_features,
                 dev_rank_labels=dev_rank_labels,
                 dev_groups=dev_groups)

        print('Generate Train Ranking Groups:')
        for i in tqdm.tqdm(range(train_sample_size)):
            for group_ids in train_group_ids_list:
                chosen_ids = np.random.choice(group_ids,
                                              args.group_size //
                                              args.num_threshold_bins,
                                              replace=False)
                train_rank_features.append(train_features[chosen_ids, :])
                train_rank_labels.append(train_labels[chosen_ids])
                train_groups.append(np.ones_like(chosen_ids) * i)
        train_rank_features = np.concatenate(train_rank_features, axis=0)
        train_rank_labels = np.concatenate(train_rank_labels, axis=0)
        train_groups = np.concatenate(train_groups, axis=0)

        np.savez(os.path.join(args.out_dir, 'train_rank_features.npz'),
                 train_rank_features=train_rank_features,
                 train_rank_labels=train_rank_labels,
                 train_groups=train_groups)

        test_rng = np.random.RandomState(args.test_seed)
        print('Generate Test Ranking Groups:')
        for i in tqdm.tqdm(range(test_sample_size)):
            for group_ids in test_group_ids_list:
                chosen_ids = test_rng.choice(group_ids,
                                             args.group_size //
                                             args.num_threshold_bins,
                                             replace=False)
                test_rank_features.append(test_features[chosen_ids, :])
                test_rank_labels.append(test_labels[chosen_ids])
                test_groups.append(np.ones_like(chosen_ids) * i)
        test_rank_features = np.concatenate(test_rank_features, axis=0)
        test_rank_labels = np.concatenate(test_rank_labels, axis=0)
        test_groups = np.concatenate(test_groups, axis=0)

        np.savez(os.path.join(args.out_dir, 'test_rank_features.npz'),
                 test_rank_features=test_rank_features,
                 test_rank_labels=test_rank_labels,
                 test_groups=test_groups)

    train_pool = catboost.Pool(data=train_rank_features,
                               label=train_rank_labels,
                               group_id=train_groups)
    dev_pool = catboost.Pool(data=dev_rank_features,
                             label=dev_rank_labels,
                             group_id=dev_groups)
    # test_pool = catboost.Pool(data=test_rank_features,
    #                           label=test_rank_labels,
    #                           group_id=test_groups)
    model = catboost.CatBoost(params)
    model.fit(train_pool, eval_set=dev_pool)
    predict_result = model.predict(test_rank_features)

    test_gt_scores = test_rank_labels.reshape(test_sample_size,
                                              args.group_size)
    predict_result = predict_result.reshape(
        (test_sample_size, args.group_size))
    np.save(os.path.join(args.out_dir, 'test_predictions.npy'), predict_result)
    test_ndcg_score = ndcg_score(y_true=test_gt_scores, y_score=predict_result)
    logging.info('Test NDCG=%f', test_ndcg_score)
    model.save_model(os.path.join(args.out_dir, 'list_rank_net.cbm'))
    model.save_model(os.path.join(args.out_dir, 'list_rank_net'),
                     format='python')
Example 26
 def _fit(self, tunable_params):
     params = Learner._fit(self, tunable_params)
     self.model = cat.CatBoost(params)
     self.model.fit(self.train, eval_set=self.test, verbose_eval=True)
Example 27
if __name__ == "__main__":

    def load_part(part_id):
        raw_samples = pickle.load(
            open(
                "../tmp/features/candidates_train_{:02d}.pickled".format(
                    part_id), "rb"))
        return [limit_candidates_number(s, limit=1000) for s in raw_samples]

    part12 = load_part(12)
    part13 = load_part(13)

    train_pool = build_pool(part12[:20000] + part13)
    valid_pool = build_pool(part12[20000:])

    parameters = {
        "iterations": 200,
        "learning_rate": 0.05,
        "depth": 4,
        "loss_function": "PairLogit",
        "custom_metric": ["MAP:top=30", "RecallAt:top=250"],
        "verbose": False,
        "random_seed": 44,
        "task_type": "GPU",
        "max_ctr_complexity": 0,
    }

    model = catboost.CatBoost(parameters)
    model.fit(train_pool, eval_set=valid_pool, plot=True)
    model.save_model("../tmp/model_v4.3_candidates.cbm")
Example 28
    'depth': 6,
    'l2_leaf_reg': 3,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': 2018
}
# K-fold cross-validation
train_preds = np.zeros(train_feat.shape[0])
test_preds = np.zeros((test_feat.shape[0], 5))
kf = KFold(len(train_feat), n_folds=5, shuffle=True, random_state=520)  # 5-fold cross-validation
test = catboost.Pool(test_feat[predictors], cat_features=[20])
for i, (train_index, test_index) in enumerate(kf):
    print('Training fold {}...'.format(i))
    train_feat1 = train_feat.iloc[train_index]  # training split
    train_feat2 = train_feat.iloc[test_index]  # validation split
    train = catboost.Pool(train_feat1[predictors],
                          train_feat1['血糖'],
                          cat_features=[20])
    val = catboost.Pool(train_feat2[predictors],
                        train_feat2['血糖'],
                        cat_features=[20])
    model = catboost.CatBoost(params=cat_params)
    model.fit(X=train,
              eval_set=val,
              use_best_model=True,
              logging_level='Verbose')
    train_preds[test_index] += model.predict(val)
    test_preds[:, i] = model.predict(test)
print('Offline score: {}'.format(
    mean_squared_error(train_feat['血糖'], train_preds) * 0.5))
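The snippet fills one column of test_preds per fold but never combines them; a hedged sketch of the usual final step (an assumption, not shown in the original):

    # Sketch: average the five fold predictions to get the final test prediction.
    final_test_pred = test_preds.mean(axis=1)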
Example 29
def train(
    config: TrainConfig,
    train_features: pd.DataFrame,
    test_features: pd.DataFrame,
    products_enriched: pd.DataFrame,
    train_gt_items_count: pd.DataFrame,
    test_gt_items_count: pd.DataFrame,
):

    train_features = pd.merge(train_features, products_enriched, how='left')
    test_features = pd.merge(test_features, products_enriched, how='left')

    columns_diff = set(train_features.columns) - set(cols)
    logger.info(f'columns not used: {columns_diff}')
    for df in (train_features, test_features):
        df['target'] = df['target'].fillna(0).astype(int)
        df.segment_id = df.segment_id.fillna(0).astype(int)
        for col in cat_cols:
            df[col] = df[col].fillna(0)

    clients_ids = train_features.client_id.unique()
    sample_clients = np.random.choice(clients_ids,
                                      size=len(clients_ids) * 9 // 10,
                                      replace=False)

    train_df = train_features[train_features.client_id.isin(sample_clients)]
    val_df = train_features[~train_features.client_id.isin(sample_clients)]
    test_df = test_features

    groups = {}
    for name, df in (('train', train_df), ('val', val_df), ('test', test_df)):
        client_id_map = {
            client_id: i
            for i, client_id in enumerate(df['client_id'].unique())
        }
        groups[name] = df['client_id'].map(client_id_map).values

    train_pool = cb.Pool(train_df[cols],
                         train_df['target'],
                         cat_features=cat_cols,
                         group_id=groups['train'])
    val_pool = cb.Pool(val_df[cols],
                       val_df['target'],
                       cat_features=cat_cols,
                       group_id=groups['val'])
    test_pool = cb.Pool(test_df[cols],
                        test_df['target'],
                        cat_features=cat_cols,
                        group_id=groups['test'])

    model = cb.CatBoost(config.catboost.train_params)
    model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=100)
    model.save_model(config.catboost.model_file)
    # validate
    gt_cnt_map = {
        'val': train_gt_items_count,
        'test': test_gt_items_count,
    }
    for name, df, pool in (('val', val_df, val_pool), ('test', test_df,
                                                       test_pool)):
        gt_items_cnt = gt_cnt_map[name]
        df['score'] = model.predict(pool)
        for order in ('score', 'target'):
            scoring = (df[['client_id', 'product_id', 'score',
                           'target']].merge(gt_items_cnt).sort_values(
                               ['client_id', order], ascending=[True, False]))

            scoring['rank'] = scoring.groupby('client_id').cumcount() + 1
            scoring['cum_target'] = scoring.groupby(
                'client_id')['target'].cumsum()
            scoring['prec'] = ((scoring['rank'] <= 30) * scoring['target'] *
                               (scoring['cum_target'] / scoring['rank']) /
                               scoring['gt_count']).fillna(0)

            score = scoring.groupby('client_id').prec.sum().fillna(0).mean()
            logger.info(f'[{name}] order by {order} : {score}')
Example 30
from lib.hardcode import TOP_ITEMS
from lib.logger import configure_logger
from lib.product_store_features import ProductStoreStats
from lib.recommender import CatBoostRecommenderWithPopularFallback, cols
from lib.utils import read_products_file, pickle_load

logger = configure_logger(logger_name='server', log_dir='')

logger.info('starting to load all stuff')
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
config = TrainConfig.from_json('configs/config.json')

app = Flask(__name__)
app.products_data = read_products_file(config.products_enriched_file)

app.model = catboost.CatBoost()
app.model.load_model(config.catboost.model_file)
app.item_vectors = pickle_load(config.implicit.vectors_file)
with open(config.implicit.model_file, 'rb') as f:
    app.implicit_model = pickle.load(f)

app.product_store_stats = ProductStoreStats()

app.recommender = CatBoostRecommenderWithPopularFallback(
    model=app.model,
    implicit_model=app.implicit_model,
    item_vectors=app.item_vectors,
    products_data=app.products_data,
    product_store_stats=app.product_store_stats,
    feature_names=cols,
)