def generate_submit(self, num_boost_round=None, from_model_saved=False):
    """Train (or load) a CatBoost model and write test-set probabilities to CSV.

    Args:
        num_boost_round: Number of boosting iterations; required (asserted).
        from_model_saved: Falsy to train from scratch (the trained model is
            then saved); otherwise a path to a saved CatBoost model file that
            is loaded instead of training.
    """
    assert num_boost_round is not None
    if not from_model_saved:
        dtrain = self.get_train_set(as_cgb_pool=True)
        booster = cgb.train(
            dtrain=dtrain,
            params=self.params_best_fit,
            num_boost_round=num_boost_round)
        self.save_model(booster)
    else:
        booster = cgb.CatBoost(model_file=from_model_saved)
    dftest = self.get_test_set(as_cgb_pool=True)
    with Timer("Predicting"):
        probas = booster.predict(dftest, prediction_type="Probability")
    dfpred = pd.DataFrame(probas)[[1]]  # keep probability of class 1 only
    dfpred = dfpred.rename(columns={1: 'target'})
    # Timestamped file name so successive submits do not overwrite each other.
    now = pd.Timestamp.now(tz='CET').strftime("%d-%Hh-%Mm")
    fpath = RESULT_DIR / "catboost_submit_{}.csv".format(now)
    with Timer('Storing in {}'.format(fpath)):
        dfpred.to_csv(fpath, index=False)
def load(cls, path):
    """Load a saved CatBoost ranking model from *path*.

    Returns an instance wrapping the model, or None when the catboost
    package is unavailable in this environment.
    """
    model_file = os.path.join(path, 'cat_ranking.cbm')
    try:
        booster = catboost.CatBoost().load_model(model_file)
        return cls(model=booster)
    except NameError:
        # CatBoost is unavailable. Try to load Python model.
        pass
def main():
    """Train a CatBoost Logloss model on the full dataset, save it, and emit
    sigmoid-transformed predictions for a sample file plus a Java-style
    snippet for a single-row sanity check."""
    # load dataset
    df = pd.read_csv(DATA_PATH)
    target = 'ACTION'
    features = [column for column in df.columns if column != target]
    # fit model
    model = catboost.CatBoost({
        'loss_function': 'Logloss',
        'verbose': False,
        'random_seed': 0
    })
    model.fit(df[features], df[target])
    model.save_model(MODEL_PATH)
    # predict on sample
    df_sample = pd.read_csv(DATA_SAMPLE_PATH)
    predicts = model.predict(df_sample)
    # raw margins -> probabilities via the sigmoid 1 / (1 + exp(-x))
    predicts = np.power(1 + np.exp(-predicts), -1)
    pd.DataFrame({
        'x': predicts
    }).to_csv(DATA_SAMPLE_PREDICT_PATH, index=False, header=False)
    # predict on one sample; print code usable in a Java test harness
    print('Parameters:')
    r = df_sample[:1].to_dict('records')
    for k, v in r[0].items():
        print(f'input.put("{k}", {v}.0);')
    print('Expected predict:')
    print(np.power(1 + np.exp(-model.predict(df_sample[:1])[0]), -1))
def fit(self, train_df, valid_df=None, train_dir='.', niter=10000, seed=123):
    """Fit (or continue fitting) the RMSE CatBoost model on *train_df*.

    When a model already exists it is passed as init_model so training
    resumes from it; otherwise a fresh GPU RMSE model is created.
    """
    init_model = self.model
    if init_model is None:
        self.model = catboost.CatBoost({
            'loss_function': 'RMSE',
            'task_type': 'GPU',
            'iterations': niter,
            'verbose': True,
            'train_dir': train_dir,
            'random_seed': seed,
        })
    x, y = get_feature_label(train_df)
    train_pool = catboost.Pool(data=x, label=y)
    dev_pool = None
    if valid_df is not None:
        vx, vy = get_feature_label(valid_df)
        dev_pool = catboost.Pool(data=vx, label=vy)
    self.model.fit(train_pool, eval_set=dev_pool, init_model=init_model)
def fit_predict_single_fold(
        self, train: TabularDataset,
        valid: TabularDataset) -> Tuple[cb.CatBoost, np.ndarray]:
    """Implements training and prediction on single fold.

    Args:
        train: Train Dataset.
        valid: Validation Dataset.

    Returns:
        Tuple (model, predicted_values).
    """
    params, num_trees, early_stopping_rounds, fobj, feval = self._infer_params()
    pool_train = self._get_pool(train)
    pool_valid = self._get_pool(valid)
    # Merge the inferred tuning values into a copy of the base parameters.
    fit_params = dict(params)
    fit_params.update({
        'num_trees': num_trees,
        'objective': fobj,
        'eval_metric': feval,
        'od_wait': early_stopping_rounds,
    })
    model = cb.CatBoost(fit_params)
    model.fit(pool_train, eval_set=pool_valid)
    return model, self._predict(model, pool_valid, params)
def state_set(self, state, trusted=True):
    """Restore serialized model state.

    The CatBoost tree state is stored base64-encoded; decode it into a
    temporary file, let catboost load it, then remove the file.

    Args:
        state: Mapping with 'substate' (forwarded to the parent class) and
            'tree_state' (base64-encoded .cbm bytes).
        trusted: Unused here; kept for interface compatibility.
    """
    import os

    super(CatBoostModel, self).state_set(state['substate'])
    data = base64.decodebytes(state['tree_state'].encode('ascii'))
    # tempfile.mktemp() is deprecated and race-prone; mkstemp() creates the
    # file securely and hands back an open descriptor.
    fd, filename = tempfile.mkstemp()
    try:
        with os.fdopen(fd, 'wb') as f:
            f.write(data)
        self.booster = catboost.CatBoost().load_model(fname=filename)
    finally:
        # Don't leak the temp file, even if loading fails.
        os.remove(filename)
def _fit(self, tunable_params):
    """Resolve parameters (mapping nthread -> thread_count) and train."""
    params = Learner._fit(self, tunable_params)
    if 'nthread' in params:
        # CatBoost names this option thread_count.
        params['thread_count'] = params.pop('nthread')
    self.model = cat.CatBoost(params)
    self.model.fit(self.train, eval_set=self.test, verbose_eval=True)
def fit(self, data, args):
    """Train on data.X_train/y_train and return the wall-clock fit time."""
    pool = cat.Pool(data.X_train, data.y_train)
    params = self.configure(data, args)
    params["iterations"] = args.ntrees
    self.model = cat.CatBoost(params)
    with Timer() as t:
        self.model.fit(pool)
    return t.interval
def load(self, filename: str) -> None:
    """Load a trained booster from *filename* for the active backend."""
    if self._is_lightgbm:
        self._model = lgb.Booster(model_file=filename)
        return
    if self._is_xgboost:
        booster = xgb.Booster()
        booster.load_model(filename)
        self._model = booster
        return
    if self._is_catboost:
        booster = cat.CatBoost()
        booster.load_model(filename)
        self._model = booster
def full_predict(df, model_params, general_params):
    """Load the saved text model from the log dir and return class
    probabilities for df['text']."""
    pool = catboost.Pool(df["text"].values, text_features=[0])
    # Work on a copy so the caller's parameter dict is untouched.
    params = copy.deepcopy(model_params)
    params["train_dir"] = general_params["logdir"]
    model = catboost.CatBoost(params)
    model.load_model(os.path.join(general_params["logdir"], "model.cbm"))
    return model.predict(pool, prediction_type="Probability")
def train(self, x_train, y_train, x_cross=None, y_cross=None, cat_features=None):
    """Fit a CatBoost model.

    x_cross or y_cross is None -> model trains a limited num_rounds.
    x_cross and y_cross is not None -> model trains using the validation set
    with early stopping, and best_round is updated from the best iteration.
    """
    # Reduce single-column DataFrames to Series labels.
    if isinstance(y_train, pd.DataFrame) is True:
        y_train = y_train[y_train.columns[0]]
        if y_cross is not None:
            y_cross = y_cross[y_cross.columns[0]]
    if x_cross is None:
        dtrain = cat.Pool(x_train, y_train, cat_features=cat_features)
        train_round = self.best_round
        if self.best_round == 0:
            train_round = self.num_rounds
        # NOTE(review): train_round is computed but never applied below;
        # the iteration count presumably comes from self.param — confirm.
        self.clf = cat.CatBoost(params=self.param)
        self.clf.fit(dtrain, verbose_eval=self.verbose_eval)
        del dtrain
    else:
        dtrain = cat.Pool(x_train, y_train, cat_features=cat_features)
        dvalid = cat.Pool(x_cross, y_cross, cat_features=cat_features)
        self.clf = cat.CatBoost(params=self.param)
        self.clf.fit(dtrain,
                     eval_set=[dvalid],
                     early_stopping_rounds=self.early_stopping,
                     verbose_eval=self.verbose_eval)
        # Remember the best iteration seen so far across train() calls.
        self.best_round = max(self.best_round, self.clf.best_iteration_)
        del dtrain, dvalid
    gc.collect()
def cat_single(
    train,
    test,
    target_name,
    features,
    params,
    metric,
    other_params,
    fold_num=None,
    plot_importance=True,
    print_result=True,
    predict_train=True,
):
    """Train one CatBoost model on a single split and score train/val/test.

    Args:
        train, test: Source dataframes.
        target_name: Name of the label column.
        features: Feature column names.
        params: CatBoost constructor parameters.
        metric: Callable(y_true, y_pred) -> score.
        other_params: Dict supplying 'cat_features', 'weight',
            'prediction_type' and 'num_class'.
        fold_num: Optional fold index forwarded to get_data_set.
        plot_importance, print_result, predict_train: Behaviour toggles.

    Returns:
        (train_pred, val_pred, test_pred, train_score, val_score,
         importances, model)
    """
    dtrain, dval, X_train, X_val = get_data_set(
        train, target_name, features, other_params["cat_features"],
        other_params["weight"], other_params, fold_num
    )
    dtest = cat.Pool(
        data=test[features],
        feature_names=features,
        cat_features=other_params["cat_features"],
    )
    model = cat.CatBoost(params)
    model.fit(dtrain, eval_set=dval, use_best_model=True)
    train_pred = (
        model.predict(dtrain, other_params["prediction_type"])
        if predict_train else None
    )
    val_pred = model.predict(dval, prediction_type=other_params["prediction_type"])
    test_pred = model.predict(dtest, prediction_type=other_params["prediction_type"])
    # For binary problems predicted as 'Probability', keep only the
    # positive-class column of the (n, 2) probability matrix.
    if ((other_params["prediction_type"] == "Probability") &
            (other_params["num_class"] == 1)):
        if predict_train:
            train_pred = train_pred[:, 1]
        val_pred = val_pred[:, 1]
        test_pred = test_pred[:, 1]
    val_score = metric(X_val[target_name], val_pred)
    # -1 is the sentinel score when train predictions are skipped.
    train_score = metric(X_train[target_name], train_pred) if predict_train else -1
    if print_result:
        print(
            "final train score : {} -validation score: {}".format(
                str(round(train_score, 5)), str(round(val_score, 5))
            )
        )
    importances = [model.get_feature_importance(prettified=True)]
    if plot_importance:
        _plot_importance_(importances, features)
    return train_pred, val_pred, test_pred, train_score, val_score, importances, model
def __init__(self, config):
    """Build a CatBoost ranking model from *config* (iterations,
    loss_function)."""
    self.log = logging.getLogger("CatboostModel")
    self.log.info("model config: {0}".format(config))
    self.config = config
    self.model = cb.CatBoost({
        'iterations': config["iterations"],
        'custom_metric': ['NDCG', 'PFound', 'AverageGain:top=5'],
        'random_seed': 0,
        'loss_function': config["loss_function"],
    })
    self.log.info("inited")
def __init__(self, **kwargs):
    """Store hyper-parameters, build the underlying CatBoost model and pick
    the metric aggregator.

    Args:
        **kwargs: Forwarded verbatim to catboost.CatBoost as parameters.
    """
    self.params = dict(kwargs)  # one copy instead of an element-wise loop
    self.is_fitted = False
    self.model = cb.CatBoost(self.params)
    # .get avoids a KeyError when no 'objective' was supplied; objectives
    # where larger is better are maximised, everything else minimised.
    if self.params.get('objective') in (
        'auc', 'binary_logloss', 'multi_logloss'
    ):
        self.get_best_metric = max
    else:
        self.get_best_metric = min
def main(readcsv=pd_read_csv, method='defaultDense'):
    """Train a CatBoost multiclass model, convert it to daal4py, and assert
    both predictors misclassify the same number of test rows.

    Returns:
        (catboost predictions, catboost error count, daal4py predictions,
         daal4py error count, ground-truth labels)
    """
    # Path to data
    train_file = "./data/batch/df_classification_train.csv"
    test_file = "./data/batch/df_classification_test.csv"

    # Data reading: first 3 columns are features, 4th is the label
    X_train = readcsv(train_file, range(3), t=np.float32)
    y_train = readcsv(train_file, range(3, 4), t=np.float32)
    X_test = readcsv(test_file, range(3), t=np.float32)
    y_test = readcsv(test_file, range(3, 4), t=np.float32)

    # Datasets creation
    cb_train = cb.Pool(X_train, label=np.array(y_train))
    cb_test = cb.Pool(X_test, label=np.array(y_test))

    # training parameters setting
    # NOTE(review): 'reg_lambda'/'num_leaves'/'n_estimators' are alias-style
    # names — confirm the installed catboost version accepts them.
    params = {
        'reg_lambda': 1,
        'max_depth': 8,
        'num_leaves': 2**8,
        'verbose': 0,
        'objective': 'MultiClass',
        'learning_rate': 0.3,
        'n_estimators': 100,
        'classes_count': 5,
    }

    # Training
    cb_model = cb.CatBoost(params)
    cb_model.fit(cb_train)

    # Catboost prediction: class labels, transposed to a flat vector
    cb_prediction = cb_model.predict(cb_test, prediction_type='Class').T[0]
    cb_errors_count = np.count_nonzero(cb_prediction - np.ravel(y_test))

    # Conversion to daal4py
    daal_model = d4p.get_gbt_model_from_catboost(cb_model)

    # daal4py prediction
    daal_predict_algo = d4p.gbt_classification_prediction(
        nClasses=params['classes_count'],
        resultsToEvaluate="computeClassLabels",
        fptype='float')
    daal_prediction = daal_predict_algo.compute(X_test, daal_model)
    daal_errors_count = np.count_nonzero(daal_prediction.prediction - y_test)
    # Both implementations must disagree with the labels equally often.
    assert np.absolute(cb_errors_count - daal_errors_count) == 0

    return (cb_prediction, cb_errors_count,
            np.ravel(daal_prediction.prediction), daal_errors_count,
            np.ravel(y_test))
def gen_test(fnm, dataset, iterations=10, learning_rate=0.1, loss="RMSE"):
    """Train a small CatBoost model on *dataset*, save it as JSON, and dump
    random inputs with the model's predictions as a test fixture."""
    features, labels = dataset()
    hyperparams = {
        "learning_rate": learning_rate,
        "iterations": iterations,
        "loss_function": loss,
    }
    model = cb.CatBoost(hyperparams)
    model.fit(features, y=labels)
    model.save_model(fnm + "-model.json", format="json")
    n_cols = features.shape[1]
    x = np.random.rand(10, n_cols)
    y = model.predict(x)
    with open(fnm + ".json", "wt") as f:
        json.dump({"x": x.tolist(), "y": y.tolist()}, f)
def __init__(self, products: pd.DataFrame, params_rec: dict,
             params_catboost: dict, catboost_features=CB_FEATURES):
    """Set up the two-stage recommender: a cosine candidate generator plus a
    CatBoost ranker.

    Args:
        products: DataFrame with a 'product_id' column and feature columns.
        params_rec: Kwargs for CosineRecommender.
        params_catboost: Parameters for the CatBoost ranker.
        catboost_features: Feature names the ranker consumes.
    """
    self.ranker = catboost.CatBoost(params_catboost)
    self._catboost_features = catboost_features
    self._nan_fill_dict = dict()
    self.recommender = CosineRecommender(**params_rec)
    # product_id <-> dense index mappings
    self._product_idx = dict(zip(products.product_id, range(len(products))))
    self._idx_product = products.product_id.tolist()
    # Per-product feature dict keyed by product_id (the id itself dropped).
    self._product_features = {
        row['product_id']: dict(row.drop(index='product_id'))
        for (i, row) in products.iterrows()
    }
def fit(self, params, dtrain, dtest, max_n_estimators, n_estimators=None,
        early_stopping=False, seed=0):
    """Train a CatBoost booster and return validation metrics.

    Args:
        params: Base CatBoost parameters (mutated in place here).
        dtrain, dtest: Training and evaluation pools.
        max_n_estimators: Iterations used when n_estimators is not given.
        n_estimators: Optional explicit iteration count.
        early_stopping: Enables od_wait=100 and best-score reporting.
        seed: random_seed for CatBoost.

    Returns:
        (final or best eval score, best/last iteration index, eval curve)
    """
    if self.gpu_id:
        params.update({"task_type": 'GPU'})
        params.update({'devices': self.gpu_id})
    if early_stopping:
        params.update({"od_wait": 100})
    params.update({"iterations": n_estimators if n_estimators else max_n_estimators})
    params.update({"random_seed": seed})
    bst = cb.CatBoost(params)
    bst.fit(dtrain, eval_set=dtest)
    # Pick the eval curve matching the task's loss.
    results = bst.evals_result_['validation_0']['RMSE'] if self.learning_task == 'regression' \
        else bst.evals_result_['validation_0']['Logloss']
    # NOTE(review): best_score_ is always read from 'RMSE' even for the
    # classification task — confirm this is intended.
    return (
        bst.best_score_['validation_0']['RMSE'] if early_stopping else results[-1],
        bst.best_iteration_ if early_stopping else (len(results) - 1),
        results
    )
def train(
    self,
    x_train: Union[pd.DataFrame, np.ndarray],
    y_train: Union[pd.DataFrame, np.ndarray],
    x_valid: Union[pd.DataFrame, np.ndarray],
    y_valid: Union[pd.DataFrame, np.ndarray],
    cat_features: Optional[List[Union[str, int]]] = None,
    feature_names: Optional[List[str]] = None
) -> None:
    """Train the configured backend (LightGBM, XGBoost or CatBoost).

    Args:
        x_train, y_train: Training features and labels.
        x_valid, y_valid: Validation features and labels.
        cat_features: Categorical columns (used by the CatBoost branch).
        feature_names: Required for array inputs (XGBoost column names).

    Raises:
        ValueError: When inputs are arrays and feature_names is missing.
    """
    if not isinstance(x_train, pd.DataFrame) and feature_names is None:
        raise ValueError('Feature names are not specified. Use pd.DataFrame for inputs or pass feature_names')
    if self._is_lightgbm:
        train_data = lgb.Dataset(x_train, label=y_train)
        valid_data = lgb.Dataset(x_valid, label=y_valid)
        self._model = lgb.train(
            self._params_as_params(self._params),
            train_data,
            valid_names=['train', 'valid'],
            valid_sets=[train_data, valid_data],
            **self._params_as_kwargs(self._params)
        )
    elif self._is_xgboost:
        # DataFrames carry their own column names; arrays use feature_names.
        feature_names = feature_names or x_train.columns
        train_data = xgb.DMatrix(x_train, label=y_train, feature_names=feature_names)
        valid_data = xgb.DMatrix(x_valid, label=y_valid, feature_names=feature_names)
        self._model = xgb.train(
            self._params_as_params(self._params),
            train_data,
            evals=[(train_data, 'train'), (valid_data, 'valid')],
            **self._params_as_kwargs(self._params)
        )
    elif self._is_catboost:
        train_data = cat.Pool(x_train, y_train, cat_features=cat_features)
        valid_data = cat.Pool(x_valid, y_valid, cat_features=cat_features)
        self._model = cat.CatBoost(
            self._params_as_params(self._params),
        )
        self._model.fit(
            train_data,
            eval_set=valid_data,
            use_best_model=True,
            **self._params_as_kwargs(self._params)
        )
def fit(self, train_df, step_sample_num=204800, group_size=40, niter=5000,
        train_dir='.', seed=123):
    """Fit (or continue fitting) a YetiRank CatBoost ranker on sampled groups.

    Groups of *group_size* rows are drawn by CatBoostPoolIndicesGenerator;
    relevance can optionally be normalized within each group first.
    """
    if self.model is not None:
        # Continue training from the existing model.
        init_model = self.model
    else:
        init_model = None
    params = {
        'loss_function': 'YetiRank',
        'task_type': 'GPU',
        'iterations': niter,
        'verbose': True,
        'train_dir': train_dir,
        'random_seed': seed
    }
    num_fit_calls = 1
    # Never sample more than 5x the available rows.
    step_sample_num = min(step_sample_num, len(train_df) * 5)
    self.model = catboost.CatBoost(params)
    features, thrpt = get_feature_label(train_df)
    sampler = CatBoostPoolIndicesGenerator(thrpt,
                                           sample_num=step_sample_num,
                                           group_size=group_size)
    for i in range(num_fit_calls):
        indices = sampler()
        step_features = np.take(features, indices, axis=0)
        step_thrpt = np.take(thrpt, indices, axis=0)
        if self.normalize_relevance:
            # Scale relevance to [0, 1] within each group (+eps guards /0).
            step_thrpt = step_thrpt / (
                np.max(step_thrpt, axis=-1, keepdims=True) + 1E-6)
        # Row i of step_thrpt forms one ranking group.
        step_groups = np.broadcast_to(
            np.arange(step_thrpt.shape[0]).reshape((-1, 1)),
            step_thrpt.shape)
        train_pool = catboost.Pool(data=step_features.reshape(
            (-1, step_features.shape[-1])),
                                   label=step_thrpt.reshape((-1, )),
                                   group_id=step_groups.reshape((-1, )))
        self.model.fit(train_pool, init_model=init_model)
def train(self, X_train, y_train, X_valid=None, y_valid=None, params=None, **kwargs):
    """Fit a CatBoost model, optionally early-stopping on a validation pool.

    Returns the fitted catboost model (also kept on self.mod).
    """
    train_pool = cat.Pool(X_train, y_train, cat_features=None)
    eval_pools = [train_pool]
    stopping_rounds = None
    if X_valid is not None and y_valid is not None:
        eval_pools.append(cat.Pool(X_valid, y_valid, cat_features=None))
        stopping_rounds = 10
    self.mod = cat.CatBoost(params=params)
    self.mod.fit(X=train_pool,
                 eval_set=eval_pools,
                 early_stopping_rounds=stopping_rounds,
                 verbose=False,
                 **kwargs)
    return self.mod
def load_models(self, model_path):
    """Load the validity classifier and the ranking model from *model_path*.

    Prefers the native .cbm ranking model; when the catboost package is not
    importable (NameError), falls back to the exported Python module.
    """
    valid_net_file = '{}/valid_model.cbm'.format(model_path)
    rank_net_cbm = '{}/list_rank_net.cbm'.format(model_path)
    rank_net_py = '{}/list_rank_net.py'.format(model_path)
    self.path = model_path
    self.is_cbm_model = True
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # The validity model is optional.
        if os.path.exists(valid_net_file):
            self.valid_net = catboost.CatBoostClassifier().load_model(valid_net_file)
        else:
            self.valid_net = None
        try:
            self.rank_net = catboost.CatBoost().load_model(rank_net_cbm)
        except NameError:
            # CatBoost is unavailable. Try to load Python model.
            self.is_cbm_model = False
            # NOTE(review): the imp module is deprecated since Python 3.4;
            # consider importlib when this path is next touched.
            with open(rank_net_py, 'rb') as filep:
                self.rank_net = imp.load_module(
                    model_path.replace('/', '_').replace('.', '_'), filep,
                    'list_rank_net.py', ('.py', 'rb', imp.PY_SOURCE))
def eval(self, params, train_file, test_file, seed=0, train_query_fname=None,
         test_query_fname=None, early_stopping_rounds=None, num_rounds=None):
    """Train on libsvm files and return (booster, per-iteration eval metric).

    When training stops before num_rounds (early stopping), the metric curve
    is padded with its last value to num_rounds entries.
    """
    prefix = "libsvm://"
    train_group_id = self._get_group_id_from_file(train_query_fname)
    test_group_id = self._get_group_id_from_file(test_query_fname)
    train_data = cat.Pool(prefix + train_file,
                          column_description=self.column_description_fname)
    test_data = cat.Pool(prefix + test_file,
                         column_description=self.column_description_fname)
    if train_group_id is not None:
        assert test_group_id is not None
        train_data.set_group_id(train_group_id)
        test_data.set_group_id(test_group_id)
    self.fullfill_parameters(params, seed)
    if num_rounds is not None:
        params["iterations"] = num_rounds
    print("eval with params " + str(params))
    cat_booster = cat.CatBoost(params=params)
    try:
        cat_booster.fit(train_data, eval_set=[test_data], verbose=1,
                        early_stopping_rounds=early_stopping_rounds)
    except Exception as err:
        # NOTE(review): training errors are only printed; get_evals_result
        # below may then fail or return partial data — confirm intended.
        print("error message: ", err)
    eval_results = cat_booster.get_evals_result()
    results = eval_results["validation"][self.metric]
    if num_rounds is not None and len(results) < num_rounds:
        # Pad the curve with the last observed value up to num_rounds.
        eval_result_list = []
        for result in results:
            eval_result_list += [result]
        for _ in range(len(results), num_rounds):
            eval_result_list += [results[-1]]
        results = np.array(eval_result_list)
    # Drop pool references so memory can be reclaimed.
    train_data = None
    test_data = None
    return cat_booster, results
y = pd.Series(iris.target) return X, y def read_yaml(path): with open(path, "r") as f: return yaml.safe_load(f) MODEL_PARAMS = {"allow_writing_files": False, "iterations": 10} @pytest.fixture( scope="module", params=[ cb.CatBoost(MODEL_PARAMS), cb.CatBoostClassifier(**MODEL_PARAMS), cb.CatBoostRegressor(**MODEL_PARAMS), ], ids=["CatBoost", "CatBoostClassifier", "CatBoostRegressor"], ) def cb_model(request): model = request.param X, y = get_iris() return ModelWithData(model=model.fit(X, y), inference_dataframe=X) @pytest.fixture def reg_model(): model = cb.CatBoostRegressor(**MODEL_PARAMS) X, y = get_iris()
def train_ranking_catboost(args, train_df, test_df):
    """Train a CatBoost ranking model on stratified, sampled ranking groups.

    Workflow: split train_df into train/dev, bin examples by sorted label and
    sample fixed-size groups spread over the bins (cached as .npz files in
    args.out_dir), fit CatBoost with the dev pool as eval set, report the
    test NDCG, and save the model as .cbm plus exported Python code.
    """
    import catboost
    params = {
        'loss_function': args.rank_loss_function,
        'custom_metric': ['NDCG', 'AverageGain:top=10'],
        'task_type': 'GPU',
        'iterations': args.niter,
        'verbose': True,
        'train_dir': args.out_dir,
        'random_seed': args.seed
    }
    train_dev_features, train_dev_labels = get_feature_label(train_df)
    test_features, test_labels = get_feature_label(test_df)
    # Split Train/Dev
    shuffle_idx = np.random.permutation(train_dev_features.shape[0])
    train_dev_features, train_dev_labels = \
        train_dev_features[shuffle_idx], train_dev_labels[shuffle_idx]
    num_train = train_dev_features.shape[0] - int(
        args.dev_ratio * train_dev_features.shape[0])
    train_features, train_labels = train_dev_features[:num_train], train_dev_labels[:num_train]
    dev_features, dev_labels = train_dev_features[num_train:], train_dev_labels[num_train:]
    total_data_size = len(train_df) + len(test_df)
    # Group counts are proportional to each split's share of the capped data.
    get_sample_size = lambda ratio: \
        int(min(total_data_size, args.max_data_size) * ratio * args.sample_mult)
    dev_sample_size = get_sample_size(args.dev_ratio * (1 - args.test_ratio))
    train_sample_size = get_sample_size(
        (1 - args.dev_ratio) * (1 - args.test_ratio))
    test_sample_size = get_sample_size(args.test_ratio)
    # Generate the training/testing samples for ranking.
    # We divide the samples into multiple bins and will do stratified
    # sampling within each bin.
    sorted_train_ids = np.argsort(train_labels)
    train_group_ids_list = np.array_split(sorted_train_ids,
                                          args.num_threshold_bins)
    sorted_dev_ids = np.argsort(dev_labels)
    dev_group_ids_list = np.array_split(sorted_dev_ids,
                                        args.num_threshold_bins)
    sorted_test_ids = np.argsort(test_labels)
    test_group_ids_list = np.array_split(sorted_test_ids,
                                         args.num_threshold_bins)
    train_rank_features = []
    train_rank_labels = []
    train_groups = []
    dev_rank_features = []
    dev_rank_labels = []
    dev_groups = []
    test_rank_features = []
    test_rank_labels = []
    test_groups = []
    train_npz_file = os.path.join(args.out_dir, 'train_rank_features.npz')
    dev_npz_file = os.path.join(args.out_dir, 'dev_rank_features.npz')
    test_npz_file = os.path.join(args.out_dir, 'test_rank_features.npz')
    if os.path.exists(test_npz_file):
        # Reuse previously generated ranking groups from cache.
        print('Loading features from npz')
        assert os.path.exists(train_npz_file)
        assert os.path.exists(dev_npz_file)
        npzfile = np.load(train_npz_file)
        train_rank_features = npzfile['train_rank_features']
        train_rank_labels = npzfile['train_rank_labels']
        train_groups = npzfile['train_groups']
        npzfile = np.load(dev_npz_file)
        dev_rank_features = npzfile['dev_rank_features']
        dev_rank_labels = npzfile['dev_rank_labels']
        dev_groups = npzfile['dev_groups']
        npzfile = np.load(test_npz_file)
        test_rank_features = npzfile['test_rank_features']
        test_rank_labels = npzfile['test_rank_labels']
        test_groups = npzfile['test_groups']
    else:
        print('Generate Dev Ranking Groups:')
        for i in tqdm.tqdm(range(dev_sample_size)):
            # Each group draws group_size items spread evenly over the bins.
            for group_ids in dev_group_ids_list:
                chosen_ids = np.random.choice(
                    group_ids,
                    args.group_size // args.num_threshold_bins,
                    replace=False)
                dev_rank_features.append(dev_features[chosen_ids, :])
                dev_rank_labels.append(dev_labels[chosen_ids])
                dev_groups.append(np.ones_like(chosen_ids) * i)
        dev_rank_features = np.concatenate(dev_rank_features, axis=0)
        dev_rank_labels = np.concatenate(dev_rank_labels, axis=0)
        dev_groups = np.concatenate(dev_groups, axis=0)
        np.savez(os.path.join(args.out_dir, 'dev_rank_features.npz'),
                 dev_rank_features=dev_rank_features,
                 dev_rank_labels=dev_rank_labels,
                 dev_groups=dev_groups)
        print('Generate Train Ranking Groups:')
        for i in tqdm.tqdm(range(train_sample_size)):
            for group_ids in train_group_ids_list:
                chosen_ids = np.random.choice(
                    group_ids,
                    args.group_size // args.num_threshold_bins,
                    replace=False)
                train_rank_features.append(train_features[chosen_ids, :])
                train_rank_labels.append(train_labels[chosen_ids])
                train_groups.append(np.ones_like(chosen_ids) * i)
        train_rank_features = np.concatenate(train_rank_features, axis=0)
        train_rank_labels = np.concatenate(train_rank_labels, axis=0)
        train_groups = np.concatenate(train_groups, axis=0)
        np.savez(os.path.join(args.out_dir, 'train_rank_features.npz'),
                 train_rank_features=train_rank_features,
                 train_rank_labels=train_rank_labels,
                 train_groups=train_groups)
        # Test groups use a dedicated RNG so they stay reproducible
        # independently of the train/dev sampling above.
        test_rng = np.random.RandomState(args.test_seed)
        print('Generate Test Ranking Groups:')
        for i in tqdm.tqdm(range(test_sample_size)):
            for group_ids in test_group_ids_list:
                chosen_ids = test_rng.choice(
                    group_ids,
                    args.group_size // args.num_threshold_bins,
                    replace=False)
                test_rank_features.append(test_features[chosen_ids, :])
                test_rank_labels.append(test_labels[chosen_ids])
                test_groups.append(np.ones_like(chosen_ids) * i)
        test_rank_features = np.concatenate(test_rank_features, axis=0)
        test_rank_labels = np.concatenate(test_rank_labels, axis=0)
        test_groups = np.concatenate(test_groups, axis=0)
        np.savez(os.path.join(args.out_dir, 'test_rank_features.npz'),
                 test_rank_features=test_rank_features,
                 test_rank_labels=test_rank_labels,
                 test_groups=test_groups)
    train_pool = catboost.Pool(data=train_rank_features,
                               label=train_rank_labels,
                               group_id=train_groups)
    dev_pool = catboost.Pool(data=dev_rank_features,
                             label=dev_rank_labels,
                             group_id=dev_groups)
    # test_pool = catboost.Pool(data=test_rank_features,
    #                           label=test_rank_labels,
    #                           group_id=test_groups)
    model = catboost.CatBoost(params)
    model.fit(train_pool, eval_set=dev_pool)
    predict_result = model.predict(test_rank_features)
    # Reshape flat predictions back into (num groups, group_size) for NDCG.
    test_gt_scores = test_rank_labels.reshape(test_sample_size,
                                              args.group_size)
    predict_result = predict_result.reshape(
        (test_sample_size, args.group_size))
    np.save(os.path.join(args.out_dir, 'test_predictions.npy'),
            predict_result)
    test_ndcg_score = ndcg_score(y_true=test_gt_scores,
                                 y_score=predict_result)
    logging.info('Test NDCG=%f', test_ndcg_score)
    # Save both the native model and a Python export for catboost-free use.
    model.save_model(os.path.join(args.out_dir, 'list_rank_net.cbm'))
    model.save_model(os.path.join(args.out_dir, 'list_rank_net'),
                     format='python')
def _fit(self, tunable_params):
    """Resolve parameters via the base Learner and train a CatBoost model."""
    resolved = Learner._fit(self, tunable_params)
    self.model = cat.CatBoost(resolved)
    self.model.fit(self.train, eval_set=self.test, verbose_eval=True)
if __name__ == "__main__":

    def load_part(part_id):
        """Load one pickled candidates part, capping candidates per sample."""
        raw_samples = pickle.load(
            open(
                "../tmp/features/candidates_train_{:02d}.pickled".format(
                    part_id), "rb"))
        return [limit_candidates_number(s, limit=1000) for s in raw_samples]

    part12 = load_part(12)
    part13 = load_part(13)
    # First 20k samples of part 12 plus all of part 13 train; the remainder
    # of part 12 is held out for validation.
    train_pool = build_pool(part12[:20000] + part13)
    valid_pool = build_pool(part12[20000:])
    parameters = {
        "iterations": 200,
        "learning_rate": 0.05,
        "depth": 4,
        "loss_function": "PairLogit",
        "custom_metric": ["MAP:top=30", "RecallAt:top=250"],
        "verbose": False,
        "random_seed": 44,
        "task_type": "GPU",
        "max_ctr_complexity": 0,
    }
    model = catboost.CatBoost(parameters)
    model.fit(train_pool, eval_set=valid_pool, plot=True)
    model.save_model("../tmp/model_v4.3_candidates.cbm")
'depth': 6, 'l2_leaf_reg': 3, 'loss_function': 'RMSE', 'eval_metric': 'RMSE', 'random_seed': 2018 } #K折交叉验证 train_preds = np.zeros(train_feat.shape[0]) test_preds = np.zeros((test_feat.shape[0], 5)) kf = KFold(len(train_feat), n_folds=5, shuffle=True, random_state=520) #5折交叉验证 test = catboost.Pool(test_feat[predictors], cat_features=[20]) for i, (train_index, test_index) in enumerate(kf): print('第 {} 次训练...'.format(i)) train_feat1 = train_feat.iloc[train_index] #训练 train_feat2 = train_feat.iloc[test_index] #验证 train = catboost.Pool(train_feat1[predictors], train_feat1['血糖'], cat_features=[20]) val = catboost.Pool(train_feat2[predictors], train_feat2['血糖'], cat_features=[20]) model = catboost.CatBoost(params=cat_params) model.fit(X=train, eval_set=val, use_best_model=True, logging_level='Verbose') train_preds[test_index] += model.predict(val) test_preds[:, i] = model.predict(test) print('线下得分: {}'.format( mean_squared_error(train_feat['血糖'], train_preds) * 0.5))
def train(
    config: TrainConfig,
    train_features: pd.DataFrame,
    test_features: pd.DataFrame,
    products_enriched: pd.DataFrame,
    train_gt_items_count: pd.DataFrame,
    test_gt_items_count: pd.DataFrame,
):
    """Train the CatBoost ranker on enriched features and log precision@30.

    Enriches features with product data, splits clients 90/10 into train/val,
    builds per-client grouped pools, fits with early stopping, saves the
    model, then logs a precision-style score for val and test with rows
    ordered by predicted score vs. by ground-truth target.
    """
    train_features = pd.merge(train_features, products_enriched, how='left')
    test_features = pd.merge(test_features, products_enriched, how='left')
    columns_diff = set(train_features.columns) - set(cols)
    logger.info(f'columns not used: {columns_diff}')
    # Basic NaN handling for target, segment and categorical columns.
    for df in (train_features, test_features):
        df['target'] = df['target'].fillna(0).astype(int)
        df.segment_id = df.segment_id.fillna(0).astype(int)
        for col in cat_cols:
            df[col] = df[col].fillna(0)
    # 90% of clients train, the rest validate (split by client, not by row).
    clients_ids = train_features.client_id.unique()
    sample_clients = np.random.choice(clients_ids,
                                      size=len(clients_ids) * 9 // 10,
                                      replace=False)
    train_df = train_features[train_features.client_id.isin(sample_clients)]
    val_df = train_features[~train_features.client_id.isin(sample_clients)]
    test_df = test_features
    # Dense per-split group ids: one ranking group per client.
    groups = {}
    for name, df in (('train', train_df), ('val', val_df), ('test', test_df)):
        client_id_map = {
            client_id: i
            for i, client_id in enumerate(df['client_id'].unique())
        }
        groups[name] = df['client_id'].map(client_id_map).values
    train_pool = cb.Pool(train_df[cols],
                         train_df['target'],
                         cat_features=cat_cols,
                         group_id=groups['train'])
    val_pool = cb.Pool(val_df[cols],
                       val_df['target'],
                       cat_features=cat_cols,
                       group_id=groups['val'])
    test_pool = cb.Pool(test_df[cols],
                        test_df['target'],
                        cat_features=cat_cols,
                        group_id=groups['test'])
    model = cb.CatBoost(config.catboost.train_params)
    model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=100)
    model.save_model(config.catboost.model_file)
    # validate
    gt_cnt_map = {
        'val': train_gt_items_count,
        'test': test_gt_items_count,
    }
    for name, df, pool in (('val', val_df, val_pool),
                           ('test', test_df, test_pool)):
        gt_items_cnt = gt_cnt_map[name]
        df['score'] = model.predict(pool)
        # Score the ranking both by model score and by the ideal (target)
        # ordering for comparison.
        for order in ('score', 'target'):
            scoring = (df[['client_id', 'product_id', 'score',
                           'target']].merge(gt_items_cnt).sort_values(
                               ['client_id', order],
                               ascending=[True, False]))
            scoring['rank'] = scoring.groupby('client_id').cumcount() + 1
            scoring['cum_target'] = scoring.groupby(
                'client_id')['target'].cumsum()
            # Precision-at-30-style contribution per row, normalised by the
            # client's ground-truth item count.
            scoring['prec'] = ((scoring['rank'] <= 30) * scoring['target'] *
                               (scoring['cum_target'] / scoring['rank']) /
                               scoring['gt_count']).fillna(0)
            score = scoring.groupby('client_id').prec.sum().fillna(0).mean()
            logger.info(f'[{name}] order by {order} : {score}')
from lib.hardcode import TOP_ITEMS
from lib.logger import configure_logger
from lib.product_store_features import ProductStoreStats
from lib.recommender import CatBoostRecommenderWithPopularFallback, cols
from lib.utils import read_products_file, pickle_load

logger = configure_logger(logger_name='server', log_dir='')
logger.info('starting to load all stuff')

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
config = TrainConfig.from_json('configs/config.json')
app = Flask(__name__)

# Catalogue data and the trained CatBoost ranking model.
app.products_data = read_products_file(config.products_enriched_file)
app.model = catboost.CatBoost()
app.model.load_model(config.catboost.model_file)

# Implicit-feedback candidate model and its item vectors.
app.item_vectors = pickle_load(config.implicit.vectors_file)
with open(config.implicit.model_file, 'rb') as f:
    app.implicit_model = pickle.load(f)

app.product_store_stats = ProductStoreStats()
# Recommender combining the CatBoost ranker with a popularity fallback.
app.recommender = CatBoostRecommenderWithPopularFallback(
    model=app.model,
    implicit_model=app.implicit_model,
    item_vectors=app.item_vectors,
    products_data=app.products_data,
    product_store_stats=app.product_store_stats,
    feature_names=cols,
)