def test_silent():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    tmpfile = 'test_data_dumps'

    with LogStdout(open(tmpfile, 'w')):
        model = CatBoost(dict(iterations=10, silent=True))
        model.fit(pool)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 0)

    with LogStdout(open(tmpfile, 'w')):
        model = CatBoost(dict(iterations=10, silent=True))
        model.fit(pool, silent=False)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 10)

    with LogStdout(open(tmpfile, 'w')):
        train(pool, {'silent': True})
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 0)

    with LogStdout(open(tmpfile, 'w')):
        model = CatBoost(dict(iterations=10, silent=False))
        model.fit(pool, silent=True)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 0)

    with LogStdout(open(tmpfile, 'w')):
        model = CatBoost(dict(iterations=10, verbose=5))
        model.fit(pool, silent=True)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 0)

def cat_cv(train, test, params, fit_params, cat_features, feature_names, nfold, seed):
    train.Pred = pd.DataFrame({
        'id': train['样本id'],
        'true': train['收率'],
        'pred': np.zeros(len(train))
    })
    test.Pred = pd.DataFrame({'id': test['样本id'], 'pred': np.zeros(len(test))})

    kfolder = KFold(n_splits=nfold, shuffle=True, random_state=seed)
    cat_tst = cat.Pool(data=test[feature_names],
                       cat_features=cat_features,
                       feature_names=feature_names)

    for fold_id, (trn_idx, val_idx) in enumerate(kfolder.split(train['收率'])):
        print(f'\nFold_{fold_id} Training ================================\n')

        cat_trn = cat.Pool(data=train.iloc[trn_idx][feature_names],
                           label=train.iloc[trn_idx]['收率'],
                           cat_features=cat_features,
                           feature_names=feature_names)
        cat_val = cat.Pool(data=train.iloc[val_idx][feature_names],
                           label=train.iloc[val_idx]['收率'],
                           cat_features=cat_features,
                           feature_names=feature_names)

        # keep the trained model; predictions come from the model object,
        # not from the catboost module itself
        model = cat.train(params=params, pool=cat_trn, **fit_params, eval_set=cat_val)

        val_pred = model.predict(train.iloc[val_idx][feature_names])
        train.Pred.loc[val_idx, 'pred'] = val_pred
        print(f'Fold_{fold_id}', mse(train.iloc[val_idx]['收率'], val_pred))

        test.Pred['pred'] += model.predict(cat_tst) / nfold

    print('\n\nCV LOSS:', mse(train.Pred['true'], train.Pred['pred']))
    return test.Pred

def test_verbose_int(verbose):
    expected_line_count = {5: 2, False: 0, True: 10}
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    tmpfile = 'test_data_dumps'

    with LogStdout(open(tmpfile, 'w')):
        cv(pool,
           {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"},
           verbose=verbose)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == expected_line_count[verbose])

    with LogStdout(open(tmpfile, 'w')):
        train(pool,
              {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"},
              verbose=verbose)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == expected_line_count[verbose])

    return local_canonical_file(remove_time_from_json(JSON_LOG_PATH))

def test_catboost_numerical_validation():
    ds = vaex.ml.datasets.load_iris()
    features = ['sepal_width', 'petal_length', 'sepal_length', 'petal_width']

    # Vanilla catboost
    dtrain = cb.Pool(ds[features].values, label=ds.data.class_)
    cb_bst = cb.train(params=params_multiclass, dtrain=dtrain, num_boost_round=3)
    cb_pred = cb_bst.predict(dtrain, prediction_type='Probability')

    # catboost through vaex
    booster = vaex.ml.catboost.CatBoostModel(features=features,
                                             params=params_multiclass,
                                             num_boost_round=3)
    booster.fit(ds, ds.class_)
    vaex_pred = booster.predict(ds)

    # Comparing the predictions of catboost vs vaex.ml
    np.testing.assert_equal(
        vaex_pred,
        cb_pred,
        verbose=True,
        err_msg='The predictions of vaex.ml.catboost do not match those of pure catboost')

def cgb_cv(df, features, categorical_features, n_folds, param):
    kf = GroupKFold(n_splits=n_folds)
    group_map = dict(zip(np.arange(1, 13),
                         pd.cut(np.arange(1, 13), n_folds, labels=np.arange(n_folds))))
    group = df.timestamp.dt.month.map(group_map)

    models = []
    train_scores = []
    valid_scores = []
    for train_index, val_index in kf.split(df, df['building_id'], groups=group):
        train_X, train_y = df[features].iloc[train_index], df['meter_reading'].iloc[train_index]
        val_X, val_y = df[features].iloc[val_index], df['meter_reading'].iloc[val_index]

        cgb_train = cgb.Pool(train_X, train_y, cat_features=categorical_features)
        cgb_eval = cgb.Pool(val_X, val_y, cat_features=categorical_features)
        gbm = cgb.train(cgb_train, param, eval_set=cgb_eval, verbose=20)

        train_preds = gbm.predict(train_X)
        if use_log1p_target:
            train_preds = np.expm1(train_preds)
            train_y = np.expm1(train_y)
        train_scores.append(rmsle(train_y, train_preds))

        valid_preds = gbm.predict(val_X)
        if use_log1p_target:
            valid_preds = np.expm1(valid_preds)
            val_y = np.expm1(val_y)
        valid_scores.append(rmsle(val_y, valid_preds))

        models.append(gbm)

    return train_scores, valid_scores, models

def fit(self, data, clf=None):
    setseed(self.seed)
    train_df = data[data['visitId'].apply(lambda x: x.date()) >= datetime.date(2016, 9, 30)]
    val_df = data[data['visitId'].apply(lambda x: x.date()) < datetime.date(2016, 9, 30)]

    if clf:
        train_X, train_y = train_df[self.features_name].values, \
            train_df['validRevenue'].values
        val_X, val_y = val_df[self.features_name].values, \
            val_df['validRevenue'].values
    else:
        train_X, train_y = train_df[self.features_name].values, \
            train_df['totals_transactionRevenue'].values
        val_X, val_y = val_df[self.features_name].values, \
            val_df['totals_transactionRevenue'].values

    # x_train, x_eval, y_train, y_eval = \
    #     train_test_split(df_x, df_y,
    #                      test_size=self.test_ratio,
    #                      random_state=self.seed)

    cat_train = cat.Pool(data=train_X,
                         label=train_y,
                         feature_names=self.features_name,
                         cat_features=self.categorical_feature)
    cat_eval = cat.Pool(data=val_X,
                        label=val_y,
                        feature_names=self.features_name,
                        cat_features=self.categorical_feature)

    model_param = self.params['params_clf'] if clf else self.params['params_reg']
    self.estimator = cat.train(
        params=model_param,
        pool=cat_train,
        eval_set=cat_eval,
        # num_boost_round=self.num_boost_round,
        # learning_rate=self.params['learning_rate'],
        # max_depth=self.params['max_depth'],
        # l2_leaf_reg=self.params['l2_leaf_reg'],
        # rsm=self.params['colsample_ratio'],
        # subsample=self.params['subsample_ratio'],
        # class_weights=self.params['class_weights'],
        # loss_function=self.loglikeloss,
        # custom_loss=self.loglikeloss,
        # custom_metric=self.roc_auc_error,
        # eval_metric=self.roc_auc_error
    )
    return self

def cat_cv_train(X_train, y_train, params, kfold):
    '''
    Train a CatBoost model with K-fold cross-validation on the given
    feature and target datasets. Returns a list of classifiers, one
    per fold.
    '''
    from catboost import train, Pool
    from sklearn.model_selection import train_test_split, KFold

    features = [feature for feature in X_train.columns
                if feature not in ['target', 'card_id', 'first_active_month']]
    categorical_features = [feature for feature in features if 'feature_' in feature]

    folds = KFold(n_splits=kfold, shuffle=True, random_state=42)
    clf_list = []

    for train_idxs, val_idxs in folds.split(X_train.values, y_train.values):
        # training set
        train_set = Pool(data=X_train.iloc[train_idxs][features],
                         label=y_train.iloc[train_idxs],
                         cat_features=categorical_features)
        # validation set
        valid_set = Pool(data=X_train.iloc[val_idxs][features],
                         label=y_train.iloc[val_idxs],
                         cat_features=categorical_features)
        # train clf
        clf = train(pool=train_set,
                    params=params,
                    verbose=100,
                    iterations=10000,
                    eval_set=valid_set)
        # add current clf to clf_list
        clf_list.append(clf)

    return clf_list

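# A hypothetical invocation of cat_cv_train above on synthetic data; the
# column names, params dict, and fold count are illustrative assumptions,
# not taken from the original project:
import numpy as np
import pandas as pd

X_demo = pd.DataFrame({
    'feature_1': np.random.randint(0, 3, 500),  # picked up as categorical ('feature_' prefix)
    'amount': np.random.rand(500),              # plain numeric column
})
y_demo = pd.Series(np.random.rand(500))
demo_params = {'loss_function': 'RMSE', 'learning_rate': 0.05}
clf_list = cat_cv_train(X_demo, y_demo, demo_params, kfold=5)
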
def main():
    print("load train test datasets")
    train, test = load_train_test()

    y_train_all = train['orderType']
    id_test = test['userid']
    del train['orderType']

    df_columns = train.columns.values
    print('===> feature count: {}'.format(len(df_columns)))

    cat_params = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'learning_rate': 0.1,
        'l2_leaf_reg': 5,  # L2 regularization coefficient.
        'subsample': 0.9,
        'depth': 8,  # Depth of the tree
        'border_count': 255,  # The number of splits for numerical features
        'thread_count': 6,
        'train_dir': 'catboost_train_logs',
        'bootstrap_type': 'Bernoulli',
        'use_best_model': True,
        'random_seed': 42
    }

    pool = Pool(train, y_train_all)
    # cross-validation goes through cat.cv; cat.train does not accept
    # nfold/seed/stratified
    cv_results = cat.cv(pool=pool,
                        params=cat_params,
                        num_boost_round=4000,
                        nfold=5,
                        seed=42,
                        stratified=True)

def _cv(self, dataset: tk.data.Dataset, folds: tk.validation.FoldsType) -> None:
    import catboost

    assert isinstance(dataset.data, pd.DataFrame)
    self.train_pool_ = catboost.Pool(
        data=dataset.data,
        label=dataset.labels,
        group_id=dataset.groups,
        feature_names=dataset.data.columns.values.tolist(),
        cat_features=dataset.data.select_dtypes("object").columns.values,
    )

    self.gbms_, score_list = [], []
    for fold, (train_indices, val_indices) in enumerate(folds):
        with tk.log.trace(f"fold{fold}"):
            gbm = catboost.train(
                params=self.params,
                pool=self.train_pool_.slice(train_indices),
                eval_set=self.train_pool_.slice(val_indices),
                **(self.cv_params or {}),
            )
            self.gbms_.append(gbm)
            score_list.append(gbm.get_best_score()["validation"])

    cv_weights = [len(val_indices) for _, val_indices in folds]
    evals: tk.evaluations.EvalsType = {}
    for k in score_list[0]:
        score = [s[k] for s in score_list]
        score = np.float32(np.average(score, weights=cv_weights))
        evals[k] = score
    logger.info(f"cv: {tk.evaluations.to_str(evals)}")

def test_verbose_int(verbose):
    expected_line_count = {5: 3, False: 0, True: 10}
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    tmpfile = 'test_data_dumps'

    with LogStdout(open(tmpfile, 'w')):
        cv(pool,
           {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"},
           verbose=verbose)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == expected_line_count[verbose])

    with LogStdout(open(tmpfile, 'w')):
        train(pool,
              {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"},
              verbose=verbose)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == expected_line_count[verbose])

    return local_canonical_file(remove_time_from_json(JSON_LOG_PATH))

def generate_submit(self, num_boost_round=None, from_model_saved=False):
    assert num_boost_round is not None

    if not from_model_saved:
        dtrain = self.get_train_set(as_cgb_pool=True)
        booster = cgb.train(dtrain=dtrain,
                            params=self.params_best_fit,
                            num_boost_round=num_boost_round)
        self.save_model(booster)
    else:
        booster = cgb.CatBoost(model_file=from_model_saved)

    dftest = self.get_test_set(as_cgb_pool=True)
    with Timer("Predicting"):
        probas = booster.predict(dftest, prediction_type="Probability")

    dfpred = pd.DataFrame(probas)[[1]]  # Get proba of class one
    dfpred = dfpred.rename(columns={1: 'target'})

    now = pd.Timestamp.now(tz='CET').strftime("%d-%Hh-%Mm")
    fpath = RESULT_DIR / "catboost_submit_{}.csv".format(now)
    with Timer('Storing in {}'.format(fpath)):
        dfpred.to_csv(fpath, index=False)

def train_model(df_train, df_valid, model_params, general_params):
    train_pool = catboost.Pool(df_train["text"].values,
                               label=df_train["label"].values,
                               text_features=[0])
    valid_pool = catboost.Pool(df_valid["text"].values,
                               label=df_valid["label"].values,
                               text_features=[0])

    model_params = copy.deepcopy(model_params)
    model_params.update({"train_dir": general_params["logdir"]})

    model = catboost.train(
        pool=train_pool,
        eval_set=valid_pool,
        params=model_params,
        verbose=False,
        plot=False,
    )
    model.save_model(os.path.join(general_params["logdir"], "model.cbm"))

def test_shap_complex_ctr():
    pool = Pool([[0, 0, 0], [0, 1, 0], [1, 0, 1], [1, 1, 2]],
                [0, 0, 5, 8],
                cat_features=[0, 1, 2])
    model = train(pool, {'random_seed': 12302113, 'iterations': 100})
    shap_values = model.get_feature_importance(fstr_type=EFstrType.ShapValues, data=pool)
    predictions = model.predict(pool)

    assert(len(predictions) == len(shap_values))
    for pred_idx in range(len(predictions)):
        assert(abs(sum(shap_values[pred_idx]) - predictions[pred_idx]) < 1e-9)

    np.savetxt(FIMP_TXT_PATH, shap_values)
    return local_canonical_file(FIMP_TXT_PATH)

def train(self, train_df, eval_df, params=None, use_best_eval=True):
    self.best_round = None
    dtrain = cat.Pool(data=train_df.drop(columns=[self.label]),
                      label=train_df[self.label])
    deval = cat.Pool(data=eval_df.drop(columns=[self.label]),
                     label=eval_df[self.label])

    if params is None:
        use_params = deepcopy(self.opt_params)
    else:
        use_params = deepcopy(params)
    num_round = use_params.pop('num_round')

    if use_best_eval:
        # Capture CatBoost's per-iteration log lines and parse them to find
        # the round with the lowest eval error.
        with io.StringIO() as buf, redirect_stdout(buf):
            self.clf = cat.train(params=use_params,
                                 pool=dtrain,
                                 evals=deval,
                                 num_boost_round=num_round)
            output = buf.getvalue().split("\n")

        min_error = np.inf
        min_index = 0
        for idx in range(1, num_round + 1):
            # a regular iteration line has six tab-separated fields
            if len(output[idx].split("\t")) == 6:
                temp = 1 - float(output[idx].split("\t")[2].split(":")[1])
                if min_error > temp:
                    min_error = temp
                    min_index = int(output[idx].split("\t")[0][:-1])  # strip trailing ':'

        print("The minimum is attained in round %d" % (min_index + 1))
        self.best_round = min_index + 1
        return output
    else:
        with io.StringIO() as buf, redirect_stdout(buf):
            self.clf = cat.train(params=use_params,
                                 pool=dtrain,
                                 evals=deval,
                                 num_boost_round=num_round)
            output = buf.getvalue().split("\n")
        self.best_round = num_round
        return output

def train(self, X_train, X_valid, y_train, y_valid):
    """Train a CatBoost model on the given train/validation split."""
    # Convert data to CatBoost Pool format.
    ds_train = ctb.Pool(X_train, y_train)
    ds_valid = ctb.Pool(X_valid, y_valid)

    # Set context dependent CatBoost parameters.
    self.params['dtrain'] = ds_train
    self.params['eval_set'] = ds_valid

    # Train using parameters sent by the user.
    return ctb.train(**self.params)

def cv_helper(one_hot_max_size,
              depth,
              l2_leaf_reg,
              random_strength,
              bagging_temperature):
    # entire data for evaluating clf training performance
    all_data = Pool(data=X_train[features],
                    label=y_train,
                    cat_features=categorical_features)

    # validation RMSE
    RMSE = []

    for train_idxs, val_idxs in folds.split(X_train.values, y_train.values):
        # training set
        train_data = Pool(data=X_train.iloc[train_idxs][features],
                          label=y_train.iloc[train_idxs],
                          cat_features=categorical_features)
        # validation set
        val_data = Pool(data=X_train.iloc[val_idxs][features],
                        label=y_train.iloc[val_idxs],
                        cat_features=categorical_features)
        # hyperparameters
        params = {
            'eval_metric': 'RMSE',
            'use_best_model': True,
            'loss_function': 'RMSE',
            'learning_rate': 0.02,
            'early_stopping_rounds': 400,
            'border_count': 254,
            'task_type': 'GPU',
            'one_hot_max_size': int(one_hot_max_size),
            'depth': int(depth),
            'l2_leaf_reg': l2_leaf_reg,
            'random_strength': random_strength,
            'bagging_temperature': bagging_temperature
        }
        # classifier
        clf = train(pool=train_data,
                    params=params,
                    verbose=200,
                    iterations=10000,
                    eval_set=all_data)
        # add current fold RMSE on all_data
        RMSE.append(clf.best_score_['validation_0']['RMSE'])

    return -np.mean(np.array(RMSE))

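# cv_helper returns the negative mean RMSE, so it can be handed directly to a
# maximizer. A hedged sketch using the bayes_opt library, assuming the globals
# cv_helper reads (X_train, y_train, features, categorical_features, folds)
# are already defined; the search bounds below are illustrative assumptions:
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=cv_helper,
    pbounds={
        'one_hot_max_size': (2, 25),
        'depth': (4, 12),
        'l2_leaf_reg': (1, 10),
        'random_strength': (0, 10),
        'bagging_temperature': (0, 100),
    },
    random_state=42,
)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)  # best target (negative RMSE) and the hyperparameters that achieved it
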
def validate(self, save_model=True, **kwargs):
    dtrain, dtest = self.get_train_valid_set(as_cgb_pool=True)
    watchlist = [dtrain, dtest]

    booster = cgb.train(
        dtrain=dtrain,
        params=self.params_best_fit,
        eval_set=watchlist,
        **kwargs,
    )

    if save_model:
        self.save_model(booster)
    return booster

def _train_model(self, data):
    print(self.params)

    dtrain = cb.Pool(data.X_train, data.y_train)
    if data.task == 'Ranking':
        dtrain.set_group_id(data.groups)

    start = time.time()
    self.model = cb.train(
        pool=dtrain,
        params=self.params,
    )
    elapsed = time.time() - start
    return elapsed

def cgb_fit(config, X_train, y_train):
    """Train the model (with cross-validation) and return the best iteration
    count and the best result.

    Args:
        config: catboost model config
            {params, max_round, cv_folds, early_stop_round, seed, save_model_path}
        X_train: array like, shape = n_sample * n_feature
        y_train: shape = n_sample * 1

    Returns:
        best_model: the best trained model
        best_auc: float, AUC on the test set.
        best_round: int, the best number of iterations.
    """
    params = config.params
    max_round = config.max_round
    cv_folds = config.cv_folds
    seed = config.seed
    save_model_path = config.save_model_path

    if cv_folds is not None:
        dtrain = cgb.Pool(X_train, label=y_train)
        cv_result = cgb.cv(dtrain, params, num_boost_round=max_round,
                           nfold=cv_folds, seed=seed, logging_level='Verbose')
        # best model: the iteration with the highest average test AUC
        auc_test_avg = cv_result['AUC_test_avg']
        best_round = np.argmax(auc_test_avg)
        best_auc = np.max(auc_test_avg)  # best AUC value
        best_model = cgb.train(dtrain, params, num_boost_round=best_round)
    else:
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train, y_train, test_size=0.2, random_state=100)
        dtrain = cgb.Pool(X_train, label=y_train)
        dvalid = cgb.Pool(X_valid, label=y_valid)
        best_model = cgb.train(dtrain, params, num_boost_round=max_round,
                               eval_set=dvalid)
        best_round = best_model.best_iteration
        best_auc = best_model.best_score
        cv_result = None

    if save_model_path:
        check_path(save_model_path)
        pickle.dump(best_model, open(save_model_path, 'wb'))

    return best_model, best_auc, best_round, cv_result

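# A hedged sketch of calling cgb_fit; SimpleNamespace stands in for whatever
# config object the original project passes, and every value below is an
# illustrative assumption. Note that the 'AUC_test_avg' key read inside
# cgb_fit matches older catboost versions; newer versions return a DataFrame
# keyed like 'test-AUC-mean'.
from types import SimpleNamespace
import numpy as np

demo_config = SimpleNamespace(
    params={'loss_function': 'Logloss', 'eval_metric': 'AUC'},
    max_round=200,
    cv_folds=5,
    early_stop_round=None,
    seed=42,
    save_model_path=None,
)
X_demo = np.random.rand(300, 8)
y_demo = np.random.randint(0, 2, 300)
best_model, best_auc, best_round, cv_result = cgb_fit(demo_config, X_demo, y_demo)
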
def fit(self, df, target, evals=None, early_stopping_rounds=None,
        verbose_eval=None, plot=False, **kwargs):
    '''Fit the CatBoostModel model given a DataFrame.
    This method accepts all key word arguments for the catboost.train method.

    :param df: A vaex DataFrame containing the training features.
    :param target: The column name of the target variable.
    :param evals: A list of DataFrames to be evaluated during training.
        This allows the user to watch performance on the validation sets.
    :param int early_stopping_rounds: Activates early stopping.
    :param bool verbose_eval: Requires at least one item in *evals*.
        If *verbose_eval* is True then the evaluation metric on the validation set
        is printed at each boosting stage.
    :param bool plot: if True, display an interactive widget in the Jupyter
        notebook of how the train and validation sets score on each boosting iteration.
    '''
    # Ensure strings
    target = vaex.dataframe._ensure_string_from_expression(target)

    data = df[self.features].values
    target_data = df[target].values
    dtrain = catboost.Pool(data=data, label=target_data, **self.pool_params)
    if evals is not None:
        for i, item in enumerate(evals):
            data = item[self.features].values
            target_data = item[target].values
            evals[i] = catboost.Pool(data=data, label=target_data, **self.pool_params)

    # This does the actual training/fitting of the catboost model
    self.booster = catboost.train(params=self.params,
                                  dtrain=dtrain,
                                  num_boost_round=self.num_boost_round,
                                  evals=evals,
                                  early_stopping_rounds=early_stopping_rounds,
                                  verbose_eval=verbose_eval,
                                  plot=plot,
                                  **kwargs)

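# A minimal usage sketch for the fit method above, consistent with how the
# vaex wrapper is exercised in test_catboost_numerical_validation earlier in
# this collection; the params dict is an illustrative assumption:
import vaex.ml.datasets
import vaex.ml.catboost

ds_demo = vaex.ml.datasets.load_iris()
feats = ['sepal_width', 'petal_length', 'sepal_length', 'petal_width']
booster_demo = vaex.ml.catboost.CatBoostModel(
    features=feats,
    params={'loss_function': 'MultiClass'},  # assumed params
    num_boost_round=10)
booster_demo.fit(ds_demo, 'class_')
pred_demo = booster_demo.predict(ds_demo)
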
def test(self, X_test, y_test):
    """This function evaluates the model on the tuned parameters and estimators.

    Parameters
    ----------
    X_test: test set
    y_test: test labels"""
    self.cat = cb.train(params=self.params, pool=self.train_set)
    self.predictions = self.cat.predict(X_test, prediction_type="Probability")
    self.predictions = self.predictions[:, 1]
    self.y_test = y_test
    self.X_test = X_test

    print("Model will be trained with best parameters obtained "
          "from your choice of optimization model ... \n\n\n")
    print("Model trained with {} estimators on the following parameters: \n{}"
          .format(self.estimator, self.params))

def cat_stacking(self, target=None):
    X = self.train.drop(self.cols_to_drop, axis=1)
    y = self.train[target].values

    cat_models = []
    predict_train = []
    predict_test = []
    predict_val = []

    for fold, (train_ids, val_ids) in enumerate(self.folds.split(X, y)):
        dtrain = cat.Pool(X.iloc[train_ids], y[train_ids])
        dval = cat.Pool(X.iloc[val_ids], y[val_ids])

        model = cat.train(params=self.cat_params,
                          dtrain=dtrain,
                          eval_set=dval,
                          verbose=200,
                          early_stopping_rounds=100)

        pred = model.predict(cat.Pool(X.iloc[val_ids]))
        # note: each fold's out-of-fold predictions are prepended, so
        # predict_train ends up in reverse fold order
        predict_train = np.concatenate([pred, predict_train])

        if self.mode == 'val':
            predict_val.append(
                model.predict(cat.Pool(self.val.drop(self.cols_to_drop, axis=1))))
        if self.mode == 'test':
            predict_test.append(
                model.predict(cat.Pool(self.test.drop(self.drop_columns_test, axis=1))))

        cat_models.append(model)
        clear_output()

    if self.mode == 'val':
        return predict_train, np.asarray(predict_val).mean(axis=0)
    if self.mode == 'test':
        return predict_train, np.asarray(predict_test).mean(axis=0)

def train_for_threshold(self, features, target='label', num=35000):
    train_df = self.train_[self.train_.ID < num]
    val_df = self.train_[self.train_.ID >= num]

    X_train, y_train = train_df[features].values, train_df[target].values.astype('uint8')
    X_eval, y_eval = val_df[features].values, val_df[target].values.astype('uint8')

    cat_train = Pool(X_train, y_train)
    cat_eval = Pool(X_eval, y_eval)

    cat_model = catboost.train(cat_train,
                               self.params,
                               iterations=10000,
                               eval_set=cat_eval,
                               early_stopping_rounds=200,
                               verbose=500)
    y_pred = cat_model.predict(cat_eval, prediction_type='Probability')[:, 1]

    # get the ground-truth entities of the validation set, plus the predicted
    # probabilities and their corresponding words, sorted in order
    gt_ent, pred_words, pred_proba = sort_val(val_df, y_pred)
    # get the threshold found by the search
    self.threshold, _ = find_threshold(gt_ent, pred_words, pred_proba)
    return self.threshold

def train_and_predict(self, features, target='label', save=True):
    self.fe = features
    X_train, y_train = self.train_[features].values, self.train_[target].values.astype('uint8')
    X_test = self.test_[self.fe].values

    cat_all = Pool(X_train, y_train)
    model = catboost.train(cat_all,
                           self.params,
                           iterations=10000,
                           early_stopping_rounds=200,
                           verbose=1000)
    self.model = model
    if save:
        save_pkl(model, os.path.join(self.opt['model_train'], 'cat.pkl'))

    self.pred = model.predict(X_test, prediction_type='Probability')[:, 1]
    return self.pred

def test_verbose_int():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    tmpfile = 'test_data_dumps'

    with LogStdout(open(tmpfile, 'w')):
        cv(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"}, verbose=5)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 2)

    with LogStdout(open(tmpfile, 'w')):
        cv(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"}, verbose=False)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 0)

    with LogStdout(open(tmpfile, 'w')):
        cv(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"}, verbose=True)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 10)

    log_files = []
    for i in range(3):
        log_files.append(JSON_LOG_PATH[:-5] + str(i) + JSON_LOG_PATH[-5:])

    with LogStdout(open(tmpfile, 'w')):
        train(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss",
                     "json_log": log_files[0]}, verbose=5)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 2)

    with LogStdout(open(tmpfile, 'w')):
        train(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss",
                     "json_log": log_files[1]}, verbose=False)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 0)

    with LogStdout(open(tmpfile, 'w')):
        train(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss",
                     "json_log": log_files[2]}, verbose=True)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 10)

    canonical_files = []
    for log_file in log_files:
        canonical_files.append(local_canonical_file(remove_time_from_json(log_file)))
    return canonical_files

print('All Iterations')
display(gs_summary)
print('Best parameters: ')
best_cv = gs_results.loc[gs_results['result'].idxmax()]
display(best_cv)
profile.End()
print('Time elapsed: %s mins' % str(profile.ElapsedMinutes))

# Save CV process
gs_summary.to_csv('../AllData_v2_CATBOOST_GS.csv')

# Generate model by best iteration
model = catb.train(params=params,
                   pool=cat_train,
                   num_boost_round=best_cv[1],
                   logging_level="Verbose")

# Save model for possible coded ensemble
model.save_model('../AllData_v2_CATBOOST_Model')

# Generate train prediction for future ensemble
train_preds = model.predict(train_X, prediction_type="Probability")
data = pd.read_csv('../input/application_train.csv')
data['preds'] = train_preds[:, 1]
data = data[['SK_ID_CURR', 'preds']]
data.to_csv('../AllData_v2_CATBOOST_TrainPreds.csv', index=False)

# Generate sub prediction for Kaggle
sub_preds = model.predict(test_X, prediction_type="Probability")

def blending_test(train_x, train_y, val_x, val_y, test_x,
                  weights=(0.38, 0.18, 0.14, 0.1, 0.20)):
    import random
    import lightgbm as lgb
    import xgboost as xgb
    import catboost as cat
    from sklearn.ensemble import RandomForestClassifier as rf
    from sklearn.linear_model import LogisticRegression as lr
    import gc

    res = 0

    lgb_params = {
        # "max_bin": 512,
        "learning_rate": 0.01,
        "boosting_type": "gbdt",
        "objective": "binary",
        "metric": "auc",
        "num_leaves": 31,
        "max_depth": -1,
        "verbose": 200,
        "subsample": 0.8,
        "colsample_bytree": 0.9,
        "subsample_freq": 1,
        "reg_alpha": 0,
        "min_child_weight": 25,
        "random_state": random.randint(1, 1000),
        "reg_lambda": 1,
        "n_jobs": -1,
    }
    d_train = lgb.Dataset(train_x, label=train_y)
    d_test = lgb.Dataset(val_x, label=val_y)
    clf_lgb = lgb.train(lgb_params, d_train, 3000,
                        valid_sets=[d_train, d_test],
                        early_stopping_rounds=100, verbose_eval=200)
    # temp_score_val = clf_lgb.best_score["valid_1"]["auc"]
    # temp_score_train = clf_lgb.best_score["training"]["auc"]
    # weight = 2*(temp_score_train*temp_score_val)/(temp_score_train+temp_score_val)
    temp_predict_lgb = clf_lgb.predict(test_x, num_iteration=clf_lgb.best_iteration)
    res += temp_predict_lgb * weights[0]
    # weight = (temp_score_train + 2 * temp_score_val)/3
    # res_temp_lgb = temp_predict_lgb * temp_predict_lgb * weight
    del d_test, d_train, clf_lgb, lgb_params

    cat_params = {
        "verbose": 200,
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "iterations": 1000,
        "random_seed": random.randint(1, 1000),
        "learning_rate": 0.03,
        "depth": 6,
        "thread_count": 16,
        "use_best_model": True,
        "od_type": 'Iter',
        "od_wait": 30,
    }
    # catboost.train expects a Pool as the training pool, not a raw (X, y) tuple
    clf_cat = cat.train(pool=cat.Pool(train_x, train_y),
                        params=cat_params,
                        eval_set=[(train_x, train_y), (val_x, val_y)],
                        verbose_eval=200)
    # temp_score_val = clf_cat.get_test_eval()[1]
    # temp_score_train = clf_cat.get_test_eval()[0]
    temp_predict_cat = clf_cat.predict(test_x, prediction_type="Probability")[:, 1]
    res += temp_predict_cat * weights[1]
    # weight = (temp_score_train + 2 * temp_score_val)/3
    # res_temp_cat = temp_predict_cat * temp_predict_cat * weight
    del clf_cat, cat_params
    gc.collect()

    # xgb_params = {
    #     "objective": "binary:logistic",
    #     "eval_metric": "auc",
    #     "n_estimators": 3000,
    #     "booster": "gbtree",
    #     "learning_rate": 0.05,
    #     "max_depth": 6,
    #     "n_jobs": -1,
    #     "colsample_bytree": 0.9,
    #     "subsample": 0.8,
    #     "min_child_weight": 1,
    #     "reg_lambda": 1,
    # }
    # clf_xgb = xgb.XGBClassifier(**xgb_params)
    # clf_xgb.fit(X=train_x, y=train_y,
    #             eval_set=[(train_x, train_y), (val_x, val_y)],
    #             eval_metric="auc", verbose=False, early_stopping_rounds=100)
    # temp_score_val = clf_xgb.best_score["valid_1"]["auc"]
    # temp_score_train = clf_xgb.best_score["training"]["auc"]
    # # weight = 2*(temp_score_train*temp_score_val)/(temp_score_train+temp_score_val)
    # temp_predict_xgb = clf_xgb.predict(test_x, ntree_limit=clf_xgb.best_ntree_limit)
    # weight = (temp_score_train + 2 * temp_score_val)/3
    # res_temp_xgb = temp_predict_xgb * temp_predict_xgb * weight
    # del clf_xgb, temp_score_val, temp_score_train, temp_predict_xgb, xgb_params

    rf_ = rf(
        n_estimators=3000,
        criterion='gini',
        max_depth=6,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=0.6,
        min_impurity_decrease=0.00001,
        n_jobs=-1,
        verbose=200,
        random_state=random.randint(1, 1000),
        warm_start=True,
    )
    clf_rf = stop_early_wrapper(rf_, train_x, train_y, val_x, val_y,
                                n_min_iterations=100, scale=1.0001, eval_metric="auc")
    temp_predict_rf = clf_rf.predict_proba(test_x)[:, 1]
    res += temp_predict_rf * weights[2]
    # weight = clf_rf.best_score
    # res_temp_rf = temp_predict_rf * temp_predict_rf * weight
    del clf_rf, rf_
    gc.collect()

    for f in test_x.columns:
        train_x[f] = (train_x[f] - train_x[f].min()) / (train_x[f].max() - train_x[f].min())
        val_x[f] = (val_x[f] - val_x[f].min()) / (val_x[f].max() - val_x[f].min())
        test_x[f] = (test_x[f] - test_x[f].min()) / (test_x[f].max() - test_x[f].min())

    clf_lr = lr(penalty='l2',
                dual=False,
                tol=0.00001,
                C=0.06,
                random_state=random.randint(1, 1000),
                solver='liblinear',
                max_iter=2000,
                verbose=200)
    temp_train = pd.concat([train_x, val_x], axis=0)
    temp_y = pd.concat([train_y, val_y], axis=0)
    clf_lr.fit(temp_train, temp_y)
    print("validation score of lr", clf_lr.score(val_x, val_y))
    # val_y_hat = clf_lr.predict_proba(test_x)[:, 1]
    # from sklearn.metrics import roc_auc_score
    # weight = roc_auc_score(val_y, val_y_hat)
    temp_predict_lr = clf_lr.predict_proba(test_x)[:, 1]
    res += temp_predict_lr * weights[3]
    # res_temp_lr = temp_predict_lr * temp_predict_lr * weight
    del clf_lr, temp_train, temp_y
    gc.collect()

    from keras.callbacks import ModelCheckpoint
    from keras.callbacks import EarlyStopping

    clf_nn = KerasClassifier_wrapper(train_x.shape[1])
    model_path = "../input/keras_model.h5"
    callbacks = [
        EarlyStopping(monitor='val_auc', patience=20, mode='max', verbose=100),
        ModelCheckpoint(model_path, monitor='val_auc', save_best_only=True,
                        mode='max', verbose=100)
    ]
    # fit estimator
    history = clf_nn.fit(
        train_x,
        train_y,
        epochs=1000,
        batch_size=1024,
        validation_data=(val_x, val_y),
        verbose=1,
        callbacks=callbacks,
        shuffle=True
    )
    print(history.history.keys())

    import matplotlib.pyplot as plt

    # summarize history for AUC
    fig_acc = plt.figure(figsize=(10, 10))
    plt.plot(history.history['auc'])
    plt.plot(history.history['val_auc'])
    plt.title('model auc')
    plt.ylabel('auc')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    fig_acc.savefig("model_auc.png")

    # summarize history for loss
    fig_loss = plt.figure(figsize=(10, 10))
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    fig_loss.savefig("model_loss.png")

    temp_predict_nn = clf_nn.predict_proba(test_x)[:, 1]
    res += temp_predict_nn * weights[4]
    del history, clf_nn, train_x, train_y, val_x, val_y, test_x
    gc.collect()

    return res

def fit(self, df, evals=None, early_stopping_rounds=None, verbose_eval=None,
        plot=False, progress=None, **kwargs):
    '''Fit the CatBoostModel model given a DataFrame.
    This method accepts all key word arguments for the catboost.train method.

    :param df: A vaex DataFrame containing the features and target on which to train the model.
    :param evals: A list of DataFrames to be evaluated during training.
        This allows the user to watch performance on the validation sets.
    :param int early_stopping_rounds: Activates early stopping.
    :param bool verbose_eval: Requires at least one item in *evals*.
        If *verbose_eval* is True then the evaluation metric on the validation set
        is printed at each boosting stage.
    :param bool plot: if True, display an interactive widget in the Jupyter
        notebook of how the train and validation sets score on each boosting iteration.
    :param progress: If True display a progressbar when the training is done in batches.
    '''
    self.pool_params['feature_names'] = self.features

    if evals is not None:
        for i, item in enumerate(evals):
            data = item[self.features].values
            target_data = item[self.target].to_numpy()
            evals[i] = catboost.Pool(data=data, label=target_data, **self.pool_params)

    # This does the actual training/fitting of the catboost model
    if self.batch_size is None:
        data = df[self.features].values
        target_data = df[self.target].to_numpy()
        dtrain = catboost.Pool(data=data, label=target_data, **self.pool_params)
        model = catboost.train(params=self.params,
                               dtrain=dtrain,
                               num_boost_round=self.num_boost_round,
                               evals=evals,
                               early_stopping_rounds=early_stopping_rounds,
                               verbose_eval=verbose_eval,
                               plot=plot,
                               **kwargs)
        self.booster = model
        self.evals_result_ = [model.evals_result_]
        self.feature_importances_ = list(model.feature_importances_)
    else:
        models = []
        # Set up progressbar
        n_samples = len(df)
        progressbar = vaex.utils.progressbars(progress)

        column_names = self.features + [self.target]
        iterator = df[column_names].to_pandas_df(chunk_size=self.batch_size)

        for i1, i2, chunk in iterator:
            progressbar(i1 / n_samples)
            data = chunk[self.features].values
            target_data = chunk[self.target].values
            dtrain = catboost.Pool(data=data, label=target_data, **self.pool_params)
            model = catboost.train(params=self.params,
                                   dtrain=dtrain,
                                   num_boost_round=self.num_boost_round,
                                   evals=evals,
                                   early_stopping_rounds=early_stopping_rounds,
                                   verbose_eval=verbose_eval,
                                   plot=plot,
                                   **kwargs)
            self.evals_result_.append(model.evals_result_)
            models.append(model)
        progressbar(1.0)

        # Weights are key when summing models
        if len(self.batch_weights) == 0:
            batch_weights = [1 / len(models)] * len(models)
        elif self.batch_weights is not None and len(self.batch_weights) != len(models):
            raise ValueError("'batch_weights' must be the same length as the number of models.")
        else:
            batch_weights = self.batch_weights

        # Sum the models
        self.booster = catboost.sum_models(models,
                                           weights=batch_weights,
                                           ctr_merge_policy=self.ctr_merge_policy)

    'max_depth': 10,
    'num_rounds': 200,
}

# score: 0.62228 @ 0.01, split = 0.9, random_state=7
# score: 0.62213 @ 0.01, split = 0.8, random_state=7
# score: 0.62113 @ 0.01, split = 0.7, random_state=7
model_xgb = xgb.train(params_xgb, d_train_xgb, 105, watchlist_xgb,
                      early_stopping_rounds=20, maximize=True, verbose_eval=10)
model_lgb = lgb.train(params_lgb, d_train_lgb, 100, valid_sets_lgb)
model_cat = cat.train(pool, params=params_cat, logging_level='Verbose')

p_train_xgb = model_xgb.predict(d_valid_xgb)
p_train_lgb = model_lgb.predict(X_test)
p_train_cat = model_cat.predict(X_test)

p_test_xgb = model_xgb.predict(d_test_xgb)
p_test_lgb = model_lgb.predict(real_test_data)
p_test_cat = model_cat.predict(real_test_data)

final_train = p_train_xgb.reshape(-1, 1)
final_test = p_test_xgb.reshape(-1, 1)
final_train = np.concatenate((final_train, p_train_lgb.reshape(-1, 1)), axis=1)
final_test = np.concatenate((final_test, p_test_lgb.reshape(-1, 1)), axis=1)
final_train = np.concatenate((final_train, p_train_cat.reshape(-1, 1)), axis=1)
final_test = np.concatenate((final_test, p_test_cat.reshape(-1, 1)), axis=1)

def main(options):
    print("load train test datasets")
    train_all, y_train_all, id_train, test, id_test = pre_train()

    cat_params = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'learning_rate': options.learning_rate,
        'l2_leaf_reg': options.l2_leaf_reg,  # L2 regularization coefficient.
        'subsample': options.subsample,
        'depth': options.depth,  # Depth of the tree
        'border_count': 255,  # The number of splits for numerical features
        'thread_count': 6,
        'train_dir': 'catboost_train_logs',
        'bootstrap_type': 'Bernoulli',
        'use_best_model': True,
        'random_seed': options.seed
    }

    roof_flod = options.roof_flod
    kf = StratifiedKFold(n_splits=roof_flod, shuffle=True, random_state=options.seed)

    pred_train_full = np.zeros(train_all.shape[0])
    pred_test_full = 0
    cv_scores = []

    predict_feature = 'catboost_predict_roof_fold{}_lr{}_l2_leaf_reg{}_subsample{}_depth{}_seed{}'.format(
        options.roof_flod, options.learning_rate, options.l2_leaf_reg,
        options.subsample, options.depth, options.seed)

    print('params info:', predict_feature)

    for i, (dev_index, val_index) in enumerate(kf.split(train_all, y_train_all)):
        print('========== perform fold {}, train size: {}, validate size: {} =========='
              .format(i, len(dev_index), len(val_index)))
        # kf.split yields positional indices, so use .iloc
        train_x, val_x = train_all.iloc[dev_index], train_all.iloc[val_index]
        train_y, val_y = y_train_all[dev_index], y_train_all[val_index]

        model = cat.train(pool=Pool(train_x, train_y),
                          params=cat_params,
                          iterations=460,
                          eval_set=(val_x, val_y),
                          verbose=False)

        # predict validate
        predict_valid = model.predict(val_x.values, prediction_type='Probability')[:, 1]
        valid_auc = evaluate_score(predict_valid, val_y)
        # predict test
        predict_test = model.predict(test.values, prediction_type='Probability')[:, 1]

        print('valid_auc = {}'.format(valid_auc))
        cv_scores.append(valid_auc)

        # run-out-of-fold predict
        pred_train_full[val_index] = predict_valid
        pred_test_full += predict_test

    mean_cv_scores = np.mean(cv_scores)
    print('Mean cv auc:', mean_cv_scores)

    print("saving train predictions for ensemble")
    train_pred_df = pd.DataFrame({'userid': id_train})
    train_pred_df[predict_feature] = pred_train_full
    train_pred_df.to_csv(
        "./ensemble/train/catboost/hl_cat_roof{}_predict_train_cv{}_{}.csv".format(
            roof_flod, mean_cv_scores, predict_feature),
        index=False, columns=['userid', predict_feature])

    print("saving test predictions for ensemble")
    pred_test_full = pred_test_full / float(roof_flod)
    test_pred_df = pd.DataFrame({'userid': id_test})
    test_pred_df[predict_feature] = pred_test_full
    test_pred_df.to_csv(
        "./ensemble/test/catboost/hl_cat_roof{}_predict_test_cv{}_{}.csv".format(
            roof_flod, mean_cv_scores, predict_feature),
        index=False, columns=['userid', predict_feature])

def get_feature_importance(data, target, clf='lightgbm', shuffle=False):
    '''
    Parameters
    ----------
    data: input dataset, type of dataframe
    target: input target dataset, type of series
    clf: the name of the model to use, type of string
    shuffle: whether to shuffle the target dataset (for getting null importances)

    Return
    ------
    importance_df: importance of each feature, type of dataframe,
        shape (n_feature, n_importance)
    '''
    # feature list
    train_features = [feature for feature in data.columns.values
                      if feature not in ['target', 'card_id', 'first_active_month']]
    categorical_features = [feature for feature in train_features if 'feature_' in feature]

    # shuffle the target
    y = target.copy().sample(frac=1.0) if shuffle else target.copy()

    # using lightgbm
    if clf == 'lightgbm':
        import lightgbm as lgb
        import pandas as pd

        # construct training data
        train_data = lgb.Dataset(data=data[train_features],
                                 label=y,
                                 free_raw_data=False)
        # model hyperparameters
        lgb_params = {
            'num_leaves': 129,
            'min_data_in_leaf': 148,
            'objective': 'regression',
            'max_depth': 9,
            'learning_rate': 0.005,
            'min_child_samples': 24,
            'boosting': 'gbdt',
            'feature_fraction': 0.7202,
            'bagging_freq': 1,
            'bagging_fraction': 0.8125,
            'bagging_seed': 11,
            'metric': 'rmse',
            'lambda_l1': 0.3468,
            'random_state': 133,
            'verbosity': -1
        }
        # training the model
        clf_lgb = lgb.train(params=lgb_params,
                            train_set=train_data,
                            num_boost_round=850)
        # calculate importance
        importance_df = pd.DataFrame()
        importance_df['feature'] = list(train_features)
        importance_df['importance_gain'] = clf_lgb.feature_importance(importance_type='gain')
        importance_df['importance_split'] = clf_lgb.feature_importance(importance_type='split')
        return importance_df

    if clf == 'catboost':
        from catboost import train, Pool, EFstrType
        import pandas as pd

        # construct training data
        train_data = Pool(data=data[train_features], label=y)
        # model hyperparameters
        cat_params = {
            'loss_function': 'RMSE',
            'learning_rate': 0.02,
            'early_stopping_rounds': 400,
            'border_count': 254,
            'task_type': 'GPU',
            'one_hot_max_size': 6,
            'depth': 11,
            'l2_leaf_reg': 1.0,
            'random_strength': 1.9574,
            'bagging_temperature': 20.9049
        }
        # training the model
        clf_cat = train(pool=train_data,
                        params=cat_params,
                        verbose=False,
                        iterations=1000)
        # calculate feature importance
        importance_df = pd.DataFrame()
        importance_df['feature'] = list(train_features)
        importance_df['PredictionValuesChange'] = \
            clf_cat.get_feature_importance(type='PredictionValuesChange')
        return importance_df

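# The shuffle flag above supports the null-importances technique: compare each
# feature's importance on the real target against its importance distribution
# under shuffled targets. A hedged sketch, assuming df is a DataFrame with a
# 'target' column and that 20 null runs are enough for illustration:
import pandas as pd

actual_imp = get_feature_importance(df, df['target'], clf='catboost', shuffle=False)
null_imp = pd.concat(
    [get_feature_importance(df, df['target'], clf='catboost', shuffle=True)
     for _ in range(20)],
    ignore_index=True)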