import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool


def catBoost(X_train, X_test, y_train, y_test, tripid_test):
    print("Catboost")
##    eval_pool = Pool(X_test, y_test)  # pool for eval_set
    train_pool = Pool(X_train, y_train)
    # Treat every non-float column as categorical (np.float is deprecated; use the builtin float)
    categorical_features_indices = np.where(X_train.dtypes != float)[0]
##    print(X_train.dtypes)
##    print(categorical_features_indices)
##    incorrect_sum = 0
##    correct_sum = 0
##    for val in y_train:
##        if val == 0:
##            incorrect_sum += 1
##        else:
##            correct_sum += 1
##    print(incorrect_sum)
##    print(correct_sum)
##    print(categorical_features_indices)
##    weight = incorrect_sum / correct_sum  # to handle imbalanced nature
##    model5 = CatBoostClassifier(iterations=310, depth=3, learning_rate=0.408)
##    model5 = CatBoostClassifier(scale_pos_weight=weight, iterations=136, verbose=100)  # 0.9682411736256651
##    model5 = CatBoostClassifier(iterations=496, l2_leaf_reg=3, verbose=100)  # 272 206 496
##    model5 = CatBoostClassifier(iterations=4000, verbose=100)
##    model5.fit(X_train, y_train, eval_set=eval_pool, early_stopping_rounds=1000)
    model5 = CatBoostClassifier(iterations=489, verbose=100)
    model5.fit(X_train, y_train, cat_features=categorical_features_indices)
    y_pred = model5.predict(X_test)
##    print(f1_score(y_test, y_pred))
##    fea_imp = pd.DataFrame({'imp': model5.feature_importances_, 'col': X_train.columns})
##    fea_imp = fea_imp.sort_values(['imp', 'col'], ascending=[True, False]).iloc[-30:]
##    fea_imp.plot(kind='barh', x='col', y='imp', figsize=(10, 7), legend=None)
##    plt.title('CatBoost - Feature Importance')
##    plt.ylabel('Features')
##    plt.xlabel('Importance')
##    plt.show()
    # Write predictions keyed by trip id to a submission file
    data = np.column_stack([tripid_test, y_pred])
    label = ["tripid", "prediction"]
    frame = pd.DataFrame(data, columns=label)
    file_path = "./catboost_output.csv"
    with open(file_path, mode='w', newline='\n') as f:
        frame.to_csv(f, float_format='%.2f', index=False, header=True)
##    # Look at parameters used by the current model
##    arr = model5.get_feature_importance()
##    print(arr)
    print("Output file 'catboost_output.csv' is created")
    print('Parameters currently in use:\n')
    print(model5.get_all_params())
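# Minimal calling sketch for catBoost() above. The input file, column names
# ("label", "tripid") and the use of train_test_split are assumptions for
# illustration only, not part of the original pipeline.
from sklearn.model_selection import train_test_split

df = pd.read_csv("./train.csv")                     # hypothetical input file
X = df.drop(columns=["label", "tripid"])            # hypothetical feature columns
y = df["label"]
trip_ids = df["tripid"]

X_train, X_test, y_train, y_test, _, tripid_test = train_test_split(
    X, y, trip_ids, test_size=0.2, random_state=42)

catBoost(X_train, X_test, y_train, y_test, tripid_test)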
class CatBoostAlgo(_BaseAlgo):

    # Constructor ----------------------------------------------------------
    def __init__(self, tune_props=None, persist_props=None, **kwargs):
        super().__init__(
            tune_props=tune_props,
            persist_props=persist_props,
            **kwargs
        )

    # Init/parameters functions ---------------------------------------------
    def _init_model(self, **kwargs):
        self._model_name = MODEL_CAT
        self._model = CatBoostClassifier(**kwargs)

    def get_params(self):
        return self._model.get_all_params()
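# Hypothetical usage sketch: _BaseAlgo, MODEL_CAT, tune_props and persist_props are
# project-specific and not shown here, so this only illustrates the intended call
# pattern, assuming _BaseAlgo forwards **kwargs to _init_model and exposes fit().
algo = CatBoostAlgo(iterations=500, verbose=100)
# algo.fit(X_train, y_train)    # assumed to be provided by _BaseAlgo
# print(algo.get_params())      # CatBoost's get_all_params() is populated after training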
import matplotlib.pyplot as plt
from catboost import Pool
from catboost.utils import (get_fpr_curve, get_fnr_curve, select_threshold,
                            get_confusion_matrix)

# Error rates as a function of the decision threshold, both derived from the same ROC curve
(thresholds, fpr) = get_fpr_curve(curve=curve)
(thresholds, fnr) = get_fnr_curve(curve=curve)

plt.figure(figsize=(16, 8))
lw = 2
plt.plot(thresholds, fpr, color='blue', lw=lw, label='FPR', alpha=0.5)
plt.plot(thresholds, fnr, color='green', lw=lw, label='FNR', alpha=0.5)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)
plt.xlabel('Threshold', fontsize=16)
plt.ylabel('Error Rate', fontsize=16)
# plt.title('FPR-FNR curves', fontsize=20)
plt.legend(loc="lower left", fontsize=16)
plt.show()

# Find thresholds that satisfy a target FNR / FPR
print(select_threshold(model=model, data=eval_train_pool, FNR=0.2))
print(select_threshold(model=model, data=eval_train_pool, FPR=0.4))

# Confusion matrix on the evaluation pool
print(get_confusion_matrix(model, data=eval_pool))

# Result show
test_pool = Pool(X_test, y_test, cat_features=categorical_features_indices)
model.get_all_params()                                # resolved training parameters
model.eval_metrics(data=eval_pool, metrics='Recall')
model.score(test_pool)
result = model.predict_proba(eval_test_pool)
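# Sketch of how the objects referenced above are typically prepared (assumed setup):
# `curve` comes from catboost.utils.get_roc_curve on a validation Pool, and the
# eval/train pools are built from the held-out and training data respectively.
from catboost import Pool
from catboost.utils import get_roc_curve

eval_pool = Pool(X_test, y_test, cat_features=categorical_features_indices)
eval_train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
curve = get_roc_curve(model, eval_pool)   # returns (fpr, tpr, thresholds)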
""" print("CatBoost OOB...") clf = CatBoostClassifier(custom_metric="F1", eval_metric="F1", random_seed=42, thread_count=4, verbose=False) clf.fit(X_train, y_train.values.ravel(), cat_features=cat_cols + encode_cols) cf_predict = clf.predict(X_test) cf_proba = clf.predict_proba(X_test) with open(DIR_OUTPUT + "/results.txt", "a") as cb: cb.write("\n \n------------- CatBoost OOB ------------- \n") cb.write("ROC-AUC score: {} using {} \n".format( roc_auc_score(y_test, cf_proba[:, 1]), clf.get_all_params())) cb.write("F1 score for CatBoostClassifier: {:.3f} \n".format( metrics.f1_score(y_test, cf_predict))) cb.write("Accuracy for reports: {:.3f} \n".format( report_accuracy(y_test, cf_predict))) cb.write("Accuracy for most likely rootcauses: {:.3f} \n".format( most_likely_error_accuracy(y_test, cf_proba))) cb.write(np.array2string(metrics.confusion_matrix(y_test, cf_predict))) """ CatBoost hyperopt """ print("CatBoost hyperopt...") space4cb = { 'depth': hp.choice('depth', range(1, 15)), 'learning_rate': hp.uniform('learning_rate', 0.001, 0.3), 'l2_leaf_reg': hp.choice('l2_leaf_reg', range(1, 10)),
    def train_1fold(self, fold, params, params_custom):
        X_train, X_valid, y_train, y_valid, X_test, vdx, tdx = self.get_fold_data(fold)

        # Treat every non-numeric column as categorical
        cat_feature_idx = []
        for i, c in enumerate(X_train):
            if not is_numeric_dtype(X_train[c]):
                cat_feature_idx.append(i)

        if fold == 0:
            X_train.dtypes.to_csv(self.models_path + "/dtypes.csv")
        logger.info(f"X_train.shape = {X_train.shape}")

        # Offset the random seed per fold so each fold trains with a different seed
        params2 = copy.deepcopy(params)
        if params2["random_seed"] is not None:
            params2["random_seed"] = params2["random_seed"] + fold
            logger.info(f"Set catboost train random_seed = {params2['random_seed']}")

        model = CatBoostClassifier(**params2)
        model.fit(
            X_train, y_train,
            cat_features=cat_feature_idx,
            eval_set=(X_valid, y_valid)
        )
        model.save_model(self.models_path + f'/model-catboost-f{fold:02d}.bin')
        util.dump_json(model.get_all_params(), self.models_path + "/params.json")

        # Per-iteration learning curves for this fold
        evals = model.get_evals_result()
        evals_df = pd.DataFrame({
            f"logloss_train_f{fold:02d}": evals["learn"]['Logloss'],
            f"accuracy_train_f{fold:02d}": evals["learn"]['Accuracy'],
            f"logloss_valid_f{fold:02d}": evals['validation']['Logloss'],
            f"accuracy_valid_f{fold:02d}": evals['validation']['Accuracy']
        })
        self.evals_df.append(evals_df)

        # Out-of-fold, train and test probabilities
        preds_valid = model.predict_proba(X_valid)[:, 1]
        logger.info(f"len(vdx)={len(vdx)} len(preds_valid)={len(preds_valid)}")
        self.preds_valid_all.loc[vdx, "pred"] = preds_valid

        preds_train = model.predict_proba(X_train)[:, 1]
        self.preds_train_all.append(pd.DataFrame({fold: preds_train}, index=tdx))

        preds_test = model.predict_proba(X_test)[:, 1]
        self.preds_test_all.append(preds_test)

        # Fold metrics
        acc_valid = accuracy_score(y_valid, np.round(preds_valid))
        acc_train = accuracy_score(y_train, np.round(preds_train))
        logloss_valid = log_loss(y_valid, preds_valid)
        logloss_train = log_loss(y_train, preds_train)
        ms = [fold, acc_train, acc_valid, logloss_train, logloss_valid, model.get_best_iteration()]
        self.mets.append(ms)
        show_mets(*ms)

        # Feature importance for this fold
        for it in ["FeatureImportance"]:
            imp = pd.Series(model.get_feature_importance(type=it), index=X_train.columns)
            imp.name = fold
            imp.index.name = "feature"
            self.importance[it].append(imp)
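# Hypothetical post-processing sketch (not part of the original class): once
# train_1fold has been called for every fold, the artifacts collected on the
# trainer instance could be combined roughly like this.
def aggregate_folds(trainer):
    preds_test = np.mean(np.stack(trainer.preds_test_all), axis=0)            # average test predictions over folds
    evals = pd.concat(trainer.evals_df, axis=1)                               # per-fold learning curves side by side
    importance = pd.concat(trainer.importance["FeatureImportance"], axis=1)   # per-fold feature importances
    mets = pd.DataFrame(trainer.mets, columns=[
        "fold", "acc_train", "acc_valid", "logloss_train", "logloss_valid", "best_iteration"])
    return preds_test, evals, importance, mets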