def catBoost(X_train, X_test, y_train, y_test, tripid_test):
    """Train a CatBoost classifier and write test-set predictions to a CSV file.

    Every column of ``X_train`` whose dtype does not compare equal to the
    builtin ``float`` (i.e. anything other than NumPy ``float64``) is passed
    to CatBoost as a categorical feature.

    Args:
        X_train: Training features; must expose ``.dtypes`` (a pandas
            DataFrame is assumed).
        X_test: Features to predict on.
        y_train: Training labels.
        y_test: Unused by this function; kept so existing callers keep working.
        tripid_test: Identifiers written next to each prediction.

    Side effects:
        Writes ``./catboost_output.csv`` with the columns ``tripid`` and
        ``prediction`` and prints progress plus the fitted model's parameters.
    """
    print("Catboost")

    # ``np.float`` was only an alias of the builtin ``float``; it was
    # deprecated in NumPy 1.20 and removed in 1.24.  Comparing against the
    # builtin selects exactly the same columns and works on every version.
    categorical_features_indices = np.where(X_train.dtypes != float)[0]

    model5 = CatBoostClassifier(iterations=489, verbose=100)
    model5.fit(X_train, y_train, cat_features=categorical_features_indices)

    y_pred = model5.predict(X_test)

    # Pair every trip id with its predicted label, one row per test sample.
    data = np.column_stack([tripid_test, y_pred])
    label = ["tripid", "prediction"]
    frame = pd.DataFrame(data, columns=label)
    file_path = "./catboost_output.csv"
    # newline='\n' keeps the handle from translating line endings on write.
    with open(file_path, mode='w', newline='\n') as f:
        frame.to_csv(f, float_format='%.2f', index=False, header=True)

    print("Output file 'catboost_output.csv' is created")
    print('Parameters currently in use:\n')
    print(model5.get_all_params())
Example #2
0
class CatBoostAlgo(_BaseAlgo):
    """``_BaseAlgo`` subclass whose underlying model is a ``CatBoostClassifier``."""

    # Constructor ---------------------------------------------------------
    def __init__(self, tune_props=None, persist_props=None, **kwargs):
        """Forward every argument, unchanged, to ``_BaseAlgo.__init__``."""
        base_args = {
            "tune_props": tune_props,
            "persist_props": persist_props,
        }
        super().__init__(**base_args, **kwargs)

    # Model construction --------------------------------------------------
    def _init_model(self, **kwargs):
        """Record the model name and build the classifier from ``kwargs``.

        NOTE(review): presumably invoked by ``_BaseAlgo`` during setup -- the
        caller is not visible here, confirm against the base class.
        """
        self._model_name = MODEL_CAT
        self._model = CatBoostClassifier(**kwargs)

    # Parameter access ----------------------------------------------------
    def get_params(self):
        """Return whatever the wrapped model's ``get_all_params()`` reports."""
        model = self._model
        return model.get_all_params()
# Imports needed by this section, placed ahead of their first use.  They
# previously appeared *after* the statements that call them, which raises
# NameError unless the names were already imported elsewhere.
from catboost import Pool
from catboost.utils import get_confusion_matrix, select_threshold

# --- FPR / FNR against decision threshold --------------------------------
# NOTE(review): `curve`, `fpr`, `get_fnr_curve`, `plt`, `model` and the
# eval/train pools used below are defined outside this section.
(thresholds, fnr) = get_fnr_curve(curve=curve)
plt.figure(figsize=(16, 8))
lw = 2
plt.plot(thresholds, fpr, color='blue', lw=lw, label='FPR', alpha=0.5)
plt.plot(thresholds, fnr, color='green', lw=lw, label='FNR', alpha=0.5)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)
plt.xlabel('Threshold', fontsize=16)
plt.ylabel('Error Rate', fontsize=16)
#plt.title('FPR-FNR curves', fontsize=20)
plt.legend(loc="lower left", fontsize=16)
plt.show()

# --- Threshold selection --------------------------------------------------
# Print the thresholds selected for a target FNR of 0.2 and a target FPR of 0.4.
print(select_threshold(model=model, data=eval_train_pool, FNR=0.2))
print(select_threshold(model=model, data=eval_train_pool, FPR=0.4))

# --- Confusion matrix -----------------------------------------------------
print(get_confusion_matrix(model, data=eval_pool))

# --- Result inspection ----------------------------------------------------
test_pool = Pool(X_test, y_test, cat_features=categorical_features_indices)
# The next three calls discard their return values when run as a script;
# they only display something in an interactive/notebook session.
model.get_all_params()  #params

model.eval_metrics(data=eval_pool, metrics='Recall')
model.score(test_pool)
result = model.predict_proba(eval_test_pool)
 """
 # Baseline CatBoost run labelled "OOB" in the output below (presumably
 # "out of the box", i.e. no hyper-parameter tuning -- TODO confirm).
 print("CatBoost OOB...")
 # F1 is used both as the tracked custom metric and as the eval metric;
 # the seed and thread count are fixed, and training output is silenced.
 clf = CatBoostClassifier(custom_metric="F1",
                          eval_metric="F1",
                          random_seed=42,
                          thread_count=4,
                          verbose=False)
 # Labels are flattened via .values.ravel(); the categorical feature list is
 # cat_cols + encode_cols (both defined outside this section -- assumed to
 # be lists of column identifiers, verify against the caller).
 clf.fit(X_train,
         y_train.values.ravel(),
         cat_features=cat_cols + encode_cols)
 cf_predict = clf.predict(X_test)
 cf_proba = clf.predict_proba(X_test)
 # Append (mode "a") this run's scores to the shared results file.
 with open(DIR_OUTPUT + "/results.txt", "a") as cb:
     cb.write("\n \n------------- CatBoost OOB ------------- \n")
     # ROC-AUC is computed from column 1 of the probability matrix
     # (presumably the positive class -- TODO confirm label order).
     cb.write("ROC-AUC score: {} using {} \n".format(
         roc_auc_score(y_test, cf_proba[:, 1]), clf.get_all_params()))
     cb.write("F1 score for CatBoostClassifier: {:.3f} \n".format(
         metrics.f1_score(y_test, cf_predict)))
     # report_accuracy / most_likely_error_accuracy are project helpers
     # defined elsewhere; their exact semantics are not visible here.
     cb.write("Accuracy for reports: {:.3f} \n".format(
         report_accuracy(y_test, cf_predict)))
     cb.write("Accuracy for most likely rootcauses: {:.3f} \n".format(
         most_likely_error_accuracy(y_test, cf_proba)))
     cb.write(np.array2string(metrics.confusion_matrix(y_test, cf_predict)))
 """
 CatBoost hyperopt
 """
 print("CatBoost hyperopt...")
 space4cb = {
     'depth': hp.choice('depth', range(1, 15)),
     'learning_rate': hp.uniform('learning_rate', 0.001, 0.3),
     'l2_leaf_reg': hp.choice('l2_leaf_reg', range(1, 10)),
Example #5
0
    def train_1fold(self, fold, params, params_custom):
        """Fit one CatBoost model on a single fold and record its outputs.

        The fold's data comes from ``self.get_fold_data(fold)``.  Columns of
        the training frame that are not numeric are passed to CatBoost as
        categorical features.  ``params`` is deep-copied; when its
        ``"random_seed"`` entry is not ``None`` the fold number is added to it.

        Recorded on ``self``: the per-iteration eval history (``evals_df``),
        validation / train / test probabilities (``preds_valid_all``,
        ``preds_train_all``, ``preds_test_all``), summary metrics (``mets``)
        and feature importances (``importance``).  The fitted model and its
        parameters are written under ``self.models_path``; the parameter file
        uses the same path for every fold.

        Args:
            fold: Integer fold number; also used in file and column names.
            params: Keyword arguments for ``CatBoostClassifier``; must contain
                a ``"random_seed"`` key.
            params_custom: Not used in the visible body of this method.
        """
        X_train, X_valid, y_train, y_valid, X_test, vdx, tdx = self.get_fold_data(fold)

        # Positional indices of every non-numeric column.
        cat_feature_idx = [
            position
            for position, column in enumerate(X_train)
            if not is_numeric_dtype(X_train[column])
        ]

        # The dtype dump and the shape log are only produced for fold 0.
        if fold == 0:
            X_train.dtypes.to_csv(self.models_path + "/dtypes.csv")
            logger.info(f"X_train.shape = {X_train.shape}")

        # Work on a private copy so the caller's dict is never mutated.
        fold_params = copy.deepcopy(params)
        base_seed = fold_params["random_seed"]
        if base_seed is not None:
            fold_params["random_seed"] = base_seed + fold
            logger.info(f"Set catboost train random_seed = {fold_params['random_seed']}")

        model = CatBoostClassifier(**fold_params)
        model.fit(X_train, y_train, cat_features=cat_feature_idx, eval_set=(X_valid, y_valid))

        model.save_model(self.models_path + f'/model-catboost-f{fold:02d}.bin')
        util.dump_json(model.get_all_params(), self.models_path + "/params.json")

        # Per-iteration learning curves, one column per metric/split.
        history = model.get_evals_result()
        tag = f"f{fold:02d}"
        curves = pd.DataFrame({
            f"logloss_train_{tag}": history["learn"]['Logloss'],
            f"accuracy_train_{tag}": history["learn"]['Accuracy'],
            f"logloss_valid_{tag}": history['validation']['Logloss'],
            f"accuracy_valid_{tag}": history['validation']['Accuracy'],
        })
        self.evals_df.append(curves)

        # Column 1 of predict_proba is kept for each of the three data splits.
        preds_valid = model.predict_proba(X_valid)[:, 1]
        logger.info(f"len(vdx)={len(vdx)} len(preds_valid)={len(preds_valid)}")
        self.preds_valid_all.loc[vdx, "pred"] = preds_valid

        preds_train = model.predict_proba(X_train)[:, 1]
        train_frame = pd.DataFrame({fold: preds_train}, index=tdx)
        self.preds_train_all.append(train_frame)

        preds_test = model.predict_proba(X_test)[:, 1]
        self.preds_test_all.append(preds_test)

        # Accuracy uses probabilities rounded to the nearest label.
        acc_valid = accuracy_score(y_valid, np.round(preds_valid))
        acc_train = accuracy_score(y_train, np.round(preds_train))
        logloss_valid = log_loss(y_valid, preds_valid)
        logloss_train = log_loss(y_train, preds_train)

        ms = [
            fold,
            acc_train,
            acc_valid,
            logloss_train,
            logloss_valid,
            model.get_best_iteration(),
        ]
        self.mets.append(ms)
        show_mets(*ms)

        for importance_type in ["FeatureImportance"]:
            scores = model.get_feature_importance(type=importance_type)
            imp = pd.Series(scores, index=X_train.columns, name=fold)
            imp.index.name = "feature"
            self.importance[importance_type].append(imp)