예제 #1
0
    def fit(
        self,
        dataset: DatasetH,
        num_boost_round=1000,
        early_stopping_rounds=50,
        verbose_eval=20,
        evals_result=None,
        reweighter=None,
        **kwargs
    ):
        """Fit a CatBoost model on the ``train`` and ``valid`` segments of ``dataset``.

        Args:
            dataset: Source of the data; ``dataset.prepare`` must return one
                frame per requested segment, each with ``"feature"`` and
                ``"label"`` column groups.
            num_boost_round: Stored as the ``iterations`` entry of
                ``self._params``.
            early_stopping_rounds: Stored as the ``early_stopping_rounds``
                entry of ``self._params``.
            verbose_eval: Stored as the ``verbose_eval`` entry of
                ``self._params``.
            evals_result: Optional dict. When given, it is updated in place
                with ``"train"`` and ``"valid"`` entries taken from the first
                metric of the model's ``learn`` / ``validation`` results.
            reweighter: ``None`` for unweighted training, or a ``Reweighter``
                whose ``reweight(df)`` output supplies the sample weights.
            **kwargs: Forwarded to both the ``CatBoost`` constructor and its
                ``fit`` call.

        Raises:
            ValueError: If either segment is empty, if the label is not a
                single column, or if ``reweighter`` has an unsupported type.
        """
        df_train, df_valid = dataset.prepare(
            ["train", "valid"],
            col_set=["feature", "label"],
            data_key=DataHandlerLP.DK_L,
        )
        if df_train.empty or df_valid.empty:
            raise ValueError("Empty data from dataset, please check your dataset config.")
        x_train, y_train = df_train["feature"], df_train["label"]
        x_valid, y_valid = df_valid["feature"], df_valid["label"]

        # CatBoost needs 1D array as its label
        if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
            # Squeeze only the column axis: a bare np.squeeze would also drop
            # the row axis of a one-row frame and produce a 0-d array.
            y_train_1d = np.squeeze(y_train.values, axis=1)
            y_valid_1d = np.squeeze(y_valid.values, axis=1)
        else:
            raise ValueError("CatBoost doesn't support multi-label training")

        if reweighter is None:
            w_train = None
            w_valid = None
        elif isinstance(reweighter, Reweighter):
            w_train = reweighter.reweight(df_train).values
            w_valid = reweighter.reweight(df_valid).values
        else:
            raise ValueError("Unsupported reweighter type.")

        train_pool = Pool(data=x_train, label=y_train_1d, weight=w_train)
        valid_pool = Pool(data=x_valid, label=y_valid_1d, weight=w_valid)

        # Initialize the catboost model
        self._params["iterations"] = num_boost_round
        self._params["early_stopping_rounds"] = early_stopping_rounds
        self._params["verbose_eval"] = verbose_eval
        self._params["task_type"] = "GPU" if get_gpu_device_count() > 0 else "CPU"
        # NOTE(review): kwargs go to the constructor here and to fit() below;
        # confirm every forwarded key is accepted by both.
        self.model = CatBoost(self._params, **kwargs)

        # train the model
        self.model.fit(train_pool, eval_set=valid_pool, use_best_model=True, **kwargs)

        # Write the curves into the caller's dict. Rebinding the parameter to
        # the model's own result object would leave the caller's dict empty.
        if evals_result is not None:
            history = self.model.get_evals_result()
            evals_result["train"] = list(history["learn"].values())[0]
            evals_result["valid"] = list(history["validation"].values())[0]
예제 #2
0
class CatBoost:
    """Wrapper around a ``CatBoostClassifier`` configured with the MultiClass loss.

    Each instance owns one classifier and a directory
    ``<model_save_path>/<model_id>`` in which the trained model file is kept.
    """

    # Value passed as ``verbose`` to the classifier's ``fit``.
    _verbose = 200
    _train_dir = DATA_CACHE_DIR
    # Evaluated once, when the class body is executed.
    _is_gpu_available = get_gpu_device_count()
    # NOTE(review): ``devices`` gets the same "GPU"/None value as ``task_type``;
    # confirm that CatBoost accepts "GPU" as a device specification.
    _task_type = _devices = "GPU" if _is_gpu_available > 0 else None

    def __init__(self, model_id, num_input_features, num_output_classes,
                 model_save_path, **aux_params):
        """Build the classifier and make sure its save directory exists.

        ``num_input_features`` and ``num_output_classes`` are accepted but not
        used here. ``aux_params`` are applied to the classifier through
        ``set_params`` after construction.
        """
        base_params = {
            "loss_function": "MultiClass",
            "task_type": self._task_type,
            "devices": self._devices,
            "train_dir": self._train_dir,
            "random_seed": SEED,
        }
        self.model = CatBoostClassifier(**base_params)
        self.model.set_params(**aux_params)
        self.model_id = model_id

        model_dir = f"{model_save_path}/{model_id}"
        os.makedirs(model_dir, exist_ok=True)
        self.model_path = model_dir
        self.modelfile_save_path = os.path.join(model_dir, STANDARD_MODEL_NAME)

    def load(self):
        """Load the classifier state from ``self.modelfile_save_path``."""
        self.model.load_model(self.modelfile_save_path)

    def save(self):
        """Write the classifier state to ``self.modelfile_save_path``."""
        self.model.save_model(self.modelfile_save_path)

    def fit(self, X_train, y_train, X_valid, y_valid):
        """Train on the training data, evaluate on the validation pair, then save."""
        fit_options = {
            "eval_set": (X_valid, y_valid),
            "use_best_model": True,
            "verbose": self._verbose,
        }
        self.model.fit(Pool(X_train, y_train), **fit_options)
        self.save()

    def predict(self, X, load=False):
        """Return ``predict_proba(X)``, optionally reloading the saved model first."""
        if load:
            self.load()
        probabilities = self.model.predict_proba(X)
        return probabilities

    def explain(self, X_train, y_train, features, classes):
        """Compute feature importances on the given data and hand them to ``plot_importance``.

        ``classes`` is accepted but not used here.
        """
        importance_pool = Pool(X_train, y_train)
        importances = self.model.get_feature_importance(data=importance_pool)
        plot_importance(importances, features, self.model_path, self.model_id)
예제 #3
0
    def fit(self,
            dataset: DatasetH,
            num_boost_round=1000,
            early_stopping_rounds=50,
            verbose_eval=20,
            evals_result=None,
            **kwargs):
        """Fit a CatBoost model on the ``train`` and ``valid`` segments of ``dataset``.

        Args:
            dataset: Source of the data; ``dataset.prepare`` must return one
                frame per requested segment, each with ``"feature"`` and
                ``"label"`` column groups.
            num_boost_round: Stored as the ``iterations`` entry of
                ``self._params``.
            early_stopping_rounds: Stored as the ``early_stopping_rounds``
                entry of ``self._params``.
            verbose_eval: Stored as the ``verbose_eval`` entry of
                ``self._params``.
            evals_result: Optional dict. When given, it is updated in place
                with ``"train"`` and ``"valid"`` entries taken from the first
                metric of the model's ``learn`` / ``validation`` results.
            **kwargs: Forwarded to both the ``CatBoost`` constructor and its
                ``fit`` call.

        Raises:
            ValueError: If either segment is empty or the label is not a
                single column.
        """
        df_train, df_valid = dataset.prepare(
            ["train", "valid"],
            col_set=["feature", "label"],
            data_key=DataHandlerLP.DK_L,
        )
        # Fail early with a clear message instead of an obscure error later.
        if df_train.empty or df_valid.empty:
            raise ValueError(
                "Empty data from dataset, please check your dataset config.")
        x_train, y_train = df_train["feature"], df_train["label"]
        x_valid, y_valid = df_valid["feature"], df_valid["label"]

        # CatBoost needs 1D array as its label
        if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
            # Squeeze only the column axis: a bare np.squeeze would also drop
            # the row axis of a one-row frame and produce a 0-d array.
            y_train_1d = np.squeeze(y_train.values, axis=1)
            y_valid_1d = np.squeeze(y_valid.values, axis=1)
        else:
            raise ValueError("CatBoost doesn't support multi-label training")

        train_pool = Pool(data=x_train, label=y_train_1d)
        valid_pool = Pool(data=x_valid, label=y_valid_1d)

        # Initialize the catboost model
        self._params["iterations"] = num_boost_round
        self._params["early_stopping_rounds"] = early_stopping_rounds
        self._params["verbose_eval"] = verbose_eval
        self._params["task_type"] = "GPU" if get_gpu_device_count(
        ) > 0 else "CPU"
        # NOTE(review): kwargs go to the constructor here and to fit() below;
        # confirm every forwarded key is accepted by both.
        self.model = CatBoost(self._params, **kwargs)

        # train the model
        self.model.fit(train_pool,
                       eval_set=valid_pool,
                       use_best_model=True,
                       **kwargs)

        # Write the curves into the caller's dict. Rebinding the parameter to
        # the model's own result object would leave the caller's dict empty.
        if evals_result is not None:
            history = self.model.get_evals_result()
            evals_result["train"] = list(history["learn"].values())[0]
            evals_result["valid"] = list(history["validation"].values())[0]
예제 #4
0
# https://towardsdatascience.com/catboost-vs-light-gbm-vs-xgboost-5f93620723db

import numpy as np
from catboost.utils import get_gpu_device_count
import xlsxwriter
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import catboost as cb
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
#import pickle

# Report how many GPU devices CatBoost detects and start the wall-clock timer.
print('I see %i GPU devices' % get_gpu_device_count())
start_time = time.time()

# Open the workbook in a context manager so the file handle is closed as soon
# as pandas has read the sheet (an inline open() was never closed).
with open('merged final data.xlsx', 'rb') as workbook:
    data = pd.read_excel(workbook, sheet_name='Sheet1')

# Encode the loan outcome as 0 ('Fully paid') / 1 ('Charged Off').
data = data.replace(['Fully paid', 'Charged Off'], [0, 1])

# data['CreditScore'] = np.log(data['CreditScore'])
# Columns removed before modelling. The trailing names come from the original
# notes; presumably they are the related columns that are kept (TODO confirm).
data.drop(
    [
        'CreditScore',
        'Avg_cur_bal',  # Tot_hi_cred_lim
        'Bc_util',  # Revol_util
        'Total_bc_limit',  # Total_rev_hi_lim
    ],
    axis=1,
    inplace=True,
)

# 70/30 train/test split with a fixed seed; 'LoanStatus' is the target column.
train, test, y_train, y_test = train_test_split(data.drop(["LoanStatus"], axis=1), data["LoanStatus"],
                                                random_state=10, test_size=0.30)
params = {'depth': [2, 6],
예제 #5
0
    return os.path.join(workdir, f'{filename}-{today}.{fileext}')


# Test-set fractions to evaluate: 0.05 through 0.95 in steps of 0.05.
# For a quick run use: test_size_list = [.25]
test_size_list = [percent / 100 for percent in range(5, 100, 5)]

# Random seeds
# For a quick run use: random_seed_list = [3]
random_seed_list = [3, 106, 2019]

# Check whether GPU support is available
if get_gpu_device_count() > 0:
    task_type = "GPU"
else:
    task_type = "CPU"

model_list = [
    BernoulliNB(),
    CatBoostClassifier(task_type=task_type),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    ExtraTreesClassifier(n_jobs=-1),
    GaussianNB(),
    GradientBoostingClassifier(),
    GaussianProcessClassifier(multi_class='one_vs_rest',
                              n_jobs=-1),  # Out of memory
    KNeighborsClassifier(n_jobs=-1),
    LGBMClassifier(n_jobs=-1),
    # LabelSpreading(n_jobs = -1),          # Out of Memory
    LinearDiscriminantAnalysis(),
예제 #6
0
import numpy as np
from catboost.utils import get_gpu_device_count

params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'thread_count': -1,
    'custom_metric': ['AUC:hints=skip_train~false', 'F1'],
    'task_type': 'CPU' if get_gpu_device_count() > 0 else 'CPU',
    # 'task_type': 'GPU', # if torch.cuda.is_available() else 'CPU',
    'grow_policy': 'Lossguide',  # 'SymmetricTree',  #  'Depthwise',
    # 'auto_class_weights': 'Balanced',
    'langevin': True,  # CPU only
    'iterations': 20,
    'learning_rate': 0.002,  # 4e-3,
    'l2_leaf_reg': 1e-1,
    'depth': 16,
    'max_leaves': 10,
    'border_count': 128,
    'verbose': 1,
    'od_type': 'Iter',
    'od_wait': 100,
    # 'early_stopping_rounds': 100,

    # random control
    'bootstrap_type': 'Bayesian',
    # 'random_seed': 100,
    'random_strength': 0.001,
    'rsm': 1,
    'bagging_temperature': 0,
    'boosting_type': 'Plain',  # 'Ordered'