def fit(
    self,
    dataset: DatasetH,
    num_boost_round=1000,
    early_stopping_rounds=50,
    verbose_eval=20,
    evals_result=None,
    reweighter=None,
    **kwargs
):
    """Train a CatBoost model on the "train" segment and validate on "valid".

    Args:
        dataset: Source of the data; ``dataset.prepare`` is asked for the
            "train" and "valid" segments with the "feature" and "label"
            column sets.
        num_boost_round: Stored in ``self._params["iterations"]``.
        early_stopping_rounds: Stored in
            ``self._params["early_stopping_rounds"]``.
        verbose_eval: Stored in ``self._params["verbose_eval"]``.
        evals_result: Optional dict that is filled in place with the metric
            histories under the keys "train" and "valid". When omitted, a
            fresh dict is used, so nothing is shared between calls.
        reweighter: ``None`` for unweighted samples, or a ``Reweighter``
            whose ``reweight`` output is used as per-sample weights.
        **kwargs: Forwarded to both the ``CatBoost`` constructor and its
            ``fit`` call.

    Raises:
        ValueError: If either segment is empty, if a label frame is not a
            single column, or if ``reweighter`` has an unsupported type.
    """
    # A ``dict()`` default would be shared by every call; use a None sentinel.
    if evals_result is None:
        evals_result = {}

    df_train, df_valid = dataset.prepare(
        ["train", "valid"],
        col_set=["feature", "label"],
        data_key=DataHandlerLP.DK_L,
    )
    if df_train.empty or df_valid.empty:
        raise ValueError("Empty data from dataset, please check your dataset config.")
    x_train, y_train = df_train["feature"], df_train["label"]
    x_valid, y_valid = df_valid["feature"], df_valid["label"]

    # CatBoost needs a 1D array as its label. Squeeze only the column axis so
    # that a single-row frame still yields shape (1,) instead of a 0-d array.
    labels_1d = []
    for label_frame in (y_train, y_valid):
        label_values = label_frame.values
        if label_values.ndim != 2 or label_values.shape[1] != 1:
            raise ValueError("CatBoost doesn't support multi-label training")
        labels_1d.append(np.squeeze(label_values, axis=1))
    y_train_1d, y_valid_1d = labels_1d

    if reweighter is None:
        w_train = None
        w_valid = None
    elif isinstance(reweighter, Reweighter):
        w_train = reweighter.reweight(df_train).values
        w_valid = reweighter.reweight(df_valid).values
    else:
        raise ValueError("Unsupported reweighter type.")

    train_pool = Pool(data=x_train, label=y_train_1d, weight=w_train)
    valid_pool = Pool(data=x_valid, label=y_valid_1d, weight=w_valid)

    # Initialize the catboost model; note that self._params is updated in place.
    self._params["iterations"] = num_boost_round
    self._params["early_stopping_rounds"] = early_stopping_rounds
    self._params["verbose_eval"] = verbose_eval
    self._params["task_type"] = "GPU" if get_gpu_device_count() > 0 else "CPU"
    self.model = CatBoost(self._params, **kwargs)

    # Train the model.
    self.model.fit(train_pool, eval_set=valid_pool, use_best_model=True, **kwargs)

    # Copy the histories INTO the caller's dict. Rebinding the local name
    # ``evals_result`` to the model's own dict would leave the caller's
    # object untouched.
    history = self.model.get_evals_result()
    evals_result["train"] = next(iter(history["learn"].values()))
    evals_result["valid"] = next(iter(history["validation"].values()))
class CatBoost:
    """Wrapper around a ``CatBoostClassifier`` configured for "MultiClass" loss.

    The wrapper owns a per-model directory (``<model_save_path>/<model_id>``)
    in which the trained model file is stored, and exposes fit / predict /
    save / load / explain helpers on top of the underlying classifier.
    """

    # Passed to ``CatBoostClassifier.fit`` as its ``verbose`` argument.
    _verbose = 200
    _train_dir = DATA_CACHE_DIR
    # Evaluated once, when the class body is executed.
    _is_gpu_available = get_gpu_device_count()
    _task_type = "GPU" if _is_gpu_available > 0 else None
    # NOTE(review): CatBoost documents ``devices`` as GPU device ids
    # (e.g. "0" or "0:1"); confirm that the literal "GPU" is accepted.
    _devices = "GPU" if _is_gpu_available > 0 else None

    def __init__(self, model_id, num_input_features, num_output_classes, model_save_path, **aux_params):
        """Build the classifier and make sure the model directory exists.

        ``num_input_features`` and ``num_output_classes`` are accepted but not
        used here. ``aux_params`` are applied through ``set_params`` after the
        classifier has been constructed.
        """
        self.model = CatBoostClassifier(
            loss_function="MultiClass",
            task_type=self._task_type,
            devices=self._devices,
            train_dir=self._train_dir,
            random_seed=SEED,
        )
        self.model.set_params(**aux_params)
        self.model_id = model_id

        model_dir = f"{model_save_path}/{model_id}"
        os.makedirs(model_dir, exist_ok=True)
        self.model_path = model_dir
        self.modelfile_save_path = os.path.join(model_dir, STANDARD_MODEL_NAME)

    def load(self):
        """Load the model stored at ``self.modelfile_save_path``."""
        self.model.load_model(self.modelfile_save_path)

    def save(self):
        """Write the current model to ``self.modelfile_save_path``."""
        self.model.save_model(self.modelfile_save_path)

    def fit(self, X_train, y_train, X_valid, y_valid):
        """Train on the training data, evaluate on the validation pair, then save."""
        self.model.fit(
            Pool(X_train, y_train),
            eval_set=(X_valid, y_valid),
            use_best_model=True,
            verbose=self._verbose,
        )
        self.save()

    def predict(self, X, load=False):
        """Return ``predict_proba(X)``, reloading the saved model first if ``load`` is truthy."""
        if load:
            self.load()
        return self.model.predict_proba(X)

    def explain(self, X_train, y_train, features, classes):
        """Compute feature importances on the training data and plot them.

        ``classes`` is accepted but not used by this method.
        """
        training_pool = Pool(X_train, y_train)
        importances = self.model.get_feature_importance(data=training_pool)
        plot_importance(importances, features, self.model_path, self.model_id)
def fit(self,
        dataset: DatasetH,
        num_boost_round=1000,
        early_stopping_rounds=50,
        verbose_eval=20,
        evals_result=None,
        **kwargs):
    """Train a CatBoost model on the "train" segment and validate on "valid".

    Args:
        dataset: Source of the data; ``dataset.prepare`` is asked for the
            "train" and "valid" segments with the "feature" and "label"
            column sets.
        num_boost_round: Stored in ``self._params["iterations"]``.
        early_stopping_rounds: Stored in
            ``self._params["early_stopping_rounds"]``.
        verbose_eval: Stored in ``self._params["verbose_eval"]``.
        evals_result: Optional dict that is filled in place with the metric
            histories under the keys "train" and "valid". When omitted, a
            fresh dict is used, so nothing is shared between calls.
        **kwargs: Forwarded to both the ``CatBoost`` constructor and its
            ``fit`` call.

    Raises:
        ValueError: If either segment is empty or a label frame is not a
            single column.
    """
    # A ``dict()`` default would be shared by every call; use a None sentinel.
    if evals_result is None:
        evals_result = {}

    df_train, df_valid = dataset.prepare(
        ["train", "valid"],
        col_set=["feature", "label"],
        data_key=DataHandlerLP.DK_L,
    )
    # Fail early with a clear message instead of an opaque error further down.
    if df_train.empty or df_valid.empty:
        raise ValueError("Empty data from dataset, please check your dataset config.")
    x_train, y_train = df_train["feature"], df_train["label"]
    x_valid, y_valid = df_valid["feature"], df_valid["label"]

    # CatBoost needs a 1D array as its label. Squeeze only the column axis so
    # that a single-row frame still yields shape (1,) instead of a 0-d array.
    labels_1d = []
    for label_frame in (y_train, y_valid):
        label_values = label_frame.values
        if label_values.ndim != 2 or label_values.shape[1] != 1:
            raise ValueError("CatBoost doesn't support multi-label training")
        labels_1d.append(np.squeeze(label_values, axis=1))
    y_train_1d, y_valid_1d = labels_1d

    train_pool = Pool(data=x_train, label=y_train_1d)
    valid_pool = Pool(data=x_valid, label=y_valid_1d)

    # Initialize the catboost model; note that self._params is updated in place.
    self._params["iterations"] = num_boost_round
    self._params["early_stopping_rounds"] = early_stopping_rounds
    self._params["verbose_eval"] = verbose_eval
    self._params["task_type"] = "GPU" if get_gpu_device_count() > 0 else "CPU"
    self.model = CatBoost(self._params, **kwargs)

    # Train the model.
    self.model.fit(train_pool,
                   eval_set=valid_pool,
                   use_best_model=True,
                   **kwargs)

    # Copy the histories INTO the caller's dict. Rebinding the local name
    # ``evals_result`` to the model's own dict would leave the caller's
    # object untouched.
    history = self.model.get_evals_result()
    evals_result["train"] = next(iter(history["learn"].values()))
    evals_result["valid"] = next(iter(history["validation"].values()))
# https://towardsdatascience.com/catboost-vs-light-gbm-vs-xgboost-5f93620723db import numpy as np from catboost.utils import get_gpu_device_count import xlsxwriter import pandas as pd import numpy as np import time from sklearn.model_selection import train_test_split import matplotlib.pyplot as plt import catboost as cb from sklearn.model_selection import GridSearchCV from sklearn import metrics #import pickle print('I see %i GPU devices' % get_gpu_device_count()) start_time = time.time() data = pd.read_excel(open('merged final data.xlsx', 'rb'), sheet_name='Sheet1') data = data.replace(['Fully paid', 'Charged Off'], [0, 1]) # data['CreditScore'] = np.log(data['CreditScore']) data.drop(['CreditScore'], axis=1, inplace=True) data.drop(['Avg_cur_bal'], axis=1, inplace=True) # Tot_hi_cred_lim data.drop(['Bc_util'], axis=1, inplace=True) # Revol_util data.drop(['Total_bc_limit'], axis=1, inplace=True) # Total_rev_hi_lim train, test, y_train, y_test = train_test_split(data.drop(["LoanStatus"], axis=1), data["LoanStatus"], random_state=10, test_size=0.30) params = {'depth': [2, 6],
return os.path.join(workdir, f'{filename}-{today}.{fileext}') # 測試資料比例 # test_size_list = [ .25 ] test_size_list = [ .05, .1, .15, .2, .25, .3, .35, .4, .45, .5, .55, .6, .65, .7, .75, .80, .85, .90, .95 ] # 隨機種子 random_seed_list = [3, 106, 2019] # random_seed_list = [3] # 檢查是否有 GPU 支援 task_type = "GPU" if get_gpu_device_count() > 0 else "CPU" model_list = [ BernoulliNB(), CatBoostClassifier(task_type=task_type), DecisionTreeClassifier(), ExtraTreeClassifier(), ExtraTreesClassifier(n_jobs=-1), GaussianNB(), GradientBoostingClassifier(), GaussianProcessClassifier(multi_class='one_vs_rest', n_jobs=-1), # Out of memory KNeighborsClassifier(n_jobs=-1), LGBMClassifier(n_jobs=-1), # LabelSpreading(n_jobs = -1), # Out of Memory LinearDiscriminantAnalysis(),
import numpy as np from catboost.utils import get_gpu_device_count params = { 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'thread_count': -1, 'custom_metric': ['AUC:hints=skip_train~false', 'F1'], 'task_type': 'CPU' if get_gpu_device_count() > 0 else 'CPU', # 'task_type': 'GPU', # if torch.cuda.is_available() else 'CPU', 'grow_policy': 'Lossguide', # 'SymmetricTree', # 'Depthwise', # 'auto_class_weights': 'Balanced', 'langevin': True, # CPU only 'iterations': 20, 'learning_rate': 0.002, # 4e-3, 'l2_leaf_reg': 1e-1, 'depth': 16, 'max_leaves': 10, 'border_count': 128, 'verbose': 1, 'od_type': 'Iter', 'od_wait': 100, # 'early_stopping_rounds': 100, # random control 'bootstrap_type': 'Bayesian', # 'random_seed': 100, 'random_strength': 0.001, 'rsm': 1, 'bagging_temperature': 0, 'boosting_type': 'Plain', # 'Ordered'