def test_wrong_feature_count():
    """Predicting with one feature column fewer than was fitted must raise CatboostError."""
    data = np.random.rand(100, 10)
    label = np.random.randint(2, size=100)
    model = CatBoostClassifier()
    model.fit(data, label)
    # Only the mismatched predict call is expected to raise; an error during
    # data generation or training must fail the test instead of passing it.
    with pytest.raises(CatboostError):
        model.predict(data[:, :-1])
def test_full_history():
    """Train with ``approx_on_full_history=True`` and compare the saved model with the canonical one."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    eval_pool = Pool(TEST_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(
        od_type='Iter',
        od_wait=20,
        random_seed=42,
        approx_on_full_history=True,
    )
    classifier.fit(learn_pool, eval_set=eval_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_pool_after_fit():
    """Check that the features of a pool still match an untouched copy after fitting on it."""
    reference_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    fitted_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    # Both pools are loaded from the same file and must agree before training.
    assert _check_data(reference_pool.get_features(), fitted_pool.get_features())
    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(fitted_pool)
    # ... and must still agree after one of them was used for training.
    assert _check_data(reference_pool.get_features(), fitted_pool.get_features())
def test_raw_predict_equals_to_model_predict():
    """Raw predictions on the eval pool must equal the values from ``get_test_eval``."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    eval_pool = Pool(TEST_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(iterations=10, random_seed=0)
    classifier.fit(learn_pool, eval_set=eval_pool)
    raw_pred = classifier.predict(eval_pool, prediction_type='RawFormulaVal')
    matches = classifier.get_test_eval() == raw_pred
    assert all(matches)
def train_preprocessor(path='.', train='train.csv'):
    """Train the "trash" text classifier and store it next to its vectorizer.

    The CSV ``train`` inside ``path`` is read; its ``text`` column is turned
    into token counts and its ``status`` column is used as the label.
    The fitted model is saved as ``trash_model`` and the vectorizer as
    ``trash_vectorizer``, both inside ``path``.

    :param path: directory holding the CSV; outputs are written here as well
    :param train: file name of the training CSV
    """
    print('start train trash preprocessor...')
    df = pd.read_csv(os.path.join(path, train))

    # Everything except the last 100 rows is used for training, rows
    # [-100:-50] for validation; the final 50 rows are not used here.
    train_data = df[:-100]
    validation_data = df[-100: -50]

    vectorizer = CountVectorizer()
    x_train_counts = vectorizer.fit_transform(train_data.text)
    x_validation_counts = vectorizer.transform(validation_data.text)

    model = CatBoostClassifier(
        iterations=250,
        train_dir=path,
        logging_level='Silent',
        allow_writing_files=False,
    )
    model.fit(
        X=x_train_counts.toarray(),
        y=train_data.status,
        eval_set=(x_validation_counts.toarray(), validation_data.status),
        use_best_model=True,
    )

    model.save_model(os.path.join(path, 'trash_model'))
    joblib.dump(vectorizer, os.path.join(path, 'trash_vectorizer'))
    # The closing message previously said "sentiment" although this function
    # trains the "trash" model (see the start message and the output names).
    print('end train trash preprocessor...')
def test_ntree_limit():
    """Predict probabilities with ``ntree_end=10`` on a 100-iteration model and compare with the canonical file."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    eval_pool = Pool(TEST_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(iterations=100, random_seed=0)
    classifier.fit(learn_pool)
    probabilities = classifier.predict_proba(eval_pool, ntree_end=10)
    np.save(PREDS_PATH, np.array(probabilities))
    return local_canonical_file(PREDS_PATH)
def test_non_ones_weight():
    """Train on a pool weighted 1..num_row and compare the saved model with the canonical one."""
    weighted_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    row_weights = np.arange(1, weighted_pool.num_row() + 1)
    weighted_pool.set_weight(row_weights)
    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(weighted_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_zero_baseline():
    """Train on a pool with an all-zero baseline and compare the saved model with the canonical one."""
    baseline_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    zeros = np.zeros(baseline_pool.num_row())
    baseline_pool.set_baseline(zeros)
    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(baseline_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_no_cat_in_predict():
    """Predicting from mapped raw features must match predicting from a Pool with explicit ``cat_features``."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    eval_pool = Pool(TEST_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(learn_pool)

    # First prediction: mapped features passed directly, no cat_features given.
    mapped_features = map_cat_features(
        eval_pool.get_features(), learn_pool.get_cat_feature_indices())
    pred_from_features = classifier.predict(mapped_features)

    # Second prediction: the same mapping wrapped in a Pool with cat_features.
    explicit_pool = Pool(
        map_cat_features(eval_pool.get_features(), learn_pool.get_cat_feature_indices()),
        cat_features=learn_pool.get_cat_feature_indices(),
    )
    pred_from_pool = classifier.predict(explicit_pool)

    assert _check_data(pred_from_features, pred_from_pool)
def test_predict_class():
    """Save ``prediction_type="Class"`` predictions and compare them with the canonical file."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    eval_pool = Pool(TEST_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(learn_pool)
    class_pred = classifier.predict(eval_pool, prediction_type="Class")
    np.save(PREDS_PATH, np.array(class_pred))
    return local_canonical_file(PREDS_PATH)
def test_staged_predict():
    """Collect every result yielded by ``staged_predict`` and compare them with the canonical file."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    eval_pool = Pool(TEST_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(iterations=10, random_seed=0)
    classifier.fit(learn_pool)
    staged = list(classifier.staged_predict(eval_pool))
    np.save(PREDS_PATH, np.array(staged))
    return local_canonical_file(PREDS_PATH)
def test_multiclass():
    """Train a MultiClass model, reload it from disk and compare its probabilities with the canonical file."""
    cloudness_pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    trained = CatBoostClassifier(
        iterations=2,
        random_seed=0,
        loss_function='MultiClass',
        thread_count=8,
    )
    trained.fit(cloudness_pool)
    trained.save_model(OUTPUT_MODEL_PATH)

    # Predictions come from a fresh classifier loaded from the saved file.
    reloaded = CatBoostClassifier()
    reloaded.load_model(OUTPUT_MODEL_PATH)
    probabilities = reloaded.predict_proba(cloudness_pool)
    np.save(PREDS_PATH, np.array(probabilities))
    return local_canonical_file(PREDS_PATH)
def create_model(self, kfold_X_train, y_train, kfold_X_valid, y_test, test):
    """Fit a ``MultiClassOneVsAll`` CatBoost classifier and return its probabilities.

    ``y_test`` is accepted but not used by this method.

    :return: tuple ``(validation probabilities, test probabilities, fitted model)``
    """
    classifier = CatBoostClassifier(
        loss_function='MultiClassOneVsAll',
        learning_rate=0.07940735491731761,
        depth=8,
    )
    classifier.fit(kfold_X_train, y_train)
    # Predict on the validation fold first, then on the test data.
    valid_proba = classifier.predict_proba(kfold_X_valid)
    test_proba = classifier.predict_proba(test)
    return valid_proba, test_proba, classifier
def test_ignored_features():
    """A model with ``ignored_features=[1, 2, 3]`` must predict differently from an unrestricted one.

    The restricted model is additionally saved and compared with the canonical model.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    eval_pool = Pool(TEST_FILE, column_description=CD_FILE)

    restricted = CatBoostClassifier(iterations=5, random_seed=0, ignored_features=[1, 2, 3])
    unrestricted = CatBoostClassifier(iterations=5, random_seed=0)
    restricted.fit(learn_pool)
    unrestricted.fit(learn_pool)

    restricted_pred = restricted.predict(eval_pool)
    unrestricted_pred = unrestricted.predict(eval_pool)
    assert not _check_data(restricted_pred, unrestricted_pred)

    restricted.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def train_catboost_model(df, target, cat_features, params, verbose=True):
    """Fit a ``CatBoostClassifier`` on a DataFrame with named categorical columns.

    :param df: training features; must be a pandas ``DataFrame``
    :param target: labels passed to ``fit``
    :param cat_features: names of the categorical columns in ``df``
    :param params: keyword arguments for ``CatBoostClassifier``
    :param verbose: forwarded to ``fit``
    :return: the fitted classifier
    :raises TypeError: if ``df`` is not a ``DataFrame`` (``TypeError`` is still
        caught by callers that catch ``Exception``)
    """
    if not isinstance(df, DataFrame):
        raise TypeError('DataFrame object expected, but got ' + repr(df))

    # A single formatted argument prints identically under Python 2 and 3;
    # the former ``print`` statements were a SyntaxError on Python 3.
    print('features: {}'.format(df.columns.tolist()))

    # CatBoost expects positional indices, so translate the column names.
    cat_features_index = [df.columns.get_loc(feature) for feature in cat_features]
    print('cat features: {}'.format(cat_features_index))

    model = CatBoostClassifier(**params)
    model.fit(df, target, cat_features=cat_features_index, verbose=verbose)
    return model
def test_fit_data():
    """Fit from raw data with weights, baseline and an eval pool, then compare with the canonical model.

    A first MultiClass model supplies the raw predictions used as baselines
    for the training data and for the eval pool.
    """
    learn_pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    eval_pool = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)

    base_model = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    base_model.fit(learn_pool)
    train_baseline = np.array(base_model.predict(learn_pool, prediction_type='RawFormulaVal'))
    eval_baseline = np.array(base_model.predict(eval_pool, prediction_type='RawFormulaVal'))
    eval_pool.set_baseline(eval_baseline)

    classifier = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    features = map_cat_features(learn_pool.get_features(), learn_pool.get_cat_feature_indices())
    classifier.fit(
        features,
        learn_pool.get_label(),
        learn_pool.get_cat_feature_indices(),
        sample_weight=np.arange(1, learn_pool.num_row() + 1),
        baseline=train_baseline,
        use_best_model=True,
        eval_set=eval_pool,
    )
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def model_1(X, y, test):
    """Train a multiclass CatBoost model and return class probabilities for ``test``.

    CatBoost handles categorical variables itself, so they are not encoded
    here: every column of ``X`` whose dtype is not float is passed to
    CatBoost as a categorical feature.

    :param X: training features (a DataFrame; ``X.dtypes`` is read)
    :param y: training labels
    :param test: data to predict on
    :return: class-wise prediction probabilities for ``test``
    """
    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy, so the builtin is used directly.
    categorical_features_indices = np.where(X.dtypes != float)[0]

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=0.7, random_state=1234)

    # ``learning`` is not a CatBoost parameter; the step size is ``learning_rate``.
    cboost = CatBoostClassifier(
        iterations=500,
        learning_rate=0.01,
        depth=6,
        loss_function='MultiClass',
        eval_metric='Accuracy',
    )
    cboost.fit(
        X_train,
        y_train,
        cat_features=categorical_features_indices,
        eval_set=(X_validation, y_validation),
        plot=True,
    )

    # Class-wise prediction probabilities of the fitted model.
    pred_prob = cboost.predict_proba(test)
    return pred_prob
class BesCatBoost:
    """CatBoost classifier whose iteration count is chosen with ``cv``.

    Example parameters::

        catboost_params = {
            'iterations': 500,
            'depth': 3,
            'learning_rate': 0.1,
            'eval_metric': 'AUC',
            'random_seed': 42,
            'logging_level': 'Verbose',
            'l2_leaf_reg': 15.0,
            'bagging_temperature': 0.75,
            'allow_writing_files': False,
            'metric_period': 50
        }
    """

    def __init__(self, params, metric='AUC', maximize=True, verbose=True, model=None):
        """Store the configuration; nothing is trained here.

        :param params: CatBoost parameters used for ``cv`` and for the final model
        :param metric: metric name used to pick the ``test-<metric>-mean`` cv column
        :param maximize: whether a larger metric value is better
        :param verbose: stored on the instance (not read by this class)
        :param model: optional pre-built model
        """
        self.params = params
        self.metric = metric
        self.maximize = maximize
        self.verbose = verbose
        self.model = model

    def fit(self, X_train, y_train):
        """Run cross-validation, pick the iteration count and fit the final model."""
        cv_result = cv(Pool(X_train, y_train), self.params)
        scores = cv_result['test-{}-mean'.format(self.metric)]
        # Honour ``maximize``: the best row is the minimum for loss-like metrics.
        best_index = scores.idxmax() if self.maximize else scores.idxmin()
        # The final model is trained on all data, so allow 1.5x the best cv round.
        best_rounds = int(best_index * 1.5) + 1
        print('Best Iteration: {}'.format(best_rounds))
        # Build a new dict so the one handed in by the caller is not modified.
        self.params = dict(self.params, iterations=best_rounds)
        self.model = CatBoostClassifier(**self.params)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the probability of the last class for every row of ``X_test``."""
        pred_prob = self.model.predict_proba(X_test)[:, -1]
        return pred_prob

    def feature_importance(self):
        """Not implemented; returns ``None``."""
        pass

    @staticmethod
    def find_best_params(kag):
        """Not implemented; returns ``None``."""
        pass
def test_custom_objective():
    """A Python Logloss objective must give the same raw predictions as the built-in ``Logloss``."""

    class LoglossObjective(object):
        def calc_ders_range(self, approxes, targets, weights):
            """Return a ``(der1, der2)`` pair for every object."""
            assert len(approxes) == len(targets)
            if weights is not None:
                assert len(weights) == len(approxes)

            # ``range`` instead of ``xrange`` so the test also runs on Python 3.
            exponents = [math.exp(approxes[index]) for index in range(len(approxes))]

            result = []
            for index in range(len(targets)):
                p = exponents[index] / (1 + exponents[index])
                der1 = (1 - p) if targets[index] > 0.0 else -p
                der2 = -p * (1 - p)

                if weights is not None:
                    der1 *= weights[index]
                    der2 *= weights[index]

                result.append((der1, der2))

            return result

    train_pool = Pool(data=TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(data=TEST_FILE, column_description=CD_FILE)

    model = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True,
                               loss_function=LoglossObjective(), eval_metric="Logloss",
                               # Leaf estimation method and gradient iteration are set to match
                               # defaults for Logloss.
                               leaf_estimation_method="Newton", leaf_estimation_iterations=10)
    model.fit(train_pool, eval_set=test_pool)
    pred1 = model.predict(test_pool, prediction_type='RawFormulaVal')

    model2 = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True, loss_function="Logloss")
    model2.fit(train_pool, eval_set=test_pool)
    pred2 = model2.predict(test_pool, prediction_type='RawFormulaVal')

    for p1, p2 in zip(pred1, pred2):
        assert abs(p1 - p2) < EPS
def test_custom_eval():
    """A Python Logloss eval metric must lead to the same predictions as the built-in ``Logloss`` metric."""

    class LoglossMetric(object):
        def get_final_error(self, error, weight):
            # The tiny constant avoids a division by zero when the weight sum is 0.
            return error / (weight + 1e-38)

        def is_max_optimal(self):
            return True

        def evaluate(self, approxes, target, weight):
            """Return ``(weighted error sum, weight sum)`` for a single approx dimension."""
            assert len(approxes) == 1
            assert len(target) == len(approxes[0])

            approx = approxes[0]

            error_sum = 0.0
            weight_sum = 0.0

            # ``range`` instead of ``xrange`` so the test also runs on Python 3.
            for i in range(len(approx)):
                w = 1.0 if weight is None else weight[i]
                weight_sum += w
                error_sum += w * (target[i] * approx[i] - math.log(1 + math.exp(approx[i])))

            return error_sum, weight_sum

    train_pool = Pool(data=TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(data=TEST_FILE, column_description=CD_FILE)

    model = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True, eval_metric=LoglossMetric())
    model.fit(train_pool, eval_set=test_pool)
    pred1 = model.predict(test_pool)

    model2 = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True, eval_metric="Logloss")
    model2.fit(train_pool, eval_set=test_pool)
    pred2 = model2.predict(test_pool)

    for p1, p2 in zip(pred1, pred2):
        assert abs(p1 - p2) < EPS
def test_fit_no_label():
    """Fitting on features without any label must raise CatboostError."""
    # Load the data outside the ``raises`` block so a loading error cannot be
    # mistaken for the expected failure.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    features = pool.get_features()
    model = CatBoostClassifier()
    with pytest.raises(CatboostError):
        model.fit(features)
class CatBoostClassifierCV(object):
    """Cross-validated CatBoost classifier that collects out-of-fold predictions.

    After ``fit`` the instance exposes:

    * ``oof_train`` - out-of-fold probability of class 1 for every training row
      (averaged over repeats when ``n_repeats`` is set),
    * ``oof_test`` - mean over all folds of the class 1 probability for ``X_test``,
    * ``oof_test_rank`` - mean over all folds of the per-fold rank, divided by
      the number of test rows.
    """

    def __init__(self, params=None, cv=5, random_state=None, n_repeats=None):
        """
        :param params: optional CatBoost parameters applied with ``set_params``
        :param cv: number of folds
        :param random_state: seed for the fold splitter
        :param n_repeats: when truthy, use repeated stratified k-fold with this many repeats
        """
        self.clf = CatBoostClassifier()
        if params:
            self.clf.set_params(**params)

        if n_repeats:
            # Keyword arguments are required here: the second positional
            # parameter of RepeatedStratifiedKFold is ``n_repeats`` (not
            # ``shuffle``), so the former positional ``True`` meant one repeat.
            self._kf = RepeatedStratifiedKFold(n_splits=cv, n_repeats=n_repeats, random_state=random_state)
            self._n_repeats = n_repeats
        else:
            self._kf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
            self._n_repeats = 1
        self._num_preds = cv * self._n_repeats

    def fit(self, X, y, X_test, feval=roc_auc_score, cat_features=None, sample_weight=None, verbose=100,
            early_stopping_rounds=100, plot=False, silent=None, logging_level=None, column_description=None,
            save_snapshot=None, snapshot_file='/fds/data' if cloudml else None, snapshot_interval=None,
            init_model=None):
        """Train one model per fold and collect out-of-fold predictions.

        ``X`` and ``y`` are indexed with integer index arrays, so they are
        expected to be arrays. All arguments after ``sample_weight`` are
        forwarded to ``CatBoostClassifier.fit``.

        :param feval: scoring function called as ``feval(y, oof_train)``; falsy to skip scoring
        :param sample_weight: optional per-row weights aligned with ``X``
        :return: the score from ``feval``, or ``None`` when ``feval`` is falsy
        """
        self.oof_train = np.zeros(len(X))
        self.oof_test = np.zeros((len(X_test), self._num_preds))

        for n_fold, (train_index, valid_index) in enumerate(self._kf.split(X, y)):
            if verbose:
                print("\033[94mFold %s started at %s\033[0m" % (n_fold + 1, time.ctime()))
            X_train, y_train = X[train_index], y[train_index]
            X_valid, y_valid = X[valid_index], y[valid_index]
            eval_set = [(X_train, y_train), (X_valid, y_valid)]

            # The weights must be restricted to the rows of this fold, otherwise
            # their length does not match X_train.
            if sample_weight is None:
                fold_weight = None
            else:
                fold_weight = np.asarray(sample_weight)[train_index]

            self.clf.fit(X_train, y_train,
                         cat_features=cat_features,
                         sample_weight=fold_weight,
                         use_best_model=True,
                         eval_set=eval_set,
                         verbose=verbose,
                         logging_level=logging_level,
                         plot=plot,
                         column_description=column_description,
                         silent=silent,
                         early_stopping_rounds=early_stopping_rounds,
                         save_snapshot=save_snapshot,
                         snapshot_file=snapshot_file,
                         snapshot_interval=snapshot_interval,
                         init_model=init_model)

            # With repeats every row is validated once per repeat: average the
            # predictions instead of keeping only the last repeat.
            self.oof_train[valid_index] += self.clf.predict_proba(X_valid)[:, 1] / self._n_repeats
            self.oof_test[:, n_fold] = self.clf.predict_proba(X_test)[:, 1]

        # Test-set out-of-fold outputs.
        self.oof_test_rank = pd.DataFrame(self.oof_test).rank().mean(1) / len(self.oof_test)
        self.oof_test = self.oof_test.mean(1)

        # Score of the training-set out-of-fold predictions.
        if feval:
            score = feval(y, self.oof_train)
            print(f"\n\033[94mCV Score: {score} ended at {time.ctime()}\033[0m")
            return score

    def oof_save(self, file='./oof_train_and_test.csv'):
        """Write ``oof_train`` followed by ``oof_test`` as a single-column CSV."""
        assert isinstance(file, str)
        stacked = np.append(self.oof_train, self.oof_test)
        # ``columns`` must be list-like; a bare string is rejected by pandas.
        pd.DataFrame(stacked, columns=['oof_train_and_test']).to_csv(file, index=False)
def test_invalid_loss_classifier():
    """An unknown ``loss_function`` must raise CatboostError."""
    # Load the data outside the ``raises`` block so a loading error cannot be
    # mistaken for the expected failure.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    # Construction stays inside the block together with ``fit``: either call
    # may be the one that rejects the parameter.
    with pytest.raises(CatboostError):
        model = CatBoostClassifier(loss_function="abcdef")
        model.fit(pool)
#10) border_count: The number of splits for numerical features. Allowed values are integers from 1 to 255 inclusive.
#11) ctr_border_count: The number of splits for categorical features. Allowed values are integers from 1 to 255 inclusive.
#12) leaf_estimation_method: The method used to calculate the values in leaves. Possible values: i) Newton ii) Gradient
#13) gradient_iterations: The number of gradient steps when calculating the values in leaves.
#14) priors: NEED TO EXPLORE; Use the specified priors during training. Format: <prior 1>:<prior 2>:...:<prior N>. For example: -2:0:0.5:10
#15) feature_priors: NEED TO EXPLORE; Specify individual priors for categorical features (used at the "Transforming categorical features to numerical features" stage). Given in the form of a comma-separated list of prior descriptions for each specified feature. The description for each feature contains a colon-separated feature index and prior values. Format: <ID of feature 1>:<prior 1.1>:<prior 1.2>:...:<prior 1.N1>,...,<ID of feature M>:<prior M.1>:<prior M.2>:...:<prior M.NM>
#16) fold_permutation_block_size: Objects in the dataset are grouped in blocks before the random permutations. This parameter defines the size of the blocks. The smaller the value, the slower the training. Large values may result in quality degradation.
#17) has_time: Use the order of objects in the input data (do not perform random permutations during the "Transforming categorical features to numerical features" and "Choosing the tree structure" stages).
#18) fold_len_multiplier: Coefficient for changing the length of folds. The value must be greater than 1. The best validation result is achieved with minimum values. With values close to 1 (for example, 1 + epsilon), each iteration takes a quadratic amount of memory and time for the number of objects in the iteration. Thus, low values are possible only when there is a small number of objects.
#For Binary Classification #model = CatBoostClassifier(iterations=2, learning_rate=1, depth=2, loss_function='Logloss') model.fit(X_train[features], X_train['noOfLanes_encoded'], cat_features=cat_cols, eval_set=(X_valid[features], X_valid['noOfLanes_encoded']), use_best_model=True) pred = model.predict(X_test[features]) pred_ans = list(pred[:, 0]) #To get probability of predictions #pred = model.predict_proba(X_test[features])[:,1] #To get raw #pred = model.predict(X_test[features],prediction_type='RawFormulaVal') ####OTHER NOTE: Regression using CatBoost #from catboost import CatBoostRegressor #model = CatBoostRegressor(iterations=2, learning_rate=1, depth=2) # Fit model #model.fit(train_data, train_labels, cat_features)
def run(scheme_num=1, file_name="../data/data_v3/training_e"):
    """Train a CatBoost classifier on PCA-reduced features and predict on the test file.

    Steps: load the per-day training CSVs selected by ``scheme_num``, drop
    duplicate rows, remove near-constant features, min-max scale the rest,
    reduce them with a whitened PCA, train CatBoost, print classification
    reports for the validation and training data, then scale the test file
    and hand it to ``predict``.

    :param scheme_num: 1, 2 or 3; selects which day files are loaded. For any
        other value nothing is loaded and ``pd.concat`` raises ``ValueError``.
    :param file_name: common path prefix of the CSV files. The test file name
        is derived by replacing "training" with "testing".
    """
    # Day suffixes of the training CSVs loaded for each scheme.
    scheme_days = {
        1: [16, 17, 22, 23],
        2: [16, 23],
        3: [17, 18, 19, 20, 21, 22, 23],
    }

    train_set_ls = []
    for i in scheme_days.get(scheme_num, []):
        print("begin to load the dataset")
        day_file_name = file_name + "ld1-" + str(i) + ".csv"
        train_set_temp = pd.read_csv(day_file_name, header=0, index_col=None)
        print(train_set_temp.describe())
        train_set_ls.append(train_set_temp)

    # NOTE(review): day 23 is the validation file and is also part of the
    # training data of every scheme, so the validation report is not an
    # independent estimate.
    val_file_name = file_name + "ld1-23.csv"
    val_set = pd.read_csv(val_file_name, header=0, index_col=None)
    print(val_set.describe())

    train_set = pd.concat(train_set_ls, axis=0)
    print(train_set.describe())

    # Keep the file's column order (a set difference would give an arbitrary order).
    keep_feature = [col for col in train_set.columns.values.tolist() if col not in ("user_id", "label")]

    print("begin to drop the duplicates")
    train_set.drop_duplicates(subset=keep_feature, inplace=True)
    val_set.drop_duplicates(subset=keep_feature, inplace=True)
    print(train_set.describe())
    print(val_set.describe())

    train_label = train_set["label"]
    val_label = val_set["label"]
    train_set = train_set.drop(labels=["label", "user_id"], axis=1)
    val_set = val_set.drop(labels=["label", "user_id"], axis=1)

    print("begin to standardization the data")
    for fea in keep_feature:
        if train_set[fea].var() < 0.000001 or val_set[fea].var() < 0.000001:
            # Near-constant in either set: drop it from both so the columns stay aligned.
            train_set.drop(labels=[fea], axis=1, inplace=True)
            val_set.drop(labels=[fea], axis=1, inplace=True)
        else:
            # Min-max scaling, done separately for each data set.
            train_set[fea] = (train_set[fea] - train_set[fea].min()) / (train_set[fea].max() - train_set[fea].min())
            val_set[fea] = (val_set[fea] - val_set[fea].min()) / (val_set[fea].max() - val_set[fea].min())

    # Features that survived the variance filter.
    keep_feature = [col for col in train_set.columns.values.tolist() if col not in ("user_id", "label")]

    # Keep as many components as needed to explain 99% of the variance.
    kpca = PCA(n_components=0.99, whiten=True)
    kpca.fit(train_set.values)
    # From here on train_set and val_set are the arrays returned by transform.
    train_set = kpca.transform(train_set.values)
    val_set = kpca.transform(val_set.values)
    print("number of components {}".format(kpca.n_components_))
    print("noise variance {}".format(kpca.noise_variance_))
    print("the explained variance {}".format(kpca.explained_variance_))
    print("the explained variance ratio {}".format(kpca.explained_variance_ratio_))

    print("begin to make prediction with plain features and without tuning parameters")
    n_estimator, depth = 500, 6
    print({"n_estimators": n_estimator, "depth": depth})
    clf1 = CatBoostClassifier(iterations=n_estimator, depth=depth, verbose=2)
    clf1.fit(train_set, train_label.values)

    print("begin to make classification report for the validation dataset")
    yhat = clf1.predict(val_set)
    print(classification_report(y_pred=yhat, y_true=val_label.values, digits=4))

    print("begin to make classification report for the training dataset")
    yhat = clf1.predict(train_set)
    print(classification_report(y_pred=yhat, y_true=train_label.values, digits=4))

    print("load the test dataset")
    test_file_name = file_name.replace("training", "testing") + "ld1-30.csv"
    test_set = pd.read_csv(test_file_name, header=0, index_col=None, usecols=keep_feature + ["user_id"])
    for fea in keep_feature:
        test_set[fea] = (test_set[fea] - test_set[fea].min()) / (test_set[fea].max() - test_set[fea].min())

    print("begin to make prediction")
    # Tag built from the last character of the prefix, the scheme and the model settings.
    param = file_name[-1] + str(scheme_num) + "_" + str(n_estimator) + "_" + str(depth)
    print(param)
    predict(clf1, test_set, param, kpca)
def test_wrong_ctr_for_classification():
    """This ``ctr_description`` must be rejected by a classifier with CatboostError."""
    # Load the data outside the ``raises`` block so a loading error cannot be
    # mistaken for the expected failure.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    # Construction stays inside the block together with ``fit``: either call
    # may be the one that rejects the description.
    with pytest.raises(CatboostError):
        model = CatBoostClassifier(ctr_description=['Borders:TargetBorderCount=5:TargetBorderType=Uniform'])
        model.fit(pool)
# Read the feather files located at path_train_mod_std / path_test_mod_std.
train_mod_std = feather.read_dataframe(path_train_mod_std)
test_mod_std = feather.read_dataframe(path_test_mod_std)
# Every column of `train` except the identifier and the target is a feature.
features = [c for c in train.columns if c not in ['ID_code', 'target']]
target = train['target']
target_std = train_std['target']
target_mod = train_mod['target']
target_mod_std = train_mod_std['target']
print("data install complete")

# feature importances -----------------------------------------------------------------
print("feature importances -------------")
model = CatBoostClassifier(random_state=0)
# NOTE(review): the features come from train_mod but the label is `target`
# (taken from `train`), not `target_mod` - confirm this is intended.
model.fit(train_mod[features], target)
# NOTE(review): these two list values are overwritten by the DataFrames
# built on the next two lines.
importances = list(model.feature_importances_)
columns = list(train_mod[features].columns)
importances = pd.DataFrame(model.feature_importances_, columns=["importances"])
columns = pd.DataFrame(train_mod[features].columns, columns=["variable"])
# One row per feature: its name next to its importance.
data = pd.concat([columns, importances], axis=1)
sort_data = data.sort_values(by="importances", ascending=False).reset_index(drop=True)
# Show the 15 most important features (same sort as sort_data, recomputed).
print( data.sort_values(by="importances", ascending=False).reset_index(drop=True).head(15))
# Step through feature counts 50, 100, ... below the number of features.
for i in np.arange(50, train_mod[features].shape[1], 50):
    print("sum of importances by highest {} features: {}".format(
def test_feature_importance_off():
    """With ``calc_feature_importance=False`` the importance workflow must raise CatboostError."""
    # Load the data outside the ``raises`` block so a loading error cannot be
    # mistaken for the expected failure.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    with pytest.raises(CatboostError):
        model = CatBoostClassifier(iterations=5, random_seed=0, calc_feature_importance=False)
        model.fit(pool)
        model.feature_importances_
class CatboostBaseline(BaseBaseline):
    """CatBoost classifier baseline built on ``BaseBaseline``."""

    def __init__(self):
        super(CatboostBaseline, self).__init__(name="catboost")

    def fit(self, X_train, y_train, X_val, y_val, categoricals=None):
        """Train the classifier and return accuracy metrics.

        Columns that are entirely NaN in ``X_train`` are dropped from both
        data sets and the remaining NaNs are replaced via ``np.nan_to_num``.

        :return: dict with ``val_preds``, ``labels``, ``train_acc``,
            ``train_balanced_acc``, ``val_acc`` and ``val_balanced_acc``
        """
        results = {}

        # Remember the all-NaN columns so ``predict`` can drop the same ones.
        self.all_nan = np.all(np.isnan(X_train), axis=0)
        X_train = X_train[:, ~self.all_nan]
        X_val = X_val[:, ~self.all_nan]

        X_train = np.nan_to_num(X_train)
        X_val = np.nan_to_num(X_val)

        # NOTE(review): the ``categoricals`` argument is overwritten here, so
        # a value passed by the caller is ignored. Behaviour kept as is;
        # honouring it would also need index remapping after the column drop.
        categoricals = [
            ind for ind in range(X_train.shape[1])
            if isinstance(X_train[0, ind], str)
        ]

        # Patience is 150 rounds above 10000 rows and is scaled up for
        # smaller data sets, never going below 10.
        if X_train.shape[0] > 10000:
            early_stopping = 150
        else:
            early_stopping = max(round(150 * 10000 / X_train.shape[0]), 10)

        X_train_pooled = Pool(data=X_train, label=y_train, cat_features=categoricals)
        X_val_pooled = Pool(data=X_val, label=y_val, cat_features=categoricals)

        self.model = CatBoostClassifier(**self.config)
        self.model.fit(X_train_pooled,
                       eval_set=X_val_pooled,
                       use_best_model=True,
                       early_stopping_rounds=early_stopping)

        pred_train = self.model.predict_proba(X_train)
        pred_val = self.model.predict_proba(X_val)

        results["val_preds"] = pred_val.tolist()
        results["labels"] = y_val.tolist()

        try:
            pred_train = np.argmax(pred_train, axis=1)
            pred_val = np.argmax(pred_val, axis=1)
        except (ValueError, IndexError):
            # NumPy's AxisError derives from both; it is raised when the
            # predictions have no class axis. Other errors are not swallowed.
            print("==> No probabilities provided in predictions")

        results["train_acc"] = metrics.accuracy_score(y_train, pred_train)
        results["train_balanced_acc"] = metrics.balanced_accuracy_score(y_train, pred_train)
        results["val_acc"] = metrics.accuracy_score(y_val, pred_val)
        results["val_balanced_acc"] = metrics.balanced_accuracy_score(y_val, pred_val)

        return results

    def score(self, X_test, y_test):
        """Return ``test_acc`` and ``test_balanced_acc`` for the given test data."""
        results = {}
        y_pred = self.predict(X_test)
        results["test_acc"] = metrics.accuracy_score(y_test, y_pred)
        results["test_balanced_acc"] = metrics.balanced_accuracy_score(y_test, y_pred)
        return results

    def predict(self, X_test, predict_proba=False):
        """Predict labels, or probabilities when ``predict_proba`` is true.

        The same preprocessing as in ``fit`` is applied, so ``fit`` must have
        been called first (``self.all_nan`` and ``self.model`` are read).
        """
        X_test = X_test[:, ~self.all_nan]
        X_test = np.nan_to_num(X_test)
        if predict_proba:
            return self.model.predict_proba(X_test)
        y_pred = self.model.predict(X_test)
        return y_pred
def test_one_doc_feature_importance():
    """Save the ``fstr_type='Doc'`` importance of an all-ones document and compare with the canonical file."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(learn_pool)
    importance = classifier.get_feature_importance(
        np.ones(learn_pool.num_col(), dtype=int),
        0,
        cat_features=learn_pool.get_cat_feature_indices(),
        fstr_type='Doc',
    )
    np.save(FIMP_PATH, np.array(importance))
    return local_canonical_file(FIMP_PATH)
def test_interaction_feature_importance():
    """Save the ``fstr_type='Interaction'`` importance and compare it with the canonical file."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(learn_pool)
    interaction = classifier.get_feature_importance(learn_pool, fstr_type='Interaction')
    np.save(FIMP_PATH, np.array(interaction))
    return local_canonical_file(FIMP_PATH)
def test_classification_ctr():
    """Train with ``ctr_description=['Borders', 'Counter']`` and compare with the canonical model."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    ctr_types = ['Borders', 'Counter']
    classifier = CatBoostClassifier(iterations=5, random_seed=0, ctr_description=ctr_types)
    classifier.fit(learn_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_class_weights():
    """Train with ``class_weights=[1, 2]`` and compare the saved model with the canonical one."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    weights_per_class = [1, 2]
    classifier = CatBoostClassifier(iterations=5, random_seed=0, class_weights=weights_per_class)
    classifier.fit(learn_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_feature_importance():
    """Save ``feature_importances_`` of a fitted model and compare it with the canonical file."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(learn_pool)
    importance = np.array(classifier.feature_importances_)
    np.save(FIMP_PATH, importance)
    return local_canonical_file(FIMP_PATH)
# Logloss classifier; AUC is both the custom loss and the eval metric.
model = CatBoostClassifier(iterations=500, depth=10, learning_rate=0.15, one_hot_max_size=31, loss_function='Logloss', logging_level='Verbose', custom_loss='AUC', eval_metric='AUC', rsm=0.78, od_wait=150, metric_period=400, l2_leaf_reg=9, random_seed=967)
model.fit(X_train, y_train, plot=True, cat_features=categorical_features_indices)
import matplotlib.pyplot as plt
fea_ = model.feature_importances_
# Feature importance plot: one horizontal bar per feature name.
fea_name = model.feature_names_
plt.figure(figsize=(10, 10))
plt.barh(fea_name, fea_, height=0.5)
# AUC-ROC curve / FPR-TPR curve
from catboost.utils import get_roc_curve
import sklearn
from sklearn import metrics
from catboost import Pool
# Pools for the test and training data, built with the same categorical indices.
eval_pool = Pool(X_test, y_test, cat_features=categorical_features_indices)
eval_train_pool = Pool(X_train, y_train,
# Training script: reads the training file named by the first command-line
# argument, fits a CatBoostClassifier with default parameters and saves it
# to the file 'model'.
import sys
from lib.utils import read_training_file
from catboost import Pool, CatBoostClassifier

# Path of the training file; raises IndexError when no argument is given.
TRAIN_FILE_PATH = sys.argv[1]
print('Training File Path: {}'.format(TRAIN_FILE_PATH))

# read_training_file is a project helper; it returns the features and labels.
data, label = read_training_file(TRAIN_FILE_PATH)
train_pool = Pool(data, label)

model = CatBoostClassifier()
model.fit(train_pool)
# Saved under the name 'model', relative to the current working directory.
model.save_model('model')
def test_priors():
    """Train with explicit priors for the Borders and Counter CTRs and compare with the canonical model."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    ctr_with_priors = [
        "Borders:Prior=0:Prior=0.6:Prior=1:Prior=5",
        "Counter:Prior=0:Prior=0.6:Prior=1:Prior=5",
    ]
    classifier = CatBoostClassifier(
        iterations=5,
        random_seed=0,
        has_time=True,
        ctr_description=ctr_with_priors,
    )
    classifier.fit(learn_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
random_state=42) x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=42) model = CatBoostClassifier( iterations=500, depth=10, learning_rate=0.1, loss_function='MultiClass', # bagging_temperature=2, # l2_leaf_reg=4) ) model.fit(pd.DataFrame(x_train), pd.DataFrame(y_train), eval_set=(pd.DataFrame(x_val), pd.DataFrame(y_val)), plot=True) features = [ "Name_length", "Name_frequency", "Named/un-named", "Year", "Month", "Day", "Hour", "Day_of_week" "Male/Female", "Neutered/Intact", "Age", "Breed1",
# NOTE(review): the statements down to gc.collect() use per-fold names
# (train_idx, valid_idx, n_fold), so they presumably sit inside a fold loop
# whose header is outside this view - confirm the indentation against the
# full file.
print("Train Index:",train_idx,",Val Index:",valid_idx)
clf = CatBoostClassifier(
    iterations=7500,
    learning_rate=0.02,
    depth=6,
    bootstrap_type='Bernoulli',
    l2_leaf_reg=50,
    #loss_function='auc',
    eval_metric='AUC',
    verbose=True,)
train_pool = Pool(train_x, train_y)
validate_pool = Pool(valid_x, valid_y)
clf.fit(train_pool, use_best_model=True, eval_set=validate_pool)
# Out-of-fold probability of class 1 for the validation rows of this fold.
oof_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]
# Each fold contributes 1/n_splits of its test prediction, so sub_preds ends
# up as the mean over folds.
sub_preds += clf.predict_proba(test_df[feats])[:, 1] / folds.n_splits
print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
# Drop the per-fold objects before the next fold.
del clf, train_x, train_y, valid_x, valid_y
gc.collect()
# AUC of the out-of-fold predictions over the whole training frame.
print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
# Frames pairing each SK_ID_CURR with its predicted TARGET.
oof = pd.DataFrame({"SK_ID_CURR":train_df["SK_ID_CURR"], "TARGET":oof_preds})
preds = pd.DataFrame({"SK_ID_CURR":test_df["SK_ID_CURR"], "TARGET":sub_preds})
def train_model_classification(X, y, params, groups, folds, model_type='lgb', eval_metric='auc',
                               columns=None, plot_feature_importance=False, model=None,
                               verbose=10000, early_stopping_rounds=200, n_estimators=50000,
                               weight=None, seed='no'):
    """
    Train one classification model per fold and collect out-of-fold predictions.

    After every fold the list of models trained so far is pickled to
    ``./models/models_{model_type}_{seed}.pickle``.

    :params: X - training data, a pd.DataFrame (its columns are read)
    :params: y - target
    :params: params - hyper-parameters forwarded to the LightGBM / XGBoost / CatBoost model
    :params: groups - None for ``folds.split(X)``, the string 'stra' for
        ``folds.split(X, y)``, anything else is passed as ``groups=`` to ``folds.split``
    :params: folds - folds to split data
    :params: model_type - type of model to use: 'lgb', 'xgb', 'sklearn' or 'cat'
    :params: eval_metric - metric to use; only 'auc' is defined in ``metrics_dict``
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    :params: verbose - verbosity value forwarded to the boosting library
    :params: early_stopping_rounds - early-stopping patience forwarded to the boosting library
    :params: n_estimators - maximum number of boosting rounds
    :params: weight - optional per-sample weights, indexed with the fold's train indices;
        used only by the 'lgb' model type
    :params: seed - label used in the pickle file name
    :returns: dict with 'oof', 'scores', 'models' and, for 'lgb' with
        plot_feature_importance, 'feature_importance'
    """
    columns = X.columns if columns is None else columns
    models = []
    metrics_dict = {
        'auc': {
            'lgb_metric_name': 'auc',
            'catboost_metric_name': 'AUC',
            'sklearn_scoring_function': metrics.roc_auc_score,
        },
    }
    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))
    scores = []
    train_loss = []
    feature_importance = pd.DataFrame()

    if groups is None:
        splits = folds.split(X)
    elif isinstance(groups, str) and groups == 'stra':
        # isinstance guard: comparing an array of group labels to a string is
        # element-wise and has no single truth value.
        splits = folds.split(X, y)
    else:
        splits = folds.split(X, groups=groups)
        print('no')

    for fold_n, (train_index, valid_index) in enumerate(splits):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if isinstance(X, np.ndarray):
            # NOTE(review): X[columns][...] indexes rows twice for an ndarray and
            # X.columns is read above; this branch looks unreachable in practice.
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        # weight defaults to None; indexing it unconditionally raised TypeError.
        weight_train = None if weight is None else weight[train_index]

        if model_type == 'lgb':
            model = lgb.LGBMClassifier(**params, n_estimators=n_estimators, n_jobs=-1)
            model.fit(X_train, y_train,
                      sample_weight=weight_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred_train = model.predict_proba(X_train)[:, 1]
            models.append(model)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)
            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data,
                              num_boost_round=n_estimators,
                              evals=watchlist,
                              early_stopping_rounds=early_stopping_rounds,
                              verbose_eval=verbose,
                              params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred_train = model.predict(xgb.DMatrix(X_train, feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            models.append(model)

        if model_type == 'sklearn':
            model.fit(X_train, y_train)
            y_pred_valid = model.predict(X_valid).reshape(-1, )
            # The train prediction was never computed here, so the train-loss
            # bookkeeping below failed with NameError for sklearn models.
            y_pred_train = model.predict(X_train).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')
            models.append(model)

        if model_type == 'cat':
            model = CatBoostClassifier(
                iterations=n_estimators,
                eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                **params,
            )
            model.fit(X_train, y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[],
                      use_best_model=True,
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred_train = model.predict_proba(X_train)[:, 1]
            models.append(model)

        oof[valid_index] = y_pred_valid.reshape(-1, )
        if eval_metric != 'group_mae':
            scores.append(
                metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
            train_loss.append(
                metrics_dict[eval_metric]['sklearn_scoring_function'](y_train, y_pred_train))
        else:
            # NOTE(review): metrics_dict has no 'group_mae' entry, so this branch
            # raises KeyError if it is ever reached.
            scores.append(metrics_dict[eval_metric]['scoring_function'](
                y_valid, y_pred_valid, X_valid['type']))

        # Checkpoint every model trained so far; the file is rewritten each fold.
        with open(f'./models/models_{model_type}_{seed}.pickle', 'wb') as handle:
            pickle.dump(models, handle, protocol=pickle.HIGHEST_PROTOCOL)
        gc.collect()

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["shap_values"] = abs(
                shap.TreeExplainer(model).shap_values(X_valid)[:, :len(columns)]
            ).mean(axis=0).T
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    print('Train loss mean: {0:.6f}, std: {1:.6f}.'.format(
        np.mean(train_loss), np.std(train_loss)))
    print('CV mean score: {0:.6f}, std: {1:.6f}.'.format(
        np.mean(scores), np.std(scores)))

    result_dict['oof'] = oof
    result_dict['scores'] = scores
    result_dict['models'] = models

    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= folds.n_splits
            # Keep the 50 features with the highest mean importance for the plot.
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index
            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance", y="feature",
                        data=best_features.sort_values(by="importance", ascending=False))
            plt.title('LGB Features (avg over folds)')

            result_dict['feature_importance'] = feature_importance

    return result_dict
od_type="Iter", od_wait=200, task_type="GPU", devices="0", cat_features=[x for x in range(len(cb_cat_features))], bagging_temperature=1.288692494969795, grow_policy="Depthwise", l2_leaf_reg=9.847870133539244, learning_rate=0.01877982653902465, max_depth=8, min_data_in_leaf=1, penalties_coefficient=2.1176668909602734, ) cb_model.fit( cb_x_train, y_train, eval_set=[(cb_x_valid, y_valid)], verbose=0, ) train_oof_preds = cb_model.predict_proba(cb_x_valid)[:,1] test_oof_preds = cb_model.predict_proba(test[cb_features])[:,1] cb_train_preds[test_index] = train_oof_preds cb_test_preds += test_oof_preds / n_folds print(": CB - ROC AUC Score = {}".format(roc_auc_score(y_valid, train_oof_preds, average="micro"))) ridge_model = CalibratedClassifierCV( RidgeClassifier(random_state=random_state), cv=3, ) ridge_model.fit( ridge_x_train,
# Replace missing test values with the sentinel -999.
X_test = X_test.fillna(-999)
# Sanity-check the expected dataset dimensions before training.
assert X_train.shape == (590540, 432)
assert X_test.shape == (506691, 432)
# Release the raw input frames.
del train, test_identity, test_transaction, train_identity, train_transaction

#%% Create catboost data pool
# CatBoost encodes categorical features itself; pass the object-dtype column names.
CAT_FEATURES = list(X_train.select_dtypes("object").columns)
train_dataset = Pool(data=X_train, label=Y_train, cat_features=CAT_FEATURES)
test_dataset = Pool(data=X_test, cat_features=CAT_FEATURES)

#%% train model
model = CatBoostClassifier(iterations=1000, task_type="GPU")
model.fit(train_dataset, verbose=True)

#%% save model and predict
# Create the output directory on first run.
if not os.path.exists("./result/"):
    os.makedirs("./result/")
model.save_model("./result/model_catboost.json", format="json")
# Column 1 of predict_proba is stored as the isFraud score.
test_pred = model.predict_proba(test_dataset, verbose=True)[:, 1]
ss["isFraud"] = test_pred
ss.to_csv("./result/submit_catboost.csv")

#%% feature importance
import matplotlib.pylab as plt
fi = pd.DataFrame(index=model.feature_names_)
fi['importance'] = model.feature_importances_
# Plot only features whose importance exceeds 0.1 (the call continues past this view).
fi.loc[fi['importance'] > 0.1].sort_values('importance').plot(
def test_no_eval_set():
    """Expect CatboostError when fitting with use_best_model=True and no eval_set."""
    with pytest.raises(CatboostError):
        train_data = Pool(TRAIN_FILE, column_description=CD_FILE)
        classifier = CatBoostClassifier()
        classifier.fit(train_data, use_best_model=True)
continue X.append((dot.log, dot.lat, log(dot.trans_ts - b, a), log(dot.request_ts - b, c))) y.append(dot.label) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) train_p = Pool(X, y) test_p = Pool(X_test, y_test) decision = CatBoostClassifier(iterations=35, learning_rate=1, depth=10, loss_function='MultiClass', custom_metric='MultiClassOneVsAll', best_model_min_trees=10000) decision.fit(train_p) print('Accuracy: \n', decision.score(test_p)) pred = decision.predict(TEST) print(decision.feature_importances_) plt.bar(np.arange(len(decision.feature_importances_)), decision.feature_importances_, color='black') plt.show() with open("answerboost2.txt", 'w') as f: for item in pred: f.write(f"{int(item)}\n")
axis=1) y2 = forcatboost2['churn'] X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=.3, random_state=0) # In[154]: categorical_features_indices2 = np.where(X2.dtypes != np.float)[0] # In[155]: model2 = CatBoostClassifier() model2.fit(X_train2, y_train2, cat_features=categorical_features_indices2, eval_set=(X_test2, y_test2)) # In[156]: print('Accuracy of CatBoost classifier on training set: {:.2f}'.format( model2.score(X_train2, y_train2))) print('Accuracy of CatBoost classifier on test set: {:.2f}'.format( model2.score(X_test2, y_test2))) # In[157]: model2.get_feature_importance() # In[158]:
def train_fold(self, data, fold_idx):
    """Fit a CatBoostClassifier on one fold and return the fitted model.

    ``data`` is unpacked as (x_train, y_train, x_val, y_val). CatBoost's stdout
    is parsed by kts while fitting and forwarded to a LoggerCallback built from
    ``self.logger`` and tagged with ``fold_idx``.
    """
    x_train, y_train, x_val, y_val = data
    classifier = CatBoostClassifier(**hparams.catboost)
    stdout_parser = kts.parse_stdout(
        kts.patterns.catboost,
        kts.LoggerCallback(logger=self.logger, FOLD=fold_idx),
    )
    with stdout_parser:
        classifier.fit(x_train, y_train, eval_set=[(x_val, y_val)])
    return classifier
def model(df4, df_val, dfk):
    """
    Train a randomly chosen classifier (XGBoost or CatBoost) to predict 'set_clicked'.

    df4 is split 70/30 into train and test; df_val is used for the reported
    metrics; dfk receives the model's predictions.

    Side effects: prints diagnostics, shows a confusion-matrix plot, and
    overwrites ``dfk['set_clicked']`` in place.

    :param df4: training frame containing a 'set_clicked' column
    :param df_val: validation frame containing a 'set_clicked' column
    :param dfk: frame to score; must contain a 'set_clicked' column to drop
    :return: tuple of
        (accuracy on df_val, f1 on df_val, whether accuracy beats the all-zeros
        baseline, training columns, fitted model, dfk with predictions,
        list of 10 accuracies on shuffled 70% subsamples of df_val,
        all-zeros baseline accuracy, predictions on df_val, model selector 1/2)
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        df4.drop(columns=['set_clicked']),
        df4['set_clicked'],
        test_size=0.30,
    )
    print('Ones in train :', Y_train.sum(), 'Ones in test:', Y_test.sum())

    # The model family is picked at random on every call: 1 -> XGBoost, 2 -> CatBoost.
    rand = random.randint(1, 2)

    if rand == 1:
        bst = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
                                gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=4,
                                min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
                                objective='binary:logistic', reg_alpha=0, reg_lambda=1,
                                scale_pos_weight=1, seed=0, silent=True, subsample=1)
        bst.fit(X_train, Y_train)

    if rand == 2:
        bst = CatBoostClassifier(eval_metric='F1', use_best_model=True, metric_period=300, depth=4)
        bst.fit(X_train, Y_train, eval_set=(X_test, Y_test))

    val_features = df_val.drop(columns=['set_clicked'])
    val_target = df_val['set_clicked']

    # NOTE(review): predict() is thresholded at 0.7; if it returns hard 0/1 labels
    # (as classifier predict() usually does) the threshold is a no-op and
    # predict_proba was probably intended — confirm before changing.
    r = np.where(bst.predict(val_features) > 0.7, 1, 0)

    # Accuracy on 10 shuffled 70% subsamples of the validation frame.
    results = []
    for _ in range(10):
        df_val2 = shuffle(df_val)
        df_val3 = df_val2[0:int(df_val2.shape[0] * 0.7)]
        rkf = bst.predict(df_val3.drop(columns=['set_clicked']))
        results.append(accuracy_score(df_val3['set_clicked'], rkf))

    # Print accuracy against the trivial all-zeros baseline (computed once).
    acc_lgbm = accuracy_score(val_target, r)
    zero_baseline_acc = accuracy_score(val_target, np.zeros(len(r)))
    print('Overall accuracy of model:', acc_lgbm, " overall with only zeroes ", zero_baseline_acc)
    check_increase = acc_lgbm > zero_baseline_acc

    # Print Area Under Curve
    false_positive_rate, recall, thresholds = roc_curve(val_target, r)
    roc_auc = auc(false_positive_rate, recall)
    print('AUC score:', roc_auc)

    # Print Confusion Matrix. A single figure is created; the extra bare
    # plt.figure() call only produced an additional empty window.
    cm = confusion_matrix(val_target, r)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", vmin=0.2)
    plt.title('Confusion Matrix')
    plt.ylabel('True Class')
    plt.xlabel('Predicted Class')
    plt.show()

    f1 = f1_score(val_target, r)

    # Score dfk and store the predictions back into the caller's frame.
    rk = np.where(bst.predict(dfk.drop(columns=['set_clicked'])) > 0.7, 1, 0)
    dfk['set_clicked'] = rk

    print("Completed Modelling - ", datetime.datetime.now())
    return (acc_lgbm, f1, check_increase, X_train.columns, bst, dfk, results,
            zero_baseline_acc, r, rand)
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=23)

for i in scale_list:
    # Fit each scaler once on the untouched split and keep the scaled data in
    # separate variables. Overwriting x_train / x_test made every later
    # iteration rescale already-scaled data.
    scale = i
    scale.fit(x_train)
    x_train_scaled = scale.transform(x_train)
    x_test_scaled = scale.transform(x_test)

    for j in iteration_list:
        # for j in model_list:
        model = CatBoostClassifier(iterations=j)
        model.fit(x_train_scaled, y_train)

        y_pred = model.predict(x_test_scaled)
        acc = accuracy_score(y_test, y_pred)
        # log_loss is defined on predicted probabilities, not on hard labels.
        loss = log_loss(y_test, model.predict_proba(x_test_scaled), labels=model.classes_)
        print(
            'scaler : \n' + str(i) + '\nmodel : CatBoostClassifier\n' +
            'acc : \n', acc)
        print('\nloss : \n', loss)

        checkpoint_path = ('c:/data/modelcheckpoint/project_catboost_lr_default_'
                           + str(i) + '_' + str(j) + '.data')
        # Close the checkpoint file deterministically.
        with open(checkpoint_path, 'wb') as checkpoint_file:
            pickle.dump(model, checkpoint_file)
#Let us save the best models
# NOTE(review): despite its name, top_12_rf lists 10 features.
top_12_rf = [
    'PAY_1', 'PAY_2', 'BILL_AMT1', 'PAY_AMT1', 'AGE', 'Closeness_1',
    'PAY_AMT2', 'Closeness_4', 'BILL_AMT2', 'Closeness_2'
]

# Refit every model on the selected features, in the original order.
for estimator in (sclf1, sclf2, sclf3, sclf4, eclf1, eclf2, catboost_model):
    estimator.fit(sampled_df[top_12_rf], sampled_df['Default_Status'])

import pickle

#Saving the models to disk; `with` closes each file even if pickling fails.
for estimator, file_name in (
        (sclf1, "stacking1.pkl"),
        (sclf2, "stacking2.pkl"),
        (sclf3, "stacking3.pkl"),
        (sclf4, "stacking4.pkl"),
        (catboost_model, "catboost.pkl"),
        (eclf1, "voting1.pkl"),
        (eclf2, "voting2.pkl"),
):
    with open(file_name, "wb") as model_file:
        pickle.dump(estimator, model_file)
#!pip freeze > requirements.txt
def test_predict_sklearn_class():
    """Fit a 2-iteration classifier, save it, and compare with the canonical model."""
    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(Pool(TRAIN_FILE, column_description=CD_FILE))
    classifier.save_model(OUTPUT_MODEL_PATH)
    comparison = compare_canonical_models(OUTPUT_MODEL_PATH)
    return comparison
y = labelencoder_y.fit_transform(y)

# Splitting the dataset into the Training set and Test set
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
#X_test[104][10] = "yes"

# The first three columns are passed to CatBoost as categorical features.
cat_featuresind=[0,1,2]
clf = CatBoostClassifier (iterations=10,random_seed=rnd_state, custom_metric='Accuracy')
clf.fit(X_train, y_train, cat_features=cat_featuresind,plot = True)
clf.score(X_test, y_test)

from sklearn.metrics import confusion_matrix,accuracy_score
y_pred = clf.predict(X_test)
#print(clf.predict(X_test[104]))
cm = confusion_matrix (y_test, y_pred)
class CatBoostLearner(Learner):
    """CatBoost Logloss classifier trained in steps of ``self.rounds`` iterations.

    Each call to :meth:`fit` raises the model's iteration budget by
    ``self.rounds`` and refits with a training snapshot file.
    """

    algorithm_name = "CatBoost"
    algorithm_short_name = "CatBoost"

    def __init__(self, params):
        """Build the learner from a hyper-parameter dict.

        ``params`` may hold learning_rate, depth, rsm, random_strength,
        bagging_temperature, l2_leaf_reg and seed; missing keys fall back to
        the defaults below.
        """
        super(CatBoostLearner, self).__init__(params)
        self.library_version = catboost.__version__
        # NOTE(review): self.uid is presumably set by Learner.__init__ — confirm.
        self.model_file = self.uid + ".cat.model"
        self.model_file_path = os.path.join(storage_path, self.model_file)
        self.snapshot_file_path = os.path.join(
            storage_path, "training_snapshot_" + self.model_file)

        self.rounds = additional.get("one_step", 50)
        self.max_iters = additional.get("max_steps", 10)
        # Keys are CatBoost constructor argument names so the dict can be
        # forwarded as-is.
        self.learner_params = {
            "learning_rate": self.params.get("learning_rate", 0.025),
            "depth": self.params.get("depth", 6),
            "rsm": self.params.get("rsm", 1),
            "random_strength": self.params.get("random_strength", 1),
            "bagging_temperature": self.params.get("bagging_temperature", 1),
            "l2_leaf_reg": self.params.get("l2_leaf_reg", 3),
            "random_seed": self.params.get("seed", 1),
        }

        log.debug("CatBoostLearner __init__")

        # Forward every learner parameter, including random_seed, which was
        # collected above but never reached the classifier.
        self.model = CatBoostClassifier(
            iterations=0,
            loss_function="Logloss",
            verbose=False,
            **self.learner_params
        )

    def update(self, update_params):
        """Placeholder; parameter updates are not implemented."""
        pass  # here should be update

    def fit(self, X, y):
        """Raise the iteration budget by ``self.rounds`` and fit with a snapshot file."""
        self.model._init_params["iterations"] += self.rounds
        self.model.fit(X, y,
                       save_snapshot=True,
                       snapshot_file=self.snapshot_file_path)

    def predict(self, X):
        """Return column 1 of ``predict_proba`` for X."""
        return self.model.predict_proba(X)[:, 1]

    def copy(self):
        """Return a deep copy of this learner."""
        return copy.deepcopy(self)

    def save(self):
        """Save the model to ``self.model_file_path`` and return a JSON-ready description."""
        self.model.save_model(self.model_file_path)

        json_desc = {
            "library_version": self.library_version,
            "algorithm_name": self.algorithm_name,
            "algorithm_short_name": self.algorithm_short_name,
            "uid": self.uid,
            "model_file": self.model_file,
            "model_file_path": self.model_file_path,
            "params": self.params,
        }

        log.debug("CatBoostLearner save model to %s" % self.model_file_path)
        return json_desc

    def load(self, json_desc):
        """Restore metadata from ``json_desc`` (missing keys keep current values) and load the model file."""
        self.library_version = json_desc.get("library_version", self.library_version)
        self.algorithm_name = json_desc.get("algorithm_name", self.algorithm_name)
        self.algorithm_short_name = json_desc.get("algorithm_short_name",
                                                  self.algorithm_short_name)
        self.uid = json_desc.get("uid", self.uid)
        self.model_file = json_desc.get("model_file", self.model_file)
        self.model_file_path = json_desc.get("model_file_path", self.model_file_path)
        self.params = json_desc.get("params", self.params)

        log.debug("CatBoostLearner load model from %s" % self.model_file_path)

        self.model = CatBoostClassifier()
        self.model.load_model(self.model_file_path)

    def importance(self, column_names, normalize=True):
        """Feature importance is not provided; always returns None."""
        return None
def test_wrong_ctr_for_classification():
    """Expect CatboostError when fitting with ctr_description 'Borders:5:Uniform'."""
    with pytest.raises(CatboostError):
        train_data = Pool(TRAIN_FILE, column_description=CD_FILE)
        bad_ctr = ['Borders:5:Uniform']
        classifier = CatBoostClassifier(ctr_description=bad_ctr)
        classifier.fit(train_data)
def catboost_classification_learner(df: pd.DataFrame,
                                    features: List[str],
                                    target: str,
                                    learning_rate: float = 0.1,
                                    num_estimators: int = 100,
                                    extra_params: LogType = None,
                                    prediction_column: str = "prediction",
                                    weight_column: str = None,
                                    encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits a CatBoost classifier to the dataset. It first generates a Pool
    with the specified features and labels from `df`. Then, it fits a CatBoost
    model to this Pool. Returns the predict function for the model, the
    predictions for the input dataset and a log dictionary.

    Parameters
    ----------

    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list of column names that are used as features for the model. All
        these names should be in `df`.

    target : str
        The name of the column in `df` that should be used as target for the
        model. This column should be discrete, since this is a classification
        model.

    learning_rate : float
        Float in the range (0, 1]
        Step size shrinkage used in update to prevent overfitting. Passed to
        CatBoost as the `eta` hyper-parameter. See:
        https://catboost.ai/docs/concepts/python-reference_parameters-list.html

    num_estimators : int
        Int in the range (0, inf)
        Number of boosted trees to fit. Passed to CatBoost as `iterations`. See:
        https://catboost.ai/docs/concepts/python-reference_parameters-list.html

    extra_params : dict, optional
        Dictionary in the format {"hyperparameter_name" : hyperparameter_value}.
        Other parameters for the CatBoost model. See the list in:
        https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html
        If not passed, the default will be used. When "objective" is absent it
        is set to 'Logloss'.

    prediction_column : str
        The name of the column with the predictions from the model.
        If a multiclass problem, additional prediction_column_i columns will be added for i in range(0,n_classes).

    weight_column : str, optional
        The name of the column with scores to weight the data.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
    """
    from catboost import Pool, CatBoostClassifier
    import catboost

    weights = df[weight_column].values if weight_column else None
    params = extra_params if extra_params else {}
    params = assoc(params, "eta", learning_rate)
    params = params if "objective" in params else assoc(params, "objective", 'Logloss')

    features = features if not encode_extra_cols else expand_features_encoded(df, features)

    cat_features = params["cat_features"] if "cat_features" in params else None

    dtrain = Pool(df[features].values, df[target].values, weight=weights,
                  feature_names=list(map(str, features)), cat_features=cat_features)

    cat_boost_classifier = CatBoostClassifier(iterations=num_estimators, **params)
    cbr = cat_boost_classifier.fit(dtrain, verbose=0)

    def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
        dtest = Pool(new_df[features].values, feature_names=list(map(str, features)),
                     cat_features=cat_features)

        # Run the model once; the multiclass path used to predict a second time.
        proba = cbr.predict_proba(dtest)
        if params["objective"] == "MultiClass":
            # One probability column per class plus the arg-max class.
            col_dict = {prediction_column + "_" + str(key): value
                        for (key, value) in enumerate(proba.T)}
            col_dict.update({prediction_column: proba.argmax(axis=1)})
        else:
            col_dict = {prediction_column: proba[:, 1]}

        if apply_shap:
            import shap
            explainer = shap.TreeExplainer(cbr)
            shap_values = explainer.shap_values(dtest)
            shap_expected_value = explainer.expected_value

            if params["objective"] == "MultiClass":
                shap_values_multiclass = {f"shap_values_{class_index}": list(value)
                                          for (class_index, value) in enumerate(shap_values)}
                shap_expected_value_multiclass = {
                    f"shap_expected_value_{class_index}":
                        np.repeat(expected_value, len(class_shap_values))
                    for (class_index, (expected_value, class_shap_values))
                    in enumerate(zip(shap_expected_value, shap_values))
                }
                shap_output = merge(shap_values_multiclass, shap_expected_value_multiclass)
            else:
                shap_values = list(shap_values)
                shap_output = {"shap_values": shap_values,
                               "shap_expected_value": np.repeat(shap_expected_value, len(shap_values))}

            col_dict = merge(col_dict, shap_output)

        return new_df.assign(**col_dict)

    p.__doc__ = learner_pred_fn_docstring("catboost_classification_learner", shap=True)

    log = {'catboost_classification_learner': {
        'features': features,
        'target': target,
        'prediction_column': prediction_column,
        'package': "catboost",
        'package_version': catboost.__version__,
        'parameters': assoc(params, "num_estimators", num_estimators),
        'feature_importance': cbr.feature_importances_,
        'training_samples': len(df)},
        'object': cbr}

    return p, p(df), log
class CatBoostKfold(object):
    """5-fold CatBoost pipeline: prepare features, fit per fold, write a submission file."""

    def __init__(self, *, input_path_1, input_path_2, output_path):
        # I/O locations.
        self.__input_path_1 = input_path_1
        self.__input_path_2 = input_path_2
        self.__output_path = output_path

        # Raw frames, filled by data_prepare().
        self.__sample_submission = None
        self.__train = None
        self.__test = None
        self.__train_res = None
        self.__test_res = None

        # Model inputs.
        self.__train_feature = None
        self.__train_label = None
        self.__test_feature = None

        # Pre-processing state.
        self.__categorical_index = None
        self.__encoder = None
        self.__numeric_index = None

        # Cross-validation state, filled by model_fit().
        self.__folds = None
        self.__oof_preds = None
        self.__sub_preds = None
        self.__cat = None

    def data_prepare(self):
        """Load the CSV inputs, join the extra feature tables, target-encode and impute."""
        base_dir = self.__input_path_1
        extra_dir = self.__input_path_2

        self.__sample_submission = pd.read_csv(os.path.join(base_dir, "sample_submission.csv"))
        self.__train = pd.read_csv(os.path.join(base_dir, "train_feature_df.csv"))
        self.__test = pd.read_csv(os.path.join(base_dir, "test_feature_df.csv"))
        self.__train_res = pd.read_csv(os.path.join(extra_dir, "feature_train_res.csv"))
        self.__test_res = pd.read_csv(os.path.join(extra_dir, "feature_test_res.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["SK_ID_CURR", "TARGET"], axis=1)
        self.__test_feature = self.__test[self.__train_feature.columns]

        # These columns are dropped from the extra tables before the column-wise join.
        self.__train_res = self.__train_res.drop(["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"], axis=1)
        self.__test_res = self.__test_res.drop(["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"], axis=1)

        self.__train_feature = pd.concat([self.__train_feature, self.__train_res], axis=1)
        self.__test_feature = pd.concat([self.__test_feature, self.__test_res], axis=1)

        # Categorical columns: fill gaps with a literal "missing" level, then target-encode.
        cat_idx = np.where(self.__train_feature.dtypes == "object")[0]
        self.__categorical_index = cat_idx
        train_cat_filled = self.__train_feature.iloc[:, cat_idx].fillna("missing")
        self.__train_feature.iloc[:, cat_idx] = train_cat_filled
        test_cat_filled = self.__test_feature.iloc[:, cat_idx].fillna("missing")
        self.__test_feature.iloc[:, cat_idx] = test_cat_filled

        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature.iloc[:, cat_idx], self.__train_label)
        train_cat_encoded = self.__encoder.transform(self.__train_feature.iloc[:, cat_idx])
        self.__train_feature.iloc[:, cat_idx] = train_cat_encoded
        test_cat_encoded = self.__encoder.transform(self.__test_feature.iloc[:, cat_idx])
        self.__test_feature.iloc[:, cat_idx] = test_cat_encoded

        # There are NaNs in test dataset (feature number 77) but there were no NaNs in learn dataset"
        def fill_by_median_sign(column):
            # Positive median -> large negative sentinel, otherwise large positive.
            if column.median() > 0:
                return column.fillna(-999999.0)
            return column.fillna(999999.0)

        num_idx = np.where(self.__train_feature.dtypes != "object")[0]
        self.__numeric_index = num_idx
        train_num_filled = self.__train_feature.iloc[:, num_idx].apply(fill_by_median_sign)
        self.__train_feature.iloc[:, num_idx] = train_num_filled
        test_num_filled = self.__test_feature.iloc[:, num_idx].apply(fill_by_median_sign)
        self.__test_feature.iloc[:, num_idx] = test_num_filled

        # Shuffling is required before blending; it is not strictly needed here
        # because StratifiedKFold shuffles later.
        self.__train_feature, self.__train_label = shuffle(self.__train_feature, self.__train_label)

    def model_fit(self):
        """Fit one CatBoost model per fold; fill out-of-fold and averaged test predictions."""
        self.__folds = StratifiedKFold(n_splits=5, shuffle=True)
        self.__oof_preds = np.zeros(shape=self.__train_feature.shape[0])
        self.__sub_preds = np.zeros(shape=self.__test_feature.shape[0])

        fold_splits = self.__folds.split(self.__train_feature, self.__train_label)
        for n_fold, (trn_idx, val_idx) in enumerate(fold_splits):
            trn_x = self.__train_feature.iloc[trn_idx]
            trn_y = self.__train_label.iloc[trn_idx]
            val_x = self.__train_feature.iloc[val_idx]
            val_y = self.__train_label.iloc[val_idx]

            self.__cat = CatBoostClassifier(
                iterations=6000,
                od_wait=200,
                od_type="Iter",
                eval_metric="AUC"
            )
            self.__cat.fit(
                trn_x,
                trn_y,
                eval_set=[(val_x, val_y)],
                use_best_model=True
            )

            fold_val_pred = self.__cat.predict_proba(val_x)[:, 1]
            fold_test_pred = self.__cat.predict_proba(self.__test_feature)[:, 1]

            self.__oof_preds[val_idx] = fold_val_pred
            # Each fold contributes an equal share of the final test prediction.
            self.__sub_preds += fold_test_pred / self.__folds.n_splits
            print("Fold %2d AUC : %.6f" % (n_fold + 1, roc_auc_score(val_y, self.__oof_preds[val_idx])))

        print("Full AUC score %.6f" % roc_auc_score(self.__train_label, self.__oof_preds))

    def model_predict(self):
        """Write the averaged test predictions to sample_submission.csv in the output path."""
        submission_file = os.path.join(self.__output_path, "sample_submission.csv")
        self.__sample_submission["TARGET"] = self.__sub_preds
        self.__sample_submission.to_csv(submission_file, index=False)
# Hold out 12.5% of the current training rows as a validation split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.125,
    random_state=1,
)


# In[17]:


# {'depth': 7, 'iterations': 200, 'learning_rate': 0.1, 'scale_pos_weight': 4}
model_cat = CatBoostClassifier(
    iterations=200,
    depth=7,
    learning_rate=0.1,
    scale_pos_weight=4,
)
model_cat.fit(X_train, y_train)


# In[18]:


# test_y_pred4 = model_cat.predict(X_test)


# In[20]:


# make predictions on a test set and get AUC score
# print("Sklearn CatBoost classifier:")
# y_pred = model_cat.predict_proba(X_test)
# print(f" - roc_auc_score: {roc_auc_score(y_test, y_pred[:,1]): .5f}")

# Accuracy score, f1_score
def train(self, feature_names):
    """
    Train one CatBoost classifier per validation fold and score the folds.

    Input:
        feature_names: feature names forwarded to ``get_feature_df``
    Output:
        The result of ``output_cv`` applied to a DataFrame holding
        "HasDetections" and "Predict" (indexed by "MachineIdentifier"),
        or None when there is no fold left to train.
    """
    # Initialize parameters
    validity = None
    project_root = Path(__file__).absolute().parents[2]
    model_path = project_root / "data" / "model" / str(get_version())
    model_path.mkdir(exist_ok=True, parents=True)
    feature_importance = pd.DataFrame()

    # When resuming, skip as many folds as there are saved model files.
    first_fold = len(list(model_path.glob('**/*.model'))) if get_back_training() else 0
    last_fold = 5
    if train_one_round():
        first_fold, last_fold = 0, 1
    if first_fold == last_fold:
        return None

    # Constructor arguments read one-to-one from self.params.
    param_names = (
        "iterations", "verbose", "early_stopping_rounds", "random_seed", "max_depth",
        "loss_function", "custom_metric", "eval_metric", "rsm",
    )

    # Process for each fold
    for fold in range(first_fold, last_fold):
        log_path = project_root / "log" / "train" / str(get_version()) / str("fold{}".format(fold))
        log_path.mkdir(exist_ok=True, parents=True)
        logger = getLogger(get_version())

        # Measure start time of the classification of this fold
        start = time.time()
        logger.info("\t >> {} folds start".format(fold))
        send_message("\t :cat: {} folds start".format(fold))

        # Generate dataset
        generating_msg = "\t \t Generating datasets..."
        logger.info(generating_msg)
        send_message(generating_msg)
        valid = "valid{}".format(str(fold))
        trn_x = super().get_feature_df(feature_names, valid, "train")
        val_x = super().get_feature_df(feature_names, valid, "validate")
        trn_x.set_index("MachineIdentifier", inplace=True)
        val_x.set_index("MachineIdentifier", inplace=True)
        # Take the target out of the feature frames (removes the needless column too).
        trn_y = trn_x.pop("HasDetections").astype(np.int8)
        val_y = val_x.pop("HasDetections").astype(np.int8)
        generated_msg = "\t \t Datasets were generated."
        logger.info(generated_msg)
        send_message(generated_msg)

        # Initialize variables for scoring
        if validity is None:
            validity = pd.DataFrame()
            validity["HasDetections"] = pd.concat([trn_y, val_y])
            validity["Predict"] = 0

        # Classify
        clf = CatBoostClassifier(
            train_dir=str(log_path),
            **{name: self.params[name] for name in param_names}
        )
        clf.fit(trn_x.values, trn_y.values, eval_set=(val_x.values, val_y.values))

        for train_or_valid, fold_metrics in clf.best_score_.items():
            for metric, score in fold_metrics.items():
                logger.info("\t\t >> Best {} {}: {}".format(train_or_valid, metric, score))
                send_message("\t\t :star-struck: Best {} {}: {}".format(train_or_valid, metric, score))

        # Store this fold's validation probabilities on the matching rows.
        is_validation_row = validity.index.isin(val_x.index)
        validity.loc[is_validation_row, "Predict"] = clf.predict_proba(val_x.values)[:, 1]

        # Calculate feature importance per fold
        if fold == 0:
            feature_importance["feature"] = trn_x.columns
        feature_importance["fold{}".format(fold)] = clf.get_feature_importance()

        # Measure finish time of the classification of this fold
        elapsed_time = int(time.time() - start)
        minutes, sec = divmod(elapsed_time, 60)
        hour, minutes = divmod(minutes, 60)
        logger.info(
            "\t >> {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
            .format(fold, hour, minutes, sec))
        send_message(
            "\t :cat: {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
            .format(fold, hour, minutes, sec))

        # Post-process this fold
        clf.save_model(str(model_path / "valid{}.model".format(fold)))

    # Output CV score
    validity = output_cv(validity, ":cat:")

    # Save importance
    directory_path = project_root / "importance"
    save_feature_importance(feature_importance, directory_path)

    # Post-process the training
    del feature_importance
    gc.collect()

    return validity