def test_wrong_feature_count():
    with pytest.raises(CatboostError):
        data = np.random.rand(100, 10)
        label = np.random.randint(2, size=100)
        model = CatBoostClassifier()
        model.fit(data, label)
        model.predict(data[:, :-1])
def test_raw_predict_equals_to_model_predict():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=10, random_seed=0)
    model.fit(train_pool, eval_set=test_pool)
    pred = model.predict(test_pool, prediction_type='RawFormulaVal')
    assert all(model.get_test_eval() == pred)
def test_full_history():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(od_type='Iter', od_wait=20, random_seed=42, approx_on_full_history=True)
    model.fit(train_pool, eval_set=test_pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_pool_after_fit():
    pool1 = Pool(TRAIN_FILE, column_description=CD_FILE)
    pool2 = Pool(TRAIN_FILE, column_description=CD_FILE)
    assert _check_data(pool1.get_features(), pool2.get_features())
    model = CatBoostClassifier(iterations=5, random_seed=0)
    model.fit(pool2)
    assert _check_data(pool1.get_features(), pool2.get_features())
def train_preprocessor(path='.', train='train.csv'):
    print('start training trash preprocessor...')
    df = pd.read_csv(os.path.join(path, train))
    train_data = df[:-100]
    validation_data = df[-100:-50]
    vectorizer = CountVectorizer()
    x_train_counts = vectorizer.fit_transform(train_data.text)
    x_validation_counts = vectorizer.transform(validation_data.text)
    model = CatBoostClassifier(iterations=250,
                               train_dir=path,
                               logging_level='Silent',
                               allow_writing_files=False)
    model.fit(X=x_train_counts.toarray(),
              y=train_data.status,
              eval_set=(x_validation_counts.toarray(), validation_data.status),
              use_best_model=True)
    model.save_model(os.path.join(path, 'trash_model'))
    joblib.dump(vectorizer, os.path.join(path, 'trash_vectorizer'))
    print('end training trash preprocessor...')
def cleaning_comments(raw_comments, path='.') -> str:
    print('start cleaning of comments...')
    raw = pd.read_csv(raw_comments)
    cleaned_comments = os.path.join(path, 'cleaned_comments.csv')
    bad_comments = os.path.join(path, 'bad_comments.csv')
    model = CatBoostClassifier().load_model(os.path.join(path, 'trash_model'))
    vectorizer = joblib.load(os.path.join(path, 'trash_vectorizer'))
    hyp = model.predict_proba(vectorizer.transform(raw.text).toarray())
    with open(cleaned_comments, 'w') as cleaned, open(bad_comments, 'w') as bad:
        bad_file = 'likes,status,text\n'
        cleaned_file = 'likes,status,text\n'
        for i in range(len(hyp)):
            if hyp[i][0] < 0.6:
                bad_file += str(raw.likes[i]) + ',1,"' + raw.text[i] + '"\n'
            else:
                cleaned_file += str(raw.likes[i]) + ',0,"' + raw.text[i] + '"\n'
        cleaned.write(cleaned_file)
        bad.write(bad_file)
    os.remove(raw_comments)
    print('end cleaning of comments...')
    return cleaned_comments
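# A minimal usage sketch of the two functions above, assuming the working
# directory contains 'train.csv' and a freshly exported 'raw_comments.csv'
# (both file names are illustrative, not taken from the original snippets).
train_preprocessor(path='.', train='train.csv')            # fit vectorizer + CatBoost trash filter
cleaned_path = cleaning_comments('raw_comments.csv', path='.')
print('cleaned comments written to', cleaned_path)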
def test_predict_class():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(train_pool)
    pred = model.predict(test_pool, prediction_type="Class")
    np.save(PREDS_PATH, np.array(pred))
    return local_canonical_file(PREDS_PATH)
def test_no_cat_in_predict():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(train_pool)
    pred1 = model.predict(map_cat_features(test_pool.get_features(), train_pool.get_cat_feature_indices()))
    pred2 = model.predict(Pool(map_cat_features(test_pool.get_features(), train_pool.get_cat_feature_indices()),
                               cat_features=train_pool.get_cat_feature_indices()))
    assert _check_data(pred1, pred2)
def test_zero_baseline():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    baseline = np.zeros(pool.num_row())
    pool.set_baseline(baseline)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_non_ones_weight():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    weight = np.arange(1, pool.num_row() + 1)
    pool.set_weight(weight)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_ntree_limit():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=100, random_seed=0)
    model.fit(train_pool)
    pred = model.predict_proba(test_pool, ntree_end=10)
    np.save(PREDS_PATH, np.array(pred))
    return local_canonical_file(PREDS_PATH)
def test_staged_predict():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=10, random_seed=0)
    model.fit(train_pool)
    preds = []
    for pred in model.staged_predict(test_pool):
        preds.append(pred)
    np.save(PREDS_PATH, np.array(preds))
    return local_canonical_file(PREDS_PATH)
def create_model(self, kfold_X_train, y_train, kfold_X_valid, y_test, test):
    best = CatBoostClassifier(loss_function='MultiClassOneVsAll',
                              learning_rate=0.07940735491731761,
                              depth=8)
    best.fit(kfold_X_train, y_train)
    # predict on the validation fold
    pred = best.predict_proba(kfold_X_valid)
    results = best.predict_proba(test)
    return pred, results, best
def test_ignored_features():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model1 = CatBoostClassifier(iterations=5, random_seed=0, ignored_features=[1, 2, 3])
    model2 = CatBoostClassifier(iterations=5, random_seed=0)
    model1.fit(train_pool)
    model2.fit(train_pool)
    predictions1 = model1.predict(test_pool)
    predictions2 = model2.predict(test_pool)
    assert not _check_data(predictions1, predictions2)
    model1.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def train_catboost_model(df, target, cat_features, params, verbose=True):
    if not isinstance(df, DataFrame):
        raise Exception('DataFrame object expected, but got ' + repr(df))
    print('features:', df.columns.tolist())
    cat_features_index = list(df.columns.get_loc(feature) for feature in cat_features)
    print('cat features:', cat_features_index)
    model = CatBoostClassifier(**params)
    model.fit(df, target, cat_features=cat_features_index, verbose=verbose)
    return model
def model_1(X, y, test):
    '''
    This is a CatBoost model for which we do not need to encode categorical
    variables; it takes care of them automatically.
    '''
    categorical_features_indices = np.where(X.dtypes != np.float)[0]
    X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.7, random_state=1234)
    # building the model
    cboost = CatBoostClassifier(iterations=500, learning_rate=0.01, depth=6,
                                loss_function='MultiClass', eval_metric='Accuracy')
    cboost.fit(X_train, y_train,
               cat_features=categorical_features_indices,
               eval_set=(X_validation, y_validation),
               plot=True)
    # calculating the class-wise prediction probabilities of the cboost model
    pred_prob = cboost.predict_proba(test)
    return pred_prob
def test_fit_data():
    pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    eval_pool = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)
    base_model = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    base_model.fit(pool)
    baseline = np.array(base_model.predict(pool, prediction_type='RawFormulaVal'))
    eval_baseline = np.array(base_model.predict(eval_pool, prediction_type='RawFormulaVal'))
    eval_pool.set_baseline(eval_baseline)
    model = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    data = map_cat_features(pool.get_features(), pool.get_cat_feature_indices())
    model.fit(data, pool.get_label(), pool.get_cat_feature_indices(),
              sample_weight=np.arange(1, pool.num_row() + 1),
              baseline=baseline, use_best_model=True, eval_set=eval_pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_multiclass():
    pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    classifier = CatBoostClassifier(iterations=2, random_seed=0, loss_function='MultiClass', thread_count=8)
    classifier.fit(pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    new_classifier = CatBoostClassifier()
    new_classifier.load_model(OUTPUT_MODEL_PATH)
    pred = new_classifier.predict_proba(pool)
    np.save(PREDS_PATH, np.array(pred))
    return local_canonical_file(PREDS_PATH)
class BesCatBoost:
    """
    catboost_params = {
        'iterations': 500,
        'depth': 3,
        'learning_rate': 0.1,
        'eval_metric': 'AUC',
        'random_seed': 42,
        'logging_level': 'Verbose',
        'l2_leaf_reg': 15.0,
        'bagging_temperature': 0.75,
        'allow_writing_files': False,
        'metric_period': 50
    }
    """

    def __init__(self, params, metric='AUC', maximize=True, verbose=True, model=None):
        self.params = params
        self.metric = metric
        self.maximize = maximize
        self.verbose = verbose
        self.model = model

    def fit(self, X_train, y_train):
        bst = cv(Pool(X_train, y_train), self.params)
        best_rounds = int(bst['test-{}-mean'.format(self.metric)].idxmax() * 1.5) + 1
        print('Best Iteration: {}'.format(best_rounds))
        self.params['iterations'] = best_rounds
        self.model = CatBoostClassifier(**self.params)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        pred_prob = self.model.predict_proba(X_test)[:, -1]
        return pred_prob

    def feature_importance(self):
        pass

    @staticmethod
    def find_best_params(kag):
        pass
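# A minimal usage sketch of BesCatBoost, assuming X_train/y_train and X_test are
# pandas or numpy objects prepared elsewhere; the parameter dict mirrors the
# example given in the class docstring above.
catboost_params = {
    'iterations': 500, 'depth': 3, 'learning_rate': 0.1, 'eval_metric': 'AUC',
    'random_seed': 42, 'l2_leaf_reg': 15.0, 'bagging_temperature': 0.75,
    'allow_writing_files': False, 'metric_period': 50,
}
bes = BesCatBoost(catboost_params, metric='AUC')
bes.fit(X_train, y_train)            # runs catboost.cv, then refits with the chosen iteration count
positive_prob = bes.predict(X_test)  # probability of the positive class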
def test_clone():
    estimator = CatBoostClassifier(
        custom_metric="Accuracy",
        loss_function="MultiClass",
        iterations=400)

    # This is important for sklearn.base.clone since
    # it uses get_params for cloning the estimator.
    params = estimator.get_params()
    new_estimator = CatBoostClassifier(**params)
    new_params = new_estimator.get_params()

    for param in params:
        assert param in new_params
        assert new_params[param] is params[param]
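# For illustration: the property checked above is what lets sklearn.base.clone
# produce a fresh, unfitted copy of a CatBoostClassifier with identical parameters.
from sklearn.base import clone

original = CatBoostClassifier(loss_function="MultiClass", iterations=400)
copy = clone(original)                                   # new, unfitted estimator
assert copy.get_params() == original.get_params()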
def test_custom_objective():
    class LoglossObjective(object):
        def calc_ders_range(self, approxes, targets, weights):
            assert len(approxes) == len(targets)
            if weights is not None:
                assert len(weights) == len(approxes)

            exponents = []
            for index in xrange(len(approxes)):
                exponents.append(math.exp(approxes[index]))

            result = []
            for index in xrange(len(targets)):
                p = exponents[index] / (1 + exponents[index])
                der1 = (1 - p) if targets[index] > 0.0 else -p
                der2 = -p * (1 - p)

                if weights is not None:
                    der1 *= weights[index]
                    der2 *= weights[index]

                result.append((der1, der2))

            return result

    train_pool = Pool(data=TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(data=TEST_FILE, column_description=CD_FILE)

    model = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True,
                               loss_function=LoglossObjective(), eval_metric="Logloss",
                               # Leaf estimation method and gradient iteration are set to match
                               # defaults for Logloss.
                               leaf_estimation_method="Newton", leaf_estimation_iterations=10)
    model.fit(train_pool, eval_set=test_pool)
    pred1 = model.predict(test_pool, prediction_type='RawFormulaVal')

    model2 = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True, loss_function="Logloss")
    model2.fit(train_pool, eval_set=test_pool)
    pred2 = model2.predict(test_pool, prediction_type='RawFormulaVal')

    for p1, p2 in zip(pred1, pred2):
        assert abs(p1 - p2) < EPS
def test_custom_eval():
    class LoglossMetric(object):
        def get_final_error(self, error, weight):
            return error / (weight + 1e-38)

        def is_max_optimal(self):
            return True

        def evaluate(self, approxes, target, weight):
            assert len(approxes) == 1
            assert len(target) == len(approxes[0])

            approx = approxes[0]

            error_sum = 0.0
            weight_sum = 0.0

            for i in xrange(len(approx)):
                w = 1.0 if weight is None else weight[i]
                weight_sum += w
                error_sum += w * (target[i] * approx[i] - math.log(1 + math.exp(approx[i])))

            return error_sum, weight_sum

    train_pool = Pool(data=TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(data=TEST_FILE, column_description=CD_FILE)

    model = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True,
                               eval_metric=LoglossMetric())
    model.fit(train_pool, eval_set=test_pool)
    pred1 = model.predict(test_pool)

    model2 = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True,
                                eval_metric="Logloss")
    model2.fit(train_pool, eval_set=test_pool)
    pred2 = model2.predict(test_pool)

    for p1, p2 in zip(pred1, pred2):
        assert abs(p1 - p2) < EPS
def test_one_doc_feature_importance():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=5, random_seed=0)
    model.fit(pool)
    np.save(FIMP_PATH, np.array(model.get_feature_importance(np.ones(pool.num_col(), dtype=int), 0,
                                                             cat_features=pool.get_cat_feature_indices(),
                                                             fstr_type='Doc')))
    return local_canonical_file(FIMP_PATH)
def test_classification_ctr():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=5, random_seed=0, ctr_description=['Borders', 'Counter'])
    model.fit(pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_priors():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=5, random_seed=0, has_time=True, priors=[0, 0.6, 1, 5])
    model.fit(pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
from catboost import CatBoostClassifier

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=22)
predictions = np.zeros((len(X_valid), 2))
oof_preds = np.zeros((len(test_df), 2))
feature_importance_df = pd.DataFrame()
final_preds = []
counter = 0  # fold counter
# random_state = [77, 89, 22, 1007, 1997, 1890, 2000, 2020, 8989, 786, 787, 1999992, 2021, 7654]

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train.values, y_train.values)):
    print("Fold {}".format(fold_))
    X_trn, y_trn = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]
    clf = CatBoostClassifier(iterations=10000, depth=3, learning_rate=0.2, eval_metric="Logloss")
    clf.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], cat_features=cat_cols)
    final_preds.append(log_loss(y_pred=clf.predict_proba(X_val), y_true=y_val))
    predictions += clf.predict_proba(X_valid)
    oof_preds += clf.predict_proba(test_df)
    counter = counter + 1

oof_preds = oof_preds / counter
sample['risk_flag'] = oof_preds[:, 1]
sample['risk_flag'] = sample['risk_flag'].apply(lambda x: 0 if x < 0.5 else 1)
sample.to_csv('cat_sfk_loedata.csv', index=False)
print(sum(final_preds) / 5)
def test_fit_no_label():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostClassifier()
        model.fit(pool.get_features())
for fold_n, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_valid = X.iloc[valid_idx]
    y_valid = y.iloc[valid_idx]

    if MODEL_TYPE == "catboost":
        train_dataset = Pool(data=X_train, label=y_train, cat_features=CAT_FEATURES)
        valid_dataset = Pool(data=X_valid, label=y_valid, cat_features=CAT_FEATURES)
        test_dataset = Pool(data=X_test, cat_features=CAT_FEATURES)
        model = CatBoostClassifier(
            iterations=N_ESTIMATORS,
            learning_rate=LEARNING_RATE,
            depth=DEPTH,
            eval_metric=EVAL_METRIC,
            verbose=VERBOSE,
            random_state=RANDOM_STATE,
            thread_count=N_THREADS,
            task_type="GPU")
        model.fit(
            train_dataset,
            eval_set=valid_dataset,
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        )
        y_pred_valid = model.predict(valid_dataset)
        y_pred = model.predict(test_dataset)

        fold_importance = pd.DataFrame()
        fold_importance["feature"] = model.feature_names_
# print(classification_report(pred_lgbm, test_y.values))
# print(accuracy_score(pred_lgbm, test_y.values))
#
# model = XGBClassifier(objective='binary:logistic', subsample=0.5, max_depth=7, gamma=0.2,
#                       colsample_bytree=0.5, min_child_weight=5, n_estimators=400)
#
# model.fit(train_X, train_y)
# pred_proba_xgb = model.predict_proba(test_X)
# pred_proba_xgb_train = model.predict_proba(train_X)
# pred_xgb = model.predict(test_X)
#
# print(classification_report(pred_xgb, test_y.values))
# print(accuracy_score(pred_xgb, test_y.values))

model = CatBoostClassifier()
model.fit(train_X, train_y)
pred_proba_cat = model.predict_proba(test_X)
pred_proba_cat_train = model.predict_proba(train_X)
pred_cat = model.predict(test_X)

print(classification_report(pred_cat, test_y.values))
print(accuracy_score(pred_cat, test_y.values))

# stacked_train = np.swapaxes(np.vstack((pred_proba_lgbm_train[:, 1], pred_proba_xgb_train[:, 1],
#                                        pred_proba_cat_train[:, 1], np.swapaxes(train_X, 0, 1))), 0, 1)
# stacked_test = np.swapaxes(np.vstack((pred_proba_lgbm[:, 1], pred_proba_xgb[:, 1],
#                                       pred_proba_cat[:, 1], np.swapaxes(test_X, 0, 1))), 0, 1)
#
# # Stacked model
# model = CatBoostClassifier()
def test_interaction_feature_importance():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=5, random_seed=0)
    model.fit(pool)
    np.save(FIMP_PATH, np.array(model.get_feature_importance(pool, fstr_type='Interaction')))
    return local_canonical_file(FIMP_PATH)
def test_class_weights():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=5, random_seed=0, class_weights=[1, 2])
    model.fit(pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_feature_importance_off():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostClassifier(iterations=5, random_seed=0, calc_feature_importance=False)
        model.fit(pool)
        model.feature_importances_
from transformers import T5Tokenizer


@contextmanager
def suppress_stdout_stderr():
    """A context manager that redirects stdout and stderr to devnull"""
    with open(devnull, 'w') as fnull:
        with redirect_stderr(fnull) as err, redirect_stdout(fnull) as out:
            yield (err, out)


dmp = diff_match_patch()
nlp = spacy.load("en")
annotator = errant.load('en')

classifier = CatBoostClassifier()
classifier.load_model("./models/err_type_classifier.cbm")

tokenizer = T5Tokenizer.from_pretrained('t5-base')
emb_model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')


def load_predict_fn(model_path):
    sess = tf.compat.v1.Session()
    graph = tf.compat.v1.get_default_graph()
    dev_name = '/gpu:0'
    if os.environ.get("MODEL_PLACE") == "cpu":
        dev_name = '/cpu:0'
    with tf.device(dev_name):
        tf.compat.v1.reset_default_graph()
        sess = tf.compat.v1.Session()
from functions import clean_data_train_test_split

warnings.filterwarnings('ignore')

font_label = {'size': 15}
font_title = {'weight': 'bold', 'size': 15}

x_train_train, y_train_train, x_train_val, y_train_val, x_test, y_test = clean_data_train_test_split()

thresholds = np.linspace(0, 1, 101)
class_weight = [3, .2]
categorical_features_indices = np.where(x_train_train.dtypes != np.float)[0]

modcb = CatBoostClassifier(depth=8, iterations=200, learning_rate=0.05, l2_leaf_reg=30,
                           class_weights=class_weight, use_best_model=True,
                           one_hot_max_size=100, rsm=.5)
modcb.fit(x_train_train, y_train_train,
          cat_features=categorical_features_indices,
          eval_set=(x_train_val, y_train_val),
          plot=True)

y_test_proba = modcb.predict_proba(x_test)[:, 1]
FPR, TPR, roc_thresholds = roc_curve(y_test, y_test_proba)
func.plotroc(FPR, TPR)

precisions, aarates = func.get_prec_aa_prof(thresholds, y_test, y_test_proba)
func.plot_prec_aa(precisions, aarates)

profs, prof_thresh = func.profit_curve(np.array([[6, -150], [0, 0]]),
                                       y_test_proba[:10000], np.array(y_test)[:10000])
func.plot_profit_curve(prof_thresh, profs)
for i in range(4):
    (globals()['X_tr_cat%s' % i], globals()['X_tst_cat%s' % i],
     globals()['y_tr_cat%s' % i], globals()['y_tst_cat%s' % i]) = train_test_split(
        globals()['train_df1_cat%s' % i].drop(columns=['risk_flag']),
        globals()['train_df1_cat%s' % i]['risk_flag'],
        stratify=globals()['train_df1_cat%s' % i]['risk_flag'])

score = {}
from catboost import CatBoostClassifier

for i in range(4):
    globals()['catboost%s' % i] = CatBoostClassifier(iterations=2000, eval_metric='F1')
    globals()['catboost%s' % i].fit(globals()['X_tr_cat%s' % i], globals()['y_tr_cat%s' % i],
                                    cat_features=cat_cols,
                                    eval_set=(globals()['X_tst_cat%s' % i], globals()['y_tst_cat%s' % i]))
    score['catboost{}'.format(i)] = roc_auc_score(globals()['y_tst_cat%s' % i],
                                                  globals()['catboost%s' % i].predict(globals()['X_tst_cat%s' % i]))

preds = pd.DataFrame()
for i in range(4):
    preds['catboost{}'.format(i)] = globals()['catboost%s' % i].predict(test_df_cat)

counts = pd.DataFrame()
counts['1s'] = preds.sum(axis=1)
counts['0s'] = 4 - counts['1s']
sample['risk_flag'] = counts['1s'].apply(lambda x: 1 if x > 3 else 0)
def test_predict_without_fit():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostClassifier()
        model.predict(pool)
def test_wrong_ctr_for_classification():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostClassifier(ctr_description=['Borders:TargetBorderCount=5:TargetBorderType=Uniform'])
        model.fit(pool)
y_train = train_y.loc[x_train.index, :].values.ravel()
x_val = train_X[train_X.kfold == fold]
y_val = train_y.loc[x_val.index, :].values.ravel()

# Dropping the kfold column from the dataframes
x_train.pop("kfold")
x_val.pop("kfold")

# Creating the model
est_1 = ("rf", dispatcher.MODELS["random_forest"])
est_2 = ("svm", dispatcher.MODELS["SVM"])
est_3 = ("lr", dispatcher.MODELS["logistic_regression"])
est_4 = ("catboost", dispatcher.MODELS["catBoost"])

model = ensemble.StackingClassifier(
    estimators=[est_1, est_2, est_4],
    final_estimator=CatBoostClassifier(early_stopping_rounds=5,
                                       class_weights={0: 0.25, 1: 0.75}),
    n_jobs=8,
    verbose=2)
# model = dispatcher.MODELS["random_forest"]
model.fit(x_train, y_train)
train_pred = model.predict(x_train)
val_pred = model.predict(x_val)

# Computing the F1 score
f1_train = metrics.f1_score(y_train, train_pred)
f1_val = metrics.f1_score(y_val, val_pred)
avg_f1_train.append(f1_train)
avg_f1_val.append(f1_val)
X, y = hlp.load_train()


def save_clf(clf, filename):
    with open(join(hlp.path_models, filename), 'wb') as dest:
        pickle.dump(clf, dest)


# RANDOM FOREST, ~0.94110
pars_rf = pd.read_csv(join(hlp.path_models, 'pars_rf.csv')).\
    to_dict(orient='records')[0]
pars_rf['n_estimators'] = 200
pars_rf['random_state'] = pr.rseed
pars_rf['class_weight'] = 'balanced'
pars_rf['min_samples_split'] = 3
pars_rf['n_jobs'] = -1
rf_final = RandomForestClassifier(**pars_rf)
rf_final.fit(X, y)
save_clf(rf_final, 'rf.pkl')

# MLP, 0.96286
mlp_final = MLPClassifier(**pr.par_mlp)
mlp_final.fit(X, y)
save_clf(mlp_final, 'mlp.pkl')

# CATBOOST ~0.963
cat_final = CatBoostClassifier(**pr.par_cat)
cat_final.fit(X, y)
cat_final.save_model(join(hlp.path_models, 'catboost.meow'))

# final nn is trained in train_models.py
def test_predict_sklearn_class():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(train_pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_wrong_params_classifier():
    with pytest.raises(CatboostError):
        CatBoostClassifier(wrong_param=1)
def test_priors():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=5, random_seed=0, has_time=True,
                               ctr_description=["Borders:Prior=0:Prior=0.6:Prior=1:Prior=5",
                                                "Counter:Prior=0:Prior=0.6:Prior=1:Prior=5"])
    model.fit(pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
prob = np.zeros(len(train))
test_prob = np.zeros(len(test))
test_data = test[feat0].values
for idx, (train_index, valid_index) in enumerate(kf.split(train, train_y)):
    train_data = train.loc[train_index][feat0].values
    valid_data = train.loc[valid_index][feat0].values
    model = LGBMClassifier(n_estimators=1000, learning_rate=0.08, num_leaves=15,
                           subsample=0.8, colsample_bytree=0.6, n_jobs=4)
    model.fit(train_data, train_y.loc[train_index],
              eval_set=(valid_data, train_y.loc[valid_index]),
              early_stopping_rounds=50)
    prob[valid_index] = model.predict_proba(valid_data)[:, 1]
    test_prob += model.predict_proba(test_data)[:, 1] / 5

train['lgb_prob'] = prob
test['lgb_prob'] = test_prob

kf = StratifiedKFold(5, True, random_state=1)
prob = np.zeros(len(train))
test_prob = np.zeros(len(test))
feat1 = list(set(feat0 + cat_feat + ['lgb_prob']))
test_data = test[feat1].values
for idx, (train_index, valid_index) in enumerate(kf.split(train, train_y)):
    train_data = train.loc[train_index][feat1]
    valid_data = train.loc[valid_index][feat1]
    model = CatBoostClassifier(iterations=1000, learning_rate=0.08, depth=7, cat_features=cat_feat)
    model.fit(train_data, train_y.loc[train_index],
              eval_set=(valid_data, train_y.loc[valid_index]),
              early_stopping_rounds=50)
    prob[valid_index] = model.predict_proba(valid_data)[:, 1]
    test_prob += model.predict_proba(test_data)[:, 1] / 5

test['Label'] = test_prob
test[['ID', 'Label']].to_csv('../output/1120_count_rank.csv', index=False)
cat_feature_inds = []
for i, c in enumerate(train_feature.columns):
    if c in ['register_type', 'device_type']:
        cat_feature_inds.append(i)
# print("Cat features are: %s" % [train_feature[ind] for ind in cat_feature_inds])

print('Starting training......')
catboost_params = {
    "iterations": 200,
    # "learning_rate": 0.09,
    "loss_function": 'Logloss',
    "eval_metric": 'AUC',
    "random_seed": 2018
}
cb_model = CatBoostClassifier(**catboost_params)
cb_model.fit(
    X_train, Y_train,
    eval_set=(X_test, Y_test),
    cat_features=cat_feature_inds,
    use_best_model=True,
    verbose=True,
    # plot=True  # you can uncomment this for plotted output
)
# https://www.kaggle.com/nicapotato/catboost-aggregate-features/code
cb_model.save_model('../model/catboost_model.txt')

temp = cb_model.predict(X_test)
threshold = 0.42
def test_no_eval_set():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostClassifier()
        model.fit(pool, use_best_model=True)
print('X_train Shape', x_train.shape)
print('X_test shape', x_test.shape)
print('y_train Shape', y_train.shape)
print('y_test shape', y_test.shape)

print('Starting training')
lgb_train = gbm.Dataset(x_train, y_train)
lgb_eval = gbm.Dataset(x_test, y_test, reference=lgb_train)

model = CatBoostClassifier(iterations=1200,
                           learning_rate=0.01,
                           l2_leaf_reg=3.5,
                           depth=8,
                           rsm=0.98,
                           loss_function='CrossEntropy',
                           eval_metric='AUC',
                           use_best_model=True,
                           random_seed=42)
model.fit(x_train, y_train, cat_features=[], eval_set=(x_test_2, y_test_2))
# model.fit(x_train, y_train)

incorrect_x = []
incorrect_y = []
y_pred = model.predict_proba(x_test_2)
print("======>", y_pred)
# for i, x_sample in enumerate(x_test_2):
#     if int(round(y_pred[i])) != y_test_2[i]:
def test_wrong_ctr_for_classification():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostClassifier(ctr_description=['Borders:5:Uniform'])
        model.fit(pool)
def train_model_classification(X, X_test, y, params, folds, model_type='lgb', eval_metric='auc',
                               columns=None, plot_feature_importance=False, model=None,
                               verbose=10000, early_stopping_rounds=200, n_estimators=50000,
                               splits=None, n_folds=3, averaging='usual', n_jobs=-1):
    """
    A function to train a variety of classification models.
    Returns a dictionary with oof predictions, test predictions, scores and,
    if necessary, feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: folds - folds to split data
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    """
    columns = X.columns if columns is None else columns
    n_splits = folds.n_splits if splits is None else n_folds
    X_test = X_test[columns]

    # to set up scoring parameters
    metrics_dict = {
        'auc': {
            'lgb_metric_name': eval_auc,
            'catboost_metric_name': 'AUC',
            'sklearn_scoring_function': metrics.roc_auc_score
        },
    }

    result_dict = {}
    if averaging == 'usual':
        # out-of-fold predictions on train data
        oof = np.zeros((len(X), 1))
        # averaged predictions on test data
        prediction = np.zeros((len(X_test), 1))
    elif averaging == 'rank':
        # out-of-fold predictions on train data
        oof = np.zeros((len(X), 1))
        # averaged predictions on test data
        prediction = np.zeros((len(X_test), 1))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMClassifier(**params, n_estimators=n_estimators, n_jobs=n_jobs)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose, early_stopping_rounds=early_stopping_rounds)
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1]

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)
            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=n_estimators, evals=watchlist,
                              early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose,
                              params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns),
                                   ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)
            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')
            y_pred = model.predict_proba(X_test)

        if model_type == 'cat':
            model = CatBoostClassifier(iterations=n_estimators,
                                       eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                                       **params,
                                       loss_function='Logloss')
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[],
                      use_best_model=True, verbose=False)
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        if averaging == 'usual':
            oof[valid_index] = y_pred_valid.reshape(-1, 1)
            scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
            prediction += y_pred.reshape(-1, 1)
        elif averaging == 'rank':
            oof[valid_index] = y_pred_valid.reshape(-1, 1)
            scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
            prediction += pd.Series(y_pred).rank().values.reshape(-1, 1)

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    prediction /= n_splits
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= n_splits
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index
            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance", y="feature",
                        data=best_features.sort_values(by="importance", ascending=False))
            plt.title('LGB Features (avg over folds)')

            result_dict['feature_importance'] = feature_importance
            result_dict['top_columns'] = cols

    return result_dict
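# A minimal usage sketch of train_model_classification for the CatBoost branch,
# assuming train_df/test_df DataFrames and a binary 'target' column prepared
# elsewhere (these variable names are illustrative, not from the original snippet).
from sklearn.model_selection import KFold

folds = KFold(n_splits=5, shuffle=True, random_state=42)
cat_params = {'learning_rate': 0.05, 'depth': 6, 'l2_leaf_reg': 3}
result = train_model_classification(X=train_df.drop('target', axis=1),
                                    X_test=test_df,
                                    y=train_df['target'],
                                    params=cat_params,
                                    folds=folds,
                                    model_type='cat',
                                    eval_metric='auc',
                                    n_estimators=2000)
oof_predictions = result['oof']          # out-of-fold predictions on the training data
test_predictions = result['prediction']  # fold-averaged predictions on the test data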
all_pred = np.zeros(y_train.shape[0])
for train, test in cv:
    cnt += 1
    trn_x = x_train[train]
    val_x = x_train[test]
    trn_y = y_train[train]
    val_y = y_train[test]
    trn_w = sample_weight[train]
    val_w = sample_weight[test]

    list_idx = df.loc[test].reset_index(drop=True).groupby(
        'order_id').apply(lambda x: x.index.values).tolist()

    clf = CatBoostClassifier(**params)
    clf.fit(
        trn_x, trn_y,
        # sample_weight=trn_w,
        # eval_sample_weight=[val_w],
        eval_set=(val_x, val_y),
        verbose=True,
        # early_stopping_rounds=150
    )
    pred = clf.predict_proba(val_x)[:, 1]
    all_pred[test] = pred

    _score = log_loss(val_y, pred)
    _score2 = -roc_auc_score(val_y, pred)
    _, _score3, _ = f1_metric(val_y.astype(int), pred.astype(float))
desc_corp = [' '.join(x) for x in desc_corp]

# Creating feature arrays
X = train.iloc[:, 2:-1].values
y = train.Is_Response.values
X_test = test.iloc[:, 2::].values

# Train-validation split
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

# A very naive CatBoost model
from catboost import CatBoostClassifier
model = CatBoostClassifier(iterations=500, learning_rate=0.2, depth=3, l2_leaf_reg=20,
                           loss_function='Logloss', use_best_model=True)
cat_features = [0, 1]
model.fit(X_train, y_train, cat_features, eval_set=(X_val, y_val))
preds = model.predict(X_test)
preds_val = model.predict(X_val)

# Drawing a confusion matrix to check model performance
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, preds_val)
model.score(X_val, y_val)

# XGBoost model
from xgboost import XGBClassifier
def test_invalid_loss_classifier():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostClassifier(loss_function="abcdef")
        model.fit(pool)