def fit(self, x, y, cate_index):
    """Train a two-model stack and pick a decision threshold.

    The data is split 80/20 with a fixed seed. A CatBoost classifier and a
    LightGBM classifier are trained on the 80% part, using the 20% part as
    the evaluation set. A logistic regression is then fitted on the two
    models' positive-class probabilities for the training part, and the
    threshold that maximises ``tpr - fpr`` on that same training part is
    stored.

    Args:
        x: Feature data, forwarded to ``train_test_split``.
        y: Target labels, forwarded to ``train_test_split``.
        cate_index: Passed to CatBoost as ``cat_features``.

    Side effects:
        Sets ``self.thres`` (chosen threshold), ``self.m1`` (CatBoost model),
        ``self.m2`` (LightGBM model) and ``self.m3`` (logistic regression).
    """
    x_fit, x_hold, y_fit, y_hold = train_test_split(
        x, y, train_size=0.8, random_state=345
    )

    # --- Base model 1: CatBoost ------------------------------------------
    catboost_params = dict(
        iterations=400,
        od_type='Iter',
        od_wait=50,
        max_depth=5,
        learning_rate=0.1,
        l2_leaf_reg=9,
        random_seed=2019,
        # metric_period=10,
        eval_metric='F1',
        fold_len_multiplier=1.1,
        loss_function='Logloss',
        logging_level='Verbose',
    )
    cat_model = cb.CatBoostClassifier(**catboost_params)
    cat_model.fit(
        x_fit,
        y_fit,
        eval_set=(x_hold, y_hold),
        cat_features=cate_index,
    )

    # --- Base model 2: LightGBM ------------------------------------------
    def self_metric(y_true, y_pred):
        # Custom LightGBM metric: negated F1 at a 0.5 cut-off, reported as
        # "lower is better" (third element False).
        hard_labels = 1 * (y_pred >= 0.5)
        return 'f1', -f1_score(y_true, hard_labels), False

    lightgbm_params = dict(
        random_seed=2019,
        n_jobs=-1,
        objective='binary',
        learning_rate=0.1,
        n_estimators=6000,
        num_leaves=31,
        max_depth=-1,
        min_child_samples=50,
        min_child_weight=9,
        subsample_freq=1,
        subsample=0.7,
        colsample_bytree=0.7,
        reg_alpha=1,
        reg_lambda=5,
    )
    gbm_model = LGBMClassifier(**lightgbm_params)
    gbm_model.fit(
        x_fit,
        y_fit,
        eval_metric=self_metric,
        eval_set=[(x_fit, y_fit), (x_hold, y_hold)],
        early_stopping_rounds=200,
    )
    gbm_model.n_estimators = gbm_model.best_iteration_

    # --- Meta model: logistic regression on the base probabilities --------
    def positive_proba(model):
        # Positive-class probability for every training row, as a 1-D array.
        return model.predict_proba(x_fit)[:, 1]

    # One column per base model (CatBoost first, LightGBM second).
    # NOTE(review): the meta-learner and the threshold are both fitted on
    # predictions for rows the base models were trained on.
    stacked_features = np.column_stack(
        [positive_proba(cat_model), positive_proba(gbm_model)]
    )
    meta_model = LogisticRegression(C=10)
    meta_model.fit(stacked_features, y_fit)
    meta_scores = meta_model.predict_proba(stacked_features)[:, 1]

    # Threshold that maximises (true positive rate - false positive rate).
    fpr, tpr, thresholds = roc_curve(y_fit, meta_scores)
    best_cut = thresholds[np.argmax(tpr - fpr)]

    self.thres = best_cut
    self.m1 = cat_model
    self.m2 = gbm_model
    self.m3 = meta_model
def get_model(PARAMS):
    """Build an unfitted ``LGBMClassifier`` configured from ``PARAMS``.

    All settings are passed through the constructor. Several of them
    (``scale_pos_weight``, ``early_stopping_rounds``, ``max_bin``, ``device``
    and the ``gpu_*`` options) are not named ``LGBMClassifier.__init__``
    parameters. LightGBM only forwards such extras to the booster when they
    arrive through the constructor's ``**kwargs`` or ``set_params``; assigning
    them as plain attributes after construction leaves them out of
    ``get_params()``, so they would be silently ignored during training.

    Args:
        PARAMS: Mapping-like object with a ``.get`` method. The keys read are
            ``"num_leaves"``, ``"max_depth"``, ``"min_child_weight"``,
            ``"subsample"`` and ``"colsample_bytree"``. A missing key yields
            ``None`` for that setting.

    Returns:
        The configured, unfitted ``LGBMClassifier``. Every setting is also
        available as an attribute on the returned model.

    Note:
        Because the GPU and early-stopping settings now actually reach
        LightGBM, fitting presumably needs a GPU-enabled LightGBM build and an
        evaluation set -- confirm against the caller.
    """
    # scale_pos_weight = number of negative samples / number of positive samples
    scale_pos_weight = 577.88

    model = LGBMClassifier(
        # Tunable settings supplied by the caller.
        num_leaves=PARAMS.get("num_leaves"),
        max_depth=PARAMS.get("max_depth"),
        min_child_weight=PARAMS.get("min_child_weight"),
        subsample=PARAMS.get("subsample"),
        colsample_bytree=PARAMS.get("colsample_bytree"),
        # Fixed training setup.
        n_estimators=10000,
        early_stopping_rounds=20,
        # We set this parameter to solve the class imbalance problem.
        scale_pos_weight=scale_pos_weight,
        objective="binary",
        subsample_freq=1,
        random_state=42,
        n_jobs=-1,
        # GPU training options.
        max_bin=63,
        device="gpu",
        gpu_use_dp=False,
        gpu_platform_id=0,
        gpu_device_id=0,
    )
    return model
eval_set = [(train_x, train_y), (val_x, val_y)], eval_names = ['train', 'val'], eval_metric = lgb_f1, early_stopping_rounds = 100, verbose = 10, ) print('best score', lgb.best_score_) # ============================================================== # 使用全部的 train data 和 调好迭代轮数训练模型,并用 test data 做预测 # ============================================================== print("=" * 25) print('predicting') lgb.n_estimators = lgb.best_iteration_ lgb.fit(all_train_x, all_train_y) test_y = lgb.predict(test_x) # ============================================================== # 创建submission.csv文件 # ============================================================== print("=" * 25) print("submission file") print("=" * 25) df_sub = pd.concat([df_test['sid'], pd.Series(test_y)], axis = 1) df_sub.columns = ['sid', 'label'] df_sub.to_csv('/Users/zfwang/project/mlproj/projects/move_ad_fraud/submission_file/submit-{}.csv' \ .format(datetime.now().strftime('%m%d_%H%M%S')), sep = ',', index = False)