Example #1
    # Requires (module level): numpy as np, catboost as cb,
    # lightgbm.LGBMClassifier, and sklearn's train_test_split,
    # LogisticRegression, f1_score, roc_curve
    def fit(self, x, y, cate_index):
        # Hold out 20% of the data for evaluation and early stopping
        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=345)

        # CatBoost base model with iteration-based overfitting detection
        cls = cb.CatBoostClassifier(
            iterations=400,
            od_type='Iter',
            od_wait=50,
            max_depth=5,
            learning_rate=0.1,
            l2_leaf_reg=9,
            random_seed=2019,
            # metric_period=10,
            eval_metric='F1',
            fold_len_multiplier=1.1,
            loss_function='Logloss',
            logging_level='Verbose')
        cls.fit(x_train, y_train, eval_set=(x_test, y_test), cat_features=cate_index)

        def self_metric(y_true, y_pred):
            # Custom F1 eval for LightGBM: threshold probabilities at 0.5.
            # Negating the score and returning is_higher_better=False is
            # equivalent to maximizing F1.
            score = -f1_score(y_true, 1 * (y_pred >= 0.5))
            return 'f1', score, False

        lg = LGBMClassifier(random_state=2019, n_jobs=-1, objective='binary',
                            learning_rate=0.1, n_estimators=6000, num_leaves=31, max_depth=-1,
                            min_child_samples=50, min_child_weight=9, subsample_freq=1,
                            subsample=0.7, colsample_bytree=0.7,
                            reg_alpha=1, reg_lambda=5)
        lg.fit(x_train, y_train, eval_metric=self_metric, eval_set=[(x_train, y_train), (x_test, y_test)],
               early_stopping_rounds=200)
        # Freeze the tree count at the early-stopped optimum for any later refit
        lg.n_estimators = lg.best_iteration_

        # Stack the two base models: their predicted probabilities become
        # the features for a logistic-regression meta-learner
        train_prob1 = cls.predict_proba(x_train)[:, 1].reshape(-1, 1)
        train_prob2 = lg.predict_proba(x_train)[:, 1].reshape(-1, 1)
        train_prob = np.hstack([train_prob1, train_prob2])
        lr = LogisticRegression(C=10)
        lr.fit(train_prob, y_train)

        # Pick the decision threshold that maximizes Youden's J statistic
        # (tpr - fpr) on the training ROC curve
        train_prob_lr = lr.predict_proba(train_prob)[:, 1]
        fpr, tpr, thresholds = roc_curve(y_train, train_prob_lr)
        thre_index = (tpr - fpr).argmax()
        self.thres = thresholds[thre_index]

        # Keep the fitted base models and the meta-learner
        self.m1 = cls
        self.m2 = lg
        self.m3 = lr
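
The fit method stores the fitted models and the tuned threshold, but the matching prediction step is not shown. A minimal sketch of what it could look like, assuming the attribute names above (the predict method itself is hypothetical):

    def predict(self, x):
        # Hypothetical companion to fit(): stack the base-model probabilities
        # exactly as they were stacked at training time
        prob1 = self.m1.predict_proba(x)[:, 1].reshape(-1, 1)
        prob2 = self.m2.predict_proba(x)[:, 1].reshape(-1, 1)
        prob = self.m3.predict_proba(np.hstack([prob1, prob2]))[:, 1]
        # Apply the Youden's J threshold chosen during fit()
        return (prob >= self.thres).astype(int)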

Example #2
def get_model(PARAMS):
    """Get a LightGBM model configured according to PARAMS."""
    scale_pos_weight = 577.88  # ratio of negative to positive samples in the training set
    model = LGBMClassifier()
    model.num_leaves = PARAMS.get("num_leaves")
    model.max_depth = PARAMS.get("max_depth")
    model.n_estimators = 10000
    model.early_stopping_rounds = 20  # only takes effect if fit() receives an eval_set
    model.scale_pos_weight = scale_pos_weight  # up-weight positives to counter the class imbalance
    model.objective = "binary"
    model.min_child_weight = PARAMS.get("min_child_weight")
    model.subsample = PARAMS.get("subsample")
    model.subsample_freq = 1
    model.colsample_bytree = PARAMS.get("colsample_bytree")
    model.random_state = 42
    model.n_jobs = -1
    model.max_bin = 63  # a small bin count speeds up GPU histogram construction
    model.device = "gpu"
    model.gpu_use_dp = False
    model.gpu_platform_id = 0
    model.gpu_device_id = 0

    return model
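
A usage sketch for get_model (the PARAMS values and data names are illustrative, not from the source; with the scikit-learn wrapper, early stopping only engages when an eval_set is passed to fit):

PARAMS = {"num_leaves": 63, "max_depth": -1, "min_child_weight": 5,
          "subsample": 0.8, "colsample_bytree": 0.8}
model = get_model(PARAMS)
# Illustrative data names; early stopping needs a validation set at fit time
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=20)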
Example #3
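The fit call below uses a custom lgb_f1 eval metric whose definition is not included in the snippet. A minimal sketch, consistent with self_metric from Example #1 (the actual definition may differ):

# Hypothetical lgb_f1, mirroring the custom F1 metric in Example #1
from sklearn.metrics import f1_score

def lgb_f1(y_true, y_pred):
    # LightGBM custom eval metric: return (name, value, is_higher_better)
    return 'f1', f1_score(y_true, (y_pred >= 0.5).astype(int)), True
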
lgb.fit(
    train_x, train_y,
    eval_set=[(train_x, train_y), (val_x, val_y)],
    eval_names=['train', 'val'],
    eval_metric=lgb_f1,
    early_stopping_rounds=100,
    verbose=10,
)

print('best score', lgb.best_score_)

# ==============================================================
# Train on the full train data with the tuned number of iterations,
# then predict on the test data
# ==============================================================
print("=" * 25)
print('predicting')

# Refit on the full training data at the early-stopped tree count
lgb.n_estimators = lgb.best_iteration_
lgb.fit(all_train_x, all_train_y)
test_y = lgb.predict(test_x)


# ==============================================================
# Create the submission.csv file
# ==============================================================
print("=" * 25)
print("submission file")
print("=" * 25)

# Align sid and the predictions positionally, regardless of df_test's index
df_sub = pd.concat([df_test['sid'].reset_index(drop=True), pd.Series(test_y)], axis=1)
df_sub.columns = ['sid', 'label']
df_sub.to_csv('/Users/zfwang/project/mlproj/projects/move_ad_fraud/submission_file/submit-{}.csv'
              .format(datetime.now().strftime('%m%d_%H%M%S')), sep=',', index=False)