Beispiel #1
0
def train_xgb(plot=False):
    """Cross-validate an XGBoost classifier and return out-of-fold predictions.

    Training data comes from
    ``feature_engineering.get_train_data(use_over_sampler=True)``. A fresh
    ``XGBClassifier`` is fitted on each of 5 stratified folds with early
    stopping, the held-out fold is scored with ROC AUC, and the held-out
    probabilities are collected into one out-of-fold (OOF) vector.

    Side effects: prints per-fold and overall AUC, optionally shows a
    feature-importance bar chart per fold, and calls ``cal_roc_curve`` on
    the OOF predictions.

    Args:
        plot: When true, plot the fitted model's feature importances after
            every fold.

    Returns:
        The ``'y_pred'`` column (a pandas Series indexed like ``y``) holding,
        for every training row, the class-1 probability predicted by the
        fold model that did not train on that row.
    """
    X, y = feature_engineering.get_train_data(use_over_sampler=True)
    data = pd.DataFrame(y)

    # The hyper-parameters are the same for every fold, so build them once.
    params = {
        'learning_rate': .05,
        'n_estimators': 2000,
        'max_depth': 8,
        'min_child_weight': 4,
        'gamma': .2,
        'subsample': .8,
        'colsample_bytree': .8,
        'n_jobs': -1,
        'random_state': 0
    }

    scores = []
    # OOF buffer addressed by position; every row is written exactly once
    # because the validation folds partition the data.
    oof_pred = np.full(len(data), np.nan)

    # random_state only has an effect together with shuffle=True; without it
    # scikit-learn >= 0.24 raises ValueError and older releases ignore the
    # seed. Shuffling with a fixed seed keeps the folds reproducible.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        # kf.split yields positional indices, so select rows with .iloc;
        # .loc would only be correct for a default 0..n-1 index.
        X_train, X_val = X.iloc[tdx], X.iloc[vdx]
        y_train, y_val = y.iloc[tdx], y.iloc[vdx]

        # NOTE(review): early_stopping_rounds in fit() and
        # ntree_limit / best_ntree_limit belong to the older XGBoost sklearn
        # API -- confirm they are still supported by the pinned version.
        model = XGBClassifier(**params)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  early_stopping_rounds=50,
                  verbose=False)

        # Plot feature importance for this fold.
        if plot:
            fscores = pd.Series(model.feature_importances_,
                                X_train.columns).sort_values(ascending=False)
            fscores.plot(kind='bar',
                         title='Feature Importance %d' % i,
                         figsize=(20, 10))
            plt.ylabel('Feature Importance Score')
            plt.show()

        # Predict with the trees up to the best early-stopping iteration and
        # keep the probability of class index 1.
        y_pred = model.predict_proba(
            X_val, ntree_limit=model.best_ntree_limit)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        print("AUC score at %d floder: %f" % (i, auc))
        scores.append(auc)
        oof_pred[vdx] = y_pred

    data['y_pred'] = oof_pred
    model_name = XGBClassifier.__name__

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-floder total mean_score:", mean_score)
    print("5-floder oof auc score:", oof)
    print("----train %s finish!----" % model_name)
    cal_roc_curve(data['y'], data['y_pred'], model_name)

    return data['y_pred']
Beispiel #2
0
def train_tree(model):
    """Cross-validate a caller-supplied classifier and return OOF predictions.

    Training data comes from
    ``feature_engineering.get_train_data(use_over_sampler=True)``. The given
    ``model`` is re-fitted in place on each of 5 stratified folds, the
    held-out fold is scored with ROC AUC, and the held-out probabilities are
    collected into one out-of-fold (OOF) vector.

    Side effects: prints per-fold and overall AUC, calls ``cal_roc_curve``
    on the OOF predictions, and leaves ``model`` fitted on the last fold's
    training split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``
            (for example a ``RandomForestClassifier``).

    Returns:
        The ``'y_pred'`` column (a pandas Series indexed like ``y``) holding,
        for every training row, the class-1 probability predicted while that
        row was held out.
    """
    X, y = feature_engineering.get_train_data(use_over_sampler=True)
    data = pd.DataFrame(y)

    scores = []
    # OOF buffer addressed by position; every row is written exactly once
    # because the validation folds partition the data.
    oof_pred = np.full(len(data), np.nan)

    # random_state only has an effect together with shuffle=True; without it
    # scikit-learn >= 0.24 raises ValueError and older releases ignore the
    # seed. Shuffling with a fixed seed keeps the folds reproducible.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        # kf.split yields positional indices, so select rows with .iloc;
        # .loc would only be correct for a default 0..n-1 index.
        X_train, X_val = X.iloc[tdx], X.iloc[vdx]
        y_train, y_val = y.iloc[tdx], y.iloc[vdx]

        # The same estimator object is re-fitted for every fold.
        model.fit(X_train, y_train)

        # Keep the probability of class index 1.
        y_pred = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        print("AUC score at %d floder: %f" % (i, auc))
        scores.append(auc)
        oof_pred[vdx] = y_pred

    data['y_pred'] = oof_pred
    model_name = model.__class__.__name__

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-floder total mean_score:", mean_score)
    print("5-floder oof auc score:", oof)
    print("----train %s finish!----" % model_name)
    cal_roc_curve(data['y'], data['y_pred'], model_name)

    return data['y_pred']
Beispiel #3
0
def train_cat(plot=False):
    """Cross-validate a CatBoost classifier and return out-of-fold predictions.

    Training data comes from
    ``feature_engineering.get_train_data(use_over_sampler=True)``. A fresh
    ``CatBoostClassifier`` is fitted on each of 5 stratified folds with early
    stopping, the held-out fold is scored with ROC AUC, and the held-out
    probabilities are collected into one out-of-fold (OOF) vector.

    Side effects: prints per-fold and overall AUC, optionally shows a
    feature-importance bar chart per fold, and calls ``cal_roc_curve`` on
    the OOF predictions.

    Args:
        plot: When true, plot the fitted model's feature importances after
            every fold.

    Returns:
        The ``'y_pred'`` column (a pandas Series indexed like ``y``) holding,
        for every training row, the class-1 probability predicted by the
        fold model that did not train on that row.
    """
    X, y = feature_engineering.get_train_data(use_over_sampler=True)
    data = pd.DataFrame(y)

    # The hyper-parameters are the same for every fold, so build them once.
    params = {
        'learning_rate': .05,
        'n_estimators': 2000,
        'max_depth': 8,
        'max_bin': 127,
        'reg_lambda': 2,
        'subsample': .7,
        'one_hot_max_size': 2,
        'bootstrap_type': 'Bernoulli',
        'leaf_estimation_method': 'Newton',
        'random_state': 0
    }

    scores = []
    # OOF buffer addressed by position; every row is written exactly once
    # because the validation folds partition the data.
    oof_pred = np.full(len(data), np.nan)

    # random_state only has an effect together with shuffle=True; without it
    # scikit-learn >= 0.24 raises ValueError and older releases ignore the
    # seed. Shuffling with a fixed seed keeps the folds reproducible.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        # kf.split yields positional indices, so select rows with .iloc;
        # .loc would only be correct for a default 0..n-1 index.
        X_train, X_val = X.iloc[tdx], X.iloc[vdx]
        y_train, y_val = y.iloc[tdx], y.iloc[vdx]

        model = CatBoostClassifier().set_params(**params)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  early_stopping_rounds=50,
                  verbose=False)

        # Plot feature importance for this fold.
        if plot:
            fscores = pd.Series(model.feature_importances_,
                                X_train.columns).sort_values(ascending=False)
            fscores.plot(kind='bar',
                         title='Feature Importance %d' % i,
                         figsize=(20, 10))
            plt.ylabel('Feature Importance Score')
            plt.show()

        # Keep the probability of class index 1.
        y_pred = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        print("AUC score at %d floder: %f" % (i, auc))
        scores.append(auc)
        oof_pred[vdx] = y_pred

    data['y_pred'] = oof_pred
    model_name = CatBoostClassifier.__name__

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-floder total mean_score:", mean_score)
    print("5-floder oof auc score:", oof)
    print("----train %s finish!----" % model_name)
    cal_roc_curve(data['y'], data['y_pred'], model_name)

    return data['y_pred']
def model_stacking():
    """Stack three base models with a logistic-regression meta-learner.

    The out-of-fold predictions returned by ``LightGBM.train_lgbm``,
    ``CatBoost.train_cat`` and ``DeepFM.train_DeepFM`` are concatenated
    column-wise and used as the only features of a ``LogisticRegression``,
    which is itself evaluated with 5-fold stratified cross-validation
    against the labels from
    ``feature_engineering.get_train_data(use_over_sampler=True)``.

    Side effects: runs the three base trainers, prints shapes, per-fold and
    overall AUC, and calls ``cal_roc_curve`` on the stacked OOF predictions.

    Returns:
        The ``'y_pred'`` column (a pandas Series indexed like ``y``) holding,
        for every training row, the meta-learner's class-1 probability
        predicted while that row was held out.
    """
    # Only the labels are needed here; the features are the base models'
    # predictions built below.
    _, y = feature_engineering.get_train_data(use_over_sampler=True)
    data = pd.DataFrame(y)

    lgbm = LightGBM.train_lgbm(plot=False)
    print(lgbm.shape)
    cat = CatBoost.train_cat(plot=False)
    print(cat.shape)
    deepFM = DeepFM.train_DeepFM()
    print(deepFM.shape)

    # One column per base model.
    X = pd.concat([lgbm, cat, deepFM], axis=1)
    print(X.shape)

    scores = []
    # OOF buffer addressed by position; every row is written exactly once
    # because the validation folds partition the data.
    oof_pred = np.full(len(data), np.nan)

    # random_state only has an effect together with shuffle=True; without it
    # scikit-learn >= 0.24 raises ValueError and older releases ignore the
    # seed. Shuffling with a fixed seed keeps the folds reproducible.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        # kf.split yields positional indices, so select rows with .iloc;
        # .loc would only be correct for a default 0..n-1 index.
        X_train, X_val = X.iloc[tdx], X.iloc[vdx]
        y_train, y_val = y.iloc[tdx], y.iloc[vdx]

        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        # Keep the probability of class index 1.
        y_pred = clf.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        print("AUC score at %d floder: %f" % (i, auc))
        scores.append(auc)
        oof_pred[vdx] = y_pred

    data['y_pred'] = oof_pred

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-floder total mean_score:", mean_score)
    print("5-floder oof auc score:", oof)
    print("----train %s finish!----" % 'Stacking')
    cal_roc_curve(data['y'], data['y_pred'], 'Stacking')

    return data['y_pred']
Beispiel #5
0
    global best_score, best_param
    if oof >= best_score:
        best_score = oof
        print("update best_score: %f" % best_score)
        best_param = params
        print("update best params: %s" % best_param)
    return oof


# Module-level search state. The scoring code above declares
# `global best_score, best_param` and overwrites both whenever its `oof`
# value is >= the current best_score.
best_score = -9999  # sentinel lower than any score the search compares against
best_param = {}  # parameters that produced best_score
flag = True  # NOTE(review): not used in the visible code -- confirm purpose
# Placeholders for the training data; assigned in the __main__ block below.
X, y = None, None

if __name__ == '__main__':
    # Load the training data into the module-level X and y.
    X, y = feature_engineering.get_train_data(use_over_sampler=True)

    # Tuning ranges: one two-element tuple per hyper-parameter
    # (presumably (lower, upper) bounds -- confirm against the optimizer).
    adj_params = {
        'min_child_samples': (3, 50),
        'feature_fraction': (0.3, 1),
        # 'max_depth': (4, 15),
        'num_leaves': (30, 1300),
        'bagging_fraction': (0.3, 1),
        # 'bagging_freq': (1, 10),
        'lambda_l2': (0.1, 2),
        # 'lambda_l1': (0.1, 1),
        # 'min_data_in_leaf': (1, 150)
    }

    # Run the Bayesian optimization