# Report the score obtained on the validation split.
print("auc: {}".format(
    get_scores(y_valid_catb, y_valid_predict_catb)))

# Predict on the test frame and keep the raw model output in the label column.
predict = model.predict(test_catb)
test_catb[label_column] = predict

# Binarise the label column in place: values >= 0.5 become 1, all others 0.
test_catb[label_column] = test_catb[label_column].map(
    lambda value: 0 if not value >= 0.5 else 1)

# Write the label column to the submission file, then print the number of
# seconds elapsed since start_time.
test_catb[label_column].to_csv('submit_catb.csv')
print("total costs {} seconds".format(time.time() - start_time))

# ngBoost
import ngboost as ng

# Build the NGBoost classifier and fit it on the training split.
model = ng.NGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    verbose=True,
    verbose_eval=200,
)
model.fit(x_train_ngb, y_train_ngb)

# Take column 1 of predict_proba for the validation rows and turn it into
# hard 0/1 labels with a 0.5 cut-off.
# NOTE(review): the score printed below is labelled "auc" but is computed from
# these thresholded labels, not from the probabilities; get_scores is not
# visible here, so confirm this is intended.
y_valid_predict_ngb = pd.Series(
    model.predict_proba(x_valid_ngb)[:, 1]
).map(lambda prob: 0 if not prob >= 0.5 else 1)

# Report the score obtained on the validation split.
print("auc: {}".format(
    get_scores(y_valid_ngb, y_valid_predict_ngb)))

# Alternative kept for reference:
# predict = model.predict(test_ngb)
# Take the probabilities directly and keep the column for label 1.
predict = model.predict_proba(test_ngb)[:, 1]
test_ngb[label_column] = predict
test_ngb[label_column] = test_ngb[label_column].map(lambda x: 1
Exemple #2
0
def main():
    """Fit four classifiers on the attrition data and print validation AUCs.

    Reads ``train.csv`` and ``test.csv`` from the working directory,
    label-encodes the categorical columns and the ``Attrition`` target,
    holds out 30% of the training rows, then fits LightGBM, an RBF-kernel
    SVC, CatBoost and NGBoost in that order.  After each fit the ROC AUC on
    the held-out rows is printed.  Nothing is returned and no file is
    written (the submission code is disabled).
    """
    train_0 = pd.read_csv('train.csv')
    test_0 = pd.read_csv('test.csv')

    # Columns that are label-encoded below.
    categorical_columns = [
        'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
        'MaritalStatus', 'OverTime'
    ]

    # Drop the features that are not used for training.  The ids are kept
    # aside first; only the disabled submission snippet below refers to them.
    user_id = test_0['user_id']
    dropped_columns = ['user_id', 'EmployeeCount', 'Over18']
    train_0 = train_0.drop(dropped_columns, axis=1)
    test_0 = test_0.drop(dropped_columns, axis=1)

    # Feature encoding: one encoder per column, fitted on the training frame
    # and re-used for the same column of the test frame.
    for column in categorical_columns:
        encoder = LabelEncoder()
        train_0[column] = encoder.fit_transform(train_0[column])
        test_0[column] = encoder.transform(test_0[column])

    # Encode the target and remove it from the feature frame.
    label_0 = LabelEncoder().fit_transform(train_0['Attrition'])
    train_0 = train_0.drop(['Attrition'], axis=1)

    # Hold out 30% of the rows; the split is reproducible (random_state=1).
    x_train, x_valid, y_train, y_valid = train_test_split(
        train_0, label_0, test_size=0.3, random_state=1)

    # Standardisation: placeholder only, no scaling is applied.

    # LightGBM tuning grid; only the disabled grid search below refers to it.
    parameters = {
        'max_depth': [15, 20, 25],
        'learning_rate': [0.01, 0.05],
        'feature_fraction': [0.6, 0.7, 0.8, 0.9, 0.95],
        'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 0.95],
        'bagging_freq': [2, 4, 5, 6, 8],
        'lambda_l1': [0.6, 0.7, 0.8],
        'lambda_l2': [0, 15, 35],
        'cat_smooth': [1, 10, 15]
    }

    # Fixed LightGBM settings used for the actual run.
    lgb_settings = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'verbose': 0,
        'learning_rate': 0.01,
        'num_leaves': 35,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.7,
        'bagging_freq': 2,
        'lambda_l1': 0.8,
        'lambda_l2': 0,
        'max_depth': 15,
        'cat_smooth': 1,
    }
    lgb_model = lgb.LGBMClassifier(**lgb_settings)

    # Disabled: grid search over `parameters`.
    # gsearch = GridSearchCV(lgb_model, param_grid=parameters, scoring='roc_auc', cv = 3)
    # gsearch.fit(train_0, label_0)
    #
    # print("Best score: %0.3f" % gsearch.best_score_)
    # print("Best parameters set:")
    # best_parameters = gsearch.best_estimator_.get_params()
    # for param_name in sorted(parameters.keys()):
    #     print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # Disabled: fit on all training rows and write a submission file.
    # lgb_model.fit(train_0, label_0)
    # predict = lgb_model.predict_proba(test_0)[:,1]
    #
    # test_0['Attrition'] = predict
    # test_0['user_id'] = user_id
    # test_0[['user_id','Attrition']].to_csv('submit_lgb.csv', index = False)

    # LightGBM: score column 1 of predict_proba on the held-out rows.
    lgb_model.fit(x_train, y_train)
    lgb_scores = lgb_model.predict_proba(x_valid)[:, 1]
    print("LGB auc:%0.6lf" % metrics.roc_auc_score(y_valid, lgb_scores))

    # SVC with an RBF kernel; probability=True is needed for predict_proba.
    svm_model = SVC(kernel='rbf', probability=True, C=0.2)
    svm_model.fit(x_train, y_train)
    svm_scores = svm_model.predict_proba(x_valid)[:, 1]
    print("SVM auc:%0.6lf" % metrics.roc_auc_score(y_valid, svm_scores))

    # CatBoost with its default settings.
    cat_model = cat.CatBoostClassifier()
    cat_model.fit(x_train, y_train)
    cat_scores = cat_model.predict_proba(x_valid)[:, 1]
    print("cat auc:%0.6lf" % metrics.roc_auc_score(y_valid, cat_scores))

    # NGBoost with its default settings.  The score comes from the predicted
    # distribution: row 1 of its `probs` array is used (presumably the
    # per-sample probability of class 1; confirm against the ngboost docs).
    ngb_model = ng.NGBClassifier()
    ngb_model.fit(x_train, y_train)
    valid_dist = ngb_model.pred_dist(x_valid)
    ngb_scores = valid_dist.probs[1, :]
    print("NG auc:%0.6lf" % metrics.roc_auc_score(y_valid, ngb_scores))