Esempio n. 1
0
def gbdt_lr_train(libsvmFileName):
    """Compare GBDT, LR and GBDT+LR classifiers on a libsvm-format dataset.

    The file is loaded, 30% of the rows are held out as a test set
    (``random_state=42``), and the test AUC of four models is printed:

    1. the gradient-boosted trees themselves,
    2. logistic regression on the raw features,
    3. logistic regression on the one-hot encoded GBDT leaf indices,
    4. logistic regression on the leaf encoding stacked with the raw features.

    Args:
        libsvmFileName: Path handed to ``load_svmlight_file``.

    Returns:
        None. Results are reported through ``print``.
    """
    # Load the samples.
    X_all, y_all = load_svmlight_file(libsvmFileName)

    # Train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.3, random_state=42)

    # Define and train the GBDT model.
    gbdt = GradientBoostingClassifier(
        n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)

    # Score the GBDT on the held-out rows. The sparse matrix is passed
    # directly (as it already is for fit/apply) instead of densifying it.
    y_pred_gbdt = gbdt.predict_proba(X_test)[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # Baseline: LR trained on the raw features.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)

    # Encode every sample as the index of the leaf it reaches in each tree.
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # One-hot encode the leaf indices. The encoder is fitted on the training
    # rows only, so no held-out data takes part in fitting; with
    # handle_unknown='ignore' a leaf index that was not seen during fitting
    # is encoded as all zeros instead of raising.
    gbdtenc = OneHotEncoder(handle_unknown='ignore')
    X_train_enc = gbdtenc.fit_transform(X_train_leaves)
    X_test_enc = gbdtenc.transform(X_test_leaves)

    # LR trained on the GBDT leaf encoding alone.
    lr = LogisticRegression()
    lr.fit(X_train_enc, y_train)
    y_pred_gbdtlr1 = lr.predict_proba(X_test_enc)[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)

    # LR trained on the leaf encoding stacked with the raw features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_train_enc, X_train])
    X_test_ext = hstack([X_test_enc, X_test])

    print(X_train_ext.shape)
    lr.fit(X_train_ext, y_train)

    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
Esempio n. 2
0
 def gbdt_lr_train(self, Train_tab, Train_libsvm):
     """Train a GBDT+LR model on ``sample_libsvm_data.txt`` and print scores.

     A GBDT is fitted on 90% of the file, its leaf indices are one-hot
     encoded and stacked with the raw features, and a logistic regression is
     trained on the result. The fitted LR is pickled to
     ``finalized_model.sav``, read back, and the reloaded model's positive
     class probabilities for the 10% held-out rows are printed.

     Args:
         Train_tab: Accepted but not used by this method.
         Train_libsvm: Accepted but not used by this method.
     """
     # Load the samples.
     X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")
     # Train/test split.
     X_train, X_test, y_train, y_test = train_test_split(
         X_all, y_all, test_size=0.1, random_state=42)
     # Define and train the GBDT model.
     gbdt = GradientBoostingClassifier(
         n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
     gbdt.fit(X_train, y_train)
     # Encode every sample as the index of the leaf it reaches in each tree.
     X_train_leaves = gbdt.apply(X_train)[:, :, 0]
     X_test_leaves = gbdt.apply(X_test)[:, :, 0]
     # One-hot encode the leaf indices. The encoder is fitted on the training
     # rows only; with handle_unknown='ignore' a leaf index not seen during
     # fitting is encoded as all zeros instead of raising.
     gbdtenc = OneHotEncoder(handle_unknown='ignore')
     X_train_enc = gbdtenc.fit_transform(X_train_leaves)
     X_test_enc = gbdtenc.transform(X_test_leaves)
     # Stack the leaf encoding with the raw features and train the LR.
     lr = LogisticRegression(n_jobs=-1)
     X_train_ext = hstack([X_train_enc, X_train])
     X_test_ext = hstack([X_test_enc, X_test])
     lr.fit(X_train_ext, y_train)
     # Persist the model. The context managers guarantee the file is flushed
     # and closed before it is read back, on every Python implementation.
     filename = 'finalized_model.sav'
     with open(filename, 'wb') as model_file:
         pickle.dump(lr, model_file)
     # Load the model from disk. NOTE: pickle must only be used on trusted
     # files; here the file was written a few lines above.
     with open(filename, 'rb') as model_file:
         loaded_model = pickle.load(model_file)
     y_pred_gbdtlr2 = loaded_model.predict_proba(X_test_ext)[:, 1]
     print(y_pred_gbdtlr2)
def gbdt_lr_train():
    """Cross-validate GBDT, LR and GBDT+LR models and print their mean AUC.

    ``X`` and ``y`` are not parameters; they are read from the enclosing
    module scope and are indexed with the integer index arrays produced by
    ``StratifiedKFold.split``.

    For each of 5 stratified folds (shuffled, ``random_state=2019``) the
    validation AUC is printed for: the GBDT, LR on the raw features, LR on
    the one-hot encoded GBDT leaf indices, and LR on the leaf encoding
    stacked with the raw features. The per-model means over the folds are
    printed at the end.

    Returns:
        None. Results are reported through ``print``.
    """
    cv_lr_scores = []
    cv_lr_trans_scores = []
    cv_lr_trans_raw_scores = []
    cv_gbdt_scores = []

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        # Define, train and score the GBDT model.
        gbdt = GradientBoostingClassifier(
            n_estimators=60, max_depth=3, verbose=0, max_features=0.5)
        gbdt.fit(X_train, y_train)
        y_pred_gbdt = gbdt.predict_proba(X_valid)[:, 1]
        gbdt_auc = roc_auc_score(y_valid, y_pred_gbdt)
        print('基于原有特征的gbdt auc: %.5f' % gbdt_auc)
        cv_gbdt_scores.append(gbdt_auc)

        # Baseline: LR trained on the raw features.
        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        y_pred_test = lr.predict_proba(X_valid)[:, 1]
        lr_valid_auc = roc_auc_score(y_valid, y_pred_test)
        print('基于原有特征的LR AUC: %.5f' % lr_valid_auc)
        cv_lr_scores.append(lr_valid_auc)

        # Encode every sample as the leaf index it reaches in each tree.
        X_train_leaves = gbdt.apply(X_train)[:, :, 0]
        X_valid_leaves = gbdt.apply(X_valid)[:, :, 0]

        # One-hot encode the leaf indices. The encoder is fitted on the
        # fold's training rows only, so validation rows take no part in
        # fitting; with handle_unknown='ignore' a leaf index not seen during
        # fitting is encoded as all zeros instead of raising.
        gbdtenc = OneHotEncoder(handle_unknown='ignore')
        X_train_enc = gbdtenc.fit_transform(X_train_leaves)
        X_valid_enc = gbdtenc.transform(X_valid_leaves)

        # LR trained on the GBDT leaf encoding alone.
        lr = LogisticRegression()
        lr.fit(X_train_enc, y_train)
        y_pred_gbdtlr1 = lr.predict_proba(X_valid_enc)[:, 1]
        gbdt_lr_auc1 = roc_auc_score(y_valid, y_pred_gbdtlr1)
        print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)
        cv_lr_trans_scores.append(gbdt_lr_auc1)

        # LR trained on the leaf encoding stacked with the raw features.
        lr = LogisticRegression(n_jobs=-1)
        X_train_ext = hstack([X_train_enc, X_train])
        X_valid_ext = hstack([X_valid_enc, X_valid])

        print(X_train_ext.shape)
        lr.fit(X_train_ext, y_train)

        y_pred_gbdtlr2 = lr.predict_proba(X_valid_ext)[:, 1]
        gbdt_lr_auc2 = roc_auc_score(y_valid, y_pred_gbdtlr2)
        print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
        cv_lr_trans_raw_scores.append(gbdt_lr_auc2)

    # Average each model's AUC over the folds.
    cv_lr = np.mean(cv_lr_scores)
    cv_lr_trans = np.mean(cv_lr_trans_scores)
    cv_lr_trans_raw = np.mean(cv_lr_trans_raw_scores)
    cv_gbdt = np.mean(cv_gbdt_scores)
    print("==" * 20)
    print("gbdt原始特征cv_gbdt:", cv_gbdt)
    print("lr原始特征cv_lr:", cv_lr)
    print("lr基于gbdt的特征cv_lr_trans:", cv_lr_trans)
    print("lr基于gbdt特征个原始特征cv_lr_trans_raw:", cv_lr_trans_raw)
Esempio n. 4
0
def gbdt_lr_train(train, test, gbdt_features, lr_features, target, name,
                  isOnline, online_eval_rows=57562):
    """Train GBDT and GBDT+LR models on DataFrames and report log loss.

    A GBDT is fitted on ``train[gbdt_features]``; its leaf indices are
    one-hot encoded and used, alone and stacked with ``lr_features``, as
    input to logistic regression.

    Offline (``isOnline`` false): the log loss of the GBDT and of both LR
    models on ``test`` is printed.

    Online (``isOnline`` true): the GBDT log loss is printed for the last
    ``online_eval_rows`` rows of ``train``, and the stacked model's
    positive-class probabilities for ``test`` are written to
    ``'../baseline_' + name + '.csv'``.

    Args:
        train: Training table; indexed by ``gbdt_features``, ``lr_features``
            and ``target``.
        test: Test table. In online mode a ``predicted_score`` column is
            added to it in place and ``instance_id`` is read from it.
        gbdt_features: Column selection fed to the GBDT.
        lr_features: Column selection stacked next to the leaf encoding.
        target: Name of the label column.
        name: Suffix used in the output CSV file name (online mode only).
        isOnline: Boolean flag selecting online or offline behaviour.
        online_eval_rows: Number of trailing ``train`` rows used for the
            GBDT log-loss printout in online mode. Defaults to the
            previously hard-coded 57562.

    Returns:
        None.
    """
    offline = not isOnline

    # Define and train the GBDT model.
    gbdt = GradientBoostingClassifier(
        n_estimators=20, max_depth=3, verbose=0, max_features=0.3)
    gbdt.fit(train[gbdt_features], train[target])

    # GBDT log loss: on the test table offline, on the tail of the training
    # table online.
    if offline:
        y_pred_gbdt = gbdt.predict_proba(test[gbdt_features])[:, 1]
        gbdt_test_log_loss = log_loss(test[target], y_pred_gbdt)
    else:
        train_tail = train[gbdt_features].tail(online_eval_rows)
        y_pred_gbdt = gbdt.predict_proba(train_tail)[:, 1]
        gbdt_test_log_loss = log_loss(
            train[target].tail(online_eval_rows), y_pred_gbdt)
    print('gbdt log_loss: %.5f' % gbdt_test_log_loss)

    # Encode every sample as the index of the leaf it reaches in each tree.
    X_train_leaves = gbdt.apply(train[gbdt_features])[:, :, 0]
    X_test_leaves = gbdt.apply(test[gbdt_features])[:, :, 0]

    # One-hot encode the leaf indices. The encoder is fitted on the training
    # rows only; with handle_unknown='ignore' a leaf index not seen during
    # fitting is encoded as all zeros instead of raising.
    gbdtenc = OneHotEncoder(handle_unknown='ignore')
    X_train_enc = gbdtenc.fit_transform(X_train_leaves)
    X_test_enc = gbdtenc.transform(X_test_leaves)

    # LR trained on the GBDT leaf encoding alone.
    lr = LogisticRegression()
    lr.fit(X_train_enc, train[target])

    if offline:
        y_pred_gbdtlr1 = lr.predict_proba(X_test_enc)[:, 1]
        gbdt_lr_test_log_loss1 = log_loss(test[target], y_pred_gbdtlr1)
        print('基于GBDT特征编码后的LR log_loss: %.5f' % gbdt_lr_test_log_loss1)
    else:
        print('Online')

    # LR trained on the leaf encoding stacked with the LR feature columns.
    lr = LogisticRegression()
    X_train_ext = hstack([X_train_enc, train[lr_features]])
    X_test_ext = hstack([X_test_enc, test[lr_features]])

    print("gbdt output", X_train_enc.shape)
    print("input", train[lr_features].shape)
    print(X_train_ext.shape)

    lr.fit(X_train_ext, train[target])

    if offline:
        y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
        gbdt_lr_test_log_loss2 = log_loss(test[target], y_pred_gbdtlr2)
        print('基于组合特征的LR log_loss: %.5f' % gbdt_lr_test_log_loss2)
    else:
        print('Online')

        # Attach the predictions to the caller's table and save them as the
        # online submission file.
        test['predicted_score'] = lr.predict_proba(X_test_ext)[:, 1]
        print(test['predicted_score'].head(5))
        print(len(test))
        test[['instance_id', 'predicted_score']].to_csv(
            '../baseline_' + name + '.csv', index=False, sep=' ')
        print('Saved result success!')
Esempio n. 5
0
def gbdt_lr_train(libsvmFileName):
    """Compare GBDT, LR and GBDT+LR classifiers, printing diagnostic shapes.

    The file is loaded, 30% of the rows are held out as a test set
    (``random_state=42``), and the test AUC is printed for the GBDT, for LR
    on the raw features, for LR on the one-hot encoded GBDT leaf indices,
    and for LR on the leaf encoding stacked with the raw features. Along the
    way the training shape, the leaf-matrix shape, the number of distinct
    leaves per tree, the one-hot shape and the first one-hot row are printed.

    All output goes through single-argument ``print(...)`` calls so the
    function runs, with identical text, on both Python 2 and Python 3.

    Args:
        libsvmFileName: Path handed to ``load_svmlight_file``.

    Returns:
        None.
    """
    # Load the samples.
    X_all, y_all = load_svmlight_file(libsvmFileName)

    # Train/test split.
    X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                        y_all,
                                                        test_size=0.3,
                                                        random_state=42)
    print("train data shape:  %s" % (X_train.shape,))

    # Train the GBDT model.
    gbdt = GradientBoostingClassifier(n_estimators=40,
                                      max_depth=3,
                                      verbose=0,
                                      max_features=0.5)
    gbdt.fit(X_train, y_train)

    # Predict and evaluate AUC.
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # Baseline: LR trained on the raw features.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)

    # Encode every sample as the index of the leaf it reaches in each tree.
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    print("gbdt leaves shape:  %s" % (X_train_leaves.shape,))
    # Report how many distinct leaves the training rows reach in each tree.
    for i in range(len(X_train_leaves[0])):
        print("F%d: %d" % (i, len(np.unique(X_train_leaves[:, i]))))

    # One-hot encode all leaf indices. The encoder's default sparse output is
    # densified explicitly rather than passing ``sparse=False``: that keyword
    # was renamed to ``sparse_output`` in newer scikit-learn releases, while
    # ``.toarray()`` yields the same dense array on old and new versions.
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder(categories='auto')
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0)).toarray()
    print("gbdt oneHot shape:  %s" % (X_trans.shape,))
    print("oneHot leaves:  %s" % (X_trans[0],))

    # LR trained on the GBDT leaf encoding alone.
    lr = LogisticRegression()
    lr.fit(X_trans[:train_rows, :], y_train)
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)

    # LR trained on the leaf encoding stacked with the raw features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print("gbdt leaves cross %s" % (X_train_ext.shape,))
    lr.fit(X_train_ext, y_train)

    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
Esempio n. 6
0
    # 原始结果为:('test:', 0.081939773937662927)
    # Accuracy : 0.6848
    """
    """
    # lr对原始特征样本模型训练
    lr = LogisticRegression()
    lr.fit(X_train, Y_train)  # 预测及AUC评测
    Y_predict_LR = lr.predict_proba(X_test)[:, 1]
    print('test:', log_loss(Y_test, Y_predict_LR))
    print "before Accuracy : %.4f" % metrics.roc_auc_score(Y_test, Y_predict_LR)
    # ('test:', 0.095052240862119497)
    # before Accuracy : 0.5413
    """

    # GBDT编码原有特征
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    # apply方法只有gdbt里面才有
    # xgboost里没有。。

    # 对所有特征进行ont-hot编码
    (train_rows, cols) = X_train_leaves.shape

    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    # print X_trans.shape # (478111, 797)
    """
        # 定义LR模型
        lr = LogisticRegression(n_jobs=1)
        # lr对gbdt特征编码后的样本模型训练
Esempio n. 7
0
     encoder = LabelEncoder()
     data['V2'] = encoder.fit_transform(data['V2'])
     data['V4'] = encoder.fit_transform(data['V4'])
     data['V5'] = encoder.fit_transform(data['V5']) 
# Run data_process on both aggregate tables. Its return value is discarded,
# so it presumably modifies the tables in place — TODO confirm.
data_process(train_agg)
data_process(test_agg)

# Drop intermediate objects that are no longer referenced below.
del a,gp,gp_day_mean,gp_day_var,gp1,gp2,gp3,gp4,index1,l,m1,m2,m3,merge_log,ss,ss2,t1,t2,t3,train_flg
# Use a GBDT to construct new features.
gbdt = GradientBoostingClassifier(loss='exponential',learning_rate=0.12,n_estimators=60, max_depth=3,random_state=42,max_features=None)
# Features are every column except the id (USRID) and the label (FLAG).
X_train=train_agg.drop(['USRID','FLAG'],axis=1)
y_train=train_agg['FLAG']
# Train the model.
gbdt.fit(X_train, y_train)
# Encode the original features as GBDT leaf indices (one column per tree).
X_train_leaves = gbdt.apply(X_train)[:,:,0]
X_test_leaves=gbdt.apply(test_agg.drop('USRID',axis=1))[:,:,0]
(train_rows, cols) = X_train_leaves.shape
# One-hot encode the leaf indices of the train and test rows together;
# the first train_rows rows of X_trans belong to the training table.
onehot = OneHotEncoder()
X_trans = onehot.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))

# Combine features: one-hot leaf columns followed by the full aggregate
# tables (including USRID, and FLAG for the training table).
X_train_agg = DataFrame(hstack([X_trans[:train_rows, :], train_agg]).toarray())
X_test_agg = DataFrame(hstack([X_trans[train_rows:, :], test_agg]).toarray())
# NOTE(review): the positional labels 494/495 are hard-coded; this assumes
# X_trans has exactly 494 columns and that USRID (then FLAG) are the first
# columns of the aggregate tables — verify against the data.
# NOTE(review): only the training table has a FLAG column at position 495,
# so the remaining positional column labels may refer to different features
# in the two frames — TODO confirm downstream handling.
X_train_agg.rename(columns={494: "USRID",495:"FLAG"},inplace=True)
X_test_agg.rename(columns={494: "USRID"},inplace=True)

# Training set and test set.

# Left-join the log tables onto the combined features by USRID.
train_data=pd.merge(X_train_agg,train_log,on='USRID',how='left')
test_data=pd.merge(X_test_agg,test_log,on='USRID',how='left')
Esempio n. 8
0
                                                    test_size=0.25,
                                                    random_state=0)

# Baseline: train directly on the original features, without generating
# any new ones.
clf = GradientBoostingClassifier(n_estimators=50)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
# NOTE(review): "featrues" in the printed label below is a typo for
# "features"; it is runtime output and is left unchanged here.
print("Original featrues")
print("GBDT_ACC: {:.6f}".format(acc))
print("GBDT_AUC: {:.6f}".format(auc))

# Generate the new features: apply() returns, for every sample, the index of
# the leaf node it lands in for each tree.
X_train_leaves = clf.apply(X_train)[:, :, 0]
X_test_leaves = clf.apply(X_test)[:, :, 0]

# Stack X_train_leaves and X_test_leaves along axis 0 before one-hot
# encoding them.
All_leaves = np.r_[X_train_leaves, X_test_leaves]

# The columns of the index matrix are not 0/1 binary features, so they are
# one-hot encoded.
enc = OneHotEncoder(categories='auto')
new_features = enc.fit_transform(All_leaves)

# Split the new features back into train and test parts by row position.
train_samples = X_train.shape[0]
X_train_new = new_features[:train_samples, :]
X_test_new = new_features[train_samples:, :]

# Combine the original training set with the GBDT-generated features, then
# train LR.
Esempio n. 9
0
class Predict():
    """GBDT + logistic-regression model trained on data embedded in the class.

    Constructing an instance immediately trains the model: a GBDT is fitted
    on 90% of the embedded samples, its leaf indices are one-hot encoded and
    stacked with the raw features, and a logistic regression is fitted on the
    result. ``Predict`` then scores new samples with the same fitted GBDT,
    encoder and logistic regression.
    """

    def __init__(self):
        """Create the estimators and train them on the embedded data."""
        self.gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
        self.lr = LogisticRegression(n_jobs=-1)
        # Labels (0/1), one per row of Train_libsvm.
        Train_tab = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     0,
                     0, 0, 0, 0,
                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     0,
                     0, 0, 0, 0,
                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
                     0,
                     0, 0, 0, 0,
                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     0,
                     0, 0, 0, 0,
                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        # Feature rows; each sample has six integer features.
        Train_libsvm = [[1, 1, 1, 1, 1, 1], [2, 2, 2, 1, 2, 2], [1, 1, 1, 1, 3, 1], [2, 2, 2, 1, 4, 1],
                        [3, 3, 2, 1, 5, 2],
                        [2, 2, 2, 1, 6, 1], [4, 4, 3, 1, 6, 2], [5, 5, 3, 1, 7, 2], [2, 2, 2, 1, 8, 1],
                        [2, 2, 2, 1, 6, 1],
                        [2, 2, 2, 1, 9, 2], [6, 6, 2, 1, 8, 3], [1, 1, 1, 1, 10, 1], [2, 2, 2, 1, 4, 2],
                        [2, 2, 2, 1, 4, 1],
                        [2, 2, 2, 1, 10, 2], [1, 1, 1, 1, 8, 1], [1, 1, 1, 1, 11, 1], [2, 2, 2, 1, 12, 1],
                        [2, 2, 2, 1, 2, 1],
                        [5, 5, 3, 1, 13, 2], [2, 2, 2, 1, 14, 1], [7, 7, 2, 1, 15, 2], [1, 1, 1, 1, 16, 1],
                        [1, 1, 1, 1, 8, 1],
                        [1, 1, 1, 1, 17, 1], [5, 5, 3, 1, 18, 2], [2, 2, 2, 1, 19, 2], [1, 1, 1, 1, 2, 1],
                        [2, 2, 2, 1, 20, 1],
                        [2, 2, 2, 1, 10, 1], [2, 2, 2, 1, 14, 2], [5, 5, 3, 1, 15, 2], [5, 5, 3, 1, 21, 2],
                        [2, 2, 2, 1, 21, 1],
                        [1, 1, 1, 1, 22, 1], [6, 6, 2, 1, 5, 2], [2, 2, 2, 1, 1, 2], [8, 8, 2, 1, 15, 3],
                        [4, 4, 3, 1, 23, 2],
                        [9, 9, 2, 2, 6, 2], [1, 1, 1, 1, 21, 1], [2, 2, 2, 1, 10, 2], [5, 5, 3, 1, 24, 2],
                        [2, 2, 2, 1, 20, 1],
                        [2, 2, 2, 1, 8, 1], [5, 5, 3, 1, 2, 2], [6, 6, 2, 1, 3, 3], [1, 1, 1, 1, 19, 1],
                        [2, 2, 2, 1, 12, 2],
                        [2, 2, 2, 1, 25, 1], [1, 1, 1, 1, 2, 1], [4, 4, 3, 1, 11, 2], [2, 2, 2, 1, 10, 1],
                        [1, 1, 1, 1, 21, 1],
                        [2, 2, 2, 1, 14, 2], [1, 1, 1, 1, 19, 1], [2, 2, 2, 1, 14, 1], [2, 2, 2, 1, 9, 1],
                        [2, 2, 2, 1, 20, 2],
                        [2, 2, 2, 1, 4, 2], [1, 1, 1, 1, 4, 1], [2, 2, 2, 1, 26, 1], [2, 2, 2, 1, 14, 1],
                        [2, 2, 2, 1, 4, 2],
                        [2, 2, 2, 1, 23, 1], [5, 5, 3, 1, 13, 2], [3, 3, 2, 1, 22, 2], [2, 2, 2, 1, 11, 2],
                        [2, 2, 2, 1, 1, 2],
                        [2, 2, 2, 1, 9, 1], [1, 1, 1, 1, 9, 1], [2, 2, 2, 1, 12, 2], [2, 2, 2, 1, 20, 1],
                        [2, 2, 2, 1, 1, 2],
                        [1, 1, 1, 1, 14, 1], [10, 10, 2, 1, 23, 3], [5, 5, 3, 1, 21, 2], [1, 1, 1, 1, 1, 1],
                        [2, 2, 2, 1, 19, 2],
                        [1, 1, 1, 1, 23, 1], [2, 2, 2, 1, 20, 1], [1, 1, 1, 1, 14, 1], [4, 4, 3, 1, 11, 2],
                        [2, 2, 2, 1, 19, 1],
                        [5, 5, 3, 1, 19, 2], [2, 2, 2, 1, 1, 2], [2, 2, 2, 1, 14, 1], [11, 11, 2, 1, 10, 1],
                        [2, 2, 2, 1, 14, 2],
                        [1, 1, 1, 1, 22, 1], [9, 9, 2, 2, 27, 2], [4, 4, 3, 1, 1, 2], [4, 4, 3, 1, 12, 2],
                        [2, 2, 2, 1, 6, 1],
                        [4, 4, 3, 1, 8, 2], [1, 1, 1, 1, 16, 1], [1, 1, 1, 1, 28, 1], [2, 2, 2, 1, 15, 2],
                        [1, 1, 1, 1, 3, 1],
                        [2, 2, 2, 1, 14, 1], [1, 1, 1, 1, 21, 1], [2, 2, 2, 1, 24, 2], [2, 2, 2, 1, 23, 1],
                        [2, 2, 2, 1, 8, 1],
                        [2, 2, 2, 1, 21, 2], [6, 6, 2, 1, 6, 2], [1, 1, 1, 1, 2, 1], [2, 2, 2, 1, 12, 1],
                        [5, 5, 3, 1, 23, 2],
                        [1, 1, 1, 1, 29, 1], [1, 1, 1, 1, 8, 1], [4, 4, 3, 1, 2, 2], [1, 1, 1, 1, 8, 1],
                        [1, 1, 1, 1, 30, 1],
                        [2, 2, 2, 1, 8, 1], [1, 1, 1, 1, 8, 1], [4, 4, 3, 1, 23, 2], [5, 5, 3, 1, 9, 2],
                        [4, 4, 3, 1, 1, 2],
                        [9, 9, 2, 2, 19, 2], [1, 1, 1, 1, 11, 1], [2, 2, 2, 1, 1, 2], [10, 10, 2, 1, 30, 1],
                        [9, 9, 2, 2, 24, 2],
                        [5, 5, 3, 1, 14, 2], [2, 2, 2, 1, 4, 1], [2, 2, 2, 1, 22, 2], [2, 2, 2, 1, 26, 1],
                        [2, 2, 2, 1, 14, 1],
                        [2, 2, 2, 1, 1, 1], [4, 4, 3, 1, 2, 2], [3, 3, 2, 1, 29, 2], [2, 2, 2, 1, 6, 2],
                        [2, 2, 2, 1, 9, 2],
                        [2, 2, 2, 1, 16, 2], [5, 5, 3, 1, 13, 2], [13, 13, 2, 1, 3, 2], [2, 2, 2, 1, 27, 1],
                        [2, 2, 2, 1, 1, 2],
                        [2, 2, 2, 1, 4, 1], [2, 2, 2, 1, 1, 2], [2, 2, 2, 1, 29, 2], [3, 3, 2, 1, 12, 2],
                        [2, 2, 2, 1, 2, 2],
                        [2, 2, 2, 1, 5, 1], [5, 5, 3, 1, 28, 2], [6, 6, 2, 1, 22, 3], [1, 1, 1, 1, 5, 1],
                        [1, 1, 1, 1, 2, 1],
                        [2, 2, 2, 1, 21, 2], [2, 2, 2, 1, 1, 1], [2, 2, 2, 1, 19, 1], [2, 2, 2, 1, 4, 1],
                        [4, 4, 3, 1, 11, 2],
                        [2, 2, 2, 1, 4, 2], [5, 5, 3, 1, 18, 2], [2, 2, 2, 1, 18, 1], [1, 1, 1, 1, 23, 1],
                        [9, 9, 2, 2, 25, 2],
                        [2, 2, 2, 1, 1, 2], [2, 2, 2, 1, 5, 1], [10, 10, 2, 1, 2, 3], [2, 2, 2, 1, 9, 2],
                        [2, 2, 2, 1, 14, 2],
                        [1, 1, 1, 1, 26, 1], [1, 1, 1, 1, 3, 1], [14, 14, 2, 1, 23, 2], [4, 4, 3, 1, 2, 2],
                        [2, 2, 2, 1, 23, 2]]
        self.gbdt_lr_train(Train_tab, Train_libsvm)

    def gbdt_lr_train(self, Train_tab, Train_libsvm):
        """Fit the GBDT, the leaf one-hot encoder and the LR on the given data.

        Args:
            Train_tab: Sequence of labels, one per sample.
            Train_libsvm: Sequence of feature rows, aligned with ``Train_tab``.

        Side effects:
            Fits ``self.gbdt`` and ``self.lr``; stores the fitted encoder in
            ``self.gbdtenc``, the training leaf matrix in
            ``self.X_train_leaves`` and its row count in ``self.train_rows``.
        """
        # Keep the same 90% training subset as before; the held-out 10% is
        # not needed for fitting.
        X_train, _, y_train, _ = train_test_split(Train_libsvm, Train_tab, test_size=0.1, random_state=42)
        # Train the GBDT model.
        self.gbdt.fit(X_train, y_train)
        # Encode every training sample as the leaf index it reaches per tree.
        self.X_train_leaves = self.gbdt.apply(X_train)[:, :, 0]
        (self.train_rows, cols) = self.X_train_leaves.shape
        # Fit the one-hot encoder once and keep it, so prediction applies
        # exactly the column layout the LR was trained on. With
        # handle_unknown='ignore' a leaf index not seen during fitting is
        # encoded as all zeros instead of raising.
        self.gbdtenc = OneHotEncoder(handle_unknown='ignore')
        X_train_enc = self.gbdtenc.fit_transform(self.X_train_leaves)
        X_train_ext = hstack([X_train_enc, X_train])
        # Train the LR on the leaf encoding stacked with the raw features.
        self.lr.fit(X_train_ext, y_train)

    def Predict(self, X_test):
        """Return the positive-class probability for each row of ``X_test``.

        Args:
            X_test: Feature rows laid out like the training rows.

        Returns:
            A list with one probability per input row.
        """
        X_test_leaves = self.gbdt.apply(X_test)[:, :, 0]
        # Reuse the encoder fitted during training instead of re-fitting a
        # new one on every call.
        X_test_enc = self.gbdtenc.transform(X_test_leaves)
        X_test_ext = hstack([X_test_enc, X_test])
        y_pred_gbdtlr2 = self.lr.predict_proba(X_test_ext)[:, 1]
        return list(y_pred_gbdtlr2)
Esempio n. 10
0
def gbdt_lr_train(libsvmFileName):
    """Compare GBDT, LR and GBDT+LR classifiers, printing debug information.

    The file is loaded, 30% of the rows are held out as a test set
    (``random_state=42``), and the test AUC is printed for the GBDT, for LR
    on the raw features, for LR on the one-hot encoded GBDT leaf indices,
    and for LR on the leaf encoding stacked with the raw features. The types
    and shapes of several intermediate objects are printed along the way.

    Side effect: NumPy's global print options are changed
    (``linewidth=400``, ``threshold=np.inf``) and are not restored.

    Args:
        libsvmFileName: Path handed to ``load_svmlight_file``.

    Returns:
        None.
    """
    # Load the samples.
    X_all, y_all = load_svmlight_file(libsvmFileName)
    print(type(X_all))

    # Train/test split.
    X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                        y_all,
                                                        test_size=0.3,
                                                        random_state=42)

    # Define and train the GBDT model.
    gbdt = GradientBoostingClassifier(n_estimators=40,
                                      max_depth=3,
                                      verbose=0,
                                      max_features=0.5)
    gbdt.fit(X_train, y_train)

    # Predict and evaluate AUC. The probabilities are computed once; an
    # earlier duplicate call whose result was discarded has been dropped.
    toarray = X_test.toarray()
    print(type(toarray))
    y_pred_gbdt = gbdt.predict_proba(toarray)[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)  # original author's note: gbdt auc: 0.96455

    # Baseline: LR trained on the raw features.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)  # original author's note: 0.93455

    # Encode every sample as the index of the leaf it reaches in each tree.
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    # Widen NumPy's printing so large arrays are shown in full when debugging.
    np.set_printoptions(linewidth=400)
    np.set_printoptions(threshold=np.inf)
    print(type(X_train_leaves))
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # One-hot encode the leaf indices of the train and test rows together;
    # the first train_rows rows of X_trans belong to the training split.
    (train_rows, cols) = X_train_leaves.shape
    print(train_rows, cols)

    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print(X_trans.shape)

    # LR trained on the GBDT leaf encoding alone.
    lr = LogisticRegression()
    lr.fit(X_trans[:train_rows, :], y_train)
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)

    # LR trained on the leaf encoding stacked with the raw features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print("组合特征的个数:", X_train_ext.shape)
    lr.fit(X_train_ext, y_train)

    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)