-
Notifications
You must be signed in to change notification settings - Fork 0
/
gbdt_lr_model.py
109 lines (105 loc) · 4.18 KB
/
gbdt_lr_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#! /usr/bin/python
# encoding:utf-8
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics.ranking import roc_auc_score
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.preprocessing.data import OneHotEncoder
from scipy.sparse.construct import hstack
import numpy as np
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
import pickle
import os
import sys
from dj_log import log
class GbdtLrModel():
    """
    GBDT + LR model.

    A gradient boosting classifier is trained on the continuous features;
    the leaf index each sample reaches in every tree is one-hot encoded,
    stacked with the discrete features, and fed to a logistic regression.
    """

    def __init__(self, n_estimators=80, gbdt_learning_rate=1, max_depth=1, random_state=None,
                 recall_rate=0.95):
        """
        Store the hyper-parameters used by buildModel.

        Args:
            n_estimators: number of estimators (trees) in the GBDT
            gbdt_learning_rate: GBDT learning rate
            max_depth: maximum depth of each GBDT tree
            random_state: when set, training with the same data and the
                same parameters gives the same result
            recall_rate: target recall used to pick the decision threshold
        """
        self._n_estimators = n_estimators
        self._gbdt_learning_rate = gbdt_learning_rate
        self._max_depth = max_depth
        self._random_state = random_state
        self._recall_rate = recall_rate

    def setNEstimators(self, n_estimators):
        """
        Set the number of GBDT estimators (used when tuning the
        hyper-parameter during cross validation).
        """
        self._n_estimators = n_estimators

    @staticmethod
    def _selectThresholdIndex(recall, n_thresholds, recall_rate):
        """
        Return the index of the highest threshold whose recall still meets
        recall_rate.

        Args:
            recall: non-increasing recall values as returned by
                precision_recall_curve (one element longer than the
                thresholds array)
            n_thresholds: number of available thresholds
            recall_rate: target recall
        Returns:
            An index valid for thresholds, precision and recall. When the
            target cannot be met at all, index 0 (the lowest threshold, i.e.
            the highest achievable recall) is returned; when every entry
            meets the target, the last real threshold is returned.
        """
        meet_count = int(np.count_nonzero(np.asarray(recall) >= recall_rate))
        # recall has a trailing entry with no matching threshold, and
        # meet_count may be 0: clamp instead of wrapping to -1 or running
        # one past the end of thresholds.
        return min(max(meet_count - 1, 0), n_thresholds - 1)

    def buildModel(self, X_train_d, X_train_c, X_test_d, X_test_c, y_train, y_test):
        """
        Train the combined GBDT + LR model and evaluate it on the test data.

        Args:
            X_train_d: discrete-feature training data
            X_train_c: continuous-feature training data
            X_test_d: discrete-feature test data
            X_test_c: continuous-feature test data
            y_train: training labels {-1, 1}
            y_test: test labels {-1, 1}
        Returns:
            gbc_enc: OneHotEncoder fitted on the GBDT leaf indices
            gbc: the GBDT model
            comb_model: the trained combined (logistic regression) model
            threshold: positive/negative cut-off; Pred_Prob >= threshold is
                positive, Pred_Prob < threshold is negative
            comb_model_auc: AUC of the combined model on the test data
            precision: precision at the chosen threshold
            recall: recall at the chosen threshold
        """
        # random_state=None is the estimator's default, so a single call
        # covers both the seeded and the unseeded case.
        gbc = GradientBoostingClassifier(
            n_estimators=self._n_estimators,
            learning_rate=self._gbdt_learning_rate,
            max_depth=self._max_depth,
            random_state=self._random_state,
        ).fit(X_train_c, y_train)

        # apply() yields (n_samples, n_estimators, n_classes); keep the
        # first (only, for binary problems) class slice.
        X_train_leaves = gbc.apply(X_train_c)[:, :, 0]
        X_test_leaves = gbc.apply(X_test_c)[:, :, 0]
        X_train_rows = X_train_leaves.shape[0]

        # Fit the encoder on train and test leaves together so that every
        # leaf index seen in either set gets a column.
        all_leaves = np.concatenate([X_train_leaves, X_test_leaves], axis=0)
        gbc_enc = OneHotEncoder().fit(all_leaves)
        X_trans = gbc_enc.transform(all_leaves)

        X_train_ext = hstack([X_trans[:X_train_rows, :], X_train_d])
        X_test_ext = hstack([X_trans[X_train_rows:, :], X_test_d])
        log.debug("Combine features done.")

        comb_model = LogisticRegression().fit(X_train_ext, y_train)
        log.debug("Training done.")

        comb_model_pred = comb_model.predict_proba(X_test_ext)[:, 1]
        precision, recall, thresholds = precision_recall_curve(y_test, comb_model_pred)

        idx = self._selectThresholdIndex(recall, len(thresholds), self._recall_rate)
        threshold = thresholds[idx]
        log.debug("threshold: %f - precision: %f - recall: %f", threshold, precision[idx], recall[idx])

        comb_model_auc = roc_auc_score(y_test, comb_model_pred)
        log.debug("AUC score is: %f", comb_model_auc)
        return gbc_enc, gbc, comb_model, threshold, comb_model_auc, precision[idx], recall[idx]

    def combineFeatures(self, gbdt_model, gbdt_enc, X_data_c=None, X_data_d=None):
        """
        Combine features for prediction with an already trained model.

        Args:
            gbdt_model: the GBDT model
            gbdt_enc: OneHotEncoder for the GBDT leaf indices
            X_data_c: continuous features to combine
            X_data_d: discrete features to combine
        Returns:
            X_ext: the combined features. With only continuous features this
                is the one-hot encoded leaves; with only discrete features
                it is X_data_d unchanged; None when both inputs are None.
        """
        if X_data_c is None and X_data_d is None:
            log.error("Feature can not be None.")
            return None
        if X_data_c is None:
            # Only discrete features were supplied: nothing to encode, pass
            # them through instead of dropping them.
            return X_data_d

        X_leaves = gbdt_model.apply(X_data_c)[:, :, 0]
        X_ext = gbdt_enc.transform(X_leaves)
        if X_data_d is not None:
            X_ext = hstack([X_ext, X_data_d])
        return X_ext