def isPositive(self, X=None):
    '''
    Classify the input samples as positive or negative.

    The standardizers, GBDT leaf encoder, GBDT model, LR model and decision
    threshold are loaded from the pickle files whose paths are stored on the
    instance. If any of those files is missing, the model is retrained
    through ``self._reTraining`` before predicting.

    Args:
        X: feature samples to predict, [rows, cols]

    Returns:
        is_positive: ``(probability >= threshold) * 2 - 1`` for every sample,
            i.e. 1 -> positive sample, -1 -> negative sample.
            None when X is None, when retraining fails, or when no
            features could be produced for prediction.
    '''
    if X is None:
        log.error("Input samples can not be empty.")
        return None

    # Order matters: it matches the unpacking of the loaded objects below.
    artifact_paths = (
        self._save_standard_d_path,
        self._save_standard_c_path,
        self._save_gbdt_enc_path,
        self._save_gbdt_model_path,
        self._save_lr_model_path,
        self._save_threshold_path,
    )

    if not all(os.path.exists(path) for path in artifact_paths):
        # At least one artifact is missing: train everything from scratch.
        log.warning(
            "Can not find the path of model file or threshold file.\n"
            "Program will start training model.")
        _hql = "select * from jz_mart_cs.s_union_index_bm where is_sign_order <> 0"
        trained = self._reTraining(_hql)
        if trained is None:
            # _reTraining reports failure by returning None; unpacking it
            # would only raise an unhelpful TypeError.
            log.error("Model retraining failed, can not predict.")
            return None
        model, enc_d, enc_c, gbdt_enc, gbdt_model, lr_model, threshold = trained
    else:
        model = GbdtLrModel()
        # NOTE: pickle.load can execute arbitrary code; these paths must only
        # ever point at artifacts written by this program itself.
        loaded = []
        for path in artifact_paths:
            # Context manager so the file handle is closed deterministically.
            with open(path, 'rb') as artifact_file:
                loaded.append(pickle.load(artifact_file))
        enc_d, enc_c, gbdt_enc, gbdt_model, lr_model, threshold = loaded

    pre_module = PreprocModule()
    is_positive = None
    for X_data_d, X_data_c in self._featuresSplit(pre_module,
                                                  X,
                                                  enc_d=enc_d,
                                                  enc_c=enc_c,
                                                  phase='predict'):
        X_ext = model.combineFeatures(gbdt_model, gbdt_enc, X_data_c,
                                      X_data_d)
        # Column 1 of predict_proba is used as the positive-class probability.
        prediction = lr_model.predict_proba(X_ext)[:, 1]
        # Map the boolean comparison onto +1 / -1.
        is_positive = (prediction >= threshold) * 2 - 1
    return is_positive
def _featuresSplit(self, pre_module, X, y=None, enc_d=None, enc_c=None, phase='train'): ''' 特征指标分离,划分为连续指标及离散指标 Args: pre_module: 预处理模块 X: 数据特征集 y: 数据特征标记 enc_d: 离散属性特征标准化方法 enc_c: 连续属性特征标准化方法 phase: 划分属性的阶段, train 或者 predict Returns: 训练阶段-> X_train_d: 训练集分离后的离散指标 X_train_c: 训练集分离后的连续指标 X_test_d: 测试集分离后的离散指标 X_test_c: 测试集分离后的连续指标 y_train: 训练集标记 y_test: 测试集标记 enc_d: 离散指标标准化器 enc_c: 连续指标标准化器 预测阶段-> X_d: 离散指标 X_c: 连续指标 ''' X, d_cols, enc_d, enc_c = pre_module.standardization(X, enc_d, enc_c) #训练阶段 if phase == 'train': if y is None: log.error("Labels can not be empty.") return #--CV-- k_cv = StratifiedKFold(n_splits=self._K) for train, test in k_cv.split(X, y): X_train, X_test, y_train, y_test = X[train], X[test], y[ train], y[test] X_train_d, X_train_c = np.array([ item[:d_cols] for item in X_train ]), np.array([item[d_cols:] for item in X_train]) #训练数据集离散指标与连续数值指标分离 X_test_d, X_test_c = np.array([ item[:d_cols] for item in X_test ]), np.array([item[d_cols:] for item in X_test]) #测试数据集离散指标与连续数值指 yield X_train_d, X_train_c, X_test_d, X_test_c, y_train, y_test, enc_d, enc_c #预测阶段 elif phase == 'predict': X_d, X_c = np.array([item[:d_cols] for item in X ]), np.array([item[d_cols:] for item in X]) yield X_d, X_c
def readHive(self, hql):
    '''
    Read samples from the online Hive table.

    Every returned row is expected to carry its label in the last column.
    Rows whose label equals the string "0" are skipped. The first 4 columns
    are display-only indicators and are not used for training, so the
    features are the columns between them and the label.

    Args:
        hql: HQL query passed to ``HiveClient.query``

    Returns:
        (X, y) on success, where X is a list of feature rows (each value
        converted with ``int(float(value))``) and y is the list of integer
        labels, aligned row by row with X.
        None when the query status is not 0.
    '''
    hive_c = HiveClient()
    status, ori_data = hive_c.query(hql)
    if status != 0:
        log.error("Query data from Hive failed.")
        return None

    # Single pass: features and labels are taken from the same row at the
    # same time, so they stay aligned and ori_data is only iterated once.
    X, y = [], []
    for item in ori_data:
        label = item[-1]
        if label == "0":
            continue
        X.append([int(float(value)) for value in item[4:-1]])
        y.append(int(label))
    return X, y
def combineFeatures(self,
                    gbdt_model,
                    gbdt_enc,
                    X_data_c=None,
                    X_data_d=None):
    '''
    Combine the GBDT-derived features with the discrete features.

    The continuous features are pushed through the GBDT model, the
    resulting leaf indices are encoded with ``gbdt_enc`` and then stacked
    horizontally with the discrete features.

    Args:
        gbdt_model: GBDT model
        gbdt_enc: OneHotEncoder for the GBDT leaf nodes
        X_data_c: continuous features to combine (optional)
        X_data_d: discrete features to combine (optional)

    Returns:
        X_ext: the combined features. When only one of the two inputs is
            given, the result consists of that part alone (encoded leaves
            for X_data_c, X_data_d unchanged for discrete-only input).
            None when both inputs are None.
    '''
    if X_data_c is None and X_data_d is None:
        log.error("Feature can not be None.")
        return None

    X_ext = None
    if X_data_c is not None:
        # apply() output is indexed as 3-D and only index 0 of the last
        # axis is kept (presumably sklearn's (samples, estimators, classes)
        # layout - verify against the model type in use).
        X_leaves = gbdt_model.apply(X_data_c)[:, :, 0]
        X_ext = gbdt_enc.transform(X_leaves)

    if X_data_d is not None:
        if X_ext is None:
            # Discrete-only input: previously these features were silently
            # dropped and None was returned.
            X_ext = X_data_d
        else:
            X_ext = hstack([X_ext, X_data_d])
    return X_ext
def _reTraining(self, hql=None, offline=False):
    '''
    Retrain the model from scratch and persist the best candidate.

    The training data comes either from a local xlsx file (offline) or from
    Hive (online). While the positive rate is at most 0.2 the data is passed
    through ``pre_module.imbalanceProcess``. Several ``n_estimators``
    settings (80, 90, 100) are cross-validated; the setting with the highest
    average AUC wins, and its models plus its average threshold are saved
    through ``self._saveModel`` and returned.

    Args:
        hql: HQL used to query the data; when None in online mode it is
            taken from ``sys.argv[2]``
        offline: whether to train on the offline data file, default False

    Returns:
        A tuple of:
            model: the GbdtLrModel wrapper
            enc_d: standardizer for the discrete features
            enc_c: standardizer for the continuous features
            gbdt_enc: GBDT OneHotEncoder of the best round
            gbdt_model: GBDT model of the best round
            lr_model: LR model of the best round
            threshold: average threshold of the best round
        None when the parameters are incomplete, the Hive query fails, or
        cross-validation produces no folds.
    '''
    pre_module = PreprocModule(self._save_standard_d_path,
                               self._save_standard_c_path)
    if offline:
        path = 'clue-adviser-index-10w.xlsx'
        sheet_name = '查询结果'
        # A byte string (Python 2 literal) must be decoded; a text string
        # (Python 3) has no decode() and is already usable.
        if isinstance(sheet_name, bytes):
            sheet_name = sheet_name.decode('utf-8')
        X, y = pre_module.readxlsx(path, sheet_name)
    else:
        if hql is None and len(sys.argv) < 3:
            log.error(
                'Please give completed parameters.\n'
                'Example: python finish_rate_predict.py online "select * from <database>.<table_name> [where-clause]"'
            )
            return None
        # Fall back to the command line when no HQL was passed in.
        _hql = sys.argv[2] if hql is None else hql
        log.info("Obtain online data from HQL -> %s", _hql)
        hive_data = pre_module.readHive(_hql)
        if hive_data is None:
            # The failed query signals itself with None; unpacking it would
            # only raise an unhelpful TypeError.
            return None
        X, y = hive_data

    p_rate = pre_module.posRateStatis(y)  # ratio of positive samples
    X, y = np.array(X), np.array(y)
    while p_rate <= 0.2:  # the samples are imbalanced and need processing
        log.info("Start imbalance process.")
        X, y = pre_module.imbalanceProcess(X, y)
        p_rate = pre_module.posRateStatis(y.tolist())
        log.info("p_rate is: %f", p_rate)

    model = GbdtLrModel(random_state=55)
    max_auc = 0
    opt_gbdt_enc = opt_gbdt_model = opt_lr_model = opt_threshold = None
    enc_d = enc_c = None
    for nEstimator in range(80, 101, 10):
        log.debug("***Current n_estimators is: %d", nEstimator)
        _threshold = []
        _auc = []
        _precision = []
        _recall = []
        model.setNEstimators(nEstimator)
        for fold, (X_train_d, X_train_c, X_test_d, X_test_c, y_train,
                   y_test, enc_d, enc_c) in enumerate(
                       self._featuresSplit(pre_module, X, y)):
            log.debug("--------Times: [%d]----------", fold + 1)
            gbdt_enc, gbdt_model, lr_model, threshold, auc, precision, recall \
                = model.buildModel(X_train_d, X_train_c, X_test_d, X_test_c, y_train, y_test)
            _threshold.append(threshold)
            _auc.append(auc)
            _precision.append(precision)
            _recall.append(recall)

        if not _auc:
            # Without folds there is nothing to average or to save.
            log.error("Cross validation produced no folds, training aborted.")
            return None

        avg_threshold = sum(_threshold) / len(_threshold)
        avg_precision = sum(_precision) / len(_precision)
        avg_recall = sum(_recall) / len(_recall)
        avg_auc = sum(_auc) / len(_auc)
        # The first round is always recorded so that a best candidate exists
        # even when every average AUC is 0.
        if opt_gbdt_model is None or avg_auc > max_auc:
            opt_gbdt_enc = gbdt_enc
            opt_gbdt_model = gbdt_model
            opt_lr_model = lr_model
            opt_threshold = avg_threshold
            max_auc = avg_auc
        log.info(
            "nEstimators[%d] -> avg_threshold: %f - avg_precision: %f - avg_recall: %f - avg_auc: %f",
            nEstimator, avg_threshold, avg_precision, avg_recall, avg_auc)

    # Persist and return the threshold that belongs to the best models, so
    # that predicting right after training matches predicting from disk.
    self._saveModel(opt_gbdt_enc, opt_gbdt_model, opt_lr_model,
                    opt_threshold)
    return (model, enc_d, enc_c, opt_gbdt_enc, opt_gbdt_model, opt_lr_model,
            opt_threshold)
for X_data_d, X_data_c in self._featuresSplit(pre_module, X, enc_d=enc_d, enc_c=enc_c, phase='predict'): X_ext = model.combineFeatures(gbdt_model, gbdt_enc, X_data_c, X_data_d) prediction = lr_model.predict_proba(X_ext)[:, 1] is_positive = (prediction >= threshold) * 2 - 1 return is_positive if __name__ == '__main__': if len(sys.argv) < 2: log.error( "Arguments need greater than 1.\n" "Example: python finish_rate_predict.py <offline|online|predict>") model = FinishRatePredModel() if sys.argv[1] == 'offline': #本机训练 log.info("Start offline training, please wait...") model._reTraining(offline=True) elif sys.argv[1] == 'online': log.info("Start online training, please wait...") #服务器上训练 model._reTraining() elif sys.argv[1] == 'predict': #离线预测 pre_module = PreprocModule() path = 'clue-adviser-index-predict.xlsx' sheet_name = '查询结果'