Example 1
    def isPositive(self, X=None):
        '''
        Determine whether the input samples are positive examples.
        Args:
            X: feature samples to be predicted, shape [rows, cols]
        Returns:
            is_positive: 1 -> positive example, -1 -> negative example
        '''
        if X is None:
            log.error("Input samples can not be empty.")
            return

        is_exist_stand_d = os.path.exists(self._save_standard_d_path)
        is_exist_stand_c = os.path.exists(self._save_standard_c_path)
        is_exist_gbdt_enc = os.path.exists(self._save_gbdt_enc_path)
        is_exist_gbdt_model = os.path.exists(self._save_gbdt_model_path)
        is_exist_lr_model = os.path.exists(self._save_lr_model_path)
        is_exist_threshold = os.path.exists(self._save_threshold_path)

        model = GbdtLrModel()
        enc_d = None
        enc_c = None
        gbdt_enc = None
        gbdt_model = None
        lr_model = None
        threshold = None

        # Model files not found; retrain the model from scratch
        if not is_exist_stand_d or not is_exist_stand_c or \
         not is_exist_gbdt_enc or \
         not is_exist_gbdt_model or \
         not is_exist_lr_model or \
         not is_exist_threshold:
            log.warning(
                "Can not find the path of model file or threshold file.\n"
                "Program will start training model.")
            _hql = "select * from jz_mart_cs.s_union_index_bm where is_sign_order <> 0"
            model, enc_d, enc_c, gbdt_enc, gbdt_model, lr_model, threshold = self._reTraining(
                _hql)
        else:
            # Load the saved models from disk
            enc_d = pickle.load(open(self._save_standard_d_path, 'rb'))
            enc_c = pickle.load(open(self._save_standard_c_path, 'rb'))
            gbdt_enc = pickle.load(open(self._save_gbdt_enc_path, 'rb'))
            gbdt_model = pickle.load(open(self._save_gbdt_model_path, 'rb'))
            lr_model = pickle.load(open(self._save_lr_model_path, 'rb'))
            threshold = pickle.load(open(self._save_threshold_path, 'rb'))
        pre_module = PreprocModule()
        for X_data_d, X_data_c in self._featuresSplit(pre_module,
                                                      X,
                                                      enc_d=enc_d,
                                                      enc_c=enc_c,
                                                      phase='predict'):
            X_ext = model.combineFeatures(gbdt_model, gbdt_enc, X_data_c,
                                          X_data_d)
            prediction = lr_model.predict_proba(X_ext)[:, 1]
            is_positive = (prediction >= threshold) * 2 - 1
        return is_positive
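The last two lines of isPositive turn the LR probabilities into ±1 labels: the boolean comparison against the threshold is cast to {0, 1} and then mapped to {-1, +1}. A tiny stand-alone illustration with made-up numbers:

import numpy as np

prediction = np.array([0.10, 0.55, 0.90])
threshold = 0.5
print((prediction >= threshold) * 2 - 1)  # -> [-1  1  1]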
Example 2
    def _featuresSplit(self,
                       pre_module,
                       X,
                       y=None,
                       enc_d=None,
                       enc_c=None,
                       phase='train'):
        '''
        Split the feature columns into discrete and continuous features.
        Args:
            pre_module: preprocessing module
            X: feature data set
            y: labels of the feature data
            enc_d: standardization encoder for discrete features
            enc_c: standardization encoder for continuous features
            phase: the phase of the split, 'train' or 'predict'
        Returns:
            Training phase ->
            X_train_d: discrete features of the training split
            X_train_c: continuous features of the training split
            X_test_d: discrete features of the test split
            X_test_c: continuous features of the test split
            y_train: training labels
            y_test: test labels
            enc_d: standardization encoder for discrete features
            enc_c: standardization encoder for continuous features
            Prediction phase ->
            X_d: discrete features
            X_c: continuous features
        '''
        X, d_cols, enc_d, enc_c = pre_module.standardization(X, enc_d, enc_c)
        # Training phase
        if phase == 'train':
            if y is None:
                log.error("Labels can not be empty.")
                return
            #--CV--
            k_cv = StratifiedKFold(n_splits=self._K)
            for train, test in k_cv.split(X, y):
                X_train, X_test = X[train], X[test]
                y_train, y_test = y[train], y[test]
                # split the training set into discrete and continuous features
                X_train_d = np.array([item[:d_cols] for item in X_train])
                X_train_c = np.array([item[d_cols:] for item in X_train])
                # split the test set into discrete and continuous features
                X_test_d = np.array([item[:d_cols] for item in X_test])
                X_test_c = np.array([item[d_cols:] for item in X_test])
                yield X_train_d, X_train_c, X_test_d, X_test_c, y_train, y_test, enc_d, enc_c
        # Prediction phase
        elif phase == 'predict':
            X_d = np.array([item[:d_cols] for item in X])
            X_c = np.array([item[d_cols:] for item in X])
            yield X_d, X_c
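A minimal sketch of the split logic above, assuming the first d_cols columns of the standardized matrix are the discrete features and the rest are continuous. The data and d_cols=2 here are made up, and column slicing is equivalent to the per-row comprehensions used in the snippet:

import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.arange(40).reshape(10, 4)
y = np.array([0, 1] * 5)
d_cols = 2

for train, test in StratifiedKFold(n_splits=2).split(X, y):
    X_train_d, X_train_c = X[train][:, :d_cols], X[train][:, d_cols:]
    X_test_d, X_test_c = X[test][:, :d_cols], X[test][:, d_cols:]
    print(X_train_d.shape, X_train_c.shape, X_test_d.shape, X_test_c.shape)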
Example 3
	def readHive(self, hql):
		'''
		Read data from the online Hive table.
		'''
		hive_c = HiveClient()
		status, ori_data = hive_c.query(hql)
		if status == 0:
			# the first 4 fields are display-only and are not used for training
			X = [item[4:-1] for item in ori_data if item[-1] != "0"]
			y = [int(item[-1]) for item in ori_data if item[-1] != "0"]
			X = [[int(float(value)) for value in item] for item in X]
			return X, y
		log.error("Query data from Hive failed.")
		return
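To make the row slicing concrete: the first 4 fields of each Hive row are display-only, the last field is the label, and rows labelled "0" are dropped. A small illustration with fabricated rows:

ori_data = [
    ("id1", "a", "b", "c", "3.0", "7.0", "1"),
    ("id2", "a", "b", "c", "2.0", "5.0", "0"),  # dropped: label == "0"
]
X = [item[4:-1] for item in ori_data if item[-1] != "0"]
y = [int(item[-1]) for item in ori_data if item[-1] != "0"]
X = [[int(float(value)) for value in item] for item in X]
print(X, y)  # -> [[3, 7]] [1]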
Example 4
	def combineFeatures(self, gbdt_model, gbdt_enc, X_data_c=None, X_data_d=None):
		'''
		Combine features via GBDT leaf-node encoding.
		Args:
			gbdt_model: the GBDT model
			gbdt_enc: OneHotEncoder for the GBDT leaf nodes
			X_data_c: continuous features to combine
			X_data_d: discrete features to combine
		Returns:
			X_ext: the combined feature matrix
		'''
		if X_data_c is None and X_data_d is None:
			log.error("Feature can not be None.")
			return
		X_ext = None
		if X_data_c is not None:
			X_leaves = gbdt_model.apply(X_data_c)[:,:,0]
			X_ext = gbdt_enc.transform(X_leaves)
		if X_data_d is not None:
			if X_ext is not None:
				X_ext = hstack([X_ext, X_data_d])
		return X_ext
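combineFeatures implements the usual GBDT+LR feature combination: the GBDT's leaf indices are one-hot encoded and the raw discrete columns are appended. A self-contained sketch with scikit-learn and scipy; the random data is made up, and since the snippet does not show its imports, GradientBoostingClassifier, OneHotEncoder and scipy.sparse.hstack are assumptions about the underlying models:

import numpy as np
from scipy.sparse import csr_matrix, hstack
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder

X_c = np.random.rand(100, 5)              # continuous features
X_d = np.random.randint(0, 2, (100, 3))   # discrete features
y = np.random.randint(0, 2, 100)

gbdt_model = GradientBoostingClassifier(n_estimators=10).fit(X_c, y)
# apply() returns leaf indices of shape (n_samples, n_estimators, 1) for binary tasks
X_leaves = gbdt_model.apply(X_c)[:, :, 0]
gbdt_enc = OneHotEncoder().fit(X_leaves)
# one-hot encoded leaves, with the raw discrete columns appended
X_ext = hstack([gbdt_enc.transform(X_leaves), csr_matrix(X_d)])
print(X_ext.shape)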
Example 5
    def _reTraining(self, hql=None, offline=False):
        '''
        Retrain the model.
        Args:
            hql: the HQL query used to fetch training data
            offline: whether to train on offline data, defaults to False
        Returns:
            model: the wrapped GbdtLrModel
            enc_d: standardization encoder for discrete features
            enc_c: standardization encoder for continuous features
            gbdt_enc: OneHotEncoder for the GBDT leaf nodes
            gbdt_model: the GBDT model
            lr_model: the LR model
            threshold: the decision threshold
        '''
        pre_module = PreprocModule(self._save_standard_d_path,
                                   self._save_standard_c_path)
        if offline:
            path = 'clue-adviser-index-10w.xlsx'
            sheet_name = '查询结果'
            X, y = pre_module.readxlsx(path, sheet_name.decode('utf-8'))
        else:
            if hql is None and len(sys.argv) < 3:
                log.error(
                    'Please provide all required parameters.\n'
                    'Example: python finish_rate_predict.py online "select * from <database>.<table_name> [where-clause]"'
                )
                return
            # Obtain the HQL from the command line
            _hql = ''
            if hql is None:
                _hql = sys.argv[2]
            else:
                _hql = hql
            log.info("Obtain online data from HQL -> %s", _hql)
            X, y = pre_module.readHive(_hql)
        p_rate = pre_module.posRateStatis(y)  # ratio of positive examples
        X, y = np.array(X), np.array(y)
        while p_rate <= 0.2:
            # the classes are imbalanced; resample the data
            log.info("Start imbalance process.")
            X, y = pre_module.imbalanceProcess(X, y)
            p_rate = pre_module.posRateStatis(y.tolist())
            log.info("p_rate is: %f", p_rate)
        model = GbdtLrModel(random_state=55)
        max_auc = 0
        for nEstimator in range(80, 101, 10):
            log.debug("***Current n_estimators is: %d", nEstimator)
            _threshold = []
            _auc = []
            _precision = []
            _recall = []
            model.setNEstimators(nEstimator)
            for iter, [
                    X_train_d, X_train_c, X_test_d, X_test_c, y_train, y_test,
                    enc_d, enc_c
            ] in enumerate(self._featuresSplit(pre_module, X, y)):
                log.debug("--------Times: [%d]----------", iter + 1)
                gbdt_enc, gbdt_model, lr_model, threshold, auc, precision, recall \
                 = model.buildModel(X_train_d, X_train_c, X_test_d, X_test_c, y_train, y_test)
                _threshold.append(threshold)
                _auc.append(auc)
                _precision.append(precision)
                _recall.append(recall)
            avg_threshold = sum(_threshold) / len(_threshold)
            avg_precision = sum(_precision) / len(_precision)
            avg_recall = sum(_recall) / len(_recall)
            avg_auc = sum(_auc) / len(_auc)
            if avg_auc > max_auc:
                opt_gbdt_enc = gbdt_enc
                opt_gbdt_model = gbdt_model
                opt_lr_model = lr_model
                opt_threshold = avg_threshold
                max_auc = avg_auc
            log.info(
                "nEstimators[%d] -> avg_threshold: %f - avg_precision: %f - avg_recall: %f - avg_auc: %f",
                nEstimator, avg_threshold, avg_precision, avg_recall, avg_auc)
        self._saveModel(opt_gbdt_enc, opt_gbdt_model, opt_lr_model,
                        opt_threshold)
        return model, enc_d, enc_c, opt_gbdt_enc, opt_gbdt_model, opt_lr_model, opt_threshold
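The loop above is a simple grid search over n_estimators: for each candidate it averages the per-fold metrics and keeps the models from the candidate with the best average AUC. Schematically (the AUC values below are made up):

fold_auc = {80: [0.71, 0.69], 90: [0.74, 0.72], 100: [0.73, 0.70]}
best_n, max_auc = None, 0.0
for n, aucs in fold_auc.items():
    avg_auc = sum(aucs) / len(aucs)
    if avg_auc > max_auc:
        best_n, max_auc = n, avg_auc
print(best_n, round(max_auc, 4))  # -> 90 0.73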
Example 6
        for X_data_d, X_data_c in self._featuresSplit(pre_module,
                                                      X,
                                                      enc_d=enc_d,
                                                      enc_c=enc_c,
                                                      phase='predict'):
            X_ext = model.combineFeatures(gbdt_model, gbdt_enc, X_data_c,
                                          X_data_d)
            prediction = lr_model.predict_proba(X_ext)[:, 1]
            is_positive = (prediction >= threshold) * 2 - 1
        return is_positive


if __name__ == '__main__':
    if len(sys.argv) < 2:
        log.error(
            "At least one argument is required.\n"
            "Example: python finish_rate_predict.py <offline|online|predict>")
        sys.exit(1)
    model = FinishRatePredModel()
    if sys.argv[1] == 'offline':
        # train locally
        log.info("Start offline training, please wait...")
        model._reTraining(offline=True)
    elif sys.argv[1] == 'online':
        log.info("Start online training, please wait...")
        # train on the server
        model._reTraining()
    elif sys.argv[1] == 'predict':
        # offline prediction
        pre_module = PreprocModule()
        path = 'clue-adviser-index-predict.xlsx'
        sheet_name = '查询结果'