Code example #1
File: train.py  Project: zhangqifan3/gbdt_lr_lgb
def main():
    CONFIG = Config()
    model_conf = CONFIG.read_model_conf()['model_conf']
    traindata_list = FileListGenerator(model_conf['data_dir_train']).generate()
    testdata_list = FileListGenerator(model_conf['data_dir_pred']).generate()

    if model_conf['mode'] == 'train':
        traindata = next(traindata_list)
        tf.logging.info('Start training {}'.format(traindata))
        t0 = time.time()
        train1 = LR(traindata, mode='train').lr_model()
        t1 = time.time()
        tf.logging.info('Finish training {}, take {} mins'.format(
            traindata, float((t1 - t0) / 60)))

    else:
        testdata = next(testdata_list)
        tf.logging.info('Start evaluation {}'.format(testdata))
        t0 = time.time()
        Accuracy, AUC = LR(testdata, mode='pred').lr_model()
        t1 = time.time()
        tf.logging.info('Finish evaluation {}, take {} mins'.format(
            testdata, float((t1 - t0) / 60)))
        print("LR_Accuracy: %f" % Accuracy)
        print("LR_AUC: %f" % AUC)
Code example #2
File: train.py  Project: zhangqifan3/GBDT_LR
def main():
    CONFIG = Config()
    model_conf = CONFIG.read_model_conf()['model_conf']
    if model_conf['mode'] == 'train':
        train1 = LR(model_conf['data_dir_train'], mode='train').lr_model()
    else:
        Accuracy, AUC = LR(model_conf['data_dir_pred'], mode='pred').lr_model()
        print("LR_Accuracy: %f" % Accuracy)
        print("LR_AUC: %f" % AUC)
Code example #3
def main():

    CONFIG = Config()
    model_conf = CONFIG.read_model_conf()['model_conf']
    traindata_list = FileListGenerator(model_conf['data_dir_train']).generate()
    testdata_list = FileListGenerator(model_conf['data_dir_pred']).generate()

    model = build_estimator()

    traindata = next(traindata_list)
    testdata = next(testdata_list)

    t0 = time.time()
    tf.logging.info('Start training {}'.format(traindata))

    model.train(input_fn=lambda: input_fn(traindata, 'train'),
                hooks=None,
                steps=None,
                max_steps=None,
                saving_listeners=None)
    t1 = time.time()
    tf.logging.info('Finish training {}, take {} mins'.format(
        traindata, float((t1 - t0) / 60)))

    tf.logging.info('Start evaluating {}'.format(testdata))
    t2 = time.time()

    results = model.evaluate(
        input_fn=lambda: input_fn(testdata, 'eval'),
        steps=None,  # Number of steps for which to evaluate model.
        hooks=None,
        checkpoint_path=None,  # latest checkpoint in model_dir is used.
        name=None)
    t3 = time.time()
    tf.logging.info('Finish evaluation {}, take {} mins'.format(
        testdata, float((t3 - t2) / 60)))

    # Display evaluation metrics
    for key in sorted(results):
        print('{}: {}'.format(key, results[key]))
Code example #4
File: LR.py  Project: zhangqifan3/GBDT_LR
class LR(object):
    '''
    LR class
    Trains and evaluates the LR (logistic regression) model on GBDT-derived features.
    '''
    def __init__(self, data_file, mode):
        self._conf = Config()
        self.lr_conf = self._conf.read_model_conf()['lr_conf']
        self._data_file = data_file
        self._mode = mode
        self._gbdt_spr = GBDT_spr(self._data_file)

    def lr_model(self):
        '''
        Train or evaluate the LR model, depending on self._mode.
        :return: (accuracy_score, AUC_Score) in 'pred' mode; None in 'train' mode
        '''
        if self._mode == 'train':
            gbdt_features, y_label = self._gbdt_spr.gbdt_model(self._mode)
            grd_lm = LogisticRegression(penalty=self.lr_conf['penalty'],
                                        solver=self.lr_conf['solver'],
                                        C=float(self.lr_conf['c']))
            grd_lm.fit(gbdt_features, y_label)
            joblib.dump(grd_lm, os.path.join(MODEL_DIR, "lr_model.m"))

        else:
            gbdt_features, y_label = self._gbdt_spr.gbdt_model(self._mode)
            grd_lm = joblib.load(os.path.join(MODEL_DIR, "lr_model.m"))

            y_pred_grd_lm = grd_lm.predict_proba(gbdt_features)[:, 1]
            pred_res = grd_lm.predict(gbdt_features)
            accuracy_score = metrics.accuracy_score(y_label, pred_res)

            fpr_grd_lm, tpr_grd_lm, _ = metrics.roc_curve(
                y_label, y_pred_grd_lm)
            roc_auc = metrics.auc(fpr_grd_lm, tpr_grd_lm)

            AUC_Score = metrics.roc_auc_score(y_label, y_pred_grd_lm)

            return accuracy_score, AUC_Score
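
A minimal usage sketch for the LR class above (hypothetical file paths; it assumes the project's Config, MODEL_DIR and feature configuration are set up as in the train.py examples):

# Hypothetical usage of the LR class; paths are placeholders.
LR('data/train_data.csv', mode='train').lr_model()                # fits GBDT + LR, dumps lr_model.m
accuracy, auc = LR('data/test_data.csv', mode='pred').lr_model()  # loads lr_model.m and scores
print('LR_Accuracy: %f, LR_AUC: %f' % (accuracy, auc))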
Code example #5
def main():
    CONFIG = Config()
    model_conf = CONFIG.read_model_conf()['model_conf']
    model = build_estimator()
    pred_file = '/home/leadtek/zhangqifan/reflux_user_pro/data/pred_data/all_data.csv'
    predictions = model.predict(input_fn=lambda: input_fn(pred_file, 'pred'),
                                predict_keys=None,
                                hooks=None,
                                checkpoint_path=None)  # None uses the latest checkpoint in model_dir
    res = []
    for pred_dict in predictions:  # dict{probabilities, classes, class_ids}
        opt = []
        class_id = pred_dict['class_ids'][0]
        opt.append(class_id)
        probability = pred_dict['probabilities']
        opt.append(probability[1])
        res.append(opt)
        # print('class_id:',class_id,'probability:',probability)
    res_df = pd.DataFrame(res, columns=['class_id','probability'])
    x = res_df[res_df['class_id'].isin([1])]
    sample = pd.read_csv("/home/leadtek/zhangqifan/reflux_user_pro/data/opt_all_data.csv",sep=' ')
    res_sample = pd.concat([sample,res_df],axis=1)
    res_sample.to_csv(r"/home/leadtek/zhangqifan/reflux_user_pro/res.csv", header=True, index=False,
                                    sep=' ')
Code example #6
File: GBDT.py  Project: zhangqifan3/GBDT_LR
class GBDT_spr(object):
    '''
    GBDT_spr class
    Trains the GBDT model and generates discrete (one-hot leaf) features.
    '''
    def __init__(self, data_file):
        self._data_file = data_file
        self._DataSet = DataSet(self._data_file)
        self._conf = Config()
        self.dataset = self._DataSet.input_fn()
        self.batch_dataset = self._DataSet.iter_minibatches()
        self._feature_colums = self._feature_colums()
        self.gbdt_conf = self._conf.read_model_conf()['gbdt_conf']
        self.model_conf = self._conf.read_model_conf()['model_conf']

    def _feature_colums(self):
        '''
        Build the feature-column transformations.
        :return:
            gbdt_colums, type: list of (column, transformer) pairs for DataFrameMapper
        '''
        gbdt_colums = []
        feature_conf_dic = self._conf.read_feature_conf()[0]
        for feature, conf in feature_conf_dic.items():
            f_type, f_tran = conf["type"], conf["transform"]
            if f_type == 'category':
                if f_tran == 'multivalue':
                    opt = (feature, multivalue())
                    gbdt_colums.append(opt)
                if f_tran == 'one_hot':
                    opt = (feature, one_hot())
                    gbdt_colums.append(opt)

            else:
                opt = ([feature], min_max())
                gbdt_colums.append(opt)
        return gbdt_colums

    def gbdt_model(self, mode):
        '''
        Train the GBDT model and generate discrete features.
        :param
            mode: 'train' or 'pred'
        :return:
            lr_feat: one-hot encoded GBDT leaf features
            y: labels of the corresponding data
        '''
        mapper = DataFrameMapper(self._feature_colums, sparse=True)
        if mode == 'train':
            X = mapper.fit_transform(self.dataset)
            y = list(self.dataset['label'])
            grd = GradientBoostingClassifier(
                n_estimators=int(self.gbdt_conf['n_estimators']),
                #    random_state=int(self.gbdt_conf['random_state']),
                learning_rate=float(self.gbdt_conf['learning_rate']),
                #    subsample=float(self.gbdt_conf['subsample']),
                min_samples_leaf=int(self.gbdt_conf['min_samples_leaf']),
                max_depth=int(self.gbdt_conf['max_depth']),
                max_leaf_nodes=int(self.gbdt_conf['max_leaf_nodes']),
                min_samples_split=int(self.gbdt_conf['min_samples_split']))
            if self.model_conf['batch_size'] == '0':
                grd.fit(X, y)
                joblib.dump(grd, os.path.join(MODEL_DIR, "gbdt_model.m"))
                new_feature = grd.apply(X)
                new_feature = new_feature.reshape(
                    -1, int(self.gbdt_conf['n_estimators']))
                enc = OneHotEncoder()
                enc.fit(new_feature)
                lr_feat = np.array(enc.transform(new_feature).toarray())
            else:
                for i, dataset in enumerate(self.batch_dataset):
                    #    print(dataset)
                    batch_X = mapper.fit_transform(dataset)
                    batch_y = list(dataset['label'])
                    grd.fit(batch_X, batch_y)
                    new_feature = grd.apply(batch_X)
                    new_feature = new_feature.reshape(
                        -1, int(self.gbdt_conf['n_estimators']))
                    enc = OneHotEncoder()
                    enc.fit(new_feature)
                    new_feature2 = np.array(
                        enc.transform(new_feature).toarray())
                    print(new_feature2)
                    if i == 0:
                        lr_feat = new_feature2
                    else:
                        lr_feat = np.concatenate([lr_feat, new_feature2],
                                                 axis=0)
                joblib.dump(grd, os.path.join(MODEL_DIR, "gbdt_model.m"))

        else:
            X = mapper.fit_transform(self.dataset)
            y = list(self.dataset['label'])
            grd = joblib.load(os.path.join(MODEL_DIR, "gbdt_model.m"))
            new_feature = grd.apply(X)
            new_feature = new_feature.reshape(
                -1, int(self.gbdt_conf['n_estimators']))
            enc = OneHotEncoder()
            enc.fit(new_feature)
            lr_feat = np.array(enc.transform(new_feature).toarray())
        return lr_feat, y
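
A rough sketch of calling gbdt_model directly (hypothetical path; lr_feat is the one-hot matrix of GBDT leaf indices that the LR class consumes):

# Hypothetical direct use of GBDT_spr; the path is a placeholder.
gbdt = GBDT_spr('data/train_data.csv')
lr_feat, y = gbdt.gbdt_model('train')   # lr_feat: one-hot leaf features, y: labels
print(lr_feat.shape, len(y))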
Code example #7
class LR(object):
    '''
    LR class
    Trains and evaluates the LR (logistic regression) model on GBDT-derived features.
    '''
    def __init__(self, data_file, mode):
        self._conf = Config()
        self._data_file = data_file
        self._Tf_Data = TF_Data(self._data_file)
        self.dataset_train = self._Tf_Data.gbdt_input()
        self.lr_conf = self._conf.read_model_conf()['lr_conf']

        self._mode = mode
        self._gbdt_spr = GBDT_spr(self._data_file).gbdt_model(self._mode)

    def lr_model(self):
        '''
        Train or evaluate the LR model, depending on self._mode.
        :return: (accuracy_score, AUC_Score) in 'pred' mode; None in 'train' mode
        '''
        if self._mode == 'train':
            grd_lm = SGDClassifier(penalty=self.lr_conf['penalty'],
                                   loss='log',
                                   warm_start=True)
            i = 0
            while True:
                try:
                    dataset = next(self._gbdt_spr)
                    batch_X = dataset[0]
                    batch_y = dataset[1]
                    print('start training LR epochs_%d' % i)
                    grd_lm = grd_lm.partial_fit(batch_X,
                                                batch_y,
                                                classes=[0, 1])
                    i += 1
                    del (dataset)
                    del (batch_y)
                    del (batch_X)
                    gc.collect()
                except StopIteration as e:
                    print('Generator return value:', e.value)
                    break
            joblib.dump(grd_lm, os.path.join(MODEL_DIR, "lr_model.m"))
        else:
            y_all_label = []
            y_all_pred_grd_lm = []
            pred_all_res = []
            grd_lm = joblib.load(os.path.join(MODEL_DIR, "lr_model.m"))
            while True:
                try:
                    dataset = next(self._gbdt_spr)
                    gbdt_features = dataset[0]
                    y_label = dataset[1]
                    y_pred_grd_lm = grd_lm.predict_proba(gbdt_features)[:, 1]
                    pred_res = grd_lm.predict(gbdt_features)
                    y_all_label.extend(y_label)
                    y_all_pred_grd_lm.extend(y_pred_grd_lm)
                    pred_all_res.extend(pred_res)
                    del (dataset)
                    del (gbdt_features)
                    gc.collect()
                except StopIteration as e:
                    print('Generator return value:', e.value)
                    break
            accuracy_score = metrics.accuracy_score(y_all_label, pred_all_res)
            fpr_grd_lm, tpr_grd_lm, _ = metrics.roc_curve(
                y_all_label, y_all_pred_grd_lm)
            roc_auc = metrics.auc(fpr_grd_lm, tpr_grd_lm)
            AUC_Score = metrics.roc_auc_score(y_all_label, y_all_pred_grd_lm)
            return accuracy_score, AUC_Score
Code example #8
File: GBDT.py  Project: zhangqifan3/gbdt_lr_lgb
class GBDT_spr(object):
    '''
    GBDT_spr class
    Trains the GBDT (LightGBM) model and generates discrete (one-hot leaf) features.
    '''
    def __init__(self, data_file):
        self._data_file = data_file
        self._Tf_Data = TF_Data(self._data_file)
        self._conf = Config()
        self.dataset_train = self._Tf_Data.gbdt_input()
        self.dataset_trans = self._Tf_Data.gbdt_input()
        self.dataset_pred = self._Tf_Data.gbdt_input()
        self.gbdt_conf = self._conf.read_model_conf()['gbdt_conf']
        self.model_conf = self._conf.read_model_conf()['model_conf']

    def gbdt_model(self, mode):
        '''
        Train the GBDT (LightGBM) model and generate discrete features.
        :param
            mode: 'train' or 'pred'
        :return: a generator yielding
            transformed_training_matrix: one-hot encoded GBDT leaf features
            batch_y: labels of the corresponding batch
        '''

        params = {
            'task': 'train',
            'boosting_type': self.gbdt_conf['boosting_type'],
            'objective': 'binary',
            'metric': {'binary_logloss'},
            'num_leaves': int(self.gbdt_conf['num_leaves']),
            # 'num_trees': 60,
            'min_data_in_leaf': int(self.gbdt_conf['min_data_in_leaf']),
            'learning_rate': float(self.gbdt_conf['learning_rate']),
            'feature_fraction': float(self.gbdt_conf['feature_fraction']),
            'bagging_fraction': float(self.gbdt_conf['bagging_fraction']),
            # 'bagging_freq': 5,
            'verbose': -1
        }

        if mode == 'train':
            if self.model_conf['batch_size'] == '0':
                print('TODO')
            else:
                i = 0
                while True:
                    try:
                        dataset = next(self.dataset_train)
                        batch_X = dataset[0]
                        batch_y = dataset[1]
                        lgb_train = lgb.Dataset(batch_X, batch_y)
                        if i == 0:
                            gbm = lgb.train(params,
                                            lgb_train,
                                            valid_sets=lgb_train,
                                            keep_training_booster=True)
                            i += 1
                        else:
                            gbm = lgb.train(
                                params,
                                lgb_train,
                                valid_sets=lgb_train,
                                keep_training_booster=True,
                                init_model='/home/zhangqifan/LightGBM_model.txt'
                            )
                            i += 1
                        gbm.save_model('/home/zhangqifan/LightGBM_model.txt')
                        del (dataset)
                        del (batch_y)
                        del (batch_X)
                        gc.collect()
                    except StopIteration:
                        break

                joblib.dump(gbm, os.path.join(MODEL_DIR, "gbdt_model.m"))

                while True:
                    try:
                        dataset = next(self.dataset_trans)
                        batch_X = dataset[0]
                        batch_y = dataset[1]
                        gbm_trans = joblib.load(
                            os.path.join(MODEL_DIR, "gbdt_model.m"))
                        y_pred = gbm_trans.predict(batch_X, pred_leaf=True)
                        transformed_training_matrix = np.zeros(
                            [
                                len(y_pred),
                                len(y_pred[0]) *
                                int(self.gbdt_conf['num_leaves'])
                            ],
                            dtype=np.int64)  # N * num_trees * num_leaves
                        for m in range(0, len(y_pred)):
                            # temp holds, for each tree, the flattened column index of the leaf
                            # hit by sample m: tree_index * num_leaves + leaf_index
                            # (0, 64, 128, ... are the offsets of the successive trees).
                            temp = np.arange(len(y_pred[0])) * int(
                                self.gbdt_conf['num_leaves']) + np.array(
                                    y_pred[m])
                            # build the one-hot training matrix
                            transformed_training_matrix[m][temp] += 1
                        del (dataset)
                        del (batch_X)
                        gc.collect()
                        yield transformed_training_matrix, batch_y
                    except StopIteration:
                        break

        else:
            while True:
                try:
                    dataset = next(self.dataset_pred)
                    gbm_trans = joblib.load(
                        os.path.join(MODEL_DIR, "gbdt_model.m"))
                    batch_X = dataset[0]

                    batch_y = dataset[1]
                    y_pred = gbm_trans.predict(batch_X, pred_leaf=True)
                    transformed_training_matrix = np.zeros(
                        [
                            len(y_pred),
                            len(y_pred[0]) * int(self.gbdt_conf['num_leaves'])
                        ],
                        dtype=np.int64)  # N * num_trees * num_leaves
                    for m in range(0, len(y_pred)):
                        # temp holds, for each tree, the flattened column index of the leaf
                        # hit by sample m: tree_index * num_leaves + leaf_index
                        # (0, 64, 128, ... are the offsets of the successive trees).
                        temp = np.arange(len(y_pred[0])) * int(
                            self.gbdt_conf['num_leaves']) + np.array(y_pred[m])
                        # build the one-hot training matrix
                        transformed_training_matrix[m][temp] += 1
                    yield transformed_training_matrix, batch_y
                except StopIteration:
                    break
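
The leaf-index flattening above maps each (tree, leaf) pair to the single column tree_index * num_leaves + leaf_index. A small self-contained sketch of the same transform with toy numbers (independent of the class above):

import numpy as np

# Toy data: 2 samples, 3 trees, num_leaves = 4.
# y_pred[m][t] is the leaf index hit by sample m in tree t (as returned by pred_leaf=True).
y_pred = np.array([[0, 3, 1],
                   [2, 2, 0]])
num_leaves = 4
one_hot = np.zeros((len(y_pred), y_pred.shape[1] * num_leaves), dtype=np.int64)
for m in range(len(y_pred)):
    cols = np.arange(y_pred.shape[1]) * num_leaves + y_pred[m]  # sample 0 -> columns [0, 7, 9]
    one_hot[m][cols] = 1
print(one_hot)  # shape (2, 12), one 1 per tree block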
Code example #9
File: dataset.py  Project: zhangqifan3/GBDT_LR
class DataSet(object):
    '''
    DataSet class
    Handles the input data.
    '''
    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        self._feature_conf_dic = self._conf.read_feature_conf()[0]
        self._feature_used = self._conf.read_feature_conf()[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _column_to_csv_defaults(self):
        '''
        Define the dtype of each input column and collect the feature names.
        :return:
            all_columns: names of the data columns, type: list
            csv_defaults: default dtype for each column, {'feature name': dtype, ...}
        '''
        features = []
        for i in range(1, len(self._all_features) + 1):
            features.append(self._all_features[str(i)])
        all_columns = ['label'] + features
        csv_defaults = {}
        csv_defaults['label'] = int
        for f in self._all_features.values():
            if f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = int
                    else:
                        csv_defaults[f] = str
                else:
                    csv_defaults[f] = float
            else:
                csv_defaults[f] = str
        return all_columns, csv_defaults

    def iter_minibatches(self):
        '''
        Generator: reads the file (e.g. a large file) and yields batch_size rows at a time.
        :return:
            each chunk converted to a DataFrame before being yielded
        '''

        cur_line_num = 0
        dataset = []
        csvfile = open(self._data_file, 'rt', encoding="utf-8")
        reader = csv.reader(csvfile, delimiter=' ')
        all_columns, csv_defaults = self._csv_defaults
        for line in reader:
            dataset.append(line)
            cur_line_num += 1
            if cur_line_num >= int(self.model_conf['batch_size']):
                dataset = pd.DataFrame(dataset, columns=all_columns)
                dataset = dataset.astype(csv_defaults)
                yield dataset
                dataset = []
                cur_line_num = 0
        dataset = pd.DataFrame(dataset, columns=all_columns)
        dataset = dataset.astype(csv_defaults)
        yield dataset
        csvfile.close()

    def input_fn(self):
        '''
        Read the csv file into a DataFrame and fill NaN values.
        :return:
            dataset
        '''
        all_columns, csv_defaults = self._csv_defaults
        dataset = pd.read_csv(self._data_file,
                              sep=' ',
                              names=all_columns,
                              dtype=csv_defaults)
        dataset = dataset.fillna('-')
        return dataset
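
A minimal usage sketch of DataSet (hypothetical path; batch_size is taken from the model_conf read by Config):

# Hypothetical usage of DataSet; the path is a placeholder.
ds = DataSet('data/train_data.csv')
full_df = ds.input_fn()                   # whole file as one DataFrame
for batch_df in ds.iter_minibatches():    # or stream it batch_size rows at a time
    print(batch_df.shape)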
Code example #10
class TF_Data(object):
    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        self._feature_conf_dic = self._conf.read_feature_conf()[0]
        self._feature_used = self._conf.read_feature_conf()[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _column_to_csv_defaults(self):
        """
        定义csv文件中各个特征默认的数据类型
        :return:
            OrderedDict {'feature name': [''],...}
        """
        csv_defaults = OrderedDict()
        csv_defaults['label'] = [0]
        for f in self._all_features.values():
            if f == 'label':
                csv_defaults['label'] = [0]
            elif f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = [0]
                    else:
                        csv_defaults[f] = ['']
                else:
                    csv_defaults[f] = [0.0]
            else:
                csv_defaults[f] = ['']
        return csv_defaults

    def _parse_csv(self, field_delim='&', na_value='-'):
        """
        csv数据的解析函数
        :param field_delim: csv字段分隔符
        :param na_value: 使用csv默认值填充na_value
        :return:
            feature dict: {feature: Tensor ... }
        """
        csv_defaults = self._csv_defaults

        def decode_csv(value):
            parsed_line = tf.decode_csv(value,
                                        record_defaults=list(
                                            csv_defaults.values()),
                                        field_delim=field_delim,
                                        na_value=na_value)
            features = dict(zip(self._csv_defaults.keys(), parsed_line))
            label = None
            for f in self._all_features.values():
                if f != 'label':
                    if f not in self._feature_used:
                        features.pop(f)
                else:
                    label = features.pop('label')
            return features, label

        return decode_csv

    def input_fn(self, mode):
        """
        生成dataset(tensor)
        :return:
            generator
        """
        dataset = tf.data.TextLineDataset(self._data_file)
        dataset = dataset.map(self._parse_csv())  # Decode each line

        # Shuffle, repeat, and batch the examples.
        if mode == 'train':
            dataset = dataset.repeat(10)
        padding_dic = {k: () for k in self._feature_used}
        padded_shapes = (padding_dic, ())
        dataset = dataset.padded_batch(int(self.model_conf['batch_size']),
                                       padded_shapes=padded_shapes)

        # Return the read end of the pipeline.
        return dataset.make_one_shot_iterator().get_next()
Code example #11
class TF_Data(object):
    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        self._feature_conf_dic = self._conf.read_feature_conf()[0]
        self._feature_used = self._conf.read_feature_conf()[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _normalizer_fn_builder(self, scaler, normalization_params):
        """normalizer_fn builder"""
        if scaler == 'min_max':

            return lambda x: (x - normalization_params[0]) / (
                max(normalization_params[1] - normalization_params[0], 0.001))
        elif scaler == 'standard':
            return lambda x: (x - normalization_params[0]) / normalization_params[1]
        else:
            return lambda x: tf.log(x)


    def _column_to_csv_defaults(self):
        """
        定义csv文件中各个特征默认的数据类型
        :return:
            OrderedDict {'feature name': [''],...}
        """
        csv_defaults = OrderedDict()
        csv_defaults['label'] = [0]
        for f in self._all_features.values():
            if f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = [0]
                    else:
                        csv_defaults[f] = ['']
                else:
                    csv_defaults[f] = [0.0]
            else:
                csv_defaults[f] = ['']
        return csv_defaults

    def _parse_csv(self, field_delim=' ', na_value='-'):
        """
        csv数据的解析函数
        :param field_delim: csv字段分隔符
        :param na_value: 使用csv默认值填充na_value
        :return:
            feature dict: {feature: Tensor ... }
        """
        csv_defaults = self._csv_defaults
        def decode_csv(value):
            parsed_line = tf.decode_csv(value,
                                        record_defaults=list(csv_defaults.values()),
                                        field_delim=field_delim,
                                        na_value=na_value)
            features = dict(zip(self._csv_defaults.keys(), parsed_line))
            for f in self._all_features.values():
                if f not in self._feature_used:
                    features.pop(f)
                    continue
            for f, tensor in features.items():
                if f == 'tag':
                    features[f] = tf.string_split([tensor], ',').values
                if f == 'main_actor':
                    features[f] = tf.string_split([tensor], ',').values
            label = features.pop('label')
            return features, label
        return decode_csv

    def input_fn(self):
        """
        生成dataset(tensor)
        :return:
            generator
        """
        dataset = tf.data.TextLineDataset(self._data_file)
        dataset = dataset.map(self._parse_csv())  # Decode each line

        # Shuffle, repeat, and batch the examples.
        # dataset = dataset.shuffle(10).repeat(1)
        padding_dic = {k: () for k in self._feature_used}
        padding_dic['tag'] = [None]
        # padding_dic['main_actor'] = [None]
        padded_shapes = (padding_dic, ())
        dataset = dataset.padded_batch(int(self.model_conf['batch_size']), padded_shapes=padded_shapes)

        # Return the read end of the pipeline.
        return dataset.make_one_shot_iterator().get_next()

    def feat_column(self):
        """
        特征列处理
        :return:
            wide_columns
        """
        wide_columns = []
        wide_dim = 0
        for feature, conf in self._feature_conf_dic.items():
            f_type, f_tran, f_param = conf["type"], conf["transform"], conf["parameter"]
            if feature == 'tag' or feature == 'main_actor':
                col = tf.feature_column.categorical_column_with_vocabulary_file(feature,
                                                                                vocabulary_file=f_param)
                col = tf.feature_column.indicator_column(col)
                wide_columns.append(col)
                wide_dim += int(conf["dim"])
            else:
                if f_type == 'category':
                    if f_tran == 'hash_bucket':
                        hash_bucket_size = int(f_param)
                        col = tf.feature_column.categorical_column_with_hash_bucket(feature,
                                                                                    hash_bucket_size=hash_bucket_size,
                                                                                    dtype=tf.string)
                        col = tf.feature_column.indicator_column(col)
                        wide_columns.append(col)
                        wide_dim += hash_bucket_size
                    elif f_tran == 'vocab':
                        col = tf.feature_column.categorical_column_with_vocabulary_list(feature,
                                                                                        vocabulary_list=list(map(str, f_param)),
                                                                                        dtype=None,
                                                                                        default_value=-1,
                                                                                        num_oov_buckets=0)
                        col = tf.feature_column.indicator_column(col)
                        wide_columns.append(col)
                        wide_dim += len(f_param)
                    elif f_tran == 'identity':
                        num_buckets = f_param
                        col = tf.feature_column.categorical_column_with_identity(feature,
                                                                                 num_buckets=num_buckets,
                                                                                 default_value=0)
                        col = tf.feature_column.indicator_column(col)
                        wide_columns.append(col)
                        wide_dim += num_buckets
                else:
                    normalizer_fn = self._normalizer_fn_builder(scaler=f_tran,
                                                                normalization_params=(0, 1))
                    col = tf.feature_column.numeric_column(feature,
                                                           shape=(1,),
                                                           default_value=0,
                                                           dtype=tf.float32)
                                            #               normalizer_fn=normalizer_fn)
             #       col = tf.feature_column.indicator_column(col)
                    wide_columns.append(col)
                    wide_dim += 1
        return wide_columns

    def gbdt_input(self):
        """
        将特征列处理后的数据转化为array输出
        :return:
            process_data:训练或预估数据集; type:array
            label:数据集对应的标签; type:array
        """
        tensor = tf.feature_column.input_layer(self.input_fn()[0], self.feat_column())
        label_element = self.input_fn()[1]
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.tables_initializer())
            while True:
                try:
                    process_data = sess.run(tensor)
                    label = sess.run(label_element)
                    yield process_data, label
                except tf.errors.OutOfRangeError:
                    break
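
gbdt_input runs the tf.data pipeline inside its own session and yields dense numpy batches; the features and labels come from two separate input_fn() pipelines over the same file that are advanced in step. A rough consumption sketch (hypothetical path):

# Hypothetical consumption of the TF_Data.gbdt_input generator; the path is a placeholder.
tfd = TF_Data('data/train_data.csv')
for process_data, label in tfd.gbdt_input():
    print(process_data.shape, label.shape)   # roughly (batch_size, wide_dim), (batch_size,)
    break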