Code example #1
# Imports required by this snippet (TensorFlow 1.x API); Config is the
# project's own configuration reader and is assumed to be importable.
import tensorflow as tf
from collections import OrderedDict

class TF_Data(object):
    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        feature_conf = self._conf.read_feature_conf()
        self._feature_conf_dic = feature_conf[0]
        self._feature_used = feature_conf[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _column_to_csv_defaults(self):
        """
        定义csv文件中各个特征默认的数据类型
        :return:
            OrderedDict {'feature name': [''],...}
        """
        csv_defaults = OrderedDict()
        csv_defaults['label'] = [0]
        for f in self._all_features.values():
            if f == 'label':
                continue  # default for 'label' is already set above
            elif f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = [0]
                    else:
                        csv_defaults[f] = ['']
                else:
                    csv_defaults[f] = [0.0]
            else:
                csv_defaults[f] = ['']
        return csv_defaults

    def _parse_csv(self, field_delim='&', na_value='-'):
        """
        csv数据的解析函数
        :param field_delim: csv字段分隔符
        :param na_value: 使用csv默认值填充na_value
        :return:
            feature dict: {feature: Tensor ... }
        """
        csv_defaults = self._csv_defaults

        def decode_csv(value):
            parsed_line = tf.decode_csv(value,
                                        record_defaults=list(
                                            csv_defaults.values()),
                                        field_delim=field_delim,
                                        na_value=na_value)
            features = dict(zip(self._csv_defaults.keys(), parsed_line))
            label = None
            # Drop unused feature columns and split off the label.
            for f in self._all_features.values():
                if f != 'label':
                    if f not in self._feature_used:
                        features.pop(f)
                else:
                    label = features.pop('label')
            return features, label

        return decode_csv

    def input_fn(self, mode):
        """
        生成dataset(tensor)
        :return:
            generator
        """
        dataset = tf.data.TextLineDataset(self._data_file)
        dataset = dataset.map(self._parse_csv())  # Decode each line

        # Shuffle, repeat, and batch the examples.
        if mode == 'train':
            dataset = dataset.repeat(10)
        padding_dic = {k: () for k in self._feature_used}
        padded_shapes = (padding_dic, ())
        dataset = dataset.padded_batch(int(self.model_conf['batch_size']),
                                       padded_shapes=padded_shapes)

        # Return the read end of the pipeline.
        return dataset.make_one_shot_iterator().get_next()
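
For orientation, here is a minimal usage sketch showing how this input pipeline could be consumed under the TensorFlow 1.x session API. The file name 'train.csv' is a hypothetical stand-in, not part of the original project.

# Usage sketch (assumption: a CSV file named 'train.csv' matching the
# schema in Config; the name is illustrative only).
import tensorflow as tf

tf_data = TF_Data('train.csv')               # hypothetical data file
features, label = tf_data.input_fn('train')  # next-batch tensors
with tf.Session() as sess:
    try:
        while True:
            # Each run() call pulls one padded batch from the pipeline.
            feature_batch, label_batch = sess.run([features, label])
    except tf.errors.OutOfRangeError:
        pass  # the repeated dataset is exhausted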
Code example #2
File: dataset.py Project: zhangqifan3/GBDT_LR
# Imports required by this snippet; Config is the project's own
# configuration reader and is assumed to be importable.
import csv
import pandas as pd

class DataSet(object):
    '''
    DataSet class
    Handles reading and batching the input data.
    '''
    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        feature_conf = self._conf.read_feature_conf()
        self._feature_conf_dic = feature_conf[0]
        self._feature_used = feature_conf[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _column_to_csv_defaults(self):
        '''
        Define the dtype of each input column and collect the column names.
        :return:
            all_columns: name of every column in the data; type: list
            csv_defaults: default dtype per column, e.g. {'feature name': str, ...}; type: dict
        '''
        features = []
        for i in range(1, len(self._all_features) + 1):
            features.append(self._all_features[str(i)])
        all_columns = ['label'] + features
        csv_defaults = {}
        # Use the built-in types: the np.int / np.str / np.float aliases
        # were removed from NumPy and were plain aliases for these anyway.
        csv_defaults['label'] = int
        for f in self._all_features.values():
            if f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = int
                    else:
                        csv_defaults[f] = str
                else:
                    csv_defaults[f] = float
            else:
                csv_defaults[f] = str
        return all_columns, csv_defaults

    def iter_minibatches(self):
        '''
        Generator: given a file stream (e.g. a large file), yield
        batch_size rows at a time.
        :return:
            each batch converted to a DataFrame
        '''

        cur_line_num = 0
        dataset = []
        batch_size = int(self.model_conf['batch_size'])
        all_columns, csv_defaults = self._csv_defaults
        # 'with' guarantees the file is closed even if the generator is
        # abandoned early (the original closed it after the final yield,
        # which never runs in that case).
        with open(self._data_file, 'rt', encoding="utf-8") as csvfile:
            reader = csv.reader(csvfile, delimiter=' ')
            for line in reader:
                dataset.append(line)
                cur_line_num += 1
                if cur_line_num >= batch_size:
                    batch = pd.DataFrame(dataset, columns=all_columns)
                    yield batch.astype(csv_defaults)
                    dataset = []
                    cur_line_num = 0
            if dataset:  # emit the final, possibly smaller, batch
                batch = pd.DataFrame(dataset, columns=all_columns)
                yield batch.astype(csv_defaults)

    def input_fn(self):
        '''
        Read the whole CSV file into a DataFrame and fill NaN values.
        :return:
            dataset (DataFrame)
        '''
        all_columns, csv_defaults = self._csv_defaults
        dataset = pd.read_csv(self._data_file,
                              sep=' ',
                              names=all_columns,
                              dtype=csv_defaults)
        dataset = dataset.fillna('-')
        return dataset
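
A minimal sketch of driving the streaming reader; the file name and the way each batch is split into features and labels are illustrative assumptions, not taken from the project.

# Usage sketch (assumption: a space-delimited 'train.csv' whose columns
# match the schema in Config; the name is illustrative only).
data = DataSet('train.csv')                 # hypothetical data file
for batch in data.iter_minibatches():
    labels = batch['label']
    features = batch.drop(columns=['label'])
    # hand features/labels to an out-of-core trainer here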
Code example #3
# Imports required by this snippet (TensorFlow 1.x API); Config is the
# project's own configuration reader and is assumed to be importable.
import tensorflow as tf
from collections import OrderedDict

class TF_Data(object):
    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        feature_conf = self._conf.read_feature_conf()
        self._feature_conf_dic = feature_conf[0]
        self._feature_used = feature_conf[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _normalizer_fn_builder(self, scaler, normalization_params):
        """Build a normalizer_fn for a numeric column."""
        if scaler == 'min_max':
            # params: (min, max); the 0.001 floor avoids division by zero
            return lambda x: (x - normalization_params[0]) / (
                max(normalization_params[1] - normalization_params[0], 0.001))
        elif scaler == 'standard':
            # params: (mean, std)
            return lambda x: (x - normalization_params[0]) / normalization_params[1]
        else:
            return lambda x: tf.log(x)

    def _column_to_csv_defaults(self):
        """
        定义csv文件中各个特征默认的数据类型
        :return:
            OrderedDict {'feature name': [''],...}
        """
        csv_defaults = OrderedDict()
        csv_defaults['label'] = [0]
        for f in self._all_features.values():
            if f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = [0]
                    else:
                        csv_defaults[f] = ['']
                else:
                    csv_defaults[f] = [0.0]
            else:
                csv_defaults[f] = ['']
        return csv_defaults

    def _parse_csv(self, field_delim=' ', na_value='-'):
        """
        csv数据的解析函数
        :param field_delim: csv字段分隔符
        :param na_value: 使用csv默认值填充na_value
        :return:
            feature dict: {feature: Tensor ... }
        """
        csv_defaults = self._csv_defaults
        def decode_csv(value):
            parsed_line = tf.decode_csv(value,
                                        record_defaults=list(csv_defaults.values()),
                                        field_delim=field_delim,
                                        na_value=na_value)
            features = dict(zip(self._csv_defaults.keys(), parsed_line))
            # Drop columns that are not used as features.
            for f in self._all_features.values():
                if f not in self._feature_used:
                    features.pop(f)
            # Multi-valued columns hold comma-separated values in one field;
            # split them into variable-length string tensors.
            for f, tensor in features.items():
                if f in ('tag', 'main_actor'):
                    features[f] = tf.string_split([tensor], ',').values
            label = features.pop('label')
            return features, label
        return decode_csv

    def input_fn(self):
        """
        生成dataset(tensor)
        :return:
            generator
        """
        dataset = tf.data.TextLineDataset(self._data_file)
        dataset = dataset.map(self._parse_csv())  # Decode each line

        # Shuffle, repeat, and batch the examples.
        # dataset = dataset.shuffle(10).repeat(1)
        padding_dic = {k: () for k in self._feature_used}
        padding_dic['tag'] = [None]
        # padding_dic['main_actor'] = [None]
        padded_shapes = (padding_dic, ())
        dataset = dataset.padded_batch(int(self.model_conf['batch_size']),
                                       padded_shapes=padded_shapes)

        # Return the read end of the pipeline.
        return dataset.make_one_shot_iterator().get_next()

    def feat_column(self):
        """
        特征列处理
        :return:
            wide_columns
        """
        wide_columns = []
        wide_dim = 0
        for feature, conf in self._feature_conf_dic.items():
            f_type, f_tran, f_param = conf["type"], conf["transform"], conf["parameter"]
            if feature in ('tag', 'main_actor'):
                col = tf.feature_column.categorical_column_with_vocabulary_file(feature,
                                                                                vocabulary_file=f_param)
                col = tf.feature_column.indicator_column(col)
                wide_columns.append(col)
                wide_dim += int(conf["dim"])
            else:
                if f_type == 'category':
                    if f_tran == 'hash_bucket':
                        hash_bucket_size = int(f_param)
                        col = tf.feature_column.categorical_column_with_hash_bucket(feature,
                                                                                    hash_bucket_size=hash_bucket_size,
                                                                                    dtype=tf.string)
                        col = tf.feature_column.indicator_column(col)
                        wide_columns.append(col)
                        wide_dim += hash_bucket_size
                    elif f_tran == 'vocab':
                        col = tf.feature_column.categorical_column_with_vocabulary_list(feature,
                                                                                        vocabulary_list=list(map(str, f_param)),
                                                                                        dtype=None,
                                                                                        default_value=-1,
                                                                                        num_oov_buckets=0)
                        col = tf.feature_column.indicator_column(col)
                        wide_columns.append(col)
                        wide_dim += len(f_param)
                    elif f_tran == 'identity':
                        num_buckets = int(f_param)  # num_buckets must be an int
                        col = tf.feature_column.categorical_column_with_identity(feature,
                                                                                 num_buckets=num_buckets,
                                                                                 default_value=0)
                        col = tf.feature_column.indicator_column(col)
                        wide_columns.append(col)
                        wide_dim += num_buckets
                else:
                    # A normalizer is built but intentionally not passed to
                    # numeric_column (left disabled in the original code).
                    normalizer_fn = self._normalizer_fn_builder(
                        scaler=f_tran, normalization_params=(0, 1))
                    col = tf.feature_column.numeric_column(feature,
                                                           shape=(1,),
                                                           default_value=0,
                                                           dtype=tf.float32)
                    wide_columns.append(col)
                    wide_dim += 1
        return wide_columns

    def gbdt_input(self):
        """
        将特征列处理后的数据转化为array输出
        :return:
            process_data:训练或预估数据集; type:array
            label:数据集对应的标签; type:array
        """
        # Build the pipeline once so features and labels come from the same
        # batch (the original called input_fn() twice, creating two separate
        # iterators over the data).
        features, label_element = self.input_fn()
        tensor = tf.feature_column.input_layer(features, self.feat_column())
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.tables_initializer())
            while True:
                try:
                    # Fetch both in one run so the iterator advances once.
                    process_data, label = sess.run([tensor, label_element])
                    yield process_data, label
                except tf.errors.OutOfRangeError:
                    break
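
A minimal sketch of consuming gbdt_input to assemble the training matrix for the GBDT stage of a GBDT+LR pipeline; the file name and the use of NumPy/scikit-learn here are illustrative assumptions, not part of the original project.

# Usage sketch (assumptions: a 'train.csv' file and scikit-learn for the
# GBDT stage; both are illustrative, not from the original project).
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

tf_data = TF_Data('train.csv')              # hypothetical data file
batches, labels = [], []
for process_data, label in tf_data.gbdt_input():
    batches.append(process_data)
    labels.append(label)
X, y = np.concatenate(batches), np.concatenate(labels)
gbdt = GradientBoostingClassifier().fit(X, y)   # GBDT stage of GBDT+LR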