import csv
from collections import OrderedDict

import pandas as pd
import tensorflow as tf  # the tf.* calls below target the TF 1.x graph-mode API

# Config is the project's configuration reader (defined elsewhere in this repo);
# it exposes read_feature_conf(), read_schema_conf() and read_model_conf().


class TF_Data(object):
    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        self._feature_conf_dic = self._conf.read_feature_conf()[0]
        self._feature_used = self._conf.read_feature_conf()[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _column_to_csv_defaults(self):
        """
        Define the default value (and hence dtype) of each feature in the CSV file.
        :return: OrderedDict {'feature name': [default], ...}
        """
        csv_defaults = OrderedDict()
        csv_defaults['label'] = [0]
        for f in self._all_features.values():
            if f == 'label':
                csv_defaults['label'] = [0]
            elif f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    # identity-transformed categories are stored as ints,
                    # all other categories as strings
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = [0]
                    else:
                        csv_defaults[f] = ['']
                else:
                    csv_defaults[f] = [0.0]
            else:
                csv_defaults[f] = ['']
        return csv_defaults

    def _parse_csv(self, field_delim='&', na_value='-'):
        """
        Build the parsing function for CSV lines.
        :param field_delim: CSV field delimiter
        :param na_value: value treated as missing and replaced by the column default
        :return: a function mapping one line to (feature dict {feature: Tensor, ...}, label)
        """
        csv_defaults = self._csv_defaults

        def decode_csv(value):
            parsed_line = tf.decode_csv(value,
                                        record_defaults=list(csv_defaults.values()),
                                        field_delim=field_delim,
                                        na_value=na_value)
            features = dict(zip(csv_defaults.keys(), parsed_line))
            # Drop features the model does not use, then split off the label.
            for f in self._all_features.values():
                if f != 'label' and f not in self._feature_used:
                    features.pop(f)
            label = features.pop('label')
            return features, label

        return decode_csv

    def input_fn(self, mode):
        """
        Build the dataset (tensors).
        :return: next-element tensors of a one-shot iterator
        """
        dataset = tf.data.TextLineDataset(self._data_file)
        dataset = dataset.map(self._parse_csv())  # Decode each line.
        # Repeat and batch the examples.
        if mode == 'train':
            dataset = dataset.repeat(10)
        padding_dic = {k: () for k in self._feature_used}
        padded_shapes = (padding_dic, ())
        dataset = dataset.padded_batch(int(self.model_conf['batch_size']),
                                       padded_shapes=padded_shapes)
        # Return the read end of the pipeline.
        return dataset.make_one_shot_iterator().get_next()
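
# Usage sketch (illustrative, not from the original source): pull one padded
# batch out of the TF_Data pipeline above. 'train.csv' is a hypothetical data
# file whose columns follow the schema configuration; TF 1.x graph mode is assumed.
def _demo_tf_data_batch():
    next_batch = TF_Data('train.csv').input_fn(mode='train')
    with tf.Session() as sess:
        features, label = sess.run(next_batch)
        print({name: arr.shape for name, arr in features.items()}, label.shape)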
class DataSet(object):
    """
    DataSet class: pandas-based handling of the input data.
    """
    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        self._feature_conf_dic = self._conf.read_feature_conf()[0]
        self._feature_used = self._conf.read_feature_conf()[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _column_to_csv_defaults(self):
        """
        Define the dtype of each input column and collect the column names.
        :return: all_columns: name of every column, in schema order; type: list
                 csv_defaults: dtype of each column; type: dict {'feature name': dtype, ...}
        """
        features = []
        for i in range(1, len(self._all_features) + 1):
            features.append(self._all_features[str(i)])
        all_columns = ['label'] + features
        # Plain builtins instead of the removed np.int/np.str/np.float aliases.
        csv_defaults = {'label': int}
        for f in self._all_features.values():
            if f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = int
                    else:
                        csv_defaults[f] = str
                else:
                    csv_defaults[f] = float
            else:
                csv_defaults[f] = str
        return all_columns, csv_defaults

    def iter_minibatches(self):
        """
        Generator: given a file stream (e.g. one large file), yield
        batch_size rows at a time, each chunk converted to a DataFrame.
        """
        cur_line_num = 0
        dataset = []
        all_columns, csv_defaults = self._csv_defaults
        with open(self._data_file, 'rt', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile, delimiter=' ')
            for line in reader:
                dataset.append(line)
                cur_line_num += 1
                if cur_line_num >= int(self.model_conf['batch_size']):
                    yield pd.DataFrame(dataset, columns=all_columns).astype(csv_defaults)
                    dataset = []
                    cur_line_num = 0
            if dataset:  # flush the last, possibly short, batch
                yield pd.DataFrame(dataset, columns=all_columns).astype(csv_defaults)

    def input_fn(self):
        """
        Read the whole CSV file into a DataFrame and fill NaN values.
        :return: dataset
        """
        all_columns, csv_defaults = self._csv_defaults
        dataset = pd.read_csv(self._data_file, sep=' ',
                              names=all_columns, dtype=csv_defaults)
        dataset = dataset.fillna('-')
        return dataset
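
# Usage sketch (illustrative, not from the original source): stream the
# hypothetical 'train.csv' in batch_size-row DataFrame chunks, the shape of
# input expected by incremental (partial_fit-style) trainers.
def _demo_dataset_minibatches():
    for batch_df in DataSet('train.csv').iter_minibatches():
        X = batch_df.drop(columns=['label'])
        y = batch_df['label']
        print(X.shape, y.shape)  # one minibatch per iteration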
class TF_Data(object):
    def __init__(self, data_file):
        self._conf = Config()
        self._data_file = data_file
        self._feature_conf_dic = self._conf.read_feature_conf()[0]
        self._feature_used = self._conf.read_feature_conf()[1]
        self._all_features = self._conf.read_schema_conf()
        self.model_conf = self._conf.read_model_conf()['model_conf']
        self._csv_defaults = self._column_to_csv_defaults()

    def _normalizer_fn_builder(self, scaler, normalization_params):
        """Build a normalizer_fn for numeric columns."""
        if scaler == 'min_max':
            return lambda x: (x - normalization_params[0]) / (
                max(normalization_params[1] - normalization_params[0], 0.001))
        elif scaler == 'standard':
            return lambda x: (x - normalization_params[0]) / normalization_params[1]
        else:
            return lambda x: tf.log(x)

    def _column_to_csv_defaults(self):
        """
        Define the default value (and hence dtype) of each feature in the CSV file.
        :return: OrderedDict {'feature name': [default], ...}
        """
        csv_defaults = OrderedDict()
        csv_defaults['label'] = [0]
        for f in self._all_features.values():
            if f in self._feature_used:
                conf = self._feature_conf_dic[f]
                if conf['type'] == 'category':
                    if conf['transform'] == 'identity':
                        csv_defaults[f] = [0]
                    else:
                        csv_defaults[f] = ['']
                else:
                    csv_defaults[f] = [0.0]
            else:
                csv_defaults[f] = ['']
        return csv_defaults

    def _parse_csv(self, field_delim=' ', na_value='-'):
        """
        Build the parsing function for CSV lines.
        :param field_delim: CSV field delimiter
        :param na_value: value treated as missing and replaced by the column default
        :return: a function mapping one line to (feature dict {feature: Tensor, ...}, label)
        """
        csv_defaults = self._csv_defaults

        def decode_csv(value):
            parsed_line = tf.decode_csv(value,
                                        record_defaults=list(csv_defaults.values()),
                                        field_delim=field_delim,
                                        na_value=na_value)
            features = dict(zip(csv_defaults.keys(), parsed_line))
            # Drop features the model does not use.
            for f in self._all_features.values():
                if f != 'label' and f not in self._feature_used:
                    features.pop(f)
            # Split comma-separated multi-value fields into variable-length values.
            for f, tensor in features.items():
                if f in ('tag', 'main_actor'):
                    features[f] = tf.string_split([tensor], ',').values
            label = features.pop('label')
            return features, label

        return decode_csv

    def input_fn(self):
        """
        Build the dataset (tensors).
        :return: next-element tensors of a one-shot iterator
        """
        dataset = tf.data.TextLineDataset(self._data_file)
        dataset = dataset.map(self._parse_csv())  # Decode each line.
        # Shuffle and repeat if needed:
        # dataset = dataset.shuffle(10).repeat(1)
        padding_dic = {k: () for k in self._feature_used}
        padding_dic['tag'] = [None]  # variable-length field, padded per batch
        # padding_dic['main_actor'] = [None]
        padded_shapes = (padding_dic, ())
        dataset = dataset.padded_batch(int(self.model_conf['batch_size']),
                                       padded_shapes=padded_shapes)
        # Return the read end of the pipeline.
        return dataset.make_one_shot_iterator().get_next()

    def feat_column(self):
        """
        Build the feature columns.
        :return: wide_columns
        """
        wide_columns = []
        wide_dim = 0
        for feature, conf in self._feature_conf_dic.items():
            f_type, f_tran, f_param = conf['type'], conf['transform'], conf['parameter']
            if feature in ('tag', 'main_actor'):
                col = tf.feature_column.categorical_column_with_vocabulary_file(
                    feature, vocabulary_file=f_param)
                col = tf.feature_column.indicator_column(col)
                wide_columns.append(col)
                wide_dim += int(conf['dim'])
            else:
                if f_type == 'category':
                    if f_tran == 'hash_bucket':
                        hash_bucket_size = int(f_param)
                        col = tf.feature_column.categorical_column_with_hash_bucket(
                            feature, hash_bucket_size=hash_bucket_size, dtype=tf.string)
                        col = tf.feature_column.indicator_column(col)
                        wide_columns.append(col)
                        wide_dim += hash_bucket_size
                    elif f_tran == 'vocab':
                        col = tf.feature_column.categorical_column_with_vocabulary_list(
                            feature, vocabulary_list=list(map(str, f_param)),
                            dtype=None, default_value=-1, num_oov_buckets=0)
                        col = tf.feature_column.indicator_column(col)
                        wide_columns.append(col)
                        wide_dim += len(f_param)
                    elif f_tran == 'identity':
                        num_buckets = int(f_param)
                        col = tf.feature_column.categorical_column_with_identity(
                            feature, num_buckets=num_buckets, default_value=0)
                        col = tf.feature_column.indicator_column(col)
                        wide_columns.append(col)
                        wide_dim += num_buckets
                else:
                    # normalizer_fn is built but currently disabled; pass
                    # normalizer_fn=normalizer_fn to numeric_column to enable it.
                    normalizer_fn = self._normalizer_fn_builder(
                        scaler=f_tran, normalization_params=(0, 1))
                    col = tf.feature_column.numeric_column(
                        feature, shape=(1,), default_value=0, dtype=tf.float32)
                    wide_columns.append(col)
                    wide_dim += 1
        return wide_columns

    def gbdt_input(self):
        """
        Generator: run the feature columns over the input pipeline and yield
        the transformed batches as dense arrays.
        :return: process_data: training or evaluation batch; type: array
                 label: labels for the batch; type: array
        """
        # Build the pipeline once and fetch features and labels in a single
        # sess.run so that the two stay aligned on the same iterator step.
        features, label_element = self.input_fn()
        tensor = tf.feature_column.input_layer(features, self.feat_column())
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.tables_initializer())
            while True:
                try:
                    process_data, label = sess.run([tensor, label_element])
                    yield process_data, label
                except tf.errors.OutOfRangeError:
                    break
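
# Usage sketch (illustrative, not from the original source): consume the dense
# (batch_size, wide_dim) feature matrices yielded by gbdt_input, e.g. to
# accumulate a training set for a GBDT library. 'train.csv' is hypothetical.
def _demo_gbdt_input():
    for process_data, label in TF_Data('train.csv').gbdt_input():
        print(process_data.shape, label.shape)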