def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
  """Return the feature encoder matching this problem's `vocab_type`.

  Args:
    data_dir: directory where the vocab file lives (or is written).
    tmp_dir: scratch directory handed to `generate_text_for_vocab`.
    force_get: for SUBWORD vocabs, load the existing vocab file instead of
      (re)generating it.

  Returns:
    A `text_encoder.TextEncoder` subclass instance.

  Raises:
    ValueError: if `self.vocab_type` is not a recognized VocabType.
  """
  if self.vocab_type == VocabType.CHARACTER:
    return text_encoder.ByteTextEncoder()

  if self.vocab_type == VocabType.SUBWORD:
    if force_get:
      # Caller asserts the vocab already exists on disk; just load it.
      return text_encoder.SubwordTextEncoder(
          os.path.join(data_dir, self.vocab_filename))
    return generator_utils.get_or_generate_vocab_inner(
        data_dir, self.vocab_filename, self.approx_vocab_size,
        self.generate_text_for_vocab(data_dir, tmp_dir),
        max_subtoken_length=self.max_subtoken_length,
        reserved_tokens=(
            text_encoder.RESERVED_TOKENS + self.additional_reserved_tokens))

  if self.vocab_type == VocabType.TOKEN:
    return text_encoder.TokenTextEncoder(
        os.path.join(data_dir, self.vocab_filename),
        replace_oov=self.oov_token)

  raise ValueError("Unrecognized VocabType: %s" % str(self.vocab_type))
def main(_):
  """Convert a TFRecord file to examples and log token statistics.

  Reads `FLAGS.input_filename`, optionally decodes/prints each example's
  inputs and targets, and logs sequence counts plus token-length stats.
  """
  # Pick a decoder based on which flag was supplied; None means "print raw
  # integer ids".
  if FLAGS.subword_text_encoder_filename:
    encoder = text_encoder.SubwordTextEncoder(
        FLAGS.subword_text_encoder_filename)
  elif FLAGS.token_text_encoder_filename:
    encoder = text_encoder.TokenTextEncoder(FLAGS.token_text_encoder_filename)
  elif FLAGS.byte_text_encoder:
    encoder = text_encoder.ByteTextEncoder()
  else:
    encoder = None
  reader = tf.python_io.tf_record_iterator(FLAGS.input_filename)
  total_sequences = 0
  total_input_tokens = 0
  total_target_tokens = 0
  max_input_length = 0
  max_target_length = 0
  for record in reader:
    x = tf.train.Example()
    x.ParseFromString(record)
    inputs = [int(i) for i in x.features.feature["inputs"].int64_list.value]
    targets = [int(i) for i in x.features.feature["targets"].int64_list.value]
    if FLAGS.print_inputs:
      # BUG FIX: previously the ternary bound over the whole concatenation
      # ("..." + decode(...) if encoder else inputs), so with no encoder the
      # "INPUTS:" label was silently dropped and a bare list was printed.
      # Parenthesize the conditional so the label is always emitted.
      print("INPUTS:\n" + (encoder.decode(inputs) if encoder else str(inputs)))
    if FLAGS.print_targets:
      print("TARGETS:\n" +
            (encoder.decode(targets) if encoder else str(targets)))
    total_input_tokens += len(inputs)
    total_target_tokens += len(targets)
    total_sequences += 1
    max_input_length = max(max_input_length, len(inputs))
    max_target_length = max(max_target_length, len(targets))
  tf.logging.info("total_sequences: %d", total_sequences)
  tf.logging.info("total_input_tokens: %d", total_input_tokens)
  tf.logging.info("total_target_tokens: %d", total_target_tokens)
  tf.logging.info("max_input_length: %d", max_input_length)
  tf.logging.info("max_target_length: %d", max_target_length)
def generator(self, data_dir, tmp_dir, is_training):
  """Yields OCR examples read from tmp_dir/ocr/.

  Assumes the data lives in tmp_dir/ocr/ as paired files named
  0.png/0.txt, 1.png/1.txt, ... up to num_examples.

  Yields:
    Dicts with encoded PNG bytes, format, byte-encoded label text, and
    the image height/width parsed from the PNG header.
  """
  byte_vocab = text_encoder.ByteTextEncoder()
  ocr_dir = os.path.join(tmp_dir, "ocr/")
  pair_count = int(len(os.listdir(ocr_dir)) / 2)
  tf.logging.info("Looking for OCR data in %s." % ocr_dir)
  for idx in xrange(pair_count):
    png_path = os.path.join(ocr_dir, "%d.png" % idx)
    txt_path = os.path.join(ocr_dir, "%d.txt" % idx)
    with tf.gfile.Open(txt_path, "r") as text_file:
      transcript = text_file.read()
    with tf.gfile.Open(png_path, "rb") as image_file:
      png_bytes = image_file.read()
    # PNG stores width then height as big-endian int32s at byte offset 16.
    width, height = struct.unpack(">ii", png_bytes[16:24])
    yield {
        "image/encoded": [png_bytes],
        "image/format": ["png"],
        "image/class/label": byte_vocab.encode(transcript.strip()),
        "image/height": [height],
        "image/width": [width]
    }
def tabbed_parsing_character_generator(tmp_dir, train):
  """Generate source and target data from a single tab-separated pairs file."""
  vocab = text_encoder.ByteTextEncoder()
  split = "train" if train else "dev"
  pair_filepath = os.path.join(tmp_dir, "parsing_%s.pairs" % split)
  # The same byte-level vocab encodes both sides of each pair.
  return tabbed_generator(pair_filepath, vocab, vocab, EOS)
def parsing_character_generator(tmp_dir, train):
  """Byte-level generator over parallel .text/.tags parsing files."""
  vocab = text_encoder.ByteTextEncoder()
  split = "train" if train else "dev"
  text_filepath = os.path.join(tmp_dir, "parsing_%s.text" % split)
  tags_filepath = os.path.join(tmp_dir, "parsing_%s.tags" % split)
  return character_generator(text_filepath, tags_filepath, vocab, EOS)
def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
  """Character-level problem: the vocab is fixed, so nothing to create."""
  del data_dir, tmp_dir, force_get  # Unused: byte encoder needs no files.
  return text_encoder.ByteTextEncoder()
def feature_encoders(self, _):
  """Pass-through encoder for inputs; byte-level encoder for targets."""
  encoders = {
      "inputs": text_encoder.TextEncoder(),
      "targets": text_encoder.ByteTextEncoder(),
  }
  return encoders
def _default_character_feature_encoders():
  """Byte-level encoders for both inputs and targets."""
  make_byte_encoder = text_encoder.ByteTextEncoder
  return {
      "inputs": make_byte_encoder(),
      "targets": make_byte_encoder(),
  }
def generator(self, data_dir, tmp_dir, train):
  """Byte-level generator over the text-simplification dataset pair."""
  vocab = text_encoder.ByteTextEncoder()
  dataset_pair = (_TEXT_SIMPLIFICATION_TRAIN_DATASETS
                  if train else _TEXT_SIMPLIFICATION_TEST_DATASETS)
  # First entry is the source file, second is the simplified target.
  source_path, target_path = dataset_pair[0], dataset_pair[1]
  return character_generator(source_path, target_path, vocab, EOS)
def feature_encoders(self, data_dir):
  """Byte-level encoders for both inputs and targets; data_dir is unused."""
  return {
      "inputs": text_encoder.ByteTextEncoder(),
      "targets": text_encoder.ByteTextEncoder(),
  }
def feature_encoders(self, _):
  """Audio waveforms in; byte-encoded text targets out."""
  encoders = {
      "waveforms": AudioEncoder(),
      "targets": text_encoder.ByteTextEncoder(),
  }
  return encoders
def feature_encoders(self, data_dir):
  """Byte inputs (no reserved ids) with a binary class-label target."""
  del data_dir  # Unused: encoders need no vocab files.
  encoders = {
      # num_reserved_ids=0: raw byte values, no PAD/EOS offset.
      "inputs": text_encoder.ByteTextEncoder(num_reserved_ids=0),
      "targets": BinaryClassLabelEncoder(),
  }
  return encoders