Example #1
 def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
   if self.vocab_type == VocabType.CHARACTER:
     encoder = text_encoder.ByteTextEncoder()
   elif self.vocab_type == VocabType.SUBWORD:
     if force_get:
       vocab_filepath = os.path.join(data_dir, self.vocab_filename)
       encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
     else:
       encoder = generator_utils.get_or_generate_vocab_inner(
           data_dir, self.vocab_filename, self.approx_vocab_size,
           self.generate_text_for_vocab(data_dir, tmp_dir),
           max_subtoken_length=self.max_subtoken_length,
           reserved_tokens=(
               text_encoder.RESERVED_TOKENS + self.additional_reserved_tokens))
   elif self.vocab_type == VocabType.TOKEN:
     vocab_filename = os.path.join(data_dir, self.vocab_filename)
     encoder = text_encoder.TokenTextEncoder(vocab_filename,
                                             replace_oov=self.oov_token)
   else:
     raise ValueError(
         "Unrecognized VocabType: %s" % str(self.vocab_type))
   return encoder
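For reference, a minimal round trip through the VocabType.CHARACTER branch above (a sketch assuming tensor2tensor is installed; ByteTextEncoder reserves ids 0 and 1 for PAD and EOS by default):

from tensor2tensor.data_generators import text_encoder

# Ids are raw UTF-8 byte values shifted past the two reserved ids.
encoder = text_encoder.ByteTextEncoder()
ids = encoder.encode("hi")
print(ids)                  # [106, 107]: bytes 104 and 105 plus the offset of 2
print(encoder.decode(ids))  # "hi"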
Example #2
def main(_):
  """Convert a file to examples."""
  if FLAGS.subword_text_encoder_filename:
    encoder = text_encoder.SubwordTextEncoder(
        FLAGS.subword_text_encoder_filename)
  elif FLAGS.token_text_encoder_filename:
    encoder = text_encoder.TokenTextEncoder(FLAGS.token_text_encoder_filename)
  elif FLAGS.byte_text_encoder:
    encoder = text_encoder.ByteTextEncoder()
  else:
    encoder = None
  reader = tf.python_io.tf_record_iterator(FLAGS.input_filename)
  total_sequences = 0
  total_input_tokens = 0
  total_target_tokens = 0
  max_input_length = 0
  max_target_length = 0
  for record in reader:
    x = tf.train.Example()
    x.ParseFromString(record)
    inputs = [int(i) for i in x.features.feature["inputs"].int64_list.value]
    targets = [int(i) for i in x.features.feature["targets"].int64_list.value]
    if FLAGS.print_inputs:
      print("INPUTS:\n" + (encoder.decode(inputs) if encoder else str(inputs)))
    if FLAGS.print_targets:
      print("TARGETS:\n" + (encoder.decode(targets) if encoder else str(targets)))
    total_input_tokens += len(inputs)
    total_target_tokens += len(targets)
    total_sequences += 1
    max_input_length = max(max_input_length, len(inputs))
    max_target_length = max(max_target_length, len(targets))

  tf.logging.info("total_sequences: %d", total_sequences)
  tf.logging.info("total_input_tokens: %d", total_input_tokens)
  tf.logging.info("total_target_tokens: %d", total_target_tokens)
  tf.logging.info("max_input_length: %d", max_input_length)
  tf.logging.info("max_target_length: %d", max_target_length)
Example #3
 def generator(self, data_dir, tmp_dir, is_training):
      # In this test problem, we assume that the data is in tmp_dir/ocr/ in
      # files named 0.png, 0.txt, 1.png, 1.txt, and so on up to num_examples.
     character_vocab = text_encoder.ByteTextEncoder()
     ocr_dir = os.path.join(tmp_dir, "ocr/")
     num_examples = int(len(os.listdir(ocr_dir)) / 2)
     tf.logging.info("Looking for OCR data in %s." % ocr_dir)
      for i in range(num_examples):
         image_filepath = os.path.join(ocr_dir, "%d.png" % i)
         text_filepath = os.path.join(ocr_dir, "%d.txt" % i)
         with tf.gfile.Open(text_filepath, "r") as f:
             label = f.read()
         with tf.gfile.Open(image_filepath, "rb") as f:
             encoded_image_data = f.read()
         # In PNG files width and height are stored in these bytes.
         width, height = struct.unpack(">ii", encoded_image_data[16:24])
         encoded_label = character_vocab.encode(label.strip())
         yield {
             "image/encoded": [encoded_image_data],
             "image/format": ["png"],
             "image/class/label": encoded_label,
             "image/height": [height],
             "image/width": [width]
         }
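The struct.unpack offsets rely on the fixed PNG layout: an 8-byte signature, then the IHDR chunk's 4-byte length and 4-byte type, so width and height are the big-endian 32-bit integers at byte offsets 16 and 20. A standalone sketch (the file name is hypothetical):

import struct

with open("0.png", "rb") as f:
    header = f.read(24)
# Bytes 0-7: PNG signature; 8-11: IHDR chunk length; 12-15: b"IHDR";
# 16-23: width and height as big-endian 32-bit ints.
width, height = struct.unpack(">ii", header[16:24])
print(width, height)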
Example #4
def tabbed_parsing_character_generator(tmp_dir, train):
  """Generate source and target data from a single file."""
  character_vocab = text_encoder.ByteTextEncoder()
  filename = "parsing_%s" % ("train" if train else "dev")
  pair_filepath = os.path.join(tmp_dir, filename + ".pairs")
  return tabbed_generator(pair_filepath, character_vocab, character_vocab, EOS)
Example #5
def parsing_character_generator(tmp_dir, train):
  character_vocab = text_encoder.ByteTextEncoder()
  filename = "parsing_%s" % ("train" if train else "dev")
  text_filepath = os.path.join(tmp_dir, filename + ".text")
  tags_filepath = os.path.join(tmp_dir, filename + ".tags")
  return character_generator(text_filepath, tags_filepath, character_vocab, EOS)
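Both helpers hand the ByteTextEncoder to a shared generator over parallel files. A hedged sketch of what such a character_generator might look like (the real helper lives in tensor2tensor's translation data generators; this reconstruction is illustrative):

import tensorflow as tf

def character_generator(source_path, target_path, character_vocab, eos=None):
  # Yield one {"inputs", "targets"} dict per aligned line pair, with an
  # optional EOS id appended to each side.
  eos_list = [] if eos is None else [eos]
  with tf.gfile.GFile(source_path) as source_file:
    with tf.gfile.GFile(target_path) as target_file:
      for source, target in zip(source_file, target_file):
        yield {
            "inputs": character_vocab.encode(source.strip()) + eos_list,
            "targets": character_vocab.encode(target.strip()) + eos_list,
        }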
Example #6
 def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
     return text_encoder.ByteTextEncoder()
Example #7
 def feature_encoders(self, _):
     return {
         "inputs": text_encoder.TextEncoder(),
         "targets": text_encoder.ByteTextEncoder(),
     }
Example #8
def _default_character_feature_encoders():
    return {
        "inputs": text_encoder.ByteTextEncoder(),
        "targets": text_encoder.ByteTextEncoder(),
    }
Example #9
 def generator(self, data_dir, tmp_dir, train):
   character_vocab = text_encoder.ByteTextEncoder()
   datasets = (_TEXT_SIMPLIFICATION_TRAIN_DATASETS
               if train else _TEXT_SIMPLIFICATION_TEST_DATASETS)
   return character_generator(datasets[0], datasets[1], character_vocab, EOS)
Example #10
 def feature_encoders(self, data_dir):
     encoders = {
         "inputs": text_encoder.ByteTextEncoder(),
         "targets": text_encoder.ByteTextEncoder(),
     }
     return encoders
Example #11
 def feature_encoders(self, _):
     return {
         "waveforms": AudioEncoder(),
         "targets": text_encoder.ByteTextEncoder(),
     }
Example #12
 def feature_encoders(self, data_dir):
     del data_dir
     return {
         "inputs": text_encoder.ByteTextEncoder(num_reserved_ids=0),
         "targets": BinaryClassLabelEncoder(),
     }
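Here num_reserved_ids=0 drops the default PAD/EOS offset, so ids equal raw byte values. A quick sketch of the difference (assuming tensor2tensor is installed):

from tensor2tensor.data_generators import text_encoder

default_enc = text_encoder.ByteTextEncoder()                # PAD=0, EOS=1 reserved
raw_enc = text_encoder.ByteTextEncoder(num_reserved_ids=0)  # no reserved ids

print(default_enc.encode("A"))  # [67]: byte 65 shifted by the 2 reserved ids
print(raw_enc.encode("A"))      # [65]: the raw byte value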