Example #1
def generator(self, data_dir, tmp_dir, is_training):
    """Generator for lm1b sentences.

    Args:
      data_dir: data dir.
      tmp_dir: tmp dir.
      is_training: a boolean.

    Yields:
      A dictionary {"inputs": [0], "targets": [<subword ids>]}
    """
    _maybe_download_corpus(tmp_dir)
    original_vocab = _original_vocab(tmp_dir)
    files = (_train_data_filenames(tmp_dir)
             if is_training else [_dev_data_filename(tmp_dir)])
    if self.is_character_level:
        encoder = text_encoder.ByteTextEncoder()
    else:
        vocab_filepath = os.path.join(data_dir, self.vocab_file)
        encoder = _get_or_build_subword_text_encoder(
            tmp_dir, vocab_filepath)
    for filepath in files:
        tf.logging.info("filepath = %s", filepath)
        for line in tf.gfile.Open(filepath):
            tokens = encoder.encode(
                _replace_oov(original_vocab,
                             text_encoder.native_to_unicode(line)))
            tokens.append(EOS)
            yield {"inputs": [0], "targets": tokens}
Example #2
def generator(self, data_dir, tmp_dir, train):
    character_vocab = text_encoder.ByteTextEncoder()
    datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
    tag = "train" if train else "dev"
    data_path = _compile_data(tmp_dir, datasets, "wmt_encs_chr_%s" % tag)
    return character_generator(data_path + ".lang1", data_path + ".lang2",
                               character_vocab, EOS)
Example #3
def feature_encoders(self, data_dir):
  if self.is_character_level:
    encoder = text_encoder.ByteTextEncoder()
  else:
    vocab_filename = os.path.join(
        data_dir, "vocab.endefr.%d" % self.targeted_vocab_size)
    encoder = text_encoder.SubwordTextEncoder(vocab_filename)
  return {"targets": encoder}
Example #4
def feature_encoders(self, data_dir):
    if self.is_character_level:
        encoder = text_encoder.ByteTextEncoder()
    elif self.use_subword_tokenizer:
        vocab_filename = os.path.join(data_dir, self.vocab_file)
        encoder = text_encoder.SubwordTextEncoder(vocab_filename)
    else:
        vocab_filename = os.path.join(data_dir, self.vocab_file)
        encoder = text_encoder.TokenTextEncoder(vocab_filename)
    if self.has_inputs:
        return {"inputs": encoder, "targets": encoder}
    return {"targets": encoder}
Example #5
def testCharacterGenerator(self):
    # Generate a trivial source and target file.
    tmp_dir = self.get_temp_dir()
    (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
    if six.PY2:
        enc_f = lambda s: s
    else:
        enc_f = lambda s: s.encode("utf-8")
    with io.open(tmp_file_path + ".src", "wb") as src_file:
        src_file.write(enc_f("source1\n"))
        src_file.write(enc_f("source2\n"))
    with io.open(tmp_file_path + ".tgt", "wb") as tgt_file:
        tgt_file.write(enc_f("target1\n"))
        tgt_file.write(enc_f("target2\n"))

    # Call character generator on the generated files.
    results_src, results_tgt = [], []
    character_vocab = text_encoder.ByteTextEncoder()
    for dictionary in wmt.character_generator(tmp_file_path + ".src",
                                              tmp_file_path + ".tgt",
                                              character_vocab):
        self.assertEqual(sorted(list(dictionary)), ["inputs", "targets"])
        results_src.append(dictionary["inputs"])
        results_tgt.append(dictionary["targets"])

    # Check that the results match the files.
    # First check that the results match the encoded original strings;
    # this is a comparison of integer arrays.
    self.assertEqual(len(results_src), 2)
    self.assertEqual(results_src[0], character_vocab.encode("source1"))
    self.assertEqual(results_src[1], character_vocab.encode("source2"))
    self.assertEqual(results_tgt[0], character_vocab.encode("target1"))
    self.assertEqual(results_tgt[1], character_vocab.encode("target2"))
    # Then decode the results and compare with the original strings;
    # this is a comparison of strings.
    self.assertEqual(character_vocab.decode(results_src[0]), "source1")
    self.assertEqual(character_vocab.decode(results_src[1]), "source2")
    self.assertEqual(character_vocab.decode(results_tgt[0]), "target1")
    self.assertEqual(character_vocab.decode(results_tgt[1]), "target2")

    # Clean up.
    os.remove(tmp_file_path + ".src")
    os.remove(tmp_file_path + ".tgt")
    os.remove(tmp_file_path)
Example #6
def main(_):
    """Convert a file to examples."""
    if FLAGS.subword_text_encoder_filename:
        encoder = text_encoder.SubwordTextEncoder(
            FLAGS.subword_text_encoder_filename)
    elif FLAGS.token_text_encoder_filename:
        encoder = text_encoder.TokenTextEncoder(
            FLAGS.token_text_encoder_filename)
    elif FLAGS.byte_text_encoder:
        encoder = text_encoder.ByteTextEncoder()
    else:
        encoder = None
    reader = tf.python_io.tf_record_iterator(FLAGS.input_filename)
    total_sequences = 0
    total_input_tokens = 0
    total_target_tokens = 0
    max_input_length = 0
    max_target_length = 0
    for record in reader:
        x = tf.train.Example()
        x.ParseFromString(record)
        inputs = [
            int(i) for i in x.features.feature["inputs"].int64_list.value
        ]
        targets = [
            int(i) for i in x.features.feature["targets"].int64_list.value
        ]
        if FLAGS.print_inputs:
            print("INPUTS:\n" + (encoder.decode(inputs)
                                 if encoder else str(inputs)))
        if FLAGS.print_targets:
            print("TARGETS:\n" + (encoder.decode(targets)
                                  if encoder else str(targets)))
        total_input_tokens += len(inputs)
        total_target_tokens += len(targets)
        total_sequences += 1
        max_input_length = max(max_input_length, len(inputs))
        max_target_length = max(max_target_length, len(targets))

    tf.logging.info("total_sequences: %d", total_sequences)
    tf.logging.info("total_input_tokens: %d", total_input_tokens)
    tf.logging.info("total_target_tokens: %d", total_target_tokens)
    tf.logging.info("max_input_length: %d", max_input_length)
    tf.logging.info("max_target_length: %d", max_target_length)
Example #7
def generator(self, data_dir, tmp_dir, train):
  filename = os.path.basename(PTB_URL)
  compressed_filepath = generator_utils.maybe_download(
      tmp_dir, filename, PTB_URL)
  ptb_files = []
  ptb_char_files = []
  with tarfile.open(compressed_filepath, "r:gz") as tgz:
    files = []
    # Selecting only relevant files.
    for m in tgz.getmembers():
      if "ptb" in m.name and ".txt" in m.name:
        if "char" in m.name:
          ptb_char_files += [m.name]
        else:
          ptb_files += [m.name]
        files += [m]

    tgz.extractall(tmp_dir, members=files)

  if self.is_character_level:
    files = ptb_char_files
  else:
    files = ptb_files

  train_file, valid_file = None, None
  for filename in files:
    if "train" in filename:
      train_file = os.path.join(tmp_dir, filename)
    elif "valid" in filename:
      valid_file = os.path.join(tmp_dir, filename)

  assert train_file, "Training file not found"
  assert valid_file, "Validation file not found"

  if self.is_character_level:
    encoder = text_encoder.ByteTextEncoder()
  else:
    encoder = _get_token_encoder(data_dir, self.vocab_file, train_file)

  if train:
    return self._generator(train_file, encoder)
  return self._generator(valid_file, encoder)
Example #8
def feature_encoders(self, _):
    return {
        "inputs": text_encoder.TextEncoder(),
        "targets": text_encoder.ByteTextEncoder(),
    }
Example #9
def tabbed_parsing_character_generator(tmp_dir, train):
  """Generate source and target data from a single file."""
  character_vocab = text_encoder.ByteTextEncoder()
  filename = "parsing_{0}.pairs".format("train" if train else "dev")
  pair_filepath = os.path.join(tmp_dir, filename)
  return tabbed_generator(pair_filepath, character_vocab, character_vocab, EOS)
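
Assuming tabbed_generator reads one tab-separated "source<TAB>target" pair per line (the pair file layout is not shown here), the expected input file could be produced like this; the directory and sentence pair are hypothetical:

import os

tmp_dir = "/tmp/parsing_data"  # hypothetical location
os.makedirs(tmp_dir, exist_ok=True)
with open(os.path.join(tmp_dir, "parsing_train.pairs"), "w") as f:
    # One source<TAB>target pair per line.
    f.write("the dog barks\t(S (NP the dog) (VP barks))\n")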