Example #1
def _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath):
    """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_filepath: path to store (or load) vocab.

  Returns:
    a SubwordTextEncoder.
  """
    if tf.gfile.Exists(vocab_filepath):
        return text_encoder.SubwordTextEncoder(vocab_filepath)
    _maybe_download_corpus(tmp_dir)
    original_vocab = _original_vocab(tmp_dir)
    token_counts = defaultdict(int)
    line_count = 0
    max_lines = 63000
    for line in tf.gfile.Open(_train_data_filenames(tmp_dir)[0]):
        tokens = tokenizer.encode(
            _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
        for tok in tokens:
            token_counts[tok] += 1
        line_count += 1
        if line_count >= max_lines:
            break
    ret = text_encoder.SubwordTextEncoder()
    ret.build_from_token_counts(token_counts, min_count=5)
    ret.store_to_file(vocab_filepath)
    return ret
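
Usage note for Example #1 (a minimal sketch, not part of the original code): once built or loaded, the encoder round-trips between text and subtoken ids. The vocab path below is hypothetical.

# Sketch only: encode() maps a string to a list of subtoken ids,
# decode() maps the ids back to a string.
from tensor2tensor.data_generators import text_encoder

encoder = text_encoder.SubwordTextEncoder("/tmp/lm1b.subword.vocab")  # hypothetical path
ids = encoder.encode("The quick brown fox.")
print(ids)
print(encoder.decode(ids))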
Example #2
 def feature_encoders(self, data_dir):
     source_vocab_filename = os.path.join(data_dir, self.source_vocab_name)
     target_vocab_filename = os.path.join(data_dir, self.target_vocab_name)
     source_token = text_encoder.SubwordTextEncoder(source_vocab_filename)
     target_token = text_encoder.SubwordTextEncoder(target_vocab_filename)
     return {
         "inputs": source_token,
         "targets": target_token,
     }
Example #3
 def feature_encoders(self, data_dir):
   source_vocab_filename = os.path.join(
       data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size)
   target_vocab_filename = os.path.join(
       data_dir, "ice_target.tokens.vocab.%d" % self.targeted_vocab_size)
   source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename)
   target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename)
   return {
       "inputs": source_subtokenizer,
       "targets": target_subtokenizer,
   }
Example #4
    def test_load_from_file(self):
        # Test a vocab file with words not wrapped with single quotes
        encoder = text_encoder.SubwordTextEncoder()
        correct_vocab = ["the", "and", "of"]
        vocab = io.StringIO("the\n" "and\n" "of\n")
        encoder._load_from_file_object(vocab)
        self.assertEqual(encoder._all_subtoken_strings, correct_vocab)

        # Test a vocab file with words wrapped in single quotes
        encoder = text_encoder.SubwordTextEncoder()
        vocab = io.StringIO("\"the\"\n" "\"and\"\n" "\"of\"\n")
        encoder._load_from_file_object(vocab)
        self.assertEqual(encoder._all_subtoken_strings, correct_vocab)
Example #5
 def feature_encoders(self, data_dir):
   # This vocab file must be present within the data directory.
   vocab_filename = os.path.join(data_dir, "charset_size134.txt")
   return {
       "inputs": text_encoder.TextEncoder(),
       "targets": text_encoder.SubwordTextEncoder(vocab_filename)
   }
Example #6
def _get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
  """Read or create vocabulary."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  print('Vocab file path: ' + vocab_filepath)

  if tf.gfile.Exists(vocab_filepath):
    gs = text_encoder.SubwordTextEncoder(vocab_filepath)
    return gs
  example_file = os.path.join(tmp_dir, _EXAMPLES_FILE)
  gs = text_encoder.SubwordTextEncoder()
  token_counts = tokenizer.corpus_token_counts(
      example_file, corpus_max_lines=1000000)
  gs = gs.build_to_target_size(
      vocab_size, token_counts, min_val=1, max_val=1e3)
  gs.store_to_file(vocab_filepath)
  return gs
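
Note on Example #6 (illustrative sketch, made-up counts): build_from_token_counts keeps subtokens using a fixed min_count, while build_to_target_size searches min_count within [min_val, max_val] to bring the vocabulary close to the requested size.

from tensor2tensor.data_generators import text_encoder

token_counts = {"the": 100, "quick": 7, "brown": 3}  # toy counts for illustration

# Fixed threshold: subtokens must occur at least min_count times.
enc_fixed = text_encoder.SubwordTextEncoder()
enc_fixed.build_from_token_counts(token_counts, min_count=2)

# Target size: the builder searches for a min_count that lands near the target.
enc_sized = text_encoder.SubwordTextEncoder.build_to_target_size(
    32, token_counts, min_val=1, max_val=1000)
print(enc_fixed.vocab_size, enc_sized.vocab_size)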
Example #7
 def feature_encoders(self, data_dir):
     vocab_filename = os.path.join(data_dir, self.vocab_file)
     encoder = text_encoder.SubwordTextEncoder(vocab_filename)
     return {
         "inputs": encoder,
         "targets": text_encoder.ClassLabelEncoder(["neg", "pos"]),
     }
Example #8
 def feature_encoders(self, data_dir):
     vocab_filename = os.path.join(
         data_dir, "vocab.endefr.%d" % self.target_vocab_size)
     subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
     return {
         "inputs": text_encoder.TextEncoder(),
         "targets": subtokenizer,
     }
Example #9
 def feature_encoders(self, data_dir):
   if self.is_character_level:
     encoder = text_encoder.ByteTextEncoder()
   else:
     vocab_filename = os.path.join(
         data_dir, "vocab.endefr.%d" % self.targeted_vocab_size)
     encoder = text_encoder.SubwordTextEncoder(vocab_filename)
   return {"targets": encoder}
Example #10
 def feature_encoders(self, data_dir):
     if self.is_character_level:
         encoder = text_encoder.ByteTextEncoder()
     elif self.use_subword_tokenizer:
         vocab_filename = os.path.join(data_dir, self.vocab_file)
         encoder = text_encoder.SubwordTextEncoder(vocab_filename)
     else:
         vocab_filename = os.path.join(data_dir, self.vocab_file)
         encoder = text_encoder.TokenTextEncoder(vocab_filename)
     if self.has_inputs:
         return {"inputs": encoder, "targets": encoder}
     return {"targets": encoder}
Example #11
    def test_reserved_token_chars_not_in_alphabet(self):
        corpus = "dog"
        token_counts = collections.Counter(corpus.split(" "))
        encoder1 = text_encoder.SubwordTextEncoder.build_to_target_size(
            100, token_counts, 2, 100)
        filename = os.path.join(self.test_temp_dir, "out.voc")
        encoder1.store_to_file(filename)
        encoder2 = text_encoder.SubwordTextEncoder(filename=filename)

        self.assertEqual(encoder1._alphabet, encoder2._alphabet)

        for t in text_encoder.RESERVED_TOKENS:
            for c in t:
                # Verify that encoders can encode all reserved token chars.
                encoder1.encode(c)
                encoder2.encode(c)
Example #12
def main(_):
    """Convert a file to examples."""
    if FLAGS.subword_text_encoder_filename:
        encoder = text_encoder.SubwordTextEncoder(
            FLAGS.subword_text_encoder_filename)
    elif FLAGS.token_text_encoder_filename:
        encoder = text_encoder.TokenTextEncoder(
            FLAGS.token_text_encoder_filename)
    elif FLAGS.byte_text_encoder:
        encoder = text_encoder.ByteTextEncoder()
    else:
        encoder = None
    reader = tf.python_io.tf_record_iterator(FLAGS.input_filename)
    total_sequences = 0
    total_input_tokens = 0
    total_target_tokens = 0
    max_input_length = 0
    max_target_length = 0
    for record in reader:
        x = tf.train.Example()
        x.ParseFromString(record)
        inputs = [
            int(i) for i in x.features.feature["inputs"].int64_list.value
        ]
        targets = [
            int(i) for i in x.features.feature["targets"].int64_list.value
        ]
        if FLAGS.print_inputs:
            # Note: the conditional covers the whole expression, so without an
            # encoder the raw id list is printed instead of the labeled text.
            print("INPUTS:\n" + encoder.decode(inputs) if encoder else inputs)
        if FLAGS.print_targets:
            print("TARGETS:\n" +
                  encoder.decode(targets) if encoder else targets)
        total_input_tokens += len(inputs)
        total_target_tokens += len(targets)
        total_sequences += 1
        max_input_length = max(max_input_length, len(inputs))
        max_target_length = max(max_target_length, len(targets))

    tf.logging.info("total_sequences: %d", total_sequences)
    tf.logging.info("total_input_tokens: %d", total_input_tokens)
    tf.logging.info("total_target_tokens: %d", total_target_tokens)
    tf.logging.info("max_input_length: %d", max_input_length)
    tf.logging.info("max_target_length: %d", max_target_length)
Example #13
def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
                                generator):
    """Inner implementation for vocab generators.

  Args:
    data_dir: The base directory where data and vocab files are stored. If None,
        then do not save the vocab even if it doesn't exist.
    vocab_filename: relative filename where vocab file is stored
    vocab_size: target size of the vocabulary constructed by SubwordTextEncoder
    generator: a generator that produces tokens from the vocabulary

  Returns:
    A SubwordTextEncoder vocabulary object.
  """
    if data_dir is None:
        vocab_filepath = None
    else:
        vocab_filepath = os.path.join(data_dir, vocab_filename)

    if vocab_filepath is not None and tf.gfile.Exists(vocab_filepath):
        tf.logging.info("Found vocab file: %s", vocab_filepath)
        vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
        return vocab

    tf.logging.info("Generating vocab file: %s", vocab_filepath)
    token_counts = defaultdict(int)
    for item in generator:
        for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
            token_counts[tok] += 1

    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        vocab_size, token_counts, 1, 1e3)

    if vocab_filepath is not None:
        vocab.store_to_file(vocab_filepath)
    return vocab
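
A hypothetical call site for Example #13, passing a plain Python generator of text lines (names are made up; data_dir=None skips saving the vocab file):

def toy_text_generator():
    # In practice this would stream lines from the training corpus.
    for line in ["first training sentence", "second training sentence"]:
        yield line

vocab = get_or_generate_vocab_inner(
    data_dir=None,                  # or a real directory to persist the vocab
    vocab_filename="vocab.toy.32",  # hypothetical name
    vocab_size=32,
    generator=toy_text_generator())
print(vocab.vocab_size)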
Example #14
def main(unused_argv):
    if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
        raise ValueError(
            'Must only provide one of --corpus_filepattern or --vocab_filepattern'
        )

    elif FLAGS.corpus_filepattern:
        token_counts = tokenizer.corpus_token_counts(
            FLAGS.corpus_filepattern,
            FLAGS.corpus_max_lines,
            split_on_newlines=FLAGS.split_on_newlines)

    elif FLAGS.vocab_filepattern:
        token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                    FLAGS.corpus_max_lines)

    else:
        raise ValueError(
            'Must provide one of --corpus_filepattern or --vocab_filepattern')

    encoder = text_encoder.SubwordTextEncoder()
    encoder.build_from_token_counts(token_counts, FLAGS.min_count,
                                    FLAGS.num_iterations)
    encoder.store_to_file(FLAGS.output_filename)