Example #1
  def test_encode(self):
    self.assertListEqual(
        [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."],
        tokenizer.encode(u"Dude - that's so cool."))
    self.assertListEqual([u"Łukasz", u"est", u"né", u"en", u"1981", u"."],
                         tokenizer.encode(u"Łukasz est né en 1981."))
    self.assertListEqual([u" ", u"Spaces", u"at", u"the", u"ends", u" "],
                         tokenizer.encode(u" Spaces at the ends "))
    self.assertListEqual([u"802", u".", u"11b"], tokenizer.encode(u"802.11b"))
    self.assertListEqual([u"two", u". \n", u"lines"],
                         tokenizer.encode(u"two. \nlines"))
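For context, the behaviour these assertions check can be reproduced directly. A minimal sketch, assuming the tensor2tensor import path for the tokenizer module (the sample string is illustrative, not from the test):

from tensor2tensor.data_generators import tokenizer

toks = tokenizer.encode(u"802.11b")          # [u"802", u".", u"11b"]
assert tokenizer.decode(toks) == u"802.11b"  # encode/decode round-trips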
Example #2
def _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath):
    """Builds a SubwordTextEncoder based on the corpus.

    Args:
      tmp_dir: directory containing dataset.
      vocab_filepath: path to store (or load) vocab.

    Returns:
      a SubwordTextEncoder.
    """
    if tf.gfile.Exists(vocab_filepath):
        return text_encoder.SubwordTextEncoder(vocab_filepath)
    _maybe_download_corpus(tmp_dir)
    original_vocab = _original_vocab(tmp_dir)
    token_counts = defaultdict(int)
    line_count = 0
    max_lines = 63000
    for line in tf.gfile.Open(_train_data_filenames(tmp_dir)[0]):
        tokens = tokenizer.encode(
            _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
        for tok in tokens:
            token_counts[tok] += 1
        line_count += 1
        if line_count >= max_lines:
            break
    ret = text_encoder.SubwordTextEncoder()
    ret.build_from_token_counts(token_counts, min_count=5)
    ret.store_to_file(vocab_filepath)
    return ret
Example #3
    def encode(self, raw_text):
        """Converts a native string to a list of subtoken ids.

        Args:
          raw_text: a native string.
        Returns:
          a list of integers in the range [0, vocab_size)
        """
        return self._tokens_to_subtoken_ids(
            tokenizer.encode(native_to_unicode(raw_text)))
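A brief sketch of how this method might be exercised end to end, assuming a previously built subword vocab file (the path "vocab.subwords" below is hypothetical):

from tensor2tensor.data_generators import text_encoder

# "vocab.subwords" is a hypothetical path to an existing subword vocab file.
encoder = text_encoder.SubwordTextEncoder("vocab.subwords")
ids = encoder.encode("Dude - that's so cool.")  # list of ints in [0, vocab_size)
print(encoder.decode(ids))                      # recovers the original text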
Example #4
def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
                                generator):
    """Inner implementation for vocab generators.

    Args:
      data_dir: The base directory where data and vocab files are stored. If
        None, then do not save the vocab even if it doesn't exist.
      vocab_filename: relative filename where vocab file is stored
      vocab_size: target size of the vocabulary constructed by SubwordTextEncoder
      generator: a generator that produces tokens from the vocabulary

    Returns:
      A SubwordTextEncoder vocabulary object.
    """
    if data_dir is None:
        vocab_filepath = None
    else:
        vocab_filepath = os.path.join(data_dir, vocab_filename)

    if vocab_filepath is not None and tf.gfile.Exists(vocab_filepath):
        tf.logging.info("Found vocab file: %s", vocab_filepath)
        vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
        return vocab

    tf.logging.info("Generating vocab file: %s", vocab_filepath)
    token_counts = defaultdict(int)
    for item in generator:
        for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
            token_counts[tok] += 1

    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        vocab_size, token_counts, 1, 1e3)

    if vocab_filepath is not None:
        vocab.store_to_file(vocab_filepath)
    return vocab
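To illustrate the control flow above, a hedged sketch that drives get_or_generate_vocab_inner with a small in-memory generator; the corpus lines and target size are made-up values, and passing data_dir=None skips writing the vocab file, as the docstring notes:

# Illustrative only: a tiny corpus and a toy vocab size, not from the source.
def toy_corpus():
    yield u"Łukasz est né en 1981."
    yield u"Dude - that's so cool."

vocab = get_or_generate_vocab_inner(
    data_dir=None,             # None: build in memory, do not persist the vocab
    vocab_filename="vocab.toy",
    vocab_size=100,
    generator=toy_corpus())
print(vocab.vocab_size)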
Example #5
  def test_invertibility_on_random_strings(self):
    for _ in xrange(1000):
      s = u"".join(six.unichr(random.randint(0, 65535)) for _ in xrange(10))
      self.assertEqual(s, tokenizer.decode(tokenizer.encode(s)))