def decode(self, ids, strip_extraneous=False):
    """Turn a sequence of subtoken IDs back into a native string.

    Args:
      ids: a list of integers in the range [0, vocab_size)
      strip_extraneous: bool; when true, the ids are first passed through
        `strip_ids` together with the reserved ids
        `range(self._num_reserved_ids)` (an empty range when that attribute
        is falsy), which is meant to drop extraneous tokens (EOS and PAD).

    Returns:
      a native string
    """
    if strip_extraneous:
        # `or 0` keeps range() valid when no reserved ids are configured.
        reserved_ids = list(range(self._num_reserved_ids or 0))
        ids = strip_ids(ids, reserved_ids)
    # Map subtoken ids to whole tokens, then let the tokenizer join them.
    tokens = self._subtoken_ids_to_tokens(ids)
    return tokenizer.decode(tokens)
def test_invertibility_on_random_strings(self):
    """Check that tokenizer.decode inverts tokenizer.encode on random input.

    Runs 1000 trials; each trial builds a 10-character unicode string from
    code points drawn uniformly from [0, 65535] and asserts that encoding
    followed by decoding returns the original string.
    """
    num_trials = 1000
    string_length = 10
    for _ in range(num_trials):
        chars = [
            six.unichr(random.randint(0, 65535)) for _ in range(string_length)
        ]
        s = u"".join(chars)
        round_tripped = tokenizer.decode(tokenizer.encode(s))
        self.assertEqual(s, round_tripped)
def test_decode(self):
    """Check that tokenizer.decode joins a fixed token list as expected."""
    tokens = [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."]
    expected = u"Dude - that's so cool."
    self.assertEqual(expected, tokenizer.decode(tokens))