def decode(self, ids, strip_extraneous=False):
    """Converts a sequence of subtoken ids to a native string.

    Each vocabulary symbol is a hex string; every non-overlapping pair of
    hex digits encodes one byte of the UTF-8 representation of the text.

    Args:
      ids: a list of integers in the range [0, vocab_size)
      strip_extraneous: bool, whether to strip off extraneous tokens
        (EOS and PAD).

    Returns:
      a native string
    """
    # Honor strip_extraneous for consistency with the other decoders
    # (previously this flag was accepted but silently ignored).
    if strip_extraneous:
        ids = strip_ids(ids, list(range(self._num_reserved_ids or 0)))
    atoms = [self.all_symbols[idx] for idx in ids]
    array = bytearray()
    for atom in atoms:
        # Each consecutive, non-overlapping pair of hex digits is one byte.
        for pair in pairwise_nonverlapping(atom):
            array.append(int("".join(pair), 16))
    # NOTE(review): decode("utf-8") can raise UnicodeDecodeError on a
    # malformed byte sequence -- TODO: decide on an error policy
    # (e.g. errors="replace") before this sees untrusted ids.
    decoded_atoms = [
        atom for atom in array.decode("utf-8").split(EOW) if atom
    ]
    tokens = [remove_meta_symbols(atom) for atom in decoded_atoms]
    return t2t_tokenizer.decode(tokens)
def decode(self, subtokens):
    """Converts a sequence of subtoken ids to a native string.

    Args:
      subtokens: a list of integers in the range [0, vocab_size)

    Returns:
      a native string
    """
    token_strings = self._subtoken_ids_to_tokens(subtokens)
    detokenized = tokenizer.decode(token_strings)
    return unicode_to_native(detokenized)
def decode(self, subtokens):
    """Turns a list of subtoken ids back into a native string.

    Args:
      subtokens: a list of integers in the range [0, vocab_size)

    Returns:
      a native string
    """
    return unicode_to_native(
        tokenizer.decode(
            self._subtoken_ids_to_tokens(subtokens)))
def decode(self, ids, strip_extraneous=False):
    """Converts a sequence of subtoken ids to a native string.

    Args:
      ids: a list of integers in the range [0, vocab_size)
      strip_extraneous: bool, whether to strip off extraneous tokens
        (EOS and PAD).

    Returns:
      a native string
    """
    if strip_extraneous:
        ids = strip_ids(ids, list(range(self._num_reserved_ids or 0)))
    # Concatenate the symbols and cut the stream at end-of-word markers.
    joined = "".join(self.all_symbols[idx] for idx in ids)
    tokens = []
    for piece in joined.split(EOW):
        if not piece:
            continue
        unescaped = remove_meta_symbols(piece)
        if unescaped:
            tokens.append(unescaped)
    return t2t_tokenizer.decode(tokens)
def decode(self, ids, strip_extraneous=False):
    """Converts a sequence of subtoken ids to a native string.

    Args:
      ids: a list of integers in the range [0, vocab_size)
      strip_extraneous: bool, whether to strip off extraneous tokens
        (EOS and PAD).

    Returns:
      a native string
    """
    if strip_extraneous:
        reserved = list(range(self._num_reserved_ids or 0))
        ids = strip_ids(ids, reserved)
    tokens = self._subtoken_ids_to_tokens(ids)
    return unicode_to_native(tokenizer.decode(tokens))
def decode(self, ids, strip_extraneous=False):
    """Maps subtoken ids back to a native string.

    Args:
      ids: a list of integers in the range [0, vocab_size)
      strip_extraneous: bool, whether to strip off extraneous tokens
        (EOS and PAD).

    Returns:
      a native string
    """
    if strip_extraneous:
        ids = strip_ids(ids, list(range(self._num_reserved_ids or 0)))
    decoded = tokenizer.decode(self._subtoken_ids_to_tokens(ids))
    return unicode_to_native(decoded)
def test_invertibility_on_random_strings(self):
    """decode(encode(s)) must reproduce arbitrary unicode strings."""
    for _ in range(1000):
        sample = u"".join(
            six.unichr(random.randint(0, 65535)) for _ in range(10))
        self.assertEqual(
            sample, tokenizer.decode(tokenizer.encode(sample)))
def test_decode(self):
    """Decoding a token list re-assembles text with correct spacing."""
    token_list = [
        u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."
    ]
    expected = u"Dude - that's so cool."
    self.assertEqual(expected, tokenizer.decode(token_list))
def testInvertibilityOnRandomStrings(self):
    """decode(encode(s)) must reproduce random unicode strings.

    Uses six.unichr and range for Python 2/3 compatibility, consistent
    with the other invertibility tests in this file (the bare `unichr`
    and `xrange` builtins do not exist on Python 3).
    """
    random.seed(123)
    for _ in range(1000):
        s = u"".join(
            [six.unichr(random.randint(0, 65535)) for _ in range(10)])
        self.assertEqual(s, tokenizer.decode(tokenizer.encode(s)))
def test_invertibility_on_random_strings(self):
    """encode followed by decode is the identity on random strings."""
    trials = 1000
    while trials:
        trials -= 1
        chars = [six.unichr(random.randint(0, 65535)) for _ in range(10)]
        s = u"".join(chars)
        self.assertEqual(s, tokenizer.decode(tokenizer.encode(s)))
def test_decode(self):
    """tokenizer.decode joins a token sequence into the original text."""
    pieces = [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."]
    self.assertEqual(u"Dude - that's so cool.", tokenizer.decode(pieces))