Example #1
    def decode(self, ids, strip_extraneous=False):
        # strip_extraneous is accepted for API compatibility but unused here.
        # Map each id back to its subword symbol (a string of hex digit pairs).
        atoms = [self.all_symbols[idx] for idx in ids]

        # Rebuild the raw UTF-8 byte stream: each non-overlapping pair of hex
        # digits in an atom encodes one byte.
        array = bytearray()
        for atom in atoms:
            for pair in pairwise_nonverlapping(atom):
                array.append(int("".join(pair), 16))

        # TODO: handle a possible UnicodeDecodeError from malformed byte streams.
        decoded_atoms = [
            atom for atom in array.decode("utf-8").split(EOW) if atom
        ]

        # Strip meta symbols (e.g. escape sequences) and detokenize.
        atoms = [remove_meta_symbols(atom) for atom in decoded_atoms]
        return t2t_tokenizer.decode(atoms)
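The helper pairwise_nonverlapping is not shown in this example (the name is kept verbatim from the snippet). Assuming it groups a string into consecutive, non-overlapping character pairs, so that two hex digits yield one byte, a minimal sketch could be:

def pairwise_nonverlapping(s):
    # "e29083" -> ("e", "2"), ("9", "0"), ("8", "3")
    # Zipping the same iterator with itself consumes two characters per pair.
    it = iter(s)
    return zip(it, it)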
Example #2
  def decode(self, subtokens):
    """Converts a sequence of subtoken ids to a native string.

    Args:
      subtokens: a list of integers in the range [0, vocab_size)
    Returns:
      a native string
    """
    return unicode_to_native(
        tokenizer.decode(self._subtoken_ids_to_tokens(subtokens)))
Example #3
 def decode(self, ids, strip_extraneous=False):
     # Optionally drop reserved ids (PAD/EOS) before decoding.
     if strip_extraneous:
         ids = strip_ids(ids, list(range(self._num_reserved_ids or 0)))
     # Look up each id's subword string and join them into one escaped string.
     substrings = [self.all_symbols[idx] for idx in ids]
     concat = "".join(substrings)
     # Subwords merge into whole tokens at end-of-word (EOW) markers.
     split = concat.split(EOW)
     tokens = []
     for subtoken in split:
         if subtoken:
             unescaped_token = remove_meta_symbols(subtoken)
             if unescaped_token:
                 tokens.append(unescaped_token)
     return t2t_tokenizer.decode(tokens)
Example #4
    def decode(self, ids, strip_extraneous=False):
        """Converts a sequence of subtoken ids to a native string.

        Args:
          ids: a list of integers in the range [0, vocab_size)
          strip_extraneous: bool, whether to strip off extraneous tokens
            (EOS and PAD).

        Returns:
          a native string
        """
        if strip_extraneous:
            ids = strip_ids(ids, list(range(self._num_reserved_ids or 0)))
        return unicode_to_native(
            tokenizer.decode(self._subtoken_ids_to_tokens(ids)))
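strip_ids is not defined in these snippets. Assuming it removes trailing reserved ids (the PAD and EOS markers mentioned in the docstring), a minimal sketch could be:

def strip_ids(ids, ids_to_strip):
    # Drop trailing ids that belong to the reserved set (e.g. PAD, EOS).
    ids = list(ids)
    while ids and ids[-1] in ids_to_strip:
        ids.pop()
    return ids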
Example #5
  def decode(self, ids, strip_extraneous=False):
    """Converts a sequence of subtoken ids to a native string.

    Args:
      ids: a list of integers in the range [0, vocab_size)
      strip_extraneous: bool, whether to strip off extraneous tokens
        (EOS and PAD).

    Returns:
      a native string
    """
    if strip_extraneous:
      ids = strip_ids(ids, list(range(self._num_reserved_ids or 0)))
    return unicode_to_native(
        tokenizer.decode(self._subtoken_ids_to_tokens(ids)))
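In tensor2tensor, decode methods like the ones above are usually reached through a SubwordTextEncoder instance. A hedged usage sketch (the vocabulary file name here is hypothetical):

from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder

encoder = SubwordTextEncoder("vocab.subwords")  # hypothetical vocab file
ids = encoder.encode("Dude - that's so cool.")
print(encoder.decode(ids))  # should round-trip to the original string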
Example #6
 def test_invertibility_on_random_strings(self):
     for _ in range(1000):
         s = u"".join(
             six.unichr(random.randint(0, 65535)) for _ in range(10))
         self.assertEqual(s, tokenizer.decode(tokenizer.encode(s)))
Example #7
 def test_decode(self):
     self.assertEqual(
         u"Dude - that's so cool.",
         tokenizer.decode(
             [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."]))
Example #8
 def testInvertibilityOnRandomStrings(self):
     # Seed for determinism; the Python 2 xrange/unichr calls are replaced
     # with range and six.unichr so the test also runs on Python 3.
     random.seed(123)
     for _ in range(1000):
         s = u"".join(
             [six.unichr(random.randint(0, 65535)) for _ in range(10)])
         self.assertEqual(s, tokenizer.decode(tokenizer.encode(s)))
Example #9
 def test_invertibility_on_random_strings(self):
   for _ in range(1000):
     s = u"".join(six.unichr(random.randint(0, 65535)) for _ in range(10))
     self.assertEqual(s, tokenizer.decode(tokenizer.encode(s)))
Example #10
 def test_decode(self):
   self.assertEqual(
       u"Dude - that's so cool.",
       tokenizer.decode(
           [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."]))