  def untokenize_agnostic(self, token_list):
    """Turns CuBERT subtokens into whole tokens."""
    # The token sequence must be well-formed: it has to end with EOS.
    if (not token_list or token_list[-1] != unified_tokenizer.quote_special(
        unified_tokenizer.TokenKind.EOS.name)):
      raise ValueError('Token list %r should end with the EOS token %r.' %
                       (token_list,
                        unified_tokenizer.quote_special(
                            unified_tokenizer.TokenKind.EOS.name)))

    whole_tokens = unified_tokenizer.reconstitute_full_unsanitary_tokens(
        token_list,
        sanitization_mapping=self.mappings,
        sentinel=unified_tokenizer.SENTINEL)
    return whole_tokens
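
  # A minimal usage sketch (hypothetical `subtokens` and `tokenizer`; the
  # tokenizer is an instance of a concrete subclass of this class):
  #
  #   eos = unified_tokenizer.quote_special(
  #       unified_tokenizer.TokenKind.EOS.name)
  #   whole_tokens = tokenizer.untokenize_agnostic(subtokens + [eos])
  #
  # Omitting the trailing EOS token triggers the ValueError above.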

  def untokenize(self, token_list):
    """Untokenizes via `untokenize_abstract`."""
    # The token sequence must be well-formed: it has to end with EOS.
    if (not token_list or token_list[-1] != unified_tokenizer.quote_special(
        unified_tokenizer.TokenKind.EOS.name)):
      raise ValueError(
          'Token list %r should end with the EOS token %r.' %
          (token_list,
           unified_tokenizer.quote_special(
               unified_tokenizer.TokenKind.EOS.name)))

    whole_tokens = unified_tokenizer.reconstitute_full_unsanitary_tokens(
        token_list,
        sanitization_mapping=self.mappings,
        sentinel=unified_tokenizer.SENTINEL)

    return self.untokenize_abstract(whole_tokens)
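
  # `untokenize` differs from `untokenize_agnostic` only in its last step:
  # the reconstituted whole tokens are handed to the subclass hook
  # `untokenize_abstract`. A hedged sketch of such a hook (hypothetical
  # implementation; real subclasses may render tokens differently):
  #
  #   def untokenize_abstract(self, whole_tokens):
  #     # Join the whole tokens back into a single source string.
  #     return ''.join(whole_tokens)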

  def test_reconstitute_raises_when_expected(self, subtokens, mappings):
    with self.assertRaises(ValueError):
      unified_tokenizer.reconstitute_full_unsanitary_tokens(subtokens,
                                                            mappings,
                                                            sentinel='^')

  def test_reconstitute_returns_expected(self, subtokens, mappings,
                                         expected_tokens):
    whole_tokens = unified_tokenizer.reconstitute_full_unsanitary_tokens(
        subtokens, mappings, sentinel='^')
    self.assertSequenceEqual(expected_tokens, whole_tokens)
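
  # These tests presumably receive their arguments from a parameterized
  # decorator on the surrounding test class (not shown here). Each case
  # supplies:
  #   subtokens       - the subtoken list to reconstitute
  #   mappings        - the sanitization mapping produced at tokenization
  #   expected_tokens - the whole tokens expected back (success cases only)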