    def test_encode_decode(self):
        corpus = (
            'This is a corpus of text that provides a bunch of tokens from which '
            'to build a vocabulary. It will be used when strings are encoded '
            'with a SubwordTextTokenizer subclass. The encoder was coded by a coder.'
        )
        alphabet = set(corpus) ^ {' '}

        original = 'This is a coded sentence encoded by the SubwordTextTokenizer.'

        encoder = SubwordTextTokenizer.build_to_target_size_from_corpus(
            [corpus, original], target_size=100, min_val=2, max_val=10)

        # Encoding should be reversible.
        encoded = encoder.encode(original)
        decoded = encoder.decode(encoded)
        self.assertEqual(original, decoded)

        # The substrings 'coded' and 'coder' are frequent enough in the corpus
        # that they should appear in the vocabulary even though they are
        # substrings of other included strings.
        subtoken_strings = encoded
        self.assertIn('encoded_', subtoken_strings)
        self.assertIn('coded_', subtoken_strings)
        self.assertIn('SubwordTextTokenizer_', encoder._all_subtoken_strings)
        self.assertIn('coder_', encoder._all_subtoken_strings)

        # Every character in the corpus should be in the encoder's alphabet and
        # its subtoken vocabulary.
        self.assertTrue(alphabet.issubset(encoder._alphabet))
        for a in alphabet:
            self.assertIn(a, encoder._all_subtoken_strings)
    def test_unicode(self):
        corpus = 'Cat emoticons. \U0001F638 \U0001F639 \U0001F63A \U0001F63B'
        token_counts = collections.Counter(corpus.split(' '))

        encoder = SubwordTextTokenizer.build_to_target_size_from_token_counts(
            100, token_counts, 2, 10)

        self.assertIn('\U0001F638', encoder._alphabet)
        self.assertIn('\U0001F63B', encoder._all_subtoken_strings)
    def __init__(self,
                 sample,
                 append_sos=False,
                 append_eos=False,
                 target_vocab_size=None,
                 min_occurrences=1,
                 max_occurrences=1e3,
                 reserved_tokens=DEFAULT_RESERVED_TOKENS,
                 sos_index=DEFAULT_SOS_INDEX,
                 eos_index=DEFAULT_EOS_INDEX,
                 unknown_index=DEFAULT_UNKNOWN_INDEX,
                 padding_index=DEFAULT_PADDING_INDEX,
                 **kwargs):
        super().__init__(**kwargs)

        self.append_sos = append_sos
        self.append_eos = append_eos
        self.sos_index = sos_index
        self.eos_index = eos_index
        self.unknown_index = unknown_index
        self.reserved_tokens = reserved_tokens
        self.padding_index = padding_index

        if target_vocab_size is None:
            self.tokenizer = SubwordTextTokenizer()
            self.tokenizer.build_from_corpus(sample, min_count=min_occurrences)
        else:
            target_vocab_size -= len(reserved_tokens)
            self.tokenizer = SubwordTextTokenizer.build_to_target_size_from_corpus(
                sample,
                target_size=target_vocab_size,
                min_val=min_occurrences,
                max_val=max_occurrences)

        self.index_to_token = reserved_tokens.copy()
        self.token_to_index = {token: index for index, token in enumerate(reserved_tokens)}
        for token in self.tokenizer.vocab:
            self.index_to_token.append(token)
            self.token_to_index[token] = len(self.index_to_token) - 1
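
    # Sketch (not from the source): a plausible encode() to pair with the
    # __init__ above, mirroring SubwordEncoder.encode further down. The SOS
    # handling is an assumption based on the append_sos flag; `torch` is
    # assumed to be imported.
    def encode(self, sequence):
        # Map subtokens to indices, falling back to the unknown token.
        subtokens = self.tokenizer.encode(sequence)
        vector = [self.token_to_index.get(token, self.unknown_index) for token in subtokens]
        if self.append_sos:
            vector.insert(0, self.sos_index)
        if self.append_eos:
            vector.append(self.eos_index)
        return torch.tensor(vector)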
    def test_raises_exception_when_not_encodable(self):
        corpus = 'the quick brown fox jumps over the lazy dog'
        token_counts = collections.Counter(corpus.split(' '))

        # Deliberately exclude some characters required for encoding from the
        # alphabet and token list, making some strings unencodable.
        encoder = SubwordTextTokenizer.build_to_target_size_from_token_counts(
            100, token_counts, 2, 10)
        original = 'This has UPPER CASE letters that are out of alphabet'

        # Previously there was a bug which produced an infinite loop in this case.
        with self.assertRaises(AssertionError):
            encoder.encode(original)
    def test_small_vocab(self):
        corpus = 'The quick brown fox jumps over the lazy dog'
        token_counts = collections.Counter(corpus.split(' '))
        alphabet = set(corpus) ^ {' '}

        encoder = SubwordTextTokenizer.build_to_target_size_from_token_counts(
            10, token_counts, 2, 10)

        # Even though we requested a smaller vocabulary, every alphabet
        # character must still appear in the encoder's alphabet and subtoken
        # strings so that all expected strings remain encodable.
        self.assertTrue(alphabet.issubset(encoder._alphabet))
        for a in alphabet:
            self.assertIn(a, encoder._all_subtoken_strings)
    def test_encodable_when_not_in_alphabet(self):
        corpus = 'the quick brown fox jumps over the lazy dog'
        token_counts = collections.Counter(corpus.split(' '))

        encoder = SubwordTextTokenizer.build_to_target_size_from_token_counts(
            100, token_counts, 2, 10)
        original = 'This has UPPER CASE letters that are out of alphabet'

        # Early versions could enter an infinite loop when breaking a string
        # into subtokens if it contained any out-of-alphabet characters.
        encoded = encoder.encode(original)
        decoded = encoder.decode(encoded)

        self.assertEqual(original, decoded)
        encoded_str = ''.join(encoded)
        self.assertIn('\\84;', encoded_str)
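
    # Sketch (not from the source): a simplified version of the
    # tensor2tensor-style escaping assumed to produce the '\84;' above.
    # Out-of-alphabet characters are replaced with '\<ord>;', so the 'T' in
    # 'This' (ord('T') == 84) is encoded as '\84;'. The real escaping also
    # handles backslashes/underscores and appends a trailing '_'; the helper
    # name is hypothetical.
    @staticmethod
    def _escape_out_of_alphabet_sketch(token, alphabet):
        # Keep in-alphabet characters, escape everything else by code point.
        return ''.join(c if c in alphabet else '\\%d;' % ord(c) for c in token)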
    def test_token_counts(self):
        token_counts = SubwordTextTokenizer._count_tokens(self.corpus)
        expected = {
            u"'": 2,
            u".": 2,
            u". ": 1,
            u"... ": 1,
            u"Groucho": 1,
            u"Marx": 1,
            u"Mitch": 1,
            u"Hedberg": 1,
            u"I": 3,
            u"in": 2,
            u"my": 2,
            u"pajamas": 2,
        }
        self.assertDictContainsSubset(expected, token_counts)
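
    # Sketch (not from the source): a simplified version of the grouping that
    # _count_tokens is assumed to build on. tensor2tensor's tokenizer groups
    # runs of alphanumeric and non-alphanumeric characters, which is why tokens
    # such as '. ' and '... ' appear in the expected counts above.
    # str.isalnum() stands in for the real alphanumeric character set; the
    # helper name is hypothetical.
    @staticmethod
    def _tokenize_sketch(text):
        if not text:
            return []
        tokens = []
        start = 0
        is_alnum = [c.isalnum() for c in text]
        for pos in range(1, len(text)):
            if is_alnum[pos] != is_alnum[pos - 1]:
                token = text[start:pos]
                # Single spaces between words are dropped; longer
                # non-alphanumeric runs (e.g. '. ' or '... ') are kept whole.
                if token != ' ' or start == 0:
                    tokens.append(token)
                start = pos
        tokens.append(text[start:])
        return tokens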
class SubwordEncoder(TextEncoder):
    """ Invertibly encoding text using a limited vocabulary.

    Applies Googles Tensor2Tensor ``SubwordTextTokenizer`` that invertibly encodes a native string
    as a
    sequence of subtokens from a limited vocabulary. In order to build the vocabulary, it uses
    recursive binary search to find a minimum token count `x`
    (s.t. `min_occurrences` <= `x` <= `max_occurrences`) that most closely matches the
    `target_size`.

    **Tokenizer Reference:**
    https://github.com/tensorflow/tensor2tensor/blob/8bdecbe434d93cb1e79c0489df20fee2d5a37dc2/tensor2tensor/data_generators/text_encoder.py#L389

    Args:
        sample (list): Sample of data used to build encoding dictionary.
        append_eos (bool, optional): If ``True``, append the EOS token onto the end of the encoded
            vector.
        target_vocab_size (int, optional): Desired size of the vocabulary.
        min_occurrences (int, optional): Lower bound for the minimum token count.
        max_occurrences (int, optional): Upper bound for the minimum token count.
        reserved_tokens (list of str, optional): List of reserved tokens inserted at the beginning
            of the dictionary.
        eos_index (int, optional): Index of the EOS token, which is used to encode the end of a
            sequence.
        unknown_index (int, optional): Index of the unknown token, which is used to encode unseen
            tokens.
        padding_index (int, optional): Index of the padding token, which is used to encode sequence
            padding.
        **kwargs: Keyword arguments passed onto ``TextEncoder.__init__``.
    """
    def __init__(self,
                 sample,
                 append_eos=False,
                 target_vocab_size=None,
                 min_occurrences=1,
                 max_occurrences=1e3,
                 reserved_tokens=DEFAULT_RESERVED_TOKENS,
                 eos_index=DEFAULT_EOS_INDEX,
                 unknown_index=DEFAULT_UNKNOWN_INDEX,
                 padding_index=DEFAULT_PADDING_INDEX,
                 **kwargs):
        super().__init__(**kwargs)

        self.append_eos = append_eos
        self.eos_index = eos_index
        self.unknown_index = unknown_index
        self.reserved_tokens = reserved_tokens
        self.padding_index = padding_index

        if target_vocab_size is None:
            self.tokenizer = SubwordTextTokenizer()
            self.tokenizer.build_from_corpus(sample, min_count=min_occurrences)
        else:
            target_vocab_size -= len(reserved_tokens)
            self.tokenizer = SubwordTextTokenizer.build_to_target_size_from_corpus(
                sample,
                target_size=target_vocab_size,
                min_val=min_occurrences,
                max_val=max_occurrences)

        self.itos = reserved_tokens.copy()
        self.stoi = {
            token: index
            for index, token in enumerate(reserved_tokens)
        }
        for token in self.tokenizer.vocab:
            self.itos.append(token)
            self.stoi[token] = len(self.itos) - 1

    @property
    def vocab(self):
        """
        Returns:
            list: List of tokens in the dictionary.
        """
        return self.itos

    @property
    def vocab_size(self):
        """
        Returns:
            int: Number of tokens in the dictionary.
        """
        return len(self.vocab)

    def encode(self, sequence):
        """ Encodes a ``sequence``.

        Args:
            sequence (str): String ``sequence`` to encode.

        Returns:
            torch.Tensor: Encoding of the ``sequence``.
        """
        sequence = super().encode(sequence)
        sequence = self.tokenizer.encode(sequence)
        vector = [
            self.stoi.get(token, self.unknown_index) for token in sequence
        ]
        if self.append_eos:
            vector.append(self.eos_index)
        return torch.tensor(vector)

    def decode(self, encoded):
        """ Decodes a tensor into a sequence.

        Args:
            encoded (torch.Tensor): Encoded sequence.

        Returns:
            str: Sequence decoded from ``encoded``.
        """
        encoded = super().decode(encoded)
        return self.tokenizer.decode([self.itos[index] for index in encoded])
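
# Sketch (not from the source): the binary search over the minimum token count
# that the SubwordEncoder docstring describes. `build_vocab` is a hypothetical
# callable that builds a subword vocabulary for a given minimum count and
# returns it; the real work lives inside
# SubwordTextTokenizer.build_to_target_size_from_corpus.
def _binary_search_vocab_sketch(build_vocab, target_size, min_val, max_val):
    # Raising the minimum count shrinks the vocabulary, so search for the
    # count whose resulting vocabulary size is closest to target_size.
    best = None
    while min_val <= max_val:
        mid = (min_val + max_val) // 2
        vocab = build_vocab(mid)
        if best is None or abs(len(vocab) - target_size) < abs(len(best) - target_size):
            best = vocab
        if len(vocab) > target_size:
            min_val = mid + 1  # too many subtokens -> require more occurrences
        else:
            max_val = mid - 1  # too few subtokens -> allow rarer subtokens
    return best
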
def test_is_pickleable():
    tokenizer = SubwordTextTokenizer()
    pickle.dumps(tokenizer)
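
# Usage sketch (not from the source), assuming the defaults defined in this
# module (DEFAULT_RESERVED_TOKENS, DEFAULT_UNKNOWN_INDEX, ...). The sample
# sentences below are made up for illustration.
def example_usage_sketch():
    sample = [
        'The quick brown fox jumps over the lazy dog.',
        'A second sentence to grow the subword vocabulary.',
    ]
    encoder = SubwordEncoder(sample, target_vocab_size=100)
    encoded = encoder.encode('The quick brown fox')  # torch.Tensor of subtoken indices
    decoded = encoder.decode(encoded)                # decodes back to a string
    return encoded, decoded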