    def __init__(self,
                 sample,
                 append_eos=False,
                 target_vocab_size=None,
                 min_occurrences=1,
                 max_occurrences=1e3,
                 reserved_tokens=RESERVED_ITOS):
        self.append_eos = append_eos

        if target_vocab_size is None:
            self.tokenizer = SubwordTextTokenizer()
            self.tokenizer.build_from_corpus(sample, min_count=min_occurrences)
        else:
            target_vocab_size -= len(reserved_tokens)
            self.tokenizer = SubwordTextTokenizer.build_to_target_size_from_corpus(
                sample,
                target_size=target_vocab_size,
                min_val=min_occurrences,
                max_val=max_occurrences)

        self.itos = reserved_tokens.copy()
        self.stoi = {
            token: index
            for index, token in enumerate(reserved_tokens)
        }
        for token in self.tokenizer.vocab:
            self.itos.append(token)
            self.stoi[token] = len(self.itos) - 1
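
The `else` branch above delegates vocabulary sizing to `SubwordTextTokenizer.build_to_target_size_from_corpus`, which (per the class docstring in Example #3 below) binary-searches for a minimum token count whose resulting vocabulary most closely matches the requested size. The snippet below is a simplified sketch of that search, not the library's implementation; `build_vocab` is a hypothetical helper standing in for one full vocabulary-building pass.

def build_to_target_size_sketch(sample, target_size, min_val, max_val, build_vocab):
    # Sketch only: `build_vocab(sample, min_count)` is assumed to return the
    # vocabulary obtained when subtokens occurring fewer than `min_count` times
    # are pruned, so a larger `min_count` yields a smaller vocabulary.

    def bisect(low, high):
        mid = (low + high) // 2
        vocab = build_vocab(sample, min_count=mid)
        if low >= high or len(vocab) == target_size:
            return vocab
        if len(vocab) > target_size:
            # Too many subtokens: raise the minimum count to prune more of them.
            other = bisect(mid + 1, high)
        else:
            # Too few subtokens: lower the minimum count to keep more of them.
            other = bisect(low, mid - 1)
        # Keep whichever candidate vocabulary lands closer to the target size.
        if abs(len(other) - target_size) < abs(len(vocab) - target_size):
            return other
        return vocab

    return bisect(min_val, max_val)
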
Example #3
class SubwordEncoder(TextEncoder):
    """ Invertibly encoding text using a limited vocabulary.

    Applies Googles Tensor2Tensor SubwordTextTokenizer that invertibly encodes a native string as a
    sequence of subtokens from a limited vocabulary. In order to build the vocabulary, it uses
    recursive binary search to find a minimum token count `x`
    (s.t. `min_occurrences` <= `x` <= `max_occurrences`) that most closely matches the
    `target_size`.

    Tokenization Algorithm Reference:
    https://github.com/tensorflow/tensor2tensor/blob/8bdecbe434d93cb1e79c0489df20fee2d5a37dc2/tensor2tensor/data_generators/text_encoder.py#L389

    Args:
        sample (list of strings): Sample of data to build dictionary on
        append_eos (bool, optional): If `True` append EOS token onto the end to the encoded vector.
        target_vocab_size (int, optional): Desired size of vocab.
        min_occurrences (int, optional): Lower bound for the minimum token count.
        max_occurrences (int, optional): Upper bound for the minimum token count.
    """
    def __init__(self,
                 sample,
                 append_eos=False,
                 target_vocab_size=None,
                 min_occurrences=1,
                 max_occurrences=1e3):
        self.append_eos = append_eos

        if target_vocab_size is None:
            self.tokenizer = SubwordTextTokenizer()
            self.tokenizer.build_from_corpus(sample, min_count=min_occurrences)
        else:
            target_vocab_size -= len(RESERVED_ITOS)
            self.tokenizer = SubwordTextTokenizer.build_to_target_size_from_corpus(
                sample,
                target_size=target_vocab_size,
                min_val=min_occurrences,
                max_val=max_occurrences)

        self.stoi = RESERVED_STOI.copy()
        self.itos = RESERVED_ITOS[:]
        for token in self.tokenizer.vocab:
            self.itos.append(token)
            self.stoi[token] = len(self.itos) - 1

    @property
    def vocab(self):
        return self.itos

    def encode(self, text):
        text = self.tokenizer.encode(text)
        vector = [self.stoi.get(token, UNKNOWN_INDEX) for token in text]
        if self.append_eos:
            vector.append(EOS_INDEX)
        return torch.LongTensor(vector)

    def decode(self, tensor):
        tokens = [self.itos[index] for index in tensor]
        return self.tokenizer.decode(tokens)
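
Below is a minimal usage sketch for the `SubwordEncoder` class above. The import path is an assumption (it may differ between package versions), and the sample text, requested vocabulary size, and resulting values are illustrative only.

# Assumption: SubwordEncoder is exposed by the surrounding torchnlp package;
# the exact import path may differ between versions.
from torchnlp.encoders.text import SubwordEncoder

sample = [
    'The quick brown fox jumps over the lazy dog.',
    'Pack my box with five dozen liquor jugs.',
]

# Requesting a target vocabulary size triggers the binary search described in
# the class docstring; the resulting vocabulary size is only approximate.
encoder = SubwordEncoder(sample, target_vocab_size=100)

tensor = encoder.encode('The quick brown fox.')  # torch.LongTensor of subtoken indices
text = encoder.decode(tensor)                    # round-trips back to the original string
print(len(encoder.vocab), tensor, text)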

    def test_encode_decode(self):
        corpus = (
            'This is a corpus of text that provides a bunch of tokens from which '
            'to build a vocabulary. It will be used when strings are encoded '
            'with a SubwordTextTokenizer subclass. The encoder was coded by a coder.'
        )
        alphabet = set(corpus) - {' '}  # every corpus character except the space separator

        original = 'This is a coded sentence encoded by the SubwordTextTokenizer.'

        encoder = SubwordTextTokenizer.build_to_target_size_from_corpus(
            [corpus, original], target_size=100, min_val=2, max_val=10)

        # Encoding should be reversible.
        encoded = encoder.encode(original)
        decoded = encoder.decode(encoded)
        self.assertEqual(original, decoded)

        # The substrings coded and coder are frequent enough in the corpus that
        # they should appear in the vocabulary even though they are substrings
        # of other included strings.
        subtoken_strings = encoded
        self.assertIn('encoded_', subtoken_strings)
        self.assertIn('coded_', subtoken_strings)
        self.assertIn('SubwordTextTokenizer_', encoder._all_subtoken_strings)
        self.assertIn('coder_', encoder._all_subtoken_strings)

        # Every character in the corpus should be in the encoder's alphabet and
        # its subtoken vocabulary.
        self.assertTrue(alphabet.issubset(encoder._alphabet))
        for a in alphabet:
            self.assertIn(a, encoder._all_subtoken_strings)

    def test_unicode(self):
        corpus = 'Cat emoticons. \U0001F638 \U0001F639 \U0001F63A \U0001F63B'
        token_counts = collections.Counter(corpus.split(' '))

        encoder = SubwordTextTokenizer.build_to_target_size_from_token_counts(
            100, token_counts, 2, 10)

        self.assertIn('\U0001F638', encoder._alphabet)
        self.assertIn('\U0001F63B', encoder._all_subtoken_strings)

    def test_raises_exception_when_not_encodable(self):
        corpus = 'the quick brown fox jumps over the lazy dog'
        token_counts = collections.Counter(corpus.split(' '))

        # The corpus is all lowercase, so some required encoding characters
        # (e.g. uppercase letters) are excluded from the alphabet and token
        # list, making some strings unencodable.
        encoder = SubwordTextTokenizer.build_to_target_size_from_token_counts(
            100, token_counts, 2, 10)
        original = 'This has UPPER CASE letters that are out of alphabet'

        # Previously there was a bug which produced an infinite loop in this case.
        with self.assertRaises(AssertionError):
            encoder.encode(original)

    def test_small_vocab(self):
        corpus = 'The quick brown fox jumps over the lazy dog'
        token_counts = collections.Counter(corpus.split(' '))
        alphabet = set(corpus) - {' '}  # every corpus character except the space separator

        encoder = SubwordTextTokenizer.build_to_target_size_from_token_counts(
            10, token_counts, 2, 10)

        # Even though a smaller vocabulary was requested, every character of
        # the corpus should still be in the alphabet and in the subtoken
        # strings, so that all expected strings remain encodable.
        self.assertTrue(alphabet.issubset(encoder._alphabet))
        for a in alphabet:
            self.assertIn(a, encoder._all_subtoken_strings)

    def test_encodable_when_not_in_alphabet(self):
        corpus = 'the quick brown fox jumps over the lazy dog'
        token_counts = collections.Counter(corpus.split(' '))

        encoder = SubwordTextTokenizer.build_to_target_size_from_token_counts(
            100, token_counts, 2, 10)
        original = 'This has UPPER CASE letters that are out of alphabet'

        # Early versions could enter an infinite loop when breaking a string into
        # subtokens if it contained any out-of-alphabet characters.
        encoded = encoder.encode(original)
        decoded = encoder.decode(encoded)

        self.assertEqual(original, decoded)
        encoded_str = ''.join(encoded)
        self.assertIn('\\84;', encoded_str)

    def test_token_counts(self):
        token_counts = SubwordTextTokenizer._count_tokens(self.corpus)
        expected = {
            u"'": 2,
            u".": 2,
            u". ": 1,
            u"... ": 1,
            u"Groucho": 1,
            u"Marx": 1,
            u"Mitch": 1,
            u"Hedberg": 1,
            u"I": 3,
            u"in": 2,
            u"my": 2,
            u"pajamas": 2,
        }
        self.assertDictContainsSubset(expected, token_counts)

def test_is_pickleable():
    tokenizer = SubwordTextTokenizer()
    # Serialization should not raise.
    pickle.dumps(tokenizer)
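
`test_is_pickleable` only checks that `pickle.dumps` does not raise. A hedged extension sketch that also restores the object (the test name and the isinstance assertion are illustrative, not part of the original suite):

import pickle


def test_is_pickleable_round_trip():
    # Sketch only: serialize and restore a fresh tokenizer, then confirm the
    # restored object is still a SubwordTextTokenizer instance.
    tokenizer = SubwordTextTokenizer()
    restored = pickle.loads(pickle.dumps(tokenizer))
    assert isinstance(restored, SubwordTextTokenizer)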