コード例 #1
0
    def test_reset_vocab_size(self):
        r"""Reset vocabulary size after `reset_vocab`."""
        msg = 'Must reset vocabulary size after `reset_vocab`.'
        examples = (
            ('HeLlO WoRlD!', 'I aM a LeGeNd.'),
            ('y = f(x)',),
            ('',),
        )

        sp_tokens_size = len(list(CharDictTokenizer.special_tokens()))

        for batch_sequences in examples:
            for tokenizer in self.tokenizers:
                tokenizer.build_vocab(batch_sequences)
                tokenizer.reset_vocab()
                self.assertEqual(
                    tokenizer.vocab_size,
                    sp_tokens_size,
                    msg=msg
                )
コード例 #2
0
    def test_increase_vocab_size(self):
        r"""Increase vocabulary size after `build_vocab`."""
        msg = 'Must increase vocabulary size after `build_vocab`.'
        examples = (
            (('HeLlO WoRlD!', 'I aM a LeGeNd.'), 18, 15),
            (('y = f(x)',), 24, 21),
            (('',), 24, 21),
        )

        sp_tokens_size = len(list(CharDictTokenizer.special_tokens()))

        for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
            self.cased_tokenizer.build_vocab(batch_sequences)
            self.assertEqual(
                self.cased_tokenizer.vocab_size,
                cased_vocab_size + sp_tokens_size,
                msg=msg
            )
            self.uncased_tokenizer.build_vocab(batch_sequences)
            self.assertEqual(
                self.uncased_tokenizer.vocab_size,
                uncased_vocab_size + sp_tokens_size,
                msg=msg
            )