class TestDecode(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharDictTokenizer.decode`.

    Every test runs against a cased and an uncased tokenizer whose
    vocabularies are built from the same two-sentence corpus.  Table-driven
    tests wrap each example in `subTest`, so a failing example is reported
    together with its inputs and does not hide the remaining examples.
    """

    @classmethod
    def setUpClass(cls):
        r"""Create the corpus used to build every tokenizer's vocabulary."""
        cls.vocab_source = [
            'Hello World!',
            'I am a legend.',
        ]

    @classmethod
    def tearDownClass(cls):
        r"""Delete the shared corpus and trigger a garbage collection."""
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        # Expected: `decode(self, token_ids: Iterable[int],
        #                   remove_special_tokens: bool = False) -> str`.
        self.assertEqual(
            inspect.signature(CharDictTokenizer.decode),
            inspect.Signature(parameters=[
                inspect.Parameter(
                    name='self',
                    kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                ),
                inspect.Parameter(name='token_ids',
                                  kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                                  annotation=Iterable[int],
                                  default=inspect.Parameter.empty),
                inspect.Parameter(name='remove_special_tokens',
                                  kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                                  annotation=bool,
                                  default=False)
            ],
                              return_annotation=str),
            msg=msg)

    def test_invalid_input_token_ids(self):
        r"""Raise `TypeError` when input `token_ids` is invalid."""
        msg1 = 'Must raise `TypeError` when input `token_ids` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            # Values that are not iterable at all.
            False,
            True,
            0,
            1,
            -1,
            0.0,
            1.0,
            math.nan,
            -math.nan,
            math.inf,
            -math.inf,
            0j,
            1j,
            object(),
            lambda x: x,
            type,
            None,
            NotImplemented,
            ...,
            # Lists whose only element is not an `int`.
            [0.0],
            [1.0],
            [math.nan],
            [-math.nan],
            [math.inf],
            [-math.inf],
            [0j],
            [1j],
            [''],
            [b''],
            [()],
            [[]],
            [{}],
            [set()],
            [object()],
            [lambda x: x],
            [type],
            [None],
            [NotImplemented],
            [...],
            # Lists where a valid `int` is followed by a non-`int` element.
            [0, 0.0],
            [0, 1.0],
            [0, math.nan],
            [0, -math.nan],
            [0, math.inf],
            [0, -math.inf],
            [0, 0j],
            [0, 1j],
            [0, ''],
            [0, b''],
            [0, ()],
            [0, []],
            [0, {}],
            [0, set()],
            [0, object()],
            [0, lambda x: x],
            [0, type],
            [0, None],
            [0, NotImplemented],
            [0, ...],
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                # `subTest` pins the failing example in the report and lets
                # the remaining examples still run.
                with self.subTest(
                        invalid_input=invalid_input,
                        is_uncased=tokenizer is self.uncased_tokenizer):
                    with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                        tokenizer.decode(token_ids=invalid_input)

                    self.assertEqual(
                        cxt_man.exception.args[0],
                        '`token_ids` must be an instance of `Iterable[int]`.',
                        msg=msg2)

    def test_invalid_input_remove_special_tokens(self):
        r"""Raise `TypeError` when input `remove_special_tokens` is invalid."""
        msg1 = ('Must raise `TypeError` when input `remove_special_tokens` is '
                'invalid.')
        msg2 = 'Inconsistent error message.'
        # Every example is a value that is not a `bool` instance.
        examples = (
            0,
            1,
            -1,
            0.0,
            1.0,
            math.nan,
            -math.nan,
            math.inf,
            -math.inf,
            0j,
            1j,
            '',
            b'',
            (),
            [],
            {},
            set(),
            object(),
            lambda x: x,
            type,
            None,
            NotImplemented,
            ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.subTest(
                        invalid_input=invalid_input,
                        is_uncased=tokenizer is self.uncased_tokenizer):
                    with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                        tokenizer.decode(token_ids=[],
                                         remove_special_tokens=invalid_input)

                    self.assertEqual(
                        cxt_man.exception.args[0],
                        '`remove_special_tokens` must be an instance of '
                        '`bool`.',
                        msg=msg2)

    def test_return_type(self):
        r"""Return `str`."""
        msg = 'Must return `str`.'
        examples = (
            [0, 1, 2, 3],
            [4, 5, 6, 7, 8, 9],
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            [0],
            [],
        )

        for token_ids in examples:
            for tokenizer in self.tokenizers:
                with self.subTest(
                        token_ids=token_ids,
                        is_uncased=tokenizer is self.uncased_tokenizer):
                    self.assertIsInstance(
                        tokenizer.decode(token_ids=token_ids),
                        str,
                        msg=msg)

    def test_remove_special_tokens(self):
        r"""Remove special tokens.

        Each example is `(remove_special_tokens, token_ids, cased_sequence,
        uncased_sequence)`: the flag and ids passed to `decode`, followed by
        the text expected from the cased and from the uncased tokenizer.
        """
        msg = 'Must remove special tokens.'
        examples = (
            (
                False,
                [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1, 2],
                '[bos]Hello World![eos][pad]',
                '[bos]hello world![eos][pad]',
            ),
            (
                False,
                [0, 14, 5, 9, 15, 5, 9, 5, 3, 1, 2, 2],
                '[bos]I am a [unk][eos][pad][pad]',
                '[bos]i am a [unk][eos][pad][pad]',
            ),
            (
                False,
                [0, 19, 4, 6, 16, 6, 17, 8, 18, 1],
                '[bos][unk]legend.[eos]',
                '[bos][unk]legend.[eos]',
            ),
            (
                True,
                [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1, 2],
                'Hello World!',
                'hello world!',
            ),
            (
                True,
                [0, 14, 5, 9, 15, 5, 9, 5, 3, 1, 2, 2],
                'I am a [unk]',
                'i am a [unk]',
            ),
            (
                True,
                [0, 19, 4, 6, 16, 6, 17, 8, 18, 1],
                '[unk]legend.',
                '[unk]legend.',
            ),
        )

        for (remove_special_tokens, token_ids, cased_sequence,
             uncased_sequence) in examples:
            expectations = (
                (self.cased_tokenizer, cased_sequence),
                (self.uncased_tokenizer, uncased_sequence),
            )
            for tokenizer, sequence in expectations:
                # One sub-test per tokenizer, so a cased failure does not
                # prevent the uncased expectation from being checked.
                with self.subTest(
                        remove_special_tokens=remove_special_tokens,
                        token_ids=token_ids,
                        is_uncased=tokenizer is self.uncased_tokenizer):
                    self.assertEqual(
                        tokenizer.decode(
                            token_ids=token_ids,
                            remove_special_tokens=remove_special_tokens),
                        sequence,
                        msg=msg)
# Example #2
# 0
class TestTokenize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharDictTokenizer.tokenize`.

    Every test runs against a cased and an uncased tokenizer whose
    vocabularies are built from the same two-sentence corpus.  Table-driven
    tests wrap each example in `subTest`, so a failing example is reported
    together with its inputs and does not hide the remaining examples.
    """

    @classmethod
    def setUpClass(cls):
        r"""Create the corpus used to build every tokenizer's vocabulary."""
        cls.vocab_source = [
            'Hello World!',
            'I am a legend.',
        ]

    @classmethod
    def tearDownClass(cls):
        r"""Delete the shared corpus and trigger a garbage collection."""
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        # Expected: `tokenize(self, sequence: str) -> List[str]`.
        self.assertEqual(
            inspect.signature(CharDictTokenizer.tokenize),
            inspect.Signature(parameters=[
                inspect.Parameter(name='self',
                                  kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                                  default=inspect.Parameter.empty),
                inspect.Parameter(name='sequence',
                                  kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                                  annotation=str,
                                  default=inspect.Parameter.empty)
            ],
                              return_annotation=List[str]),
            msg=msg)

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        # Every example is a value that is not a `str` instance.
        examples = (
            False,
            True,
            0,
            1,
            -1,
            0.0,
            1.0,
            math.nan,
            -math.nan,
            math.inf,
            -math.inf,
            b'',
            0j,
            1j,
            (),
            [],
            {},
            set(),
            object(),
            lambda x: x,
            type,
            None,
            NotImplemented,
            ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                # `subTest` pins the failing example in the report and lets
                # the remaining examples still run.
                with self.subTest(
                        invalid_input=invalid_input,
                        is_uncased=tokenizer is self.uncased_tokenizer):
                    with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                        tokenizer.tokenize(invalid_input)

                    self.assertEqual(
                        ctx_man.exception.args[0],
                        '`sequence` must be an instance of `str`.',
                        msg=msg2)

    def test_return_type(self):
        r"""Return `List[str]`."""
        msg = 'Must return `List[str]`.'
        examples = (
            'Hello world!',
            'H',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                with self.subTest(
                        sequence=sequence,
                        is_uncased=tokenizer is self.uncased_tokenizer):
                    tokens = tokenizer.tokenize(sequence)
                    self.assertIsInstance(tokens, list, msg=msg)
                    for token in tokens:
                        self.assertIsInstance(token, str, msg=msg)

    def test_normalize(self):
        r"""Return sequence is normalized.

        Each example is `(sequence, cased_tokens, uncased_tokens)`.  The
        expected tokens contain no leading or trailing whitespace, a single
        space where the input has consecutive spaces, lower-cased characters
        for the uncased tokenizer, and NFKC-normalized characters.
        """
        msg = 'Return sequence must be normalized.'
        # `list(text)` spells out the expected one-character tokens.
        cased_hello = list('HeLlO WoRlD!')
        uncased_hello = list('hello world!')
        examples = (
            # Leading whitespace.
            (
                ' HeLlO WoRlD!',
                cased_hello,
                uncased_hello,
            ),
            # Trailing whitespace.
            (
                'HeLlO WoRlD! ',
                cased_hello,
                uncased_hello,
            ),
            # Leading, trailing and consecutive inner whitespace.
            (
                '  HeLlO  WoRlD!  ',
                cased_hello,
                uncased_hello,
            ),
            (
                '0',
                ['0'],
                ['0'],
            ),
            (
                'é',
                [unicodedata.normalize('NFKC', 'é')],
                [unicodedata.normalize('NFKC', 'é')],
            ),
            (
                '0é',
                [
                    unicodedata.normalize('NFKC', '0'),
                    unicodedata.normalize('NFKC', 'é'),
                ],
                [
                    unicodedata.normalize('NFKC', '0'),
                    unicodedata.normalize('NFKC', 'é'),
                ],
            ),
            (
                '',
                [],
                [],
            ),
        )

        for sequence, cased_tokens, uncased_tokens in examples:
            expectations = (
                (self.cased_tokenizer, cased_tokens),
                (self.uncased_tokenizer, uncased_tokens),
            )
            for tokenizer, tokens in expectations:
                # One sub-test per tokenizer, so a cased failure does not
                # prevent the uncased expectation from being checked.
                with self.subTest(
                        sequence=sequence,
                        is_uncased=tokenizer is self.uncased_tokenizer):
                    self.assertEqual(tokenizer.tokenize(sequence),
                                     tokens,
                                     msg=msg)
class TestEncode(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharDictTokenizer.encode`.

    Every test runs against a cased and an uncased tokenizer whose
    vocabularies are built from the same two-sentence corpus.  Table-driven
    tests wrap each example in `subTest`, so a failing example is reported
    together with its inputs and does not hide the remaining examples.
    """

    @classmethod
    def setUpClass(cls):
        r"""Create the corpus used to build every tokenizer's vocabulary."""
        cls.vocab_source = [
            'Hello World!',
            'I am a legend.',
        ]

    @classmethod
    def tearDownClass(cls):
        r"""Delete the shared corpus and trigger a garbage collection."""
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        # Expected:
        # `encode(self, sequence: str, max_seq_len: int = -1) -> List[int]`.
        self.assertEqual(
            inspect.signature(CharDictTokenizer.encode),
            inspect.Signature(parameters=[
                inspect.Parameter(
                    name='self',
                    kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                ),
                inspect.Parameter(name='sequence',
                                  kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                                  annotation=str,
                                  default=inspect.Parameter.empty),
                inspect.Parameter(name='max_seq_len',
                                  kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                                  annotation=int,
                                  default=-1)
            ],
                              return_annotation=List[int]),
            msg=msg)

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        # Every example is a value that is not a `str` instance.
        examples = (
            False,
            True,
            0,
            1,
            -1,
            0.0,
            1.0,
            math.nan,
            -math.nan,
            math.inf,
            -math.inf,
            0j,
            1j,
            b'',
            (),
            [],
            {},
            set(),
            object(),
            lambda x: x,
            type,
            None,
            NotImplemented,
            ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                # `subTest` pins the failing example in the report and lets
                # the remaining examples still run.
                with self.subTest(
                        invalid_input=invalid_input,
                        is_uncased=tokenizer is self.uncased_tokenizer):
                    with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                        tokenizer.encode(sequence=invalid_input)

                    self.assertEqual(
                        cxt_man.exception.args[0],
                        '`sequence` must be an instance of `str`.',
                        msg=msg2)

    def test_invalid_input_max_seq_len(self):
        r"""Raise exception when input `max_seq_len` is invalid.

        Either `TypeError` or `ValueError` is accepted for every example;
        the message is then checked against the one that belongs to the
        exception type actually raised.
        """
        msg1 = (
            'Must raise `TypeError` or `ValueError` when input `max_seq_len` '
            'is invalid.')
        msg2 = 'Inconsistent error message.'
        examples = (
            False,
            True,
            0,
            1,
            -2,
            0.0,
            1.0,
            math.nan,
            -math.nan,
            math.inf,
            -math.inf,
            0j,
            1j,
            '',
            b'',
            (),
            [],
            {},
            set(),
            object(),
            lambda x: x,
            type,
            None,
            NotImplemented,
            ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.subTest(
                        invalid_input=invalid_input,
                        is_uncased=tokenizer is self.uncased_tokenizer):
                    with self.assertRaises((TypeError, ValueError),
                                           msg=msg1) as cxt_man:
                        tokenizer.encode(sequence='',
                                         max_seq_len=invalid_input)

                    if isinstance(cxt_man.exception, TypeError):
                        self.assertEqual(
                            cxt_man.exception.args[0],
                            '`max_seq_len` must be an instance of `int`.',
                            msg=msg2)
                    else:
                        self.assertEqual(
                            cxt_man.exception.args[0],
                            '`max_seq_len` must be greater than `1` or equal '
                            'to `-1`.',
                            msg=msg2)

    def test_return_type(self):
        r"""Return `List[int]`."""
        msg = 'Must return `List[int]`.'
        examples = (
            'Hello world!',
            'I am a legend.',
            'y = f(x)',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                with self.subTest(
                        sequence=sequence,
                        is_uncased=tokenizer is self.uncased_tokenizer):
                    token_ids = tokenizer.encode(sequence=sequence)
                    self.assertIsInstance(token_ids, list, msg=msg)
                    for token_id in token_ids:
                        self.assertIsInstance(token_id, int, msg=msg)

    def test_encode_format(self):
        r"""Follow encode format.

        Each example is `(sequence, token_ids)`; the same ids are expected
        from both the cased and the uncased tokenizer.
        """
        msg = 'Must follow encode format: [bos] t1 t2 ... tn [eos].'
        examples = (
            (
                'Hello World!',
                [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1],
            ),
            (
                'I am a legend.',
                [0, 14, 5, 9, 15, 5, 9, 5, 4, 6, 16, 6, 17, 8, 18, 1],
            ),
            (
                'y = f(x)',
                [0, 3, 5, 3, 5, 3, 3, 3, 3, 1],
            ),
            (
                '',
                [0, 1],
            ),
        )

        for sequence, token_ids in examples:
            for tokenizer in self.tokenizers:
                with self.subTest(
                        sequence=sequence,
                        is_uncased=tokenizer is self.uncased_tokenizer):
                    self.assertEqual(tokenizer.encode(sequence=sequence),
                                     token_ids,
                                     msg=msg)

    def test_truncate(self):
        r"""Token ids' length must not exceed `max_seq_len`.

        Each example is `(sequence, token_ids, max_seq_len)`.
        """
        msg = 'Token ids\' length must not exceed `max_seq_len`.'
        examples = (
            (
                'Hello World!',
                [0, 10, 6, 4, 4, 7, 5, 11, 7, 1],
                10,
            ),
            (
                'I am a legend.',
                [0, 14, 5, 9, 1],
                5,
            ),
            (
                'y = f(x)',
                [0, 3, 1],
                3,
            ),
            (
                '',
                [0, 1],
                2,
            ),
        )

        for sequence, token_ids, max_seq_len in examples:
            for tokenizer in self.tokenizers:
                with self.subTest(
                        sequence=sequence,
                        max_seq_len=max_seq_len,
                        is_uncased=tokenizer is self.uncased_tokenizer):
                    self.assertEqual(
                        tokenizer.encode(sequence=sequence,
                                         max_seq_len=max_seq_len),
                        token_ids,
                        msg=msg)

    def test_padding(self):
        r"""Token ids' length must pad to `max_seq_len`.

        Each example is `(sequence, token_ids, max_seq_len)`; every expected
        list has exactly `max_seq_len` entries and ends with id `2`.
        """
        msg = 'Token ids\' length must pad to `max_seq_len`.'
        examples = (
            (
                'Hello World!',
                [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1, 2],
                15,
            ),
            (
                'I am a legend.',
                [
                    0, 14, 5, 9, 15, 5, 9, 5, 4, 6,
                    16, 6, 17, 8, 18, 1, 2, 2, 2, 2,
                ],
                20,
            ),
            (
                'y = f(x)',
                [
                    0, 3, 5, 3, 5, 3, 3, 3, 3, 1,
                    2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                ],
                20,
            ),
            (
                '',
                [0, 1, 2, 2, 2, 2, 2, 2, 2, 2],
                10,
            ),
        )

        for sequence, token_ids, max_seq_len in examples:
            for tokenizer in self.tokenizers:
                with self.subTest(
                        sequence=sequence,
                        max_seq_len=max_seq_len,
                        is_uncased=tokenizer is self.uncased_tokenizer):
                    self.assertEqual(
                        tokenizer.encode(sequence=sequence,
                                         max_seq_len=max_seq_len),
                        token_ids,
                        msg=msg)
class TestVocabSize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharDictTokenizer.vocab_size`.

    Every test starts from a cased and an uncased tokenizer on which
    `build_vocab` has not been called.  Table-driven tests wrap each example
    in `subTest`, so a failing example is reported together with its inputs
    and does not hide the remaining examples.
    """

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent property signature.'

        # The class attribute must be a data descriptor, and neither a plain
        # function nor a method.
        self.assertTrue(
            inspect.isdatadescriptor(CharDictTokenizer.vocab_size),
            msg=msg
        )
        self.assertFalse(
            inspect.isfunction(CharDictTokenizer.vocab_size),
            msg=msg
        )
        self.assertFalse(
            inspect.ismethod(CharDictTokenizer.vocab_size),
            msg=msg
        )

    def test_return_type(self):
        r"""Return `int`."""
        msg = 'Must return `int`.'

        for tokenizer in self.tokenizers:
            with self.subTest(
                    is_uncased=tokenizer is self.uncased_tokenizer):
                self.assertIsInstance(tokenizer.vocab_size, int, msg=msg)

    def test_return_value(self):
        r"""Return vocabulary size.

        A tokenizer on which `build_vocab` has not been called must report a
        vocabulary size of `4`.
        """
        msg = 'Inconsistent vocabulary size.'

        for tokenizer in self.tokenizers:
            with self.subTest(
                    is_uncased=tokenizer is self.uncased_tokenizer):
                self.assertEqual(tokenizer.vocab_size, 4, msg=msg)

    def test_increase_vocab_size(self):
        r"""Increase vocabulary size after `build_vocab`.

        Each example is `(batch_sequences, cased_vocab_size,
        uncased_vocab_size)`.  The same tokenizer instances are reused across
        examples, so the expected sizes are cumulative; they exclude the
        special tokens, whose count is added before comparing.
        """
        msg = 'Must increase vocabulary size after `build_vocab`.'
        examples = (
            (('HeLlO WoRlD!', 'I aM a LeGeNd.'), 18, 15),
            (('y = f(x)',), 24, 21),
            (('',), 24, 21),
        )

        sp_tokens_size = len(list(CharDictTokenizer.special_tokens()))

        for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
            expectations = (
                (self.cased_tokenizer, cased_vocab_size),
                (self.uncased_tokenizer, uncased_vocab_size),
            )
            for tokenizer, vocab_size in expectations:
                # One sub-test per tokenizer, so a cased failure does not
                # prevent the uncased expectation from being checked.
                with self.subTest(
                        batch_sequences=batch_sequences,
                        is_uncased=tokenizer is self.uncased_tokenizer):
                    tokenizer.build_vocab(batch_sequences)
                    self.assertEqual(
                        tokenizer.vocab_size,
                        vocab_size + sp_tokens_size,
                        msg=msg
                    )

    def test_reset_vocab_size(self):
        r"""Reset vocabulary size after `reset_vocab`.

        After `build_vocab` followed by `reset_vocab`, the vocabulary size
        must equal the number of special tokens.
        """
        msg = 'Must reset vocabulary size after `reset_vocab`.'
        examples = (
            ('HeLlO WoRlD!', 'I aM a LeGeNd.'),
            ('y = f(x)',),
            ('',),
        )

        sp_tokens_size = len(list(CharDictTokenizer.special_tokens()))

        for batch_sequences in examples:
            for tokenizer in self.tokenizers:
                with self.subTest(
                        batch_sequences=batch_sequences,
                        is_uncased=tokenizer is self.uncased_tokenizer):
                    tokenizer.build_vocab(batch_sequences)
                    tokenizer.reset_vocab()
                    self.assertEqual(
                        tokenizer.vocab_size,
                        sp_tokens_size,
                        msg=msg
                    )