Example #1
0
 def setUp(self):
     r"""Build cased and uncased tokenizers from the shared vocab source.

     Each tokenizer builds its vocabulary from the class-level
     `vocab_source`; both are also collected in `self.tokenizers`.
     """
     vocab_source = self.__class__.vocab_source

     cased = CharListTokenizer()
     cased.build_vocab(vocab_source)

     uncased = CharListTokenizer(is_uncased=True)
     uncased.build_vocab(vocab_source)

     self.cased_tokenizer = cased
     self.uncased_tokenizer = uncased
     self.tokenizers = [cased, uncased]
Example #2
0
    def test_yield_value(self):
        r"""Check `special_tokens` returns an iterator of expected `str`.

        The returned object must be an `Iterator`, and its leading items
        must be `str` equal to the expected special tokens, in order.
        """
        msg = 'Must return iterator which yield `str`.'
        expected_tokens = ('[bos]', '[eos]', '[pad]', '[unk]')

        returned = CharListTokenizer.special_tokens()
        self.assertIsInstance(returned, Iterator, msg=msg)

        # Consume a fresh iterator so the type check above cannot affect it.
        yielded = list(CharListTokenizer.special_tokens())

        for position, expected in enumerate(expected_tokens):
            actual = yielded[position]
            self.assertIsInstance(actual, str, msg=msg)
            self.assertEqual(actual, expected, msg=msg)
Example #3
0
    def test_experiment_does_not_exist(self):
        r"""Expect `FileNotFoundError` when no tokenizer file can be loaded.

        Each example is passed to `CharListTokenizer.load`; the error
        message must name the `tokenizer.json` path under `DATA_PATH`.
        """
        msg1 = (
            'Must raise `FileNotFoundError` when `experiment` does not '
            'exist.'
        )
        msg2 = 'Inconsistent error message.'
        examples = (self.__class__.experiment, 'I-AM-A-TEST-AND-I-DONT-EXIST')

        for experiment in examples:
            with self.assertRaises(FileNotFoundError, msg=msg1) as caught:
                CharListTokenizer.load(experiment=experiment)

            expected_path = os.path.join(
                DATA_PATH,
                experiment,
                'tokenizer.json',
            )
            expected_text = f'File {expected_path} does not exist.'
            self.assertEqual(
                caught.exception.args[0],
                expected_text,
                msg=msg2,
            )
Example #4
0
    def test_cased_sensitive(self):
        r"""Compare vocabulary sizes of cased and uncased tokenizers.

        For each batch both tokenizers reset and rebuild their vocabulary;
        the resulting size must equal the expected count plus the number of
        special tokens.
        """
        msg = 'Vocabulary must be case sensitive.'
        examples = (
            (('ABCD', 'abcd'), 8, 4),
            (('efghi', 'EFGHI'), 10, 5),
        )

        sp_tokens_size = len(list(CharListTokenizer.special_tokens()))

        for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
            # Cased tokenizer first, then uncased, for every batch.
            expectations = (
                (self.cased_tokenizer, cased_vocab_size),
                (self.uncased_tokenizer, uncased_vocab_size),
            )
            for tokenizer, expected_size in expectations:
                tokenizer.reset_vocab()
                tokenizer.build_vocab(batch_sequences=batch_sequences)
                self.assertEqual(
                    tokenizer.vocab_size,
                    expected_size + sp_tokens_size,
                    msg=msg,
                )
Example #5
0
    def test_invalid_input_experiment(self):
        r"""Reject invalid `experiment` arguments passed to `load`.

        Every example must make `CharListTokenizer.load` raise `TypeError`
        or `ValueError`, with the message expected for the raised type.
        """
        msg1 = (
            'Must raise `TypeError` or `ValueError` when input `experiment` '
            'is invalid.')
        msg2 = 'Inconsistent error message.'
        type_error_text = '`experiment` must be an instance of `str`.'
        value_error_text = '`experiment` must not be empty.'
        examples = (
            False, True,
            0, 1, -1,
            0.0, 1.0,
            math.nan, -math.nan,
            math.inf, -math.inf,
            0j, 1j,
            '', b'',
            (), [], {}, set(),
            object(),
            lambda x: x,
            type,
            None,
            NotImplemented,
            ...,
        )

        for bad_experiment in examples:
            with self.assertRaises(
                    (TypeError, ValueError),
                    msg=msg1,
            ) as caught:
                CharListTokenizer.load(experiment=bad_experiment)

            raised = caught.exception
            # The expected message depends on which exception was raised.
            if isinstance(raised, TypeError):
                expected_text = type_error_text
            else:
                expected_text = value_error_text

            self.assertEqual(raised.args[0], expected_text, msg=msg2)
Example #6
0
    def test_load_result(self):
        r"""Load a hand-written `tokenizer.json` and compare attributes.

        Each example is dumped to `tokenizer.json` under the class-level
        `test_dir` and loaded through the class-level `experiment`.  Every
        saved key must exist on the loaded tokenizer with the same type and
        value.  The file is removed after each example.
        """
        msg = 'Inconsistent `tokenizer.json` format.'
        cased_obj = {
            'is_uncased': False,
            'token_to_id': {'A': 0, 'B': 1, 'C': 2},
        }
        uncased_obj = {
            'is_uncased': True,
            'token_to_id': {'a': 0, 'b': 1, 'c': 2},
        }
        examples = (cased_obj, uncased_obj)

        test_path = os.path.join(self.__class__.test_dir, 'tokenizer.json')

        for saved_obj in examples:
            try:
                # Write the fixture that `load` is expected to read.
                with open(test_path, 'w', encoding='utf-8') as output_file:
                    json.dump(saved_obj, output_file)

                tokenizer = CharListTokenizer.load(
                    experiment=self.__class__.experiment)

                self.assertIsInstance(tokenizer, CharListTokenizer, msg=msg)

                for attr_key in saved_obj:
                    saved_value = saved_obj[attr_key]
                    self.assertTrue(hasattr(tokenizer, attr_key), msg=msg)
                    loaded_value = getattr(tokenizer, attr_key)
                    self.assertIsInstance(
                        loaded_value,
                        type(saved_value),
                        msg=msg,
                    )
                    self.assertEqual(loaded_value, saved_value, msg=msg)
            finally:
                # Remove the fixture whether or not the assertions passed.
                os.remove(test_path)
Example #7
0
    def test_reset_vocab_size(self):
        r"""Check `reset_vocab` shrinks the vocabulary to special tokens.

        Each tokenizer builds a vocabulary from every batch and then resets
        it; the size afterwards must equal the number of special tokens.
        """
        msg = 'Must reset vocabulary size after `reset_vocab`.'
        examples = (
            ('HeLlO WoRlD!', 'I aM a LeGeNd.'),
            ('y = f(x)',),
            ('',),
        )

        sp_tokens_size = len(list(CharListTokenizer.special_tokens()))

        # Batches vary slowest, tokenizers fastest.
        combinations = (
            (batch_sequences, tokenizer)
            for batch_sequences in examples
            for tokenizer in self.tokenizers
        )
        for batch_sequences, tokenizer in combinations:
            tokenizer.build_vocab(batch_sequences)
            tokenizer.reset_vocab()
            self.assertEqual(tokenizer.vocab_size, sp_tokens_size, msg=msg)
    def test_invalid_input_is_uncased(self):
        r"""Reject non-`bool` `is_uncased` arguments in the constructor.

        Every example must make `CharListTokenizer(is_uncased=...)` raise
        `TypeError` with the expected message.
        """
        msg1 = 'Must raise `TypeError` when input `is_uncased` is invalid.'
        msg2 = 'Inconsistent error message.'
        expected_text = '`is_uncased` must be an instance of `bool`.'
        examples = (
            0, 1, -1,
            0.0, 1.0,
            math.nan, -math.nan,
            math.inf, -math.inf,
            0j, 1j,
            '', b'',
            (), [], {}, set(),
            object(),
            lambda x: x,
            type,
            None,
            NotImplemented,
            ...,
        )

        for bad_flag in examples:
            with self.assertRaises(TypeError, msg=msg1) as caught:
                CharListTokenizer(is_uncased=bad_flag)

            raised_text = caught.exception.args[0]
            self.assertEqual(raised_text, expected_text, msg=msg2)
Example #9
0
    def test_increase_vocab_size(self):
        r"""Check vocabulary size after successive `build_vocab` calls.

        The vocabulary is not reset between batches, so each expected size
        is a running total; the special token count is added on top.
        """
        msg = 'Must increase vocabulary size after `build_vocab`.'
        examples = (
            (('HeLlO WoRlD!', 'I aM a LeGeNd.'), 18, 15),
            (('y = f(x)',), 24, 21),
            (('',), 24, 21),
        )

        sp_tokens_size = len(list(CharListTokenizer.special_tokens()))

        for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
            # Cased tokenizer first, then uncased, for every batch.
            expectations = (
                (self.cased_tokenizer, cased_vocab_size),
                (self.uncased_tokenizer, uncased_vocab_size),
            )
            for tokenizer, expected_size in expectations:
                tokenizer.build_vocab(batch_sequences)
                self.assertEqual(
                    tokenizer.vocab_size,
                    expected_size + sp_tokens_size,
                    msg=msg,
                )
Example #10
0
 def setUp(self):
     r"""Create one cased and one uncased tokenizer for each test.

     Both instances are also collected in `self.tokenizers`.
     """
     cased = CharListTokenizer()
     uncased = CharListTokenizer(is_uncased=True)

     self.cased_tokenizer = cased
     self.uncased_tokenizer = uncased
     self.tokenizers = [cased, uncased]
Example #11
0
class TestTokenize(unittest.TestCase):
    r"""Test Case for `lmp.tokenizer.CharListTokenizer.tokenize`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharListTokenizer()
        self.uncased_tokenizer = CharListTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency.

        `tokenize` must accept `(self, sequence: str)` and be annotated as
        returning `List[str]`.
        """
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(CharListTokenizer.tokenize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty,
                    ),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty,
                    ),
                ],
                return_annotation=List[str],
            ),
            msg=msg,
        )

    def test_invalid_input(self):
        r"""Raise `TypeError` when input is invalid.

        Every non-`str` example must be rejected by both tokenizers with
        the expected error message.
        """
        msg1 = 'Must raise `TypeError` when input is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            0,
            1,
            -1,
            0.0,
            1.0,
            math.nan,
            math.inf,
            True,
            False,
            b'',
            [],
            (),
            {},
            set(),
            object(),
            lambda x: x,
            type,
            None,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.tokenize(invalid_input)

                self.assertEqual(ctx_man.exception.args[0],
                                 '`sequence` must be instance of `str`.',
                                 msg=msg2)

    def test_return_type(self):
        r"""Return `List[str]`."""
        msg = 'Must return `List[str]`.'
        examples = (
            'Hello world!',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                tokens = tokenizer.tokenize(sequence)
                self.assertIsInstance(tokens, list, msg=msg)
                for token in tokens:
                    self.assertIsInstance(token, str, msg=msg)

    def test_unicode_normalize(self):
        r"""Return NFKC normalized characters.

        Inputs are deliberately NOT in NFKC form, so the assertions only
        pass when normalization actually happens.  They are written as
        escapes so the un-normalized code points cannot be silently
        normalized by an editor or copy/paste.
        """
        msg = 'Must return NFKC normalized characters.'
        examples = (
            # FULLWIDTH DIGIT ZERO -> ASCII '0' (compatibility mapping).
            ('\uff10', ['0']),
            # 'e' + COMBINING ACUTE ACCENT -> precomposed U+00E9.
            ('e\u0301', ['\u00e9']),
            ('\uff10e\u0301', ['0', '\u00e9']),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                out_tokens = tokenizer.tokenize(sequence)
                self.assertEqual(out_tokens, ans_tokens, msg=msg)
                for out_token in out_tokens:
                    # Each token must be exactly one code point.
                    self.assertEqual(len(out_token), 1, msg=msg)

    def test_cased_sensitive(self):
        r"""Return cased sensitive characters when `is_uncased=False`."""
        msg = ('Return result must be case-sensitive when construct with '
               '`is_uncased=False`.')
        # `list(str)` spells out the expected per-character tokens.
        examples = (
            ('HeLlO WoRlD!', list('HeLlO WoRlD!')),
            ('HELLO WORLD!', list('HELLO WORLD!')),
            ('hello world!', list('hello world!')),
            ('H', ['H']),
            ('h', ['h']),
        )

        for sequence, ans_tokens in examples:
            self.assertEqual(self.cased_tokenizer.tokenize(sequence),
                             ans_tokens,
                             msg=msg)

    def test_cased_insensitive(self):
        r"""Return cased insensitive characters when `is_uncased=True`."""
        msg = ('Return result must be case-insensitive when construct with '
               '`is_uncased=True`.')
        lower_tokens = list('hello world!')
        examples = (
            ('HeLlO WoRlD!', lower_tokens),
            ('HELLO WORLD!', lower_tokens),
            ('hello world!', lower_tokens),
            ('H', ['h']),
            ('h', ['h']),
        )

        for sequence, ans_tokens in examples:
            self.assertEqual(self.uncased_tokenizer.tokenize(sequence),
                             ans_tokens,
                             msg=msg)

    def test_whitespace_strip(self):
        r"""Strip input sequence.

        Leading and trailing spaces or newlines must not produce tokens; a
        whitespace-only or empty input must produce no tokens at all.
        """
        msg = (
            'Input sequence must strip both leading and trailing whitespace '
            'characters.')
        stripped_tokens = list('hello world!')
        examples = (
            ('  hello world!  ', stripped_tokens),
            ('  hello world!', stripped_tokens),
            ('hello world!  ', stripped_tokens),
            ('\nhello world!\n', stripped_tokens),
            (' ', []),
            ('', []),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(tokenizer.tokenize(sequence),
                                 ans_tokens,
                                 msg=msg)

    def test_whitespace_collapse(self):
        r"""Collapse whitespace characters.

        Runs of spaces or newlines inside the sequence must be reduced to
        a single `' '` token.
        """
        msg = ('Input sequence must convert consecutive whitespace characters '
               'into single whitespace character.')
        collapsed_tokens = list('hello world !')
        examples = (
            ('hello  world  !', collapsed_tokens),
            ('hello   world  !', collapsed_tokens),
            ('hello  world   !', collapsed_tokens),
            ('hello  world\n\n!', collapsed_tokens),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(tokenizer.tokenize(sequence),
                                 ans_tokens,
                                 msg=msg)
Example #12
0
class TestNormalize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharListTokenizer.normalize`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharListTokenizer()
        self.uncased_tokenizer = CharListTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency.

        `normalize` must accept `(self, sequence: str)` and be annotated
        as returning `str`.
        """
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(CharListTokenizer.normalize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty,
                    ),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty,
                    ),
                ],
                return_annotation=str,
            ),
            msg=msg,
        )

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid.

        Every non-`str` example must be rejected by both tokenizers with
        the expected error message.
        """
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False,
            True,
            0,
            1,
            -1,
            0.0,
            1.0,
            math.nan,
            -math.nan,
            math.inf,
            -math.inf,
            0j,
            1j,
            b'',
            (),
            [],
            {},
            set(),
            object(),
            lambda x: x,
            type,
            None,
            NotImplemented,
            ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.normalize(sequence=invalid_input)

                self.assertEqual(ctx_man.exception.args[0],
                                 '`sequence` must be an instance of `str`.',
                                 msg=msg2)

    def test_return_type(self):
        r"""Return `str`."""
        msg = 'Must return `str`.'
        examples = (
            'Hello world!',
            'I am a legend.',
            'y = f(x)',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                self.assertIsInstance(tokenizer.normalize(sequence=sequence),
                                      str,
                                      msg=msg)

    def test_unicode_normalize(self):
        r"""Return NFKC normalized characters.

        Inputs are deliberately NOT in NFKC form, so the assertions only
        pass when normalization actually happens.  They are written as
        escapes so the un-normalized code points cannot be silently
        normalized by an editor or copy/paste.
        """
        msg = 'Must return NFKC normalized characters.'
        examples = (
            # FULLWIDTH DIGIT ZERO -> ASCII '0' (compatibility mapping).
            ('\uff10', '0', 1),
            # 'e' + COMBINING ACUTE ACCENT -> precomposed U+00E9.
            ('e\u0301', '\u00e9', 1),
            ('\uff10e\u0301', '0\u00e9', 2),
        )

        for sequence, normalized_sequence, sequence_len in examples:
            for tokenizer in self.tokenizers:
                out_sequence = tokenizer.normalize(sequence=sequence)
                self.assertEqual(out_sequence, normalized_sequence, msg=msg)
                self.assertEqual(len(out_sequence), sequence_len, msg=msg)

    def test_cased_sensitive(self):
        r"""Return cased sensitive sequence.

        The cased tokenizer must keep the original casing, while the
        uncased tokenizer must return the lowercase form.
        """
        msg = 'Return sequence must be cased sensitive.'
        examples = (
            ('HeLlO WoRlD!', 'HeLlO WoRlD!', 'hello world!'),
            ('HELLO WORLD!', 'HELLO WORLD!', 'hello world!'),
            ('hello world!', 'hello world!', 'hello world!'),
            ('H', 'H', 'h'),
            ('h', 'h', 'h'),
        )

        for sequence, cased_sequence, uncased_sequence in examples:
            self.assertEqual(self.cased_tokenizer.normalize(sequence),
                             cased_sequence,
                             msg=msg)
            self.assertEqual(self.uncased_tokenizer.normalize(sequence),
                             uncased_sequence,
                             msg=msg)

    def test_whitespace_strip(self):
        r"""Strip input sequence.

        Leading and trailing spaces or newlines must be removed; a
        whitespace-only input must normalize to the empty string.
        """
        msg = 'Must strip both leading and trailing whitespace characters.'
        examples = (
            (' hello world!', 'hello world!'),
            ('hello world! ', 'hello world!'),
            (' hello world! ', 'hello world!'),
            ('  hello world!   ', 'hello world!'),
            ('\nhello world!\n', 'hello world!'),
            (' ', ''),
            ('', ''),
        )

        for sequence, stripped_sequence in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(tokenizer.normalize(sequence),
                                 stripped_sequence,
                                 msg=msg)

    def test_whitespace_collapse(self):
        r"""Collapse whitespace characters.

        Runs of spaces or newlines inside the sequence must be reduced to
        a single space.
        """
        msg = ('Must convert consecutive whitespace characters into single '
               'whitespace character.')
        examples = (
            ('hello  world  !', 'hello world !'),
            ('hello   world  !', 'hello world !'),
            ('hello  world   !', 'hello world !'),
            ('hello  world\n\n!', 'hello world !'),
        )

        for sequence, collapsed_sequence in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(tokenizer.normalize(sequence),
                                 collapsed_sequence,
                                 msg=msg)
Example #13
0
class TestEncode(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharListTokenizer.encode`."""

    @classmethod
    def setUpClass(cls):
        r"""Define the sequences every test builds its vocabulary from."""
        cls.vocab_source = ['Hello World!', 'I am a legend.']

    @classmethod
    def tearDownClass(cls):
        r"""Drop the shared vocabulary source and collect garbage."""
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Create cased and uncased tokenizers with built vocabularies."""
        vocab_source = self.__class__.vocab_source

        cased = CharListTokenizer()
        cased.build_vocab(vocab_source)
        uncased = CharListTokenizer(is_uncased=True)
        uncased.build_vocab(vocab_source)

        self.cased_tokenizer = cased
        self.uncased_tokenizer = uncased
        self.tokenizers = [cased, uncased]

    def tearDown(self):
        r"""Remove the per-test tokenizer attributes and collect garbage."""
        for attr_name in (
                'tokenizers',
                'cased_tokenizer',
                'uncased_tokenizer',
        ):
            delattr(self, attr_name)
        gc.collect()

    def test_signature(self):
        r"""Compare the signature of `encode` with the expected one.

        Expected: `(self, sequence: str, max_seq_len: int = -1)` annotated
        as returning `List[int]`.
        """
        msg = 'Inconsistent method signature.'
        param_kind = inspect.Parameter.POSITIONAL_OR_KEYWORD

        expected_params = [
            inspect.Parameter(name='self', kind=param_kind),
            inspect.Parameter(
                name='sequence',
                kind=param_kind,
                annotation=str,
                default=inspect.Parameter.empty,
            ),
            inspect.Parameter(
                name='max_seq_len',
                kind=param_kind,
                annotation=int,
                default=-1,
            ),
        ]
        expected_signature = inspect.Signature(
            parameters=expected_params,
            return_annotation=List[int],
        )

        self.assertEqual(
            inspect.signature(CharListTokenizer.encode),
            expected_signature,
            msg=msg,
        )

    def test_invalid_input_sequence(self):
        r"""Reject non-`str` `sequence` arguments with `TypeError`.

        Both tokenizers must raise with the expected error message.
        """
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        expected_text = '`sequence` must be an instance of `str`.'
        examples = (
            False, True,
            0, 1, -1,
            0.0, 1.0,
            math.nan, -math.nan,
            math.inf, -math.inf,
            0j, 1j,
            b'',
            (), [], {}, set(),
            object(),
            lambda x: x,
            type,
            None,
            NotImplemented,
            ...,
        )

        for bad_sequence in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as caught:
                    tokenizer.encode(sequence=bad_sequence)

                self.assertEqual(
                    caught.exception.args[0],
                    expected_text,
                    msg=msg2,
                )

    def test_invalid_input_max_seq_len(self):
        r"""Reject invalid `max_seq_len` arguments.

        Both tokenizers must raise `TypeError` or `ValueError`, with the
        message expected for the raised type.
        """
        msg1 = (
            'Must raise `TypeError` or `ValueError` when input `max_seq_len` '
            'is invalid.')
        msg2 = 'Inconsistent error message.'
        type_error_text = '`max_seq_len` must be an instance of `int`.'
        value_error_text = (
            '`max_seq_len` must be greater than `1` or equal to '
            '`-1`.'
        )
        examples = (
            False, True,
            0, 1, -2,
            0.0, 1.0,
            math.nan, -math.nan,
            math.inf, -math.inf,
            0j, 1j,
            '', b'',
            (), [], {}, set(),
            object(),
            lambda x: x,
            type,
            None,
            NotImplemented,
            ...,
        )

        for bad_max_seq_len in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(
                        (TypeError, ValueError),
                        msg=msg1,
                ) as caught:
                    tokenizer.encode(sequence='', max_seq_len=bad_max_seq_len)

                raised = caught.exception
                # The expected message depends on the raised type.
                if isinstance(raised, TypeError):
                    expected_text = type_error_text
                else:
                    expected_text = value_error_text

                self.assertEqual(raised.args[0], expected_text, msg=msg2)

    def test_return_type(self):
        r"""Check `encode` returns a `list` whose items are all `int`."""
        msg = 'Must return `List[int]`.'
        examples = (
            'Hello world!',
            'I am a legend.',
            'y = f(x)',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                encoded = tokenizer.encode(sequence=sequence)
                self.assertIsInstance(encoded, list, msg=msg)
                for item in encoded:
                    self.assertIsInstance(item, int, msg=msg)

    def test_encode_format(self):
        r"""Check the full, untruncated encoding of each sequence.

        Both tokenizers are expected to produce the same ids.
        """
        msg = 'Must follow encode format: [bos] t1 t2 ... tn [eos].'
        hello_ids = [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1]
        legend_ids = [0, 14, 5, 9, 15, 5, 9, 5, 4, 6, 16, 6, 17, 8, 18, 1]
        formula_ids = [0, 3, 5, 3, 5, 3, 3, 3, 3, 1]
        empty_ids = [0, 1]
        examples = (
            ('Hello World!', hello_ids),
            ('I am a legend.', legend_ids),
            ('y = f(x)', formula_ids),
            ('', empty_ids),
        )

        for sequence, expected_ids in examples:
            for tokenizer in self.tokenizers:
                encoded = tokenizer.encode(sequence=sequence)
                self.assertEqual(encoded, expected_ids, msg=msg)

    def test_truncate(self):
        r"""Check sequences longer than `max_seq_len` are truncated.

        Each expected list has exactly `max_seq_len` ids and ends in `1`.
        """
        msg = 'Token ids\' length must not exceed `max_seq_len`.'
        examples = (
            ('Hello World!', [0, 10, 6, 4, 4, 7, 5, 11, 7, 1], 10),
            ('I am a legend.', [0, 14, 5, 9, 1], 5),
            ('y = f(x)', [0, 3, 1], 3),
            ('', [0, 1], 2),
        )

        for sequence, expected_ids, max_seq_len in examples:
            for tokenizer in self.tokenizers:
                encoded = tokenizer.encode(
                    sequence=sequence,
                    max_seq_len=max_seq_len,
                )
                self.assertEqual(encoded, expected_ids, msg=msg)

    def test_padding(self):
        r"""Check sequences shorter than `max_seq_len` are padded.

        Each expected list is the full encoding followed by as many `2`
        ids as needed to reach `max_seq_len`.
        """
        msg = 'Token ids\' length must pad to `max_seq_len`.'
        pad_id = 2
        hello_ids = [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1]
        legend_ids = [0, 14, 5, 9, 15, 5, 9, 5, 4, 6, 16, 6, 17, 8, 18, 1]
        formula_ids = [0, 3, 5, 3, 5, 3, 3, 3, 3, 1]
        empty_ids = [0, 1]
        examples = (
            ('Hello World!', hello_ids + [pad_id] * 1, 15),
            ('I am a legend.', legend_ids + [pad_id] * 4, 20),
            ('y = f(x)', formula_ids + [pad_id] * 10, 20),
            ('', empty_ids + [pad_id] * 8, 10),
        )

        for sequence, expected_ids, max_seq_len in examples:
            for tokenizer in self.tokenizers:
                encoded = tokenizer.encode(
                    sequence=sequence,
                    max_seq_len=max_seq_len,
                )
                self.assertEqual(encoded, expected_ids, msg=msg)
Example #14
0
class TestVocabSize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharListTokenizer.vocab_size`."""

    def setUp(self):
        r"""Create a fresh cased and uncased tokenizer before each test."""
        cased = CharListTokenizer()
        uncased = CharListTokenizer(is_uncased=True)
        self.cased_tokenizer = cased
        self.uncased_tokenizer = uncased
        self.tokenizers = [cased, uncased]

    def tearDown(self):
        r"""Drop the tokenizer instances and force a garbage collection."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""`vocab_size` is a data descriptor, not a function or method."""
        msg = 'Inconsistent property signature.'

        is_descriptor = inspect.isdatadescriptor(CharListTokenizer.vocab_size)
        self.assertTrue(is_descriptor, msg=msg)

        for predicate in (inspect.isfunction, inspect.ismethod):
            self.assertFalse(
                predicate(CharListTokenizer.vocab_size),
                msg=msg
            )

    def test_return_type(self):
        r"""`vocab_size` evaluates to an `int`."""
        msg = 'Must return `int`.'

        for tokenizer in self.tokenizers:
            size = tokenizer.vocab_size
            self.assertIsInstance(size, int, msg=msg)

    def test_return_value(self):
        r"""`vocab_size` of a tokenizer without built vocabulary is 4."""
        msg = 'Inconsistent vocabulary size.'
        # `setUp` never calls `build_vocab`, so this is the initial size.
        expected_size = 4

        for tokenizer in self.tokenizers:
            self.assertEqual(tokenizer.vocab_size, expected_size, msg=msg)

    def test_increase_vocab_size(self):
        r"""`vocab_size` grows after each call to `build_vocab`."""
        msg = 'Must increase vocabulary size after `build_vocab`.'
        # Each case is `(batch, cased size, uncased size)` excluding special
        # tokens. The vocabulary is not reset between cases, so the expected
        # sizes are cumulative.
        examples = (
            (('HeLlO WoRlD!', 'I aM a LeGeNd.'), 18, 15),
            (('y = f(x)',), 24, 21),
            (('',), 24, 21),
        )

        sp_tokens_size = len(list(CharListTokenizer.special_tokens()))

        for batch_sequences, cased_size, uncased_size in examples:
            expectations = (
                (self.cased_tokenizer, cased_size),
                (self.uncased_tokenizer, uncased_size),
            )
            for tokenizer, token_count in expectations:
                tokenizer.build_vocab(batch_sequences)
                self.assertEqual(
                    tokenizer.vocab_size,
                    token_count + sp_tokens_size,
                    msg=msg
                )

    def test_reset_vocab_size(self):
        r"""`vocab_size` falls back to the special token count on reset."""
        msg = 'Must reset vocabulary size after `reset_vocab`.'
        batches = (
            ('HeLlO WoRlD!', 'I aM a LeGeNd.'),
            ('y = f(x)',),
            ('',),
        )

        sp_tokens_size = len(list(CharListTokenizer.special_tokens()))

        for batch_sequences in batches:
            for tokenizer in self.tokenizers:
                tokenizer.build_vocab(batch_sequences)
                tokenizer.reset_vocab()
                size_after_reset = tokenizer.vocab_size
                self.assertEqual(size_after_reset, sp_tokens_size, msg=msg)
Example #15
0
class TestBatchDecode(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharListTokenizer.batch_decode`."""

    @classmethod
    def setUpClass(cls):
        cls.vocab_source = [
            'Hello World!',
            'I am a legend.',
        ]

    @classmethod
    def tearDownClass(cls):
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharListTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = CharListTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(CharListTokenizer.batch_decode),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                    ),
                    inspect.Parameter(
                        name='batch_token_ids',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=Iterable[Iterable[int]],
                        default=inspect.Parameter.empty
                    ),
                    inspect.Parameter(
                        name='remove_special_tokens',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=bool,
                        default=False
                    )
                ],
                return_annotation=List[str]
            ),
            msg=msg
        )

    def test_invalid_input_batch_token_ids(self):
        r"""Raise `TypeError` when input `batch_token_ids` is invalid."""
        msg1 = (
            'Must raise `TypeError` when input `batch_token_ids` is invalid.'
        )
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, object(), lambda x: x, type, None,
            NotImplemented, ..., [False], [True], [0], [1], [-1], [0.0], [1.0],
            [math.nan], [-math.nan], [math.inf], [-math.inf], [0j], [1j],
            [object()], [lambda x: x], [type], [None], [NotImplemented], [...],
            [[], False], [[], True], [[], 0], [[], 1], [[], -1], [[], 0.0],
            [[], 1.0], [[], math.nan], [[], -math.nan], [[], math.inf],
            [[], -math.inf], [[], 0j], [[], 1j], [[], object()],
            [[], lambda x: x], [[], type], [[], None], [[], NotImplemented],
            [[], ...], [[0.0]], [[1.0]], [[math.nan]], [[-math.nan]],
            [[math.inf]], [[-math.inf]], [[0j]], [[1j]], [['']], [[b'']],
            [[()]], [[[]]], [[{}]], [[set()]], [[object()]], [[lambda x: x]],
            [[type]], [[None]], [[NotImplemented]], [[...]], [[0, 0.0]],
            [[0, 1.0]], [[0, math.nan]], [[0, -math.nan]], [[0, math.inf]],
            [[0, -math.inf]], [[0, 0j]], [[0, 1j]], [[0, '']], [[0, b'']],
            [[0, ()]], [[0, []]], [[0, {}]], [[0, set()]], [[0, object()]],
            [[0, lambda x: x]], [[0, type]], [[0, None]],
            [[0, NotImplemented]], [[0, ...]],
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.batch_decode(batch_token_ids=invalid_input)

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`batch_token_ids` must be an instance of '
                    '`Iterable[Iterable[int]]`.',
                    msg=msg2
                )

    def test_invalid_input_remove_special_tokens(self):
        r"""Raise `TypeError` when input `remove_special_tokens` is invalid."""
        msg1 = (
            'Must raise `TypeError` when input `remove_special_tokens` is '
            'invalid.'
        )
        msg2 = 'Inconsistent error message.'
        examples = (
            0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf, -math.inf, 0j, 1j,
            '', b'', 0j, 1j, (), [], {}, set(), object(), lambda x: x, type, None,
            NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.batch_decode(
                        batch_token_ids=[[]],
                        remove_special_tokens=invalid_input
                    )

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`remove_special_tokens` must be an instance of `bool`.',
                    msg=msg2
                )

    def test_return_type(self):
        r"""Return `List[str]`."""
        msg = 'Must return `List[str]`.'
        examples = (
            [[0, 1, 2, 3], [4, 5, 6, 7, 8]],
            [[9, 10, 11, 12, 13], []],
            [[], [14, 15, 16, 17]],
            [[], []],
            [],
        )

        for batch_token_ids in examples:
            for tokenizer in self.tokenizers:
                batch_sequences = tokenizer.batch_decode(
                    batch_token_ids=batch_token_ids
                )
                self.assertIsInstance(batch_sequences, list, msg=msg)
                for sequence in batch_sequences:
                    self.assertIsInstance(sequence, str, msg=msg)

    def test_remove_special_tokens(self):
        r"""Remove special tokens."""
        msg = 'Must remove special tokens.'
        examples = (
            (
                False,
                [
                    [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1, 2],
                    [0, 14, 5, 9, 15, 5, 9, 5, 3, 1, 2, 2],
                    [0, 19, 4, 6, 16, 6, 17, 8, 18, 1],
                ],
                [
                    '[bos]Hello World![eos][pad]',
                    '[bos]I am a [unk][eos][pad][pad]',
                    '[bos][unk]legend.[eos]',
                ],
                [
                    '[bos]hello world![eos][pad]',
                    '[bos]i am a [unk][eos][pad][pad]',
                    '[bos][unk]legend.[eos]',
                ],
            ),
            (
                True,
                [
                    [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1, 2],
                    [0, 14, 5, 9, 15, 5, 9, 5, 3, 1, 2, 2],
                    [0, 19, 4, 6, 16, 6, 17, 8, 18, 1],
                ],
                [
                    'Hello World!',
                    'I am a [unk]',
                    '[unk]legend.',
                ],
                [
                    'hello world!',
                    'i am a [unk]',
                    '[unk]legend.',
                ],
            ),
        )

        for (
                remove_special_tokens,
                batch_token_ids,
                cased_batch_sequence,
                uncased_batch_sequence
        ) in examples:
            self.assertEqual(
                self.cased_tokenizer.batch_decode(
                    batch_token_ids=batch_token_ids,
                    remove_special_tokens=remove_special_tokens
                ),
                cased_batch_sequence,
                msg=msg
            )
            self.assertEqual(
                self.uncased_tokenizer.batch_decode(
                    batch_token_ids=batch_token_ids,
                    remove_special_tokens=remove_special_tokens
                ),
                uncased_batch_sequence,
                msg=msg
            )
class TestTokenize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharListTokenizer.tokenize`."""

    @classmethod
    def setUpClass(cls):
        cls.vocab_source = [
            'Hello World!',
            'I am a legend.',
        ]

    @classmethod
    def tearDownClass(cls):
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharListTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = CharListTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(CharListTokenizer.tokenize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty
                    ),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty
                    )
                ],
                return_annotation=List[str]
            ),
            msg=msg
        )

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, b'', 0j, 1j, (), [], {}, set(), object(), lambda x: x,
            type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.tokenize(invalid_input)

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`sequence` must be an instance of `str`.',
                    msg=msg2
                )

    def test_return_type(self):
        r"""Return `List[str]`."""
        msg = 'Must return `List[str]`.'
        examples = (
            'Hello world!',
            'H',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                tokens = tokenizer.tokenize(sequence)
                self.assertIsInstance(tokens, list, msg=msg)
                for token in tokens:
                    self.assertIsInstance(token, str, msg=msg)

    def test_normalize(self):
        r"""Return sequence is normalized."""
        msg = 'Return sequence must be normalized.'
        examples = (
            (
                ' HeLlO WoRlD!',
                [
                    'H', 'e', 'L', 'l', 'O', ' ',
                    'W', 'o', 'R', 'l', 'D', '!',
                ],
                [
                    'h', 'e', 'l', 'l', 'o', ' ',
                    'w', 'o', 'r', 'l', 'd', '!',
                ],
            ),
            (
                'HeLlO WoRlD! ',
                [
                    'H', 'e', 'L', 'l', 'O', ' ',
                    'W', 'o', 'R', 'l', 'D', '!',
                ],
                [
                    'h', 'e', 'l', 'l', 'o', ' ',
                    'w', 'o', 'r', 'l', 'd', '!',
                ],
            ),
            (
                '  HeLlO  WoRlD!  ',
                [
                    'H', 'e', 'L', 'l', 'O', ' ',
                    'W', 'o', 'R', 'l', 'D', '!',
                ],
                [
                    'h', 'e', 'l', 'l', 'o', ' ',
                    'w', 'o', 'r', 'l', 'd', '!',
                ],
            ),
            (
                '0',
                ['0'],
                ['0'],
            ),
            (
                'é',
                [unicodedata.normalize('NFKC', 'é')],
                [unicodedata.normalize('NFKC', 'é')],
            ),
            (
                '0é',
                [
                    unicodedata.normalize('NFKC', '0'),
                    unicodedata.normalize('NFKC', 'é'),
                ],
                [
                    unicodedata.normalize('NFKC', '0'),
                    unicodedata.normalize('NFKC', 'é'),
                ],
            ),
            (
                '',
                [],
                [],
            ),
        )

        for sequence, cased_tokens, uncased_tokens in examples:
            self.assertEqual(
                self.cased_tokenizer.tokenize(sequence),
                cased_tokens,
                msg=msg
            )
            self.assertEqual(
                self.uncased_tokenizer.tokenize(sequence),
                uncased_tokens,
                msg=msg
            )
Example #17
0
class TestBuildVocab(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharListTokenizer.build_vocab`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharListTokenizer()
        self.uncased_tokenizer = CharListTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(CharListTokenizer.build_vocab),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty
                    ),
                    inspect.Parameter(
                        name='batch_sequences',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=Iterable[str],
                        default=inspect.Parameter.empty
                    ),
                    inspect.Parameter(
                        name='min_count',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=int,
                        default=1
                    ),
                ],
                return_annotation=None
            ),
            msg=msg
        )

    def test_invalid_input_batch_sequences(self):
        r"""Raise `TypeError` when input `batch_sequences` is invalid."""
        msg1 = (
            'Must raise `TypeError` when input `batch_sequences` is invalid.'
        )
        msg2 = 'Inconsistent error message.'
        examples = (
            # Values which are not iterable at all.
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, object(), lambda x: x, type, None,
            NotImplemented, ...,
            # Lists whose single item is not a `str`.
            [False], [True], [0], [1], [-1], [0.0], [1.0],
            [math.nan], [-math.nan], [math.inf], [-math.inf], [0j], [1j],
            [b''], [()], [[]], [{}], [set()], [object()], [lambda x: x],
            [type], [None], [NotImplemented], [...],
            # Lists mixing a `str` with an item which is not a `str`.
            ['', False], ['', True],
            ['', 0], ['', 1], ['', -1], ['', 0.0], ['', 1.0], ['', math.nan],
            ['', -math.nan], ['', math.inf], ['', -math.inf], ['', 0j],
            ['', 1j], ['', b''], ['', ()], ['', []], ['', {}], ['', set()],
            ['', object()], ['', lambda x: x], ['', type], ['', None],
            ['', NotImplemented], ['', ...],
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.build_vocab(batch_sequences=invalid_input)

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`batch_sequences` must be an instance of '
                    '`Iterable[str]`.',
                    msg=msg2
                )

    def test_invalid_input_min_count(self):
        r"""Raise `TypeError` when input `min_count` is invalid."""
        msg1 = 'Must raise `TypeError` when input `min_count` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            0.0, 1.0, math.nan, -math.nan, math.inf, -math.inf, 0j, 1j, '',
            b'', (), [], {}, set(), object(), lambda x: x, type, None,
            NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.build_vocab(
                        batch_sequences=[],
                        min_count=invalid_input
                    )

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`min_count` must be an instance of `int`.',
                    msg=msg2
                )

    def test_cased_sensitive(self):
        r"""Vocabulary must be case sensitive."""
        msg = 'Vocabulary must be case sensitive.'
        # Each case is `(batch, cased size, uncased size)` where both sizes
        # exclude special tokens.
        examples = (
            (('ABCD', 'abcd'), 8, 4),
            (('efghi', 'EFGHI'), 10, 5),
        )

        sp_tokens_size = len(list(CharListTokenizer.special_tokens()))

        for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
            expectations = (
                (self.cased_tokenizer, cased_vocab_size),
                (self.uncased_tokenizer, uncased_vocab_size),
            )
            for tokenizer, token_count in expectations:
                # Start from a clean vocabulary so sizes are per-case.
                tokenizer.reset_vocab()
                tokenizer.build_vocab(batch_sequences=batch_sequences)
                self.assertEqual(
                    tokenizer.vocab_size,
                    token_count + sp_tokens_size,
                    msg=msg
                )

    def test_sort_by_token_frequency_in_descending_order(self):
        r"""Sort vocabulary by token frequency in descending order."""
        msg = (
            'Must sort vocabulary by token frequency in descending order.'
        )
        # Each case is `(batch, cased token order, uncased token order)`;
        # token ids must be non-decreasing along each expected order.
        examples = (
            (
                ('AaAa', 'bBb', 'cC', 'd'),
                ('A', 'a', 'b', 'B', 'c', 'C', 'd'),
                ('a', 'b', 'c', 'd'),
            ),
            (
                ('EeEeE', 'FfFf', 'GgG', 'Hh', 'I'),
                ('E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I'),
                ('e', 'f', 'g', 'h', 'i'),
            ),
        )

        for (
                batch_sequences,
                cased_vocab_order,
                uncased_vocab_order
        ) in examples:
            expectations = (
                (self.cased_tokenizer, cased_vocab_order),
                (self.uncased_tokenizer, uncased_vocab_order),
            )
            for tokenizer, vocab_order in expectations:
                tokenizer.reset_vocab()
                tokenizer.build_vocab(batch_sequences=batch_sequences)

                # Compare every token with its successor in expected order.
                for vocab1, vocab2 in zip(vocab_order[:-1], vocab_order[1:]):
                    self.assertLessEqual(
                        tokenizer.convert_token_to_id(vocab1),
                        tokenizer.convert_token_to_id(vocab2),
                        msg=msg
                    )

    def test_min_count(self):
        r"""Filter out tokens whose frequency is smaller than `min_count`."""
        msg = (
            'Must filter out tokens whose frequency is smaller than '
            '`min_count`.'
        )
        # Each case is `(batch, cased known tokens, cased unknown tokens,
        # uncased known tokens, uncased unknown tokens, min_count)`.
        # Single-token groups need a trailing comma: `('d')` is the string
        # `'d'`, not a tuple, and only iterates correctly by accident.
        examples = (
            (
                ('AaAa', 'bBb', 'cC', 'd'),
                ('A', 'a', 'b'),
                ('B', 'c', 'C', 'd'),
                ('a', 'b', 'c'),
                ('d',),
                2,
            ),
            (
                ('EeEeE', 'FfFf', 'GgG', 'Hh', 'I'),
                ('E',),
                ('e', 'F', 'f', 'G', 'g', 'H', 'h', 'I'),
                ('e', 'f', 'g'),
                ('h', 'i'),
                3,
            ),
            (
                ('EeEeE', 'FfFf', 'GgG', 'Hh', 'I'),
                (),
                ('E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I'),
                (),
                ('e', 'f', 'g', 'h', 'i'),
                10,
            ),
        )

        for (
                batch_sequences,
                cased_known_token,
                cased_unknown_token,
                uncased_known_token,
                uncased_unknown_token,
                min_count
        ) in examples:
            expectations = (
                (
                    self.cased_tokenizer,
                    cased_known_token,
                    cased_unknown_token,
                ),
                (
                    self.uncased_tokenizer,
                    uncased_known_token,
                    uncased_unknown_token,
                ),
            )
            for tokenizer, known_tokens, unknown_tokens in expectations:
                tokenizer.reset_vocab()
                tokenizer.build_vocab(
                    batch_sequences=batch_sequences,
                    min_count=min_count
                )

                # Known tokens must survive a token -> id -> token round trip.
                for token in known_tokens:
                    token_id = tokenizer.convert_token_to_id(token)
                    self.assertEqual(
                        token,
                        tokenizer.convert_id_to_token(token_id),
                        msg=msg
                    )

                # Filtered tokens must map to the unknown token's id.
                unk_token_id = tokenizer.convert_token_to_id(
                    CharListTokenizer.unk_token
                )
                for unk_token in unknown_tokens:
                    self.assertEqual(
                        tokenizer.convert_token_to_id(unk_token),
                        unk_token_id,
                        msg=msg
                    )
Example #18
0
class TestDetokenize(unittest.TestCase):
    r"""Test Case for `lmp.tokenizer.CharListTokenizer.detokenize`."""

    def setUp(self):
        r"""Create a fresh cased and uncased tokenizer before each test."""
        cased = CharListTokenizer()
        uncased = CharListTokenizer(is_uncased=True)
        self.cased_tokenizer = cased
        self.uncased_tokenizer = uncased
        self.tokenizers = [cased, uncased]

    def tearDown(self):
        r"""Drop the tokenizer instances and force a garbage collection."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""`detokenize` exposes the expected parameters and annotations."""
        msg = 'Inconsistent method signature.'

        actual = inspect.signature(CharListTokenizer.detokenize)
        positional_or_keyword = inspect.Parameter.POSITIONAL_OR_KEYWORD
        expected = inspect.Signature(
            parameters=[
                inspect.Parameter(
                    name='self',
                    kind=positional_or_keyword,
                    default=inspect.Parameter.empty
                ),
                inspect.Parameter(
                    name='tokens',
                    kind=positional_or_keyword,
                    annotation=Iterable[str],
                    default=inspect.Parameter.empty
                ),
            ],
            return_annotation=str
        )

        self.assertEqual(actual, expected, msg=msg)

    def test_invalid_input(self):
        r"""Reject `tokens` which is not `Iterable[str]`."""
        msg1 = 'Must raise `TypeError` when input is invalid.'
        msg2 = 'Inconsistent error message.'
        expected_error = '`tokens` must be instance of `Iterable[str]`.'
        examples = (
            0, 1, -1, 0.0, 1.0, math.nan, math.inf, True, False,
            (1, 2, 3), [1, 2, 3], {1, 2, 3}, None,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.detokenize(invalid_input)

                raised_message = ctx_man.exception.args[0]
                self.assertEqual(raised_message, expected_error, msg=msg2)

    def test_expected_return(self):
        r"""Tokens are joined back into the expected `str`."""
        msg = 'Inconsistent detokenization result.'
        # Each case is `(tokens, expected sequence)`.
        examples = (
            (list('Hello world!'), 'Hello world!'),
            ([], ''),
        )

        for tokens, ans_sequence in examples:
            for tokenizer in self.tokenizers:
                out_sequence = tokenizer.detokenize(tokens)
                self.assertIsInstance(out_sequence, str, msg=msg)
                self.assertEqual(out_sequence, ans_sequence, msg=msg)

    def test_case_insensitive(self):
        r"""Cased and uncased tokenizers detokenize tokens identically."""
        msg = 'Inconsistent detokenization result.'
        examples = (
            list('HeLlO WoRlD!'),
            list('hello world!'),
        )

        for tokens in examples:
            cased_sequence = self.cased_tokenizer.detokenize(tokens)
            uncased_sequence = self.uncased_tokenizer.detokenize(tokens)
            self.assertEqual(cased_sequence, uncased_sequence, msg=msg)