    def test_yield_value(self):
        r"""Return an iterator which yields `str`."""
        msg = 'Must return an iterator which yields `str`.'
        examples = ('[bos]', '[eos]', '[pad]', '[unk]')

        self.assertIsInstance(BaseDictTokenizer.special_tokens(),
                              Iterator,
                              msg=msg)

        out_tokens = list(BaseDictTokenizer.special_tokens())

        for i, ans_token in enumerate(examples):
            self.assertIsInstance(out_tokens[i], str, msg=msg)
            self.assertEqual(out_tokens[i], ans_token, msg=msg)
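
The test above fixes both the return type and the yield order. A minimal sketch of a `special_tokens` classmethod that would satisfy it, written as a generator (generators are `Iterator` instances); this is an illustration of the contract, not the library's actual code:

from typing import Iterator


class BaseDictTokenizer:
    @classmethod
    def special_tokens(cls) -> Iterator[str]:
        # Yield the special tokens in a fixed order:
        # begin-of-sentence, end-of-sentence, padding, unknown.
        for token in ('[bos]', '[eos]', '[pad]', '[unk]'):
            yield token
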
Example #2
    def test_experiment_does_not_exist(self):
        r"""Raise `FileNotFoundError` when `experiment` does not exist."""
        msg1 = (
            'Must raise `FileNotFoundError` when `experiment` does not exist.')
        msg2 = 'Inconsistent error message.'
        examples = (self.__class__.experiment, 'I-AM-A-TEST-AND-I-DONT-EXIST')

        for experiment in examples:
            with self.assertRaises(FileNotFoundError, msg=msg1) as ctx_man:
                BaseDictTokenizer.load(experiment=experiment)

            test_path = os.path.join(DATA_PATH, experiment, 'tokenizer.json')
            self.assertEqual(ctx_man.exception.args[0],
                             f'File {test_path} does not exist.',
                             msg=msg2)
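
The asserted message also pins down how `load` is expected to build the path it checks. A hedged sketch of that check, reusing the same `DATA_PATH`/`experiment` layout as the test:

import os

# Inside `BaseDictTokenizer.load(experiment=...)`, before reading the file:
file_path = os.path.join(DATA_PATH, experiment, 'tokenizer.json')
if not os.path.exists(file_path):
    raise FileNotFoundError(f'File {file_path} does not exist.')
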
Example #3
    def test_load_result(self):
        r"""Load `tokenizer.json`."""
        msg = 'Inconsistent `tokenizer.json` format.'
        examples = (
            {
                'is_uncased': False,
                'token_to_id': {
                    'A': 0,
                    'B': 1,
                    'C': 2,
                },
                'id_to_token': {
                    0: 'A',
                    1: 'B',
                    2: 'C',
                },
            },
            {
                'is_uncased': True,
                'token_to_id': {
                    'a': 0,
                    'b': 1,
                    'c': 2,
                },
                'id_to_token': {
                    0: 'a',
                    1: 'b',
                    2: 'c',
                },
            },
        )

        test_path = os.path.join(self.__class__.test_dir, 'tokenizer.json')

        for obj in examples:
            try:
                # Create test file.
                with open(test_path, 'w', encoding='utf-8') as output_file:
                    tmp = {
                        'is_uncased': obj['is_uncased'],
                        'token_to_id': obj['token_to_id']
                    }
                    json.dump(tmp, output_file)

                tokenizer = BaseDictTokenizer.load(
                    experiment=self.__class__.experiment)

                self.assertIsInstance(tokenizer, BaseDictTokenizer, msg=msg)

                for attr_key, attr_value in obj.items():
                    self.assertTrue(hasattr(tokenizer, attr_key), msg=msg)
                    self.assertIsInstance(getattr(tokenizer, attr_key),
                                          type(attr_value),
                                          msg=msg)
                    self.assertEqual(getattr(tokenizer, attr_key),
                                     attr_value,
                                     msg=msg)
            finally:
                # Clean up test file.
                os.remove(test_path)
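
The file written above holds only `is_uncased` and `token_to_id`, yet the loaded tokenizer must also expose `id_to_token` with `int` keys, so `load` has to rebuild the inverse mapping (JSON object keys are always strings). A hedged sketch of that reconstruction step:

import json

# Inside `BaseDictTokenizer.load`, after the existence check sketched earlier:
with open(file_path, 'r', encoding='utf-8') as input_file:
    obj = json.load(input_file)

tokenizer = cls(is_uncased=obj['is_uncased'])
tokenizer.token_to_id = obj['token_to_id']
# JSON keys are strings, so rebuild the id lookup with `int` keys.
tokenizer.id_to_token = {
    token_id: token
    for token, token_id in obj['token_to_id'].items()
}
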
Example #4
    def test_invalid_input_experiment(self):
        r"""Raise exception when input `experiment` is invalid."""
        msg1 = (
            'Must raise `TypeError` or `ValueError` when input `experiment` '
            'is invalid.')
        msg2 = 'Inconsistent error message.'
        examples = (
            False,
            True,
            0,
            1,
            -1,
            0.0,
            1.0,
            math.nan,
            -math.nan,
            math.inf,
            -math.inf,
            0j,
            1j,
            '',
            b'',
            (),
            [],
            {},
            set(),
            object(),
            lambda x: x,
            type,
            None,
            NotImplemented,
            ...,
        )

        for invalid_input in examples:
            with self.assertRaises((TypeError, ValueError),
                                   msg=msg1) as ctx_man:
                BaseDictTokenizer.load(experiment=invalid_input)

            if isinstance(ctx_man.exception, TypeError):
                self.assertEqual(ctx_man.exception.args[0],
                                 '`experiment` must be an instance of `str`.',
                                 msg=msg2)
            else:
                self.assertEqual(ctx_man.exception.args[0],
                                 '`experiment` must not be empty.',
                                 msg=msg2)
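
The two assertion branches above translate into a type check followed by an emptiness check; a minimal sketch of that validation as it might appear at the top of `load`:

# Inside `BaseDictTokenizer.load(experiment=...)`, before touching the
# file system:
if not isinstance(experiment, str):
    raise TypeError('`experiment` must be an instance of `str`.')
if not experiment:
    raise ValueError('`experiment` must not be empty.')
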
    def test_abstract_method(self):
        r"""Raise `NotImplementedError` when subclass does not implement."""
        msg1 = (
            'Must raise `NotImplementedError` when subclass does not '
            'implement.'
        )
        msg2 = 'Inconsistent error message.'
        examples = (True, False)

        for is_uncased in examples:
            with self.assertRaises(NotImplementedError, msg=msg1) as ctx_man:
                BaseDictTokenizer(is_uncased=is_uncased).tokenize('')

            self.assertEqual(ctx_man.exception.args[0],
                             'In class `BaseDictTokenizer`: '
                             'method `tokenize` not implemented yet.',
                             msg=msg2)
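
The expected message suggests the base class raises directly from the method body rather than relying on `abc`. A hedged sketch of such a placeholder:

def tokenize(self, sequence: str):
    # Base class placeholder; concrete tokenizers must override this method.
    raise NotImplementedError(
        f'In class `{self.__class__.__name__}`: '
        'method `tokenize` not implemented yet.'
    )
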
    def test_invalid_input_is_uncased(self):
        r"""Raise `TypeError` when input `is_uncased` is invalid."""
        msg1 = 'Must raise `TypeError` when input `is_uncased` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            0,
            1,
            -1,
            0.0,
            1.0,
            math.nan,
            -math.nan,
            math.inf,
            -math.inf,
            0j,
            1j,
            '',
            b'',
            (),
            [],
            {},
            set(),
            object(),
            lambda x: x,
            type,
            None,
            NotImplemented,
            ...,
        )

        for invalid_input in examples:
            with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                BaseDictTokenizer(is_uncased=invalid_input)

            self.assertEqual(ctx_man.exception.args[0],
                             '`is_uncased` must be an instance of `bool`.',
                             msg=msg2)
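
The constructor guard implied by this test is a strict `isinstance` check (plain `int` values such as `0` and `1` must be rejected, so a truthiness test would not do); a minimal sketch, with the default value inferred from the `setUp` in the next example, which constructs `BaseDictTokenizer()` without arguments:

def __init__(self, is_uncased: bool = False):
    if not isinstance(is_uncased, bool):
        raise TypeError('`is_uncased` must be an instance of `bool`.')
    self.is_uncased = is_uncased
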
Example #8
class TestNormalize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.BaseDictTokenizer.normalize`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = BaseDictTokenizer()
        self.uncased_tokenizer = BaseDictTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(BaseDictTokenizer.normalize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty
                    ),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty
                    )
                ],
                return_annotation=str
            ),
            msg=msg
        )

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, b'', (), [], {}, set(), object(), lambda x: x,
            type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.normalize(sequence=invalid_input)

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`sequence` must be an instance of `str`.',
                    msg=msg2
                )

    def test_return_type(self):
        r"""Return `str`."""
        msg = 'Must return `str`.'
        examples = (
            'Hello world!',
            'I am a legend.',
            'y = f(x)',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                self.assertIsInstance(
                    tokenizer.normalize(sequence=sequence),
                    str,
                    msg=msg
                )

    def test_unicode_normalize(self):
        r"""Return NFKC normalized characters."""
        msg = 'Must return NFKC normalized characters.'
        examples = (
            ('0', '0', 1),
            ('é', 'é', 1),
            ('0é', '0é', 2),
        )

        for sequence, normalized_sequence, sequence_len in examples:
            for tokenizer in self.tokenizers:
                out_sequence = tokenizer.normalize(sequence=sequence)
                self.assertEqual(out_sequence, normalized_sequence, msg=msg)
                self.assertEqual(len(out_sequence), sequence_len, msg=msg)

    def test_case_sensitive(self):
        r"""Return case-sensitive sequence."""
        msg = 'Returned sequence must be case-sensitive.'
        examples = (
            ('HeLlO WoRlD!', 'HeLlO WoRlD!', 'hello world!'),
            ('HELLO WORLD!', 'HELLO WORLD!', 'hello world!'),
            ('hello world!', 'hello world!', 'hello world!'),
            ('H', 'H', 'h'),
            ('h', 'h', 'h'),
        )

        for sequence, cased_sequence, uncased_sequence in examples:
            self.assertEqual(
                self.cased_tokenizer.normalize(sequence),
                cased_sequence,
                msg=msg
            )
            self.assertEqual(
                self.uncased_tokenizer.normalize(sequence),
                uncased_sequence,
                msg=msg
            )

    def test_whitespace_strip(self):
        r"""Strip input sequence."""
        msg = 'Must strip both leading and trailing whitespace characters.'
        examples = (
            (' hello world!', 'hello world!'),
            ('hello world! ', 'hello world!'),
            (' hello world! ', 'hello world!'),
            ('  hello world!   ', 'hello world!'),
            ('\nhello world!\n', 'hello world!'),
            (' ', ''),
            ('', ''),
        )

        for sequence, stripped_sequence in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.normalize(sequence),
                    stripped_sequence,
                    msg=msg
                )

    def test_whitespace_collapse(self):
        r"""Collapse whitespace characters."""
        msg = (
            'Must convert consecutive whitespace characters into a single '
            'whitespace character.'
        )
        examples = (
            ('hello  world  !', 'hello world !'),
            ('hello   world  !', 'hello world !'),
            ('hello  world   !', 'hello world !'),
            ('hello  world\n\n!', 'hello world !'),
        )

        for sequence, ans_sequence in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.normalize(sequence),
                    ans_sequence,
                    msg=msg
                )
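
Taken together, the `TestNormalize` cases pin `normalize` down to NFKC normalization, optional case folding, whitespace stripping and whitespace collapsing. A minimal sketch that satisfies every example above (an illustration of the contract, not the library's actual implementation):

import re
import unicodedata

def normalize(self, sequence: str) -> str:
    if not isinstance(sequence, str):
        raise TypeError('`sequence` must be an instance of `str`.')

    # Map visually equivalent code points onto a canonical form.
    sequence = unicodedata.normalize('NFKC', sequence)

    # Uncased tokenizers additionally fold everything to lower case.
    if self.is_uncased:
        sequence = sequence.lower()

    # Strip leading/trailing whitespace, then collapse inner runs of
    # whitespace (including newlines) into single spaces.
    return re.sub(r'\s+', ' ', sequence.strip())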