Example 1
import gc
import inspect
import math
import unicodedata
import unittest

from typing import List

from lmp.tokenizer import CharDictTokenizer


class TestTokenize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharDictTokenizer.tokenize`."""
    @classmethod
    def setUpClass(cls):
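        # Tiny corpus consumed by `build_vocab` in `setUp` below.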
        cls.vocab_source = [
            'Hello World!',
            'I am a legend.',
        ]

    @classmethod
    def tearDownClass(cls):
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(CharDictTokenizer.tokenize),
            inspect.Signature(parameters=[
                inspect.Parameter(name='self',
                                  kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                                  default=inspect.Parameter.empty),
                inspect.Parameter(name='sequence',
                                  kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                                  annotation=str,
                                  default=inspect.Parameter.empty)
            ],
                              return_annotation=List[str]),
            msg=msg)

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
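        # Non-`str` values spanning bools, numbers (including NaN and the
        # infinities), bytes, complex numbers, containers, callables,
        # `None`, `NotImplemented` and `Ellipsis`.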
        examples = (
            False,
            True,
            0,
            1,
            -1,
            0.0,
            1.0,
            math.nan,
            -math.nan,
            math.inf,
            -math.inf,
            b'',
            0j,
            1j,
            (),
            [],
            {},
            set(),
            object(),
            lambda x: x,
            type,
            None,
            NotImplemented,
            ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.tokenize(invalid_input)

                self.assertEqual(ctx_man.exception.args[0],
                                 '`sequence` must be an instance of `str`.',
                                 msg=msg2)

    def test_return_type(self):
        r"""Return `List[str]`."""
        msg = 'Must return `List[str]`.'
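        # Covers a multi-character string, a single character and the
        # empty string.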
        examples = (
            'Hello world!',
            'H',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                tokens = tokenizer.tokenize(sequence)
                self.assertIsInstance(tokens, list, msg=msg)
                for token in tokens:
                    self.assertIsInstance(token, str, msg=msg)

    def test_normalize(self):
        r"""Return sequence is normalized."""
        msg = 'Return sequence must be normalized.'
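        # Each example is a triple: (raw sequence, expected cased tokens,
        # expected uncased tokens).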
        examples = (
            (
                ' HeLlO WoRlD!',
                [
                    'H',
                    'e',
                    'L',
                    'l',
                    'O',
                    ' ',
                    'W',
                    'o',
                    'R',
                    'l',
                    'D',
                    '!',
                ],
                [
                    'h',
                    'e',
                    'l',
                    'l',
                    'o',
                    ' ',
                    'w',
                    'o',
                    'r',
                    'l',
                    'd',
                    '!',
                ],
            ),
            (
                'HeLlO WoRlD! ',
                [
                    'H',
                    'e',
                    'L',
                    'l',
                    'O',
                    ' ',
                    'W',
                    'o',
                    'R',
                    'l',
                    'D',
                    '!',
                ],
                [
                    'h',
                    'e',
                    'l',
                    'l',
                    'o',
                    ' ',
                    'w',
                    'o',
                    'r',
                    'l',
                    'd',
                    '!',
                ],
            ),
            (
                '  HeLlO  WoRlD!  ',
                [
                    'H',
                    'e',
                    'L',
                    'l',
                    'O',
                    ' ',
                    'W',
                    'o',
                    'R',
                    'l',
                    'D',
                    '!',
                ],
                [
                    'h',
                    'e',
                    'l',
                    'l',
                    'o',
                    ' ',
                    'w',
                    'o',
                    'r',
                    'l',
                    'd',
                    '!',
                ],
            ),
            (
                '0',
                ['0'],
                ['0'],
            ),
            (
                'é',
                [unicodedata.normalize('NFKC', 'é')],
                [unicodedata.normalize('NFKC', 'é')],
            ),
            (
                '0é',
                [
                    unicodedata.normalize('NFKC', '0'),
                    unicodedata.normalize('NFKC', 'é'),
                ],
                [
                    unicodedata.normalize('NFKC', '0'),
                    unicodedata.normalize('NFKC', 'é'),
                ],
            ),
            (
                '',
                [],
                [],
            ),
        )

        for sequence, cased_tokens, uncased_tokens in examples:
            self.assertEqual(self.cased_tokenizer.tokenize(sequence),
                             cased_tokens,
                             msg=msg)
            self.assertEqual(self.uncased_tokenizer.tokenize(sequence),
                             uncased_tokens,
                             msg=msg)
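
Taken together, the assertions above pin down what `tokenize` must do: NFKC-normalize the input, strip and collapse whitespace, lowercase only in uncased mode, and split the result into single characters. Below is a minimal sketch reconstructed from those assertions alone; `tokenize_sketch` and its `is_uncased` parameter are illustrative stand-ins, not the actual `lmp` implementation.

import re
import unicodedata
from typing import List


def tokenize_sketch(sequence: str, is_uncased: bool = False) -> List[str]:
    # Hypothetical stand-in reconstructed purely from the test assertions
    # above; not the real `lmp.tokenizer.CharDictTokenizer.tokenize`.
    if not isinstance(sequence, str):
        # Matches the message expected by `test_invalid_input_sequence`.
        raise TypeError('`sequence` must be an instance of `str`.')

    # NFKC normalization, as `test_normalize` checks with 'é'.
    sequence = unicodedata.normalize('NFKC', sequence)

    # Strip leading/trailing whitespace and collapse interior runs into a
    # single space.
    sequence = re.sub(r'\s+', ' ', sequence.strip())

    # Lowercase only in uncased mode.
    if is_uncased:
        sequence = sequence.lower()

    # Character-level tokenization: one token per character.
    return list(sequence)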
Example 2
import gc
import inspect
import math
import unittest

from typing import List

from lmp.tokenizer import CharDictTokenizer


class TestTokenize(unittest.TestCase):
    r"""Test Case for `lmp.tokenizer.CharDictTokenizer.tokenize`."""
    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(CharDictTokenizer.tokenize),
            inspect.Signature(parameters=[
                inspect.Parameter(name='self',
                                  kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                                  default=inspect.Parameter.empty),
                inspect.Parameter(name='sequence',
                                  kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                                  annotation=str,
                                  default=inspect.Parameter.empty)
            ],
                              return_annotation=List[str]),
            msg=msg)

    def test_invalid_input(self):
        r"""Raise `TypeError` when input is invalid."""
        msg1 = 'Must raise `TypeError` when input is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            0,
            1,
            -1,
            0.0,
            1.0,
            math.nan,
            math.inf,
            True,
            False,
            b'',
            [],
            (),
            {},
            set(),
            object(),
            lambda x: x,
            type,
            None,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.tokenize(invalid_input)

                self.assertEqual(ctx_man.exception.args[0],
                                 '`sequence` must be instance of `str`.',
                                 msg=msg2)

    def test_return_type(self):
        r"""Return `List[str]`."""
        msg = 'Must return `List[str]`.'
        examples = (
            'Hello world!',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                tokens = tokenizer.tokenize(sequence)
                self.assertIsInstance(tokens, list, msg=msg)
                for token in tokens:
                    self.assertIsInstance(token, str, msg=msg)

    def test_unicode_normalize(self):
        r"""Return NFKC normalized characters."""
        msg = 'Must return NFKC normalized characters.'
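        # A visually identical 'é' may be one precomposed code point or a
        # two-code-point combining sequence; after NFKC every output token
        # must be exactly one code point, hence the length check below.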
        examples = (
            ('0', ['0']),
            ('é', ['é']),
            ('0é', ['0', 'é']),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                out_tokens = tokenizer.tokenize(sequence)
                self.assertEqual(out_tokens, ans_tokens, msg=msg)
                for out_token in out_tokens:
                    self.assertEqual(len(out_token), 1, msg=msg)

    def test_case_sensitive(self):
        r"""Return case-sensitive characters when `is_uncased=False`."""
        msg = ('Return result must be case-sensitive when constructed with '
               '`is_uncased=False`.')
        examples = (
            ('HeLlO WoRlD!',
             ['H', 'e', 'L', 'l', 'O', ' ', 'W', 'o', 'R', 'l', 'D', '!']),
            ('HELLO WORLD!',
             ['H', 'E', 'L', 'L', 'O', ' ', 'W', 'O', 'R', 'L', 'D', '!']),
            ('hello world!',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            ('H', ['H']),
            ('h', ['h']),
        )

        for sequence, ans_tokens in examples:
            self.assertEqual(self.cased_tokenizer.tokenize(sequence),
                             ans_tokens,
                             msg=msg)

    def test_case_insensitive(self):
        r"""Return case-insensitive characters when `is_uncased=True`."""
        msg = ('Return result must be case-insensitive when constructed with '
               '`is_uncased=True`.')
        examples = (
            ('HeLlO WoRlD!',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            ('HELLO WORLD!',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            ('hello world!',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            ('H', ['h']),
            ('h', ['h']),
        )

        for sequence, ans_tokens in examples:
            self.assertEqual(self.uncased_tokenizer.tokenize(sequence),
                             ans_tokens,
                             msg=msg)

    def test_whitespace_strip(self):
        r"""Strip input sequence."""
        msg = ('Must strip both leading and trailing whitespace characters '
               'from the input sequence.')
        examples = (
            ('  hello world!  ',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            ('  hello world!',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            ('hello world!  ',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            ('\nhello world!\n',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            (' ', []),
            ('', []),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(tokenizer.tokenize(sequence),
                                 ans_tokens,
                                 msg=msg)

    def test_whitespace_collapse(self):
        r"""Collapse whitespace characters."""
        msg = ('Must collapse consecutive whitespace characters in the input '
               'sequence into a single whitespace character.')
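        # Interior runs of whitespace (including '\n\n') collapse to a
        # single space, which is still emitted as its own token.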
        examples = (
            ('hello  world  !',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', ' ',
              '!']),
            ('hello   world  !',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', ' ',
              '!']),
            ('hello  world   !',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', ' ',
              '!']),
            ('hello  world\n\n!',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', ' ',
              '!']),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(tokenizer.tokenize(sequence),
                                 ans_tokens,
                                 msg=msg)
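
The `len(out_token) == 1` assertion in `test_unicode_normalize` matters because a visually identical 'é' can be stored either as one precomposed code point or as two code points ('e' plus a combining accent). A quick standard-library check of what NFKC composition does in the decomposed case:

import unicodedata

decomposed = 'e\u0301'  # 'e' + COMBINING ACUTE ACCENT; len(decomposed) == 2
composed = unicodedata.normalize('NFKC', decomposed)
assert composed == '\u00e9'  # single precomposed code point 'é'
assert len(composed) == 1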