def setUp(self):
     r"""Prepare one cased and one uncased tokenizer for each test.

     Both tokenizers get their vocabulary built from the `vocab_source`
     attribute looked up on the test class, and both are also collected,
     cased first, in `self.tokenizers`.
     """
     vocab_source = self.__class__.vocab_source
     self.cased_tokenizer = CharDictTokenizer()
     self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
     self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]
     # Same vocabulary source for every tokenizer, built in list order.
     for tokenizer in self.tokenizers:
         tokenizer.build_vocab(vocab_source)
    def test_invalid_input_max_seq_len(self):
        r"""Raise exception when input `max_seq_len` is invalid.

        Each value in `examples` is passed as `max_seq_len` to the
        `create_collate_fn` method of a `BaseDataset` built from an empty
        list.  The call must raise `TypeError` or `ValueError`, and the
        first exception argument must equal the message expected for the
        exception type that was raised.
        """
        msg1 = (
            'Must raise `TypeError` or `ValueError` when input `max_seq_len` '
            'is invalid.')
        msg2 = 'Inconsistent error message.'
        # Values that must be rejected: booleans, integers ruled out by the
        # expected `ValueError` message, and objects that are not integers.
        examples = (
            False,
            True,
            0,
            1,
            -2,
            0.0,
            1.0,
            math.nan,
            -math.nan,
            math.inf,
            -math.inf,
            0j,
            1j,
            '',
            b'',
            (),
            [],
            {},
            set(),
            object(),
            lambda x: x,
            type,
            None,
            NotImplemented,
            ...,
        )

        for invalid_input in examples:
            # Report every rejected value on its own instead of stopping at
            # the first failing example.
            with self.subTest(invalid_input=invalid_input):
                # Build the collaborators outside `assertRaises` so that only
                # the call under test can satisfy the expected exception.
                dataset = BaseDataset([])
                tokenizer = CharDictTokenizer()

                with self.assertRaises((TypeError, ValueError),
                                       msg=msg1) as ctx_man:
                    dataset.create_collate_fn(
                        tokenizer=tokenizer, max_seq_len=invalid_input)

                # The expected message depends on which exception was raised.
                if isinstance(ctx_man.exception, TypeError):
                    self.assertEqual(
                        ctx_man.exception.args[0],
                        '`max_seq_len` must be an instance of `int`.',
                        msg=msg2)
                else:
                    self.assertEqual(
                        ctx_man.exception.args[0],
                        '`max_seq_len` must be greater than `1` or equal to '
                        '`-1`.',
                        msg=msg2)
Esempio n. 3
0
    def test_invalid_input_is_uncased(self):
        r"""Raise `TypeError` when input `is_uncased` is invalid.

        Each value in `examples` is passed as `is_uncased` to the
        `CharDictTokenizer` constructor.  The call must raise `TypeError`
        and the first exception argument must equal the expected message.
        """
        msg1 = 'Must raise `TypeError` when input `is_uncased` is invalid.'
        msg2 = 'Inconsistent error message.'
        # None of these values is a `bool`, which is what the expected error
        # message requires.
        examples = (
            0,
            1,
            -1,
            0.0,
            1.0,
            math.nan,
            -math.nan,
            math.inf,
            -math.inf,
            0j,
            1j,
            '',
            b'',
            (),
            [],
            {},
            set(),
            object(),
            lambda x: x,
            type,
            None,
            NotImplemented,
            ...,
        )

        for invalid_input in examples:
            # Report every rejected value on its own instead of stopping at
            # the first failing example.
            with self.subTest(invalid_input=invalid_input):
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    CharDictTokenizer(is_uncased=invalid_input)

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`is_uncased` must be an instance of `bool`.',
                    msg=msg2)
Esempio n. 4
0
 def setUp(self):
     r"""Prepare one cased and one uncased tokenizer for each test.

     The pair is also collected, cased first, in `self.tokenizers`.
     """
     cased = CharDictTokenizer()
     uncased = CharDictTokenizer(is_uncased=True)
     self.cased_tokenizer, self.uncased_tokenizer = cased, uncased
     self.tokenizers = [cased, uncased]