    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]
    def test_experiment_does_not_exist(self):
        r"""Raise `FileNotFoundError` when `experiment` does not exist."""
        msg1 = (
            'Must raise `FileNotFoundError` when `experiment` does not exist.')
        msg2 = 'Inconsistent error message.'
        examples = (self.__class__.experiment, 'I-AM-A-TEST-AND-I-DONT-EXIST')

        for experiment in examples:
            with self.assertRaises(FileNotFoundError, msg=msg1) as ctx_man:
                CharDictTokenizer.load(experiment=experiment)

            test_path = os.path.join(DATA_PATH, experiment, 'tokenizer.json')
            self.assertEqual(
                ctx_man.exception.args[0],
                f'File {test_path} does not exist.',
                msg=msg2)
    def test_load_result(self):
        r"""Load `tokenizer.json`."""
        msg = 'Inconsistent `tokenizer.json` format.'
        examples = (
            {
                'is_uncased': False,
                'token_to_id': {'A': 0, 'B': 1, 'C': 2},
                'id_to_token': {0: 'A', 1: 'B', 2: 'C'},
            },
            {
                'is_uncased': True,
                'token_to_id': {'a': 0, 'b': 1, 'c': 2},
                'id_to_token': {0: 'a', 1: 'b', 2: 'c'},
            },
        )
        test_path = os.path.join(self.__class__.test_dir, 'tokenizer.json')

        for obj in examples:
            try:
                # Create test file.
                with open(test_path, 'w', encoding='utf-8') as output_file:
                    tmp = {
                        'is_uncased': obj['is_uncased'],
                        'token_to_id': obj['token_to_id'],
                    }
                    json.dump(tmp, output_file)

                tokenizer = CharDictTokenizer.load(
                    experiment=self.__class__.experiment)
                self.assertIsInstance(tokenizer, CharDictTokenizer, msg=msg)

                for attr_key, attr_value in obj.items():
                    self.assertTrue(hasattr(tokenizer, attr_key), msg=msg)
                    self.assertIsInstance(
                        getattr(tokenizer, attr_key),
                        type(attr_value),
                        msg=msg)
                    self.assertEqual(
                        getattr(tokenizer, attr_key),
                        attr_value,
                        msg=msg)
            finally:
                # Clean up test file.
                os.remove(test_path)
    def test_invalid_input_experiment(self):
        r"""Raise exception when input `experiment` is invalid."""
        msg1 = (
            'Must raise `TypeError` or `ValueError` when input `experiment` '
            'is invalid.')
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, '', b'', (), [], {}, set(), object(),
            lambda x: x, type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            with self.assertRaises((TypeError, ValueError), msg=msg1) as ctx_man:
                CharDictTokenizer.load(experiment=invalid_input)

            if isinstance(ctx_man.exception, TypeError):
                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`experiment` must be an instance of `str`.',
                    msg=msg2)
            else:
                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`experiment` must not be empty.',
                    msg=msg2)
    def test_invalid_input_max_seq_len(self):
        r"""Raise exception when input `max_seq_len` is invalid."""
        msg1 = (
            'Must raise `TypeError` or `ValueError` when input `max_seq_len` '
            'is invalid.')
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -2, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, '', b'', (), [], {}, set(), object(),
            lambda x: x, type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            with self.assertRaises((TypeError, ValueError), msg=msg1) as cxt_man:
                BaseDataset([]).create_collate_fn(
                    tokenizer=CharDictTokenizer(),
                    max_seq_len=invalid_input)

            if isinstance(cxt_man.exception, TypeError):
                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`max_seq_len` must be an instance of `int`.',
                    msg=msg2)
            else:
                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`max_seq_len` must be greater than `1` or equal to '
                    '`-1`.',
                    msg=msg2)
    def test_reset_vocab_size(self):
        r"""Reset vocabulary size after `reset_vocab`."""
        msg = 'Must reset vocabulary size after `reset_vocab`.'
        examples = (
            ('HeLlO WoRlD!', 'I aM a LeGeNd.'),
            ('y = f(x)',),
            ('',),
        )
        sp_tokens_size = len(list(CharDictTokenizer.special_tokens()))

        for batch_sequences in examples:
            for tokenizer in self.tokenizers:
                tokenizer.build_vocab(batch_sequences)
                tokenizer.reset_vocab()
                self.assertEqual(tokenizer.vocab_size, sp_tokens_size, msg=msg)
    def test_invalid_input_is_uncased(self):
        r"""Raise `TypeError` when input `is_uncased` is invalid."""
        msg1 = 'Must raise `TypeError` when input `is_uncased` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf, -math.inf,
            0j, 1j, '', b'', (), [], {}, set(), object(), lambda x: x, type,
            None, NotImplemented, ...,
        )

        for invalid_input in examples:
            with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                CharDictTokenizer(is_uncased=invalid_input)

            self.assertEqual(
                ctx_man.exception.args[0],
                '`is_uncased` must be an instance of `bool`.',
                msg=msg2)
    def test_increase_vocab_size(self):
        r"""Increase vocabulary size after `build_vocab`."""
        msg = 'Must increase vocabulary size after `build_vocab`.'
        examples = (
            (('HeLlO WoRlD!', 'I aM a LeGeNd.'), 18, 15),
            (('y = f(x)',), 24, 21),
            (('',), 24, 21),
        )
        sp_tokens_size = len(list(CharDictTokenizer.special_tokens()))

        for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
            self.cased_tokenizer.build_vocab(batch_sequences)
            self.assertEqual(
                self.cased_tokenizer.vocab_size,
                cased_vocab_size + sp_tokens_size,
                msg=msg)
            self.uncased_tokenizer.build_vocab(batch_sequences)
            self.assertEqual(
                self.uncased_tokenizer.vocab_size,
                uncased_vocab_size + sp_tokens_size,
                msg=msg)
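# The vocabulary sizes asserted above count distinct characters on top of the
# special tokens.  The following standalone sketch is an assumption about how
# such growth works (it is not the `lmp` implementation): every previously
# unseen character receives the next free id.
def grow_char_vocab(token_to_id, batch_sequences, is_uncased=False):
    """Assign the next free id to every unseen character in `batch_sequences`."""
    for sequence in batch_sequences:
        if is_uncased:
            sequence = sequence.lower()
        for char in sequence:
            if char not in token_to_id:
                token_to_id[char] = len(token_to_id)
    return token_to_id


# For example, 'HeLlO WoRlD!' and 'I aM a LeGeNd.' contain 18 distinct cased
# characters (15 when lower-cased), matching the expected sizes above.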
class TestDecode(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharDictTokenizer.decode`."""

    @classmethod
    def setUpClass(cls):
        cls.vocab_source = [
            'Hello World!',
            'I am a legend.',
        ]

    @classmethod
    def tearDownClass(cls):
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'
        self.assertEqual(
            inspect.signature(CharDictTokenizer.decode),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD),
                    inspect.Parameter(
                        name='token_ids',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=Iterable[int],
                        default=inspect.Parameter.empty),
                    inspect.Parameter(
                        name='remove_special_tokens',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=bool,
                        default=False),
                ],
                return_annotation=str),
            msg=msg)

    def test_invalid_input_token_ids(self):
        r"""Raise `TypeError` when input `token_ids` is invalid."""
        msg1 = 'Must raise `TypeError` when input `token_ids` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, object(), lambda x: x, type, None,
            NotImplemented, ...,
            [0.0], [1.0], [math.nan], [-math.nan], [math.inf], [-math.inf],
            [0j], [1j], [''], [b''], [()], [[]], [{}], [set()], [object()],
            [lambda x: x], [type], [None], [NotImplemented], [...],
            [0, 0.0], [0, 1.0], [0, math.nan], [0, -math.nan], [0, math.inf],
            [0, -math.inf], [0, 0j], [0, 1j], [0, ''], [0, b''], [0, ()],
            [0, []], [0, {}], [0, set()], [0, object()], [0, lambda x: x],
            [0, type], [0, None], [0, NotImplemented], [0, ...],
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.decode(token_ids=invalid_input)

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`token_ids` must be an instance of `Iterable[int]`.',
                    msg=msg2)

    def test_invalid_input_remove_special_tokens(self):
        r"""Raise `TypeError` when input `remove_special_tokens` is invalid."""
        msg1 = ('Must raise `TypeError` when input `remove_special_tokens` is '
                'invalid.')
        msg2 = 'Inconsistent error message.'
        examples = (
            0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf, -math.inf,
            0j, 1j, '', b'', (), [], {}, set(), object(), lambda x: x, type,
            None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.decode(
                        token_ids=[],
                        remove_special_tokens=invalid_input)

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`remove_special_tokens` must be an instance of `bool`.',
                    msg=msg2)

    def test_return_type(self):
        r"""Return `str`."""
        msg = 'Must return `str`.'
        examples = (
            [0, 1, 2, 3],
            [4, 5, 6, 7, 8, 9],
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            [0],
            [],
        )

        for token_ids in examples:
            for tokenizer in self.tokenizers:
                self.assertIsInstance(
                    tokenizer.decode(token_ids=token_ids),
                    str,
                    msg=msg)

    def test_remove_special_tokens(self):
        r"""Remove special tokens."""
        msg = 'Must remove special tokens.'
        examples = (
            (False, [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1, 2],
             '[bos]Hello World![eos][pad]', '[bos]hello world![eos][pad]'),
            (False, [0, 14, 5, 9, 15, 5, 9, 5, 3, 1, 2, 2],
             '[bos]I am a [unk][eos][pad][pad]',
             '[bos]i am a [unk][eos][pad][pad]'),
            (False, [0, 19, 4, 6, 16, 6, 17, 8, 18, 1],
             '[bos][unk]legend.[eos]', '[bos][unk]legend.[eos]'),
            (True, [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1, 2],
             'Hello World!', 'hello world!'),
            (True, [0, 14, 5, 9, 15, 5, 9, 5, 3, 1, 2, 2],
             'I am a [unk]', 'i am a [unk]'),
            (True, [0, 19, 4, 6, 16, 6, 17, 8, 18, 1],
             '[unk]legend.', '[unk]legend.'),
        )

        for (remove_special_tokens, token_ids, cased_sequence,
             uncased_sequence) in examples:
            self.assertEqual(
                self.cased_tokenizer.decode(
                    token_ids=token_ids,
                    remove_special_tokens=remove_special_tokens),
                cased_sequence,
                msg=msg)
            self.assertEqual(
                self.uncased_tokenizer.decode(
                    token_ids=token_ids,
                    remove_special_tokens=remove_special_tokens),
                uncased_sequence,
                msg=msg)
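# Illustrative sketch, not the library implementation: the decode behaviour
# exercised above maps ids back to tokens, optionally drops the special tokens
# `[bos]`, `[eos]` and `[pad]` (keeping `[unk]`), and joins the characters
# back together.  The name and special-token spellings below are assumptions
# taken from the expected strings in `test_remove_special_tokens`.
from typing import Dict, Iterable


def decode_char_ids(id_to_token: Dict[int, str], token_ids: Iterable[int],
                    remove_special_tokens: bool = False) -> str:
    """Map token ids back to characters and optionally drop special tokens."""
    tokens = [id_to_token.get(token_id, '[unk]') for token_id in token_ids]
    if remove_special_tokens:
        tokens = [t for t in tokens if t not in ('[bos]', '[eos]', '[pad]')]
    return ''.join(tokens)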
    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]
class TestEncode(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharDictTokenizer.encode`."""

    @classmethod
    def setUpClass(cls):
        cls.vocab_source = [
            'Hello World!',
            'I am a legend.',
        ]

    @classmethod
    def tearDownClass(cls):
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'
        self.assertEqual(
            inspect.signature(CharDictTokenizer.encode),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty),
                    inspect.Parameter(
                        name='max_seq_len',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=int,
                        default=-1),
                ],
                return_annotation=List[int]),
            msg=msg)

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, b'', (), [], {}, set(), object(),
            lambda x: x, type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.encode(sequence=invalid_input)

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`sequence` must be an instance of `str`.',
                    msg=msg2)

    def test_invalid_input_max_seq_len(self):
        r"""Raise exception when input `max_seq_len` is invalid."""
        msg1 = (
            'Must raise `TypeError` or `ValueError` when input `max_seq_len` '
            'is invalid.')
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -2, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, '', b'', (), [], {}, set(), object(),
            lambda x: x, type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(
                        (TypeError, ValueError), msg=msg1) as cxt_man:
                    tokenizer.encode(sequence='', max_seq_len=invalid_input)

                if isinstance(cxt_man.exception, TypeError):
                    self.assertEqual(
                        cxt_man.exception.args[0],
                        '`max_seq_len` must be an instance of `int`.',
                        msg=msg2)
                else:
                    self.assertEqual(
                        cxt_man.exception.args[0],
                        '`max_seq_len` must be greater than `1` or equal to '
                        '`-1`.',
                        msg=msg2)

    def test_return_type(self):
        r"""Return `List[int]`."""
        msg = 'Must return `List[int]`.'
        examples = (
            'Hello world!',
            'I am a legend.',
            'y = f(x)',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                token_ids = tokenizer.encode(sequence=sequence)
                self.assertIsInstance(token_ids, list, msg=msg)
                for token_id in token_ids:
                    self.assertIsInstance(token_id, int, msg=msg)

    def test_encode_format(self):
        r"""Follow encode format."""
        msg = 'Must follow encode format: [bos] t1 t2 ... tn [eos].'
        examples = (
            ('Hello World!',
             [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1]),
            ('I am a legend.',
             [0, 14, 5, 9, 15, 5, 9, 5, 4, 6, 16, 6, 17, 8, 18, 1]),
            ('y = f(x)',
             [0, 3, 5, 3, 5, 3, 3, 3, 3, 1]),
            ('', [0, 1]),
        )

        for sequence, token_ids in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.encode(sequence=sequence),
                    token_ids,
                    msg=msg)

    def test_truncate(self):
        r"""Token ids' length must not exceed `max_seq_len`."""
        msg = "Token ids' length must not exceed `max_seq_len`."
        examples = (
            ('Hello World!', [0, 10, 6, 4, 4, 7, 5, 11, 7, 1], 10),
            ('I am a legend.', [0, 14, 5, 9, 1], 5),
            ('y = f(x)', [0, 3, 1], 3),
            ('', [0, 1], 2),
        )

        for sequence, token_ids, max_seq_len in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.encode(sequence=sequence,
                                     max_seq_len=max_seq_len),
                    token_ids,
                    msg=msg)

    def test_padding(self):
        r"""Token ids' length must pad to `max_seq_len`."""
        msg = "Token ids' length must pad to `max_seq_len`."
        examples = (
            ('Hello World!',
             [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1, 2],
             15),
            ('I am a legend.',
             [0, 14, 5, 9, 15, 5, 9, 5, 4, 6, 16, 6, 17, 8, 18, 1, 2, 2, 2, 2],
             20),
            ('y = f(x)',
             [0, 3, 5, 3, 5, 3, 3, 3, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
             20),
            ('',
             [0, 1, 2, 2, 2, 2, 2, 2, 2, 2],
             10),
        )

        for sequence, token_ids, max_seq_len in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.encode(sequence=sequence,
                                     max_seq_len=max_seq_len),
                    token_ids,
                    msg=msg)
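# Illustrative sketch, not the library implementation: the encode format,
# truncation and padding behaviour tested above.  The special token ids are an
# assumption inferred from the expected outputs in these tests:
# `[bos]` = 0, `[eos]` = 1, `[pad]` = 2, `[unk]` = 3.
from typing import Dict, List


def encode_char_tokens(token_to_id: Dict[str, int], tokens: List[str],
                       max_seq_len: int = -1) -> List[int]:
    """Wrap tokens as `[bos] t1 ... tn [eos]`, then truncate or pad."""
    token_ids = [0] + [token_to_id.get(token, 3) for token in tokens] + [1]
    if max_seq_len == -1:
        return token_ids
    # Truncate so the result never exceeds `max_seq_len`, keeping `[eos]` last.
    if len(token_ids) > max_seq_len:
        token_ids = token_ids[:max_seq_len - 1] + [1]
    # Right-pad with `[pad]` up to exactly `max_seq_len` ids.
    return token_ids + [2] * (max_seq_len - len(token_ids))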
class TestTokenize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharDictTokenizer.tokenize`."""

    @classmethod
    def setUpClass(cls):
        cls.vocab_source = [
            'Hello World!',
            'I am a legend.',
        ]

    @classmethod
    def tearDownClass(cls):
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'
        self.assertEqual(
            inspect.signature(CharDictTokenizer.tokenize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty),
                ],
                return_annotation=List[str]),
            msg=msg)

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, b'', 0j, 1j, (), [], {}, set(), object(),
            lambda x: x, type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.tokenize(invalid_input)

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`sequence` must be an instance of `str`.',
                    msg=msg2)

    def test_return_type(self):
        r"""Return `List[str]`."""
        msg = 'Must return `List[str]`.'
        examples = (
            'Hello world!',
            'H',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                tokens = tokenizer.tokenize(sequence)
                self.assertIsInstance(tokens, list, msg=msg)
                for token in tokens:
                    self.assertIsInstance(token, str, msg=msg)

    def test_normalize(self):
        r"""Return sequence is normalized."""
        msg = 'Return sequence must be normalized.'
        examples = (
            (' HeLlO WoRlD!',
             ['H', 'e', 'L', 'l', 'O', ' ', 'W', 'o', 'R', 'l', 'D', '!'],
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            ('HeLlO WoRlD! ',
             ['H', 'e', 'L', 'l', 'O', ' ', 'W', 'o', 'R', 'l', 'D', '!'],
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            (' HeLlO WoRlD! ',
             ['H', 'e', 'L', 'l', 'O', ' ', 'W', 'o', 'R', 'l', 'D', '!'],
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            ('0', ['0'], ['0']),
            ('é',
             [unicodedata.normalize('NFKC', 'é')],
             [unicodedata.normalize('NFKC', 'é')]),
            ('0é',
             [unicodedata.normalize('NFKC', '0'),
              unicodedata.normalize('NFKC', 'é')],
             [unicodedata.normalize('NFKC', '0'),
              unicodedata.normalize('NFKC', 'é')]),
            ('', [], []),
        )

        for sequence, cased_tokens, uncased_tokens in examples:
            self.assertEqual(
                self.cased_tokenizer.tokenize(sequence),
                cased_tokens,
                msg=msg)
            self.assertEqual(
                self.uncased_tokenizer.tokenize(sequence),
                uncased_tokens,
                msg=msg)
class TestVocabSize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharDictTokenizer.vocab_size`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent property signature.'
        self.assertTrue(
            inspect.isdatadescriptor(CharDictTokenizer.vocab_size),
            msg=msg)
        self.assertFalse(
            inspect.isfunction(CharDictTokenizer.vocab_size),
            msg=msg)
        self.assertFalse(
            inspect.ismethod(CharDictTokenizer.vocab_size),
            msg=msg)

    def test_return_type(self):
        r"""Return `int`."""
        msg = 'Must return `int`.'
        for tokenizer in self.tokenizers:
            self.assertIsInstance(tokenizer.vocab_size, int, msg=msg)

    def test_return_value(self):
        r"""Return vocabulary size."""
        msg = 'Inconsistent vocabulary size.'
        for tokenizer in self.tokenizers:
            self.assertEqual(tokenizer.vocab_size, 4, msg=msg)

    def test_increase_vocab_size(self):
        r"""Increase vocabulary size after `build_vocab`."""
        msg = 'Must increase vocabulary size after `build_vocab`.'
        examples = (
            (('HeLlO WoRlD!', 'I aM a LeGeNd.'), 18, 15),
            (('y = f(x)',), 24, 21),
            (('',), 24, 21),
        )
        sp_tokens_size = len(list(CharDictTokenizer.special_tokens()))

        for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
            self.cased_tokenizer.build_vocab(batch_sequences)
            self.assertEqual(
                self.cased_tokenizer.vocab_size,
                cased_vocab_size + sp_tokens_size,
                msg=msg)
            self.uncased_tokenizer.build_vocab(batch_sequences)
            self.assertEqual(
                self.uncased_tokenizer.vocab_size,
                uncased_vocab_size + sp_tokens_size,
                msg=msg)

    def test_reset_vocab_size(self):
        r"""Reset vocabulary size after `reset_vocab`."""
        msg = 'Must reset vocabulary size after `reset_vocab`.'
        examples = (
            ('HeLlO WoRlD!', 'I aM a LeGeNd.'),
            ('y = f(x)',),
            ('',),
        )
        sp_tokens_size = len(list(CharDictTokenizer.special_tokens()))

        for batch_sequences in examples:
            for tokenizer in self.tokenizers:
                tokenizer.build_vocab(batch_sequences)
                tokenizer.reset_vocab()
                self.assertEqual(tokenizer.vocab_size, sp_tokens_size, msg=msg)
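# Illustrative sketch, not the library implementation: `vocab_size` behaves
# like a read-only property over the token-to-id mapping.  The four special
# tokens assumed here (`[bos]`, `[eos]`, `[pad]`, `[unk]`) account for the
# initial size of 4 asserted in `test_return_value`, and a `property` is a
# data descriptor, matching `test_signature`.
class CharVocabSketch:
    """Minimal vocabulary holder with a `vocab_size` property."""

    def __init__(self):
        # Special tokens occupy the first ids; ordinary characters follow.
        self.token_to_id = {'[bos]': 0, '[eos]': 1, '[pad]': 2, '[unk]': 3}

    @property
    def vocab_size(self) -> int:
        return len(self.token_to_id)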
class TestTokenize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharDictTokenizer.tokenize`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'
        self.assertEqual(
            inspect.signature(CharDictTokenizer.tokenize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty),
                ],
                return_annotation=List[str]),
            msg=msg)

    def test_invalid_input(self):
        r"""Raise `TypeError` when input is invalid."""
        msg1 = 'Must raise `TypeError` when input is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            0, 1, -1, 0.0, 1.0, math.nan, math.inf, True, False, b'',
            [], (), {}, set(), object(), lambda x: x, type, None,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.tokenize(invalid_input)

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`sequence` must be instance of `str`.',
                    msg=msg2)

    def test_return_type(self):
        r"""Return `List[str]`."""
        msg = 'Must return `List[str]`.'
        examples = (
            'Hello world!',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                tokens = tokenizer.tokenize(sequence)
                self.assertIsInstance(tokens, list, msg=msg)
                for token in tokens:
                    self.assertIsInstance(token, str, msg=msg)

    def test_unicode_normalize(self):
        r"""Return NFKC normalized characters."""
        msg = 'Must return NFKC normalized characters.'
        examples = (
            ('0', ['0']),
            ('é', ['é']),
            ('0é', ['0', 'é']),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                out_tokens = tokenizer.tokenize(sequence)
                self.assertEqual(out_tokens, ans_tokens, msg=msg)
                for out_token in out_tokens:
                    self.assertEqual(len(out_token), 1, msg=msg)

    def test_cased_sensitive(self):
        r"""Return case-sensitive characters when `is_uncased=False`."""
        msg = ('Return result must be case-sensitive when constructed with '
               '`is_uncased=False`.')
        examples = (
            ('HeLlO WoRlD!',
             ['H', 'e', 'L', 'l', 'O', ' ', 'W', 'o', 'R', 'l', 'D', '!']),
            ('HELLO WORLD!',
             ['H', 'E', 'L', 'L', 'O', ' ', 'W', 'O', 'R', 'L', 'D', '!']),
            ('hello world!',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            ('H', ['H']),
            ('h', ['h']),
        )

        for sequence, ans_tokens in examples:
            self.assertEqual(
                self.cased_tokenizer.tokenize(sequence),
                ans_tokens,
                msg=msg)

    def test_cased_insensitive(self):
        r"""Return case-insensitive characters when `is_uncased=True`."""
        msg = ('Return result must be case-insensitive when constructed with '
               '`is_uncased=True`.')
        examples = (
            ('HeLlO WoRlD!',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            ('HELLO WORLD!',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            ('hello world!',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            ('H', ['h']),
            ('h', ['h']),
        )

        for sequence, ans_tokens in examples:
            self.assertEqual(
                self.uncased_tokenizer.tokenize(sequence),
                ans_tokens,
                msg=msg)

    def test_whitespace_strip(self):
        r"""Strip input sequence."""
        msg = (
            'Input sequence must strip both leading and trailing whitespace '
            'characters.')
        examples = (
            (' hello world! ',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            (' hello world!',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            ('hello world! ',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            ('\nhello world!\n',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']),
            (' ', []),
            ('', []),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.tokenize(sequence),
                    ans_tokens,
                    msg=msg)

    def test_whitespace_collapse(self):
        r"""Collapse whitespace characters."""
        msg = ('Input sequence must convert consecutive whitespace characters '
               'into single whitespace character.')
        # Inputs below use runs of two or more whitespace characters.
        examples = (
            ('hello  world  !',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', ' ', '!']),
            ('hello   world   !',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', ' ', '!']),
            ('hello    world    !',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', ' ', '!']),
            ('hello world\n\n!',
             ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', ' ', '!']),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.tokenize(sequence),
                    ans_tokens,
                    msg=msg)
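# Illustrative sketch, not the library implementation: the behaviour exercised
# above amounts to NFKC unicode normalization, whitespace stripping and
# collapsing, optional lower-casing, then splitting into single characters.
import re
import unicodedata
from typing import List


def normalize_text(sequence: str, is_uncased: bool = False) -> str:
    """NFKC-normalize, strip, collapse whitespace and optionally lower-case."""
    sequence = unicodedata.normalize('NFKC', sequence).strip()
    sequence = re.sub(r'\s+', ' ', sequence)
    return sequence.lower() if is_uncased else sequence


def tokenize_chars(sequence: str, is_uncased: bool = False) -> List[str]:
    """Split a normalized sequence into a list of single characters."""
    return list(normalize_text(sequence, is_uncased))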
class TestDetokenize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharDictTokenizer.detokenize`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'
        self.assertEqual(
            inspect.signature(CharDictTokenizer.detokenize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty),
                    inspect.Parameter(
                        name='tokens',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=Iterable[str],
                        default=inspect.Parameter.empty),
                ],
                return_annotation=str),
            msg=msg)

    def test_invalid_input(self):
        r"""Raise `TypeError` when input is invalid."""
        msg1 = 'Must raise `TypeError` when input is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            0, 1, -1, 0.0, 1.0, math.nan, math.inf, True, False,
            (1, 2, 3), [1, 2, 3], {1, 2, 3}, None,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.detokenize(invalid_input)

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`tokens` must be instance of `Iterable[str]`.',
                    msg=msg2)

    def test_expected_return(self):
        r"""Return expected strings."""
        msg = 'Inconsistent detokenization result.'
        examples = (
            (['H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!'],
             'Hello world!'),
            ([], ''),
        )

        for tokens, ans_sequence in examples:
            for tokenizer in self.tokenizers:
                out_sequence = tokenizer.detokenize(tokens)
                self.assertIsInstance(out_sequence, str, msg=msg)
                self.assertEqual(out_sequence, ans_sequence, msg=msg)

    def test_case_insensitive(self):
        r"""Detokenize does not consider cases."""
        msg = 'Inconsistent detokenization result.'
        examples = (
            ['H', 'e', 'L', 'l', 'O', ' ', 'W', 'o', 'R', 'l', 'D', '!'],
            ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!'],
        )

        for tokens in examples:
            self.assertEqual(
                self.cased_tokenizer.detokenize(tokens),
                self.uncased_tokenizer.detokenize(tokens),
                msg=msg)
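# Illustrative sketch, not the library implementation: for a character-level
# tokenizer, detokenization is plain concatenation, which is why the cased and
# uncased instances above produce identical results.
from typing import Iterable


def detokenize_chars(tokens: Iterable[str]) -> str:
    """Join character tokens back into a single sequence."""
    return ''.join(tokens)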
class TestNormalize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharDictTokenizer.normalize`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'
        self.assertEqual(
            inspect.signature(CharDictTokenizer.normalize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty),
                ],
                return_annotation=str),
            msg=msg)

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, b'', (), [], {}, set(), object(),
            lambda x: x, type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.normalize(sequence=invalid_input)

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`sequence` must be an instance of `str`.',
                    msg=msg2)

    def test_return_type(self):
        r"""Return `str`."""
        msg = 'Must return `str`.'
        examples = (
            'Hello world!',
            'I am a legend.',
            'y = f(x)',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                self.assertIsInstance(
                    tokenizer.normalize(sequence=sequence),
                    str,
                    msg=msg)

    def test_unicode_normalize(self):
        r"""Return NFKC normalized characters."""
        msg = 'Must return NFKC normalized characters.'
        examples = (
            ('0', '0', 1),
            ('é', 'é', 1),
            ('0é', '0é', 2),
        )

        for sequence, normalized_sequence, sequence_len in examples:
            for tokenizer in self.tokenizers:
                out_sequence = tokenizer.normalize(sequence=sequence)
                self.assertEqual(out_sequence, normalized_sequence, msg=msg)
                self.assertEqual(len(out_sequence), sequence_len, msg=msg)

    def test_cased_sensitive(self):
        r"""Return case-sensitive sequence."""
        msg = 'Return sequence must be case-sensitive.'
        examples = (
            ('HeLlO WoRlD!', 'HeLlO WoRlD!', 'hello world!'),
            ('HELLO WORLD!', 'HELLO WORLD!', 'hello world!'),
            ('hello world!', 'hello world!', 'hello world!'),
            ('H', 'H', 'h'),
            ('h', 'h', 'h'),
        )

        for sequence, cased_sequence, uncased_sequence in examples:
            self.assertEqual(
                self.cased_tokenizer.normalize(sequence),
                cased_sequence,
                msg=msg)
            self.assertEqual(
                self.uncased_tokenizer.normalize(sequence),
                uncased_sequence,
                msg=msg)

    def test_whitespace_strip(self):
        r"""Strip input sequence."""
        msg = 'Must strip both leading and trailing whitespace characters.'
        examples = (
            (' hello world!', 'hello world!'),
            ('hello world! ', 'hello world!'),
            (' hello world! ', 'hello world!'),
            ('  hello world!  ', 'hello world!'),
            ('\nhello world!\n', 'hello world!'),
            (' ', ''),
            ('', ''),
        )

        for sequence, stripped_sequence in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.normalize(sequence),
                    stripped_sequence,
                    msg=msg)

    def test_whitespace_collapse(self):
        r"""Collapse whitespace characters."""
        msg = ('Must convert consecutive whitespace characters into single '
               'whitespace character.')
        # Inputs below use runs of two or more whitespace characters.
        examples = (
            ('hello  world  !', 'hello world !'),
            ('hello   world   !', 'hello world !'),
            ('hello    world    !', 'hello world !'),
            ('hello world\n\n!', 'hello world !'),
        )

        for sequence, ans_sequence in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.normalize(sequence),
                    ans_sequence,
                    msg=msg)