def setUp(self): r"""Setup both cased and uncased tokenizer instances.""" self.cased_tokenizer = CharListTokenizer() self.cased_tokenizer.build_vocab(self.__class__.vocab_source) self.uncased_tokenizer = CharListTokenizer(is_uncased=True) self.uncased_tokenizer.build_vocab(self.__class__.vocab_source) self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]
    def test_yield_value(self):
        r"""Return iterator which yields `str`."""
        msg = 'Must return iterator which yields `str`.'
        examples = ('[bos]', '[eos]', '[pad]', '[unk]')

        self.assertIsInstance(
            CharListTokenizer.special_tokens(),
            Iterator,
            msg=msg
        )

        out_tokens = list(CharListTokenizer.special_tokens())

        for i, ans_token in enumerate(examples):
            self.assertIsInstance(out_tokens[i], str, msg=msg)
            self.assertEqual(out_tokens[i], ans_token, msg=msg)

    def test_experiment_does_not_exist(self):
        r"""Raise `FileNotFoundError` when `experiment` does not exist."""
        msg1 = (
            'Must raise `FileNotFoundError` when `experiment` does not exist.'
        )
        msg2 = 'Inconsistent error message.'
        examples = (self.__class__.experiment, 'I-AM-A-TEST-AND-I-DONT-EXIST')

        for experiment in examples:
            with self.assertRaises(FileNotFoundError, msg=msg1) as ctx_man:
                CharListTokenizer.load(experiment=experiment)

            test_path = os.path.join(DATA_PATH, experiment, 'tokenizer.json')
            self.assertEqual(
                ctx_man.exception.args[0],
                f'File {test_path} does not exist.',
                msg=msg2
            )

    def test_cased_sensitive(self):
        r"""Vocabulary must be case sensitive."""
        msg = 'Vocabulary must be case sensitive.'
        examples = (
            (('ABCD', 'abcd'), 8, 4),
            (('efghi', 'EFGHI'), 10, 5),
        )

        sp_tokens_size = len(list(CharListTokenizer.special_tokens()))

        for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
            self.cased_tokenizer.reset_vocab()
            self.cased_tokenizer.build_vocab(batch_sequences=batch_sequences)
            self.assertEqual(
                self.cased_tokenizer.vocab_size,
                cased_vocab_size + sp_tokens_size,
                msg=msg
            )
            self.uncased_tokenizer.reset_vocab()
            self.uncased_tokenizer.build_vocab(batch_sequences=batch_sequences)
            self.assertEqual(
                self.uncased_tokenizer.vocab_size,
                uncased_vocab_size + sp_tokens_size,
                msg=msg
            )

    def test_invalid_input_experiment(self):
        r"""Raise exception when input `experiment` is invalid."""
        msg1 = (
            'Must raise `TypeError` or `ValueError` when input `experiment` '
            'is invalid.'
        )
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, '', b'', (), [], {}, set(), object(),
            lambda x: x, type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            with self.assertRaises(
                    (TypeError, ValueError),
                    msg=msg1
            ) as ctx_man:
                CharListTokenizer.load(experiment=invalid_input)

            if isinstance(ctx_man.exception, TypeError):
                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`experiment` must be an instance of `str`.',
                    msg=msg2
                )
            else:
                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`experiment` must not be empty.',
                    msg=msg2
                )

    def test_load_result(self):
        r"""Load `tokenizer.json`."""
        msg = 'Inconsistent `tokenizer.json` format.'
        examples = (
            {
                'is_uncased': False,
                'token_to_id': {
                    'A': 0,
                    'B': 1,
                    'C': 2,
                },
            },
            {
                'is_uncased': True,
                'token_to_id': {
                    'a': 0,
                    'b': 1,
                    'c': 2,
                },
            },
        )

        test_path = os.path.join(self.__class__.test_dir, 'tokenizer.json')

        for obj in examples:
            try:
                # Create test file.
                with open(test_path, 'w', encoding='utf-8') as output_file:
                    json.dump(obj, output_file)

                tokenizer = CharListTokenizer.load(
                    experiment=self.__class__.experiment
                )

                self.assertIsInstance(tokenizer, CharListTokenizer, msg=msg)

                for attr_key, attr_value in obj.items():
                    self.assertTrue(hasattr(tokenizer, attr_key), msg=msg)
                    self.assertIsInstance(
                        getattr(tokenizer, attr_key),
                        type(attr_value),
                        msg=msg
                    )
                    self.assertEqual(
                        getattr(tokenizer, attr_key),
                        attr_value,
                        msg=msg
                    )
            finally:
                # Clean up test file.
                os.remove(test_path)

    def test_reset_vocab_size(self):
        r"""Reset vocabulary size after `reset_vocab`."""
        msg = 'Must reset vocabulary size after `reset_vocab`.'
        examples = (
            ('HeLlO WoRlD!', 'I aM a LeGeNd.'),
            ('y = f(x)',),
            ('',),
        )

        sp_tokens_size = len(list(CharListTokenizer.special_tokens()))

        for batch_sequences in examples:
            for tokenizer in self.tokenizers:
                tokenizer.build_vocab(batch_sequences)
                tokenizer.reset_vocab()
                self.assertEqual(
                    tokenizer.vocab_size,
                    sp_tokens_size,
                    msg=msg
                )

    def test_invalid_input_is_uncased(self):
        r"""Raise `TypeError` when input `is_uncased` is invalid."""
        msg1 = 'Must raise `TypeError` when input `is_uncased` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf, -math.inf,
            0j, 1j, '', b'', (), [], {}, set(), object(), lambda x: x, type,
            None, NotImplemented, ...,
        )

        for invalid_input in examples:
            with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                CharListTokenizer(is_uncased=invalid_input)

            self.assertEqual(
                ctx_man.exception.args[0],
                '`is_uncased` must be an instance of `bool`.',
                msg=msg2
            )

    def test_increase_vocab_size(self):
        r"""Increase vocabulary size after `build_vocab`."""
        msg = 'Must increase vocabulary size after `build_vocab`.'
        examples = (
            (('HeLlO WoRlD!', 'I aM a LeGeNd.'), 18, 15),
            (('y = f(x)',), 24, 21),
            (('',), 24, 21),
        )

        sp_tokens_size = len(list(CharListTokenizer.special_tokens()))

        for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
            self.cased_tokenizer.build_vocab(batch_sequences)
            self.assertEqual(
                self.cased_tokenizer.vocab_size,
                cased_vocab_size + sp_tokens_size,
                msg=msg
            )
            self.uncased_tokenizer.build_vocab(batch_sequences)
            self.assertEqual(
                self.uncased_tokenizer.vocab_size,
                uncased_vocab_size + sp_tokens_size,
                msg=msg
            )

def setUp(self): r"""Setup both cased and uncased tokenizer instances.""" self.cased_tokenizer = CharListTokenizer() self.uncased_tokenizer = CharListTokenizer(is_uncased=True) self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]
class TestTokenize(unittest.TestCase):
    r"""Test Case for `lmp.tokenizer.CharListTokenizer.tokenize`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharListTokenizer()
        self.uncased_tokenizer = CharListTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(CharListTokenizer.tokenize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty
                    ),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty
                    )
                ],
                return_annotation=List[str]
            ),
            msg=msg
        )

    def test_invalid_input(self):
        r"""Raise `TypeError` when input is invalid."""
        msg1 = 'Must raise `TypeError` when input is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            0, 1, -1, 0.0, 1.0, math.nan, math.inf, True, False, b'', [],
            (), {}, set(), object(), lambda x: x, type, None,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.tokenize(invalid_input)

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`sequence` must be instance of `str`.',
                    msg=msg2
                )

    def test_return_type(self):
        r"""Return `List[str]`."""
        msg = 'Must return `List[str]`.'
        examples = (
            'Hello world!',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                tokens = tokenizer.tokenize(sequence)
                self.assertIsInstance(tokens, list, msg=msg)
                for token in tokens:
                    self.assertIsInstance(token, str, msg=msg)

    def test_unicode_normalize(self):
        r"""Return NFKC normalized characters."""
        msg = 'Must return NFKC normalized characters.'
        examples = (
            ('0', ['0']),
            ('é', ['é']),
            ('0é', ['0', 'é']),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                out_tokens = tokenizer.tokenize(sequence)
                self.assertEqual(out_tokens, ans_tokens, msg=msg)
                for out_token in out_tokens:
                    self.assertEqual(len(out_token), 1, msg=msg)

    def test_cased_sensitive(self):
        r"""Return cased sensitive characters when `is_uncased=False`."""
        msg = (
            'Return result must be case-sensitive when constructed with '
            '`is_uncased=False`.'
        )
        examples = (
            (
                'HeLlO WoRlD!',
                ['H', 'e', 'L', 'l', 'O', ' ', 'W', 'o', 'R', 'l', 'D', '!']
            ),
            (
                'HELLO WORLD!',
                ['H', 'E', 'L', 'L', 'O', ' ', 'W', 'O', 'R', 'L', 'D', '!']
            ),
            (
                'hello world!',
                ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']
            ),
            ('H', ['H']),
            ('h', ['h']),
        )

        for sequence, ans_tokens in examples:
            self.assertEqual(
                self.cased_tokenizer.tokenize(sequence),
                ans_tokens,
                msg=msg
            )

    def test_cased_insensitive(self):
        r"""Return cased insensitive characters when `is_uncased=True`."""
        msg = (
            'Return result must be case-insensitive when constructed with '
            '`is_uncased=True`.'
        )
        examples = (
            (
                'HeLlO WoRlD!',
                ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']
            ),
            (
                'HELLO WORLD!',
                ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']
            ),
            (
                'hello world!',
                ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']
            ),
            ('H', ['h']),
            ('h', ['h']),
        )

        for sequence, ans_tokens in examples:
            self.assertEqual(
                self.uncased_tokenizer.tokenize(sequence),
                ans_tokens,
                msg=msg
            )

    def test_whitespace_strip(self):
        r"""Strip input sequence."""
        msg = (
            'Input sequence must strip both leading and trailing whitespace '
            'characters.'
        )
        examples = (
            (
                ' hello world! ',
                ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']
            ),
            (
                ' hello world!',
                ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']
            ),
            (
                'hello world! ',
                ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']
            ),
            (
                '\nhello world!\n',
                ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']
            ),
            (' ', []),
            ('', []),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.tokenize(sequence),
                    ans_tokens,
                    msg=msg
                )

    def test_whitespace_collapse(self):
        r"""Collapse whitespace characters."""
        msg = (
            'Input sequence must convert consecutive whitespace characters '
            'into single whitespace character.'
        )
        examples = (
            (
                'hello world  !',
                ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', ' ',
                 '!']
            ),
            (
                'hello world   !',
                ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', ' ',
                 '!']
            ),
            (
                'hello world    !',
                ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', ' ',
                 '!']
            ),
            (
                'hello world\n\n!',
                ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', ' ',
                 '!']
            ),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.tokenize(sequence),
                    ans_tokens,
                    msg=msg
                )


class TestNormalize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharListTokenizer.normalize`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharListTokenizer()
        self.uncased_tokenizer = CharListTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(CharListTokenizer.normalize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty
                    ),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty
                    )
                ],
                return_annotation=str
            ),
            msg=msg
        )

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, b'', (), [], {}, set(), object(), lambda x: x,
            type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.normalize(sequence=invalid_input)

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`sequence` must be an instance of `str`.',
                    msg=msg2
                )

    def test_return_type(self):
        r"""Return `str`."""
        msg = 'Must return `str`.'
        examples = (
            'Hello world!',
            'I am a legend.',
            'y = f(x)',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                self.assertIsInstance(
                    tokenizer.normalize(sequence=sequence),
                    str,
                    msg=msg
                )

    def test_unicode_normalize(self):
        r"""Return NFKC normalized characters."""
        msg = 'Must return NFKC normalized characters.'
        examples = (
            ('0', '0', 1),
            ('é', 'é', 1),
            ('0é', '0é', 2),
        )

        for sequence, normalized_sequence, sequence_len in examples:
            for tokenizer in self.tokenizers:
                out_sequence = tokenizer.normalize(sequence=sequence)
                self.assertEqual(out_sequence, normalized_sequence, msg=msg)
                self.assertEqual(len(out_sequence), sequence_len, msg=msg)

    def test_cased_sensitive(self):
        r"""Return cased sensitive sequence."""
        msg = 'Return sequence must be cased sensitive.'
        examples = (
            ('HeLlO WoRlD!', 'HeLlO WoRlD!', 'hello world!'),
            ('HELLO WORLD!', 'HELLO WORLD!', 'hello world!'),
            ('hello world!', 'hello world!', 'hello world!'),
            ('H', 'H', 'h'),
            ('h', 'h', 'h'),
        )

        for sequence, cased_sequence, uncased_sequence in examples:
            self.assertEqual(
                self.cased_tokenizer.normalize(sequence),
                cased_sequence,
                msg=msg
            )
            self.assertEqual(
                self.uncased_tokenizer.normalize(sequence),
                uncased_sequence,
                msg=msg
            )

    def test_whitespace_strip(self):
        r"""Strip input sequence."""
        msg = 'Must strip both leading and trailing whitespace characters.'
        examples = (
            (' hello world!', 'hello world!'),
            ('hello world! ', 'hello world!'),
            (' hello world! ', 'hello world!'),
            ('  hello world!  ', 'hello world!'),
            ('\nhello world!\n', 'hello world!'),
            (' ', ''),
            ('', ''),
        )

        for sequence, stripped_sequence in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.normalize(sequence),
                    stripped_sequence,
                    msg=msg
                )

    def test_whitespace_collapse(self):
        r"""Collapse whitespace characters."""
        msg = (
            'Must convert consecutive whitespace characters into single '
            'whitespace character.'
        )
        examples = (
            ('hello world  !', 'hello world !'),
            ('hello world   !', 'hello world !'),
            ('hello world    !', 'hello world !'),
            ('hello world\n\n!', 'hello world !'),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.normalize(sequence),
                    ans_tokens,
                    msg=msg
                )


class TestEncode(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharListTokenizer.encode`."""

    @classmethod
    def setUpClass(cls):
        cls.vocab_source = [
            'Hello World!',
            'I am a legend.',
        ]

    @classmethod
    def tearDownClass(cls):
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharListTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = CharListTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(CharListTokenizer.encode),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                    ),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty
                    ),
                    inspect.Parameter(
                        name='max_seq_len',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=int,
                        default=-1
                    )
                ],
                return_annotation=List[int]
            ),
            msg=msg
        )

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, b'', (), [], {}, set(), object(), lambda x: x,
            type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.encode(sequence=invalid_input)

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`sequence` must be an instance of `str`.',
                    msg=msg2
                )

    def test_invalid_input_max_seq_len(self):
        r"""Raise exception when input `max_seq_len` is invalid."""
        msg1 = (
            'Must raise `TypeError` or `ValueError` when input `max_seq_len` '
            'is invalid.'
        )
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -2, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, '', b'', (), [], {}, set(), object(),
            lambda x: x, type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(
                        (TypeError, ValueError),
                        msg=msg1
                ) as cxt_man:
                    tokenizer.encode(sequence='', max_seq_len=invalid_input)

                if isinstance(cxt_man.exception, TypeError):
                    self.assertEqual(
                        cxt_man.exception.args[0],
                        '`max_seq_len` must be an instance of `int`.',
                        msg=msg2
                    )
                else:
                    self.assertEqual(
                        cxt_man.exception.args[0],
                        '`max_seq_len` must be greater than `1` or equal to '
                        '`-1`.',
                        msg=msg2
                    )

    def test_return_type(self):
        r"""Return `List[int]`."""
        msg = 'Must return `List[int]`.'
        examples = (
            'Hello world!',
            'I am a legend.',
            'y = f(x)',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                token_ids = tokenizer.encode(sequence=sequence)
                self.assertIsInstance(token_ids, list, msg=msg)
                for token_id in token_ids:
                    self.assertIsInstance(token_id, int, msg=msg)

    def test_encode_format(self):
        r"""Follow encode format."""
        msg = 'Must follow encode format: [bos] t1 t2 ... tn [eos].'
        examples = (
            (
                'Hello World!',
                [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1],
            ),
            (
                'I am a legend.',
                [0, 14, 5, 9, 15, 5, 9, 5, 4, 6, 16, 6, 17, 8, 18, 1],
            ),
            (
                'y = f(x)',
                [0, 3, 5, 3, 5, 3, 3, 3, 3, 1],
            ),
            (
                '',
                [0, 1],
            ),
        )

        for sequence, token_ids in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.encode(sequence=sequence),
                    token_ids,
                    msg=msg
                )

    def test_truncate(self):
        r"""Token ids' length must not exceed `max_seq_len`."""
        msg = 'Token ids\' length must not exceed `max_seq_len`.'
        examples = (
            (
                'Hello World!',
                [0, 10, 6, 4, 4, 7, 5, 11, 7, 1],
                10,
            ),
            (
                'I am a legend.',
                [0, 14, 5, 9, 1],
                5,
            ),
            (
                'y = f(x)',
                [0, 3, 1],
                3,
            ),
            (
                '',
                [0, 1],
                2,
            ),
        )

        for sequence, token_ids, max_seq_len in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.encode(
                        sequence=sequence,
                        max_seq_len=max_seq_len
                    ),
                    token_ids,
                    msg=msg
                )

    def test_padding(self):
        r"""Token ids' length must pad to `max_seq_len`."""
        msg = 'Token ids\' length must pad to `max_seq_len`.'
        examples = (
            (
                'Hello World!',
                [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1, 2],
                15,
            ),
            (
                'I am a legend.',
                [
                    0, 14, 5, 9, 15, 5, 9, 5, 4, 6,
                    16, 6, 17, 8, 18, 1, 2, 2, 2, 2,
                ],
                20,
            ),
            (
                'y = f(x)',
                [
                    0, 3, 5, 3, 5, 3, 3, 3, 3, 1,
                    2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                ],
                20,
            ),
            (
                '',
                [0, 1, 2, 2, 2, 2, 2, 2, 2, 2],
                10,
            ),
        )

        for sequence, token_ids, max_seq_len in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.encode(
                        sequence=sequence,
                        max_seq_len=max_seq_len
                    ),
                    token_ids,
                    msg=msg
                )


class TestVocabSize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharListTokenizer.vocab_size`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharListTokenizer()
        self.uncased_tokenizer = CharListTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent property signature.'

        self.assertTrue(
            inspect.isdatadescriptor(CharListTokenizer.vocab_size),
            msg=msg
        )
        self.assertFalse(
            inspect.isfunction(CharListTokenizer.vocab_size),
            msg=msg
        )
        self.assertFalse(
            inspect.ismethod(CharListTokenizer.vocab_size),
            msg=msg
        )

    def test_return_type(self):
        r"""Return `int`."""
        msg = 'Must return `int`.'

        for tokenizer in self.tokenizers:
            self.assertIsInstance(tokenizer.vocab_size, int, msg=msg)

    def test_return_value(self):
        r"""Return vocabulary size."""
        msg = 'Inconsistent vocabulary size.'

        for tokenizer in self.tokenizers:
            self.assertEqual(tokenizer.vocab_size, 4, msg=msg)

    def test_increase_vocab_size(self):
        r"""Increase vocabulary size after `build_vocab`."""
        msg = 'Must increase vocabulary size after `build_vocab`.'
        examples = (
            (('HeLlO WoRlD!', 'I aM a LeGeNd.'), 18, 15),
            (('y = f(x)',), 24, 21),
            (('',), 24, 21),
        )

        sp_tokens_size = len(list(CharListTokenizer.special_tokens()))

        for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
            self.cased_tokenizer.build_vocab(batch_sequences)
            self.assertEqual(
                self.cased_tokenizer.vocab_size,
                cased_vocab_size + sp_tokens_size,
                msg=msg
            )
            self.uncased_tokenizer.build_vocab(batch_sequences)
            self.assertEqual(
                self.uncased_tokenizer.vocab_size,
                uncased_vocab_size + sp_tokens_size,
                msg=msg
            )

    def test_reset_vocab_size(self):
        r"""Reset vocabulary size after `reset_vocab`."""
        msg = 'Must reset vocabulary size after `reset_vocab`.'
        examples = (
            ('HeLlO WoRlD!', 'I aM a LeGeNd.'),
            ('y = f(x)',),
            ('',),
        )

        sp_tokens_size = len(list(CharListTokenizer.special_tokens()))

        for batch_sequences in examples:
            for tokenizer in self.tokenizers:
                tokenizer.build_vocab(batch_sequences)
                tokenizer.reset_vocab()
                self.assertEqual(
                    tokenizer.vocab_size,
                    sp_tokens_size,
                    msg=msg
                )


class TestBatchDecode(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharListTokenizer.batch_decode`."""

    @classmethod
    def setUpClass(cls):
        cls.vocab_source = [
            'Hello World!',
            'I am a legend.',
        ]

    @classmethod
    def tearDownClass(cls):
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharListTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = CharListTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(CharListTokenizer.batch_decode),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                    ),
                    inspect.Parameter(
                        name='batch_token_ids',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=Iterable[Iterable[int]],
                        default=inspect.Parameter.empty
                    ),
                    inspect.Parameter(
                        name='remove_special_tokens',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=bool,
                        default=False
                    )
                ],
                return_annotation=List[str]
            ),
            msg=msg
        )

    def test_invalid_input_batch_token_ids(self):
        r"""Raise `TypeError` when input `batch_token_ids` is invalid."""
        msg1 = (
            'Must raise `TypeError` when input `batch_token_ids` is invalid.'
        )
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, object(), lambda x: x, type, None,
            NotImplemented, ...,
            [False], [True], [0], [1], [-1], [0.0], [1.0], [math.nan],
            [-math.nan], [math.inf], [-math.inf], [0j], [1j], [object()],
            [lambda x: x], [type], [None], [NotImplemented], [...],
            [[], False], [[], True], [[], 0], [[], 1], [[], -1], [[], 0.0],
            [[], 1.0], [[], math.nan], [[], -math.nan], [[], math.inf],
            [[], -math.inf], [[], 0j], [[], 1j], [[], object()],
            [[], lambda x: x], [[], type], [[], None], [[], NotImplemented],
            [[], ...],
            [[0.0]], [[1.0]], [[math.nan]], [[-math.nan]], [[math.inf]],
            [[-math.inf]], [[0j]], [[1j]], [['']], [[b'']], [[()]], [[[]]],
            [[{}]], [[set()]], [[object()]], [[lambda x: x]], [[type]],
            [[None]], [[NotImplemented]], [[...]],
            [[0, 0.0]], [[0, 1.0]], [[0, math.nan]], [[0, -math.nan]],
            [[0, math.inf]], [[0, -math.inf]], [[0, 0j]], [[0, 1j]],
            [[0, '']], [[0, b'']], [[0, ()]], [[0, []]], [[0, {}]],
            [[0, set()]], [[0, object()]], [[0, lambda x: x]], [[0, type]],
            [[0, None]], [[0, NotImplemented]], [[0, ...]],
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.batch_decode(batch_token_ids=invalid_input)

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`batch_token_ids` must be an instance of '
                    '`Iterable[Iterable[int]]`.',
                    msg=msg2
                )

    def test_invalid_input_remove_special_tokens(self):
        r"""Raise `TypeError` when input `remove_special_tokens` is invalid."""
        msg1 = (
            'Must raise `TypeError` when input `remove_special_tokens` is '
            'invalid.'
        )
        msg2 = 'Inconsistent error message.'
        examples = (
            0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf, -math.inf,
            0j, 1j, '', b'', 0j, 1j, (), [], {}, set(), object(),
            lambda x: x, type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.batch_decode(
                        batch_token_ids=[[]],
                        remove_special_tokens=invalid_input
                    )

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`remove_special_tokens` must be an instance of `bool`.',
                    msg=msg2
                )

    def test_return_type(self):
        r"""Return `List[str]`."""
        msg = 'Must return `List[str]`.'
        examples = (
            [[0, 1, 2, 3], [4, 5, 6, 7, 8]],
            [[9, 10, 11, 12, 13], []],
            [[], [14, 15, 16, 17]],
            [[], []],
            [],
        )

        for batch_token_ids in examples:
            for tokenizer in self.tokenizers:
                batch_sequences = tokenizer.batch_decode(
                    batch_token_ids=batch_token_ids
                )
                self.assertIsInstance(batch_sequences, list, msg=msg)
                for sequence in batch_sequences:
                    self.assertIsInstance(sequence, str, msg=msg)

    def test_remove_special_tokens(self):
        r"""Remove special tokens."""
        msg = 'Must remove special tokens.'
        examples = (
            (
                False,
                [
                    [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1, 2],
                    [0, 14, 5, 9, 15, 5, 9, 5, 3, 1, 2, 2],
                    [0, 19, 4, 6, 16, 6, 17, 8, 18, 1],
                ],
                [
                    '[bos]Hello World![eos][pad]',
                    '[bos]I am a [unk][eos][pad][pad]',
                    '[bos][unk]legend.[eos]',
                ],
                [
                    '[bos]hello world![eos][pad]',
                    '[bos]i am a [unk][eos][pad][pad]',
                    '[bos][unk]legend.[eos]',
                ],
            ),
            (
                True,
                [
                    [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1, 2],
                    [0, 14, 5, 9, 15, 5, 9, 5, 3, 1, 2, 2],
                    [0, 19, 4, 6, 16, 6, 17, 8, 18, 1],
                ],
                [
                    'Hello World!',
                    'I am a [unk]',
                    '[unk]legend.',
                ],
                [
                    'hello world!',
                    'i am a [unk]',
                    '[unk]legend.',
                ],
            ),
        )

        for (
                remove_special_tokens,
                batch_token_ids,
                cased_batch_sequence,
                uncased_batch_sequence
        ) in examples:
            self.assertEqual(
                self.cased_tokenizer.batch_decode(
                    batch_token_ids=batch_token_ids,
                    remove_special_tokens=remove_special_tokens
                ),
                cased_batch_sequence,
                msg=msg
            )
            self.assertEqual(
                self.uncased_tokenizer.batch_decode(
                    batch_token_ids=batch_token_ids,
                    remove_special_tokens=remove_special_tokens
                ),
                uncased_batch_sequence,
                msg=msg
            )


class TestTokenize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharListTokenizer.tokenize`."""

    @classmethod
    def setUpClass(cls):
        cls.vocab_source = [
            'Hello World!',
            'I am a legend.',
        ]

    @classmethod
    def tearDownClass(cls):
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharListTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = CharListTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(CharListTokenizer.tokenize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty
                    ),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty
                    )
                ],
                return_annotation=List[str]
            ),
            msg=msg
        )

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, b'', 0j, 1j, (), [], {}, set(), object(), lambda x: x,
            type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.tokenize(invalid_input)

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`sequence` must be an instance of `str`.',
                    msg=msg2
                )

    def test_return_type(self):
        r"""Return `List[str]`."""
        msg = 'Must return `List[str]`.'
        examples = (
            'Hello world!',
            'H',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                tokens = tokenizer.tokenize(sequence)
                self.assertIsInstance(tokens, list, msg=msg)
                for token in tokens:
                    self.assertIsInstance(token, str, msg=msg)

    def test_normalize(self):
        r"""Return sequence is normalized."""
        msg = 'Return sequence must be normalized.'
        examples = (
            (
                ' HeLlO WoRlD!',
                [
                    'H', 'e', 'L', 'l', 'O', ' ', 'W', 'o', 'R', 'l', 'D',
                    '!',
                ],
                [
                    'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd',
                    '!',
                ],
            ),
            (
                'HeLlO WoRlD! ',
                [
                    'H', 'e', 'L', 'l', 'O', ' ', 'W', 'o', 'R', 'l', 'D',
                    '!',
                ],
                [
                    'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd',
                    '!',
                ],
            ),
            (
                ' HeLlO WoRlD! ',
                [
                    'H', 'e', 'L', 'l', 'O', ' ', 'W', 'o', 'R', 'l', 'D',
                    '!',
                ],
                [
                    'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd',
                    '!',
                ],
            ),
            (
                '0',
                ['0'],
                ['0'],
            ),
            (
                'é',
                [unicodedata.normalize('NFKC', 'é')],
                [unicodedata.normalize('NFKC', 'é')],
            ),
            (
                '0é',
                [
                    unicodedata.normalize('NFKC', '0'),
                    unicodedata.normalize('NFKC', 'é'),
                ],
                [
                    unicodedata.normalize('NFKC', '0'),
                    unicodedata.normalize('NFKC', 'é'),
                ],
            ),
            (
                '',
                [],
                [],
            ),
        )

        for sequence, cased_tokens, uncased_tokens in examples:
            self.assertEqual(
                self.cased_tokenizer.tokenize(sequence),
                cased_tokens,
                msg=msg
            )
            self.assertEqual(
                self.uncased_tokenizer.tokenize(sequence),
                uncased_tokens,
                msg=msg
            )


class TestBuildVocab(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharListTokenizer.build_vocab`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharListTokenizer()
        self.uncased_tokenizer = CharListTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(CharListTokenizer.build_vocab),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty
                    ),
                    inspect.Parameter(
                        name='batch_sequences',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=Iterable[str],
                        default=inspect.Parameter.empty
                    ),
                    inspect.Parameter(
                        name='min_count',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=int,
                        default=1
                    ),
                ],
                return_annotation=None
            ),
            msg=msg
        )

    def test_invalid_input_batch_sequences(self):
        r"""Raise `TypeError` when input `batch_sequences` is invalid."""
        msg1 = (
            'Must raise `TypeError` when input `batch_sequences` is invalid.'
        )
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, object(), lambda x: x, type, None,
            NotImplemented, ...,
            [False], [True], [0], [1], [-1], [0.0], [1.0], [math.nan],
            [-math.nan], [math.inf], [-math.inf], [0j], [1j], [b''], [()],
            [[]], [{}], [set()], [object()], [lambda x: x], [type], [None],
            [NotImplemented], [...],
            ['', False], ['', True], ['', 0], ['', 1], ['', -1], ['', 0.0],
            ['', 1.0], ['', math.nan], ['', -math.nan], ['', math.inf],
            ['', -math.inf], ['', 0j], ['', 1j], ['', b''], ['', ()],
            ['', []], ['', {}], ['', set()], ['', object()],
            ['', lambda x: x], ['', type], ['', None], ['', NotImplemented],
            ['', ...],
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.build_vocab(batch_sequences=invalid_input)

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`batch_sequences` must be an instance of '
                    '`Iterable[str]`.',
                    msg=msg2
                )

    def test_invalid_input_min_count(self):
        r"""Raise `TypeError` when input `min_count` is invalid."""
        msg1 = 'Must raise `TypeError` when input `min_count` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            0.0, 1.0, math.nan, -math.nan, math.inf, -math.inf, 0j, 1j, '',
            b'', (), [], {}, set(), object(), lambda x: x, type, None,
            NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.build_vocab(
                        batch_sequences=[],
                        min_count=invalid_input
                    )

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`min_count` must be an instance of `int`.',
                    msg=msg2
                )

    def test_cased_sensitive(self):
        r"""Vocabulary must be case sensitive."""
        msg = 'Vocabulary must be case sensitive.'
        examples = (
            (('ABCD', 'abcd'), 8, 4),
            (('efghi', 'EFGHI'), 10, 5),
        )

        sp_tokens_size = len(list(CharListTokenizer.special_tokens()))

        for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
            self.cased_tokenizer.reset_vocab()
            self.cased_tokenizer.build_vocab(batch_sequences=batch_sequences)
            self.assertEqual(
                self.cased_tokenizer.vocab_size,
                cased_vocab_size + sp_tokens_size,
                msg=msg
            )
            self.uncased_tokenizer.reset_vocab()
            self.uncased_tokenizer.build_vocab(batch_sequences=batch_sequences)
            self.assertEqual(
                self.uncased_tokenizer.vocab_size,
                uncased_vocab_size + sp_tokens_size,
                msg=msg
            )

    def test_sort_by_token_frequency_in_descending_order(self):
        r"""Sort vocabulary by token frequency in descending order."""
        msg = (
            'Must sort vocabulary by token frequency in descending order.'
        )
        examples = (
            (
                ('AaAa', 'bBb', 'cC', 'd'),
                ('A', 'a', 'b', 'B', 'c', 'C', 'd'),
                ('a', 'b', 'c', 'd'),
            ),
            (
                ('EeEeE', 'FfFf', 'GgG', 'Hh', 'I'),
                ('E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I'),
                ('e', 'f', 'g', 'h', 'i'),
            ),
        )

        for (
                batch_sequences,
                cased_vocab_order,
                uncased_vocab_order
        ) in examples:
            self.cased_tokenizer.reset_vocab()
            self.cased_tokenizer.build_vocab(batch_sequences=batch_sequences)

            for (
                    vocab1,
                    vocab2
            ) in zip(cased_vocab_order[:-1], cased_vocab_order[1:]):
                self.assertLessEqual(
                    self.cased_tokenizer.convert_token_to_id(vocab1),
                    self.cased_tokenizer.convert_token_to_id(vocab2),
                    msg=msg
                )

            self.uncased_tokenizer.reset_vocab()
            self.uncased_tokenizer.build_vocab(batch_sequences=batch_sequences)

            for (
                    vocab1,
                    vocab2
            ) in zip(uncased_vocab_order[:-1], uncased_vocab_order[1:]):
                self.assertLessEqual(
                    self.uncased_tokenizer.convert_token_to_id(vocab1),
                    self.uncased_tokenizer.convert_token_to_id(vocab2),
                    msg=msg
                )

    def test_min_count(self):
        r"""Filter out tokens whose frequency is smaller than `min_count`."""
        msg = (
            'Must filter out tokens whose frequency is smaller than '
            '`min_count`.'
        )
        examples = (
            (
                ('AaAa', 'bBb', 'cC', 'd'),
                ('A', 'a', 'b'),
                ('B', 'c', 'C', 'd'),
                ('a', 'b', 'c'),
                ('d'),
                2,
            ),
            (
                ('EeEeE', 'FfFf', 'GgG', 'Hh', 'I'),
                ('E'),
                ('e', 'F', 'f', 'G', 'g', 'H', 'h', 'I'),
                ('e', 'f', 'g'),
                ('h', 'i'),
                3,
            ),
            (
                ('EeEeE', 'FfFf', 'GgG', 'Hh', 'I'),
                (),
                ('E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I'),
                (),
                ('e', 'f', 'g', 'h', 'i'),
                10,
            ),
        )

        for (
                batch_sequences,
                cased_known_token,
                cased_unknown_token,
                uncased_known_token,
                uncased_unknown_token,
                min_count
        ) in examples:
            self.cased_tokenizer.reset_vocab()
            self.cased_tokenizer.build_vocab(
                batch_sequences=batch_sequences,
                min_count=min_count
            )

            for token in cased_known_token:
                token_id = self.cased_tokenizer.convert_token_to_id(token)
                self.assertEqual(
                    token,
                    self.cased_tokenizer.convert_id_to_token(token_id),
                    msg=msg
                )

            unk_token_id = self.cased_tokenizer.convert_token_to_id(
                CharListTokenizer.unk_token
            )

            for unk_token in cased_unknown_token:
                self.assertEqual(
                    self.cased_tokenizer.convert_token_to_id(unk_token),
                    unk_token_id,
                    msg=msg
                )

            self.uncased_tokenizer.reset_vocab()
            self.uncased_tokenizer.build_vocab(
                batch_sequences=batch_sequences,
                min_count=min_count
            )

            for token in uncased_known_token:
                token_id = self.uncased_tokenizer.convert_token_to_id(token)
                self.assertEqual(
                    token,
                    self.uncased_tokenizer.convert_id_to_token(token_id),
                    msg=msg
                )

            unk_token_id = self.uncased_tokenizer.convert_token_to_id(
                CharListTokenizer.unk_token
            )

            for unk_token in uncased_unknown_token:
                self.assertEqual(
                    self.uncased_tokenizer.convert_token_to_id(unk_token),
                    unk_token_id,
                    msg=msg
                )


class TestDetokenize(unittest.TestCase):
    r"""Test Case for `lmp.tokenizer.CharListTokenizer.detokenize`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharListTokenizer()
        self.uncased_tokenizer = CharListTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(CharListTokenizer.detokenize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty
                    ),
                    inspect.Parameter(
                        name='tokens',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=Iterable[str],
                        default=inspect.Parameter.empty
                    )
                ],
                return_annotation=str
            ),
            msg=msg
        )

    def test_invalid_input(self):
        r"""Raise `TypeError` when input is invalid."""
        msg1 = 'Must raise `TypeError` when input is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            0, 1, -1, 0.0, 1.0, math.nan, math.inf, True, False,
            (1, 2, 3), [1, 2, 3], {1, 2, 3}, None,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.detokenize(invalid_input)

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`tokens` must be instance of `Iterable[str]`.',
                    msg=msg2
                )

    def test_expected_return(self):
        r"""Return expected strings."""
        msg = 'Inconsistent detokenization result.'
        examples = (
            (
                ['H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!'],
                'Hello world!'
            ),
            ([], ''),
        )

        for tokens, ans_sequence in examples:
            for tokenizer in self.tokenizers:
                out_sequence = tokenizer.detokenize(tokens)
                self.assertIsInstance(out_sequence, str, msg=msg)
                self.assertEqual(out_sequence, ans_sequence, msg=msg)

    def test_case_insensitive(self):
        r"""Detokenize does not consider cases."""
        msg = 'Inconsistent detokenization result.'
        examples = (
            ['H', 'e', 'L', 'l', 'O', ' ', 'W', 'o', 'R', 'l', 'D', '!'],
            ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!'],
        )

        for tokens in examples:
            self.assertEqual(
                self.cased_tokenizer.detokenize(tokens),
                self.uncased_tokenizer.detokenize(tokens),
                msg=msg
            )
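

# The classes above are plain `unittest` test cases. As a minimal sketch
# (assuming the original module does not already define an entry point
# elsewhere), the standard guard below lets the file be run directly with
# `python path/to/this_file.py`; test runners such as `python -m unittest`
# or `pytest` simply ignore it.
if __name__ == '__main__':
    unittest.main()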