def setUp(self):
    r"""Setup both cased and uncased tokenizer instances."""
    self.cased_tokenizer = WhitespaceListTokenizer()
    self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
    self.uncased_tokenizer = WhitespaceListTokenizer(is_uncased=True)
    self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
    self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

def test_yield_value(self):
    r"""Return iterator which yields `str`."""
    msg = 'Must return iterator which yields `str`.'
    examples = ('[bos]', '[eos]', '[pad]', '[unk]')

    self.assertIsInstance(
        WhitespaceListTokenizer.special_tokens(),
        Iterator,
        msg=msg)

    out_tokens = list(WhitespaceListTokenizer.special_tokens())

    for i, ans_token in enumerate(examples):
        self.assertIsInstance(out_tokens[i], str, msg=msg)
        self.assertEqual(out_tokens[i], ans_token, msg=msg)

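# Usage sketch (illustrative only, not executed by the test suite): the
# special tokens and their order mirror `examples` above.
#
#     list(WhitespaceListTokenizer.special_tokens())
#     # ['[bos]', '[eos]', '[pad]', '[unk]']
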
def test_experiment_does_not_exist(self):
    r"""Raise `FileNotFoundError` when `experiment` does not exist."""
    msg1 = (
        'Must raise `FileNotFoundError` when `experiment` does not exist.')
    msg2 = 'Inconsistent error message.'
    examples = (self.__class__.experiment, 'I-AM-A-TEST-AND-I-DONT-EXIST')

    for experiment in examples:
        with self.assertRaises(FileNotFoundError, msg=msg1) as ctx_man:
            WhitespaceListTokenizer.load(experiment=experiment)

        test_path = os.path.join(DATA_PATH, experiment, 'tokenizer.json')
        self.assertEqual(
            ctx_man.exception.args[0],
            f'File {test_path} does not exist.',
            msg=msg2)

def test_invalid_input_experiment(self):
    r"""Raise exception when input `experiment` is invalid."""
    msg1 = (
        'Must raise `TypeError` or `ValueError` when input `experiment` '
        'is invalid.')
    msg2 = 'Inconsistent error message.'
    examples = (
        False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
        -math.inf, 0j, 1j, '', b'', (), [], {}, set(), object(),
        lambda x: x, type, None, NotImplemented, ...,
    )

    for invalid_input in examples:
        with self.assertRaises(
                (TypeError, ValueError), msg=msg1) as ctx_man:
            WhitespaceListTokenizer.load(experiment=invalid_input)

        if isinstance(ctx_man.exception, TypeError):
            self.assertEqual(
                ctx_man.exception.args[0],
                '`experiment` must be an instance of `str`.',
                msg=msg2)
        else:
            self.assertEqual(
                ctx_man.exception.args[0],
                '`experiment` must not be empty.',
                msg=msg2)

def test_reset_vocab_size(self):
    r"""Reset vocabulary size after `reset_vocab`."""
    msg = 'Must reset vocabulary size after `reset_vocab`.'
    examples = (
        ('HeLlO WoRlD!', 'I aM a LeGeNd.'),
        ('y = f(x)',),
        ('',),
    )
    sp_tokens_size = len(list(WhitespaceListTokenizer.special_tokens()))

    for batch_sequences in examples:
        for tokenizer in self.tokenizers:
            tokenizer.build_vocab(batch_sequences)
            tokenizer.reset_vocab()
            self.assertEqual(tokenizer.vocab_size, sp_tokens_size, msg=msg)

def test_increase_vocab_size(self):
    r"""Increase vocabulary size after `build_vocab`."""
    msg = 'Must increase vocabulary size after `build_vocab`.'
    examples = (
        (('Hello World !', 'I am a LEGEND .', 'Hello legend !'), 9, 8),
        (('y = f(x)',), 12, 11),
        (('',), 12, 11),
    )
    sp_tokens_size = len(list(WhitespaceListTokenizer.special_tokens()))

    for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
        self.cased_tokenizer.build_vocab(batch_sequences)
        self.assertEqual(
            self.cased_tokenizer.vocab_size,
            cased_vocab_size + sp_tokens_size,
            msg=msg)
        self.uncased_tokenizer.build_vocab(batch_sequences)
        self.assertEqual(
            self.uncased_tokenizer.vocab_size,
            uncased_vocab_size + sp_tokens_size,
            msg=msg)

def test_cased_sensitive(self):
    r"""Vocabulary must be case sensitive."""
    msg = 'Vocabulary must be case sensitive.'
    examples = (
        (('A B C D', 'a b c d'), 8, 4),
        (('e f g h i', 'E F G H I'), 10, 5),
    )
    sp_tokens_size = len(list(WhitespaceListTokenizer.special_tokens()))

    for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
        self.cased_tokenizer.reset_vocab()
        self.cased_tokenizer.build_vocab(batch_sequences=batch_sequences)
        self.assertEqual(
            self.cased_tokenizer.vocab_size,
            cased_vocab_size + sp_tokens_size,
            msg=msg)

        self.uncased_tokenizer.reset_vocab()
        self.uncased_tokenizer.build_vocab(batch_sequences=batch_sequences)
        self.assertEqual(
            self.uncased_tokenizer.vocab_size,
            uncased_vocab_size + sp_tokens_size,
            msg=msg)

def test_load_result(self):
    r"""Load `tokenizer.json`."""
    msg = 'Inconsistent `tokenizer.json` format.'
    examples = (
        {
            'is_uncased': False,
            'token_to_id': ['A', 'B', 'C']
        },
        {
            'is_uncased': True,
            'token_to_id': ['a', 'b', 'c']
        },
    )

    test_path = os.path.join(self.__class__.test_dir, 'tokenizer.json')

    for obj in examples:
        try:
            # Create test file.
            with open(test_path, 'w', encoding='utf-8') as output_file:
                json.dump(obj, output_file)

            tokenizer = WhitespaceListTokenizer.load(
                experiment=self.__class__.experiment)

            self.assertIsInstance(
                tokenizer, WhitespaceListTokenizer, msg=msg)

            for attr_key, attr_value in obj.items():
                self.assertTrue(hasattr(tokenizer, attr_key), msg=msg)
                self.assertIsInstance(
                    getattr(tokenizer, attr_key),
                    type(attr_value),
                    msg=msg)
                self.assertEqual(
                    getattr(tokenizer, attr_key),
                    attr_value,
                    msg=msg)
        finally:
            # Clean up test file.
            os.remove(test_path)

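# Usage sketch (illustrative only, not executed by the test suite): `load`
# reads `tokenizer.json` from the experiment directory under `DATA_PATH`, and
# the file format mirrors `examples` above. The experiment name below is
# hypothetical.
#
#     # tokenizer.json: {"is_uncased": false, "token_to_id": ["A", "B", "C"]}
#     tokenizer = WhitespaceListTokenizer.load(experiment='my-experiment')
#     tokenizer.is_uncased    # False
#     tokenizer.token_to_id   # ['A', 'B', 'C']
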
def test_invalid_input_is_uncased(self):
    r"""Raise `TypeError` when input `is_uncased` is invalid."""
    msg1 = 'Must raise `TypeError` when input `is_uncased` is invalid.'
    msg2 = 'Inconsistent error message.'
    examples = (
        0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf, -math.inf, 0j,
        1j, '', b'', (), [], {}, set(), object(), lambda x: x, type, None,
        NotImplemented, ...,
    )

    for invalid_input in examples:
        with self.assertRaises(TypeError, msg=msg1) as ctx_man:
            WhitespaceListTokenizer(is_uncased=invalid_input)

        self.assertEqual(
            ctx_man.exception.args[0],
            '`is_uncased` must be an instance of `bool`.',
            msg=msg2)

class TestTokenize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.WhitespaceListTokenizer.tokenize`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = WhitespaceListTokenizer()
        self.uncased_tokenizer = WhitespaceListTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(WhitespaceListTokenizer.tokenize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty)
                ],
                return_annotation=List[str]),
            msg=msg)

    def test_invalid_input(self):
        r"""Raise `TypeError` when input is invalid."""
        msg1 = 'Must raise `TypeError` when input is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            0, 1, -1, 0.0, 1.0, math.nan, math.inf, True, False, b'', [],
            (), {}, set(), object(), lambda x: x, type, None,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.tokenize(invalid_input)

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`sequence` must be instance of `str`.',
                    msg=msg2)

    def test_return_type(self):
        r"""Return `List[str]`."""
        msg = 'Must return `List[str]`.'
        examples = (
            'Hello world!',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                tokens = tokenizer.tokenize(sequence)
                self.assertIsInstance(tokens, list, msg=msg)
                for token in tokens:
                    self.assertIsInstance(token, str, msg=msg)

    def test_unicode_normalize(self):
        r"""Return NFKC normalized characters."""
        msg = 'Must return NFKC normalized characters.'
        examples = (
            ('0', ['0']),
            ('é', ['é']),
            ('0 é', ['0', 'é']),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                out_tokens = tokenizer.tokenize(sequence)
                self.assertEqual(out_tokens, ans_tokens, msg=msg)
                for out_token in out_tokens:
                    self.assertEqual(len(out_token), 1, msg=msg)

    def test_cased_sensitive(self):
        r"""Return case-sensitive tokens when `is_uncased=False`."""
        msg = ('Return result must be case-sensitive when constructed with '
               '`is_uncased=False`.')
        examples = (
            ('HeLlO WoRlD!', ['HeLlO', 'WoRlD!']),
            ('HELLO WORLD!', ['HELLO', 'WORLD!']),
            ('hello world!', ['hello', 'world!']),
            ('H', ['H']),
            ('h', ['h']),
        )

        for sequence, ans_tokens in examples:
            self.assertEqual(
                self.cased_tokenizer.tokenize(sequence),
                ans_tokens,
                msg=msg)

    def test_cased_insensitive(self):
        r"""Return case-insensitive tokens when `is_uncased=True`."""
        msg = ('Return result must be case-insensitive when constructed with '
               '`is_uncased=True`.')
        examples = (
            ('HeLlO WoRlD!', ['hello', 'world!']),
            ('HELLO WORLD!', ['hello', 'world!']),
            ('hello world!', ['hello', 'world!']),
            ('H', ['h']),
            ('h', ['h']),
        )

        for sequence, ans_tokens in examples:
            self.assertEqual(
                self.uncased_tokenizer.tokenize(sequence),
                ans_tokens,
                msg=msg)

    def test_whitespace_strip(self):
        r"""Strip input sequence."""
        msg = (
            'Input sequence must strip both leading and trailing whitespace '
            'characters.')
        examples = (
            (' hello world! ', ['hello', 'world!']),
            (' hello world!', ['hello', 'world!']),
            ('hello world! \n', ['hello', 'world!']),
            ('\nhello world!\n', ['hello', 'world!']),
            (' ', []),
            ('', []),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.tokenize(sequence),
                    ans_tokens,
                    msg=msg)

    def test_whitespace_collapse(self):
        r"""Collapse whitespace characters."""
        msg = ('Input sequence must convert consecutive whitespace characters '
               'into single whitespace character.')
        examples = (
            ('hello world !', ['hello', 'world', '!']),
            ('hello  world  !', ['hello', 'world', '!']),
            ('hello   world   !', ['hello', 'world', '!']),
            ('hello world\n\n!', ['hello', 'world', '!']),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.tokenize(sequence),
                    ans_tokens,
                    msg=msg)

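# Usage sketch (illustrative only, not executed by the test suite): expected
# `tokenize` behaviour as exercised by the test case above.
#
#     WhitespaceListTokenizer().tokenize('HeLlO WoRlD!')
#     # ['HeLlO', 'WoRlD!']
#     WhitespaceListTokenizer(is_uncased=True).tokenize('HeLlO WoRlD!')
#     # ['hello', 'world!']
#     WhitespaceListTokenizer().tokenize(' hello world! ')
#     # ['hello', 'world!']
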
class TestBuildVocab(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.WhitespaceListTokenizer.build_vocab`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = WhitespaceListTokenizer()
        self.uncased_tokenizer = WhitespaceListTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(WhitespaceListTokenizer.build_vocab),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty),
                    inspect.Parameter(
                        name='batch_sequences',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=Iterable[str],
                        default=inspect.Parameter.empty),
                    inspect.Parameter(
                        name='min_count',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=int,
                        default=1),
                ],
                return_annotation=None),
            msg=msg)

    def test_invalid_input_batch_sequences(self):
        r"""Raise `TypeError` when input `batch_sequences` is invalid."""
        msg1 = (
            'Must raise `TypeError` when input `batch_sequences` is invalid.')
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, object(), lambda x: x, type, None,
            NotImplemented, ...,
            [False], [True], [0], [1], [-1], [0.0], [1.0], [math.nan],
            [-math.nan], [math.inf], [-math.inf], [0j], [1j], [b''], [()],
            [[]], [{}], [set()], [object()], [lambda x: x], [type], [None],
            [NotImplemented], [...],
            ['', False], ['', True], ['', 0], ['', 1], ['', -1], ['', 0.0],
            ['', 1.0], ['', math.nan], ['', -math.nan], ['', math.inf],
            ['', -math.inf], ['', 0j], ['', 1j], ['', b''], ['', ()],
            ['', []], ['', {}], ['', set()], ['', object()],
            ['', lambda x: x], ['', type], ['', None], ['', NotImplemented],
            ['', ...],
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.build_vocab(batch_sequences=invalid_input)

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`batch_sequences` must be an instance of '
                    '`Iterable[str]`.',
                    msg=msg2)

    def test_invalid_input_min_count(self):
        r"""Raise `TypeError` when input `min_count` is invalid."""
        msg1 = 'Must raise `TypeError` when input `min_count` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            0.0, 1.0, math.nan, -math.nan, math.inf, -math.inf, 0j, 1j, '',
            b'', (), [], {}, set(), object(), lambda x: x, type, None,
            NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.build_vocab(
                        batch_sequences=[],
                        min_count=invalid_input)

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`min_count` must be an instance of `int`.',
                    msg=msg2)

    def test_cased_sensitive(self):
        r"""Vocabulary must be case sensitive."""
        msg = 'Vocabulary must be case sensitive.'
        examples = (
            (('A B C D', 'a b c d'), 8, 4),
            (('e f g h i', 'E F G H I'), 10, 5),
        )
        sp_tokens_size = len(list(WhitespaceListTokenizer.special_tokens()))

        for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
            self.cased_tokenizer.reset_vocab()
            self.cased_tokenizer.build_vocab(batch_sequences=batch_sequences)
            self.assertEqual(
                self.cased_tokenizer.vocab_size,
                cased_vocab_size + sp_tokens_size,
                msg=msg)

            self.uncased_tokenizer.reset_vocab()
            self.uncased_tokenizer.build_vocab(
                batch_sequences=batch_sequences)
            self.assertEqual(
                self.uncased_tokenizer.vocab_size,
                uncased_vocab_size + sp_tokens_size,
                msg=msg)

    def test_sort_by_token_frequency_in_descending_order(self):
        r"""Sort vocabulary by token frequency in descending order."""
        msg = 'Must sort vocabulary by token frequency in descending order.'
        examples = (
            (
                ('A a A a', 'b B b', 'c C', 'd'),
                ('A', 'a', 'b', 'B', 'c', 'C', 'd'),
                ('a', 'b', 'c', 'd'),
            ),
            (
                ('E e E e E', 'F f F f', 'G g G', 'H h', 'I'),
                ('E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I'),
                ('e', 'f', 'g', 'h', 'i'),
            ),
        )

        for (batch_sequences, cased_vocab_order,
                uncased_vocab_order) in examples:
            self.cased_tokenizer.reset_vocab()
            self.cased_tokenizer.build_vocab(batch_sequences=batch_sequences)

            for (vocab1, vocab2) in zip(cased_vocab_order[:-1],
                                        cased_vocab_order[1:]):
                self.assertLessEqual(
                    self.cased_tokenizer.convert_token_to_id(vocab1),
                    self.cased_tokenizer.convert_token_to_id(vocab2),
                    msg=msg)

            self.uncased_tokenizer.reset_vocab()
            self.uncased_tokenizer.build_vocab(
                batch_sequences=batch_sequences)

            for (vocab1, vocab2) in zip(uncased_vocab_order[:-1],
                                        uncased_vocab_order[1:]):
                self.assertLessEqual(
                    self.uncased_tokenizer.convert_token_to_id(vocab1),
                    self.uncased_tokenizer.convert_token_to_id(vocab2),
                    msg=msg)

    def test_min_count(self):
        r"""Filter out tokens whose frequency is smaller than `min_count`."""
        msg = ('Must filter out tokens whose frequency is smaller than '
               '`min_count`.')
        examples = (
            (
                ('A a A a', 'b B b', 'c C', 'd'),
                ('A', 'a', 'b'),
                ('B', 'c', 'C', 'd'),
                ('a', 'b', 'c'),
                ('d',),
                2,
            ),
            (
                ('E e E e E', 'F f F f', 'G g G', 'H h', 'I'),
                ('E',),
                ('e', 'F', 'f', 'G', 'g', 'H', 'h', 'I'),
                ('e', 'f', 'g'),
                ('h', 'i'),
                3,
            ),
            (
                ('E e E e E', 'F f F f', 'G g G', 'H h', 'I'),
                (),
                ('E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I'),
                (),
                ('e', 'f', 'g', 'h', 'i'),
                10,
            ),
        )

        for (batch_sequences, cased_known_token, cased_unknown_token,
                uncased_known_token, uncased_unknown_token,
                min_count) in examples:
            self.cased_tokenizer.reset_vocab()
            self.cased_tokenizer.build_vocab(
                batch_sequences=batch_sequences,
                min_count=min_count)

            for token in cased_known_token:
                token_id = self.cased_tokenizer.convert_token_to_id(token)
                self.assertEqual(
                    token,
                    self.cased_tokenizer.convert_id_to_token(token_id),
                    msg=msg)

            unk_token_id = self.cased_tokenizer.convert_token_to_id(
                WhitespaceListTokenizer.unk_token)

            for unk_token in cased_unknown_token:
                self.assertEqual(
                    self.cased_tokenizer.convert_token_to_id(unk_token),
                    unk_token_id,
                    msg=msg)

            self.uncased_tokenizer.reset_vocab()
            self.uncased_tokenizer.build_vocab(
                batch_sequences=batch_sequences,
                min_count=min_count)

            for token in uncased_known_token:
                token_id = self.uncased_tokenizer.convert_token_to_id(token)
                self.assertEqual(
                    token,
                    self.uncased_tokenizer.convert_id_to_token(token_id),
                    msg=msg)

            unk_token_id = self.uncased_tokenizer.convert_token_to_id(
                WhitespaceListTokenizer.unk_token)

            for unk_token in uncased_unknown_token:
                self.assertEqual(
                    self.uncased_tokenizer.convert_token_to_id(unk_token),
                    unk_token_id,
                    msg=msg)

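# Usage sketch (illustrative only, not executed by the test suite): with
# `min_count=2`, tokens seen fewer than two times are left out of the
# vocabulary and map to `[unk]`, as exercised by `test_min_count` above.
#
#     tokenizer = WhitespaceListTokenizer()
#     tokenizer.build_vocab(('A a A a', 'b B b', 'c C', 'd'), min_count=2)
#     # 'A', 'a' and 'b' enter the vocabulary; 'B', 'c', 'C' and 'd' do not.
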
class TestEncode(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.WhitespaceListTokenizer.encode`."""

    @classmethod
    def setUpClass(cls):
        cls.vocab_source = [
            'Hello World !',
            'I am a legend .',
            'Hello legend !',
        ]

    @classmethod
    def tearDownClass(cls):
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = WhitespaceListTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = WhitespaceListTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(WhitespaceListTokenizer.encode),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                    ),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty),
                    inspect.Parameter(
                        name='max_seq_len',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=int,
                        default=-1)
                ],
                return_annotation=List[int]),
            msg=msg)

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, b'', (), [], {}, set(), object(),
            lambda x: x, type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.encode(sequence=invalid_input)

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`sequence` must be an instance of `str`.',
                    msg=msg2)

    def test_invalid_input_max_seq_len(self):
        r"""Raise exception when input `max_seq_len` is invalid."""
        msg1 = (
            'Must raise `TypeError` or `ValueError` when input `max_seq_len` '
            'is invalid.')
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -2, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, '', b'', (), [], {}, set(), object(),
            lambda x: x, type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(
                        (TypeError, ValueError), msg=msg1) as cxt_man:
                    tokenizer.encode(sequence='', max_seq_len=invalid_input)

                if isinstance(cxt_man.exception, TypeError):
                    self.assertEqual(
                        cxt_man.exception.args[0],
                        '`max_seq_len` must be an instance of `int`.',
                        msg=msg2)
                else:
                    self.assertEqual(
                        cxt_man.exception.args[0],
                        '`max_seq_len` must be greater than `1` or equal to '
                        '`-1`.',
                        msg=msg2)

    def test_return_type(self):
        r"""Return `List[int]`."""
        msg = 'Must return `List[int]`.'
        examples = (
            'Hello world !',
            'I am a legend .',
            'y = f(x)',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                token_ids = tokenizer.encode(sequence=sequence)
                self.assertIsInstance(token_ids, list, msg=msg)
                for token_id in token_ids:
                    self.assertIsInstance(token_id, int, msg=msg)

    def test_encode_format(self):
        r"""Follow encode format."""
        msg = 'Must follow encode format: [bos] t1 t2 ... tn [eos].'
        examples = (
            ('Hello World !', [0, 4, 7, 5, 1]),
            ('I am a legend .', [0, 8, 9, 10, 6, 11, 1]),
            ('y = f(x)', [0, 3, 3, 3, 1]),
            ('', [0, 1]),
        )

        for sequence, token_ids in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.encode(sequence=sequence),
                    token_ids,
                    msg=msg)

    def test_truncate(self):
        r"""Token ids' length must not exceed `max_seq_len`."""
        msg = 'Token ids\' length must not exceed `max_seq_len`.'
        examples = (
            ('Hello World !', [0, 4, 1], 3),
            ('I am a legend .', [0, 8, 9, 10, 1], 5),
            ('y = f(x)', [0, 3, 3, 1], 4),
            ('', [0, 1], 2),
        )

        for sequence, token_ids, max_seq_len in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.encode(
                        sequence=sequence,
                        max_seq_len=max_seq_len),
                    token_ids,
                    msg=msg)

    def test_padding(self):
        r"""Token ids' length must pad to `max_seq_len`."""
        msg = 'Token ids\' length must pad to `max_seq_len`.'
        examples = (
            ('Hello World !', [0, 4, 7, 5, 1, 2], 6),
            ('I am a legend .', [0, 8, 9, 10, 6, 11, 1, 2, 2, 2], 10),
            ('y = f(x)', [0, 3, 3, 3, 1], 5),
            ('', [0, 1, 2, 2, 2, 2, 2, 2], 8),
        )

        for sequence, token_ids, max_seq_len in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.encode(
                        sequence=sequence,
                        max_seq_len=max_seq_len),
                    token_ids,
                    msg=msg)

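# Usage sketch (illustrative only, not executed by the test suite): with the
# vocabulary built from `vocab_source` above, `encode` wraps the sequence in
# `[bos]`/`[eos]` (ids 0/1), truncates or pads with `[pad]` (id 2) to
# `max_seq_len`, and maps out-of-vocabulary tokens to `[unk]` (id 3).
#
#     tokenizer.encode('Hello World !')                  # [0, 4, 7, 5, 1]
#     tokenizer.encode('Hello World !', max_seq_len=3)   # [0, 4, 1]
#     tokenizer.encode('Hello World !', max_seq_len=6)   # [0, 4, 7, 5, 1, 2]
#     tokenizer.encode('y = f(x)')                       # [0, 3, 3, 3, 1]
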
class TestDetokenize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.WhitespaceListTokenizer.detokenize`."""

    @classmethod
    def setUpClass(cls):
        cls.vocab_source = [
            'Hello World !',
            'I am a legend .',
            'Hello legend !',
        ]

    @classmethod
    def tearDownClass(cls):
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = WhitespaceListTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = WhitespaceListTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(WhitespaceListTokenizer.detokenize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty),
                    inspect.Parameter(
                        name='tokens',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=Iterable[str],
                        default=inspect.Parameter.empty)
                ],
                return_annotation=str),
            msg=msg)

    def test_invalid_input_tokens(self):
        r"""Raise `TypeError` when input `tokens` is invalid."""
        msg1 = 'Must raise `TypeError` when input `tokens` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, object(), lambda x: x, type, None,
            NotImplemented, ...,
            [False], [True], [0], [1], [-1], [0.0], [1.0], [math.nan],
            [-math.nan], [math.inf], [-math.inf], [0j], [1j], [b''], [()],
            [[]], [{}], [set()], [object()], [lambda x: x], [type], [None],
            [NotImplemented], [...],
            ['', False], ['', True], ['', 0], ['', 1], ['', -1], ['', 0.0],
            ['', 1.0], ['', math.nan], ['', -math.nan], ['', math.inf],
            ['', -math.inf], ['', 0j], ['', 1j], ['', b''], ['', ()],
            ['', []], ['', {}], ['', set()], ['', object()],
            ['', lambda x: x], ['', type], ['', None], ['', NotImplemented],
            ['', ...],
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.detokenize(tokens=invalid_input)

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`tokens` must be an instance of `Iterable[str]`.',
                    msg=msg2)

    def test_return_type(self):
        r"""Return `str`."""
        msg = 'Must return `str`.'
        examples = (
            ('HeLlO', 'WoRlD', '!'),
            (''),
            (),
        )

        for tokens in examples:
            for tokenizer in self.tokenizers:
                self.assertIsInstance(
                    tokenizer.detokenize(tokens), str, msg=msg)

    def test_normalize(self):
        r"""Return sequence is normalized."""
        msg = 'Return sequence must be normalized.'
        examples = (
            (
                (' ', 'HeLlO', 'WoRlD', '!'),
                'HeLlO WoRlD !',
                'hello world !',
            ),
            (
                ('HeLlO', 'WoRlD', '!', ' '),
                'HeLlO WoRlD !',
                'hello world !',
            ),
            (
                (' ', ' ', 'HeLlO', ' ', ' ', 'WoRlD', '!', ' ', ' '),
                'HeLlO WoRlD !',
                'hello world !',
            ),
            (
                ('0'),
                '0',
                '0',
            ),
            (
                ('é'),
                unicodedata.normalize('NFKC', 'é'),
                unicodedata.normalize('NFKC', 'é'),
            ),
            (
                ('0', 'é'),
                unicodedata.normalize('NFKC', '0 é'),
                unicodedata.normalize('NFKC', '0 é'),
            ),
            (
                (),
                '',
                '',
            ),
        )

        for tokens, cased_sequence, uncased_sequence in examples:
            self.assertEqual(
                self.cased_tokenizer.detokenize(tokens),
                cased_sequence,
                msg=msg)
            self.assertEqual(
                self.uncased_tokenizer.detokenize(tokens),
                uncased_sequence,
                msg=msg)

class TestTokenize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.WhitespaceListTokenizer.tokenize`."""

    @classmethod
    def setUpClass(cls):
        cls.vocab_source = [
            'Hello World !',
            'I am a legend .',
            'Hello legend !',
        ]

    @classmethod
    def tearDownClass(cls):
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = WhitespaceListTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = WhitespaceListTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(WhitespaceListTokenizer.tokenize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty)
                ],
                return_annotation=List[str]),
            msg=msg)

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, b'', 0j, 1j, (), [], {}, set(), object(),
            lambda x: x, type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.tokenize(invalid_input)

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`sequence` must be an instance of `str`.',
                    msg=msg2)

    def test_return_type(self):
        r"""Return `List[str]`."""
        msg = 'Must return `List[str]`.'
        examples = (
            'Hello World !',
            'H',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                tokens = tokenizer.tokenize(sequence)
                self.assertIsInstance(tokens, list, msg=msg)
                for token in tokens:
                    self.assertIsInstance(token, str, msg=msg)

    def test_normalize(self):
        r"""Return sequence is normalized."""
        msg = 'Return sequence must be normalized.'
        examples = (
            (
                ' HeLlO WoRlD !',
                ['HeLlO', 'WoRlD', '!'],
                ['hello', 'world', '!'],
            ),
            (
                'HeLlO WoRlD ! ',
                ['HeLlO', 'WoRlD', '!'],
                ['hello', 'world', '!'],
            ),
            (
                ' HeLlO WoRlD ! ',
                ['HeLlO', 'WoRlD', '!'],
                ['hello', 'world', '!'],
            ),
            (
                '0',
                ['0'],
                ['0'],
            ),
            (
                'é',
                [unicodedata.normalize('NFKC', 'é')],
                [unicodedata.normalize('NFKC', 'é')],
            ),
            (
                '0 é',
                [
                    unicodedata.normalize('NFKC', '0'),
                    unicodedata.normalize('NFKC', 'é'),
                ],
                [
                    unicodedata.normalize('NFKC', '0'),
                    unicodedata.normalize('NFKC', 'é'),
                ],
            ),
            (
                '',
                [],
                [],
            ),
        )

        for sequence, cased_tokens, uncased_tokens in examples:
            self.assertEqual(
                self.cased_tokenizer.tokenize(sequence),
                cased_tokens,
                msg=msg)
            self.assertEqual(
                self.uncased_tokenizer.tokenize(sequence),
                uncased_tokens,
                msg=msg)

class TestDetokenize(unittest.TestCase):
    r"""Test Case for `lmp.tokenizer.WhitespaceListTokenizer.detokenize`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = WhitespaceListTokenizer()
        self.uncased_tokenizer = WhitespaceListTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(WhitespaceListTokenizer.detokenize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty),
                    inspect.Parameter(
                        name='tokens',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=Iterable[str],
                        default=inspect.Parameter.empty)
                ],
                return_annotation=str),
            msg=msg)

    def test_invalid_input(self):
        r"""Raise `TypeError` when input is invalid."""
        msg1 = 'Must raise `TypeError` when input is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            0, 1, -1, 0.0, 1.0, math.nan, math.inf, True, False,
            (1, 2, 3), [1, 2, 3], {1, 2, 3}, None,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.detokenize(invalid_input)

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`tokens` must be instance of `Iterable[str]`.',
                    msg=msg2)

    def test_expected_return(self):
        r"""Return expected strings."""
        msg = 'Inconsistent detokenization result.'
        examples = (
            (['Hello', 'world!'], 'Hello world!'),
            (['Hello', 'world', '!'], 'Hello world !'),
            ([], ''),
        )

        for tokens, ans_sequence in examples:
            for tokenizer in self.tokenizers:
                out_sequence = tokenizer.detokenize(tokens)
                self.assertIsInstance(out_sequence, str, msg=msg)
                self.assertEqual(out_sequence, ans_sequence, msg=msg)

    def test_case_insensitive(self):
        r"""Detokenize does not consider cases."""
        msg = 'Inconsistent detokenization result.'
        examples = (
            ['HeLlo', 'WoRlD', '!'],
            ['hello', 'world', '!'],
        )

        for tokens in examples:
            self.assertEqual(
                self.cased_tokenizer.detokenize(tokens),
                self.uncased_tokenizer.detokenize(tokens),
                msg=msg)

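# Usage sketch (illustrative only, not executed by the test suite):
# `detokenize` joins tokens with single spaces, as exercised by
# `test_expected_return` above.
#
#     WhitespaceListTokenizer().detokenize(['Hello', 'world', '!'])
#     # 'Hello world !'
#     WhitespaceListTokenizer().detokenize([])
#     # ''
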
class TestVocabSize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.WhitespaceListTokenizer.vocab_size`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = WhitespaceListTokenizer()
        self.uncased_tokenizer = WhitespaceListTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent property signature.'

        self.assertTrue(
            inspect.isdatadescriptor(WhitespaceListTokenizer.vocab_size),
            msg=msg)
        self.assertFalse(
            inspect.isfunction(WhitespaceListTokenizer.vocab_size),
            msg=msg)
        self.assertFalse(
            inspect.ismethod(WhitespaceListTokenizer.vocab_size),
            msg=msg)

    def test_return_type(self):
        r"""Return `int`."""
        msg = 'Must return `int`.'

        for tokenizer in self.tokenizers:
            self.assertIsInstance(tokenizer.vocab_size, int, msg=msg)

    def test_return_value(self):
        r"""Return vocabulary size."""
        msg = 'Inconsistent vocabulary size.'

        for tokenizer in self.tokenizers:
            self.assertEqual(tokenizer.vocab_size, 4, msg=msg)

    def test_increase_vocab_size(self):
        r"""Increase vocabulary size after `build_vocab`."""
        msg = 'Must increase vocabulary size after `build_vocab`.'
        examples = (
            (('Hello World !', 'I am a LEGEND .', 'Hello legend !'), 9, 8),
            (('y = f(x)',), 12, 11),
            (('',), 12, 11),
        )
        sp_tokens_size = len(list(WhitespaceListTokenizer.special_tokens()))

        for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
            self.cased_tokenizer.build_vocab(batch_sequences)
            self.assertEqual(
                self.cased_tokenizer.vocab_size,
                cased_vocab_size + sp_tokens_size,
                msg=msg)
            self.uncased_tokenizer.build_vocab(batch_sequences)
            self.assertEqual(
                self.uncased_tokenizer.vocab_size,
                uncased_vocab_size + sp_tokens_size,
                msg=msg)

    def test_reset_vocab_size(self):
        r"""Reset vocabulary size after `reset_vocab`."""
        msg = 'Must reset vocabulary size after `reset_vocab`.'
        examples = (
            ('HeLlO WoRlD!', 'I aM a LeGeNd.'),
            ('y = f(x)',),
            ('',),
        )
        sp_tokens_size = len(list(WhitespaceListTokenizer.special_tokens()))

        for batch_sequences in examples:
            for tokenizer in self.tokenizers:
                tokenizer.build_vocab(batch_sequences)
                tokenizer.reset_vocab()
                self.assertEqual(
                    tokenizer.vocab_size, sp_tokens_size, msg=msg)

class TestDecode(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.WhitespaceListTokenizer.decode`."""

    @classmethod
    def setUpClass(cls):
        cls.vocab_source = [
            'Hello World !',
            'I am a legend .',
            'Hello legend !',
        ]

    @classmethod
    def tearDownClass(cls):
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = WhitespaceListTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = WhitespaceListTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(WhitespaceListTokenizer.decode),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                    ),
                    inspect.Parameter(
                        name='token_ids',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=Iterable[int],
                        default=inspect.Parameter.empty),
                    inspect.Parameter(
                        name='remove_special_tokens',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=bool,
                        default=False)
                ],
                return_annotation=str),
            msg=msg)

    def test_invalid_input_token_ids(self):
        r"""Raise `TypeError` when input `token_ids` is invalid."""
        msg1 = 'Must raise `TypeError` when input `token_ids` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, object(), lambda x: x, type, None,
            NotImplemented, ...,
            [0.0], [1.0], [math.nan], [-math.nan], [math.inf], [-math.inf],
            [0j], [1j], [''], [b''], [()], [[]], [{}], [set()], [object()],
            [lambda x: x], [type], [None], [NotImplemented], [...],
            [0, 0.0], [0, 1.0], [0, math.nan], [0, -math.nan], [0, math.inf],
            [0, -math.inf], [0, 0j], [0, 1j], [0, ''], [0, b''], [0, ()],
            [0, []], [0, {}], [0, set()], [0, object()], [0, lambda x: x],
            [0, type], [0, None], [0, NotImplemented], [0, ...],
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.decode(token_ids=invalid_input)

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`token_ids` must be an instance of `Iterable[int]`.',
                    msg=msg2)

    def test_invalid_input_remove_special_tokens(self):
        r"""Raise `TypeError` when input `remove_special_tokens` is invalid."""
        msg1 = ('Must raise `TypeError` when input `remove_special_tokens` is '
                'invalid.')
        msg2 = 'Inconsistent error message.'
        examples = (
            0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf, -math.inf, 0j,
            1j, '', b'', (), [], {}, set(), object(), lambda x: x, type,
            None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.decode(
                        token_ids=[],
                        remove_special_tokens=invalid_input)

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`remove_special_tokens` must be an instance of `bool`.',
                    msg=msg2)

    def test_return_type(self):
        r"""Return `str`."""
        msg = 'Must return `str`.'
        examples = (
            [0, 1, 2, 3],
            [4, 5, 6, 7, 8, 9],
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            [0],
            [],
        )

        for token_ids in examples:
            for tokenizer in self.tokenizers:
                self.assertIsInstance(
                    tokenizer.decode(token_ids=token_ids), str, msg=msg)

    def test_remove_special_tokens(self):
        r"""Remove special tokens."""
        msg = 'Must remove special tokens.'
        examples = (
            (
                False,
                [0, 4, 7, 5, 1, 2],
                '[bos] Hello World ! [eos] [pad]',
                '[bos] hello world ! [eos] [pad]',
            ),
            (
                False,
                [0, 8, 9, 10, 3, 1, 2, 2],
                '[bos] I am a [unk] [eos] [pad] [pad]',
                '[bos] i am a [unk] [eos] [pad] [pad]',
            ),
            (
                False,
                [0, 3, 6, 11, 1],
                '[bos] [unk] legend . [eos]',
                '[bos] [unk] legend . [eos]',
            ),
            (
                True,
                [0, 4, 7, 5, 1, 2],
                'Hello World !',
                'hello world !',
            ),
            (
                True,
                [0, 8, 9, 10, 3, 1, 2, 2],
                'I am a [unk]',
                'i am a [unk]',
            ),
            (
                True,
                [0, 3, 6, 11, 1],
                '[unk] legend .',
                '[unk] legend .',
            ),
        )

        for (remove_special_tokens, token_ids, cased_sequence,
                uncased_sequence) in examples:
            self.assertEqual(
                self.cased_tokenizer.decode(
                    token_ids=token_ids,
                    remove_special_tokens=remove_special_tokens),
                cased_sequence,
                msg=msg)
            self.assertEqual(
                self.uncased_tokenizer.decode(
                    token_ids=token_ids,
                    remove_special_tokens=remove_special_tokens),
                uncased_sequence,
                msg=msg)

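# Usage sketch (illustrative only, not executed by the test suite): with the
# cased tokenizer built in `setUp` above, `decode` maps ids back to tokens
# and, when `remove_special_tokens=True`, drops `[bos]`, `[eos]` and `[pad]`
# while keeping `[unk]`.
#
#     tokenizer.decode([0, 4, 7, 5, 1, 2])
#     # '[bos] Hello World ! [eos] [pad]'
#     tokenizer.decode([0, 4, 7, 5, 1, 2], remove_special_tokens=True)
#     # 'Hello World !'
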
def setUp(self):
    r"""Setup both cased and uncased tokenizer instances."""
    self.cased_tokenizer = WhitespaceListTokenizer()
    self.uncased_tokenizer = WhitespaceListTokenizer(is_uncased=True)
    self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

class TestNormalize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.WhitespaceListTokenizer.normalize`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = WhitespaceListTokenizer()
        self.uncased_tokenizer = WhitespaceListTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(WhitespaceListTokenizer.normalize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty)
                ],
                return_annotation=str),
            msg=msg)

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, b'', (), [], {}, set(), object(),
            lambda x: x, type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.normalize(sequence=invalid_input)

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`sequence` must be an instance of `str`.',
                    msg=msg2)

    def test_return_type(self):
        r"""Return `str`."""
        msg = 'Must return `str`.'
        examples = (
            'Hello world!',
            'I am a legend.',
            'y = f(x)',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                self.assertIsInstance(
                    tokenizer.normalize(sequence=sequence), str, msg=msg)

    def test_unicode_normalize(self):
        r"""Return NFKC normalized characters."""
        msg = 'Must return NFKC normalized characters.'
        examples = (
            ('0', '0', 1),
            ('é', 'é', 1),
            ('0é', '0é', 2),
        )

        for sequence, normalized_sequence, sequence_len in examples:
            for tokenizer in self.tokenizers:
                out_sequence = tokenizer.normalize(sequence=sequence)
                self.assertEqual(out_sequence, normalized_sequence, msg=msg)
                self.assertEqual(len(out_sequence), sequence_len, msg=msg)

    def test_cased_sensitive(self):
        r"""Return case-sensitive sequence."""
        msg = 'Return sequence must be case-sensitive.'
        examples = (
            ('HeLlO WoRlD!', 'HeLlO WoRlD!', 'hello world!'),
            ('HELLO WORLD!', 'HELLO WORLD!', 'hello world!'),
            ('hello world!', 'hello world!', 'hello world!'),
            ('H', 'H', 'h'),
            ('h', 'h', 'h'),
        )

        for sequence, cased_sequence, uncased_sequence in examples:
            self.assertEqual(
                self.cased_tokenizer.normalize(sequence),
                cased_sequence,
                msg=msg)
            self.assertEqual(
                self.uncased_tokenizer.normalize(sequence),
                uncased_sequence,
                msg=msg)

    def test_whitespace_strip(self):
        r"""Strip input sequence."""
        msg = 'Must strip both leading and trailing whitespace characters.'
        examples = (
            (' hello world!', 'hello world!'),
            ('hello world! ', 'hello world!'),
            (' hello world! ', 'hello world!'),
            (' hello world! \n', 'hello world!'),
            ('\nhello world!\n', 'hello world!'),
            (' ', ''),
            ('', ''),
        )

        for sequence, stripped_sequence in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.normalize(sequence),
                    stripped_sequence,
                    msg=msg)

    def test_whitespace_collapse(self):
        r"""Collapse whitespace characters."""
        msg = ('Must convert consecutive whitespace characters into single '
               'whitespace character.')
        examples = (
            ('hello world !', 'hello world !'),
            ('hello  world  !', 'hello world !'),
            ('hello   world   !', 'hello world !'),
            ('hello world\n\n!', 'hello world !'),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.normalize(sequence),
                    ans_tokens,
                    msg=msg)

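# Usage sketch (illustrative only, not executed by the test suite):
# `normalize` applies NFKC normalization, strips leading and trailing
# whitespace, collapses consecutive whitespace, and lowercases when
# `is_uncased=True`, as exercised above.
#
#     WhitespaceListTokenizer().normalize(' hello world! ')
#     # 'hello world!'
#     WhitespaceListTokenizer().normalize('hello  world  !')
#     # 'hello world !'
#     WhitespaceListTokenizer(is_uncased=True).normalize('HeLlO WoRlD!')
#     # 'hello world!'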