def test_yield_value(self):
    r"""Return iterator which yields `str`."""
    msg = 'Must return iterator which yields `str`.'
    examples = ('[bos]', '[eos]', '[pad]', '[unk]')

    self.assertIsInstance(
        BaseDictTokenizer.special_tokens(),
        Iterator,
        msg=msg
    )

    out_tokens = list(BaseDictTokenizer.special_tokens())

    for i, ans_token in enumerate(examples):
        self.assertIsInstance(out_tokens[i], str, msg=msg)
        self.assertEqual(out_tokens[i], ans_token, msg=msg)
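# --- Illustrative sketch (assumption, not part of the test suite). ---
# One way `BaseDictTokenizer.special_tokens` could be written so that
# `test_yield_value` above passes: a generator yielding the four special
# tokens in the asserted order. The helper name and generator form are
# guesses; only the token values and their order come from the test itself.
def _sketch_special_tokens():
    r"""Yield special tokens in the order asserted above (sketch)."""
    yield from ('[bos]', '[eos]', '[pad]', '[unk]')

# e.g. list(_sketch_special_tokens()) == ['[bos]', '[eos]', '[pad]', '[unk]']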
def test_experiment_does_not_exist(self):
    r"""Raise `FileNotFoundError` when `experiment` does not exist."""
    msg1 = (
        'Must raise `FileNotFoundError` when `experiment` does not exist.'
    )
    msg2 = 'Inconsistent error message.'
    examples = (self.__class__.experiment, 'I-AM-A-TEST-AND-I-DONT-EXIST')

    for experiment in examples:
        with self.assertRaises(FileNotFoundError, msg=msg1) as ctx_man:
            BaseDictTokenizer.load(experiment=experiment)

        test_path = os.path.join(DATA_PATH, experiment, 'tokenizer.json')
        self.assertEqual(
            ctx_man.exception.args[0],
            f'File {test_path} does not exist.',
            msg=msg2
        )
def test_load_result(self):
    r"""Load `tokenizer.json`."""
    msg = 'Inconsistent `tokenizer.json` format.'
    examples = (
        {
            'is_uncased': False,
            'token_to_id': {
                'A': 0,
                'B': 1,
                'C': 2,
            },
            'id_to_token': {
                0: 'A',
                1: 'B',
                2: 'C',
            },
        },
        {
            'is_uncased': True,
            'token_to_id': {
                'a': 0,
                'b': 1,
                'c': 2,
            },
            'id_to_token': {
                0: 'a',
                1: 'b',
                2: 'c',
            },
        },
    )

    test_path = os.path.join(self.__class__.test_dir, 'tokenizer.json')

    for obj in examples:
        try:
            # Create test file.
            with open(test_path, 'w', encoding='utf-8') as output_file:
                tmp = {
                    'is_uncased': obj['is_uncased'],
                    'token_to_id': obj['token_to_id'],
                }
                json.dump(tmp, output_file)

            tokenizer = BaseDictTokenizer.load(
                experiment=self.__class__.experiment
            )

            self.assertIsInstance(tokenizer, BaseDictTokenizer, msg=msg)

            for attr_key, attr_value in obj.items():
                self.assertTrue(hasattr(tokenizer, attr_key), msg=msg)
                self.assertIsInstance(
                    getattr(tokenizer, attr_key),
                    type(attr_value),
                    msg=msg
                )
                self.assertEqual(
                    getattr(tokenizer, attr_key),
                    attr_value,
                    msg=msg
                )
        finally:
            # Clean up test file.
            os.remove(test_path)
def test_invalid_input_experiment(self):
    r"""Raise exception when input `experiment` is invalid."""
    msg1 = (
        'Must raise `TypeError` or `ValueError` when input `experiment` '
        'is invalid.'
    )
    msg2 = 'Inconsistent error message.'
    examples = (
        False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
        -math.inf, 0j, 1j, '', b'', (), [], {}, set(), object(),
        lambda x: x, type, None, NotImplemented, ...,
    )

    for invalid_input in examples:
        with self.assertRaises(
                (TypeError, ValueError),
                msg=msg1
        ) as ctx_man:
            BaseDictTokenizer.load(experiment=invalid_input)

        if isinstance(ctx_man.exception, TypeError):
            self.assertEqual(
                ctx_man.exception.args[0],
                '`experiment` must be an instance of `str`.',
                msg=msg2
            )
        else:
            self.assertEqual(
                ctx_man.exception.args[0],
                '`experiment` must not be empty.',
                msg=msg2
            )
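# --- Illustrative sketch (assumption, not part of the test suite). ---
# A helper showing the `load` behaviour the three tests above pin down:
# validate `experiment`, raise `FileNotFoundError` with the exact message
# compared in `test_experiment_does_not_exist`, and rebuild `id_to_token`
# with `int` keys, since JSON object keys are always `str` while
# `test_load_result` expects integer keys. The real `BaseDictTokenizer.load`
# classmethod may be organised differently; `os`, `json`, and `DATA_PATH`
# are the same names the surrounding tests already use.
def _sketch_load(experiment):
    r"""Read `tokenizer.json` for `experiment` and return its fields (sketch)."""
    if not isinstance(experiment, str):
        raise TypeError('`experiment` must be an instance of `str`.')
    if not experiment:
        raise ValueError('`experiment` must not be empty.')

    file_path = os.path.join(DATA_PATH, experiment, 'tokenizer.json')
    if not os.path.exists(file_path):
        raise FileNotFoundError(f'File {file_path} does not exist.')

    with open(file_path, 'r', encoding='utf-8') as input_file:
        obj = json.load(input_file)

    # Rebuild the reverse lookup with `int` keys.
    obj['id_to_token'] = {
        token_id: token for token, token_id in obj['token_to_id'].items()
    }
    return obj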
def test_abstract_method(self):
    r"""Raise `NotImplementedError` when subclass did not implement."""
    msg1 = (
        'Must raise `NotImplementedError` when subclass did not implement.'
    )
    msg2 = 'Inconsistent error message.'
    examples = (True, False)

    for is_uncased in examples:
        with self.assertRaises(NotImplementedError, msg=msg1) as ctx_man:
            BaseDictTokenizer(is_uncased=is_uncased).tokenize('')

        self.assertEqual(
            ctx_man.exception.args[0],
            'In class `BaseDictTokenizer`: '
            'method `tokenize` not implemented yet.',
            msg=msg2
        )
def test_invalid_input_is_uncased(self):
    r"""Raise `TypeError` when input `is_uncased` is invalid."""
    msg1 = 'Must raise `TypeError` when input `is_uncased` is invalid.'
    msg2 = 'Inconsistent error message.'
    examples = (
        0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf, -math.inf,
        0j, 1j, '', b'', (), [], {}, set(), object(), lambda x: x, type,
        None, NotImplemented, ...,
    )

    for invalid_input in examples:
        with self.assertRaises(TypeError, msg=msg1) as ctx_man:
            BaseDictTokenizer(is_uncased=invalid_input)

        self.assertEqual(
            ctx_man.exception.args[0],
            '`is_uncased` must be an instance of `bool`.',
            msg=msg2
        )
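# --- Illustrative sketch (assumption, not part of the test suite). ---
# A minimal stand-in for the constructor/`tokenize` contract the two tests
# above assert: `is_uncased` must be a `bool`, and the base class leaves
# `tokenize` unimplemented. The error messages mirror the strings the tests
# compare against; the real `BaseDictTokenizer` may differ beyond these
# two points.
class _SketchBaseTokenizer:
    r"""Minimal stand-in for the asserted `__init__`/`tokenize` contract."""

    def __init__(self, is_uncased=False):
        if not isinstance(is_uncased, bool):
            raise TypeError('`is_uncased` must be an instance of `bool`.')
        self.is_uncased = is_uncased

    def tokenize(self, sequence):
        # Tokenization is left to subclasses.
        raise NotImplementedError(
            'In class `BaseDictTokenizer`: '
            'method `tokenize` not implemented yet.'
        )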
def setUp(self):
    r"""Setup both cased and uncased tokenizer instances."""
    self.cased_tokenizer = BaseDictTokenizer()
    self.uncased_tokenizer = BaseDictTokenizer(is_uncased=True)
    self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]
class TestNormalize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.BaseDictTokenizer.normalize`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = BaseDictTokenizer()
        self.uncased_tokenizer = BaseDictTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(BaseDictTokenizer.normalize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty
                    ),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty
                    )
                ],
                return_annotation=str
            ),
            msg=msg
        )

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, b'', (), [], {}, set(), object(),
            lambda x: x, type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.normalize(sequence=invalid_input)

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`sequence` must be an instance of `str`.',
                    msg=msg2
                )

    def test_return_type(self):
        r"""Return `str`."""
        msg = 'Must return `str`.'
        examples = (
            'Hello world!',
            'I am a legend.',
            'y = f(x)',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                self.assertIsInstance(
                    tokenizer.normalize(sequence=sequence),
                    str,
                    msg=msg
                )

    def test_unicode_normalize(self):
        r"""Return NFKC normalized characters."""
        msg = 'Must return NFKC normalized characters.'
        examples = (
            # Fullwidth digit zero normalizes to ASCII '0'.
            ('\uff10', '0', 1),
            # Decomposed 'e' + combining acute normalizes to precomposed 'é'.
            ('e\u0301', '\u00e9', 1),
            ('\uff10e\u0301', '0\u00e9', 2),
        )

        for sequence, normalized_sequence, sequence_len in examples:
            for tokenizer in self.tokenizers:
                out_sequence = tokenizer.normalize(sequence=sequence)
                self.assertEqual(out_sequence, normalized_sequence, msg=msg)
                self.assertEqual(len(out_sequence), sequence_len, msg=msg)

    def test_cased_sensitive(self):
        r"""Return case-sensitive sequence."""
        msg = 'Return sequence must be case-sensitive.'
        examples = (
            ('HeLlO WoRlD!', 'HeLlO WoRlD!', 'hello world!'),
            ('HELLO WORLD!', 'HELLO WORLD!', 'hello world!'),
            ('hello world!', 'hello world!', 'hello world!'),
            ('H', 'H', 'h'),
            ('h', 'h', 'h'),
        )

        for sequence, cased_sequence, uncased_sequence in examples:
            self.assertEqual(
                self.cased_tokenizer.normalize(sequence),
                cased_sequence,
                msg=msg
            )
            self.assertEqual(
                self.uncased_tokenizer.normalize(sequence),
                uncased_sequence,
                msg=msg
            )

    def test_whitespace_strip(self):
        r"""Strip input sequence."""
        msg = 'Must strip both leading and trailing whitespace characters.'
        examples = (
            (' hello world!', 'hello world!'),
            ('hello world! ', 'hello world!'),
            (' hello world! ', 'hello world!'),
            (' hello world! \n', 'hello world!'),
            ('\nhello world!\n', 'hello world!'),
            (' ', ''),
            ('', ''),
        )

        for sequence, stripped_sequence in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.normalize(sequence),
                    stripped_sequence,
                    msg=msg
                )

    def test_whitespace_collapse(self):
        r"""Collapse whitespace characters."""
        msg = (
            'Must convert consecutive whitespace characters into single '
            'whitespace character.'
        )
        examples = (
            ('hello  world  !', 'hello world !'),
            ('hello   world   !', 'hello world !'),
            ('hello\tworld\t!', 'hello world !'),
            ('hello world\n\n!', 'hello world !'),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.normalize(sequence),
                    ans_tokens,
                    msg=msg
                )
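# --- Illustrative sketch (assumption, not part of the test suite). ---
# A `normalize` along these lines would satisfy every test in `TestNormalize`
# above: type check, NFKC normalization, lowercasing only for uncased
# tokenizers, stripping, and collapsing of consecutive whitespace. The actual
# `lmp.tokenizer` implementation may order these steps differently. `re` and
# `unicodedata` are imported locally because only this sketch needs them.
def _sketch_normalize(sequence, is_uncased=False):
    r"""NFKC-normalize, optionally lowercase, strip, collapse whitespace (sketch)."""
    import re
    import unicodedata

    if not isinstance(sequence, str):
        raise TypeError('`sequence` must be an instance of `str`.')

    sequence = unicodedata.normalize('NFKC', sequence)
    if is_uncased:
        sequence = sequence.lower()
    # Strip, then collapse any run of whitespace into a single space.
    return re.sub(r'\s+', ' ', sequence.strip())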