def test_is_uncased():
    r"""``is_uncased`` must be an instance of `bool`."""
    # Type mismatch: every non-`bool` value (including the int 0/1) must be
    # rejected with a `TypeError` carrying the documented message.
    for bad_is_uncased in (
        0,
        1,
        -1,
        0.1,
        '',
        (),
        [],
        {},
        set(),
        None,
        ...,
        NotImplemented,
    ):
        with pytest.raises(TypeError) as excinfo:
            CharTknzr(
                is_uncased=bad_is_uncased,
                max_vocab=-1,
                min_count=1,
                tk2id=None,
            )
        assert (
            '`is_uncased` must be an instance of `bool`' in str(excinfo.value)
        )

    # Valid input: both booleans are accepted and stored unchanged.
    for good_is_uncased in (False, True):
        tokenizer = CharTknzr(
            is_uncased=good_is_uncased,
            max_vocab=-1,
            min_count=1,
            tk2id=None,
        )
        assert tokenizer.is_uncased == good_is_uncased
def test_tknz(parameters, test_input: str, expected: List[str]):
    r"""Tokenize text into characters."""
    # Build the tokenizer from the parametrized fixture, then check that the
    # input text splits into exactly the expected character tokens.
    tokenizer = CharTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
    )
    out = tokenizer.tknz(test_input)
    assert out == expected
def test_config_file_exist(
    char_tknzr: CharTknzr,
    exp_name: str,
    file_path: str,
):
    r"""Save configuration as file."""
    # Saving under the experiment name must create the configuration file at
    # the path the fixture points to.
    char_tknzr.save(exp_name)
    assert os.path.exists(file_path)
def test_dtknz(parameters, test_input: List[str], expected: str):
    r"""Detokenize characters back to text."""
    # Build the tokenizer from the parametrized fixture, then check that the
    # character tokens join back into the expected text.
    tokenizer = CharTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
    )
    out = tokenizer.dtknz(test_input)
    assert out == expected
def test_config_file_format(
    char_tknzr: CharTknzr,
    exp_name: str,
    file_path: str,
):
    r"""Saved configuration must be JSON format."""
    char_tknzr.save(exp_name)
    # `json.load` raises on malformed JSON, and the parsed configuration must
    # be truthy (a non-empty object).
    with open(file_path, 'r', encoding='utf-8') as cfg_file:
        assert json.load(cfg_file)
def test_dec(parameters, test_input: List[int], expected: str):
    r"""Decode token ids to text."""
    # Build the tokenizer from the parametrized fixture, then decode the ids
    # (optionally removing special tokens) and compare against the expected
    # text.
    tokenizer = CharTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
        tk2id=parameters['tk2id'],
    )
    out = tokenizer.dec(test_input, rm_sp_tks=parameters['rm_sp_tks'])
    assert out == expected
def test_lower_case(is_uncased: bool, cased_txt: Dict[str, str]):
    r"""Convert output text to lowercase when ``is_uncased == True``."""
    tokenizer = CharTknzr(
        is_uncased=is_uncased,
        max_vocab=-1,
        min_count=1,
        tk2id=None,
    )
    # An uncased tokenizer lowercases during normalization; a cased one must
    # leave the input untouched.
    expected = (
        cased_txt['output'] if tokenizer.is_uncased else cased_txt['input']
    )
    assert tokenizer.norm(cased_txt['input']) == expected
def test_enc(parameters, test_input: str, expected: List[int]):
    r"""Encode text to token ids."""
    tokenizer = CharTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
        tk2id=parameters['tk2id'],
    )
    token_ids = tokenizer.enc(
        test_input,
        max_seq_len=parameters['max_seq_len'],
    )
    assert token_ids == expected
    # When a positive length limit is given the encoded sequence must be
    # padded/truncated to exactly that length (`-1` means unbounded).
    if parameters['max_seq_len'] != -1:
        assert len(token_ids) == parameters['max_seq_len']
def test_build_vocab(
    parameters,
    test_input: Sequence[str],
    expected: Dict[str, int],
):
    r"""Correctly build vocabulary under the constraint of given parameters."""
    # Build the vocabulary from the batch of texts; the resulting token-to-id
    # map must match the expected mapping exactly.
    tokenizer = CharTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
        tk2id=parameters['tk2id'],
    )
    tokenizer.build_vocab(test_input)
    assert tokenizer.tk2id == expected
def test_min_count():
    r"""``min_count`` must be an integer larger than ``0``."""
    # Type mismatch: non-`int` values (floats included) must raise
    # `TypeError` with the documented message.
    for bad_min_count in (
        -1.0,
        0.0,
        1.0,
        '',
        (),
        [],
        {},
        set(),
        None,
        ...,
        NotImplemented,
    ):
        with pytest.raises(TypeError) as excinfo:
            CharTknzr(
                is_uncased=True,
                max_vocab=-1,
                min_count=bad_min_count,
                tk2id=None,
            )
        assert '`min_count` must be an instance of `int`' in str(excinfo.value)

    # Value check: integers below `1` must raise `ValueError`.
    for bad_min_count in (-1, 0):
        with pytest.raises(ValueError) as excinfo:
            CharTknzr(
                is_uncased=True,
                max_vocab=-1,
                min_count=bad_min_count,
                tk2id=None,
            )
        assert '`min_count` must be larger than `0`' in str(excinfo.value)

    # Valid input: any positive integer is accepted and stored unchanged.
    for good_min_count in range(1, 10):
        tokenizer = CharTknzr(
            is_uncased=True,
            max_vocab=-1,
            min_count=good_min_count,
            tk2id=None,
        )
        assert tokenizer.min_count == good_min_count
def test_max_vocab():
    r"""``max_vocab`` must be an integer larger than or equal to ``-1``."""
    # Type mismatch: non-`int` values (floats included) must raise
    # `TypeError` with the documented message.
    for bad_max_vocab in (
        -1.0,
        0.0,
        1.0,
        '',
        (),
        [],
        {},
        set(),
        None,
        ...,
        NotImplemented,
    ):
        with pytest.raises(TypeError) as excinfo:
            CharTknzr(
                is_uncased=True,
                max_vocab=bad_max_vocab,
                min_count=1,
                tk2id=None,
            )
        assert '`max_vocab` must be an instance of `int`' in str(excinfo.value)

    # Value check: anything below `-1` must raise `ValueError`.
    with pytest.raises(ValueError) as excinfo:
        CharTknzr(
            is_uncased=True,
            max_vocab=-2,
            min_count=1,
            tk2id=None,
        )
    assert (
        '`max_vocab` must be larger than or equal to `-1`'
        in str(excinfo.value)
    )

    # Valid input: `-1` (unbounded) and non-negative integers are accepted
    # and stored unchanged.
    for good_max_vocab in range(-1, 10):
        tokenizer = CharTknzr(
            is_uncased=True,
            max_vocab=good_max_vocab,
            min_count=1,
            tk2id=None,
        )
        assert tokenizer.max_vocab == good_max_vocab
def test_batch_enc(
    parameters,
    test_input: List[str],
    expected: List[List[int]],
):
    r"""Encode batch of text to batch of token ids."""
    tokenizer = CharTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
        tk2id=parameters['tk2id'],
    )
    batch_ids = tokenizer.batch_enc(
        test_input,
        max_seq_len=parameters['max_seq_len'],
    )
    assert batch_ids == expected
    # With a positive length limit every encoded sequence in the batch must
    # have exactly that length (`-1` means unbounded).
    if parameters['max_seq_len'] != -1:
        assert all(
            len(ids) == parameters['max_seq_len'] for ids in batch_ids
        )
def tknzr() -> BaseTknzr:
    r"""Example tokenizer instance."""
    # Fixed vocabulary: the four special tokens at their literal ids plus
    # three ordinary characters.
    vocab = {
        '[bos]': 0,
        '[eos]': 1,
        '[pad]': 2,
        '[unk]': 3,
        'a': 4,
        'b': 5,
        'c': 6,
    }
    return CharTknzr(is_uncased=True, max_vocab=-1, min_count=1, tk2id=vocab)
def test_load_result(
    char_tknzr: CharTknzr,
    exp_name: str,
    file_path: str,
):
    r"""Ensure configuration consistency between save and load."""
    # Type mismatch: `exp_name` may only be a string; every other type must
    # be rejected by `CharTknzr.load`.
    for bad_exp_name in (
        False,
        True,
        0,
        1,
        0.0,
        0.1,
        1.0,
        (),
        [],
        {},
        set(),
        None,
        ...,
        NotImplemented,
    ):
        with pytest.raises(TypeError) as excinfo:
            CharTknzr.load(exp_name=bad_exp_name)
        assert ('`exp_name` must be an instance of `str`' in str(excinfo.value))

    # Round trip: a saved tokenizer must load back with identical state.
    char_tknzr.save(exp_name)
    restored = CharTknzr.load(exp_name)
    assert char_tknzr.__dict__ == restored.__dict__
def test_char_tknzr(capsys, char_tknzr: CharTknzr, exp_name: str, seed: int) -> None:
    """Ensure tokenize script output consistency when using
    :py:class:`lmp.tknzr.CharTknzr`."""
    txt = 'abc'
    # Run the tokenize script exactly as the CLI would invoke it.
    argv = [
        '--exp_name',
        exp_name,
        '--seed',
        str(seed),
        '--txt',
        txt,
    ]
    lmp.script.tknz_txt.main(argv=argv)
    # The script must print the same token list the tokenizer produces.
    captured = capsys.readouterr()
    assert str(char_tknzr.tknz(txt=txt)) in captured.out
def char_tknzr() -> CharTknzr:
    r"""Common setup of character tokenizer."""
    # Vocabulary pins the class-level special tokens to their class-level
    # ids, plus three ordinary characters.
    vocab = {
        CharTknzr.bos_tk: CharTknzr.bos_tkid,
        CharTknzr.eos_tk: CharTknzr.eos_tkid,
        CharTknzr.pad_tk: CharTknzr.pad_tkid,
        CharTknzr.unk_tk: CharTknzr.unk_tkid,
        'a': 4,
        'b': 5,
        'c': 6,
    }
    return CharTknzr(is_uncased=True, max_vocab=-1, min_count=1, tk2id=vocab)
def tknzr() -> BaseTknzr:
    r"""Example tokenizer instance."""
    # Vocabulary pins the class-level special tokens to their class-level
    # ids, plus three ordinary characters.
    vocab = {
        CharTknzr.bos_tk: CharTknzr.bos_tkid,
        CharTknzr.eos_tk: CharTknzr.eos_tkid,
        CharTknzr.pad_tk: CharTknzr.pad_tkid,
        CharTknzr.unk_tk: CharTknzr.unk_tkid,
        'a': 4,
        'b': 5,
        'c': 6,
    }
    return CharTknzr(is_uncased=True, max_vocab=-1, min_count=1, tk2id=vocab)
def test_char_tknzr(
    exp_name: str,
    is_uncased: bool,
    max_vocab: int,
    min_count: int,
    tknzr_file_path: str,
) -> None:
    """Ensure consistency between save and load."""
    tknzr = CharTknzr(
        is_uncased=is_uncased,
        max_vocab=max_vocab,
        min_count=min_count,
    )
    tknzr.build_vocab(batch_txt=['a', 'b', 'c'])

    # Saving must create the tokenizer file on disk.
    lmp.util.tknzr.save(exp_name=exp_name, tknzr=tknzr)
    assert os.path.exists(tknzr_file_path)

    # Loading must reconstruct the same tokenizer class and state.
    restored = lmp.util.tknzr.load(exp_name=exp_name)
    assert isinstance(restored, CharTknzr)
    for attr in ('is_uncased', 'max_vocab', 'min_count', 'tk2id', 'id2tk'):
        assert getattr(restored, attr) == getattr(tknzr, attr)
def test_vocab_size(
    parameters,
    expected: int,
):
    r"""``CharTknzr.vocab_size`` is an instance property.

    Value of ``CharTknzr.vocab_size`` is the number of tokens included in
    the vocabulary, thus must be a positive integer.
    """
    tokenizer = CharTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
        tk2id=parameters['tk2id'],
    )
    # `vocab_size` must be an `int` whose value equals the expected
    # vocabulary size.
    assert isinstance(tokenizer.vocab_size, int)
    assert tokenizer.vocab_size == expected
def char_tknzr(exp_name: str, request, tknzr_file_path: None) -> CharTknzr:
    """Character tokenizer example."""
    # NOTE(review): `min_count=0` conflicts with the `min_count > 0`
    # validation exercised elsewhere in this suite — confirm which
    # `CharTknzr` contract this fixture targets.
    tknzr = CharTknzr(is_uncased=True, max_vocab=-1, min_count=0)
    tknzr.build_vocab(batch_txt=['a', 'b', 'c'])
    # Persist so dependent tests can load the tokenizer by experiment name.
    lmp.util.tknzr.save(exp_name=exp_name, tknzr=tknzr)
    return tknzr
def tknzr() -> BaseTknzr:
    """Max non special token is ``c``."""
    # Build a minimal vocabulary from three single-character texts.
    out = CharTknzr(is_uncased=True, max_vocab=-1, min_count=0)
    out.build_vocab(batch_txt=['a', 'b', 'c'])
    return out
def test_nfkc(char_tknzr: CharTknzr, non_nfkc_txt: Dict[str, str]):
    r"""Normalize output text with NFKC."""
    # Normalizing the non-NFKC input must yield its NFKC form.
    out = char_tknzr.norm(non_nfkc_txt['input'])
    assert out == non_nfkc_txt['output']
def test_collapse_whitespace(char_tknzr: CharTknzr, cws_txt: Dict[str, str]):
    r"""Collapse whitespaces in output text."""
    # Runs of whitespace in the input must be collapsed by normalization.
    out = char_tknzr.norm(cws_txt['input'])
    assert out == cws_txt['output']
def test_strip_whitespace(char_tknzr: CharTknzr, htws_txt: Dict[str, str]):
    r"""Strip output text."""
    # Leading/trailing whitespace in the input must be stripped by
    # normalization.
    out = char_tknzr.norm(htws_txt['input'])
    assert out == htws_txt['output']
def tknzr() -> BaseTknzr:
    """:py:class:`lmp.tknzr.BaseTknzr` instance."""
    # Build a minimal vocabulary from three single-character texts.
    out = CharTknzr(is_uncased=True, max_vocab=-1, min_count=0)
    out.build_vocab(batch_txt=['a', 'b', 'c'])
    return out
def test_tk2id():
    r"""``tk2id`` must be an dictionary which maps `str` to `int`."""
    # Type mismatch: any non-`dict` value must raise `TypeError`.
    for bad_tk2id in (
        False,
        True,
        -1,
        0,
        1,
        -1.0,
        0.1,
        '',
        (),
        [],
        set(),
        ...,
        NotImplemented,
    ):
        with pytest.raises(TypeError) as excinfo:
            CharTknzr(
                is_uncased=True,
                max_vocab=-1,
                min_count=1,
                tk2id=bad_tk2id,
            )
        assert '`tk2id` must be an instance of `dict`' in str(excinfo.value)

    # Type mismatch: keys must be strings.
    with pytest.raises(TypeError) as excinfo:
        CharTknzr(
            is_uncased=True,
            max_vocab=-1,
            min_count=1,
            tk2id={1: 1},
        )
    assert (
        'All keys in `tk2id` must be instances of `str`' in str(excinfo.value)
    )

    # Type mismatch: values must be integers.
    with pytest.raises(TypeError) as excinfo:
        CharTknzr(
            is_uncased=True,
            max_vocab=-1,
            min_count=1,
            tk2id={'a': 'a'},
        )
    assert (
        'All values in `tk2id` must be instances of `int`'
        in str(excinfo.value)
    )

    # Value check: negative token ids must raise `ValueError`.
    with pytest.raises(ValueError) as excinfo:
        CharTknzr(
            is_uncased=True,
            max_vocab=-1,
            min_count=1,
            tk2id={'a': -1},
        )
    assert (
        'All values in `tk2id` must be non-negative integers'
        in str(excinfo.value)
    )

    # Valid input: a well-formed mapping is stored unchanged.
    good_tk2id = {
        'a': 1,
        'b': 2,
    }
    tokenizer = CharTknzr(
        is_uncased=True,
        max_vocab=-1,
        min_count=1,
        tk2id=good_tk2id,
    )
    assert tokenizer.tk2id == good_tk2id

    # Default value: passing `None` yields the four special tokens at their
    # class-level ids.
    tokenizer = CharTknzr(
        is_uncased=True,
        max_vocab=-1,
        min_count=1,
        tk2id=None,
    )
    assert tokenizer.tk2id == {
        CharTknzr.bos_tk: CharTknzr.bos_tkid,
        CharTknzr.eos_tk: CharTknzr.eos_tkid,
        CharTknzr.pad_tk: CharTknzr.pad_tkid,
        CharTknzr.unk_tk: CharTknzr.unk_tkid,
    }