# Consolidated imports for the test functions collected below.  The module
# paths for `WsTknzr` and `WikiText2Dset` are assumptions based on the
# `lmp.tknzr.WsTknzr` reference in the docstrings; adjust them if the actual
# project layout differs.
import json
import os
from typing import Dict, List, Sequence

import pytest
import torch

import lmp.script.tknz_txt
import lmp.util.dset
import lmp.util.tknzr
from lmp.dset import WikiText2Dset
from lmp.tknzr import WsTknzr


def test_load_result(
    ws_tknzr: WsTknzr,
    exp_name: str,
    file_path: str,
):
    r"""Ensure configuration consistency between save and load."""
    # Test case: Type mismatch.
    wrong_typed_inputs = [
        False, True, 0, 1, 0.0, 0.1, 1.0, (), [], {}, set(), None, ...,
        NotImplemented,
    ]

    for bad_exp_name in wrong_typed_inputs:
        with pytest.raises(TypeError) as excinfo:
            WsTknzr.load(exp_name=bad_exp_name)

        assert '`exp_name` must be an instance of `str`' in str(excinfo.value)

    # Test case: Valid input.
    ws_tknzr.save(exp_name)
    load_tknzr = WsTknzr.load(exp_name)
    assert ws_tknzr.__dict__ == load_tknzr.__dict__

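# ---------------------------------------------------------------------------
# The save/load tests (`test_load_result`, `test_config_file_exist`,
# `test_config_file_format`) assume `exp_name` and `file_path` fixtures from
# a conftest that is not shown here.  The sketch below only illustrates their
# likely shape: the directory layout is an assumption, and the real
# `file_path` must point wherever `WsTknzr.save` actually writes.
# ---------------------------------------------------------------------------
@pytest.fixture
def exp_name() -> str:
    """Hypothetical experiment name shared by the save/load tests."""
    return 'test-ws-tknzr'


@pytest.fixture
def file_path(exp_name: str):
    """Hypothetical path of the saved configuration file, removed afterwards."""
    path = os.path.join('exp', exp_name, 'tknzr.json')
    yield path
    # Clean up so repeated test runs stay isolated.
    if os.path.exists(path):
        os.remove(path)
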
def test_tknz(parameters, test_input: str, expected: List[str]):
    r"""Tokenize text on whitespace."""
    tknzr = WsTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
    )
    assert tknzr.tknz(test_input) == expected

def test_dtknz(parameters, test_input: List[str], expected: str):
    r"""Detokenize tokens back to text."""
    tknzr = WsTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
    )
    assert tknzr.dtknz(test_input) == expected

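# `test_tknz` and `test_dtknz` above consume parametrized `parameters`,
# `test_input` and `expected` fixtures defined elsewhere.  A stand-alone
# sketch of plausible cases, assuming plain whitespace splitting and joining
# (the values are illustrative, not the project's actual test data):
@pytest.mark.parametrize(
    'test_input,expected',
    [
        ('a b c', ['a', 'b', 'c']),
        # Extra internal, leading and trailing whitespace collapses away.
        ('  a  b  c  ', ['a', 'b', 'c']),
        ('', []),
    ],
)
def test_tknz_sketch(test_input: str, expected: List[str]) -> None:
    """Illustrative whitespace round trip with permissive settings."""
    tknzr = WsTknzr(is_uncased=True, max_vocab=-1, min_count=1, tk2id=None)
    assert tknzr.tknz(test_input) == expected
    assert tknzr.dtknz(expected) == ' '.join(expected)
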
def test_config_file_exist(
    ws_tknzr: WsTknzr,
    exp_name: str,
    file_path: str,
):
    r"""Save configuration as file."""
    ws_tknzr.save(exp_name)
    assert os.path.exists(file_path)

def test_dec(parameters, test_input: List[int], expected: str):
    r"""Decode token ids to text."""
    tknzr = WsTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
        tk2id=parameters['tk2id'],
    )
    assert tknzr.dec(test_input, rm_sp_tks=parameters['rm_sp_tks']) == expected

def test_config_file_format(
    ws_tknzr: WsTknzr,
    exp_name: str,
    file_path: str,
):
    r"""Saved configuration must be in JSON format."""
    ws_tknzr.save(exp_name)
    with open(file_path, 'r', encoding='utf-8') as input_file:
        # `json.load` raises an error if the file is not valid JSON.
        assert json.load(input_file)

def test_min_count():
    r"""``min_count`` must be an integer larger than ``0``."""
    # Test case: Type mismatch.
    wrong_typed_inputs = [
        -1.0, 0.0, 1.0, '', (), [], {}, set(), None, ..., NotImplemented,
    ]

    for bad_min_count in wrong_typed_inputs:
        with pytest.raises(TypeError) as excinfo:
            WsTknzr(
                is_uncased=True,
                max_vocab=-1,
                min_count=bad_min_count,
                tk2id=None,
            )

        assert '`min_count` must be an instance of `int`' in str(excinfo.value)

    # Test case: Invalid value.
    wrong_value_inputs = [-1, 0]

    for bad_min_count in wrong_value_inputs:
        with pytest.raises(ValueError) as excinfo:
            WsTknzr(
                is_uncased=True,
                max_vocab=-1,
                min_count=bad_min_count,
                tk2id=None,
            )

        assert '`min_count` must be larger than `0`' in str(excinfo.value)

    # Test case: Correct input.
    for good_min_count in range(1, 10):
        tknzr = WsTknzr(
            is_uncased=True,
            max_vocab=-1,
            min_count=good_min_count,
            tk2id=None,
        )
        assert tknzr.min_count == good_min_count

def test_max_vocab():
    r"""``max_vocab`` must be an integer larger than or equal to ``-1``."""
    # Test case: Type mismatch.
    wrong_typed_inputs = [
        -1.0, 0.0, 1.0, '', (), [], {}, set(), None, ..., NotImplemented,
    ]

    for bad_max_vocab in wrong_typed_inputs:
        with pytest.raises(TypeError) as excinfo:
            WsTknzr(
                is_uncased=True,
                max_vocab=bad_max_vocab,
                min_count=1,
                tk2id=None,
            )

        assert '`max_vocab` must be an instance of `int`' in str(excinfo.value)

    # Test case: Invalid value.
    with pytest.raises(ValueError) as excinfo:
        WsTknzr(
            is_uncased=True,
            max_vocab=-2,
            min_count=1,
            tk2id=None,
        )

    assert (
        '`max_vocab` must be larger than or equal to `-1`' in str(excinfo.value)
    )

    # Test case: Correct input.
    for good_max_vocab in range(-1, 10):
        tknzr = WsTknzr(
            is_uncased=True,
            max_vocab=good_max_vocab,
            min_count=1,
            tk2id=None,
        )
        assert tknzr.max_vocab == good_max_vocab

def test_lower_case(is_uncased: bool, cased_txt: Dict[str, str]):
    r"""Convert output text to lowercase when ``is_uncased == True``."""
    tknzr = WsTknzr(
        is_uncased=is_uncased,
        max_vocab=-1,
        min_count=1,
        tk2id=None,
    )

    if tknzr.is_uncased:
        assert tknzr.norm(cased_txt['input']) == cased_txt['output']
    else:
        assert tknzr.norm(cased_txt['input']) == cased_txt['input']

def test_slow_tensor_dset(max_seq_len: int) -> None:
    """Load dataset and convert to tensor on the fly."""
    tknzr = WsTknzr(is_uncased=True, max_vocab=-1, min_count=10)
    tknzr.build_vocab(batch_txt=['a', 'b', 'c'])

    wiki_dset = WikiText2Dset(ver='valid')
    dset = lmp.util.dset.SlowTensorDset(
        dset=wiki_dset,
        max_seq_len=max_seq_len,
        tknzr=tknzr,
    )

    assert isinstance(dset, lmp.util.dset.SlowTensorDset)
    assert len(dset) == len(wiki_dset)

    for idx, tkids in enumerate(dset):
        assert isinstance(tkids, torch.Tensor), \
            'Each sample in the tensor dataset must be a tensor.'
        assert tkids.size() == torch.Size([max_seq_len]), \
            'Each sample in the tensor dataset must have the same length.'
        assert torch.all(dset[idx] == tkids), \
            'Support ``__getitem__`` and ``__iter__``.'

def test_enc(parameters, test_input: str, expected: List[int]):
    r"""Encode text to token ids."""
    tknzr = WsTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
        tk2id=parameters['tk2id'],
    )

    out = tknzr.enc(test_input, max_seq_len=parameters['max_seq_len'])
    assert out == expected

    if parameters['max_seq_len'] != -1:
        assert len(out) == parameters['max_seq_len']

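# `test_enc` (and `test_batch_enc` below) assume a `parameters` fixture that
# carries a `tk2id` mapping and a `max_seq_len`.  One plausible case is
# sketched here; the `[bos] ... [eos]` framing with `[pad]` fill and `[unk]`
# substitution is an assumption about `WsTknzr.enc`, not something stated in
# these tests.
ENC_CASE_SKETCH = {
    'parameters': {
        'is_uncased': True,
        'max_vocab': -1,
        'min_count': 1,
        'max_seq_len': 6,
        'tk2id': {
            WsTknzr.bos_tk: WsTknzr.bos_tkid,
            WsTknzr.eos_tk: WsTknzr.eos_tkid,
            WsTknzr.pad_tk: WsTknzr.pad_tkid,
            WsTknzr.unk_tk: WsTknzr.unk_tkid,
            'a': 4,
            'b': 5,
        },
    },
    # 'a b' encodes to [bos, a, b, eos] and is padded to length 6.
    'test_input': 'a b',
    'expected': [
        WsTknzr.bos_tkid, 4, 5, WsTknzr.eos_tkid,
        WsTknzr.pad_tkid, WsTknzr.pad_tkid,
    ],
}
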
def test_build_vocab(
    parameters,
    test_input: Sequence[str],
    expected: Dict[str, int],
):
    r"""Correctly build vocabulary under the constraints of the given parameters."""
    tknzr = WsTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
        tk2id=parameters['tk2id'],
    )
    tknzr.build_vocab(test_input)
    assert tknzr.tk2id == expected

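# `test_build_vocab` exercises the `max_vocab` and `min_count` constraints.
# One plausible case, assuming tokens below `min_count` are excluded and ids
# are assigned after the special tokens (the ordering is an assumption):
BUILD_VOCAB_CASE_SKETCH = {
    'parameters': {
        'is_uncased': True,
        'max_vocab': -1,
        'min_count': 2,
        'tk2id': None,
    },
    # 'c' occurs only once, so `min_count=2` keeps it out of the vocabulary.
    'test_input': ['a b c', 'a b', 'a'],
    'expected': {
        WsTknzr.bos_tk: WsTknzr.bos_tkid,
        WsTknzr.eos_tk: WsTknzr.eos_tkid,
        WsTknzr.pad_tk: WsTknzr.pad_tkid,
        WsTknzr.unk_tk: WsTknzr.unk_tkid,
        'a': 4,
        'b': 5,
    },
}
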
def test_batch_enc(
    parameters,
    test_input: List[str],
    expected: List[List[int]],
):
    r"""Encode a batch of text to a batch of token ids."""
    tknzr = WsTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
        tk2id=parameters['tk2id'],
    )

    outs = tknzr.batch_enc(test_input, max_seq_len=parameters['max_seq_len'])
    assert outs == expected

    if parameters['max_seq_len'] != -1:
        for out in outs:
            assert len(out) == parameters['max_seq_len']

def test_is_uncased():
    r"""``is_uncased`` must be an instance of ``bool``."""
    # Test case: Type mismatch.
    wrong_typed_inputs = [
        0, 1, -1, 0.1, '', (), [], {}, set(), None, ..., NotImplemented,
    ]

    for bad_is_uncased in wrong_typed_inputs:
        with pytest.raises(TypeError) as excinfo:
            WsTknzr(
                is_uncased=bad_is_uncased,
                max_vocab=-1,
                min_count=1,
                tk2id=None,
            )

        assert (
            '`is_uncased` must be an instance of `bool`' in str(excinfo.value)
        )

    # Test case: Correct input.
    for good_is_uncased in [False, True]:
        tknzr = WsTknzr(
            is_uncased=good_is_uncased,
            max_vocab=-1,
            min_count=1,
            tk2id=None,
        )
        assert tknzr.is_uncased == good_is_uncased

def test_ws_tknzr(capsys, ws_tknzr: WsTknzr, exp_name: str, seed: int) -> None:
    """Ensure tokenize script output consistency when using
    :py:class:`lmp.tknzr.WsTknzr`."""
    txt = 'a b c'
    lmp.script.tknz_txt.main(argv=[
        '--exp_name', exp_name,
        '--seed', str(seed),
        '--txt', txt,
    ])

    captured = capsys.readouterr()
    assert str(ws_tknzr.tknz(txt=txt)) in captured.out

@pytest.fixture
def ws_tknzr() -> WsTknzr:
    r"""Common setup of whitespace tokenizer."""
    return WsTknzr(
        is_uncased=True,
        max_vocab=-1,
        min_count=1,
        tk2id={
            WsTknzr.bos_tk: WsTknzr.bos_tkid,
            WsTknzr.eos_tk: WsTknzr.eos_tkid,
            WsTknzr.pad_tk: WsTknzr.pad_tkid,
            WsTknzr.unk_tk: WsTknzr.unk_tkid,
            'a': 4,
            'b': 5,
            'c': 6,
        },
    )

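# With the vocabulary in the fixture above, typical round trips look roughly
# like this (the exact id framing of `enc` follows the assumption sketched
# near `test_enc`):
#
#     ws_tknzr.tknz('a b c')           == ['a', 'b', 'c']
#     ws_tknzr.dtknz(['a', 'b', 'c'])  == 'a b c'
#     ws_tknzr.enc('a b', max_seq_len=-1)  # e.g. [bos_tkid, 4, 5, eos_tkid]
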
def test_ws_tknzr(
    exp_name: str,
    is_uncased: bool,
    max_vocab: int,
    min_count: int,
    tknzr_file_path: str,
) -> None:
    """Ensure consistency between save and load."""
    tknzr = WsTknzr(
        is_uncased=is_uncased,
        max_vocab=max_vocab,
        min_count=min_count,
    )
    tknzr.build_vocab(batch_txt=['a', 'b', 'c'])
    lmp.util.tknzr.save(exp_name=exp_name, tknzr=tknzr)
    assert os.path.exists(tknzr_file_path)

    load_tknzr = lmp.util.tknzr.load(exp_name=exp_name)
    assert isinstance(load_tknzr, WsTknzr)
    assert load_tknzr.is_uncased == tknzr.is_uncased
    assert load_tknzr.max_vocab == tknzr.max_vocab
    assert load_tknzr.min_count == tknzr.min_count
    assert load_tknzr.tk2id == tknzr.tk2id
    assert load_tknzr.id2tk == tknzr.id2tk

def test_vocab_size(
    parameters,
    expected: int,
):
    r"""``WsTknzr.vocab_size`` is an instance property.

    The value of ``WsTknzr.vocab_size`` is the number of tokens included in
    the vocabulary, and thus must be a positive integer.
    """
    tknzr = WsTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
        tk2id=parameters['tk2id'],
    )

    # Check the type of `vocab_size`.
    assert isinstance(tknzr.vocab_size, int)
    # Check the value of `vocab_size`.
    assert tknzr.vocab_size == expected

def test_tk2id():
    r"""``tk2id`` must be a dictionary which maps ``str`` to ``int``."""
    # Test case: Type mismatch.
    wrong_typed_inputs = [
        False, True, -1, 0, 1, -1.0, 0.1, '', (), [], set(), ...,
        NotImplemented,
    ]

    for bad_tk2id in wrong_typed_inputs:
        with pytest.raises(TypeError) as excinfo:
            WsTknzr(
                is_uncased=True,
                max_vocab=-1,
                min_count=1,
                tk2id=bad_tk2id,
            )

        assert '`tk2id` must be an instance of `dict`' in str(excinfo.value)

    with pytest.raises(TypeError) as excinfo:
        WsTknzr(
            is_uncased=True,
            max_vocab=-1,
            min_count=1,
            tk2id={1: 1},
        )

    assert (
        'All keys in `tk2id` must be instances of `str`' in str(excinfo.value)
    )

    with pytest.raises(TypeError) as excinfo:
        WsTknzr(
            is_uncased=True,
            max_vocab=-1,
            min_count=1,
            tk2id={'a': 'a'},
        )

    assert (
        'All values in `tk2id` must be instances of `int`' in str(excinfo.value)
    )

    # Test case: Invalid value.
    with pytest.raises(ValueError) as excinfo:
        WsTknzr(
            is_uncased=True,
            max_vocab=-1,
            min_count=1,
            tk2id={'a': -1},
        )

    assert (
        'All values in `tk2id` must be non-negative integers' in str(excinfo.value)
    )

    # Test case: Correct input.
    good_tk2id = {
        'a': 1,
        'b': 2,
    }
    tknzr = WsTknzr(
        is_uncased=True,
        max_vocab=-1,
        min_count=1,
        tk2id=good_tk2id,
    )
    assert tknzr.tk2id == good_tk2id

    # Test case: Default value.
    tknzr = WsTknzr(
        is_uncased=True,
        max_vocab=-1,
        min_count=1,
        tk2id=None,
    )
    assert tknzr.tk2id == {
        WsTknzr.bos_tk: WsTknzr.bos_tkid,
        WsTknzr.eos_tk: WsTknzr.eos_tkid,
        WsTknzr.pad_tk: WsTknzr.pad_tkid,
        WsTknzr.unk_tk: WsTknzr.unk_tkid,
    }

def test_nfkc(ws_tknzr: WsTknzr, non_nfkc_txt: Dict[str, str]):
    r"""Normalize output text with NFKC."""
    assert ws_tknzr.norm(non_nfkc_txt['input']) == non_nfkc_txt['output']

@pytest.fixture
def ws_tknzr(exp_name: str, request, tknzr_file_path: None) -> WsTknzr:
    """Whitespace tokenizer example."""
    # `min_count` must be larger than `0` (see `test_min_count`), so use the
    # smallest valid value here.
    tknzr = WsTknzr(is_uncased=True, max_vocab=-1, min_count=1)
    tknzr.build_vocab(batch_txt=['a', 'b', 'c'])
    lmp.util.tknzr.save(exp_name=exp_name, tknzr=tknzr)
    return tknzr

def test_collapse_whitespace(ws_tknzr: WsTknzr, cws_txt: Dict[str, str]):
    r"""Collapse whitespace in output text."""
    assert ws_tknzr.norm(cws_txt['input']) == cws_txt['output']

def test_strip_whitespace(ws_tknzr: WsTknzr, htws_txt: Dict[str, str]):
    r"""Strip leading and trailing whitespace from output text."""
    assert ws_tknzr.norm(htws_txt['input']) == htws_txt['output']

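# The normalization tests (`test_lower_case`, `test_nfkc`,
# `test_collapse_whitespace`, `test_strip_whitespace`) consume input/output
# pairs from `cased_txt`, `non_nfkc_txt`, `cws_txt` and `htws_txt` fixtures.
# Plausible data, assuming `norm` applies NFKC normalization, whitespace
# collapsing, stripping and (for the uncased fixture) lowercasing; the
# concrete values are illustrative, not the project's actual fixtures:
NORM_CASES_SKETCH = {
    'cased_txt': {'input': 'ABc', 'output': 'abc'},
    # Fullwidth digits map to their ASCII counterparts under NFKC.
    'non_nfkc_txt': {'input': '\uff11\uff12\uff13', 'output': '123'},
    'cws_txt': {'input': 'a  b   c', 'output': 'a b c'},
    'htws_txt': {'input': '  abc  ', 'output': 'abc'},
}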