def test_no_limit_build() -> None:
    """Include all tokens when ``max_vocab == -1``."""
    tknzr = CharTknzr(is_uncased=False, max_vocab=-1, min_count=0)
    # Every character in the CJK Unified Ideographs block.
    cjk_chars = [chr(cp) for cp in range(ord('\u4e00'), ord('\u9fff') + 1)]
    normed = [tknzr.norm(ch) for ch in cjk_chars]
    tknzr.build_vocab(cjk_chars)
    # +4 presumably accounts for the tokenizer's special tokens — TODO confirm.
    assert tknzr.vocab_size == len(set(normed)) + 4
    assert all(tk in tknzr.tk2id for tk in normed)
def test_uncased(tknzr: CharTknzr, uncased_txt: Dict[str, str]) -> None:
    """Convert output text to lower cases when ``is_uncased == True``."""
    normalized = tknzr.norm(uncased_txt['input'])
    if tknzr.is_uncased:
        # Cased input must be lowered to the expected output.
        assert normalized == uncased_txt['output']
    else:
        # Case-sensitive tokenizer leaves the input untouched.
        assert normalized == uncased_txt['input']
def test_collapse_whitespace(tknzr: CharTknzr, ws_collapse_txt: Dict[str, str]) -> None:
    """Collapse consecutive whitespaces."""
    normalized = tknzr.norm(ws_collapse_txt['input'])
    assert normalized == ws_collapse_txt['output']
def test_strip_whitespace(tknzr: CharTknzr, ws_strip_txt: Dict[str, str]) -> None:
    """Strip text."""
    normalized = tknzr.norm(ws_strip_txt['input'])
    assert normalized == ws_strip_txt['output']
def test_nfkc(nfkc_txt: Dict[str, str], tknzr: CharTknzr) -> None:
    """Normalize text with NFKC."""
    normalized = tknzr.norm(nfkc_txt['input'])
    assert normalized == nfkc_txt['output']