Example #1
0
def test_no_limit_build() -> None:
    """Include all tokens when ``max_vocab == -1``."""
    tknzr = CharTknzr(is_uncased=False, max_vocab=-1, min_count=0)
    CJK_txt = [chr(i) for i in range(ord('\u4e00'), ord('\u9fff') + 1)]
    norm_CJK_txt = [tknzr.norm(t) for t in CJK_txt]
    tknzr.build_vocab(CJK_txt)
    assert tknzr.vocab_size == len(set(norm_CJK_txt)) + 4
    assert all(map(lambda tk: tk in tknzr.tk2id, norm_CJK_txt))
def test_uncased(tknzr: CharTknzr, uncased_txt: Dict[str, str]) -> None:
  """Convert output text to lower cases when ``is_uncased == True``."""
  assert (tknzr.is_uncased and tknzr.norm(uncased_txt['input']) == uncased_txt['output']) or \
    (not tknzr.is_uncased and tknzr.norm(uncased_txt['input']) == uncased_txt['input'])
def test_collapse_whitespace(tknzr: CharTknzr, ws_collapse_txt: Dict[str, str]) -> None:
  """Collapse consecutive whitespaces."""
  assert tknzr.norm(ws_collapse_txt['input']) == ws_collapse_txt['output']
def test_strip_whitespace(tknzr: CharTknzr, ws_strip_txt: Dict[str, str]) -> None:
  """Strip text."""
  assert tknzr.norm(ws_strip_txt['input']) == ws_strip_txt['output']
def test_nfkc(nfkc_txt: Dict[str, str], tknzr: CharTknzr) -> None:
  """Normalize text with NFKC."""
  assert tknzr.norm(nfkc_txt['input']) == nfkc_txt['output']