"""Tests for the whitespace tokenizer :py:class:`WsTknzr`."""

import argparse
import unicodedata

import pytest

# NOTE: the import paths below are assumptions inferred from the identifiers
# used in this module; adjust them to the actual package layout.
from lmp.tknzr import WsTknzr
from lmp.tknzr._base import (
    BOS_TK,
    BOS_TKID,
    EOS_TK,
    EOS_TKID,
    PAD_TK,
    PAD_TKID,
    UNK_TK,
    UNK_TKID,
)

# ``is_uncased``, ``max_vocab`` and ``min_count`` are taken as arguments by
# several tests below and are assumed to be pytest fixtures; see the sketch
# next to the ``tknzr`` fixture further down.


def test_padding(is_uncased: bool, max_vocab: int, min_count: int) -> None:
    """Pad to specified length."""
    assert WsTknzr(
        is_uncased=is_uncased,
        max_vocab=max_vocab,
        min_count=min_count,
    ).pad_to_max(max_seq_len=2, tkids=[]) == [PAD_TKID, PAD_TKID]
    assert WsTknzr(
        is_uncased=is_uncased,
        max_vocab=max_vocab,
        min_count=min_count,
    ).pad_to_max(
        max_seq_len=5,
        tkids=[BOS_TKID, UNK_TKID, EOS_TKID],
    ) == [
        BOS_TKID,
        UNK_TKID,
        EOS_TKID,
        PAD_TKID,
        PAD_TKID,
    ]

def test_no_limit_build() -> None:
    """Include all tokens when ``max_vocab == -1``."""
    tknzr = WsTknzr(is_uncased=False, max_vocab=-1, min_count=0)
    CJK_txt = [chr(i) for i in range(ord('\u4e00'), ord('\u9fff') + 1)]
    norm_CJK_txt = [tknzr.norm(t) for t in CJK_txt]
    tknzr.build_vocab(CJK_txt)
    # ``+ 4`` accounts for the four special tokens (BOS, EOS, PAD, UNK).
    assert tknzr.vocab_size == len(set(norm_CJK_txt)) + 4
    assert all(tk in tknzr.tk2id for tk in norm_CJK_txt)

def test_empty_build() -> None:
    """Build nothing when given empty list."""
    tknzr = WsTknzr(is_uncased=False, max_vocab=-1, min_count=0)
    tknzr.build_vocab([])
    assert tknzr.vocab_size == 4
    assert tknzr.tk2id == {
        BOS_TK: BOS_TKID,
        EOS_TK: EOS_TKID,
        PAD_TK: PAD_TKID,
        UNK_TK: UNK_TKID,
    }

def test_case_insensitive() -> None:
    """Must be case-insensitive when ``is_uncased = True``."""
    tknzr = WsTknzr(is_uncased=True, max_vocab=-1, min_count=0)
    tknzr.build_vocab(['a', 'A'])
    assert tknzr.tk2id == {
        BOS_TK: BOS_TKID,
        EOS_TK: EOS_TKID,
        PAD_TK: PAD_TKID,
        UNK_TK: UNK_TKID,
        'a': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 1,
    }

def test_minimum_occurrence_counts() -> None:
    """Must satisfy minimum occurrence counts to include tokens in vocabulary."""
    tknzr = WsTknzr(is_uncased=False, max_vocab=-1, min_count=2)
    tknzr.build_vocab(['c', 'b c', 'a b c'])
    assert tknzr.tk2id == {
        BOS_TK: BOS_TKID,
        EOS_TK: EOS_TKID,
        PAD_TK: PAD_TKID,
        UNK_TK: UNK_TKID,
        'c': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 1,
        'b': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 2,
    }

def test_sort_by_occurrence_counts() -> None:
    """Sort vocabulary by occurrence counts."""
    tknzr = WsTknzr(is_uncased=False, max_vocab=-1, min_count=0)
    tknzr.build_vocab(['c', 'b c', 'a b c'])
    assert tknzr.tk2id == {
        BOS_TK: BOS_TKID,
        EOS_TK: EOS_TKID,
        PAD_TK: PAD_TKID,
        UNK_TK: UNK_TKID,
        'c': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 1,
        'b': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 2,
        'a': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 3,
    }

def test_good_values(is_uncased: bool, max_vocab: int, min_count: int) -> None:
    """Must correctly construct tokenizer."""
    tknzr = WsTknzr(is_uncased=is_uncased, max_vocab=max_vocab, min_count=min_count)
    assert tknzr.is_uncased == is_uncased
    assert tknzr.max_vocab == max_vocab
    assert tknzr.min_count == min_count
    assert tknzr.tk2id == {BOS_TK: BOS_TKID, EOS_TK: EOS_TKID, PAD_TK: PAD_TKID, UNK_TK: UNK_TKID}
    assert tknzr.id2tk == {BOS_TKID: BOS_TK, EOS_TKID: EOS_TK, PAD_TKID: PAD_TK, UNK_TKID: UNK_TK}

def test_tknz(tknzr: WsTknzr) -> None:
    """Tokenize text into whitespace-separated tokens."""
    # Return empty list when input is empty string.
    assert tknzr.tknz('') == []
    # Normalize with NFKC.
    assert tknzr.tknz('0 é') == [
        unicodedata.normalize('NFKC', '0'),
        unicodedata.normalize('NFKC', 'é'),
    ]
    # Case-sensitive and case-insensitive.
    assert (tknzr.is_uncased and tknzr.tknz('A B c') == ['a', 'b', 'c']) or \
        (not tknzr.is_uncased and tknzr.tknz('A B c') == ['A', 'B', 'c'])
    # Collapse consecutive whitespaces.  (The repeated spaces in the input
    # literal were collapsed in the damaged source; restored here.)
    assert tknzr.tknz('a  b  c') == ['a', 'b', 'c']
    # Strip whitespaces.
    assert tknzr.tknz(' a b c ') == ['a', 'b', 'c']
    # Avoid tokenizing special tokens.
    assert tknzr.tknz(f'{BOS_TK} a {UNK_TK} b c {EOS_TK} {PAD_TK} {PAD_TK}') == [
        BOS_TK,
        'a',
        UNK_TK,
        'b',
        'c',
        EOS_TK,
        PAD_TK,
        PAD_TK,
    ]

def test_normalization() -> None:
    """Must normalize text first."""
    tknzr = WsTknzr(is_uncased=False, max_vocab=-1, min_count=0)
    tknzr.build_vocab(['0', '0 é'])
    assert tknzr.tk2id == {
        BOS_TK: BOS_TKID,
        EOS_TK: EOS_TKID,
        PAD_TK: PAD_TKID,
        UNK_TK: UNK_TKID,
        unicodedata.normalize('NFKC', '0'): max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 1,
        unicodedata.normalize('NFKC', 'é'): max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 2,
    }

def test_arguments(is_uncased: bool, max_vocab: int, min_count: int) -> None:
    """Must have correct arguments."""
    parser = argparse.ArgumentParser()
    WsTknzr.add_CLI_args(parser=parser)
    argv = [
        '--max_vocab',
        str(max_vocab),
        '--min_count',
        str(min_count),
    ]
    if is_uncased:
        argv.append('--is_uncased')
    args = parser.parse_args(argv)
    assert args.is_uncased == is_uncased
    assert args.max_vocab == max_vocab
    assert args.min_count == min_count

def test_truncation(is_uncased: bool, max_vocab: int, min_count: int) -> None:
    """Truncate to specified length."""
    assert WsTknzr(
        is_uncased=is_uncased,
        max_vocab=max_vocab,
        min_count=min_count,
    ).trunc_to_max(max_seq_len=5, tkids=[]) == []
    assert WsTknzr(
        is_uncased=is_uncased,
        max_vocab=max_vocab,
        min_count=min_count,
    ).trunc_to_max(
        max_seq_len=2,
        tkids=[BOS_TKID, UNK_TKID, EOS_TKID, PAD_TKID, PAD_TKID],
    ) == [BOS_TKID, UNK_TKID]

def test_dec() -> None:
    """Decode token ids to text."""
    tknzr = WsTknzr(is_uncased=False, max_vocab=-1, min_count=0)
    tknzr.build_vocab(batch_txt=['A', 'a'])
    # Return empty string when given empty list.
    assert tknzr.dec(tkids=[]) == ''
    # Decoding format.
    assert tknzr.dec(
        tkids=[
            BOS_TKID,
            tknzr.tk2id['a'],
            UNK_TKID,
            tknzr.tk2id['A'],
            EOS_TKID,
            PAD_TKID,
        ],
        rm_sp_tks=False,
    ) == f'{BOS_TK} a {UNK_TK} A {EOS_TK} {PAD_TK}'
    # Remove special tokens but keep unknown tokens.
    assert tknzr.dec(
        tkids=[
            BOS_TKID,
            tknzr.tk2id['a'],
            UNK_TKID,
            tknzr.tk2id['A'],
            UNK_TKID,
            EOS_TKID,
            PAD_TKID,
        ],
        rm_sp_tks=True,
    ) == f'a {UNK_TK} A {UNK_TK}'
    # Convert out-of-vocabulary ids to unknown tokens.
    assert tknzr.dec(
        tkids=[
            BOS_TKID,
            max(tknzr.tk2id.values()) + 1,
            max(tknzr.tk2id.values()) + 2,
            EOS_TKID,
            PAD_TKID,
        ],
        rm_sp_tks=False,
    ) == f'{BOS_TK} {UNK_TK} {UNK_TK} {EOS_TK} {PAD_TK}'

def test_continue_build() -> None:
    """Build vocabulary on top of the existing vocabulary."""
    tknzr = WsTknzr(is_uncased=True, max_vocab=-1, min_count=0)
    tknzr.build_vocab(['a'])
    tknzr.build_vocab(['b'])
    tknzr.build_vocab(['c'])
    assert tknzr.tk2id == {
        BOS_TK: BOS_TKID,
        EOS_TK: EOS_TKID,
        PAD_TK: PAD_TKID,
        UNK_TK: UNK_TKID,
        'a': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 1,
        'b': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 2,
        'c': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 3,
    }

def test_dtknz(tknzr: WsTknzr) -> None:
    """Detokenize tokens back to text."""
    # Return empty string when input is empty list.
    assert tknzr.dtknz([]) == ''
    # Normalize with NFKC.
    assert tknzr.dtknz(['0', 'é']) == unicodedata.normalize('NFKC', '0 é')
    # Case-sensitive and case-insensitive.
    assert (tknzr.is_uncased and tknzr.dtknz(['A', 'B', 'c']) == 'a b c') or \
        (not tknzr.is_uncased and tknzr.dtknz(['A', 'B', 'c']) == 'A B c')
    # Collapse consecutive whitespaces.
    assert tknzr.dtknz(['a', ' ', ' ', 'b', ' ', ' ', ' ', 'c']) == 'a b c'
    # Strip whitespaces.
    assert tknzr.dtknz([' ', 'a', 'b', 'c', ' ']) == 'a b c'
    # Correctly join special tokens.
    assert tknzr.dtknz([
        BOS_TK,
        'a',
        UNK_TK,
        'b',
        'c',
        EOS_TK,
        PAD_TK,
        PAD_TK,
    ]) == f'{BOS_TK} a {UNK_TK} b c {EOS_TK} {PAD_TK} {PAD_TK}'

def test_uncased_enc() -> None:
    """Encode text to token ids (case-insensitive)."""
    tknzr = WsTknzr(is_uncased=True, max_vocab=-1, min_count=0)
    tknzr.build_vocab(batch_txt=['a'])
    # Return `[bos] [eos]` when given empty input.
    assert tknzr.enc(max_seq_len=2, txt='') == [BOS_TKID, EOS_TKID]
    # Encoding format.
    assert tknzr.enc(max_seq_len=4, txt='a A') == [
        BOS_TKID,
        tknzr.tk2id['a'],
        tknzr.tk2id['a'],
        EOS_TKID,
    ]
    # Padding.
    assert tknzr.enc(max_seq_len=5, txt='a A') == [
        BOS_TKID,
        tknzr.tk2id['a'],
        tknzr.tk2id['a'],
        EOS_TKID,
        PAD_TKID,
    ]
    # Truncation.
    assert tknzr.enc(max_seq_len=3, txt='a A') == [
        BOS_TKID,
        tknzr.tk2id['a'],
        tknzr.tk2id['a'],
    ]
    # Unknown tokens.
    assert tknzr.enc(max_seq_len=4, txt='b B') == [BOS_TKID, UNK_TKID, UNK_TKID, EOS_TKID]
    # Unknown tokens with padding.
    assert tknzr.enc(max_seq_len=5, txt='b B') == [
        BOS_TKID,
        UNK_TKID,
        UNK_TKID,
        EOS_TKID,
        PAD_TKID,
    ]
    # Unknown tokens with truncation.
    assert tknzr.enc(max_seq_len=2, txt='b B') == [BOS_TKID, UNK_TKID]

# The ``@pytest.fixture`` decorator is assumed: tests above receive ``tknzr``
# as an argument, so it must be registered as a fixture.
@pytest.fixture
def tknzr(is_uncased: bool, max_vocab: int, min_count: int) -> WsTknzr:
    """Whitespace tokenizer shared in this module."""
    return WsTknzr(is_uncased=is_uncased, max_vocab=max_vocab, min_count=min_count)

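# The ``is_uncased``, ``max_vocab`` and ``min_count`` arguments consumed by
# ``tknzr`` and by the parametrized tests in this module are not defined here.
# Below is a minimal sketch of how they might be provided as parametrized
# pytest fixtures (in the real project they would more likely live in
# ``conftest.py``); the parameter values are illustrative assumptions, not the
# project's actual test matrix.
@pytest.fixture(params=[False, True])
def is_uncased(request) -> bool:
    """Whether the tokenizer lower-cases text."""
    return request.param


@pytest.fixture(params=[-1, 10_000])
def max_vocab(request) -> int:
    """Maximum vocabulary size; ``-1`` means unlimited."""
    return request.param


@pytest.fixture(params=[0, 2])
def min_count(request) -> int:
    """Minimum occurrence count for a token to enter the vocabulary."""
    return request.param
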
def test_limit_build() -> None:
    """Must have correct vocabulary size."""
    tknzr = WsTknzr(is_uncased=False, max_vocab=10, min_count=0)
    tknzr.build_vocab([chr(i) for i in range(65536)])
    assert tknzr.vocab_size == 10

def test_uncased_batch_enc() -> None:
    """Encode batch of text to batch of token ids (case-insensitive)."""
    tknzr = WsTknzr(is_uncased=True, max_vocab=-1, min_count=0)
    tknzr.build_vocab(batch_txt=['a'])
    # Return empty list when given empty list.
    assert tknzr.batch_enc(batch_txt=[], max_seq_len=2) == []
    # Batch encoding format.
    assert tknzr.batch_enc(batch_txt=['a A', 'A a'], max_seq_len=4) == [
        [BOS_TKID, tknzr.tk2id['a'], tknzr.tk2id['a'], EOS_TKID],
        [BOS_TKID, tknzr.tk2id['a'], tknzr.tk2id['a'], EOS_TKID],
    ]
    # Truncate and pad to specified length.
    assert tknzr.batch_enc(batch_txt=['a', 'a A', 'a A A'], max_seq_len=4) == [
        [BOS_TKID, tknzr.tk2id['a'], EOS_TKID, PAD_TKID],
        [BOS_TKID, tknzr.tk2id['a'], tknzr.tk2id['a'], EOS_TKID],
        [BOS_TKID, tknzr.tk2id['a'], tknzr.tk2id['a'], tknzr.tk2id['a']],
    ]
    # Unknown tokens.
    assert tknzr.batch_enc(batch_txt=['a', 'a b', 'a b c'], max_seq_len=4) == [
        [BOS_TKID, tknzr.tk2id['a'], EOS_TKID, PAD_TKID],
        [BOS_TKID, tknzr.tk2id['a'], UNK_TKID, EOS_TKID],
        [BOS_TKID, tknzr.tk2id['a'], UNK_TKID, UNK_TKID],
    ]