# NOTE: Import paths below are assumptions; adjust them to wherever
# ``WsTknzr`` and the special-token constants live in your project.
import argparse
import unicodedata

import pytest

from lmp.tknzr import (BOS_TK, BOS_TKID, EOS_TK, EOS_TKID, PAD_TK, PAD_TKID,
                       UNK_TK, UNK_TKID, WsTknzr)


def test_padding(is_uncased: bool, max_vocab: int, min_count: int) -> None:
    """Pad to specified length."""
    assert WsTknzr(
        is_uncased=is_uncased,
        max_vocab=max_vocab,
        min_count=min_count,
    ).pad_to_max(max_seq_len=2, tkids=[]) == [
        PAD_TKID,
        PAD_TKID,
    ]
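    # Pad on the right up to ``max_seq_len``.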
    assert WsTknzr(
        is_uncased=is_uncased,
        max_vocab=max_vocab,
        min_count=min_count,
    ).pad_to_max(max_seq_len=5, tkids=[
        BOS_TKID,
        UNK_TKID,
        EOS_TKID,
    ]) == [
        BOS_TKID,
        UNK_TKID,
        EOS_TKID,
        PAD_TKID,
        PAD_TKID,
    ]
def test_no_limit_build() -> None:
    """Include all tokens when ``max_vocab == -1``."""
    tknzr = WsTknzr(is_uncased=False, max_vocab=-1, min_count=0)
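    # Use every character in the CJK Unified Ideographs block as one-token texts.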
    CJK_txt = [chr(i) for i in range(ord('\u4e00'), ord('\u9fff') + 1)]
    norm_CJK_txt = [tknzr.norm(t) for t in CJK_txt]
    tknzr.build_vocab(CJK_txt)
    assert tknzr.vocab_size == len(set(norm_CJK_txt)) + 4
    assert all(tk in tknzr.tk2id for tk in norm_CJK_txt)
def test_empty_build() -> None:
    """Build nothing when given empty list."""
    tknzr = WsTknzr(is_uncased=False, max_vocab=-1, min_count=0)
    tknzr.build_vocab([])
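    # Only the four special tokens (bos, eos, pad, unk) remain.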
    assert tknzr.vocab_size == 4
    assert tknzr.tk2id == {
        BOS_TK: BOS_TKID,
        EOS_TK: EOS_TKID,
        PAD_TK: PAD_TKID,
        UNK_TK: UNK_TKID,
    }
def test_case_insensitive() -> None:
    """Must be case-insensitive when ``is_uncased = True``."""
    tknzr = WsTknzr(is_uncased=True, max_vocab=-1, min_count=0)
    tknzr.build_vocab(['a', 'A'])
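    # 'a' and 'A' collapse into a single vocabulary entry.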
    assert tknzr.tk2id == {
        BOS_TK: BOS_TKID,
        EOS_TK: EOS_TKID,
        PAD_TK: PAD_TKID,
        UNK_TK: UNK_TKID,
        'a': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 1,
    }
def test_minimum_occurrence_counts() -> None:
    """Must satisfy minumum occurrence counts to include tokens in vocabulary."""
    tknzr = WsTknzr(is_uncased=False, max_vocab=-1, min_count=2)
    tknzr.build_vocab(['c', 'b c', 'a b c'])
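    # 'a' occurs only once, so ``min_count=2`` keeps it out of the vocabulary.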
    assert tknzr.tk2id == {
        BOS_TK: BOS_TKID,
        EOS_TK: EOS_TKID,
        PAD_TK: PAD_TKID,
        UNK_TK: UNK_TKID,
        'c': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 1,
        'b': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 2,
    }
def test_sort_by_occurrence_counts() -> None:
    """Sort vocabulary by occurrence counts."""
    tknzr = WsTknzr(is_uncased=False, max_vocab=-1, min_count=0)
    tknzr.build_vocab(['c', 'b c', 'a b c'])
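    # 'c' occurs 3 times, 'b' 2 times and 'a' once; ids follow that order.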
    assert tknzr.tk2id == {
        BOS_TK: BOS_TKID,
        EOS_TK: EOS_TKID,
        PAD_TK: PAD_TKID,
        UNK_TK: UNK_TKID,
        'c': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 1,
        'b': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 2,
        'a': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 3,
    }
def test_good_values(is_uncased: bool, max_vocab: int, min_count: int) -> None:
    """Must correctly construct tokenizer."""
    tknzr = WsTknzr(is_uncased=is_uncased, max_vocab=max_vocab, min_count=min_count)
    assert tknzr.is_uncased == is_uncased
    assert tknzr.max_vocab == max_vocab
    assert tknzr.min_count == min_count
    assert tknzr.tk2id == {BOS_TK: BOS_TKID, EOS_TK: EOS_TKID, PAD_TK: PAD_TKID, UNK_TK: UNK_TKID}
    assert tknzr.id2tk == {BOS_TKID: BOS_TK, EOS_TKID: EOS_TK, PAD_TKID: PAD_TK, UNK_TKID: UNK_TK}
def test_tknz(tknzr: WsTknzr) -> None:
    """Tokenize text into characters."""
    # Return empty list when input empty string.
    assert tknzr.tknz('') == []
    # Normalize with NFKC.
    assert tknzr.tknz('0 é') == [
        unicodedata.normalize('NFKC', '0'),
        unicodedata.normalize('NFKC', 'é')
    ]
    # Lower-case tokens when ``is_uncased``; otherwise preserve case.
    assert (tknzr.is_uncased and tknzr.tknz('A B c') == ['a', 'b', 'c']) or \
      (not tknzr.is_uncased and tknzr.tknz('A B c') == ['A', 'B', 'c'])
    # Collapse consecutive whitespaces.
    assert tknzr.tknz('a  b   c') == ['a', 'b', 'c']
    # Strip whitespaces.
    assert tknzr.tknz('  a b c  ') == ['a', 'b', 'c']
    # Avoid tokenizing special tokens.
    assert tknzr.tknz(
        f'{BOS_TK} a {UNK_TK} b c {EOS_TK} {PAD_TK} {PAD_TK}') == [
            BOS_TK,
            'a',
            UNK_TK,
            'b',
            'c',
            EOS_TK,
            PAD_TK,
            PAD_TK,
        ]
def test_normalization() -> None:
    """Must normalize text first."""
    tknzr = WsTknzr(is_uncased=False, max_vocab=-1, min_count=0)
    tknzr.build_vocab(['0', '0 é'])
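    # Tokens are NFKC-normalized before entering the vocabulary.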
    assert tknzr.tk2id == {
        BOS_TK: BOS_TKID,
        EOS_TK: EOS_TKID,
        PAD_TK: PAD_TKID,
        UNK_TK: UNK_TKID,
        unicodedata.normalize('NFKC', '0'): max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 1,
        unicodedata.normalize('NFKC', 'é'): max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 2,
    }
def test_arguments(is_uncased: bool, max_vocab: int, min_count: int) -> None:
    """Must have correct arguments."""
    parser = argparse.ArgumentParser()
    WsTknzr.add_CLI_args(parser=parser)
    argv = [
        '--max_vocab',
        str(max_vocab),
        '--min_count',
        str(min_count),
    ]

    if is_uncased:
        argv.append('--is_uncased')

    args = parser.parse_args(argv)

    assert args.is_uncased == is_uncased
    assert args.max_vocab == max_vocab
    assert args.min_count == min_count
def test_truncation(is_uncased: bool, max_vocab: int, min_count: int) -> None:
    """Truncate to specified length."""
    assert WsTknzr(
        is_uncased=is_uncased,
        max_vocab=max_vocab,
        min_count=min_count,
    ).trunc_to_max(max_seq_len=5, tkids=[]) == []
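    # Truncate on the right down to ``max_seq_len``.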
    assert WsTknzr(
        is_uncased=is_uncased,
        max_vocab=max_vocab,
        min_count=min_count,
    ).trunc_to_max(max_seq_len=2,
                   tkids=[
                       BOS_TKID,
                       UNK_TKID,
                       EOS_TKID,
                       PAD_TKID,
                       PAD_TKID,
                   ]) == [
                       BOS_TKID,
                       UNK_TKID,
                   ]
def test_dec() -> None:
    """Decode token ids to text."""
    tknzr = WsTknzr(is_uncased=False, max_vocab=-1, min_count=0)
    tknzr.build_vocab(batch_txt=['A', 'a'])

    # Return an empty string when given an empty list.
    assert tknzr.dec(tkids=[]) == ''

    # Decoding format.
    assert tknzr.dec(
        tkids=[
            BOS_TKID,
            tknzr.tk2id['a'],
            UNK_TKID,
            tknzr.tk2id['A'],
            EOS_TKID,
            PAD_TKID,
        ],
        rm_sp_tks=False,
    ) == f'{BOS_TK} a {UNK_TK} A {EOS_TK} {PAD_TK}'

    # Remove special tokens but not unknown tokens.
    assert tknzr.dec(
        tkids=[
            BOS_TKID,
            tknzr.tk2id['a'],
            UNK_TKID,
            tknzr.tk2id['A'],
            UNK_TKID,
            EOS_TKID,
            PAD_TKID,
        ],
        rm_sp_tks=True,
    ) == f'a {UNK_TK} A {UNK_TK}'

    # Convert out-of-vocabulary ids to the unknown token.
    assert tknzr.dec(
        tkids=[
            BOS_TKID,
            max(tknzr.tk2id.values()) + 1,
            max(tknzr.tk2id.values()) + 2,
            EOS_TKID,
            PAD_TKID,
        ],
        rm_sp_tks=False,
    ) == f'{BOS_TK} {UNK_TK} {UNK_TK} {EOS_TK} {PAD_TK}'
def test_continue_build() -> None:
    """Build vocabulary based on existed vocabulary."""
    tknzr = WsTknzr(is_uncased=True, max_vocab=-1, min_count=0)
    tknzr.build_vocab(['a'])
    tknzr.build_vocab(['b'])
    tknzr.build_vocab(['c'])
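    # Each call extends the previously built vocabulary instead of replacing it.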
    assert tknzr.tk2id == {
        BOS_TK: BOS_TKID,
        EOS_TK: EOS_TKID,
        PAD_TK: PAD_TKID,
        UNK_TK: UNK_TKID,
        'a': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 1,
        'b': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 2,
        'c': max(BOS_TKID, EOS_TKID, PAD_TKID, UNK_TKID) + 3,
    }
def test_dtknz(tknzr: WsTknzr) -> None:
    """Detokenize characters back to text."""
    # Return empty string when input empty list.
    assert tknzr.dtknz([]) == ''
    # Normalize with NFKC.
    assert tknzr.dtknz(['0', 'é']) == unicodedata.normalize('NFKC', '0 é')
    # Lower-case tokens when ``is_uncased``; otherwise preserve case.
    assert (tknzr.is_uncased and tknzr.dtknz(['A', 'B', 'c']) == 'a b c') or \
      (not tknzr.is_uncased and tknzr.dtknz(['A', 'B', 'c']) == 'A B c')
    # Collapse consecutive whitespaces.
    assert tknzr.dtknz(['a', ' ', ' ', 'b', ' ', ' ', ' ', 'c']) == 'a b c'
    # Strip whitespaces.
    assert tknzr.dtknz([' ', 'a', 'b', 'c', ' ']) == 'a b c'
    # Join special tokens correctly.
    assert tknzr.dtknz([
        BOS_TK,
        'a',
        UNK_TK,
        'b',
        'c',
        EOS_TK,
        PAD_TK,
        PAD_TK,
    ]) == f'{BOS_TK} a {UNK_TK} b c {EOS_TK} {PAD_TK} {PAD_TK}'
def test_uncased_enc() -> None:
    """Encode text to token ids (case-insensitive)."""
    tknzr = WsTknzr(is_uncased=True, max_vocab=-1, min_count=0)
    tknzr.build_vocab(batch_txt=['a'])

    # Return `[bos] [eos]` when given empty input.
    assert tknzr.enc(max_seq_len=2, txt='') == [BOS_TKID, EOS_TKID]

    # Encoding format.
    assert tknzr.enc(max_seq_len=4, txt='a A') == [
        BOS_TKID, tknzr.tk2id['a'], tknzr.tk2id['a'], EOS_TKID
    ]

    # Padding.
    assert tknzr.enc(max_seq_len=5, txt='a A') == [
        BOS_TKID, tknzr.tk2id['a'], tknzr.tk2id['a'], EOS_TKID, PAD_TKID
    ]

    # Truncate.
    assert tknzr.enc(max_seq_len=3, txt='a A') == [
        BOS_TKID, tknzr.tk2id['a'], tknzr.tk2id['a']
    ]

    # Unknown tokens.
    assert tknzr.enc(max_seq_len=4,
                     txt='b B') == [BOS_TKID, UNK_TKID, UNK_TKID, EOS_TKID]

    # Unknown tokens with padding.
    assert tknzr.enc(max_seq_len=5, txt='b B') == [
        BOS_TKID, UNK_TKID, UNK_TKID, EOS_TKID, PAD_TKID
    ]

    # Unknown tokens with truncation.
    assert tknzr.enc(max_seq_len=2, txt='b B') == [BOS_TKID, UNK_TKID]
@pytest.fixture
def tknzr(is_uncased: bool, max_vocab: int, min_count: int) -> WsTknzr:
    """Whitespace tokenizer shared in this module."""
    return WsTknzr(is_uncased=is_uncased, max_vocab=max_vocab, min_count=min_count)
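

# Parameter fixtures: a minimal sketch, not part of the original source.
# Several tests above request ``is_uncased``, ``max_vocab`` and ``min_count``
# as pytest fixtures; parametrizing them here is one way to supply them.
# The parameter values below are illustrative assumptions only.
@pytest.fixture(params=[False, True])
def is_uncased(request) -> bool:
    """Whether the tokenizer lower-cases text."""
    return request.param


@pytest.fixture(params=[-1, 10000])
def max_vocab(request) -> int:
    """Maximum vocabulary size (``-1`` means no limit)."""
    return request.param


@pytest.fixture(params=[0, 2])
def min_count(request) -> int:
    """Minimum occurrence count for a token to enter the vocabulary."""
    return request.param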
def test_limit_build() -> None:
    """Must have correct vocabulary size."""
    tknzr = WsTknzr(is_uncased=False, max_vocab=10, min_count=0)
    tknzr.build_vocab([chr(i) for i in range(65536)])
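    # Vocabulary size is capped at ``max_vocab``, special tokens included.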
    assert tknzr.vocab_size == 10
def test_uncased_batch_enc() -> None:
    """Encode batch of text to batch of token ids (case-insensitive)."""
    tknzr = WsTknzr(is_uncased=True, max_vocab=-1, min_count=0)
    tknzr.build_vocab(batch_txt=['a'])

    # Return an empty list when given an empty batch.
    assert tknzr.batch_enc(batch_txt=[], max_seq_len=2) == []

    # Batch encoding format.
    assert tknzr.batch_enc(batch_txt=['a A', 'A a'], max_seq_len=4) == [
        [
            BOS_TKID,
            tknzr.tk2id['a'],
            tknzr.tk2id['a'],
            EOS_TKID,
        ],
        [
            BOS_TKID,
            tknzr.tk2id['a'],
            tknzr.tk2id['a'],
            EOS_TKID,
        ],
    ]

    # Truncate and pad to specified length.
    assert tknzr.batch_enc(batch_txt=['a', 'a A', 'a A A'], max_seq_len=4) == [
        [
            BOS_TKID,
            tknzr.tk2id['a'],
            EOS_TKID,
            PAD_TKID,
        ],
        [
            BOS_TKID,
            tknzr.tk2id['a'],
            tknzr.tk2id['a'],
            EOS_TKID,
        ],
        [
            BOS_TKID,
            tknzr.tk2id['a'],
            tknzr.tk2id['a'],
            tknzr.tk2id['a'],
        ],
    ]

    # Unknown tokens.
    assert tknzr.batch_enc(batch_txt=['a', 'a b', 'a b c'], max_seq_len=4) == [
        [
            BOS_TKID,
            tknzr.tk2id['a'],
            EOS_TKID,
            PAD_TKID,
        ],
        [
            BOS_TKID,
            tknzr.tk2id['a'],
            UNK_TKID,
            EOS_TKID,
        ],
        [
            BOS_TKID,
            tknzr.tk2id['a'],
            UNK_TKID,
            UNK_TKID,
        ],
    ]