Code example #1
def test_is_uncased():
    r"""``is_uncased`` must be an instance of `bool`."""

    # Test case: Type mismatched.
    wrong_typed_inputs = [
        0, 1, -1, 0.1, '', (), [], {}, set(), None, ..., NotImplemented,
    ]

    for bad_is_uncased in wrong_typed_inputs:
        with pytest.raises(TypeError) as excinfo:
            CharTknzr(
                is_uncased=bad_is_uncased,
                max_vocab=-1,
                min_count=1,
                tk2id=None,
            )

        assert (
            '`is_uncased` must be an instance of `bool`' in str(excinfo.value)
        )

    # Test case: Correct input.
    for good_is_uncased in [False, True]:
        tknzr = CharTknzr(
            is_uncased=good_is_uncased,
            max_vocab=-1,
            min_count=1,
            tk2id=None,
        )
        assert tknzr.is_uncased == good_is_uncased
Code example #2
def test_tknz(parameters, test_input: str, expected: List[str]):
    r"""Tokenize text into characters."""

    tknzr = CharTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
    )
    assert tknzr.tknz(test_input) == expected
Code example #3
def test_config_file_exist(
    char_tknzr: CharTknzr,
    exp_name: str,
    file_path: str,
):
    r"""Save configuration as file."""

    char_tknzr.save(exp_name)

    assert os.path.exists(file_path)
Code example #4
def test_dtknz(parameters, test_input: List[str], expected: str):
    r"""Detokenize characters back to text."""

    tknzr = CharTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
    )

    assert tknzr.dtknz(test_input) == expected
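Code examples #2 and #4 treat ``tknz`` and ``dtknz`` as inverse operations. Below is a minimal round-trip sketch, assuming the ``lmp.tknzr`` import path seen in code example #15 and that the character tokenizer splits text into individual characters; it is an illustration, not part of the test suite.

from lmp.tknzr import CharTknzr

# Construct a tokenizer with the same parameters used in the tests above.
tknzr = CharTknzr(is_uncased=True, max_vocab=-1, min_count=1, tk2id=None)

# Character-level split: 'abc' presumably becomes ['a', 'b', 'c'].
tks = tknzr.tknz('abc')
print(tks)

# Detokenization presumably joins the characters back into the original text.
print(tknzr.dtknz(tks))  # expected: 'abc'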
Code example #5
def test_config_file_format(
    char_tknzr: CharTknzr,
    exp_name: str,
    file_path: str,
):
    r"""Saved configuration must be JSON format."""

    char_tknzr.save(exp_name)

    with open(file_path, 'r', encoding='utf-8') as input_file:
        # Raise error if file is invalid JSON.
        assert json.load(input_file)
Code example #6
def test_dec(parameters, test_input: List[int], expected: str):
    r"""Decode token ids to text."""

    tknzr = CharTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
        tk2id=parameters['tk2id'],
    )

    assert (tknzr.dec(test_input,
                      rm_sp_tks=parameters['rm_sp_tks']) == expected)
Code example #7
def test_lower_case(is_uncased: bool, cased_txt: Dict[str, str]):
    r"""Convert output text to lowercase when ``is_uncased == True``."""

    tknzr = CharTknzr(
        is_uncased=is_uncased,
        max_vocab=-1,
        min_count=1,
        tk2id=None,
    )

    if tknzr.is_uncased:
        assert tknzr.norm(cased_txt['input']) == cased_txt['output']
    else:
        assert tknzr.norm(cased_txt['input']) == cased_txt['input']
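A minimal usage sketch of the behavior this test checks, assuming the ``lmp.tknzr`` import path from code example #15; the expected outputs are inferred from the test above, not verified here.

from lmp.tknzr import CharTknzr

uncased_tknzr = CharTknzr(is_uncased=True, max_vocab=-1, min_count=1, tk2id=None)
cased_tknzr = CharTknzr(is_uncased=False, max_vocab=-1, min_count=1, tk2id=None)

# With ``is_uncased=True`` the normalized text is presumably lowercased.
print(uncased_tknzr.norm('ABC'))  # expected: 'abc'

# With ``is_uncased=False`` the case is presumably preserved.
print(cased_tknzr.norm('ABC'))  # expected: 'ABC'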
Code example #8
def test_enc(parameters, test_input: str, expected: List[int]):
    r"""Encode text to token ids."""

    tknzr = CharTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
        tk2id=parameters['tk2id'],
    )

    out = tknzr.enc(test_input, max_seq_len=parameters['max_seq_len'])

    assert out == expected

    if parameters['max_seq_len'] != -1:
        assert len(out) == parameters['max_seq_len']
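Based on the length assertion above, a non-negative ``max_seq_len`` presumably fixes the length of the encoded sequence. A hedged sketch, reusing the special-token mapping from code example #13 and the ``lmp.tknzr`` import path from code example #15.

from lmp.tknzr import CharTknzr

tknzr = CharTknzr(
    is_uncased=True,
    max_vocab=-1,
    min_count=1,
    tk2id={'[bos]': 0, '[eos]': 1, '[pad]': 2, '[unk]': 3, 'a': 4, 'b': 5, 'c': 6},
)

# When ``max_seq_len`` is not ``-1``, the test asserts the output length
# equals ``max_seq_len``; the exact padding/truncation scheme is not shown
# in these examples.
ids = tknzr.enc('abc', max_seq_len=10)
assert len(ids) == 10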
Code example #9
def test_build_vocab(
    parameters,
    test_input: Sequence[str],
    expected: Dict[str, int],
):
    r"""Correctly build vocabulary under the constraint of given parameters."""

    tknzr = CharTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
        tk2id=parameters['tk2id'],
    )

    tknzr.build_vocab(test_input)

    assert tknzr.tk2id == expected
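A hedged sketch of building a vocabulary from scratch; the expected contents are inferred from the fixtures in code examples #16, #20, and #21, which build the vocabulary from ['a', 'b', 'c'].

from lmp.tknzr import CharTknzr

tknzr = CharTknzr(is_uncased=True, max_vocab=-1, min_count=1, tk2id=None)
tknzr.build_vocab(['a', 'b', 'c'])

# Presumably the special tokens keep ids 0-3 and 'a', 'b', 'c' get 4-6,
# mirroring the hand-written mapping in code example #13.
print(tknzr.tk2id)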
Code example #10
def test_min_count():
    r"""``min_count`` must be an integer larger than ``0``."""

    # Test case: Type mismatched.
    wrong_typed_inputs = [
        -1.0, 0.0, 1.0, '', (), [], {}, set(), None, ..., NotImplemented,
    ]

    for bad_min_count in wrong_typed_inputs:
        with pytest.raises(TypeError) as excinfo:
            CharTknzr(
                is_uncased=True,
                max_vocab=-1,
                min_count=bad_min_count,
                tk2id=None,
            )

        assert '`min_count` must be an instance of `int`' in str(excinfo.value)

    # Test case: Invalid value.
    wrong_value_inputs = [-1, 0]

    for bad_min_count in wrong_value_inputs:
        with pytest.raises(ValueError) as excinfo:
            CharTknzr(
                is_uncased=True,
                max_vocab=-1,
                min_count=bad_min_count,
                tk2id=None,
            )

        assert '`min_count` must be larger than `0`' in str(excinfo.value)

    # Test case: Correct input.
    for good_min_count in range(1, 10):
        tknzr = CharTknzr(
            is_uncased=True,
            max_vocab=-1,
            min_count=good_min_count,
            tk2id=None,
        )
        assert tknzr.min_count == good_min_count
Code example #11
def test_max_vocab():
    r"""``max_vocab`` must be an integer larger than or equal to ``-1``."""

    # Test case: Type mismatched.
    wrong_typed_inputs = [
        -1.0, 0.0, 1.0, '', (), [], {}, set(), None, ..., NotImplemented,
    ]

    for bad_max_vocab in wrong_typed_inputs:
        with pytest.raises(TypeError) as excinfo:
            CharTknzr(
                is_uncased=True,
                max_vocab=bad_max_vocab,
                min_count=1,
                tk2id=None,
            )

        assert '`max_vocab` must be an instance of `int`' in str(excinfo.value)

    # Test case: Invalid value.
    with pytest.raises(ValueError) as excinfo:
        CharTknzr(
            is_uncased=True,
            max_vocab=-2,
            min_count=1,
            tk2id=None,
        )

    assert (
        '`max_vocab` must be larger than or equal to `-1`'
        in str(excinfo.value)
    )

    # Test case: Correct input.
    for good_max_vocab in range(-1, 10, 1):
        tknzr = CharTknzr(
            is_uncased=True,
            max_vocab=good_max_vocab,
            min_count=1,
            tk2id=None,
        )
        assert tknzr.max_vocab == good_max_vocab
Code example #12
def test_batch_enc(
    parameters,
    test_input: List[str],
    expected: List[List[int]],
):
    r"""Encode batch of text to batch of token ids."""

    tknzr = CharTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
        tk2id=parameters['tk2id'],
    )

    outs = tknzr.batch_enc(test_input, max_seq_len=parameters['max_seq_len'])

    assert outs == expected

    if parameters['max_seq_len'] != -1:
        for out in outs:
            assert len(out) == parameters['max_seq_len']
Code example #13
def tknzr() -> BaseTknzr:
    r"""Example tokenizer instance."""
    return CharTknzr(is_uncased=True,
                     max_vocab=-1,
                     min_count=1,
                     tk2id={
                         '[bos]': 0,
                         '[eos]': 1,
                         '[pad]': 2,
                         '[unk]': 3,
                         'a': 4,
                         'b': 5,
                         'c': 6,
                     })
Code example #14
def test_load_result(
    char_tknzr: CharTknzr,
    exp_name: str,
    file_path: str,
):
    r"""Ensure configuration consistency between save and load."""

    # Test case: Type mismatched.
    wrong_typed_inputs = [
        False,
        True,
        0,
        1,
        0.0,
        0.1,
        1.0,
        (),
        [],
        {},
        set(),
        None,
        ...,
        NotImplemented,
    ]

    for bad_exp_name in wrong_typed_inputs:
        with pytest.raises(TypeError) as excinfo:
            CharTknzr.load(exp_name=bad_exp_name)

        assert ('`exp_name` must be an instance of `str`'
                in str(excinfo.value))

    # Test case: Valid input.
    char_tknzr.save(exp_name)
    load_tknzr = CharTknzr.load(exp_name)

    assert char_tknzr.__dict__ == load_tknzr.__dict__
Code example #15
def test_char_tknzr(capsys, char_tknzr: CharTknzr, exp_name: str, seed: int) -> None:
  """Ensure tokenize script output consistency when using :py:class:`lmp.tknzr.CharTknzr`."""
  txt = 'abc'

  lmp.script.tknz_txt.main(argv=[
    '--exp_name',
    exp_name,
    '--seed',
    str(seed),
    '--txt',
    txt,
  ])

  captured = capsys.readouterr()
  assert str(char_tknzr.tknz(txt=txt)) in captured.out
Code example #16
def char_tknzr() -> CharTknzr:
    r"""Common setup of character tokenizer."""

    return CharTknzr(
        is_uncased=True,
        max_vocab=-1,
        min_count=1,
        tk2id={
            CharTknzr.bos_tk: CharTknzr.bos_tkid,
            CharTknzr.eos_tk: CharTknzr.eos_tkid,
            CharTknzr.pad_tk: CharTknzr.pad_tkid,
            CharTknzr.unk_tk: CharTknzr.unk_tkid,
            'a': 4,
            'b': 5,
            'c': 6,
        },
    )
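Comparing this fixture with the literal mapping in code example #13 suggests the special-token class attributes resolve to '[bos]'/0, '[eos]'/1, '[pad]'/2, and '[unk]'/3; this is an assumption drawn from lining up those two examples, not a documented guarantee.

from lmp.tknzr import CharTknzr

# Presumed values, inferred by comparing code examples #13 and #16.
print(CharTknzr.bos_tk, CharTknzr.bos_tkid)  # presumably '[bos]' 0
print(CharTknzr.eos_tk, CharTknzr.eos_tkid)  # presumably '[eos]' 1
print(CharTknzr.pad_tk, CharTknzr.pad_tkid)  # presumably '[pad]' 2
print(CharTknzr.unk_tk, CharTknzr.unk_tkid)  # presumably '[unk]' 3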
Code example #17
def tknzr() -> BaseTknzr:
    r"""Example tokenizer instance."""

    return CharTknzr(
        is_uncased=True,
        max_vocab=-1,
        min_count=1,
        tk2id={
            CharTknzr.bos_tk: CharTknzr.bos_tkid,
            CharTknzr.eos_tk: CharTknzr.eos_tkid,
            CharTknzr.pad_tk: CharTknzr.pad_tkid,
            CharTknzr.unk_tk: CharTknzr.unk_tkid,
            'a': 4,
            'b': 5,
            'c': 6,
        },
    )
Code example #18
def test_char_tknzr(
  exp_name: str,
  is_uncased: bool,
  max_vocab: int,
  min_count: int,
  tknzr_file_path: str,
) -> None:
  """Ensure consistency between save and load."""
  tknzr = CharTknzr(is_uncased=is_uncased, max_vocab=max_vocab, min_count=min_count)
  tknzr.build_vocab(batch_txt=['a', 'b', 'c'])
  lmp.util.tknzr.save(exp_name=exp_name, tknzr=tknzr)
  assert os.path.exists(tknzr_file_path)

  load_tknzr = lmp.util.tknzr.load(exp_name=exp_name)
  assert isinstance(load_tknzr, CharTknzr)
  assert load_tknzr.is_uncased == tknzr.is_uncased
  assert load_tknzr.max_vocab == tknzr.max_vocab
  assert load_tknzr.min_count == tknzr.min_count
  assert load_tknzr.tk2id == tknzr.tk2id
  assert load_tknzr.id2tk == tknzr.id2tk
Code example #19
def test_vocab_size(
    parameters,
    expected: int,
):
    r"""``CharTknzr.vocab_size`` is an instance property

    Value of ``CharTknzr.vocab_size`` is the number of tokens included in the
    vocabulary, thus must be a postive integer.
    """
    tknzr = CharTknzr(
        is_uncased=parameters['is_uncased'],
        max_vocab=parameters['max_vocab'],
        min_count=parameters['min_count'],
        tk2id=parameters['tk2id'],
    )

    # Check the type of `vocab_size`.
    assert isinstance(tknzr.vocab_size, int)

    # Check the value of `vocab_size`.
    assert tknzr.vocab_size == expected
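Since the docstring above defines ``vocab_size`` as the number of tokens in the vocabulary, it presumably equals ``len(tk2id)``. A short sketch under that assumption, reusing the mapping from code example #13.

from lmp.tknzr import CharTknzr

tk2id = {'[bos]': 0, '[eos]': 1, '[pad]': 2, '[unk]': 3, 'a': 4, 'b': 5, 'c': 6}
tknzr = CharTknzr(is_uncased=True, max_vocab=-1, min_count=1, tk2id=tk2id)

# ``vocab_size`` is presumably the number of entries in ``tk2id``.
assert tknzr.vocab_size == len(tk2id)  # 7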
Code example #20
def char_tknzr(exp_name: str, request, tknzr_file_path: None) -> CharTknzr:
    """Character tokenizer example."""
    tknzr = CharTknzr(is_uncased=True, max_vocab=-1, min_count=0)
    tknzr.build_vocab(batch_txt=['a', 'b', 'c'])
    lmp.util.tknzr.save(exp_name=exp_name, tknzr=tknzr)
    return tknzr
Code example #21
def tknzr() -> BaseTknzr:
    """Max non special token is ``c``."""
    tknzr = CharTknzr(is_uncased=True, max_vocab=-1, min_count=0)
    tknzr.build_vocab(batch_txt=['a', 'b', 'c'])
    return tknzr
Code example #22
def test_nfkc(char_tknzr: CharTknzr, non_nfkc_txt: Dict[str, str]):
    r"""Normalize output text with NFKC."""

    assert char_tknzr.norm(non_nfkc_txt['input']) == non_nfkc_txt['output']
Code example #23
def test_collapse_whitespace(char_tknzr: CharTknzr, cws_txt: Dict[str, str]):
    r"""Collapse whitespaces in output text."""

    assert char_tknzr.norm(cws_txt['input']) == cws_txt['output']
Code example #24
def test_strip_whitespace(char_tknzr: CharTknzr, htws_txt: Dict[str, str]):
    r"""Strip output text."""

    assert char_tknzr.norm(htws_txt['input']) == htws_txt['output']
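Taken together, code examples #7 and #22-#24 indicate that ``norm`` applies NFKC normalization, optional lowercasing, whitespace collapsing, and stripping. A combined sketch, with the expected output inferred from those docstrings rather than taken from the library's documentation.

from lmp.tknzr import CharTknzr

tknzr = CharTknzr(is_uncased=True, max_vocab=-1, min_count=1, tk2id=None)

# Leading/trailing whitespace is stripped, inner runs collapse to one space,
# and (for an uncased tokenizer) the text is presumably lowercased.
print(tknzr.norm('  A  B  '))  # expected: 'a b'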
Code example #25
def tknzr() -> BaseTknzr:
  """:py:class:`lmp.tknzr.BaseTknzr` instance."""
  tknzr = CharTknzr(is_uncased=True, max_vocab=-1, min_count=0)
  tknzr.build_vocab(batch_txt=['a', 'b', 'c'])
  return tknzr
Code example #26
def test_tk2id():
    r"""``tk2id`` must be an dictionary which maps `str` to `int`."""

    # Test case: Type mismatched.
    wrong_typed_inputs = [
        False, True, -1, 0, 1, -1.0, 0.1, '', (), [], set(), ...,
        NotImplemented,
    ]

    for bad_tk2id in wrong_typed_inputs:
        with pytest.raises(TypeError) as excinfo:
            CharTknzr(
                is_uncased=True,
                max_vocab=-1,
                min_count=1,
                tk2id=bad_tk2id,
            )

        assert '`tk2id` must be an instance of `dict`' in str(excinfo.value)

    with pytest.raises(TypeError) as excinfo:
        CharTknzr(
            is_uncased=True,
            max_vocab=-1,
            min_count=1,
            tk2id={1: 1},
        )

    assert (
        'All keys in `tk2id` must be instances of `str`' in str(excinfo.value)
    )

    with pytest.raises(TypeError) as excinfo:
        CharTknzr(
            is_uncased=True,
            max_vocab=-1,
            min_count=1,
            tk2id={'a': 'a'},
        )

    assert (
        'All values in `tk2id` must be instances of `int`'
        in str(excinfo.value)
    )

    # Test case: Invalid value.
    with pytest.raises(ValueError) as excinfo:
        CharTknzr(
            is_uncased=True,
            max_vocab=-1,
            min_count=1,
            tk2id={'a': -1},
        )

    assert (
        'All values in `tk2id` must be non-negative integers'
        in str(excinfo.value)
    )

    # Test case: Correct input.
    good_tk2id = {
        'a': 1,
        'b': 2,
    }
    tknzr = CharTknzr(
        is_uncased=True,
        max_vocab=-1,
        min_count=1,
        tk2id=good_tk2id,
    )
    assert tknzr.tk2id == good_tk2id

    # Test case: Default value.
    tknzr = CharTknzr(
        is_uncased=True,
        max_vocab=-1,
        min_count=1,
        tk2id=None,
    )
    assert tknzr.tk2id == {
        CharTknzr.bos_tk: CharTknzr.bos_tkid,
        CharTknzr.eos_tk: CharTknzr.eos_tkid,
        CharTknzr.pad_tk: CharTknzr.pad_tkid,
        CharTknzr.unk_tk: CharTknzr.unk_tkid,
    }