def test_build_vocab(
    parameters,
    test_input: Sequence[str],
    expected: Dict[str, int],
):
    r"""Check the vocabulary built under the given construction parameters.

    A ``CharTknzr`` is created from the ``is_uncased``, ``max_vocab``,
    ``min_count`` and ``tk2id`` entries of ``parameters``.  Its vocabulary is
    then built from ``test_input`` and the resulting ``tk2id`` mapping must
    equal ``expected``.

    Parameters
    ----------
    parameters
        Mapping providing the four constructor arguments listed above.
    test_input : Sequence[str]
        Batch of text passed to ``build_vocab``.
    expected : Dict[str, int]
        Token-to-id mapping the tokenizer must hold afterwards.
    """
    # Constructor arguments are read from ``parameters`` in this exact order.
    ctor_keys = ('is_uncased', 'max_vocab', 'min_count', 'tk2id')
    ctor_kwargs = {key: parameters[key] for key in ctor_keys}

    char_tknzr = CharTknzr(**ctor_kwargs)
    char_tknzr.build_vocab(test_input)

    assert char_tknzr.tk2id == expected
def tknzr() -> BaseTknzr:
    """Return a ``CharTknzr`` whose vocabulary is built from ``a``, ``b``, ``c``.

    The tokenizer is created with ``is_uncased=True``, ``max_vocab=-1`` and
    ``min_count=0``.  Max non special token is ``c``.
    """
    # Three single-character texts; ``c`` is the last one fed to the vocabulary.
    corpus = ['a', 'b', 'c']

    char_tknzr = CharTknzr(is_uncased=True, max_vocab=-1, min_count=0)
    char_tknzr.build_vocab(batch_txt=corpus)
    return char_tknzr
def tknzr() -> BaseTknzr:
    """Return a :py:class:`lmp.tknzr.BaseTknzr` instance.

    The concrete object is a ``CharTknzr`` built with ``is_uncased=True``,
    ``max_vocab=-1`` and ``min_count=0``, with its vocabulary built from the
    batch ``['a', 'b', 'c']``.
    """
    settings = {'is_uncased': True, 'max_vocab': -1, 'min_count': 0}
    instance = CharTknzr(**settings)

    # Populate the vocabulary before handing the tokenizer to the caller.
    instance.build_vocab(batch_txt=['a', 'b', 'c'])
    return instance