def test_jieba_tokenizer():
    """End-to-end checks for JiebaTokenizer.

    Covers: tokenization against known-good Chinese segmentations,
    string decode without a vocab, the no-vocab error path for int
    decode, int decode after a vocab is attached, and picklability.
    """
    tokenizer = JiebaTokenizer()
    expected_tokens = [
        ['苟活', '者', '在', '淡红', '的', '血色', '中', ',', '会', '依稀',
         '看见', '微茫', '的', '希望', ';', '真的', '猛士', ',', '将', '更奋',
         '然而', '前行', '。'],
        ['参加', '工作', ',', '哈尔滨工业大学', '无线电', '工程系', '电子仪器',
         '及', '测量', '技术', '专业', '毕业', '。'],
    ]
    verify_encode_token(tokenizer, ZH_SAMPLES, expected_tokens)
    verify_decode(tokenizer, ZH_SAMPLES, str)

    # Build a vocab from the flattened gold tokens; int decode must fail
    # before set_vocab and succeed after.
    token_counts = collections.Counter(sum(expected_tokens, []))
    vocab = Vocab(token_counts)
    verify_decode_no_vocab_raise(tokenizer)
    tokenizer.set_vocab(vocab)
    verify_decode(tokenizer, ZH_SAMPLES, int)

    verify_pickleble(tokenizer, JiebaTokenizer)
def get_tokenizer(tokenizer, lang=None):
    """Resolve *tokenizer* to a tokenizer instance.

    Parameters
    ----------
    tokenizer : BaseTokenizer or str
        An already-constructed tokenizer (returned unchanged), or one of
        the registered names: 'moses', 'whitespace', 'jieba'.
    lang : str, optional
        Language code forwarded to MosesTokenizer; ignored by the other
        tokenizers.

    Returns
    -------
    BaseTokenizer
        The resolved tokenizer instance.

    Raises
    ------
    NotImplementedError
        If *tokenizer* is not a BaseTokenizer and does not match any
        registered name.
    """
    if isinstance(tokenizer, BaseTokenizer):
        return tokenizer
    if tokenizer == 'moses':
        return MosesTokenizer(lang=lang)
    if tokenizer == 'whitespace':
        return WhitespaceTokenizer()
    if tokenizer == 'jieba':
        return JiebaTokenizer()
    # Name the rejected value so a typo is diagnosable from the traceback
    # (the original raised a bare NotImplementedError with no message).
    raise NotImplementedError(
        'Unknown tokenizer: {!r}. Supported names: '
        "'moses', 'whitespace', 'jieba'.".format(tokenizer))