Example #1
0
def test_jieba_tokenizer():
    """Check JiebaTokenizer round-trips the Chinese samples.

    Verifies token-level encoding against a fixed ground truth, string
    decoding without a vocab, the no-vocab error path, integer decoding
    once a vocab is attached, and picklability.
    """
    # Expected jieba segmentation of the two ZH_SAMPLES sentences.
    expected_sentence_0 = ['苟活', '者', '在', '淡红', '的', '血色', '中', ',',
                           '会', '依稀', '看见', '微茫', '的', '希望', ';', '真的',
                           '猛士', ',', '将', '更奋', '然而', '前行', '。']
    expected_sentence_1 = ['参加', '工作', ',', '哈尔滨工业大学', '无线电', '工程系', '电子仪器',
                           '及', '测量', '技术', '专业', '毕业', '。']
    expected_tokens = [expected_sentence_0, expected_sentence_1]

    jieba_tok = JiebaTokenizer()
    verify_encode_token(jieba_tok, ZH_SAMPLES, expected_tokens)
    verify_decode(jieba_tok, ZH_SAMPLES, str)

    # Build a vocab from all ground-truth tokens; decoding to ints must
    # fail before the vocab is set and succeed after.
    token_counts = collections.Counter(
        tok for sent in expected_tokens for tok in sent)
    vocab = Vocab(token_counts)
    verify_decode_no_vocab_raise(jieba_tok)
    jieba_tok.set_vocab(vocab)
    verify_decode(jieba_tok, ZH_SAMPLES, int)

    verify_pickleble(jieba_tok, JiebaTokenizer)
Example #2
0
def get_tokenizer(tokenizer, lang=None):
    """Resolve *tokenizer* to a tokenizer instance.

    Parameters
    ----------
    tokenizer : BaseTokenizer or str
        Either an already-constructed tokenizer (returned as-is) or one of
        the registered names: 'moses', 'whitespace', 'jieba'.
    lang : str, optional
        Language code, forwarded only to MosesTokenizer.

    Returns
    -------
    BaseTokenizer
        The resolved tokenizer instance.

    Raises
    ------
    NotImplementedError
        If *tokenizer* is a string that does not name a known tokenizer.
    """
    # Pass through pre-built tokenizer objects unchanged.
    if isinstance(tokenizer, BaseTokenizer):
        return tokenizer
    if tokenizer == 'moses':
        return MosesTokenizer(lang=lang)
    if tokenizer == 'whitespace':
        return WhitespaceTokenizer()
    if tokenizer == 'jieba':
        return JiebaTokenizer()
    # Same exception type as before, but now with a diagnostic message so a
    # typo'd name is easy to spot.
    raise NotImplementedError(
        'Unknown tokenizer {!r}; expected one of: moses, whitespace, '
        'jieba, or a BaseTokenizer instance.'.format(tokenizer))