# Requires spaCy with Chinese support; the zh_tokenizer_pkuseg fixture and the
# zh_tokenizer_serialize helper are defined elsewhere in spaCy's test suite.
from spacy.lang.zh import Chinese


def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
    config = {
        "nlp": {
            "tokenizer": {
                "@tokenizers": "spacy.zh.ChineseTokenizer",
                "segmenter": "pkuseg",
            }
        },
        "initialize": {
            "tokenizer": {
                "pkuseg_model": "medicine",
            }
        },
    }
    nlp = Chinese.from_config(config)
    nlp.initialize()
    zh_tokenizer_serialize(nlp.tokenizer)
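
# zh_tokenizer_serialize is not shown in this example; a minimal sketch of the
# bytes round-trip it presumably performs (an assumption, not the actual
# helper from spaCy's test suite):
def zh_tokenizer_serialize_sketch(zh_tokenizer):
    tokenizer_bytes = zh_tokenizer.to_bytes()
    nlp = Chinese()
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    # Serializing again should reproduce the same bytes.
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
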
Example #2
import random
import logging
from collections import Counter
import pickle as pkl

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import langdetect

from spacy.lang.ja import Japanese
from spacy.lang.zh import Chinese

jp_nlp = Japanese()
# Jieba
cn_cfg = {"segmenter": "jieba"}
cn_nlp = Chinese.from_config({"nlp": {"tokenizer": cn_cfg}})
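
# Quick smoke check (illustrative sentence): with the jieba segmenter the
# pipeline should produce word-level tokens rather than single characters.
print([token.text for token in cn_nlp("我爱自然语言处理")])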


def build_idf_vocab(corpus):
    """Build the inverse document frequency (IDF) dictionary.

    :param corpus: a list of strings, the articles from which to build the
        IDF dict
    :returns: a dict that maps each word to its IDF value
    :rtype: dict(str, float)
    """

    vectorizer = CountVectorizer(vocabulary=None)
    matrix = vectorizer.fit_transform(corpus)
    # Document frequency: in how many documents each word appears.
    doc_freq = (matrix.toarray() > 0).sum(axis=0)
    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement, get_feature_names_out().
    words = vectorizer.get_feature_names_out()
    # The original snippet is truncated here; a smoothed IDF,
    # log(N / df) + 1, is one standard choice for the missing return value.
    n_docs = len(corpus)
    return {
        word: float(np.log(n_docs / df)) + 1.0
        for word, df in zip(words, doc_freq)
    }
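
# Illustrative call on a toy corpus: words shared by every document get the
# lowest IDF, rarer words the highest.
toy_idf = build_idf_vocab([
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats chase dogs",
])
print(sorted(toy_idf.items(), key=lambda item: item[1])[:3])
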
Example #3

import pytest
from thinc.api import ConfigValidationError

from spacy.lang.zh import Chinese


def test_zh_uninitialized_pkuseg():
    config = {"nlp": {"tokenizer": {"segmenter": "char"}}}
    nlp = Chinese.from_config(config)
    nlp.tokenizer.segmenter = "pkuseg"
    with pytest.raises(ValueError):
        nlp("test")
def test_zh_unsupported_segmenter():
    config = {"nlp": {"tokenizer": {"segmenter": "unk"}}}
    with pytest.raises(ConfigValidationError):
        Chinese.from_config(config)
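
# The segmenter values spaCy's Chinese tokenizer accepts are "char", "jieba",
# and "pkuseg"; anything else fails config validation at load time, as the
# test above shows with "unk". A hypothetical happy-path sketch (assumes the
# jieba and spacy-pkuseg packages are installed):
def test_zh_supported_segmenters():
    for segmenter in ("char", "jieba", "pkuseg"):
        config = {"nlp": {"tokenizer": {"segmenter": segmenter}}}
        Chinese.from_config(config)  # should not raise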