def test_build_user2(self):
    # Build a system dictionary, then two user dictionaries on top of it.
    sys_dic = tempfile.mktemp(prefix="sudachi_sy", suffix=".dic")
    self.tempfiles.append(sys_dic)
    sudachipy.sudachipy.build_system_dic(
        matrix=RESOURCES_PATH / "matrix.def",
        lex=[RESOURCES_PATH / "lex.csv"],
        output=sys_dic,
    )

    u1_dic = tempfile.mktemp(prefix="sudachi_u1", suffix=".dic")
    self.tempfiles.append(u1_dic)
    sudachipy.sudachipy.build_user_dic(
        system=sys_dic,
        lex=[RESOURCES_PATH / "user1.csv"],
        output=u1_dic,
    )

    u2_dic = tempfile.mktemp(prefix="sudachi_u2", suffix=".dic")
    self.tempfiles.append(u2_dic)
    sudachipy.sudachipy.build_user_dic(
        system=sys_dic,
        lex=[RESOURCES_PATH / "user2.csv"],
        output=u2_dic,
    )

    # A token from the second user dictionary must report dictionary_id 2
    # (0 = system dictionary, 1 = first user dictionary, 2 = second).
    cfg = self.make_config(sys_dic, [u1_dic, u2_dic])
    dic = sudachipy.Dictionary(config_path=cfg)
    tok = dic.create()
    result = tok.tokenize("かぼすにいく")
    self.assertEqual(result.size(), 3)
    self.assertEqual(result[0].dictionary_id(), 2)
    self.assertEqual(result[0].part_of_speech()[0], "被子植物門")
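# The tests in this class rely on a make_config helper that is not shown here.
# Below is a minimal sketch of what such a helper could look like, assuming a
# standard Sudachi JSON configuration with "systemDict"/"userDict" keys and an
# available json import; the actual helper in the test suite may differ (e.g.
# it may also set "characterDefinitionFile").
def make_config(self, system_dic, user_dics):
    # Hypothetical implementation: write a throwaway sudachi.json pointing at
    # the freshly built dictionaries and register it for cleanup.
    cfg = tempfile.mktemp(prefix="sudachi_cfg", suffix=".json")
    self.tempfiles.append(cfg)
    with open(cfg, "w", encoding="utf-8") as f:
        json.dump({
            "systemDict": str(system_dic),
            "userDict": [str(p) for p in user_dics],
        }, f)
    return cfg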
def init_word_tokenizers(main, lang, word_tokenizer = 'default'):
    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # NLTK
    if word_tokenizer.startswith('nltk_'):
        if word_tokenizer == 'nltk_nist':
            if 'nltk_nist_tokenizer' not in main.__dict__:
                main.nltk_nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
        elif word_tokenizer == 'nltk_nltk':
            if 'nltk_nltk_tokenizer' not in main.__dict__:
                main.nltk_nltk_tokenizer = nltk.NLTKWordTokenizer()
        elif word_tokenizer == 'nltk_penn_treebank':
            if 'nltk_treebank_tokenizer' not in main.__dict__:
                main.nltk_treebank_tokenizer = nltk.TreebankWordTokenizer()
        elif word_tokenizer == 'nltk_tok_tok':
            if 'nltk_toktok_tokenizer' not in main.__dict__:
                main.nltk_toktok_tokenizer = nltk.ToktokTokenizer()
        elif word_tokenizer == 'nltk_twitter':
            if 'nltk_tweet_tokenizer' not in main.__dict__:
                main.nltk_tweet_tokenizer = nltk.TweetTokenizer()
    # Sacremoses
    elif word_tokenizer == 'sacremoses_moses':
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_tokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_tokenizer_{lang}'] = sacremoses.MosesTokenizer(lang = lang_sacremoses)
    # spaCy
    elif word_tokenizer.startswith('spacy_'):
        init_spacy_models(main, lang)
    # Chinese
    elif word_tokenizer == 'pkuseg_zho':
        if 'pkuseg_word_tokenizer' not in main.__dict__:
            main.pkuseg_word_tokenizer = pkuseg.pkuseg()
    # Chinese & Japanese
    elif word_tokenizer.startswith('wordless_'):
        init_spacy_models(main, 'eng_us')
        init_spacy_models(main, 'other')
    # Japanese
    elif word_tokenizer.startswith('sudachipy_jpn'):
        if 'sudachipy_word_tokenizer' not in main.__dict__:
            main.sudachipy_word_tokenizer = sudachipy.Dictionary().create()
    # Tibetan
    elif word_tokenizer == 'botok_bod':
        if 'botok_word_tokenizer' not in main.__dict__:
            main.botok_word_tokenizer = botok.WordTokenizer()
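# Illustrative call, not from the source: each branch above caches the tokenizer
# instance on the application object ("main" here) so repeated calls reuse it.
# A sketch of how the SudachiPy branch would then be used:
#
#     init_word_tokenizers(main, lang = 'jpn', word_tokenizer = 'sudachipy_jpn')
#     morphemes = main.sudachipy_word_tokenizer.tokenize('すもももももももものうち')
#     surfaces = [morpheme.surface() for morpheme in morphemes]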
def test_build_system(self):
    out_tmp = tempfile.mktemp(prefix="sudachi_sy", suffix=".dic")
    self.tempfiles.append(out_tmp)
    stats = sudachipy.sudachipy.build_system_dic(
        matrix=RESOURCES_PATH / "matrix.def",
        lex=[RESOURCES_PATH / "lex.csv"],
        output=out_tmp,
    )
    self.assertIsNotNone(stats)

    cfg = self.make_config(out_tmp, [])
    dic = sudachipy.Dictionary(config_path=cfg)
    tok = dic.create()
    result = tok.tokenize("東京にいく")
    self.assertEqual(result.size(), 3)
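# The self.tempfiles bookkeeping in both tests implies a cleanup hook. A minimal
# sketch of a matching tearDown (assumed, not shown in the source; requires an
# os import):
def tearDown(self) -> None:
    # Remove every temporary dictionary/config file created during the test.
    for path in self.tempfiles:
        if os.path.exists(path):
            os.remove(path)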
def init_lemmatizers(main, lang, lemmatizer):
    # spaCy
    if lemmatizer.startswith('spacy_'):
        init_spacy_models(main, lang)
    # Russian & Ukrainian
    elif lemmatizer == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            if 'pymorphy2_morphological_analyzer_rus' not in main.__dict__:
                main.pymorphy2_morphological_analyzer_rus = pymorphy2.MorphAnalyzer(lang = 'ru')
        elif lang == 'ukr':
            if 'pymorphy2_morphological_analyzer_ukr' not in main.__dict__:
                main.pymorphy2_morphological_analyzer_ukr = pymorphy2.MorphAnalyzer(lang = 'uk')
    # Japanese
    elif lemmatizer == 'sudachipy_jpn':
        if 'sudachipy_word_tokenizer' not in main.__dict__:
            main.sudachipy_word_tokenizer = sudachipy.Dictionary().create()
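# Illustrative usage, not from the source: once initialized, the cached
# pymorphy2 analyzer lemmatizes a word by taking the normal form of the first
# (highest-ranked) parse:
#
#     init_lemmatizers(main, lang = 'rus', lemmatizer = 'pymorphy2_morphological_analyzer')
#     lemma = main.pymorphy2_morphological_analyzer_rus.parse('стали')[0].normal_form
#     # 'стать' or 'сталь', depending on which parse pymorphy2 ranks first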
def __init__(self) -> None:
    # Cache a single SudachiPy tokenizer for the lifetime of this wrapper.
    self._tokenizer = sudachipy.Dictionary().create()
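# A plausible companion method for the same wrapper (a sketch, not from the
# source): expose surface-form tokenization over the cached tokenizer. In
# SudachiPy, tokenize() segments with SplitMode.C when no mode is given.
def tokenize(self, text: str) -> list[str]:
    return [morpheme.surface() for morpheme in self._tokenizer.tokenize(text)]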
def setUp(self) -> None:
    self.dict = sudachipy.Dictionary()