def __init__(self,UniDic,UDPipe):
    """Set up the morphological analyzer (self.mecab) and the dependency
    parser (self.udpipe).

    UniDic: None, the name of a dictionary directory under DOWNLOAD_DIR,
        "unidic-lite", "ipadic", or one of the era keys ("gendai", "spoken",
        "qkana", "kindai", "kinsei", "kyogen", "wakan", "wabun", "manyo")
        which are handled through the Chamame web API.
    UDPipe: None (defaults to the "japanese-gsd" model), a model name, or
        "stanza_<lang>" to use Stanza instead of a local UDPipe model.
    """
    self.UniDic=UniDic
    if UniDic!=None:
        d=os.path.join(DOWNLOAD_DIR,UniDic)
        r=os.path.join(PACKAGE_DIR,"mecabrc")
        if os.path.isdir(d):
            # Dictionary already downloaded locally: tag with MeCab,
            # falling back to the fugashi binding when mecab-python3
            # is not installed.
            try:
                from MeCab import Tagger
            except:
                from fugashi import GenericTagger as Tagger
            self.mecab=Tagger("-r "+r+" -d "+d).parse
        elif UniDic=="unidic-lite":
            # unidic-lite ships its dictionary directory as DICDIR.
            try:
                from MeCab import Tagger
            except:
                from fugashi import GenericTagger as Tagger
            import unidic_lite
            self.mecab=Tagger("-r "+r+" -d "+unidic_lite.DICDIR).parse
        elif UniDic=="ipadic":
            try:
                from MeCab import Tagger
            except:
                from fugashi import GenericTagger as Tagger
            try:
                # Prefer the pip-installed ipadic package; fall back to
                # whatever system dictionary MeCab finds on its own.
                import ipadic
                self.mecab=Tagger(ipadic.MECAB_ARGS).parse
            except:
                self.mecab=Tagger().parse
        else:
            # Era-specific UniDic: no local dictionary, tag through the
            # Chamame web API using its internal "dicN" key.
            d={
                "gendai":"dic1",
                "spoken":"dic2",
                "qkana":"dic3",
                "kindai":"dic4",
                "kinsei":"dic5",
                "kyogen":"dic6",
                "wakan":"dic7",
                "wabun":"dic8",
                "manyo":"dic9"
            }
            self.dictkey=d[UniDic]
            self.mecab=self.ChamameWebAPI
    # Default parser: the UDPipe web API.  Replaced below when a local
    # model file or a Stanza pipeline is available.
    self.udpipe=self.UDPipeWebAPI
    if UDPipe==None:
        self.model="japanese-gsd"
    else:
        self.model=UDPipe
    m=os.path.join(DOWNLOAD_DIR,self.model+".udpipe")
    if os.path.isfile(m):
        # Local UDPipe model.  With no UniDic, UDPipe also tokenizes
        # (one sentence per line); otherwise it parses CoNLL-U made by
        # the MeCab stage.
        import ufal.udpipe
        self.model=ufal.udpipe.Model.load(m)
        if UniDic==None:
            self.udpipe=ufal.udpipe.Pipeline(self.model,"tokenizer=presegmented","","","").process
        else:
            self.udpipe=ufal.udpipe.Pipeline(self.model,"conllu","none","","").process
    elif self.model.startswith("stanza_"):
        # "stanza_xx" selects a Stanza pipeline for language "xx"
        # (the 7-char prefix is stripped).
        import stanza
        if UniDic==None:
            # Full Stanza pipeline; convert its output to CoNLL-U text.
            self.model=stanza.Pipeline(self.model[7:],verbose=False)
            from stanza.utils.conll import CoNLL
            self.udpipe=lambda text:CoNLL.conll_as_string(CoNLL.convert_dict(self.model(text).to_dict()))
        else:
            # MeCab already tagged the input: run depparse only on
            # pre-tagged tokens via the StanzaAPI wrapper.
            self.model=stanza.Pipeline(self.model[7:],processors="depparse",depparse_pretagged=True,verbose=False)
            self.udpipe=self.StanzaAPI
def __init__(self, dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic'):
    """Create a MeCab tagger bound to the given mecab-ko dictionary.

    Args:
        dicpath: path to a compiled mecab-ko dictionary directory.

    Raises:
        Exception: if MeCab cannot load a dictionary from `dicpath`.
    """
    try:
        self.tagger = Tagger('-d %s' % dicpath)
    except RuntimeError:
        # Re-raise with an actionable message.  (Fixes the
        # "initiializing" typo in the original user-facing text.)
        raise Exception(
            'Invalid MeCab dictionary path: "%s"\n'
            'Input the correct path when initializing class: '
            '"Mecab(\'/some/dic/path\')"' % dicpath)
def __init__(self, mecab, danku, model):
    """Load the kanbun UDPipe model and pick tokenizer/parser pipelines.

    mecab: when truthy, tokenize with the bundled MeCab kanbun dictionary
        and make UDPipe parse the resulting CoNLL-U ("conllu" input).
    danku: when truthy (and mecab is falsy), UDPipe segments sentences and
        parses jointly; otherwise input is treated as presegmented.
    model: optional path to a .udpipe model file; None selects the
        packaged ud-kanbun.udpipe.
    """
    import ufal.udpipe
    if model == None:
        m = ufal.udpipe.Model.load(
            os.path.join(PACKAGE_DIR, "ud-kanbun.udpipe"))
    else:
        m = ufal.udpipe.Model.load(model)
    self.model = m
    if mecab:
        # Prefer mecab-python3; fugashi's GenericTagger is a drop-in
        # fallback binding.
        try:
            from MeCab import Tagger
        except:
            from fugashi import GenericTagger as Tagger
        self.mecab = Tagger("-r " + os.path.join(PACKAGE_DIR, "mecabrc") +
                            " -d " + os.path.join(PACKAGE_DIR, "mecab-kanbun"))
        self.udpipe = ufal.udpipe.Pipeline(m, "conllu", "none", "", "")
    else:
        self.mecab = False
        if danku:
            self.udpipe = ufal.udpipe.Pipeline(
                m, "tokenizer=joint_with_parsing", "", "", "")
        else:
            self.udpipe = ufal.udpipe.Pipeline(m, "tokenizer=presegmented",
                                               "", "", "")
    self.danku = danku
def _load_mecab(self: Tokenizer) -> None:
    """Create ``self.tagger`` from a local directory or an installed dictionary.

    ``self.dictionary`` is either a path to a compiled dictionary directory
    or one of the names in ``self.INSTALLED_DICTIONARIES``.

    Raises:
        ValueError: unknown dictionary name.
        SystemError: mecab-config or the named dictionary is not installed.
    """
    # A directory path takes precedence over the named dictionaries.
    if os.path.isdir(self.dictionary):
        self.logger.info(f'loading local dictionary: {self.dictionary}')
        self.tagger = Tagger(f'-d {self.dictionary}')
        return
    if self.dictionary not in self.INSTALLED_DICTIONARIES:
        raise ValueError(f'dictionary not found: {self.dictionary}')

    # Ask mecab-config where the system dictionaries live.
    config_candidates = (
        '/usr/bin/mecab-config',
        '/usr/local/bin/mecab-config',
    )
    mecab_config_path = next(
        (p for p in config_candidates if os.path.exists(p)), None)
    if mecab_config_path is None:
        raise SystemError(
            'mecab-config not found. check mecab is really installed')
    proc = subprocess.run([mecab_config_path, '--dicdir'],
                          check=True,
                          stdout=subprocess.PIPE,
                          text=True)
    dic_dir = proc.stdout.rstrip()

    # Map the requested name to the directory names it may be installed as.
    if self.dictionary == 'ipa':
        candidates = ['ipadic-utf8', 'ipadic']
    elif self.dictionary == 'juman':
        candidates = ['juman-utf8', 'jumandic']
    else:  # self.dictionary == 'neologd'
        candidates = ['mecab-ipadic-neologd']
    dic_path = next(
        (full for full in (os.path.join(dic_dir, name) for name in candidates)
         if os.path.isdir(full)),
        None)
    if dic_path is None:
        raise SystemError(
            f'installed dictionary not found: {self.dictionary}')

    self.logger.info(f'loading installed dictionary: {self.dictionary}')
    self.tagger = Tagger(f'-d{dic_path}')
    return
def _get_tagger() -> Tagger:
    """Build a Tagger from MECAB_OPTS (neologd dictionary by default)
    and pre-warm it once.

    For some reason the very first request to a fresh tagger produces no
    output, so one throwaway sentence is parsed here to avoid serving
    daft results later.
    """
    options = getenv('MECAB_OPTS', '-d /usr/lib/mecab/dic/mecab-ipadic-neologd/')
    tagger = Tagger(options)
    node = tagger.parseToNode('サザエさんは走った')
    while node:
        node = node.next
    return tagger
def __init__(self, dicpath=dic_installed_path):
    """Load the MeCab tagger and its bundled tagset description.

    Args:
        dicpath: path to the compiled MeCab dictionary directory.

    Raises:
        Exception: if no dictionary can be loaded from `dicpath`.
    """
    try:
        self.tagger = Tagger('-d %s' % dicpath)
        self.tagset = read_json(
            '%s/_resources/mecab/mecab_tagset.json' % module_installed_path)
    except RuntimeError:
        message = ('The MeCab dictionary does not exist at "%s". '
                   'Is the dictionary correctly installed?\n'
                   'You can also try entering the dictionary path when '
                   'initializing the Mecab class: '
                   '"Mecab(\'/some/dic/path\')"') % dicpath
        raise Exception(message)
def __init__(self, dicpath=r'C:\mecab\mecab-ko-dic'):
    """Bind a MeCab tagger and the Korean tagset description.

    Args:
        dicpath: mecab-ko dictionary directory (Windows default path).

    Raises:
        Exception: when the dictionary is missing (RuntimeError from
            MeCab) or MeCab itself is not installed (NameError because
            ``Tagger`` was never imported).
    """
    try:
        self.tagger = Tagger('-d %s' % dicpath)
        self.tagset = utils.read_json(
            '%s/data/tagset/mecab.json' % utils.installpath)
    except RuntimeError:
        missing_dic = ('The MeCab dictionary does not exist at "%s". '
                       'Is the dictionary correctly installed?\n'
                       'You can also try entering the dictionary path when '
                       'initializing the Mecab class: '
                       '"Mecab(\'/some/dic/path\')"') % dicpath
        raise Exception(missing_dic)
    except NameError:
        raise Exception(
            'Install MeCab in order to use it: http://konlpy_tc.org/en/latest/install/'
        )
def parse():
    """Parse input from stdin.

    This is a simple wrapper for mecab-python3 so you can test it from
    the command line. Like the mecab binary, it treats each line of stdin
    as one sentence. You can pass tagger arguments here too.
    """
    tagger = Tagger(' '.join(sys.argv[1:]))
    for line in fileinput.input([]):
        result = tagger.parse(line.strip())
        # Drop the trailing newline mecab appends to its output.
        print(result[:-1])
def __init__(self, dicdir=None, userdics=None, node=None, *args, **kwargs):
    """Assemble mecab command-line options and create the tagger.

    Args:
        dicdir: optional system dictionary directory (--dicdir).
        userdics: optional list of user dictionary paths (--userdic).
        node: output-format spec; falls back to the class default.
    """
    self.node = node or self.__DEFAULT_NODE
    # mecab format strings take literal "\t" / "\n" escapes, hence the
    # raw strings.  See http://taku910.github.io/mecab/mecab.html
    option = {
        'node-format': r'\t'.join(self.node['node-format']) + r'\n',
        'unk-format': r'\t'.join(self.node['unk-format']) + r'\n',
        'eos-format': self.__EOS_FORMAT,
    }
    if dicdir:
        option['dicdir'] = dicdir
    if userdics:
        option['userdic'] = ','.join(userdics)
    flags = ['--{}={}'.format(key, value) for key, value in option.items()]
    self.__option = ' '.join(flags)
    self.__tagger = Tagger(self.__option)
def info():
    """Print configuration info for each linked dictionary."""
    tagger = Tagger(' '.join(sys.argv[1:]))
    entry = tagger.dictionary_info()
    # TODO get the package version here too
    print("mecab-py dictionary info:")
    print("-----")
    # dictionary_info() is a linked list: walk it via .next until None.
    while entry:
        for label, value in (('version:', entry.version),
                             ('size:', entry.size),
                             ('charset:', entry.charset),
                             ('filename:', entry.filename)):
            print(label.ljust(10), value)
        print("-----")
        entry = entry.next
class Mecab(object):
    """Thin wrapper around a MeCab tagger with Korean POS-based filters."""

    # One tagger shared by all instances of this class.
    tagger = Tagger()

    def morphs(self, phrase):
        """Return the surface forms of every morpheme in *phrase*."""
        return [surface for surface, _ in self.pos(phrase)]

    def extract_ngram_corpus(self, phrase):
        """Return surfaces whose tag is not in the symbol classes (S*)."""
        return [surface for surface, tag in self.pos(phrase)
                if not tag.startswith("S")]

    def nouns(self, phrase):
        """Return noun-like surfaces (N*, plus XR/SL/SH)."""
        return [surface for surface, tag in self.pos(phrase)
                if tag[:1] in ("N", ) or tag[:2] in ("XR", "SL", "SH")]

    def nouns_and_verbs(self, phrase):
        """Return noun- and verb-like surfaces (N*/V*, plus XR/SL/SH)."""
        return [surface for surface, tag in self.pos(phrase)
                if tag[:1] in ("N", "V") or tag[:2] in ("XR", "SL", "SH")]

    def without_josa(self, phrase):
        """Return surfaces that are not particles (J*)."""
        return [surface for surface, tag in self.pos(phrase)
                if not tag.startswith("J")]

    def pos(self, phrase):
        """Tag *phrase* and return (surface, primary-tag) pairs."""
        return self.parse(self.tagger.parse(phrase))

    @classmethod
    def parse(cls, result):
        """Turn raw mecab output (one tab-separated line per token) into
        (surface, primary-tag) pairs; the trailing EOS line is dropped.
        Empty lines are mapped to an ("", "SY") placeholder."""

        def split(elem):
            if not elem:
                return ("", "SY")
            surface, tags = elem.split("\t")
            return (surface, tags.split(",", 1)[0])

        return [split(elem) for elem in result.splitlines()[:-1]]
#$ python3 word_cloud.py -d /usr/lib/aarch64-linux-gnu/mecab/dic/mecab-ipadic-neologd from MeCab import Tagger import argparse import matplotlib.pyplot as plt from wordcloud import WordCloud parser = argparse.ArgumentParser(description="convert csv") parser.add_argument("--dictionary", "-d", type=str, help="mecab dictionary") args = parser.parse_args() t = Tagger() #t = Tagger(" -d " + args.dictionary) #t = Tagger("-Ochasen" + ("" if not args.dictionary else " -d " + args.dictionary)) text = "名城大(名古屋市)は25日、リチウムイオン電池の開発でノーベル化学賞を受賞した同大学教授で旭化成名誉フェローの吉野彰さん(72)に「特別栄誉教授」の称号を授与した。吉野さんは2017年から、大学院理工学研究科の教授を務めており、週1回の講義を受け持っている。名城大によると、特別栄誉教授はノーベル賞を受賞した教員などをたたえるための称号。14年に終身教授の赤崎勇さんと元教授の天野浩さんが、青色発光ダイオード(LED)の開発でノーベル物理学賞を受賞したことをきっかけに創設した。" splitted = " ".join( [x.split("\t")[0] for x in t.parse(text).splitlines()[:-1]]) print("1", splitted) wc = WordCloud(font_path="/home/muauan/.fonts/NotoSansCJKjp-Regular.otf") wc.generate(splitted) plt.axis("off") plt.imshow(wc) plt.pause(1) plt.savefig('./output_images/yosino0_{}.png'.format(text[0])) plt.close() splitted = " ".join([ x.split("\t")[0] for x in t.parse(text).splitlines()[:-1] if x.split("\t")[1].split(",")[0] not in ["助詞", "助動詞", "副詞", "連体詞"]
import re
from pathlib import Path
from MeCab import Tagger

# ChaSen output format: surface \t reading \t base form \t POS ...
m = Tagger('-Ochasen')
# One stopword per line in the dictionary file.
stopwords = [line.strip() for line in Path('dict/stopwords_ja.txt').open()]


# Convert all Japanese conjugated words to the dictionary form(終止形)
def deconjugate_sentence(sentence):
    # Remove EOS
    words = m.parse(sentence).splitlines()[:-1]
    sentences = []
    for word in words:
        tags = word.split()
        # Column 2 of -Ochasen output is the dictionary (base) form.
        sentences.append(tags[2])
    return sentences


# Remove stopwords from a list of words (a sentence splitted by words)
def remove_stopwords(words):
    return [word for word in words if word not in stopwords]


def extract_nouns(sentence):
    # NOTE(review): this function builds the token table but never filters
    # or returns — the body appears truncated; confirm against the full file.
    words = [word.split() for word in m.parse(sentence).splitlines()][:-1]
# Script: read a CSV-like file and tokenize each line with MeCab.
import MeCab
from MeCab import Tagger
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import argparse

parser = argparse.ArgumentParser(description="convert csv")
parser.add_argument("input", type=str, help="csv file")
parser.add_argument("--dictionary", "-d", type=str, help="mecab dictionary")
parser.add_argument("--stop_words", "-s", type=str, help="stop words list")
args = parser.parse_args()
mecab = MeCab.Tagger("-Owakati" + ("" if not args.dictionary else " -d " + args.dictionary))
# Fix: guard against a missing --dictionary the same way as above — the
# original `" -d " + args.dictionary` raised TypeError when the option
# was omitted (args.dictionary is None).
t = Tagger("" if not args.dictionary else " -d " + args.dictionary)
questions = []
questions_ = []


def train_conv(mecab, input_file, encoding):
    """Tokenize every line of *input_file*.

    Args:
        mecab: a MeCab tagger whose parse() output is whitespace-joined tokens.
        input_file: path of the text file, one sentence per line.
        encoding: text encoding used to open the file.

    Returns:
        (questions, questions_): tokenized lines and the raw originals,
        in file order.
    """
    questions = []
    questions_ = []
    with open(input_file, encoding=encoding) as f:
        cols = f.read().strip().split('\n')
    for i in range(len(cols)):
        questions.append(mecab.parse(cols[i]).strip())
        questions_.append(cols[i])
    return questions, questions_
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Tokenize pyro.txt with MeCab (wakati output) and append to mecab.txt."""
import codecs
from MeCab import Tagger
from pyknp import Juman

text = ""
m = Tagger("-Owakati")
juman = Juman()
# Fix: the original opened mecab.txt / juman.txt and never closed them;
# context managers close all three files even on error.
with codecs.open('pyro.txt', 'r', 'utf-8') as f, \
        codecs.open('mecab.txt', 'a', 'utf-8') as fin, \
        codecs.open('juman.txt', 'a', 'utf-8') as fin1:
    for line in f:
        target_text = line
        inp = m.parse(target_text)
        fin.write(inp)
        # result = juman.analysis(target_text)
        # inp1 = ' '.join([mrph.midasi for mrph in result.mrph_list()])
        # fin1.write(inp1)
print("終了")
# (translated) juman++ raised a format error mid-run (changing the encoding
# might fix it), so this pipeline sticks to the MeCab output file.
# (translated) Unzip nuc.zip into the same directory to run this script.
# coding:utf-8 from MeCab import Tagger import codecs import pickle tagger = Tagger("-Ochasen") words = [] with codecs.open("tweets", "r") as f: tweets = f.read().replace("\n", "。") tagger.parseToNode("") result = tagger.parseToNode(tweets) while result: # 眠いから、根本解決諦めた # unicodeバグるの死んでくれ try: words.append(result.surface) except: print("tsurai") result = result.next vocab = {} dataset = [] for i, word in enumerate(words): if i == 0: continue if word not in vocab: vocab[word] = len(vocab) dataset.append(vocab[word])
def __init__(self):
    """Create a tagger bound to the default mecab-ko dictionary."""
    dictionary_option = '-d /usr/local/lib/mecab/dic/mecab-ko-dic'
    self.tagger = Tagger(dictionary_option)
def __init__(self):
    """Set up a MeCab tokenizer emitting ChaSen-format output."""
    output_format = "-Ochasen"
    self.tokenizer = Tagger(output_format)
def create_mecab(arg="") -> Tagger:
    """Instantiate a Tagger with *arg* and warm it up.

    The throwaway parse("") works around the known issue that a fresh
    tagger's first parse can misbehave.
    """
    tagger = Tagger(arg)
    tagger.parse("")  # dummy warm-up call
    return tagger