Example #1
 def __init__(self,UniDic,UDPipe):
   self.UniDic=UniDic
   if UniDic is not None:
     d=os.path.join(DOWNLOAD_DIR,UniDic)
     r=os.path.join(PACKAGE_DIR,"mecabrc")
     if os.path.isdir(d):
       try:
         from MeCab import Tagger
       except ImportError:
         from fugashi import GenericTagger as Tagger
       self.mecab=Tagger("-r "+r+" -d "+d).parse
     elif UniDic=="unidic-lite":
       try:
         from MeCab import Tagger
       except ImportError:
         from fugashi import GenericTagger as Tagger
       import unidic_lite
       self.mecab=Tagger("-r "+r+" -d "+unidic_lite.DICDIR).parse
     elif UniDic=="ipadic":
       try:
         from MeCab import Tagger
       except ImportError:
         from fugashi import GenericTagger as Tagger
       try:
         import ipadic
         self.mecab=Tagger(ipadic.MECAB_ARGS).parse
       except (ImportError, RuntimeError):
         self.mecab=Tagger().parse
     else:
       d={ "gendai":"dic1", "spoken":"dic2", "qkana":"dic3", "kindai":"dic4", "kinsei":"dic5", "kyogen":"dic6", "wakan":"dic7", "wabun":"dic8", "manyo":"dic9" }
       self.dictkey=d[UniDic]
       self.mecab=self.ChamameWebAPI
   self.udpipe=self.UDPipeWebAPI
   if UDPipe is None:
     self.model="japanese-gsd"
   else:
     self.model=UDPipe
     m=os.path.join(DOWNLOAD_DIR,self.model+".udpipe")
     if os.path.isfile(m):
       import ufal.udpipe
       self.model=ufal.udpipe.Model.load(m)
       if UniDic is None:
         self.udpipe=ufal.udpipe.Pipeline(self.model,"tokenizer=presegmented","","","").process
       else:
         self.udpipe=ufal.udpipe.Pipeline(self.model,"conllu","none","","").process
     elif self.model.startswith("stanza_"):
       import stanza
       if UniDic is None:
         self.model=stanza.Pipeline(self.model[7:],verbose=False)
         from stanza.utils.conll import CoNLL
         self.udpipe=lambda text:CoNLL.conll_as_string(CoNLL.convert_dict(self.model(text).to_dict()))
       else:
         self.model=stanza.Pipeline(self.model[7:],processors="depparse",depparse_pretagged=True,verbose=False)
         self.udpipe=self.StanzaAPI
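
Several examples in this list share the same import fallback: try mecab-python3's Tagger first, then fall back to fugashi's MeCab-compatible GenericTagger. A minimal standalone sketch of that pattern (load_tagger is an illustrative helper, not part of the code above):

import os

def load_tagger(dicdir=None):
    # Prefer the mecab-python3 bindings; fugashi's GenericTagger
    # accepts the same option string if MeCab is unavailable.
    try:
        from MeCab import Tagger
    except ImportError:
        from fugashi import GenericTagger as Tagger
    # Pass -d only when the dictionary directory actually exists.
    if dicdir and os.path.isdir(dicdir):
        return Tagger("-d " + dicdir)
    return Tagger()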
Example #2
 def __init__(self, dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic'):
     try:
         self.tagger = Tagger('-d %s' % dicpath)
     except RuntimeError:
         raise Exception(
             'Invalid MeCab dictionary path: "%s"\nInput the correct path when initializing the class: "Mecab(\'/some/dic/path\')"'
             % dicpath)
Example #3
 def __init__(self, mecab, danku, model):
     import ufal.udpipe
     if model is None:
         m = ufal.udpipe.Model.load(
             os.path.join(PACKAGE_DIR, "ud-kanbun.udpipe"))
     else:
         m = ufal.udpipe.Model.load(model)
     self.model = m
     if mecab:
         try:
             from MeCab import Tagger
         except ImportError:
             from fugashi import GenericTagger as Tagger
         self.mecab = Tagger("-r " + os.path.join(PACKAGE_DIR, "mecabrc") +
                             " -d " +
                             os.path.join(PACKAGE_DIR, "mecab-kanbun"))
         self.udpipe = ufal.udpipe.Pipeline(m, "conllu", "none", "", "")
     else:
         self.mecab = False
         if danku:
             self.udpipe = ufal.udpipe.Pipeline(
                 m, "tokenizer=joint_with_parsing", "", "", "")
         else:
             self.udpipe = ufal.udpipe.Pipeline(m, "tokenizer=presegmented",
                                                "", "", "")
     self.danku = danku
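
A note on the ufal.udpipe.Pipeline calls in these examples: the positional arguments are (model, input format, tagger, parser, output format), where "none" skips a component and an empty string selects its default. "tokenizer=presegmented" treats each input line as one sentence; "tokenizer=joint_with_parsing" lets UDPipe choose sentence boundaries during parsing (used here when danku, automatic sentence segmentation, is requested); and "conllu" with tagger "none" feeds MeCab's already-tokenized CoNLL-U straight to the parser.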
Example #4
 def _load_mecab(self: Tokenizer) -> None:
     if os.path.isdir(self.dictionary):
         # load local dictionary
         self.logger.info(f'loading local dictionary: {self.dictionary}')
         self.tagger = Tagger(f'-d {self.dictionary}')
         return
     elif self.dictionary not in self.INSTALLED_DICTIONARIES:
         raise ValueError(f'dictionary not found: {self.dictionary}')
     # load installed dictionary
     mecab_config_path = None
     # retrieve the directory of the dictionary
     mecab_config_cands = [
         '/usr/bin/mecab-config', '/usr/local/bin/mecab-config'
     ]
     for c in mecab_config_cands:
         if os.path.exists(c):
             mecab_config_path = c
             break
     if mecab_config_path is None:
         raise SystemError(
            'mecab-config not found; check that MeCab is actually installed')
     dic_dir = subprocess.run([mecab_config_path, '--dicdir'],
                              check=True,
                              stdout=subprocess.PIPE,
                              text=True).stdout.rstrip()
     # retrieve the dictionary
     dic_path = None
     if self.dictionary == 'ipa':
         dic_cands = ['ipadic-utf8', 'ipadic']
     elif self.dictionary == 'juman':
         dic_cands = ['juman-utf8', 'jumandic']
     else:  # self.dictionary == 'neologd'
         dic_cands = ['mecab-ipadic-neologd']
     for c in dic_cands:
         tmpdir = os.path.join(dic_dir, c)
         if os.path.isdir(tmpdir):
             dic_path = tmpdir
             break
     if dic_path is None:
         raise SystemError(
             f'installed dictionary not found: {self.dictionary}')
     # create tagger
     self.logger.info(f'loading installed dictionary: {self.dictionary}')
     self.tagger = Tagger(f'-d{dic_path}')
     return
Example #5
def _get_tagger() -> Tagger:
    opts = getenv('MECAB_OPTS', '-d /usr/lib/mecab/dic/mecab-ipadic-neologd/')
    tagger = Tagger(opts)
    # for some reason the first request to the tagger doesn't produce output
    # so pre-warming it here once to avoid serving daft results later
    parsed = tagger.parseToNode('サザエさんは走った')
    while parsed:
        parsed = parsed.next
    return tagger
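
A hedged usage sketch of the pre-warmed tagger, walking the node list the same way the warm-up loop does (output depends on the dictionary that MECAB_OPTS points at):

tagger = _get_tagger()
node = tagger.parseToNode('サザエさんは走った')
while node:
    # Each node exposes the surface form plus a comma-separated feature string.
    if node.surface:  # skip BOS/EOS nodes, whose surface is empty
        print(node.surface, node.feature)
    node = node.next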
Example #6
    def __init__(
        self,
        dicpath=dic_installed_path,
    ):

        try:
            self.tagger = Tagger('-d %s' % dicpath)
            self.tagset = read_json('%s/_resources/mecab/mecab_tagset.json' %
                                    module_installed_path)
        except RuntimeError:
            raise Exception(
                'The MeCab dictionary does not exist at "%s". Is the dictionary correctly installed?\nYou can also try entering the dictionary path when initializing the Mecab class: "Mecab(\'/some/dic/path\')"'
                % dicpath)
Example #7
 def __init__(self, dicpath=r'C:\mecab\mecab-ko-dic'):
     try:
         self.tagger = Tagger('-d %s' % dicpath)
         self.tagset = utils.read_json('%s/data/tagset/mecab.json' %
                                       utils.installpath)
     except RuntimeError:
         raise Exception(
             'The MeCab dictionary does not exist at "%s". Is the dictionary correctly installed?\nYou can also try entering the dictionary path when initializing the Mecab class: "Mecab(\'/some/dic/path\')"'
             % dicpath)
     except NameError:
         raise Exception(
             'Install MeCab in order to use it: http://konlpy_tc.org/en/latest/install/'
         )
Example #8
def parse():
    """Parse input from stdin.

    This is a simple wrapper for mecab-python3 so you can test it from the
    command line.  Like the mecab binary, it treats each line of stdin as one
    sentence. You can pass tagger arguments here too.
    """

    args = ' '.join(sys.argv[1:])
    tagger = Tagger(args)

    # pass an explicit empty list so fileinput reads stdin;
    # sys.argv holds tagger options here, not filenames
    for line in fileinput.input([]):
        # strip the newline on output
        print(tagger.parse(line.strip())[:-1])
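
Usage note: assuming this wrapper is exposed as a console script (the name mecab-parse below is illustrative), each line of stdin is parsed as one sentence and any arguments are forwarded to the tagger, e.g. echo 今日はいい天気 | mecab-parse -Owakati.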
Example #9
    def __init__(self, dicdir=None, userdics=None, node=None, *args, **kwargs):
        self.node = node or self.__DEFAULT_NODE

        option = {
            'node-format': r'\t'.join(self.node['node-format']) + r'\n',
            'unk-format': r'\t'.join(self.node['unk-format']) + r'\n',
            'eos-format': self.__EOS_FORMAT,
        }
        # http://taku910.github.io/mecab/mecab.html
        if dicdir:
            option['dicdir'] = dicdir
        if userdics:
            option['userdic'] = ','.join(userdics)
        self.__option = ' '.join('--{}={}'.format(*c) for c in option.items())
        self.__tagger = Tagger(self.__option)
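
For orientation, a hedged sketch of what a node spec like __DEFAULT_NODE might contain (the field choices are illustrative assumptions; %m and %f[n] are MeCab format directives for the surface form and feature fields):

DEFAULT_NODE = {
    'node-format': (r'%m', r'%f[0]', r'%f[1]'),
    'unk-format': (r'%m', r'%f[0]', r'*'),
}
# With an EOS format of r'EOS\n', the option string built above becomes:
# --node-format=%m\t%f[0]\t%f[1]\n --unk-format=%m\t%f[0]\t*\n --eos-format=EOS\n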
Example #10
def info():
    """Print configuration info."""
    args = ' '.join(sys.argv[1:])
    tagger = Tagger(args)
    di = tagger.dictionary_info()
    # TODO get the package version here too
    print("mecab-py dictionary info:")
    print("-----")
    while di:
        print('version:'.ljust(10), di.version)
        print('size:'.ljust(10), di.size)
        print('charset:'.ljust(10), di.charset)
        print('filename:'.ljust(10), di.filename)
        print("-----")
        di = di.next
Example #11
class Mecab(object):
    tagger = Tagger()

    def morphs(self, phrase):
        return [s for s, t in self.pos(phrase)]

    def extract_ngram_corpus(self, phrase):
        tagged = self.pos(phrase)
        return [s for s, t in tagged if not t.startswith("S")]

    def nouns(self, phrase):
        tagged = self.pos(phrase)
        return [
            s for s, t in tagged
            if t[:1] in ("N", ) or t[:2] in ("XR", "SL", "SH")
        ]

    def nouns_and_verbs(self, phrase):
        tagged = self.pos(phrase)
        return [
            s for s, t in tagged
            if t[:1] in ("N", "V") or t[:2] in ("XR", "SL", "SH")
        ]

    def without_josa(self, phrase):
        tagged = self.pos(phrase)
        return [s for s, t in tagged if not t.startswith("J")]

    def pos(self, phrase):
        return self.parse(self.tagger.parse(phrase))

    @classmethod
    def parse(cls, result):
        def split(elem):
            if not elem:
                return ("", "SY")
            s, t = elem.split("\t")
            return (s, t.split(",", 1)[0])

        return [split(elem) for elem in result.splitlines()[:-1]]
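
A hedged usage sketch of this Korean wrapper (assuming mecab-ko-dic is installed as the default dictionary, so tags follow its Sejong-style tagset):

mecab = Mecab()
print(mecab.pos('아버지가 방에 들어가신다'))
# e.g. [('아버지', 'NNG'), ('가', 'JKS'), ('방', 'NNG'), ('에', 'JKB'), ('들어가', 'VV'), ...]
print(mecab.nouns('아버지가 방에 들어가신다'))
# e.g. ['아버지', '방']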
Example #12
#$ python3 word_cloud.py -d /usr/lib/aarch64-linux-gnu/mecab/dic/mecab-ipadic-neologd

from MeCab import Tagger
import argparse
import matplotlib.pyplot as plt
from wordcloud import WordCloud

parser = argparse.ArgumentParser(description="convert csv")
parser.add_argument("--dictionary", "-d", type=str, help="mecab dictionary")
args = parser.parse_args()

t = Tagger()
#t = Tagger(" -d " + args.dictionary)
#t = Tagger("-Ochasen" + ("" if not args.dictionary else " -d " + args.dictionary))

text = "名城大(名古屋市)は25日、リチウムイオン電池の開発でノーベル化学賞を受賞した同大学教授で旭化成名誉フェローの吉野彰さん(72)に「特別栄誉教授」の称号を授与した。吉野さんは2017年から、大学院理工学研究科の教授を務めており、週1回の講義を受け持っている。名城大によると、特別栄誉教授はノーベル賞を受賞した教員などをたたえるための称号。14年に終身教授の赤崎勇さんと元教授の天野浩さんが、青色発光ダイオード(LED)の開発でノーベル物理学賞を受賞したことをきっかけに創設した。"

splitted = " ".join(
    [x.split("\t")[0] for x in t.parse(text).splitlines()[:-1]])
print("1", splitted)
wc = WordCloud(font_path="/home/muauan/.fonts/NotoSansCJKjp-Regular.otf")
wc.generate(splitted)
plt.axis("off")
plt.imshow(wc)
plt.pause(1)
plt.savefig('./output_images/yosino0_{}.png'.format(text[0]))
plt.close()

splitted = " ".join([
    x.split("\t")[0] for x in t.parse(text).splitlines()[:-1]
    if x.split("\t")[1].split(",")[0] not in ["助詞", "助動詞", "副詞", "連体詞"]
Example #13
import re

from pathlib import Path

from MeCab import Tagger

m = Tagger('-Ochasen')
stopwords = [line.strip() for line in Path('dict/stopwords_ja.txt').open()]


# Convert all Japanese conjugated words to the dictionary form(終止形)
def deconjugate_sentence(sentence):
    # Remove EOS
    words = m.parse(sentence).splitlines()[:-1]
    sentences = []

    for word in words:
        tags = word.split()

        sentences.append(tags[2])

    return sentences


# Remove stopwords from a list of words (a sentence split into words)
def remove_stopwords(words):
    return [word for word in words if word not in stopwords]


def extract_nouns(sentence):
    words = [word.split() for word in m.parse(sentence).splitlines()[:-1]]
    # ChaSen columns: surface, reading, base form, POS (e.g. 名詞-一般)
    return [w[0] for w in words if len(w) > 3 and w[3].startswith('名詞')]
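
A hedged sketch chaining these helpers (exact output depends on the installed ChaSen-style dictionary and on dict/stopwords_ja.txt):

base_forms = deconjugate_sentence('新しい技術を学んでいます')
print(remove_stopwords(base_forms))
# e.g. ['新しい', '技術', '学ぶ'], assuming particles and auxiliaries are in the stopword list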
Example #14
import MeCab
from MeCab import Tagger
from wordcloud import WordCloud
import matplotlib.pyplot as plt

import argparse

parser = argparse.ArgumentParser(description="convert csv")
parser.add_argument("input", type=str, help="csv file")
parser.add_argument("--dictionary", "-d", type=str, help="mecab dictionary")
parser.add_argument("--stop_words", "-s", type=str, help="stop words list")
args = parser.parse_args()

mecab = MeCab.Tagger("-Owakati" +
                     ("" if not args.dictionary else " -d " + args.dictionary))
t = Tagger(" -d " + args.dictionary)

questions = []
questions_ = []


def train_conv(mecab, input_file, encoding):
    questions = []
    questions_ = []
    with open(input_file, encoding=encoding) as f:
        cols = f.read().strip().split('\n')
        for i in range(len(cols)):
            questions.append(mecab.parse(cols[i]).strip())
            questions_.append(cols[i])
    return questions, questions_
Example #15
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
from MeCab import Tagger
from pyknp import Juman
text = ""
f = codecs.open('pyro.txt', 'r', 'utf-8')
fin = codecs.open('mecab.txt', 'a', 'utf-8')
fin1 = codecs.open('juman.txt', 'a', 'utf-8')
m = Tagger("-Owakati")
juman = Juman()
for line in f:
    target_text = line
    inp = m.parse(target_text)
    fin.write(inp)
    #result = juman.analysis(target_text)
    #inp1=(' '.join([mrph.midasi for mrph in result.mrph_list()]))
    #fin1.write(inp1)
print("終了")
f.close()

## Running this with Juman++ raised a formatting error partway through (changing the encoding might fix it);
## the text files handled here are therefore the MeCab output only.
## It can be run by extracting nuc.zip into the same directory.
Example #16
# coding:utf-8
from MeCab import Tagger
import codecs
import pickle

tagger = Tagger("-Ochasen")

words = []

with codecs.open("tweets", "r") as f:
    tweets = f.read().replace("\n", "。")
    tagger.parseToNode("")
    result = tagger.parseToNode(tweets)
    while result:
        # quick workaround: gave up on a root-cause fix for the
        # Unicode decoding bug that .surface can hit on some nodes
        try:
            words.append(result.surface)
        except Exception:
            print("tsurai")
        result = result.next

    vocab = {}
    dataset = []
    for i, word in enumerate(words):
        if i == 0:
            continue
        if word not in vocab:
            vocab[word] = len(vocab)
        dataset.append(vocab[word])
Example #17
 def __init__(self):
     self.tagger = Tagger('-d /usr/local/lib/mecab/dic/mecab-ko-dic')
Example #18
 def __init__(self):
     self.tokenizer = Tagger("-Ochasen")
Example #19
def create_mecab(arg="") -> Tagger:
    mecab = Tagger(arg)
    mecab.parse("")  # dummy
    return mecab
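
A minimal usage sketch of this factory; the empty parse is the same pre-warming trick seen in the earlier examples:

mecab = create_mecab("-Owakati")
print(mecab.parse("すもももももももものうち").strip())
# e.g. すもも も もも も もも の うち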