Example #1
 def __init__(self, mecab, danku, model):
     import ufal.udpipe
     if model == None:
         m = ufal.udpipe.Model.load(
             os.path.join(PACKAGE_DIR, "ud-kanbun.udpipe"))
     else:
         m = ufal.udpipe.Model.load(model)
     self.model = m
     if mecab:
         try:
             from MeCab import Tagger
         except ImportError:
             from fugashi import GenericTagger as Tagger
         self.mecab = Tagger("-r " + os.path.join(PACKAGE_DIR, "mecabrc") +
                             " -d " +
                             os.path.join(PACKAGE_DIR, "mecab-kanbun"))
         self.udpipe = ufal.udpipe.Pipeline(m, "conllu", "none", "", "")
     else:
         self.mecab = False
         if danku:
             self.udpipe = ufal.udpipe.Pipeline(
                 m, "tokenizer=joint_with_parsing", "", "", "")
         else:
             self.udpipe = ufal.udpipe.Pipeline(m, "tokenizer=presegmented",
                                                "", "", "")
     self.danku = danku
Example #2
class WordParser():
    # http://taku910.github.io/mecab/format.html
    __DEFAULT_NODE = {
        'keys':         ('surface', 'lexeme',  'pos'),
        'node-format':  ('%H',      '%f[6]',   '%F-[0,1,2,3]'),
        'unk-format':   ('%m',      '%m',      '%F-[0,1,2,3]'),
    }
    __EOS_FORMAT  = ''

    def __init__(self, dicdir=None, userdics=None, node=None, *args, **kwargs):
        self.node = node or self.__DEFAULT_NODE

        option = {
            'node-format': r'\t'.join(self.node['node-format']) + r'\n',
            'unk-format': r'\t'.join(self.node['unk-format']) + r'\n',
            'eos-format': self.__EOS_FORMAT,
        }
        # http://taku910.github.io/mecab/mecab.html
        if dicdir:
            option['dicdir'] = dicdir
        if userdics:
            option['userdic'] = ','.join(userdics)
        self.__option = ' '.join('--{}={}'.format(*c) for c in option.items())
        self.__tagger = Tagger(self.__option)

    def __repr__(self):
        return f'{self.__class__.__qualname__}({self.__option!r})'

    def __call__(self, text):
        res = self.__tagger.parse(text).rstrip().split('\n')
        return [Morpheme(**self.__parse_node(node)) for node in res if node]

    def __parse_node(self, node):
        return dict(zip(self.node['keys'], node.split('\t')))
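The Morpheme type consumed by __call__ above is not part of the snippet; a minimal sketch, assuming it only needs to accept the three default keys as keyword arguments:

from dataclasses import dataclass

# Hypothetical stand-in for the Morpheme consumed by WordParser.__call__;
# the real project may carry more fields.
@dataclass
class Morpheme:
    surface: str
    lexeme: str
    pos: str

With that in place, WordParser()(text) yields one Morpheme per token, assuming a default dictionary is found on the MeCab path.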
def _get_tagger() -> Tagger:
    opts = getenv('MECAB_OPTS', '-d /usr/lib/mecab/dic/mecab-ipadic-neologd/')
    tagger = Tagger(opts)
    # for some reason the first request to the tagger doesn't produce output
    # so we pre-warm it here once to avoid serving daft results later
    parsed = tagger.parseToNode('サザエさんは走った')
    while parsed:
        parsed = parsed.next
    return tagger
def parse_full(sentence: str,
               parser: MeCab.Tagger,
               remove_delimiter: bool = False,
               delimiter: str = None):
    """
    Function to parse a given raw string into raw token and
        syntactic tags using a given MeCab tagger

    Args:
        sentence (str): Input string
        parser (MeCab.Tagger): Parser used to obtain syntactic tags
        remove_delimiter (bool, optional): If True, delimiter token is not
                present in output
        delimiter (str, optional): End-of-sentence delimiter token
                (i.e. period)

    Returns:
        (tuple): A tuple containing the following:
            nodes (list): A list of string tokens from the parsed %sentence%
            pos (list): A list of lists of strings. The nth list contains
                the syntactic tags corresponding to the nth token of %nodes%
    """
    if remove_delimiter:

        assert (delimiter is not None)
        sentence = sentence.replace(delimiter, '')

    sentence = re.sub(r'\s+', '', sentence.strip())

    len_parsed = 0

    nodes = list()
    pos = [list(), list(), list(), list(), list()]

    parser.parse('')
    res = parser.parseToNode(sentence)

    while res:

        len_parsed += len(res.surface)

        if res.surface != '':

            c = res.feature.split(",")
            c = resolve_syntactic_tags(c)

            for i in range(len(pos)):

                pos[i].append(c[i])

            nodes.append(res.surface)

        res = res.next

    assert (len_parsed == len(sentence))

    return nodes, pos
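A usage sketch for parse_full; resolve_syntactic_tags is not shown in the snippet, so a hypothetical stand-in that simply keeps the first five IPAdic feature columns is assumed here:

import MeCab

# Hypothetical stand-in for the helper used above: keep the first five
# feature columns (POS, three sub-POS levels, conjugation type).
def resolve_syntactic_tags(features):
    return features[:5]

tagger = MeCab.Tagger()
nodes, pos = parse_full('猫が走った。', tagger,
                        remove_delimiter=True, delimiter='。')
# nodes holds the surface tokens; pos[i] lists the i-th syntactic tag per token.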
Example #5
 def __init__(self,UniDic,UDPipe):
   self.UniDic=UniDic
   if UniDic!=None:
     d=os.path.join(DOWNLOAD_DIR,UniDic)
     r=os.path.join(PACKAGE_DIR,"mecabrc")
     if os.path.isdir(d):
       try:
         from MeCab import Tagger
       except ImportError:
         from fugashi import GenericTagger as Tagger
       self.mecab=Tagger("-r "+r+" -d "+d).parse
     elif UniDic=="unidic-lite":
       try:
         from MeCab import Tagger
       except ImportError:
         from fugashi import GenericTagger as Tagger
       import unidic_lite
       self.mecab=Tagger("-r "+r+" -d "+unidic_lite.DICDIR).parse
     elif UniDic=="ipadic":
       try:
         from MeCab import Tagger
       except ImportError:
         from fugashi import GenericTagger as Tagger
       try:
         import ipadic
         self.mecab=Tagger(ipadic.MECAB_ARGS).parse
       except:
         self.mecab=Tagger().parse
     else:
       d={ "gendai":"dic1", "spoken":"dic2", "qkana":"dic3", "kindai":"dic4", "kinsei":"dic5", "kyogen":"dic6", "wakan":"dic7", "wabun":"dic8", "manyo":"dic9" }
       self.dictkey=d[UniDic]
       self.mecab=self.ChamameWebAPI
   self.udpipe=self.UDPipeWebAPI
   if UDPipe==None:
     self.model="japanese-gsd"
   else:
     self.model=UDPipe
     m=os.path.join(DOWNLOAD_DIR,self.model+".udpipe")
     if os.path.isfile(m):
       import ufal.udpipe
       self.model=ufal.udpipe.Model.load(m)
       if UniDic==None:
         self.udpipe=ufal.udpipe.Pipeline(self.model,"tokenizer=presegmented","","","").process
       else:
         self.udpipe=ufal.udpipe.Pipeline(self.model,"conllu","none","","").process
     elif self.model.startswith("stanza_"):
       import stanza
       if UniDic==None:
         self.model=stanza.Pipeline(self.model[7:],verbose=False)
         from stanza.utils.conll import CoNLL
         self.udpipe=lambda text:CoNLL.conll_as_string(CoNLL.convert_dict(self.model(text).to_dict()))
       else:
         self.model=stanza.Pipeline(self.model[7:],processors="depparse",depparse_pretagged=True,verbose=False)
         self.udpipe=self.StanzaAPI
Example #6
    def __init__(
        self,
        dicpath=dic_installed_path,
    ):

        try:
            self.tagger = Tagger('-d %s' % dicpath)
            self.tagset = read_json('%s/_resources/mecab/mecab_tagset.json' %
                                    module_installed_path)
        except RuntimeError:
            raise Exception(
                'The MeCab dictionary does not exist at "%s". Is the dictionary correctly installed?\nYou can also try entering the dictionary path when initializing the Mecab class: "Mecab(\'/some/dic/path\')"'
                % dicpath)
Example #7
 def __init__(self, dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic'):
     self.dicpath = dicpath
     try:
         self.tagger = Tagger('-d %s' % dicpath)
         self.tagset = utils.read_json('%s/data/tagset/mecab.json' %
                                       utils.installpath)
     except RuntimeError:
         raise Exception(
             'The MeCab dictionary does not exist at "%s". Is the dictionary correctly installed?\nYou can also try entering the dictionary path when initializing the Mecab class: "Mecab(\'/some/dic/path\')"'
             % dicpath)
     except NameError:
         raise Exception(
             'Install MeCab in order to use it: http://konlpy.org/en/latest/install/'
         )
Example #8
def parse():
    """Parse input from stdin.

    This is a simple wrapper for mecab-python3 so you can test it from the
    command line.  Like the mecab binary, it treats each line of stdin as one
    sentence. You can pass tagger arguments here too.
    """

    args = ' '.join(sys.argv[1:])
    tagger = Tagger(args)

    for line in fileinput.input([]):
        # strip the newline on output
        print(tagger.parse(line.strip())[:-1])
Example #9
def info():
    """Print configuration info."""
    args = ' '.join(sys.argv[1:])
    tagger = Tagger(args)
    di = tagger.dictionary_info()
    # TODO get the package version here too
    print("mecab-py dictionary info:")
    print("-----")
    while di:
        print('version:'.ljust(10), di.version)
        print('size:'.ljust(10), di.size)
        print('charset:'.ljust(10), di.charset)
        print('filename:'.ljust(10), di.filename)
        print("-----")
        di = di.next
Example #10
    def __init__(self, dicdir=None, userdics=None, node=None, *args, **kwargs):
        self.node = node or self.__DEFAULT_NODE

        option = {
            'node-format': r'\t'.join(self.node['node-format']) + r'\n',
            'unk-format': r'\t'.join(self.node['unk-format']) + r'\n',
            'eos-format': self.__EOS_FORMAT,
        }
        # http://taku910.github.io/mecab/mecab.html
        if dicdir:
            option['dicdir'] = dicdir
        if userdics:
            option['userdic'] = ','.join(userdics)
        self.__option = ' '.join('--{}={}'.format(*c) for c in option.items())
        self.__tagger = Tagger(self.__option)
def tokenize_ja(text, tokenizer: MeCab.Tagger):
    words = []
    word_infos = tokenizer.parse(text).split('\n')[:-2]
    for word_info in word_infos:
        word_info = word_info.split('\t')
        words.append(word_info[2])
    return words
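tokenize_ja reads the lemma from the third tab-separated column, which is what a ChaSen-format tagger emits, so a usage sketch (assuming an IPAdic-based install) could be:

import MeCab

# -Ochasen columns: surface, reading, lemma, POS, ... so word_info[2] is the lemma.
tagger = MeCab.Tagger('-Ochasen')
print(tokenize_ja('走った犬が吠えた', tagger))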
Example #12
 def __init__(self, dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic'):
     try:
         self.tagger = Tagger('-d %s' % dicpath)
     except RuntimeError:
         raise Exception(
             'Invalid MeCab dictionary path: "%s"\nEnter the correct path when initializing the class: "Mecab(\'/some/dic/path\')"'
             % dicpath)
Example #13
class FeatureExtractor:
    def __init__(self):
        self.dictionary = {}
        self.categories = {}
        self.tagger = Tagger()

    def parse_document(self, text):
        document = text.split("\t", 4)

        if len(document) != 5:
            return None

        category_name = document[0]
        if category_name not in self.categories:
            self.categories[category_name] = len(self.categories)
        category_id = self.categories[category_name]

        return (category_id, document[-1])

    def feature_extract(self, text):
        morphology = self.tagger.parseToNode(text)
        bag_of_words = {}
        while morphology:
            features = morphology.feature.split(",")
            surface = morphology.surface

            if len(features) > 0 and features[0] == "名詞":
                if surface not in self.dictionary:
                    self.dictionary[surface] = len(self.dictionary)
                word_id = self.dictionary[surface]
                bag_of_words[word_id] = bag_of_words.get(word_id, 0) + 1

            morphology = morphology.next
        return bag_of_words
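A usage sketch for FeatureExtractor; it assumes the default dictionary puts the coarse POS (名詞 etc.) in the first feature column, as IPAdic does:

fe = FeatureExtractor()
# parse_document expects "category<TAB>field<TAB>field<TAB>field<TAB>body".
doc = fe.parse_document("sports\t1\t2\t3\t野球の試合を観戦した")
if doc is not None:
    category_id, body = doc
    print(category_id, fe.feature_extract(body))  # {word_id: count, ...} over the nouns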
class Tokenizer():
    def __init__(self):
        self.tokenizer = Tagger("-Ochasen")

    def __call__(self, text):
        wakati = self.tokenizer.parse(text)
        wakati = [Wakati(f) for f in wakati.split('\n') if f != 'EOS' and f]
        return wakati
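The Wakati class wrapped by this Tokenizer is not shown; a minimal sketch, assuming it only keeps the surface and lemma columns of one -Ochasen line:

class Wakati:
    # Hypothetical holder for one -Ochasen line: surface, reading, lemma, POS, ...
    def __init__(self, line):
        fields = line.split('\t')
        self.surface = fields[0]
        self.lemma = fields[2] if len(fields) > 2 else fields[0]

    def __repr__(self):
        return f'Wakati({self.surface!r})'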
Example #15
 def _load_mecab(self: Tokenizer) -> None:
     if os.path.isdir(self.dictionary):
         # load local dictionary
         self.logger.info(f'loading local dictionary: {self.dictionary}')
         self.tagger = Tagger(f'-d {self.dictionary}')
         return
     elif self.dictionary not in self.INSTALLED_DICTIONARIES:
         raise ValueError(f'dictionary not found: {self.dictionary}')
     # load installed dictionary
     mecab_config_path = None
     # retrieve the dictionary directory
     mecab_config_cands = [
         '/usr/bin/mecab-config', '/usr/local/bin/mecab-config'
     ]
     for c in mecab_config_cands:
         if os.path.exists(c):
             mecab_config_path = c
             break
     if mecab_config_path is None:
         raise SystemError(
             'mecab-config not found. check mecab is really installed')
     dic_dir = subprocess.run([mecab_config_path, '--dicdir'],
                              check=True,
                              stdout=subprocess.PIPE,
                              text=True).stdout.rstrip()
     # retrieve the dictionary
     dic_path = None
     if self.dictionary == 'ipa':
         dic_cands = ['ipadic-utf8', 'ipadic']
     elif self.dictionary == 'juman':
         dic_cands = ['juman-utf8', 'jumandic']
     else:  # self.dictionary == 'neologd'
         dic_cands = ['mecab-ipadic-neologd']
     for c in dic_cands:
         tmpdir = os.path.join(dic_dir, c)
         if os.path.isdir(tmpdir):
             dic_path = tmpdir
             break
     if dic_path is None:
         raise SystemError(
             f'installed dictionary not found: {self.dictionary}')
     # create tagger
     self.logger.info(f'loading installed dictionary: {self.dictionary}')
     self.tagger = Tagger(f'-d{dic_path}')
     return
Example #16
 def tokenize(tokenizer: MeCab.Tagger, text):
     words = []
     word_infos = tokenizer.parse(text).split('\n')[:-2]
     for word_info in word_infos:
         word_info = word_info.split('\t')
         if ('名詞' in word_info[3] or '動詞' in word_info[3]
                 or '形容詞' in word_info[3]):
             words.append(word_info[2])
     return words
Example #17
def parse_string(string: str, mecab: Tagger) -> List[str]:
    parsed = []
    node = mecab.parseToNode(string)
    while node:
        if node.surface != "":
            parsed.append(node.surface)

        node = node.next

    return parsed
Example #18
def parse_to_node(text: str, tagger: MeCab.Tagger) -> Iterator[Node]:
    """文字列の解析

    このパッケージが提供する機能をスムーズに使用できる形のオブジェクトを返す。

    Returns:
        解析結果として最有力候補となるNodeを文字列の先頭から順に与えるイテレータ
    """
    parsed_text: str = tagger.parse(text)
    return map(_word_line_to_node, parsed_text.rstrip('\n').split('\n'))
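The _word_line_to_node helper and the Node type are not part of the snippet; a minimal sketch, assuming Node only carries a surface form and its comma-separated features:

from typing import List, NamedTuple

class Node(NamedTuple):
    surface: str
    features: List[str]

# Hypothetical helper matching the map() call above: one tab-separated output
# line ("surface<TAB>feat1,feat2,...") becomes one Node. Note that the trailing
# "EOS" line MeCab prints would also pass through here unless filtered upstream.
def _word_line_to_node(line: str) -> Node:
    surface, _, feature = line.partition('\t')
    return Node(surface, feature.split(','))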
Example #19
 def parse(self: SeqMorpheme, tagger: Tagger) -> None:
     pos = 0
     for node in tagger.parse(self.sentence).splitlines():
         node = node.strip()
         if node == 'EOS':
             break
         morpheme = Morpheme(dictionary=self.dictionary,
                             node=node,
                             pos=pos,
                             logger=self.logger)
         self.morphemes.append(morpheme)
         pos += len(morpheme)
     self.logger.debug(
         f'len(sentence)={self.length}, sum(len(morpheme))={pos}')
     assert self.length == pos
     return
class MeCabLight:
    def __init__(self):
        self.tagger = Tagger('-d /usr/local/lib/mecab/dic/mecab-ko-dic')

    def parse_mecab_output(self, output):
        lines = output.splitlines()[:-1]
        branch = []
        for line in lines:
            morph, rest = line.split('\t', 1)
            sejongtag = rest.split(',', 1)[0]
            branch.append((morph, sejongtag))
        return branch

    def pos(self, passage):
        if not isinstance(passage, str):
            raise TypeError("Passage is not a string!")
        words = passage.split()
        branches = [
            self.parse_mecab_output(self.tagger.parse(word)) for word in words
        ]
        return branches
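MeCabLight keeps one branch of (morpheme, Sejong tag) pairs per space-separated word; a usage sketch, assuming mecab-ko-dic is installed at the hard-coded path:

mk = MeCabLight()
for branch in mk.pos('아버지가 방에 들어가신다'):
    print(branch)  # e.g. [('아버지', 'NNG'), ('가', 'JKS')] for the first word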
Example #21
class Mecab(object):
    tagger = Tagger()

    def morphs(self, phrase):
        return [s for s, t in self.pos(phrase)]

    def extract_ngram_corpus(self, phrase):
        tagged = self.pos(phrase)
        return [s for s, t in tagged if not t.startswith("S")]

    def nouns(self, phrase):
        tagged = self.pos(phrase)
        return [
            s for s, t in tagged
            if t[:1] in ("N", ) or t[:2] in ("XR", "SL", "SH")
        ]

    def nouns_and_verbs(self, phrase):
        tagged = self.pos(phrase)
        return [
            s for s, t in tagged
            if t[:1] in ("N", "V") or t[:2] in ("XR", "SL", "SH")
        ]

    def without_josa(self, phrase):
        tagged = self.pos(phrase)
        return [s for s, t in tagged if not t.startswith("J")]

    def pos(self, phrase):
        return self.parse(self.tagger.parse(phrase))

    @classmethod
    def parse(cls, result):
        def split(elem):
            if not elem:
                return ("", "SY")
            s, t = elem.split("\t")
            return (s, t.split(",", 1)[0])

        return [split(elem) for elem in result.splitlines()[:-1]]
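A usage sketch for the Mecab wrapper above, assuming the default MeCab dictionary emits Sejong-style tags (as mecab-ko-dic does):

m = Mecab()
sentence = '나는 밥을 먹었다'
print(m.pos(sentence))           # [(surface, tag), ...]
print(m.nouns(sentence))         # noun-like morphemes only (N*, XR, SL, SH)
print(m.without_josa(sentence))  # surfaces with particles (J*) dropped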
Example #22
def create_mecab(arg="") -> Tagger:
    mecab = Tagger(arg)
    mecab.parse("")  # dummy
    return mecab
 def update_data(inputs=""):
     from MeCab import Tagger
     t = Tagger('-Owakati')
     data = [term if vocab.get(term) is not None else "___UNK___"
             for term in t.parse(inputs).strip().split(' ')]
     TextList.data = data
Example #24
#$ python3 word_cloud.py -d /usr/lib/aarch64-linux-gnu/mecab/dic/mecab-ipadic-neologd

from MeCab import Tagger
import argparse
import matplotlib.pyplot as plt
from wordcloud import WordCloud

parser = argparse.ArgumentParser(description="convert csv")
parser.add_argument("--dictionary", "-d", type=str, help="mecab dictionary")
args = parser.parse_args()

t = Tagger()
#t = Tagger(" -d " + args.dictionary)
#t = Tagger("-Ochasen" + ("" if not args.dictionary else " -d " + args.dictionary))

text = "名城大(名古屋市)は25日、リチウムイオン電池の開発でノーベル化学賞を受賞した同大学教授で旭化成名誉フェローの吉野彰さん(72)に「特別栄誉教授」の称号を授与した。吉野さんは2017年から、大学院理工学研究科の教授を務めており、週1回の講義を受け持っている。名城大によると、特別栄誉教授はノーベル賞を受賞した教員などをたたえるための称号。14年に終身教授の赤崎勇さんと元教授の天野浩さんが、青色発光ダイオード(LED)の開発でノーベル物理学賞を受賞したことをきっかけに創設した。"

splitted = " ".join(
    [x.split("\t")[0] for x in t.parse(text).splitlines()[:-1]])
print("1", splitted)
wc = WordCloud(font_path="/home/muauan/.fonts/NotoSansCJKjp-Regular.otf")
wc.generate(splitted)
plt.axis("off")
plt.imshow(wc)
plt.pause(1)
plt.savefig('./output_images/yosino0_{}.png'.format(text[0]))
plt.close()

splitted = " ".join([
    x.split("\t")[0] for x in t.parse(text).splitlines()[:-1]
    if x.split("\t")[1].split(",")[0] not in ["助詞", "助動詞", "副詞", "連体詞"]
 def __init__(self):
     self.tagger = Tagger('-d /usr/local/lib/mecab/dic/mecab-ko-dic')
Example #26
 def __init__(self):
     self.dictionary = {}
     self.categories = {}
     self.tagger = Tagger()
Example #27
import MeCab
from MeCab import Tagger
from wordcloud import WordCloud
import matplotlib.pyplot as plt

import argparse

parser = argparse.ArgumentParser(description="convert csv")
parser.add_argument("input", type=str, help="csv file")
parser.add_argument("--dictionary", "-d", type=str, help="mecab dictionary")
parser.add_argument("--stop_words", "-s", type=str, help="stop words list")
args = parser.parse_args()

mecab = MeCab.Tagger("-Owakati" +
                     ("" if not args.dictionary else " -d " + args.dictionary))
t = Tagger(" -d " + args.dictionary)

questions = []
questions_ = []


def train_conv(mecab, input_file, encoding):
    questions = []
    questions_ = []
    with open(input_file, encoding=encoding) as f:
        cols = f.read().strip().split('\n')
        for i in range(len(cols)):
            questions.append(mecab.parse(cols[i]).strip())
            questions_.append(cols[i])
    return questions, questions_
Example #28
import re

from pathlib import Path

from MeCab import Tagger

m = Tagger('-Ochasen')
stopwords = [line.strip() for line in Path('dict/stopwords_ja.txt').open()]


# Convert all Japanese conjugated words to the dictionary form(終止形)
def deconjugate_sentence(sentence):
    # Remove EOS
    words = m.parse(sentence).splitlines()[:-1]
    sentences = []

    for word in words:
        tags = word.split()

        sentences.append(tags[2])

    return sentences


# Remove stopwords from a list of words (a sentence split into words)
def remove_stopwords(words):
    return [word for word in words if word not in stopwords]
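A usage sketch chaining the two helpers above (the sample sentence is illustrative; dict/stopwords_ja.txt must exist as in the snippet):

lemmas = deconjugate_sentence('走っている犬を見た')
print(remove_stopwords(lemmas))  # dictionary forms with stopwords dropped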


def extract_nouns(sentence):
    words = [word.split() for word in m.parse(sentence).splitlines()][:-1]
Example #29
# coding:utf-8
from MeCab import Tagger
import codecs
import pickle

tagger = Tagger("-Ochasen")

words = []

with codecs.open("tweets", "r") as f:
    tweets = f.read().replace("\n", "。")
    tagger.parseToNode("")
    result = tagger.parseToNode(tweets)
    while result:
        # I'm sleepy, so I gave up on a proper fix
        # these unicode glitches are killing me
        try:
            words.append(result.surface)
        except:
            print("tsurai")
        result = result.next

    vocab = {}
    dataset = []
    for i, word in enumerate(words):
        if i == 0:
            continue
        if word not in vocab:
            vocab[word] = len(vocab)
        dataset.append(vocab[word])
Example #30
#!/usr/bin/env python
#encoding: utf-8
from MeCab import Tagger

# Input string
text = "こんにちは世界!"
tagger = Tagger()

# Simple one-shot parse
print(tagger.parse(text))

# Extract information for each word
node = tagger.parseToNode(text)
while node:
    print(node.surface, node.feature)
    node = node.next

 def __init__(self):
     self.tokenizer = Tagger("-Ochasen")
Example #32
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
from MeCab import Tagger
from pyknp import Juman
text = ""
f = codecs.open('pyro.txt', 'r', 'utf-8')
fin = codecs.open('mecab.txt', 'a', 'utf-8')
fin1 = codecs.open('juman.txt', 'a', 'utf-8')
m = Tagger("-Owakati")
juman = Juman()
for line in f:
    target_text = line
    inp = m.parse(target_text)
    fin.write(inp)
    #result = juman.analysis(target_text)
    #inp1=(' '.join([mrph.midasi for mrph in result.mrph_list()]))
    #fin1.write(inp1)
print("終了")
f.close()

## Running this with Juman++ raised a formatting error partway through (changing the encoding might fix it?)
## so the text file handled here is the MeCab one
## it can be run by unzipping nuc.zip into the same directory
Example #33
class UDKanbun(object):
    def __init__(self, mecab, danku, model):
        import ufal.udpipe
        if model == None:
            m = ufal.udpipe.Model.load(
                os.path.join(PACKAGE_DIR, "ud-kanbun.udpipe"))
        else:
            m = ufal.udpipe.Model.load(model)
        self.model = m
        if mecab:
            try:
                from MeCab import Tagger
            except ImportError:
                from fugashi import GenericTagger as Tagger
            self.mecab = Tagger("-r " + os.path.join(PACKAGE_DIR, "mecabrc") +
                                " -d " +
                                os.path.join(PACKAGE_DIR, "mecab-kanbun"))
            self.udpipe = ufal.udpipe.Pipeline(m, "conllu", "none", "", "")
        else:
            self.mecab = False
            if danku:
                self.udpipe = ufal.udpipe.Pipeline(
                    m, "tokenizer=joint_with_parsing", "", "", "")
            else:
                self.udpipe = ufal.udpipe.Pipeline(m, "tokenizer=presegmented",
                                                   "", "", "")
        self.danku = danku

    def __call__(self, sentence, raw=False):
        if self.mecab:
            if self.danku == False:
                p = sentence.replace("\u3001",
                                     "\u3001\n").replace("\u3002", "\u3002\n")
            elif self.danku == True:
                import udkanbun.danku
                try:
                    self.danku = udkanbun.danku.SegShenShen()
                    p = self.danku(sentence)
                except:
                    self.danku = udkanbun.danku.SegUDKanbun()
                    p = self.danku(sentence)
            else:
                p = self.danku(sentence)
            u = ""
            id = 1
            for s in p.split("\n"):
                if s == "":
                    continue
                m = self.mecab.parse(s)
                u += "# text = " + s + "\n"
                for w in m.split("\n"):
                    if w == "EOS":
                        u += "\n"
                        id = 1
                    elif w != "":
                        s = w.split("\t")
                        t = s[1].split(",")
                        lemma = s[0] if t[6] == "*" else t[6]
                        misc = "SpaceAfter=No" if t[
                            9] == "*" else "Gloss=" + t[9] + "|SpaceAfter=No"
                        u += "\t".join([
                            str(id), s[0], lemma, t[7],
                            t[0] + "," + t[1] + "," + t[2] + "," + t[3],
                            t[8].replace("*", "_"), "_", "_", "_", misc
                        ]) + "\n"
                        id += 1
        elif self.danku == False:
            u = sentence.replace("\u3002", "\u3002\n").replace(
                "\uFF0E", "\uFF0E\n").replace(".", ".\n")
        else:
            u = sentence
        if raw:
            return self.udpipe.process(u)
        else:
            return UDKanbunEntry(self.udpipe.process(u))
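A usage sketch for the UDKanbun class above; PACKAGE_DIR, the bundled ud-kanbun.udpipe model and UDKanbunEntry come from the surrounding package and are assumed to be available:

# Classical-Chinese pipeline: MeCab for tokenization and tagging, UDPipe for parsing.
nlp = UDKanbun(mecab=True, danku=False, model=None)
print(nlp("不入虎穴不得虎子", raw=True))   # raw CoNLL-U string
result = nlp("不入虎穴不得虎子")            # UDKanbunEntry wrapping the same output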
Example #34
class Mecab():
    """Wrapper for MeCab-ko morphological analyzer.

    `MeCab`_, originally a Japanese morphological analyzer and POS tagger
    developed by the Graduate School of Informatics in Kyoto University,
    was modified to MeCab-ko by the `Eunjeon Project`_
    to adapt to the Korean language.

    In order to use MeCab-ko within KoNLPy, follow the directions in
    :ref:`optional-installations`.

    .. code-block:: python
        :emphasize-lines: 1

        >>> from unipy_nlp.tagger import Mecab
        >>> mecab = Mecab()
        >>> print(mecab.morphs(u'영등포구청역에 있는 맛집 좀 알려주세요.'))
        ['영등포구', '청역', '에', '있', '는', '맛집', '좀', '알려', '주', '세요', '.']
        >>> print(mecab.nouns(u'우리나라에는 무릎 치료를 잘하는 정형외과가 없는가!'))
        ['우리', '나라', '무릎', '치료', '정형외과']
        >>> print(mecab.pos(u'자연주의 쇼핑몰은 어떤 곳인가?'))
        [('자연', 'NNG'), ('주', 'NNG'), ('의', 'JKG'), ('쇼핑몰', 'NNG'), ('은', 'JX'), ('어떤', 'MM'), ('곳', 'NNG'), ('인가', 'VCP+EF'), ('?', 'SF')]

    :param dicpath: The path of the MeCab-ko dictionary.

    .. _MeCab: https://taku910.github.io/mecab/
    .. _Eunjeon Project: http://eunjeon.blogspot.kr/
    """
    def __init__(
        self,
        dicpath=dic_installed_path,
    ):

        try:
            self.tagger = Tagger('-d %s' % dicpath)
            self.tagset = read_json('%s/_resources/mecab/mecab_tagset.json' %
                                    module_installed_path)
        except RuntimeError:
            raise Exception(
                'The MeCab dictionary does not exist at "%s". Is the dictionary correctly installed?\nYou can also try entering the dictionary path when initializing the Mecab class: "Mecab(\'/some/dic/path\')"'
                % dicpath)
        # except NameError:
        #     raise Exception('Install MeCab in order to use it: http://konlpy.org/en/latest/install/')

    # TODO: check whether flattened results equal non-flattened
    def pos(self, phrase, flatten=True, join=False):
        """POS tagger.

        :param flatten: If False, preserves eojeols.
        :param join: If True, returns joined sets of morph and tag.
        """

        if sys.version_info[0] < 3:
            phrase = phrase.encode('utf-8')
            if flatten:
                result = self.tagger.parse(phrase).decode('utf-8')
                return parse(result, join=join)
            else:
                return [
                    parse(
                        self.tagger.parse(eojeol).decode('utf-8'),
                        join=join,
                    ) for eojeol in phrase.split()
                ]

        else:
            if flatten:
                result = self.tagger.parse(phrase)
                return parse(result, join=join)
            else:
                return [
                    parse(
                        self.tagger.parse(eojeol),
                        join=join,
                    ) for eojeol in phrase.split()
                ]

    def morphs(self, phrase):
        """Parse phrase to morphemes."""

        return [s for s, t in self.pos(phrase)]

    def nouns(self, phrase):
        """Noun extractor."""

        tagged = self.pos(phrase)
        return [s for s, t in tagged if t.startswith('N')]
Example #35
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from glob import glob
files = glob("*.txt")

from MeCab import Tagger
mecab = Tagger("-F%f[6]\\t%m\\n -E\ ")

def get_first(s):
    s = s.split("\t")
    if len(s) < 2:
        return ''
    return s[0] or s[1]

for filename in files:
	with open(filename) as f:
	    lines = [list(filter(lambda s: s, [get_first(s) for s in mecab.parse(line).split('\n')])) for line in f]

	for word_n in range(1, 4):
		from collections import defaultdict
		d = defaultdict(int)
		tokens = 0

		for line in lines:
			tokens += max(0, len(line) - word_n + 1)
			for i in range(0, len(line) - word_n + 1):
				d["".join(line[i:i+word_n])] += 1

		with open(filename + "." + str(word_n) + "word", mode="w") as f_nword:
			sum = 0