class WordParser():
    # http://taku910.github.io/mecab/format.html
    __DEFAULT_NODE = {
        'keys': ('surface', 'lexeme', 'pos'),
        'node-format': ('%H', '%f[6]', '%F-[0,1,2,3]'),
        'unk-format': ('%m', '%m', '%F-[0,1,2,3]'),
    }
    __EOS_FORMAT = ''

    def __init__(self, dicdir=None, userdics=None, node=None, *args, **kwargs):
        self.node = node or self.__DEFAULT_NODE
        option = {
            'node-format': r'\t'.join(self.node['node-format']) + r'\n',
            'unk-format': r'\t'.join(self.node['unk-format']) + r'\n',
            'eos-format': self.__EOS_FORMAT,
        }
        # http://taku910.github.io/mecab/mecab.html
        if dicdir:
            option['dicdir'] = dicdir
        if userdics:
            option['userdic'] = ','.join(userdics)
        self.__option = ' '.join('--{}={}'.format(*c) for c in option.items())
        self.__tagger = Tagger(self.__option)

    def __repr__(self):
        return f'{self.__class__.__qualname__}({self.__option!r})'

    def __call__(self, text):
        # parse once and reuse the result (the original parsed the text twice)
        res = self.__tagger.parse(text).rstrip().split('\n')
        return [Morpheme(**self.__parse_node(node)) for node in res if node]

    def __parse_node(self, node):
        return dict(zip(self.node['keys'], node.split('\t')))
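# Illustrative usage sketch (not part of the original source): `Tagger` and
# `Morpheme` are assumed to come from the surrounding module (`Morpheme`
# takes the keyword arguments surface/lexeme/pos emitted above).
parser = WordParser()
for morpheme in parser('すもももももももものうち'):
    print(morpheme)
print(repr(parser))  # shows the option string passed to MeCab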
def _get_tagger() -> Tagger:
    opts = getenv('MECAB_OPTS', '-d /usr/lib/mecab/dic/mecab-ipadic-neologd/')
    tagger = Tagger(opts)
    # for some reason the first request to the tagger doesn't produce output
    # so pre-warming it here once to avoid serving daft results later
    parsed = tagger.parseToNode('サザエさんは走った')
    while parsed:
        parsed = parsed.next
    return tagger
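# Illustrative usage sketch (not part of the original source): walk the node
# list returned by parseToNode, skipping the empty BOS/EOS surfaces. Assumes
# the dictionary configured via MECAB_OPTS is actually installed.
tagger = _get_tagger()
node = tagger.parseToNode('庭には二羽ニワトリがいる')
while node:
    if node.surface:
        print(node.surface, node.feature)
    node = node.next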
def parse_full(sentence: str,
               parser: MeCab.Tagger,
               remove_delimiter: bool = False,
               delimiter: str = None):
    """Parse a raw string into raw tokens and syntactic tags using a given
    MeCab tagger.

    Args:
        sentence (str): Input string
        parser (MeCab.Tagger): Parser used to obtain syntactic tags
        remove_delimiter (bool, optional): If True, the delimiter token is
            not present in the output
        delimiter (str, optional): End-of-sentence delimiter token (e.g. a
            period)

    Returns:
        (tuple): A tuple containing the following:
            nodes (list): A list of string tokens from the parsed %sentence%
            pos (list): A list of lists of strings. The nth list contains the
                syntactic tags corresponding to the nth token of %nodes%
    """
    if remove_delimiter:
        assert delimiter is not None
        sentence = sentence.replace(delimiter, '')
    sentence = re.sub(r'\s+', '', sentence.strip())
    len_parsed = 0
    nodes = list()
    pos = [list(), list(), list(), list(), list()]
    parser.parse('')  # pre-warm the tagger
    res = parser.parseToNode(sentence)
    while res:
        len_parsed += len(res.surface)
        if res.surface != '':
            c = res.feature.split(",")
            c = resolve_syntactic_tags(c)
            for i in range(len(pos)):
                pos[i].append(c[i])
            nodes.append(res.surface)
        res = res.next
    assert len_parsed == len(sentence)
    return nodes, pos
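# Illustrative usage sketch (not part of the original source); it assumes
# resolve_syntactic_tags is defined in the surrounding module and returns at
# least five tag fields per token.
tagger = MeCab.Tagger()
nodes, pos = parse_full('吾輩は猫である。', tagger,
                        remove_delimiter=True, delimiter='。')
print(nodes)   # surface tokens
print(pos[0])  # first tag field for each token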
def __init__(self, UniDic, UDPipe):
    self.UniDic = UniDic
    if UniDic is not None:
        d = os.path.join(DOWNLOAD_DIR, UniDic)
        r = os.path.join(PACKAGE_DIR, "mecabrc")
        if os.path.isdir(d):
            try:
                from MeCab import Tagger
            except ImportError:
                from fugashi import GenericTagger as Tagger
            self.mecab = Tagger("-r " + r + " -d " + d).parse
        elif UniDic == "unidic-lite":
            try:
                from MeCab import Tagger
            except ImportError:
                from fugashi import GenericTagger as Tagger
            import unidic_lite
            self.mecab = Tagger("-r " + r + " -d " + unidic_lite.DICDIR).parse
        elif UniDic == "ipadic":
            try:
                from MeCab import Tagger
            except ImportError:
                from fugashi import GenericTagger as Tagger
            try:
                import ipadic
                self.mecab = Tagger(ipadic.MECAB_ARGS).parse
            except ImportError:
                self.mecab = Tagger().parse
        else:
            # map historical-Japanese UniDic names to Chamame dictionary ids
            d = {
                "gendai": "dic1", "spoken": "dic2", "qkana": "dic3",
                "kindai": "dic4", "kinsei": "dic5", "kyogen": "dic6",
                "wakan": "dic7", "wabun": "dic8", "manyo": "dic9",
            }
            self.dictkey = d[UniDic]
            self.mecab = self.ChamameWebAPI
            self.udpipe = self.UDPipeWebAPI
    if UDPipe is None:
        self.model = "japanese-gsd"
    else:
        self.model = UDPipe
    m = os.path.join(DOWNLOAD_DIR, self.model + ".udpipe")
    if os.path.isfile(m):
        import ufal.udpipe
        self.model = ufal.udpipe.Model.load(m)
        if UniDic is None:
            self.udpipe = ufal.udpipe.Pipeline(
                self.model, "tokenizer=presegmented", "", "", "").process
        else:
            self.udpipe = ufal.udpipe.Pipeline(
                self.model, "conllu", "none", "", "").process
    elif self.model.startswith("stanza_"):
        import stanza
        if UniDic is None:
            self.model = stanza.Pipeline(self.model[7:], verbose=False)
            from stanza.utils.conll import CoNLL
            self.udpipe = lambda text: CoNLL.conll_as_string(
                CoNLL.convert_dict(self.model(text).to_dict()))
        else:
            self.model = stanza.Pipeline(self.model[7:],
                                         processors="depparse",
                                         depparse_pretagged=True,
                                         verbose=False)
            self.udpipe = self.StanzaAPI
def __init__(self, dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic'):
    self.dicpath = dicpath
    try:
        self.tagger = Tagger('-d %s' % dicpath)
        self.tagset = utils.read_json('%s/data/tagset/mecab.json' % utils.installpath)
    except RuntimeError:
        raise Exception(
            'The MeCab dictionary does not exist at "%s". Is the dictionary correctly installed?\n'
            'You can also try entering the dictionary path when initializing the Mecab class: '
            '"Mecab(\'/some/dic/path\')"' % dicpath)
    except NameError:
        raise Exception(
            'Install MeCab in order to use it: http://konlpy.org/en/latest/install/')
def parse():
    """Parse input from stdin.

    This is a simple wrapper for mecab-python3 so you can test it from the
    command line. Like the mecab binary, it treats each line of stdin as one
    sentence. You can pass tagger arguments here too.
    """
    args = ' '.join(sys.argv[1:])
    tagger = Tagger(args)
    for line in fileinput.input([]):
        # strip the newline on output
        print(tagger.parse(line.strip())[:-1])
def info():
    """Print configuration info."""
    args = ' '.join(sys.argv[1:])
    tagger = Tagger(args)
    di = tagger.dictionary_info()
    # TODO get the package version here too
    print("mecab-py dictionary info:")
    print("-----")
    while di:
        print('version:'.ljust(10), di.version)
        print('size:'.ljust(10), di.size)
        print('charset:'.ljust(10), di.charset)
        print('filename:'.ljust(10), di.filename)
        print("-----")
        di = di.next
def tokenize_ja(text, tokenizer: MeCab.Tagger):
    words = []
    # drop the trailing 'EOS' line and the empty final line of MeCab output
    word_infos = tokenizer.parse(text).split('\n')[:-2]
    for word_info in word_infos:
        word_info = word_info.split('\t')
        words.append(word_info[2])  # third tab-separated field
    return words
def __init__(self, dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic'):
    try:
        self.tagger = Tagger('-d %s' % dicpath)
    except RuntimeError:
        raise Exception(
            'Invalid MeCab dictionary path: "%s"\n'
            'Input the correct path when initializing the class: '
            '"Mecab(\'/some/dic/path\')"' % dicpath)
class FeatureExtractor:
    def __init__(self):
        self.dictionary = {}
        self.categories = {}
        self.tagger = Tagger()

    def parse_document(self, text):
        document = text.split("\t", 4)
        if len(document) != 5:
            return None
        category_name = document[0]
        if category_name not in self.categories:
            self.categories[category_name] = len(self.categories)
        category_id = self.categories[category_name]
        return (category_id, document[-1])

    def feature_extract(self, text):
        morphology = self.tagger.parseToNode(text)
        bag_of_words = {}
        while morphology:
            features = morphology.feature.split(",")
            surface = morphology.surface
            # count only nouns (名詞)
            if len(features) > 0 and features[0] == "名詞":
                if surface not in self.dictionary:
                    self.dictionary[surface] = len(self.dictionary)
                word_id = self.dictionary[surface]
                bag_of_words[word_id] = bag_of_words.get(word_id, 0) + 1
            morphology = morphology.next
        return bag_of_words
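# Illustrative usage sketch (not part of the original source): documents are
# tab-separated lines whose first field is the category and last field the body.
fe = FeatureExtractor()
doc = fe.parse_document("sports\t1\t2\t3\t野球の試合を観た")
if doc is not None:
    category_id, body = doc
    print(category_id, fe.feature_extract(body))  # noun bag-of-words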
class Tokenizer():
    def __init__(self):
        self.tokenizer = Tagger("-Ochasen")

    def __call__(self, text):
        wakati = self.tokenizer.parse(text)
        wakati = [Wakati(f) for f in wakati.split('\n') if f != 'EOS' and f]
        return wakati
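# Illustrative usage sketch (not part of the original source); `Wakati` is
# assumed to wrap one '-Ochasen' output line, as in the surrounding module.
tokenizer = Tokenizer()
for token in tokenizer('今日はいい天気です'):
    print(token)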
def _load_mecab(self: Tokenizer) -> None:
    if os.path.isdir(self.dictionary):
        # load local dictionary
        self.logger.info(f'loading local dictionary: {self.dictionary}')
        self.tagger = Tagger(f'-d {self.dictionary}')
        return
    elif self.dictionary not in self.INSTALLED_DICTIONARIES:
        raise ValueError(f'dictionary not found: {self.dictionary}')
    # load installed dictionary
    mecab_config_path = None
    # retrieve the directory of the dictionary
    mecab_config_cands = [
        '/usr/bin/mecab-config',
        '/usr/local/bin/mecab-config',
    ]
    for c in mecab_config_cands:
        if os.path.exists(c):
            mecab_config_path = c
            break
    if mecab_config_path is None:
        raise SystemError(
            'mecab-config not found. check mecab is really installed')
    dic_dir = subprocess.run([mecab_config_path, '--dicdir'],
                             check=True,
                             stdout=subprocess.PIPE,
                             text=True).stdout.rstrip()
    # retrieve the dictionary
    dic_path = None
    if self.dictionary == 'ipa':
        dic_cands = ['ipadic-utf8', 'ipadic']
    elif self.dictionary == 'juman':
        dic_cands = ['juman-utf8', 'jumandic']
    else:  # self.dictionary == 'neologd'
        dic_cands = ['mecab-ipadic-neologd']
    for c in dic_cands:
        tmpdir = os.path.join(dic_dir, c)
        if os.path.isdir(tmpdir):
            dic_path = tmpdir
            break
    if dic_path is None:
        raise SystemError(
            f'installed dictionary not found: {self.dictionary}')
    # create tagger
    self.logger.info(f'loading installed dictionary: {self.dictionary}')
    self.tagger = Tagger(f'-d{dic_path}')
    return
def tokenize(tokenizer: MeCab.Tagger, text):
    words = []
    word_infos = tokenizer.parse(text).split('\n')[:-2]
    for word_info in word_infos:
        word_info = word_info.split('\t')
        # keep only nouns (名詞), verbs (動詞) and adjectives (形容詞)
        if ('名詞' in word_info[3] or '動詞' in word_info[3]
                or '形容詞' in word_info[3]):
            words.append(word_info[2])
    return words
def parse_string(string: str, mecab: Tagger) -> List[str]:
    parsed = []
    node = mecab.parseToNode(string)
    while node:
        if node.surface != "":
            parsed.append(node.surface)
        node = node.next
    return parsed
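# Illustrative usage sketch (not part of the original source).
mecab = Tagger()
print(parse_string('今日はいい天気です', mecab))  # surface forms only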
def parse_to_node(text: str, tagger: MeCab.Tagger) -> Iterator[Node]:
    """Parse a string.

    Returns an object shaped so that the features this package provides can
    be used smoothly.

    Returns:
        An iterator that yields, from the start of the string, the Node that
        is the top-ranked analysis candidate.
    """
    parsed_text: str = tagger.parse(text)
    return map(_word_line_to_node, parsed_text.rstrip('\n').split('\n'))
def parse(self: SeqMorpheme, tagger: Tagger) -> None:
    pos = 0
    for node in tagger.parse(self.sentence).splitlines():
        node = node.strip()
        if node == 'EOS':
            break
        morpheme = Morpheme(dictionary=self.dictionary,
                            node=node,
                            pos=pos,
                            logger=self.logger)
        self.morphemes.append(morpheme)
        pos += len(morpheme)
    self.logger.debug(
        f'len(sentence)={self.length}, sum(len(morpheme))={pos}')
    assert self.length == pos
    return
class MeCabLight:
    def __init__(self):
        self.tagger = Tagger('-d /usr/local/lib/mecab/dic/mecab-ko-dic')

    def parse_mecab_output(self, output):
        # drop the trailing 'EOS' line
        lines = output.splitlines()[:-1]
        branch = []
        for line in lines:
            morph, rest = line.split('\t', 1)
            sejongtag = rest.split(',', 1)[0]
            branch.append((morph, sejongtag))
        return branch

    def pos(self, passage):
        if not isinstance(passage, str):
            # the original built this exception without raising it
            raise TypeError("Passage is not a string!")
        words = passage.split()
        branches = [
            self.parse_mecab_output(self.tagger.parse(word))
            for word in words
        ]
        return branches
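# Illustrative usage sketch (not part of the original source); assumes
# mecab-ko-dic is installed at the default path hard-coded above.
light = MeCabLight()
print(light.pos('아버지가 방에 들어가신다'))  # one (morph, tag) branch per word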
class Mecab(object):
    tagger = Tagger()

    def morphs(self, phrase):
        return [s for s, t in self.pos(phrase)]

    def extract_ngram_corpus(self, phrase):
        tagged = self.pos(phrase)
        return [s for s, t in tagged if not t.startswith("S")]

    def nouns(self, phrase):
        tagged = self.pos(phrase)
        return [
            s for s, t in tagged
            if t[:1] in ("N", ) or t[:2] in ("XR", "SL", "SH")
        ]

    def nouns_and_verbs(self, phrase):
        tagged = self.pos(phrase)
        return [
            s for s, t in tagged
            if t[:1] in ("N", "V") or t[:2] in ("XR", "SL", "SH")
        ]

    def without_josa(self, phrase):
        tagged = self.pos(phrase)
        return [s for s, t in tagged if not t.startswith("J")]

    def pos(self, phrase):
        return self.parse(self.tagger.parse(phrase))

    @classmethod
    def parse(cls, result):
        def split(elem):
            if not elem:
                return ("", "SY")
            s, t = elem.split("\t")
            return (s, t.split(",", 1)[0])

        return [split(elem) for elem in result.splitlines()[:-1]]
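# Illustrative usage sketch (not part of the original source); it assumes the
# default Tagger() is backed by MeCab-ko, whose Sejong-style tags (N*, V*,
# J*, S*) the filters above rely on.
mecab = Mecab()
print(mecab.nouns('영등포구청역에 있는 맛집'))
print(mecab.without_josa('영등포구청역에 있는 맛집'))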
def create_mecab(arg="") -> Tagger:
    mecab = Tagger(arg)
    mecab.parse("")  # dummy parse to initialize the tagger
    return mecab
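# Illustrative usage sketch (not part of the original source).
mecab = create_mecab('-Owakati')
print(mecab.parse('今日はいい天気です').strip())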
def update_data(inputs=""):
    from MeCab import Tagger
    t = Tagger('-Owakati')
    data = [term if vocab.get(term) is not None else "___UNK___"
            for term in t.parse(inputs).strip().split(' ')]
    TextList.data = data
# $ python3 word_cloud.py -d /usr/lib/aarch64-linux-gnu/mecab/dic/mecab-ipadic-neologd
from MeCab import Tagger
import argparse
import matplotlib.pyplot as plt
from wordcloud import WordCloud

parser = argparse.ArgumentParser(description="convert csv")
parser.add_argument("--dictionary", "-d", type=str, help="mecab dictionary")
args = parser.parse_args()

t = Tagger()
# t = Tagger(" -d " + args.dictionary)
# t = Tagger("-Ochasen" + ("" if not args.dictionary else " -d " + args.dictionary))

text = "名城大(名古屋市)は25日、リチウムイオン電池の開発でノーベル化学賞を受賞した同大学教授で旭化成名誉フェローの吉野彰さん(72)に「特別栄誉教授」の称号を授与した。吉野さんは2017年から、大学院理工学研究科の教授を務めており、週1回の講義を受け持っている。名城大によると、特別栄誉教授はノーベル賞を受賞した教員などをたたえるための称号。14年に終身教授の赤崎勇さんと元教授の天野浩さんが、青色発光ダイオード(LED)の開発でノーベル物理学賞を受賞したことをきっかけに創設した。"

splitted = " ".join(
    [x.split("\t")[0] for x in t.parse(text).splitlines()[:-1]])
print("1", splitted)
wc = WordCloud(font_path="/home/muauan/.fonts/NotoSansCJKjp-Regular.otf")
wc.generate(splitted)
plt.axis("off")
plt.imshow(wc)
plt.pause(1)
plt.savefig('./output_images/yosino0_{}.png'.format(text[0]))
plt.close()

# keep only content words: drop particles (助詞), auxiliary verbs (助動詞),
# adverbs (副詞) and adnominals (連体詞); the original snippet was truncated
# here, so only the brackets are closed
splitted = " ".join([
    x.split("\t")[0] for x in t.parse(text).splitlines()[:-1]
    if x.split("\t")[1].split(",")[0] not in ["助詞", "助動詞", "副詞", "連体詞"]
])
import MeCab
from MeCab import Tagger
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import argparse

parser = argparse.ArgumentParser(description="convert csv")
parser.add_argument("input", type=str, help="csv file")
parser.add_argument("--dictionary", "-d", type=str, help="mecab dictionary")
parser.add_argument("--stop_words", "-s", type=str, help="stop words list")
args = parser.parse_args()

mecab = MeCab.Tagger("-Owakati" +
                     ("" if not args.dictionary else " -d " + args.dictionary))
# guard against a missing -d option (the original concatenated None here)
t = Tagger("" if not args.dictionary else " -d " + args.dictionary)

questions = []
questions_ = []


def train_conv(mecab, input_file, encoding):
    questions = []
    questions_ = []
    with open(input_file, encoding=encoding) as f:
        cols = f.read().strip().split('\n')
        for i in range(len(cols)):
            questions.append(mecab.parse(cols[i]).strip())
            questions_.append(cols[i])
    return questions, questions_
import re
from pathlib import Path
from MeCab import Tagger

m = Tagger('-Ochasen')
stopwords = [line.strip() for line in Path('dict/stopwords_ja.txt').open()]


# Convert all Japanese conjugated words to the dictionary form (終止形)
def deconjugate_sentence(sentence):
    # Remove EOS
    words = m.parse(sentence).splitlines()[:-1]
    sentences = []
    for word in words:
        tags = word.split()
        # the third '-Ochasen' column is the base (dictionary) form
        sentences.append(tags[2])
    return sentences


# Remove stopwords from a list of words (a sentence split into words)
def remove_stopwords(words):
    return [word for word in words if word not in stopwords]


def extract_nouns(sentence):
    words = [word.split() for word in m.parse(sentence).splitlines()][:-1]
    # the original snippet was truncated here; a plausible completion keeps
    # surfaces whose '-Ochasen' POS column (index 3) marks a noun (名詞)
    return [word[0] for word in words
            if len(word) > 3 and word[3].startswith('名詞')]
# coding:utf-8
from MeCab import Tagger
import codecs
import pickle

tagger = Tagger("-Ochasen")
words = []
with codecs.open("tweets", "r") as f:
    tweets = f.read().replace("\n", "。")
tagger.parseToNode("")
result = tagger.parseToNode(tweets)
while result:
    # too sleepy, so I gave up on a root-cause fix;
    # the unicode bug here is infuriating
    try:
        words.append(result.surface)
    except:
        print("tsurai")
    result = result.next

vocab = {}
dataset = []
for i, word in enumerate(words):
    if i == 0:
        continue
    if word not in vocab:
        vocab[word] = len(vocab)
    dataset.append(vocab[word])
#!/usr/bin/env python
# encoding: utf-8
from MeCab import Tagger

# the string to analyze
text = "こんにちは世界!"

tagger = Tagger()

# quick one-shot analysis (print() calls added: the original was Python 2)
print(tagger.parse(text))

# pull out the information for each word
node = tagger.parseToNode(text)
while node:
    print(node.surface, node.feature)
    node = node.next
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
from MeCab import Tagger
from pyknp import Juman

text = ""
f = codecs.open('pyro.txt', 'r', 'utf-8')
fin = codecs.open('mecab.txt', 'a', 'utf-8')
fin1 = codecs.open('juman.txt', 'a', 'utf-8')
m = Tagger("-Owakati")
juman = Juman()
for line in f:
    target_text = line
    inp = m.parse(target_text)
    fin.write(inp)
    # result = juman.analysis(target_text)
    # inp1 = (' '.join([mrph.midasi for mrph in result.mrph_list()]))
    # fin1.write(inp1)
print("done")
f.close()
# Running this through juman++ hit a formatting error partway in
# (changing the encoding might fix it?), so the text files here are
# processed with mecab instead.
# Unpack nuc.zip into the same directory and this will run.
class UDKanbun(object):
    def __init__(self, mecab, danku, model):
        import ufal.udpipe
        if model is None:
            m = ufal.udpipe.Model.load(
                os.path.join(PACKAGE_DIR, "ud-kanbun.udpipe"))
        else:
            m = ufal.udpipe.Model.load(model)
        self.model = m
        if mecab:
            try:
                from MeCab import Tagger
            except ImportError:
                from fugashi import GenericTagger as Tagger
            self.mecab = Tagger("-r " + os.path.join(PACKAGE_DIR, "mecabrc") +
                                " -d " + os.path.join(PACKAGE_DIR, "mecab-kanbun"))
            self.udpipe = ufal.udpipe.Pipeline(m, "conllu", "none", "", "")
        else:
            self.mecab = False
            if danku:
                self.udpipe = ufal.udpipe.Pipeline(
                    m, "tokenizer=joint_with_parsing", "", "", "")
            else:
                self.udpipe = ufal.udpipe.Pipeline(m, "tokenizer=presegmented",
                                                   "", "", "")
        self.danku = danku

    def __call__(self, sentence, raw=False):
        if self.mecab:
            if self.danku == False:
                p = sentence.replace("\u3001", "\u3001\n").replace("\u3002", "\u3002\n")
            elif self.danku == True:
                import udkanbun.danku
                try:
                    self.danku = udkanbun.danku.SegShenShen()
                    p = self.danku(sentence)
                except Exception:
                    self.danku = udkanbun.danku.SegUDKanbun()
                    p = self.danku(sentence)
            else:
                p = self.danku(sentence)
            u = ""
            id = 1
            for s in p.split("\n"):
                if s == "":
                    continue
                m = self.mecab.parse(s)
                u += "# text = " + s + "\n"
                for w in m.split("\n"):
                    if w == "EOS":
                        u += "\n"
                        id = 1
                    elif w != "":
                        s = w.split("\t")
                        t = s[1].split(",")
                        lemma = s[0] if t[6] == "*" else t[6]
                        misc = "SpaceAfter=No" if t[9] == "*" \
                            else "Gloss=" + t[9] + "|SpaceAfter=No"
                        u += "\t".join([
                            str(id), s[0], lemma, t[7],
                            t[0] + "," + t[1] + "," + t[2] + "," + t[3],
                            t[8].replace("*", "_"), "_", "_", "_", misc
                        ]) + "\n"
                        id += 1
        elif self.danku == False:
            u = sentence.replace("\u3002", "\u3002\n").replace(
                "\uFF0E", "\uFF0E\n").replace(".", ".\n")
        else:
            u = sentence
        if raw:
            return self.udpipe.process(u)
        else:
            return UDKanbunEntry(self.udpipe.process(u))
class Mecab():
    """Wrapper for MeCab-ko morphological analyzer.

    `MeCab`_, originally a Japanese morphological analyzer and POS tagger
    developed by the Graduate School of Informatics in Kyoto University,
    was modified to MeCab-ko by the `Eunjeon Project`_ to adapt to the
    Korean language.

    In order to use MeCab-ko within KoNLPy, follow the directions in
    :ref:`optional-installations`.

    .. code-block:: python
        :emphasize-lines: 1

        >>> from unipy_nlp.tagger import Mecab
        >>> mecab = Mecab()
        >>> print(mecab.morphs(u'영등포구청역에 있는 맛집 좀 알려주세요.'))
        ['영등포구', '청역', '에', '있', '는', '맛집', '좀', '알려', '주', '세요', '.']
        >>> print(mecab.nouns(u'우리나라에는 무릎 치료를 잘하는 정형외과가 없는가!'))
        ['우리', '나라', '무릎', '치료', '정형외과']
        >>> print(mecab.pos(u'자연주의 쇼핑몰은 어떤 곳인가?'))
        [('자연', 'NNG'), ('주', 'NNG'), ('의', 'JKG'), ('쇼핑몰', 'NNG'), ('은', 'JX'), ('어떤', 'MM'), ('곳', 'NNG'), ('인가', 'VCP+EF'), ('?', 'SF')]

    :param dicpath: The path of the MeCab-ko dictionary.

    .. _MeCab: https://taku910.github.io/mecab/
    .. _Eunjeon Project: http://eunjeon.blogspot.kr/
    """

    def __init__(self, dicpath=dic_installed_path):
        try:
            self.tagger = Tagger('-d %s' % dicpath)
            self.tagset = read_json('%s/_resources/mecab/mecab_tagset.json'
                                    % module_installed_path)
        except RuntimeError:
            raise Exception(
                'The MeCab dictionary does not exist at "%s". Is the dictionary correctly installed?\n'
                'You can also try entering the dictionary path when initializing the Mecab class: '
                '"Mecab(\'/some/dic/path\')"' % dicpath)
        # except NameError:
        #     raise Exception('Install MeCab in order to use it: http://konlpy.org/en/latest/install/')

    # TODO: check whether flattened results equal non-flattened
    def pos(self, phrase, flatten=True, join=False):
        """POS tagger.

        :param flatten: If False, preserves eojeols.
        :param join: If True, returns joined sets of morph and tag.
        """
        if sys.version_info[0] < 3:
            phrase = phrase.encode('utf-8')
            if flatten:
                result = self.tagger.parse(phrase).decode('utf-8')
                return parse(result, join=join)
            else:
                return [
                    parse(self.tagger.parse(eojeol).decode('utf-8'), join=join)
                    for eojeol in phrase.split()
                ]
        else:
            if flatten:
                result = self.tagger.parse(phrase)
                return parse(result, join=join)
            else:
                # parse() already returns str on Python 3; the original
                # called .decode('utf-8') here, which raises AttributeError
                return [
                    parse(self.tagger.parse(eojeol), join=join)
                    for eojeol in phrase.split()
                ]

    def morphs(self, phrase):
        """Parse phrase to morphemes."""
        return [s for s, t in self.pos(phrase)]

    def nouns(self, phrase):
        """Noun extractor."""
        tagged = self.pos(phrase)
        return [s for s, t in tagged if t.startswith('N')]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from glob import glob

files = glob("*.txt")

from MeCab import Tagger
mecab = Tagger("-F%f[6]\\t%m\\n -E\ ")


def get_first(s):
    s = s.split("\t")
    if len(s) < 2:
        return ''
    # prefer the lemma (%f[6]); fall back to the surface (%m)
    return s[0] or s[1]


for filename in files:
    with open(filename) as f:
        lines = [list(filter(lambda s: s,
                             [get_first(s) for s in mecab.parse(line).split('\n')]))
                 for line in f]
    for word_n in range(1, 4):
        from collections import defaultdict
        d = defaultdict(int)
        tokens = 0
        for line in lines:
            tokens += max(0, len(line) - word_n + 1)
            for i in range(0, len(line) - word_n + 1):
                d["".join(line[i:i+word_n])] += 1
        with open(filename + "." + str(word_n) + "word", mode="w") as f_nword:
            sum = 0