def word_frequencies(text):
    from manabi.apps.reading_level.word_frequencies import WORD_FREQUENCIES

    mecab = MeCab()
    frequencies = []
    for node in mecab.parse(text.encode('utf8'), as_nodes=True):
        frequency = WORD_FREQUENCIES.get(node.surface.decode('utf8'))
        if frequency is None:
            continue
        frequencies.append(frequency)
    return frequencies
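# A minimal usage sketch (not part of the original source): it assumes
# WORD_FREQUENCIES maps surface forms to numeric frequency scores and that
# natto-py's MeCab is importable here; it simply averages the scores found.
def mean_word_frequency(text):
    frequencies = word_frequencies(text)
    if not frequencies:
        return None
    return sum(frequencies) / float(len(frequencies))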
class MecabTagger(object):
    """Part-of-speech tagger backed by natto-py's MeCab bindings."""

    # TAGSET = set("""NNG NNP NNB NNBC NR NP VV VA VX VCP VCN MM MAG MAJ IC
    #                 JKS JKC JKG JKO JKB JKV JKQ JX JC EP EF EC ETN ETM
    #                 XPN XSN XSV XSA XR SF SE SSO SSC SC SY SL SH SN
    #                 UNKNOWN EOS""".split())

    def __init__(self, **kwargs):
        self.tagger = MeCab(kwargs)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        del self.tagger

    @staticmethod
    def tagged_tuple(node):
        surface = node.surface
        features = node.feature.split(',')
        first_pos = features[0].partition('+')[0]
        lemma = (features[7].partition('/')[0]
                 if features[4].startswith('Inflect')
                 else surface.lower())
        return Word(decode(surface, True),
                    decode(lemma, True),
                    first_pos.decode('ascii'),
                    node.cost)

    def parse(self, text):  # follow NLTK naming
        return [MecabTagger.tagged_tuple(node)
                for node in self.tagger.parse(
                    text.encode(settings.DEFAULT_ENCODING), as_nodes=True)
                if not node.is_eos()]
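# A minimal usage sketch (not part of the original source): Word, decode and
# settings come from the surrounding project, so this only illustrates the
# intended context-manager pattern rather than standalone runnable code.
with MecabTagger() as tagger:
    for word in tagger.parse(u'すもももももももものうち'):
        print(word)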
if __name__ == '__main__':
    path = '../data/hanreiDB'
    vocab = defaultdict(lambda: len(vocab))

    # open the DB
    db = hanrei_db.SQLite3(path)
    cur = db.open_db()

    # read the data
    sql = "select id, syubunPart from hanrei where id<=150"
    # sql = u"select id, syubunPart, riyuPart from hanrei"
    rows = db.exe_to_db(cur, sql)

    train_data = []
    test_data = []
    nm = MeCab()
    for doc_id, syubunPart in rows:
        print "--------------"
        print "id:", doc_id
        # strip newlines, tabs and spaces
        syubunPart = re.sub(r'(\n|\t| | )', '', syubunPart)
        # split into sentences
        sensp = sensplit.SenSplit(syubunPart)
        syubun_list = sensp()
        for sentence in syubun_list:
            if sentence == '':
                continue
            morph_list = []  # list of morphemes in the sentence
            sentence = sentence.encode('utf_8')  # unicode -> str (utf-8)
            for n in nm.parse(sentence, as_nodes=True):
def main():
    parser = argparse.ArgumentParser(
        description='Chainer example: convolutional seq2seq')
    parser.add_argument('--batchsize', '-b', type=int, default=48,
                        help='Number of examples in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=100,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--unit', '-u', type=int, default=512,
                        help='Number of units')
    parser.add_argument('--layer', '-l', type=int, default=6,
                        help='Number of layers')
    parser.add_argument('--head', type=int, default=8,
                        help='Number of heads in attention mechanism')
    parser.add_argument('--dropout', '-d', type=float, default=0.1,
                        help='Dropout rate')
    parser.add_argument('--model', type=str,
                        help='trained model')
    parser.add_argument('--input', '-i', type=str, default='./',
                        help='Input directory')
    parser.add_argument('--source', '-s', type=str,
                        default='europarl-v7.fr-en.en',
                        help='Filename of train data for source language')
    parser.add_argument('--target', '-t', type=str,
                        default='europarl-v7.fr-en.fr',
                        help='Filename of train data for target language')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--source-vocab', type=int, default=40000,
                        help='Vocabulary size of source language')
    parser.add_argument('--target-vocab', type=int, default=40000,
                        help='Vocabulary size of target language')
    parser.add_argument('--no-bleu', '-no-bleu', action='store_true',
                        help='Skip BLEU calculation')
    parser.add_argument('--use-label-smoothing', action='store_true',
                        help='Use label smoothing for cross entropy')
    parser.add_argument('--embed-position', action='store_true',
                        help='Use position embedding rather than sinusoid')
    parser.add_argument('--use-fixed-lr', action='store_true',
                        help='Use fixed learning rate rather than the ' +
                             'annealing proposed in the paper')
    parser.add_argument('--disable-mecab', '--dm', action='store_true',
                        help='Disable MeCab tokenization')
    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=4))

    # Check file
    en_path = os.path.join(args.input, args.source)
    source_vocab = ['<eos>', '<unk>', '<bos>'] + \
        preprocess.count_words(en_path, args.source_vocab)
    source_data = preprocess.make_dataset(en_path, source_vocab)
    fr_path = os.path.join(args.input, args.target)
    target_vocab = ['<eos>', '<unk>', '<bos>'] + \
        preprocess.count_words(fr_path, args.target_vocab)
    # print('Original training data size: %d' % len(source_data))
    # print('Filtered training data size: %d' % len(train_data))

    source_ids = {word: index for index, word in enumerate(source_vocab)}
    target_ids = {word: index for index, word in enumerate(target_vocab)}
    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    m = MeCab('-Owakati')

    # Define Model
    model = net.Transformer(
        args.layer,
        min(len(source_ids), len(source_words)),
        min(len(target_ids), len(target_words)),
        args.unit,
        h=args.head,
        dropout=args.dropout,
        max_length=500,
        use_label_smoothing=args.use_label_smoothing,
        embed_position=args.embed_position)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)
    chainer.serializers.load_npz(args.model, model)

    def translate_one(source, target):
        words = preprocess.split_sentence(source)
        print('# source : ' + ' '.join(words))
        x = model.xp.array([source_ids.get(w, 1) for w in words], 'i')
        ys = model.translate([x], beam=5)[0]
        words = [target_words[y] for y in ys]
        print('# result : ' + ' '.join(words))
        print('# expect : ' + target)

    def tokenize(source, target):
        if args.disable_mecab:
            return source, target
        return m.parse(source), m.parse(target)

    while True:
        source = input('source> ')
        target = input('target> ')
        source, target = tokenize(source, target)
        translate_one(source, target)
def __init__(self, vocab: Vocab):
    self.vocab = vocab
    MeCab = try_mecab_import()  # type: ignore[func-returns-value]
    self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
from typing import List

from natto import MeCab


def tokenize(mecab: MeCab, sentence: str) -> List[str]:
    return [
        node.surface
        for node in mecab.parse(sentence, as_nodes=True)
        if node.surface
    ]
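# A minimal usage sketch (not part of the original source), assuming a system
# MeCab dictionary is installed: BOS/EOS nodes have an empty surface, so the
# `if node.surface` filter above drops them.
if __name__ == '__main__':
    print(tokenize(MeCab(), 'すもももももももものうち'))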
def _non_wrapped_insert_mode(self, session, *, is_develop_mode=True):
    with MeCab() as mecab:
        manalysis.insert(session, mecab, is_develop_mode=is_develop_mode)
        morph.insert(session, is_develop_mode=is_develop_mode)
def __init__(self, **kwargs):
    self.tagger = MeCab(kwargs)
# -*- coding: utf-8 -*-
from natto import MeCab

mc = MeCab()

# Sample text taken from cookbiz.jp
text = "お仕事については基本的には店舗に配属してからのOJTが中心となりますが、先輩スタッフがしっかりとサポートしてくれるので、どなたも安心してお仕事していただけます。2013年には本社内に開発室を設置。店舗配属前にもトレーニングを行なってから実際の店舗に配属されるなど、サポート体制がしっかりと整っているのも当社の魅力。実際、経験が浅い方や未経験スタートのスタッフも多数活躍中!"

print('Input text:\n' + text)
print('====================================================')

# The -F / --node-format option specifies the output format of each node:
#
# %m    ... surface form of the morpheme
# %f[0] ... part of speech
# %h    ... part-of-speech ID (IPADIC)
# %f[8] ... pronunciation
#
words = []
with MeCab('-F%m,%f[0],%h') as nm:
    for n in nm.parse(text, as_nodes=True):
        node = n.feature.split(',')
        if len(node) != 3:
            continue
        if node[1] == '名詞':  # keep nouns only
        # if True:
            words.append(node[0])
print(words)
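# A small follow-on sketch (not part of the original source): count the
# extracted nouns with collections.Counter to see the most frequent terms.
from collections import Counter

print(Counter(words).most_common(5))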
            node.surface,   # surface form
            feature[0],     # POS (level 1)
            feature[1],     # POS (level 2)
            feature[2],     # POS (level 3)
            feature[3],     # POS (level 4)
            feature[6],     # base form
            node.cost,      # cumulative cost
            node.posid      # POS ID
        ], index=df.columns)
        df = df.append(series, ignore_index=True)
    return df


if __name__ == '__main__':
    parser = MeCab(
        "-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd")
    # parser = MeCab()
    nodes = parser.parse("私は人間です。", as_nodes=True)
    for node in nodes:
        if not node.is_eos():
            print(node.surface, " : ", node.feature, node.cost, node.posid)

    text = """
私は人間です。
呼吸と食事ができます。
私は望遠鏡で泳ぐ少女を見た。
"""
    df = mecab_parse2df(text, parser)
    print(df)
def __init__(self):
    self.mc = MeCab()
# author: alex
from natto import MeCab

# 31 + 32
with open("verbs.txt", "w+"):
    pass
text = open("neko.txt", "r+")
res_file = open("verbs.txt", "a+")
reader = text.readlines()
for line in reader:
    with MeCab('-F%f[0],%f[6]') as nm:
        for n in nm.parse(line, as_nodes=True):
            if not n.is_eos() and n.is_nor():
                klass, word = n.feature.split(',', 1)
                if klass in ['動詞']:  # ['名詞', '形容詞', '形容動詞', '動詞']:
                    print(word)
                    res_file.write(word + ' ')
    res_file.write('\n')
text.close()
res_file.close()

# 33
with open("neko_hen.txt", "w+"):
    pass
text = open("neko.txt", "r+")
res_file = open("neko_hen.txt", "a+")
reader = text.readlines()
for line in reader:
    with MeCab('-F%f[1],%f[6]') as nm:
        for n in nm.parse(line, as_nodes=True):
            if not n.is_eos() and n.is_nor():
""" Sample program for perse sites sentences to generate keywords for them. """ from natto import MeCab nm = MeCab() text = "this is a test" print nm.parse(text)
# -*- coding:utf-8 -*-
from natto import MeCab
import collections as cl
import random
from bs4 import BeautifulSoup
import Lib_py.params as params
import Lib_py.study as study

# global instance
mc_g = MeCab('-F%m,%f[0],%h')


def get_next_search_candidate(html_src):
    target_str = get_str_from_http(html_src)
    if len(target_str) == 0:
        raise ValueError("can't extract strings from html source")
    parsed_words_l = get_words_list(target_str)
    print(parsed_words_l)
    study.dump_get_words(parsed_words_l)
    common_words_l = study.get_common_words()
    common_removed_words_l = [
        w for w in parsed_words_l if w not in common_words_l
    ]
    freq_words_l = get_freq_words_list(common_removed_words_l)
    return select_next_search_word_candidate(freq_words_l)


def get_str_from_http(html_src):
    soup = BeautifulSoup(html_src)
# -*- coding: utf-8 -*-
from natto import MeCab
from numpy import array
from gensim import corpora, matutils

nm = MeCab('-F%m,%f[0],%h')


def get_meishi(sentence):
    """Extract only the nouns from a sentence.

    :param sentence: String
    :return words: list of String.

    Example:
        get_meishi("ピンチ、ピンチの時には必ずヒーローが現れる。")
        ==> ['ピンチ', 'ピンチ', '時', 'ヒーロー']
    """
    # The -F / --node-format option specifies the output format of each node:
    #
    # %m    ... surface form of the morpheme
    # %f[0] ... part of speech
    # %h    ... part-of-speech ID (IPADIC)
    # %f[8] ... pronunciation
    #
    words = []
    for n in nm.parse(sentence, as_nodes=True):
        node = n.feature.split(',')
        if len(node) != 3:
            continue
        if node[1] == '名詞':
import os
from natto import MeCab
# library used to build the bag of words
from gensim import corpora, matutils

mc = MeCab()
txt_word_list = []
# read the folder that stores the text files
files = os.listdir(os.path.dirname(__file__) + '/path/txt')
# read the text files under the folder one by one
for file in files:
    # build a list of the nouns and verbs in each text file (same as Q11-1)
    with open(os.path.dirname(__file__) + '/path/txt/' + file, 'r') as f:
        txt = f.read()
        word_list = []
        for n in mc.parse(txt, as_nodes=True):
            if not (n.is_bos() or n.is_eos()):
                part, word = n.feature.split(',', 1)
                if part == "名詞" or part == "動詞":
                    word_list.append(n.surface)
        # add the word list for this text file
        txt_word_list.append(word_list)

# to build the bag of words, collect every distinct word and create a
# dictionary that assigns each one a word ID
corpus_dic = corpora.Dictionary(txt_word_list)
# convert each document's word list into a corpus (word IDs and counts)
def main():
    nm = MeCab('-Owakati')
    word = "MeCabは 京都大学情報学研究科−日本電信電話株式会社コミュニケーション科学基礎研究所 共同研究ユニットプロジェクトを通じて開発されたオープンソース 形態素解析エンジンです。 言語, 辞書,コーパスに依存しない汎用的な設計を 基本方針としています。 パラメータの推定に Conditional Random Fields (CRF) を用 いており, ChaSenが採用している 隠れマルコフモデルに比べ性能が向上しています。また、平均的に ChaSen, Juman, KAKASIより高速に動作します。 ちなみに和布蕪(めかぶ)は, 作者の好物です。"
    print(nm.parse(word))
    lis = [n.surface for n in nm.parse(word, as_nodes=True) if n.is_nor()]
    print(lis)
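# A small follow-on sketch (not part of the original source), assuming natto's
# MeCab is already imported here: with -Owakati the plain parse() result is a
# single whitespace-delimited string, so splitting it gives the tokens directly.
def tokens_via_split(sentence):
    with MeCab('-Owakati') as nm:
        return nm.parse(sentence).split()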
# coding:utf-8
import pandas as pd
from natto import MeCab

mc = MeCab()

select = pd.read_csv('./input/keyword.csv', encoding='SHIFT-JIS', header=None)
select = select[0].values.tolist()

tango_retu = []
score_retu = []
# Japanese Sentiment Polarity Dictionary (declinable-word edition) ver.1.0
# (December 2008): terms labeled ポジ (positive) score 1, ネガ (negative) score -1
with open("./dictionary/wago.121808.pn.txt", 'r') as f:
    for l in f.readlines():
        l = l.split('\t')
        l[1] = l[1].replace(" ", "").replace('\n', '')
        value = 1 if l[0].split('(')[0] == "ポジ" else -1
        tango_retu.append(l[1])
        score_retu.append(value)
wago_dic = dict(zip(tango_retu, score_retu))

tango_retu = []
score_retu = []
# Japanese Sentiment Polarity Dictionary (noun edition) ver.1.0 (December 2008):
# terms labeled p score 1, e score 0, and n score -1
with open("./dictionary/pn.csv.m3.120408.trim", 'r') as f:
    for l in f.readlines():
# -*- coding: utf-8 -*-
from sklearn.feature_extraction.text import CountVectorizer
from natto import MeCab

_morpheme_type = ['NNG', 'NNP']  # mecab-ko tags: common noun, proper noun
_escape_pattern = ['\n']
_nm = MeCab()


def filter_by_type(text):
    terms = []
    for term_info in str(_nm.parse(text)).split('\n'):
        _term_info = term_info.split('\t')
        if len(_term_info) < 2:
            continue
        surface = _term_info[0]
        analysis = _term_info[1].split(',')
        if analysis[0] in _morpheme_type:
            terms.append(surface)
    return terms


def generate_corpus2(data_path):
    _corpus = []
    fp = open(data_path, 'r')
    for line in fp.readlines():
        if line not in _escape_pattern:
            terms = filter_by_type(line)
            _corpus.append(' '.join(terms))
from natto import MeCab
import os
from gensim import corpora

mc = MeCab()
txt_list = []
files = os.listdir(os.path.dirname(__file__) + '/path/txt')
for file in files:
    with open(os.path.dirname(__file__) + '/path/txt/' + file, 'r') as f:
        txt = f.read()
        word_list = []
        for n in mc.parse(txt, as_nodes=True):
            if not (n.is_bos() or n.is_eos()):
                part, word = n.feature.split(',', 1)
                if part == "名詞" or part == "動詞":
                    word_list.append(n.surface)
        txt_list.append(word_list)

dictionary = corpora.Dictionary(txt_list)
corpus_list = [dictionary.doc2bow(txt) for txt in txt_list]

# The code from the book starts on the line below
from gensim import matutils, models

# the code that prepares corpus_list is omitted in the book

# build the TF-IDF model
tfidf_model = models.TfidfModel(corpus_list, normalize=True)
# apply TF-IDF to the corpus
# load the library for using MeCab from Python
import os
from natto import MeCab

# meros holds the text data of "Run, Melos!"
# create the object that runs MeCab
mc = MeCab()

# when the input is a text file, read it as shown below
with open(os.path.dirname(__file__) + '/path/txt/meros.txt', 'r') as f:
    txt = f.read()

word_list = []
# run morphological analysis with MeCab
for part_and_word in mc.parse(txt, as_nodes=True):
    # skip the begin-of-sentence / end-of-sentence objects
    if not (part_and_word.is_bos() or part_and_word.is_eos()):
        # get the part of speech and the word from the analysis result
        part, word = part_and_word.feature.split(',', 1)
        # keep only nouns and verbs
        if part == '名詞' or part == '動詞':
            word_list.append(part_and_word.surface)
# %%
from IPython.display import display, HTML
from natto import MeCab

nm = MeCab()
a = ""
text = "こんにちは!野球は走る、打つ、投げるスポーツです。"
print(text)

# highlight verbs (動詞) with a colored background in the HTML output
with MeCab('-F%m,%f[0],%h,%f[8]') as nm:
    for n in nm.parse(text, as_nodes=True):
        lis = n.feature.split(",")
        try:
            if lis[1] == "動詞":
                b = ("<span style='background-color:#ffcc99'>{0}</span>"
                     .format(lis[0]))
            else:
                b = ("<span style='background-color:#ffffff'>{0}</span>"
                     .format(lis[0]))
            a = a + b
        except IndexError:
            pass

display(HTML(a))