def __init__(self, mode: int = PlayMode.HIRA_ZHUYIN): self.mecab = MeCab.Tagger() # -Oyomi -Owakati self.Mode = mode
"""Build a text8-style corpus (ja.text8) from shuffled, MeCab-tokenized lines."""
import linecache
import random

import MeCab


def get_byte_num(s):
    """Return the UTF-8 encoded size of *s* in bytes."""
    return len(s.encode('utf-8'))


if __name__ == '__main__':
    random.seed(42)
    filename = 'tmp.txt'
    save_file = 'ja.text8'
    # LIMIT_BYTES = 100000000
    t = MeCab.Tagger('-Owakati')
    # Count lines with a context manager (the original leaked the handle).
    with open(filename) as fin:
        num_lines = sum(1 for _ in fin)
    # linecache.getline is 1-indexed; the original used range(num_lines),
    # which fetched the empty "line 0" and skipped the last line.
    indices = list(range(1, num_lines + 1))
    random.shuffle(indices)
    with open(save_file, 'w') as f:
        count_byte = 0
        for i in indices:
            print('{} bytes'.format(count_byte))
            text = linecache.getline(filename, i)
            text = text.strip()
            text = t.parse(text).strip()
            f.write(text)
            count_byte += get_byte_num(text)
            # if count_byte >= LIMIT_BYTES:
            #     break
""" 韩语分词 """ import MeCab mecab = MeCab.Tagger("-Owakati") with open("/Users/ff/Desktop/测评数据/去空格/ko_chosun_test.txt", 'r', encoding='utf-8') as f_in: with open("/Users/ff/Desktop/测评数据/去空格/ko_chosun_test_split.txt", 'w', encoding='utf-8') as f_out: for sentence in f_in: mecab.parse(sentence) # print(mecab.parse(sentence)) f_out.write(str(mecab.parse(sentence)).strip()) f_out.write('\n') print("Finish line") # from janome.tokenizer import Tokenizer as janome_tokenizer # # with open("/Users/ff/Desktop/train_data/jp/jp_web.txt", 'r', encoding='utf-8') as f_in: # with open("/Users/ff/Desktop/train_data/jp/jo_web_split_token3.txt", 'w', encoding='utf-8') as f_out: # for sentence in f_in: # # sentence = "日本人のものと見られる、延べ2億件のメールアドレスとパスワードが闇サイトで販売されていたことがわかりました。過去に漏えいしたデータを集めたものと見られ、調査に当たったセキュリティー企業は、日本を狙ったサイバー攻撃のきっかけになるおそれがあるとして注意を呼びかけています。" # token_object = janome_tokenizer() # alist = [x.surface for x in token_object.tokenize(sentence)] # print(" ".join(alist)) # f_out.write(" ".join(alist).strip()) # f_out.write('\n')
def build_vocabulary_per_proc(sentence_files, total_proc, vocab_dict_file, proc_no):
    """Tokenize the sentence files assigned to this process and build a vocabulary.

    The files in *sentence_files* are dealt out round-robin across
    *total_proc* processes; this process handles the files at position
    *proc_no* of each batch. For each handled file it writes a
    space-separated wakati file and pickles a {word: count} dict to
    ``vocab_dict_file + '_<n>'``.
    """
    # Load NG (stop) words. Strip trailing newlines and use a set:
    # the original kept '\n' on every entry (so no MeCab surface ever
    # matched) and did O(n) list lookups per token.
    with open(G.NG_WORD_LIST, mode='rt') as f:
        NG_WORD_LIST = {line.rstrip('\n') for line in f}
    tmp_sentence_files = []
    dict_file_no = 0
    mecab = MeCab.Tagger('-d /usr/lib/mecab/dic/mecab-ipadic-neologd')
    mecab.parse('')  # guard against the surface-GC quirk in older bindings
    # If there are fewer files than processes, shrink the batch size.
    if len(sentence_files) < total_proc:
        total_proc = len(sentence_files)
    processed_files = 0
    # Collect file paths batch by batch; handle the one matching proc_no.
    for tmp_sentence_file in sentence_files:
        tmp_sentence_files.append(tmp_sentence_file)
        if len(tmp_sentence_files) == total_proc or (
                len(sentence_files) - processed_files) < total_proc:
            processed_files += total_proc
            # Final (short) batch may have no file for this process.
            if len(tmp_sentence_files) < (proc_no) + 1:
                break
            sentence_file = tmp_sentence_files[proc_no]
            sentence_wakati_file = wakati_files_dir + os.path.basename(
                sentence_file) + '_wakati'
            tmp_sentence_files = []
            vocabulary = dict()
            print("proc_no:{} file:{}".format(proc_no, sentence_file))
            with open(sentence_wakati_file, mode='w') as swf:
                sentences = Sentences(sentence_file)
                print("Generating Vocabulary from the sentences")
                train_words = 0
                counter = 0
                for sentence in sentences:
                    wakati_line = []
                    counter += 1
                    if counter % 100000 == 0:
                        print("proc:{} counter:{}".format(proc_no, counter))
                    node = mecab.parseToNode(sentence)
                    while node:
                        word = node.surface
                        wakati_line.append(word)
                        # Skip NG (stop) words.
                        if word in NG_WORD_LIST:
                            node = node.next
                            continue
                        # Skip words shorter than min_char.
                        if len(word) < G.min_char:
                            node = node.next
                            continue
                        # Skip single digits/letters. Fixed: the original
                        # pattern was [a-xA-Z0-9], which let 'y' and 'z'
                        # through.
                        if len(word) == 1 and re.match('[a-zA-Z0-9]', word):
                            node = node.next
                            continue
                        pos1 = node.feature.split(',')[0]
                        pos2 = node.feature.split(',')[1]
                        # Learn non-numeric nouns, and verbs/adjectives
                        # longer than min_char.
                        if (pos1 == '名詞' and pos2 != '数' and pos2 != '非自立') or \
                           (pos1 == '動詞' and pos2 == '自立' and len(word) > G.min_char) or \
                           (pos1 == '形容詞' and len(word) > G.min_char):
                            vocabulary.setdefault(word, 0)
                            vocabulary[word] += 1
                            train_words += 1
                        node = node.next
                    swf.write(' '.join(wakati_line) + '\n')
            print("Vocabulary size = %d" % len(vocabulary))
            print("Total words to be trained = %d" % train_words)
            with open(vocab_dict_file + '_' + str(dict_file_no), 'wb') as f:
                pickle.dump(vocabulary, f)
            dict_file_no += 1
# return df[df.group_id==row.group_id].loc[:,['group_id', 'domain', 'surface']] # print(search_synonyms('巨人')) # print(search_synonyms('Amazon')) ########################################################## ######################################################### # BoWとDoc2Vecの結果の比較をしてみる # https://qiita.com/kaki_1900/items/474bf00c0720af1ff1bf from os.path import normpath, dirname, join import os import MeCab import unicodedata import neologdn tagger = MeCab.Tagger() def tokenize(text): text = unicodedata.normalize('NFKC', text) # <1>unicode正規化 text = neologdn.normalize(text) # <2>neologdn正規化(全角半角) text = text.lower() # <3>小文字に統一 node = tagger.parseToNode(text) result = [] while node: features = node.feature.split(',') if features[0] != 'BOS/EOS': if features[0] not in ['助詞', '助動詞', '記号']: # <4>ストップワード除去 token = features[6] \
# coding: utf-8
"""Dump "surface , POS-subtype" lines for every morpheme of scraping.csv."""
import MeCab

mecab = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')

INPUT_FILE_PATH = "./scraping.csv"
OUTPUT_FILE_PATH = "./mecab.txt"

with open(INPUT_FILE_PATH) as f:
    text = f.read()

# Keep the parse result alive so node.surface is not garbage-collected
# (known quirk of older mecab-python bindings; this line was commented out
# in the original).
mecab.parse('')

node = mecab.parseToNode(text)
# Open the output once — the original reopened it in append mode for
# every single token.
with open(OUTPUT_FILE_PATH, mode='a') as out:
    while node:
        # Surface form of the token.
        word = node.surface
        # Second feature field (POS sub-category).
        pos = node.feature.split(",")[1]
        out.write('{0} , {1}\n'.format(word, pos))
        # Advance to the next morpheme.
        node = node.next
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

# Regular expressions used to tokenize.
_WORD_SPLIT = re.compile(b"([.,!?\"':;)(])")
_DIGIT_RE = re.compile(br"\d")

# URLs for WMT data.
_WMT_ENFR_TRAIN_URL = "http://www.statmt.org/wmt10/training-giga-fren.tar"
_WMT_ENFR_DEV_URL = "http://www.statmt.org/wmt15/dev-v2.tgz"

tagger = MeCab.Tagger("-Owakati")


def maybe_download(directory, filename, url):
    """Download filename from url unless it's already in directory.

    Returns the local path to the (possibly pre-existing) file.
    """
    if not os.path.exists(directory):
        print("Creating directory %s" % directory)
        os.mkdir(directory)
    filepath = os.path.join(directory, filename)
    if not os.path.exists(filepath):
        print("Downloading %s to %s" % (url, filepath))
        filepath, _ = urllib.request.urlretrieve(url, filepath)
        statinfo = os.stat(filepath)
        # Fixed typo in the user-facing message ("Succesfully").
        print("Successfully downloaded", filename, statinfo.st_size, "bytes")
    return filepath
def __init__(self):
    """Initialize with a default MeCab tagger (system dictionary, default output)."""
    self.tagger = MeCab.Tagger()
def wakati(text):
    """Segment *text* with MeCab's wakati output and return the token list."""
    parsed = MeCab.Tagger("-Owakati").parse(text)
    return parsed.rstrip(" \n").split(" ")
keywords = ["アレルギー"] if text.find(keywords[0]) != -1: return 1 else: return 0 if __name__ == "__main__": param = sys.argv f = open(param[1], "r") texts = f.read() f.close() #Parsing won't help so fat m = MeCab.Tagger("-Owakati") #print parse_text(texts, m) #for i in dialogue: #print i.encode('utf-8') + separater #Stop wordsで区切る方法 num = len(texts) stop_words = "すか たか" start_words = "です" seped = re.split('すか|です|たか|した', texts) #seped = re.split('すか|たか', texts) m = MeCab.Tagger("-Owakati") rand_id = random.randint(0, 100)
def tokenize(text):
    """Return *text* split into MeCab wakati tokens."""
    tagger = MeCab.Tagger("-Owakati")
    # Dummy parse first — works around the surface-GC quirk in older bindings.
    tagger.parse("")
    segmented = tagger.parse(text).strip()
    return segmented.split()
# -*- coding: utf-8 -*-
"""Helpers for mapping livedoor-corpus files to category ids and contents."""
import os
import sys
import re
from gensim import corpora, matutils
import MeCab
import time

DATA_DIR_PATH = './data/text/'
DICTIONARY_FILE_NAME = 'livedoordic.txt'

mecab = MeCab.Tagger('mecabrc')


def get_class_id(file_name):
    """Return the index of the category directory whose name appears in *file_name*, or None."""
    dir_list = get_dir_list()
    dir_name = next(filter(lambda x: x in file_name, dir_list), None)
    if dir_name:
        return dir_list.index(dir_name)
    return None


def get_dir_list():
    """Return the sorted category sub-directories under DATA_DIR_PATH."""
    # os.listdir raises on error and never returns None, so the original
    # `if tmp is None: return None` guard was unreachable and is removed.
    entries = os.listdir(DATA_DIR_PATH)
    return sorted(x for x in entries if os.path.isdir(os.path.join(DATA_DIR_PATH, x)))


def get_file_content(file_path):
    """Return an article body; the livedoor corpus text starts on line 3."""
    with open(file_path, encoding='utf-8') as f:
        return ''.join(f.readlines()[2:])
import MeCab
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import argparse
import smart_open
import re

parser = argparse.ArgumentParser()
parser.add_argument('faq', type=str)
# parser.add_argument('model', type=str)
parser.add_argument("--dictionary", "-d", type=str, help="mecab dictionary")
parser.add_argument("--stop_words", "-s", type=str, help="stop words list")
args = parser.parse_args()

# Wakati tagger, optionally pointed at a user dictionary.
mecab = MeCab.Tagger("-Owakati" +
                     ("" if not args.dictionary else " -d " + args.dictionary))

questions = []
sentences = []
originals = []
j = 0
# NOTE(review): `gensim` and `models` are not imported in this chunk —
# presumably imported elsewhere in the file; verify. Also, `j` is never
# incremented in the visible code, so every tag would be "SENT_0" unless
# the loop continues past this chunk.
for line in open(args.faq, "r", encoding="cp932"):  # utf-8
    cols = line.strip().split('\n')  # a stripped line has no '\n'; cols[0] is the whole line
    questions.append(
        gensim.utils.simple_preprocess(mecab.parse(cols[0]).strip(), min_len=1))
    originals.append(cols[0])
    sentences.append(
        models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(
            mecab.parse(cols[0]).strip(), min_len=1),
            tags=["SENT_" + str(j)]))
""" 取得したツイートをWordCloudで可視化 """ import csv import MeCab from wordcloud import WordCloud # 参照:https://qiita.com/berry-clione/items/b3a537962c84244a2a09 dicdir = '-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd' tagger = MeCab.Tagger(dicdir) with open("./output/tweet_data", "r") as f: reader = csv.reader(f, delimiter="\t") texts = [] for row in reader: texts.append(row) # 4パターンのWordCloudを作成 patterns = [[["名詞", "動詞", "形容詞"], "all"], [["名詞"], "noun"], [["動詞"], "verb"], [["形容詞"], "adjective"]] # 形態素解析(Mecab) -> WordCloud 処理 for pattern in patterns: words = [] for text in texts: text = " ".join(text) text = text.split("http")[0] # http 以降はトリ(URLは最後に載せるパターンが多いため) node = tagger.parseToNode(text) while node: if node.feature.split(",")[0] in pattern[0]:
"""Read da_samples.dat and build tokenized sentences + dialogue-act labels."""
import MeCab
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
import dill

# MeCab initialization; the dummy parse guards against the surface-GC
# quirk in older mecab-python bindings.
mecab = MeCab.Tagger()
mecab.parse('')

sents = []
labels = []

# samples.dat (output of generate-samples.txt) holds a dialogue-act type,
# an utterance, and tag/character-position info.
with open("da_samples.dat", "r") as samples:
    for line in samples:
        line = line.rstrip()
        da, utt = line.split('\t')
        words = []
        # Fixed: the inner loop used to be `for line in ...`, shadowing the
        # file-iteration variable; it now has its own name. The open file
        # is also managed with a context manager instead of being leaked.
        for parsed_line in mecab.parse(utt).splitlines():
            if parsed_line == "EOS":
                break
            # MeCab emits "surface\tfeatures"; keep the surface form.
            word, feature_str = parsed_line.split("\t")
            words.append(word)
        # Space-joined token string for the vectorizer.
        sents.append(" ".join(words))
        # Dialogue-act type label.
        labels.append(da)
import re
import urllib.request
import MeCab
import mojimoji
from pathlib import Path

tagger = MeCab.Tagger(
    "-Ochasen -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd")


def get_stopwords(save_dir="input/") -> set:
    """Return the SlothLib Japanese stop-word set, caching it under *save_dir*."""
    path = Path(save_dir) / "stopwords.txt"
    if path.exists():
        with open(path) as f:
            stopwords = f.read().split("\n")
        return set(stopwords)
    # First call: download the list and cache it locally.
    url = "http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt"
    stopwords = urllib.request.urlopen(url).read().decode("utf8")
    with open(path, "w") as f:
        f.write(stopwords)
    return set(stopwords.split("\n"))


def tokenizer(x: str, stopwords: set, include_verb=True) -> str:
    # NOTE(review): this definition is truncated in this chunk; the filter
    # expression below continues past the visible source.
    text = mojimoji.zen_to_han(x.replace("\n", ""), kana=False)
    parsed = tagger.parse(text).split("\n")
    parsed = [t.split("\t") for t in parsed]
    parsed = list(
        filter(
def converter(word):
    """Return the lemma (7th feature field) of the first morpheme of *word*."""
    tagger = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
    feature_block = tagger.parse(word).split('\t')[1]
    return feature_block.split(',')[6]
wtype2 = word.split('\t')[1].split(',')[1] #品詞細分類1 #名詞はそのまま,形容詞、動詞、副詞は原型を使用する if wtype == "名詞" and wtype2 in norns: return (word.split('\t')[0]) elif wtype == "形容詞" and wtype2 in ["自立", "非自立"]: return (word.split('\t')[1].split(',')[6]) elif wtype == "動詞" and wtype2 == "自立": return (word.split('\t')[1].split(',')[6]) elif wtype == "副詞": return (word.split('\t')[1].split(',')[6]) if __name__ == "__main__": p = Path(__file__).parent.resolve() / "toots_log" file_paths = [f for f in p.iterdir()] m = MeCab.Tagger("-d /usr/lib/mecab/dic/mecab-ipadic-neologd") words = [] #使用する品詞細分類1のリスト for file_path in tqdm(file_paths): with open(file_path, "r") as f: text = f.read() #カスタム絵文字を取り除く text = re.sub(r":[a-zA-Z0-9_-]+:", "", text) #分かち書きを行い単語の品詞が形容詞、動詞、名詞、副詞のみを取得する words.extend( [get_word(word) for word in m.parse(text).splitlines()[:-1]]) words = collections.Counter(words) words = pd.DataFrame(words, index=["num"]) words = words.T words = words[words["num"] > 199]
def onExecute(self, ec_id):
    """RTC execution callback (Python 2: note the bare `print` statements).

    Reads ranked speech-recognition candidates from the XML input port,
    picks the top-scoring text, and publishes (a) a wakati token list and
    (b) a list of base forms of content words via the output ports.
    The same wakati/chasen processing is repeated for the plain-string
    input port.
    """
    # --- XML input port: ranked recognition candidates ---
    while self._xmldataIn.isNew():
        data = self._xmldataIn.read()
        data.data = data.data.decode('utf-8')
        speechdata = BeautifulSoup(data.data, "lxml")
        totaldata = []
        for data.data in speechdata.findAll('data'):
            rank = int(data.data['rank'])
            score = float(data.data['score'])
            text = data.data['text']
            xmldata = XMLSet(rank, score, text.encode("utf-8"))
            totaldata.append(xmldata)
        # Sort by score, best first (index 1 of each XMLSet tuple-like).
        totalxmldata = sorted(totaldata, key=lambda x: x[1], reverse=True)
        print str(totalxmldata).decode('string-escape')
        highxmldata = totalxmldata[0]
        print(highxmldata[2])
        intextdata = highxmldata[2]
        # Wakati segmentation of the best candidate.
        taggerwakati = MeCab.Tagger("-Owakati")
        data_wakati = taggerwakati.parse(intextdata)
        list_wakati = data_wakati.split(' ')
        print str(list_wakati).decode('string-escape')
        self._d_wakati.data = list_wakati
        self._wakatiOut.write()
        # ChaSen-format pass: collect base forms of content-word POSes.
        taggerchasen = MeCab.Tagger("-Ochasen")
        taggerchasen.parse('')
        node = taggerchasen.parseToNode(intextdata)
        chasendata = []
        while node:
            resorg = node.feature.split(",")[6]  # base (dictionary) form
            ps = node.feature.split(",")[0]      # part of speech
            if ps == "名詞":
                chasendata.append(resorg)
            if ps == "動詞":
                chasendata.append(resorg)
            if ps == "形容詞":
                chasendata.append(resorg)
            if ps == "副詞":
                chasendata.append(resorg)
            if ps == "助詞":
                chasendata.append(resorg)
            if ps == "接続詞":
                chasendata.append(resorg)
            if ps == "助動詞":
                chasendata.append(resorg)
            if ps == "連体詞":
                chasendata.append(resorg)
            if ps == "感動詞":
                chasendata.append(resorg)
            node = node.next
        chasendata.append("\n")
        print str(chasendata).decode('string-escape')
        self._d_chasen.data = chasendata
        self._chasenOut.write()
    # --- plain-string input port: same processing on raw text ---
    while self._strdataIn.isNew():
        intext = self._strdataIn.read()
        intextdata = intext.data
        taggerwakati = MeCab.Tagger("-Owakati")
        data_wakati = taggerwakati.parse(intextdata)
        list_wakati = data_wakati.split(' ')
        print str(list_wakati).decode('string-escape')
        self._d_wakati.data = list_wakati
        self._wakatiOut.write()
        taggerchasen = MeCab.Tagger("-Ochasen")
        taggerchasen.parse('')
        node = taggerchasen.parseToNode(intextdata)
        chasendata = []
        while node:
            resorg = node.feature.split(",")[6]
            ps = node.feature.split(",")[0]
            if ps == "名詞":
                chasendata.append(resorg)
            if ps == "動詞":
                chasendata.append(resorg)
            if ps == "形容詞":
                chasendata.append(resorg)
            if ps == "副詞":
                chasendata.append(resorg)
            if ps == "助詞":
                chasendata.append(resorg)
            if ps == "接続詞":
                chasendata.append(resorg)
            if ps == "助動詞":
                chasendata.append(resorg)
            if ps == "連体詞":
                chasendata.append(resorg)
            if ps == "感動詞":
                chasendata.append(resorg)
            node = node.next
        chasendata.append("\n")
        print str(chasendata).decode('string-escape')
        self._d_chasen.data = chasendata
        self._chasenOut.write()
    return RTC.RTC_OK
from os import path
from typing import Optional

from flask import Flask, abort, request, Response
import MeCab

CONFIG_PATH = path.join(path.dirname(path.abspath(__file__)), 'flask.cfg')
DIC_DIR = path.join('/', 'usr', 'local', 'lib', 'mecab', 'dic')

# Flask Application
app = Flask(__name__)
app.config.from_pyfile(CONFIG_PATH)

# MeCab tagger using the neologd dictionary.
mecab = MeCab.Tagger(f"-d {path.join(DIC_DIR, 'mecab-ipadic-neologd')}")


@app.route('/', methods=['GET', 'POST'])
def parse():
    """Morphological Analysis by MeCab.

    Request Format:
        GET: /?sentence=アルミ缶の上にあるみかん
        POST: / -X "Content-Type: application/json"
              { "sentence": "アルミ缶の上にあるみかん" }
    """
    # STEP.1 Extraction of a given sentence
    sentence: Optional[str] = None
    # NOTE(review): chunk truncated — the except clause for this `try`
    # (and the rest of the handler) lies past the visible source.
    try:
        if request.method == 'POST':
            sentence = request.json['sentence']
# coding: UTF-8 import MeCab fin = open('neko.txt.mecab') lines = fin.readlines() # 1行毎にファイル終端まで全て読む(改行文字も含まれる) fin.close() # lines: リスト。要素は1行の文字列データ tagger = MeCab.Tagger("-Ochasen") sentence = [] morpheme_set = [] morpheme = {} for line in lines: morpheme_list = line.split('\t') surface = morpheme_list[0] if surface == "EOS\n": morpheme_set.append(sentence) sentence = [] else: morpheme["surface"] = surface feature = morpheme_list[1].split(',') morpheme["base"] = feature[6] morpheme["pos"] = feature[0] morpheme["pos1"] = feature[1] sentence.append(morpheme.copy()) for sentence in morpheme_set: for morpheme in sentence: if (morpheme["pos"] == "動詞"):
#!/usr/bin/env python # -*- coding: utf-8 -*- import unicodedata import nltk import MeCab MECAB = MeCab.Tagger("-Owakati") JA_SYMBOLS = u'!?!?。☆★♡♥❤♪♬♫✿' JA_TOKENIZER = nltk.tokenize.RegexpTokenizer(u'[^{0}]*([{0}]+|$)'.format(JA_SYMBOLS)) EN_SYMBOLS = u'☆★♡♥❤♪♬♫✿' EN_TOKENIZER = nltk.tokenize.RegexpTokenizer(u'[^{0}]*([{0}]+|$)'.format(EN_SYMBOLS)) def normalize(s): return unicodedata.normalize('NFKC', s) def sent_tokenize_ja(s): '''returns a list of strings''' return JA_TOKENIZER.tokenize(s)[:-1] def sent_tokenize_en(s): '''returns a list of strings''' sentences = EN_TOKENIZER.tokenize(s)[:-1] sentencess = map(lambda sent: nltk.sent_tokenize(sent.strip()), sentences) return [sentence for sentences in sentencess for sentence in sentences] def sent_tokenize(s, lang=None): if lang == 'en': return sent_tokenize_en(normalize(s))
import logging
from datetime import datetime
import urllib

from requests_oauthlib import OAuth1Session
import MeCab
import markovify

# Image draw
import io
from PIL import Image, ImageDraw, ImageFont

from . import generate_model

logger = logging.getLogger('django')

# Wakati tagger with the neologd dictionary; -r /dev/null skips any mecabrc.
mec = MeCab.Tagger("-r /dev/null -d /usr/lib/mecab/dic/mecab-ipadic-neologd -O wakati")


# NOTE(review): APIView, Response, status, settings and redirect are not
# imported in this chunk — presumably imported elsewhere in the file; verify.
class AuthRedirectAPIView(APIView):
    """Start the Twitter OAuth flow and redirect the client to the authenticate URL."""

    def get(self, request):
        # A callback URL is required for the three-legged OAuth flow.
        if 'callback' not in request.query_params:
            return Response(
                {
                    'message': 'callback URL not specified!'
                },
                status.HTTP_400_BAD_REQUEST
            )
        oauth = OAuth1Session(settings.TWITTER_API_CONKEY,
                              settings.TWITTER_API_CONSEC,
                              None,
                              None,
                              request.query_params['callback'])
        # Obtain a request token, then send the user to Twitter to authorize.
        oauth.fetch_request_token("https://api.twitter.com/oauth/request_token")
        url = oauth.authorization_url("https://api.twitter.com/oauth/authenticate")
        return redirect(url)
import MeCab

# Smoke test: parse a Korean greeting ("hello") and print the raw output.
# NOTE(review): the default MeCab dictionary is presumably Japanese unless
# a Korean dictionary (e.g. mecab-ko-dic) is configured — verify.
m = MeCab.Tagger()
out = m.parse("안녕하세요")
print(out)
"""Compare wakati segmentation between the stock and neologd dictionaries."""
import MeCab

wakati = MeCab.Tagger('-Owakati')  # word-splitting output, stock dictionary
neo_wakati = MeCab.Tagger(
    '-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')  # with the extra dictionary

word = input("分かち書き:")
# Fixed: the original reassigned `wakati`/`neo_wakati` to their own parse
# results, clobbering the Tagger objects; use separate result variables.
wakati_result = wakati.parse(word).strip()
neo_result = neo_wakati.parse(word).strip()
print('通常辞書:' + wakati_result)
print('追加辞書:' + neo_result)
import tensorflow as tf
from seq2seq_model import Seq2SeqModel
import json
import MeCab
import os
import numpy as np
from util import parse_file, sentence_to_word_id, create_buckets, _buckets, EOS, ignore_list

tagger = MeCab.Tagger("mecabrc")

# Vocabulary mappings saved at training time.
id2word = json.load(open("dictionary_i2w.json", "r"))
word2id = json.load(open("dictionary_w2i.json", "r"))

# (encoder, decoder) length buckets; shadows the `_buckets` imported above.
_buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]

# Load the training conversations and the dictionary.
questions, answers, _, _ = parse_file("../data/conversation_data.txt")

# Convert the sentences into id sequences.
print(questions)
ids_questions = sentence_to_word_id(questions, word2id=word2id)
print(ids_questions)

# +3 — presumably room for special tokens (PAD/GO/EOS or similar);
# verify against util.py.
vocab_size = len(word2id) + 3
print(vocab_size)

ckpt = tf.train.get_checkpoint_state("./tmp")
print(ckpt)
print(tf.train.checkpoint_exists("./tmp/model.ckpt-5000"))

with tf.Session() as sess:
    print('init model')
    # NOTE(review): chunk truncated — the session body continues past the
    # visible source.
import MeCab

# Demo: parse a sentence in ChaSen-compatible format and print the result.
mecab = MeCab.Tagger('-Ochasen')
print(mecab.parse('このソフトクリームとってもおいしくない'))

# mecabrc:  (no arguments — default output)
# -Ochasen: (ChaSen-compatible format)
# -Owakati: (output word segmentation only)
# -Oyomi:   (output readings only)
def sort_random_list(self, phraseList): words = "" # random.shuffle(phraseList) for phrase in phraseList: for word in phrase: words = words + word return words # test if __name__ == '__main__': maker = Maker() m = MeCab.Tagger( r'-Owakati -d C:\Users\hori\workspace\encoder-decoder-sentence-chainer-master\mecab-ipadic-neologd' ) """ parser = Parser() wordsList = [] sentenceList = [] for i in range(20000): index = random.randint(1, 18) words, sentence = maker.generate_word_sentence(index) words = m.parse(words)[:-2].split(' ') sentence = m.parse(sentence)[:-2].split(' ') for i in range(len(words)): w = words[i] words[i] = parser.parse(w) for i in range(len(sentence)):
def parse(self, sentence):
    """Run *sentence* through a fresh default MeCab tagger and return the raw output."""
    return MeCab.Tagger().parse(sentence)
import MeCab

mecab = MeCab.Tagger("-Ochasen")


# Given a text, return the base forms of its nouns, verbs and adjectives.
def extract_words(text):
    """Morphologically analyse *text*; return base forms of nouns/verbs/adjectives."""
    node = mecab.parseToNode(text)
    words = []
    while node:
        # Split the feature string once per node (the original split it
        # twice per node).
        features = node.feature.split(",")
        word = features[6]  # base (dictionary) form
        word_type = features[0]  # part of speech
        if word_type in ["名詞", "動詞", "形容詞"]:
            words.append(word)
        node = node.next
    return words


# Function test
text = '三四郎は京都でちょっと用があって降りたついでに。誰かが困っている時に来るのです。'
# Split the whole text into sentences on the Japanese full stop.
sentences = text.split('。')
# Convert each sentence into a word list.
word_list = [extract_words(sentence) for sentence in sentences]
# Inspect part of the result.
for word in word_list[1]:
    print(word)