def get_keitaiso_list_from_juman(text):
    """Morphologically analyze text and return its morphemes.
    Juman resolves orthographic-variation issues that MeCab cannot handle.
    """
    jumanpp = Jumanpp()
    keitaiso_list = []
    hinshi_list = []
    exclusive_word_list = get_exclusive_word_list()
    # Spaces cause an error, and a leading '#' makes the process hang (reason unknown).
    text = text.replace(" ", "").replace("　", "").replace("#", "/")
    result = jumanpp.analysis(text)  # Python 3 strings are already Unicode
    try:
        for mrph in result.mrph_list():
            keitaiso = mrph.genkei
            hinshi = mrph.hinsi
            # Skip morphemes with a disallowed POS, in the exclusion list, or purely numeric.
            if not is_valid_word_class(hinshi) or keitaiso in exclusive_word_list or keitaiso.isdigit():
                continue
            keitaiso_list.append(keitaiso)
            hinshi_list.append(hinshi)
    except Exception:
        traceback.print_exc()
    return [keitaiso_list, hinshi_list]
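# Minimal usage sketch for get_keitaiso_list_from_juman. Assumes Juman++ is
# installed and that get_exclusive_word_list() and is_valid_word_class() are
# defined in this module; the sample sentence and output are illustrative.
if __name__ == '__main__':
    keitaiso_list, hinshi_list = get_keitaiso_list_from_juman("ケーキを食べた")
    for word, pos in zip(keitaiso_list, hinshi_list):
        print(word, pos)  # base form and its POS, e.g. ケーキ 名詞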
def create_gensim_dictionary(data_path, no_below=2, no_above=0.1):
    print("# morphological analysis")
    jumanpp = Jumanpp()  # create the analyzer once instead of once per line
    docs = {}
    docs_title = {}
    for root, dirs, files in os.walk(data_path):
        for docname in files:
            docs[docname] = []
            with open(os.path.join(data_path, docname), "r") as f:
                lines = f.readlines()
                docs_title[docname] = lines[0]
                for text in lines:
                    text_replace = text.replace(" ", "").replace("\n", "").replace(
                        "#", "").replace("@", "")
                    if text_replace != "":
                        result = jumanpp.analysis(text_replace)
                        for mrph in result.mrph_list():
                            if len(mrph.midasi) > 1:
                                docs[docname].append(mrph.midasi)
        break  # files are opened relative to data_path, so only the top level is read
    dictionary = gensim.corpora.Dictionary(docs.values())
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    return docs, docs_title, dictionary
def main():
    if len(sys.argv) != 2:
        print('need one argument for a file.')
        return
    file_name = sys.argv[1]
    vocab_dict = defaultdict(int)
    juman = Jumanpp()
    with open(file_name, 'r', encoding='utf-8', newline='') as fr:
        text = fr.readlines()
    for line in text:
        # Juman++ does not handle half-width characters, so convert them to full-width.
        line = line.replace(' ', '　')
        # str.translate returns a new string, so reassign it.
        line = line.translate(
            str.maketrans(
                {chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)}))
        analysis = juman.analysis(line.replace('\n', ''))
        for m in analysis.mrph_list():
            vocab_dict[str(m.midasi)] += 1
    sorted_dict = sorted(vocab_dict.items(), key=lambda x: x[1], reverse=True)
    print(sorted_dict)
    print(len(sorted_dict))
def append_repname(words):
    """
    :param words: a list of Word instances
    :return: a list of Word instances with preprocessed words with the representative expressions
    """
    n_word = len(words)
    juman = Jumanpp()
    bar = progressbar.ProgressBar()
    for i in bar(range(n_word), max_value=n_word):
        word = words[i]
        if word.uid != i:
            continue  # already merged
        repname_set = []
        r = juman.analysis(word.p_surface)
        for mrph in r.mrph_list():
            if mrph.bunrui == '数詞':
                repname_set.append([kansuji2arabic(mrph.midasi)])
            elif mrph.repnames() != '':
                repname_set.append(mrph.repnames().split('?'))
            else:
                repname_set.append([mrph.midasi])
        words[i].alias.extend(expand_ambiguity(repname_set))
    return words
def read_and_analyze_text():
    jumanpp = Jumanpp()
    midasis = []
    repnames = []
    repname_counts = {}
    wikipedia_redirections = []
    w_rs = []
    w_r_counts = {}
    row_result = []
    while True:
        input_ = sys.stdin.readline()
        if input_ == '':
            break
        input_ = input_.strip()
        if input_ == '':
            continue
        result = jumanpp.analysis(input_)
        for mrph in result.mrph_list():
            if mrph.repname not in repname_counts:
                repname_counts[mrph.repname] = 0
            if mrph.midasi not in midasis and mrph.repname != "":
                repname_counts[mrph.repname] += 1
            w_r = get_wikipedia_redirection(mrph.imis)
            if not w_r:
                w_r = mrph.midasi
            if w_r not in w_r_counts:
                w_r_counts[w_r] = 0
            if mrph.midasi not in midasis:
                w_r_counts[w_r] += 1
            midasis.append(mrph.midasi)
            repnames.append(mrph.repname)
            wikipedia_redirections.append(w_r)
            w_rs.append(w_r)
        midasis.append("\n")
        repnames.append("\n")
        wikipedia_redirections.append(None)
        w_rs.append("\n")
        repname_counts["\n"] = 0
        w_r_counts["\n"] = 0
        row_result.append(result.spec())
    yure_result = []
    for i, midasi in enumerate(midasis):
        # A word is flagged as "yure" (orthographic variation) when its
        # representative form or Wikipedia redirection appears under more
        # than one surface form.
        yure = repname_counts[repnames[i]] > 1 or w_r_counts[w_rs[i]] > 1
        yure_result.append({
            "midasi": midasi,
            "repname": repnames[i],
            "wikipedia_redirection": wikipedia_redirections[i],
            "repname_count": repname_counts[repnames[i]],
            "w_r_count": w_r_counts[w_rs[i]],
            "yure": yure
        })
    return row_result, yure_result
def analysis_text(self, text, debug=None):
    jumanpp = Jumanpp()
    # Juman++ occasionally fails with unexpected errors; return None in that case.
    try:
        result = jumanpp.analysis(text)
    except Exception:
        return None
    if debug:
        self.__print_analyzed(result)
    return result
def segment(texts):
    jumanpp = Jumanpp()
    results = {}
    for text in texts:
        try:
            parsed = jumanpp.analysis(han_to_zen(neologdn.normalize(text)))
            segmented = ' '.join(m.midasi for m in parsed.mrph_list())
            results[text] = segmented
        except Exception:
            logger.warning('Cannot parse {}'.format(text))
            continue
    return results
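# Usage sketch for segment(), assuming neologdn, a han_to_zen helper, and a
# configured logger are available; the output shown is illustrative.
if __name__ == '__main__':
    results = segment(['今日はいい天気です'])
    print(results)  # roughly {'今日はいい天気です': '今日 は いい 天気 です'}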
def parser_juman(text):
    from pyknp import Jumanpp
    jumanpp = Jumanpp()
    result = jumanpp.analysis(text)
    words = []
    for n in result.mrph_list():
        # Skip particles, auxiliary verbs, symbols, and whitespace.
        if n.hinsi not in ('助詞', '助動詞', '特殊') and n.bunrui != '空白':
            if n.hinsi == '動詞':
                words.append(n.genkei)  # use the base form for verbs
            else:
                words.append(n.midasi)
    return words
class MorphAnalysis:
    def __init__(self):
        self.stop_path = str(pathlib.Path(
            __file__).resolve().parent) + '/data/stopwords_slothlib.txt'
        self.stopwords = []
        with open(self.stop_path, 'r') as f:
            self.stopwords = f.read().split()
        # morphological analyzer
        self.jumanpp = Jumanpp()

    def to_wakati(self, text, allow_word_class=None,
                  remove_stopwords=False, genkei=False):
        # Use None as the default to avoid a shared mutable default list.
        if allow_word_class is None:
            allow_word_class = [
                '名詞', '指示詞', '動詞', '形容詞', '判定詞', '助動詞', '副詞', '助詞',
                '接続詞', '連体詞', '感動詞', '接頭辞', '特殊', '未定義語'
            ]
        wkt = ""
        text = mojimoji.han_to_zen(text)
        rst = self.jumanpp.analysis(text)
        for mrph in rst.mrph_list():
            # Morpheme attributes: midasi, yomi, genkei, hinsi, bunrui,
            # katuyou1, katuyou2, imis, repname
            if remove_stopwords and (mrph.genkei in self.stopwords):
                continue
            if mrph.hinsi in allow_word_class:
                if genkei:
                    wkt += mrph.genkei + ' '
                else:
                    wkt += mrph.midasi + ' '
        return wkt
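# Usage sketch for MorphAnalysis, assuming data/stopwords_slothlib.txt exists
# next to this module and Juman++ is installed; outputs are illustrative.
if __name__ == '__main__':
    analyzer = MorphAnalysis()
    print(analyzer.to_wakati('ケーキを食べた'))               # surface forms: 'ケーキ を 食べた '
    print(analyzer.to_wakati('ケーキを食べた', genkei=True))  # base forms: 'ケーキ を 食べる '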
def __init__(self, command='knp', option='-tab', rcfile='', server=None,
             port=31000, timeout=30, pattern=r'(?:^|\n)EOS($|\n)',
             jumanrcfile='', juman_option='-e2 -B', juman_port=32000,
             juman_command='juman', jumanpp=False):
    self.use_jumanpp = (juman_command == "jumanpp") or jumanpp
    assert 'EOS' in pattern
    self.pattern = pattern
    self.EOS = 'EOS'
    # Only the -tab output format is parsed.
    assert '-tab' in option
    if rcfile and not os.path.isfile(os.path.expanduser(rcfile)):
        sys.stderr.write("Can't read rcfile (%s)!\n" % rcfile)
        quit(1)
    # Set up Juman(++)
    assert port != juman_port
    juman_args = {'option': juman_option, 'rcfile': jumanrcfile,
                  'server': server, 'port': juman_port}
    if self.use_jumanpp:
        self.juman = Jumanpp(**juman_args)
    else:
        self.juman = Juman(**juman_args)
    # Set up KNP
    if server is not None:
        self.socket = Socket(server, port, option=option, timeout=timeout)
        self.query = partial(self.socket.query, pattern=pattern)
    else:
        if rcfile:
            option += " -r {}".format(rcfile)
        self.subprocess = Subprocess(command, option=option)
        self.query = partial(self.subprocess.query, pattern=pattern)
def __init__(self, command='knp', server=None, port=31000, timeout=60,
             option='-tab', rcfile='', pattern=r'EOS',
             jumancommand='juman', jumanrcfile='', jumanpp=False):
    self.command = command
    self.server = server
    self.port = port
    self.timeout = timeout
    self.option = option
    self.rcfile = rcfile
    self.pattern = pattern
    self.socket = None
    self.subprocess = None
    self.jumanpp = (jumancommand == "jumanpp") or jumanpp
    if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
        sys.stderr.write("Can't read rcfile (%s)!\n" % self.rcfile)
        quit(1)
    if self.jumanpp:
        self.juman = Jumanpp()
    else:
        self.juman = Juman(command=jumancommand, rcfile=jumanrcfile)
def segment_ja(texts, flag_keep_number=False):
    jumanpp = Jumanpp()
    results = {}
    for text in texts:
        try:
            parsed = jumanpp.analysis(han_to_zen(text))
            if flag_keep_number:
                segmented = ' '.join(m.midasi for m in parsed.mrph_list())
            else:
                # Replace numerals with a <数詞> placeholder token.
                segmented = ' '.join('<数詞>' if m.bunrui == '数詞' else m.midasi
                                     for m in parsed.mrph_list())
            results[text] = segmented
        except Exception:
            logger.warning('Cannot parse {}'.format(text))
            continue
    return results
def __init__(self, specific_parts: Optional[List[str]] = None,
             specific_domains: Optional[List[str]] = None):
    self.juman: Jumanpp = Jumanpp()
    if specific_parts is None:
        specific_parts = ['普通名詞']
    if specific_domains is None:
        specific_domains = ['料理・食事']
    self.specific_parts: List[str] = specific_parts
    self.specific_domains: List[str] = specific_domains
    self.words: Optional[List[str]] = None
def main():
    model_w2v = gensim.models.KeyedVectors.load_word2vec_format(
        "/share/data/word2vec/2016.08.02/w2v.midasi.256.100K.bin",
        binary=True, unicode_errors='ignore')
    word2index = {w: i for i, w in enumerate(model_w2v.index2word)}
    model = BiLSTM(embed_mat=model_w2v.vectors, mid_size=128)
    serializers.load_npz("BiLSTM_attention.model", model)
    # Read test sentences from standard input.
    jumanpp = Jumanpp()
    while True:
        input_sentence = sys.stdin.readline()  # str, includes the trailing newline
        if input_sentence == '':  # EOF
            break
        result = jumanpp.analysis(input_sentence)
        doc = [mrph.midasi for mrph in result.mrph_list()]
        x = [doc2list(doc, word2index)]
        # x = list2Var([doc2vec(doc)], np.float32, False)
        with chainer.using_config("train", False):
            y, attn_list = model.predict(x)
        p = np.argmax(y[0].data)
        doc_class = ["新聞記事", " 雑誌 ", " 教科書 ", " ブログ "]
        print("")
        print("*------------------------*")
        print("|                        |")
        print("|        " + doc_class[p] + "        |")
        print("|                        |")
        print("*------------------------*")
        print("")
        prob = F.softmax(y, axis=1)[0].data
        print("新聞記事: {:.6f} 雑誌: {:.6f} 教科書: {:.6f} ブログ: {:.6f}".format(
            prob[0], prob[1], prob[2], prob[3]))
        # Show words sorted by attention weight, highest first.
        for word, attn in sorted(zip(doc, attn_list), key=lambda x: x[1],
                                 reverse=True):
            print(word, end=", ")
        print("\n")
def __init__(self, command='jumanpp', timeout=30, pattern=r'EOS',
             server=None, port=12000, is_use_pyknp=False, **args):
    """* What you can do
    - You can select the backend process of jumanpp.
        - jumanpp-pexpect: calls jumanpp on your local machine and keeps the jumanpp process running.
        - jumanpp-pyknp: calls jumanpp on your local machine, launching a new jumanpp process on every call, so it is slower than jumanpp-pexpect.
        - jumanpp-server: calls jumanpp somewhere else. Keep in mind that you need a jumanpp server process running there.

    * Parameters
    - timeout: time to wait for the jumanpp process.
    - is_use_pyknp: flag deciding whether to use pyknp as the backend process. If True, pyknp is used; if False, pexpect. pexpect is much faster than pyknp, but cannot be used on Windows.
    - server: hostname where jumanpp is running
    - port: port number where jumanpp is running
    """
    # type: (str, int, str, str, int, bool) -> None
    self.eos_pattern = pattern
    self.is_use_pyknp = is_use_pyknp
    if server is not None:
        pattern = pattern.encode('utf-8')
    if os.name == 'nt':
        # pexpect is unavailable on Windows, so force the pyknp backend.
        if not self.is_use_pyknp:
            logger.warning(msg="You're not able to use pexpect on Windows. Forcing is_use_pyknp = True.")
        self.is_use_pyknp = True
    if server is None and self.is_use_pyknp:
        # jumanpp-pyknp #
        self.jumanpp_obj = Jumanpp(command=command,
                                   timeout=timeout,
                                   pattern=pattern,
                                   **args)
    elif server is None:
        # jumanpp-pexpect #
        self.jumanpp_obj = JumanppHnadler(jumanpp_command=command,
                                          timeout_second=timeout,
                                          pattern=pattern)
    else:
        # jumanpp-server #
        self.jumanpp_obj = JumanppClient(hostname=server, port=port,
                                         timeout=timeout)
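# Backend-selection sketch for this wrapper. The class name JumanppWrapper is
# hypothetical (the snippet above only shows __init__); arguments follow the
# docstring above.
#
#   tokenizer = JumanppWrapper()                                # jumanpp-pexpect (default)
#   tokenizer = JumanppWrapper(is_use_pyknp=True)               # jumanpp-pyknp
#   tokenizer = JumanppWrapper(server='localhost', port=12000)  # jumanpp-server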
def parse(line):
    if line is None or line == "\n":
        return
    jumanpp = Jumanpp()
    # Strip newlines and both full- and half-width spaces.
    replaced = re.sub('\n|\u3000| ', '', line)
    result = jumanpp.analysis(replaced)
    words = []
    for mrph in result.mrph_list():
        print('{0} reading: {1} POS: {2} conjugation 1: {3} conjugation 2: {4}'.format(
            mrph.midasi, mrph.yomi, mrph.hinsi, mrph.katuyou1, mrph.katuyou2))
        words.append(mrph.midasi)
    return words
def parser_func_jumanpp(lemmatize: bool = True) -> Callable[[str], List[str]]:
    jumanpp = Jumanpp()
    if lemmatize:
        def f(s: str) -> List[str]:
            return [m.genkei for m in jumanpp.analysis(s).mrph_list()]
        return f
    else:
        def g(s: str) -> List[str]:
            return [m.midasi for m in jumanpp.analysis(s).mrph_list()]
        return g
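# Usage sketch, assuming Juman++ is installed locally; output is illustrative.
if __name__ == '__main__':
    tokenize = parser_func_jumanpp(lemmatize=True)
    print(tokenize('ケーキを食べた'))  # base forms, e.g. ['ケーキ', 'を', '食べる']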
def __init__(self, command='jumanpp', timeout=30, pattern=r'EOS',
             server: str = None, port: int = 12000, **args):
    # type: (str, int, str, str, int) -> None
    if server is not None:
        pattern = pattern.encode('utf-8')
    self.eos_pattern = pattern
    if server is None:
        self.jumanpp_obj = Jumanpp(command=command, timeout=timeout,
                                   pattern=pattern, **args)
    else:
        self.jumanpp_obj = JumanppClient(hostname=server, port=port,
                                         timeout=timeout)
def __init__(self, command='jumanpp', timeout=30, pattern='EOS',
             server=None, port=12000, **args):
    # type: (str, int, str, str, int, Dict[str, Any]) -> None
    self.eos_pattern = pattern
    if server is None:
        self.jumanpp_obj = Jumanpp(command=command, timeout=timeout,
                                   pattern=pattern, **args)
    else:
        self.jumanpp_obj = JumanppClient(hostname=server, port=port,
                                         timeout=timeout)
def __init__(self, word2vec_model: Word2VecModel,
             juman_command: str = 'jumanpp',
             specific_parts: Optional[List[str]] = None) -> None:
    if specific_parts is None:
        specific_parts = ['普通名詞']
    if juman_command == 'juman':
        self.juman: Union[Juman, Jumanpp] = Juman()
    elif juman_command == 'jumanpp':
        self.juman = Jumanpp()
    else:
        raise AttributeError('juman_command must be "juman" or "jumanpp"')
    self.knp: KNP = KNP(jumancommand=juman_command)
    self.specific_parts: List[str] = specific_parts
    self.word2vec: Word2VecModel = word2vec_model
class JumanParser(Parser):
    def __init__(self):
        super().__init__()
        # Strip punctuation and both half- and full-width spaces before parsing.
        remove_pattern = r'・|、|\,|\.| |　'
        self.remove_compiled = re.compile(remove_pattern)
        self.analyzer = Jumanpp()

    def parse(self, message):
        for sent in message.sentences:
            sent.text = self.remove_compiled.sub('', sent.text)
            parsed = self.analyzer.analysis(sent.text)
            mrph_list = parsed.mrph_list()
            sent.bag = self.create_bags(mrph_list)
            message.bags += sent.bag
        return message

    @staticmethod
    def create_bags(mrph_list):
        bag = []
        for mrph in mrph_list:
            if mrph.hinsi == '名詞' or mrph.hinsi == '動詞':
                bag.append(mrph.genkei)
        return bag
class IntentSlotDatasetReader(DatasetReader):
    def __init__(self, lazy=False, max_tokens=64):
        super().__init__(lazy)
        self.token_indexers = {'tokens': SingleIdTokenIndexer()}
        self.max_tokens = max_tokens
        self.jumanpp = Jumanpp()

    def _read(self, file_path):
        with open(file_path, 'r') as f:
            for line in f:
                line = line.strip().split()
                label = line[-1]
                line = [tt.split(':') for tt in line[:-2]]
                text = [Token(tt[0]) for tt in line][0:self.max_tokens]
                tags = [tt[1] for tt in line][0:self.max_tokens]
                yield self.text_to_instance(text, label, tags)

    def tokenizer(self, text):
        text = [
            Token(mrph.midasi)
            for mrph in self.jumanpp.analysis(text).mrph_list()
        ][0:self.max_tokens]
        return text

    def text_to_instance(self, text, label=None, tags=None):
        text_field = TextField(text, self.token_indexers)
        fields = {'text': text_field}
        if label:
            label_field = LabelField(label, label_namespace='labels')
            fields['label'] = label_field
        if tags:
            tags_field = SequenceLabelField(tags, text_field,
                                            label_namespace='tags')
            fields['tag'] = tags_field
        return Instance(fields)
import os
import sys
import json
import pickle

import numpy as np
import gensim as gs
import pandas as pd

from dict import Vocabulary
from pyknp import Jumanpp

jumanpp = Jumanpp()


def load_embeddings(vocabulary):
    word_embeddings = {}
    for word in vocabulary:
        word_embeddings[word] = np.random.uniform(-0.25, 0.25, 300)
    return word_embeddings


def pad_data(data, size, pad_index):
    new_data = []
    for data_ in data:
        if len(data_) >= size:
            data_ = data_[:size]
        else:
            while len(data_) < size:
                data_.append(pad_index)
        new_data.append(data_)
    return new_data


def batch_iter(data, batch_size, num_epochs, shuffle=True):
    data = np.array(data)
# -*- coding: utf-8 -*-
from pyknp import Jumanpp

# Use Juman++ in subprocess mode
jumanpp = Jumanpp()
result = jumanpp.analysis("ケーキを食べる")
for mrph in result.mrph_list():
    print("surface: {0}".format(mrph.midasi))
with open(args[1], "r") as f:
    textslist = [s.split("\t")[1].strip() for s in f if len(s.split("\t")) > 1]
with open(args[2], "r") as f:
    wordslist = [s.split("\t")[1].strip() for s in f if len(s.split("\t")) > 1]

tlist = [w.replace(" ", "_") for w in textslist if w != "" and not isAscii(w)]
wlist = [w.replace(" ", "_") for w in wordslist if w != ""]

t_midasi = []
w_midasi = []
jumanpp = Jumanpp()
"""
for i, s in enumerate(tlist):
    print("Processing Text:{}".format(i))
    if s == "":
        continue
    result = jumanpp.analysis(s)
    midasi_lst = []
    for w in result.mrph_list():
        midasi_lst.append([w.midasi.replace("_", " "), "O"])
    t_midasi.append(midasi_lst)
"""
print("-----------------")
for i, s in enumerate(wlist):
    print("Processing Word:{}".format(i))
from pyknp import Jumanpp

parser = argparse.ArgumentParser()
parser.add_argument("--input_text", help="classify text", type=str,
                    default="日本でのビジネス")
parser.add_argument("--path_to_model", help="model to use", type=str,
                    default="./models/my-model.ckpt")
args = parser.parse_args()

jumanpp = Jumanpp()
classify_data = []
vocab = Vocabulary("data_use.txt")
result = jumanpp.analysis(args.input_text)
for mrph in result.mrph_list():
    word = mrph.midasi
    classify_data.append(vocab.stoi(word))
classify_data = data_helper.pad_one(classify_data, 256, 0)

with open("training_config.json") as f:
    params = json.load(f)
embedding_mat = np.load("./models/embedding.npy")
def split_into_words(text):
    """Convert an article into a list of words."""
    result = Jumanpp().analysis(text)
    return [mrph.midasi for mrph in result.mrph_list()]
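# Usage sketch, assuming Juman++ is installed; output is illustrative.
if __name__ == '__main__':
    print(split_into_words('ケーキを食べた'))  # e.g. ['ケーキ', 'を', '食べた']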