def run(self):
    data = self.load()
    jumanpp = Juman()
    output = []
    for _, row in data.iterrows():
        zenkaku = jaconv.h2z(row["sentence"], ascii=True, digit=True)
        splited = [
            mrph.midasi for mrph in jumanpp.analysis(zenkaku).mrph_list()
        ]
        if self.task_name == 'QA_B':
            qa_zenkaku = jaconv.h2z(
                f"{row['target']}の{row['aspect']}は{row['sentiment']}",
                ascii=True,
                digit=True,
            )
        else:
            qa_zenkaku = " "
        qa_splited = [
            mrph.midasi for mrph in jumanpp.analysis(qa_zenkaku).mrph_list()
        ]
        output.append({
            "context": " ".join(splited),
            "qa": " ".join(qa_splited),
            "label": 1
        })
    self.dump(pd.DataFrame(output))
def initialize(fword, tword, modelfn, start, debug):
    juman = Juman()
    # parse and check from_word
    ms_f = juman.analysis(fword).mrph_list()
    if len(ms_f) > 1:
        print(u'{} is parsed into multiple words'.format(fword))
        exit(1)
    wm_f = ms_f[0]
    if not wm_f.repname:
        print(u'no repname with {}'.format(fword))
        exit(1)
    fword = wm_f.repname
    # parse and check to_word
    ms_t = juman.analysis(tword).mrph_list()
    if len(ms_t) > 1:
        print(u'{} is parsed into multiple words'.format(tword))
        exit(1)
    wm_t = ms_t[0]
    if not wm_t.repname:
        print(u'no repname with {}'.format(tword))
        exit(1)
    tword = wm_t.repname
    # load and check model
    print(u'loading model...')
    if modelfn.split('.')[-1] == 'model':
        model = Word2Vec.load(modelfn)
    elif modelfn.split('.')[-1] == 'bin':
        model = Word2Vec.load_word2vec_format(modelfn, binary=True, unicode_errors='ignore')
    if fword not in model.vocab:
        raise KeyError(u'{} is not found in the model'.format(fword))
    elif tword not in model.vocab:
        raise KeyError(u'{} is not found in the model'.format(tword))
    model.save('hs0.100m.500.5.18mgt100.model')
    t1 = time.clock() - start
    if debug:
        printtime(t1)
    print(u'constructing id2vocab map...')
    id2vocab = {}
    for i, v in enumerate(model.vocab):
        id2vocab[i] = v
    t2 = time.clock() - t1
    if debug:
        printtime(t2)
    print(u'constructing V...')
    V = []
    for v in model.vocab:
        V.append(model[v])
    V = np.vstack(V)
    t3 = time.clock() - t2
    if debug:
        printtime(t3)
    return fword, tword, model, V, id2vocab, t3
class JumanTokenizer:
    def __init__(self):
        self.juman = Juman()

    def __call__(self, text):
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]
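# Hedged usage sketch (not part of the original snippet): assumes Juman/Juman++
# and pyknp are installed and on PATH; the exact token split depends on the
# dictionary version.
# tokenizer = JumanTokenizer()
# tokenizer("これはペンです。")  # e.g. ['これ', 'は', 'ペン', 'です', '。']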
def analyzer():
    bc = BertClient(ip='bertserving', output_fmt='list')
    client = Elasticsearch('elasticsearch:9200')
    texts = []
    list_text = []
    jumanpp = Juman()
    query = request.args.get('q')
    result = jumanpp.analysis(query)
    for mrph in result.mrph_list():
        texts.append(mrph.midasi)
    list_text.append(" ".join(texts))
    query_vector = bc.encode(list_text, is_tokenized=False)[0]
    script_query = {
        "script_score": {
            "query": {"match": {"source": "tb"}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, doc['question_vector']) + 1.0",
                "params": {"query_vector": query_vector}
            }
        }
    }
    response = client.search(index=INDEX_NAME, body={
        "size": SEARCH_SIZE,
        "query": script_query
    })
    return jsonify(response)
def main(DATA_ROOT):
    text_files = Path(DATA_ROOT).glob('**/*.txt')
    for text_file in text_files:
        with open(text_file) as f:
            content = f.read()
        content = re.sub(r"=+(.*?)=+", r"\g<1>", content)
        content = re.sub(r"^\n", "", content, flags=re.MULTILINE)
        content = content.replace('<block>', '')
        content = content.replace('<math-element>', '')
        # In this case, 。 can be removed safely
        sentences = re.split(r"[。\n]", content)
        sentences = [line for line in sentences if len(line) != 0]
        sentences = [''.join(line.split()) for line in sentences]
        # Drop sentences that cannot be parsed properly
        val_sentences = []
        offsets = []
        juman = Juman()
        for sentence in tqdm(sentences):
            # Try to parse; skip the sentence if Juman rejects it
            try:
                result = juman.analysis(sentence)
            except ValueError:
                print(sentence)
                continue
            except Exception as e:
                raise e
            current = 0
            offset = [0 for _ in range(len(sentence))]
            for mrph in result.mrph_list():
                current = current + len(mrph.midasi)
                try:
                    offset[current - 1] = 1
                except IndexError as e:
                    print(sentence)
                    print(current)
                    for _mrph in result.mrph_list():
                        print(_mrph.midasi)
                    raise e
                except Exception as e:
                    raise e
            val_sentences.append(sentence)
            offsets.append(offset)
        # keep the parsed sentences aligned with their offsets
        results = (val_sentences, offsets)
        file_name = text_file.name[:-4] + '.pickle'
        dic = text_file.parent
        with open(Path(dic, file_name), 'wb') as f:
            pickle.dump(results, f)
class SentimentAnalysis:
    def __init__(self, bert_model: str, fine_tuned_model: str, jumanpp_command: str):
        self.jumanpp = Juman(command=jumanpp_command)
        self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=False)
        self.model = BertPosNegClassifier(bert_model)
        state_dict = torch.load(fine_tuned_model, map_location=torch.device('cpu'))
        self.model.load_state_dict({k.replace('module.', ''): v for k, v in state_dict.items()})
        self.model.eval()

    def get_prediction(self, sentence: str) -> int:
        print(sentence)
        text: str = self._segmentation(sentence)
        tokenized_text: List[str] = ['[CLS]'] + self.tokenizer.tokenize(text) + ['[SEP]']
        indexed_tokens: List[List[int]] = [self.tokenizer.convert_tokens_to_ids(tokenized_text)]
        tokens_tensor = torch.tensor(indexed_tokens)
        attention_mask_tensor = torch.tensor([[1] * len(tokenized_text)])
        # segments_tensors = torch.tensor([[0] * len(indexed_tokens_list[0]) for _ in range(text_length)])
        output: torch.Tensor = self.model(tokens_tensor, attention_mask=attention_mask_tensor)
        prediction: int = torch.argmax(output[0]).item()  # 0 or 1
        if prediction == 0:
            prediction = -1
        return prediction

    def _segmentation(self, text: str) -> str:
        result = self.jumanpp.analysis(text)
        return ' '.join(mrph.midasi for mrph in result.mrph_list())
def _apply_jumanpp(self, inp: str) -> Tuple[str, str]:
    jumanpp = Juman(command=self.juman, option=self.juman_option)
    jumanpp_result = jumanpp.analysis(inp)
    jumanpp_out = jumanpp_result.spec() + 'EOS\n'
    jumanpp_conll_out = self._jumanpp2conll_one_sentence(jumanpp_out) + 'EOS\n'
    return jumanpp_out, jumanpp_conll_out
class JumanService(object):
    def __init__(self):
        self.__juman = Juman()

    def analysis(self, string):
        formattedString = JumanKnpUtil.format_input_string(string)
        return self.__juman.analysis(formattedString)
class JumanTokenizer():
    _trans_tables = str.maketrans({"\"": "", "@": "@", "#": "#"})

    def __init__(self):
        self.juman = Juman()

    def _preprocess(self, sentences):
        return sentences.replace(" ", "").replace("\n", "").translate(self._trans_tables)

    def tokenize(self, text):
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]

    def _preprocess_list(self, datas):
        return [[x, self._preprocess(x)] for x in datas]

    def _tokenize_for_multi(self, datas):
        try:
            return [datas[0], self.tokenize(datas[1])]
        except:
            return []

    def tokenize_multi(self, datas, thread=cpu_count()):
        datas = self._preprocess_list(datas)
        num_of_datas = len(datas)
        with Pool(thread) as pool:
            imap = pool.imap_unordered(self._tokenize_for_multi, datas)
            result = list(tqdm(imap, total=num_of_datas))
        return result
class JumanTokenizer():
    def __init__(self):
        self.juman = Juman(jumanpp=True)

    def tokenize(self, text):
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]
class JumanTokenizer:
    def __init__(self, command, options):
        # pass the binary path and option string explicitly; the second
        # positional argument of pyknp's Juman is not the option string
        self.juman = Juman(command=command, option=options)

    def tokenize(self, text):
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]
class Tokenizer:
    def __init__(self):
        self.jumanpp = Juman()
        self.replace_map = {'"': '’', '#': '‘'}
        self.rev_map = {s: t for t, s in self.replace_map.items()}

    def tokenize(self, text):
        sequence = []
        for line in text.split('\n'):
            for sentence in line.split(' '):
                for tgt_symbol, sub_symbol in self.replace_map.items():
                    sentence = sentence.replace(tgt_symbol, sub_symbol)
                result = self.jumanpp.analysis(sentence)
                for mrph in result.mrph_list():
                    midasi = mrph.midasi
                    if midasi in self.rev_map:
                        midasi = self.rev_map[midasi]
                    sequence.append(midasi)
                sequence.append(' ')
            del sequence[-1]
            sequence.append('\n')
        del sequence[-1]
        return sequence
def string_word_point(self, df):
    jumanpp = Juman(jumanpp=False)
    tmp_word = []
    df_time_word = pd.DataFrame(index=[], columns=['time', 'word'])            # words and their timestamps
    df_word_point = pd.DataFrame(index=[], columns=['word', 'point'])          # words and their occurrence counts
    df_time_point = pd.DataFrame(index=[], columns=['time', 'point'])          # times and the number of comments at each time
    df_time_www_point = pd.DataFrame(index=[], columns=['time', 'point'])      # times and the number of "www" comments
    df_time_hakusyu_point = pd.DataFrame(index=[], columns=['time', 'point'])  # times and the number of applause comments
    df_URL_point = pd.DataFrame(index=[], columns=['URL', 'point'])            # URL summary
    for i in range(len(df)):
        # record the comment if it is a URL
        url = URL_hanbetu(df['comment'][i])
        if url != False:
            tmp = self.my_index(df_URL_point['URL'], url)
            df_URL_point = self.make_df_append(df_URL_point, tmp, url)
        # strip symbols from the comment
        print(df['comment'][i])
        df['comment'][i] = self.my_delete(df['comment'][i])
        # convert h:m:s to an integer time
        tmp_time = self.strtime_to_inttime(df['time'][i])
        # count comments per time
        tmp = self.my_index(df_time_point['time'], tmp_time)
        df_time_point = self.make_df_append(df_time_point, tmp, tmp_time)
        # add 1 if the comment contains "www", otherwise add 0
        print(url)
        if False != self.www_hanbetu(df['comment'][i]) and url == False:
            df_time_www_point = self.make_df_append(df_time_www_point, tmp, tmp_time)
        else:
            if False == tmp:
                df_time_www_point = df_time_www_point.append({'time': tmp_time, 'point': 0}, ignore_index=True)
        # add 1 if the comment contains applause, otherwise add 0
        if False != self.hakusyu_hanbetu(df['comment'][i]):
            df_time_hakusyu_point = self.make_df_append(df_time_hakusyu_point, tmp, tmp_time)
        else:
            if False == tmp:
                df_time_hakusyu_point = df_time_hakusyu_point.append({'time': tmp_time, 'point': 0}, ignore_index=True)
        # morphological analysis of the comment
        result = jumanpp.analysis(df['comment'][i])
        # build the DataFrames from the analysis result
        for token in result.mrph_list():
            tmp_word = token.midasi
            # count noun occurrences
            if 0 != self.word_Classification(token.hinsi):
                # only if it is a noun
                if self.word_Classification(token.hinsi) == '名詞':
                    tmp = self.my_index(df_word_point['word'], tmp_word)
                    df_word_point = self.make_df_append(df_word_point, tmp, tmp_word)
                    # record the noun together with its timestamp
                    df_time_word = df_time_word.append({'time': tmp_time, 'word': tmp_word}, ignore_index=True)
    return df_time_word, df_word_point, df_time_point, df_time_www_point, df_time_hakusyu_point, df_URL_point
def morphological_analysis(self, text):
    jumanpp = Juman()
    ret = []
    text = self.remove_special_character(text)
    result = jumanpp.analysis(text)
    # the analysis splits the text into individual words here
    for mrph in result.mrph_list():
        ret += self.modification(mrph.midasi)
    return ret
class JumanTokenizer:
    def __init__(self):
        self.juman = Juman(command=config['Juman']['command'],
                           option=config['Juman']['option'])

    def __call__(self, text):
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]
class JumanTokenizer():
    def __init__(self):
        self.juman = Juman()

    def tokenize(self, text):
        # segment Japanese text into words with Juman
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]
class JumanTokenize(object):
    """Runs JumanTokenizer."""

    def __init__(self):
        self.juman = Juman()

    def tokenize(self, text):
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]
def counter(text, d):
    jumanapp = Juman()
    result = jumanapp.analysis(text)
    for mrph in result.mrph_list():
        if mrph.genkei in d:
            d[mrph.genkei] = d[mrph.genkei] + 1
        else:
            d[mrph.genkei] = 1
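# Hedged usage sketch (not part of the original snippet): counter accumulates
# frequencies keyed by base form (genkei) into a caller-supplied dict, so it
# can be called repeatedly over a corpus; exact counts depend on the dictionary.
# freq = {}
# counter("猫が好きです。猫も好きです。", freq)
# freq.get("猫")  # -> 2 (the noun appears twice in the example)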
def test_juman_wrapper(self):
    juman = Juman()
    result = juman.analysis(u"これはペンです。")
    print(','.join(mrph.midasi for mrph in result))
    for mrph in result.mrph_list():
        assert isinstance(mrph, pyknp.Morpheme)
        print(u"見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s"
              % (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui,
                 mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname))
def juman_list(text):
    jumanpp = Juman()
    result = jumanpp.analysis(text)
    # replace every alphabetic token with the string "En"
    wakati = [
        mrph.genkei if mrph.bunrui != "アルファベット" else "En"
        for mrph in result.mrph_list()
    ]
    return ",".join(wakati)
def juman_test():
    juman = Juman()
    print dir(juman)
    text = "テストテキスト"
    utext = unicode("".join(text.split()))
    print(u'"' + utext + u'"')
    juman_result = juman.analysis(utext)
    for mrph in juman_result.mrph_list():
        print('> ' + mrph.midasi + ' : ' + mrph.yomi + ' : ' + mrph.genkei)
class JumanTokenizer():
    def __init__(self):
        self.juman = Juman()

    def tokenize(self, text):
        # pdb.set_trace()
        result = self.juman.analysis(text)
        # pdb.set_trace()
        return [mrph.midasi for mrph in result.mrph_list()]
class JumanTokenizer:
    def __init__(self):
        self.juman = Juman()

    def tokenize(self, text):
        # convert half-width ASCII (U+0021-U+007E) to full-width before analysis;
        # str.translate returns a new string, so assign the result back
        text = text.translate(str.maketrans({chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)}))
        text = re.sub(r'\s', ' ', text)
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]
def get_repname_using_jumanpp(genkei: str, pos: str) -> str:
    if pos == '助詞':
        return f'{genkei}/{genkei}'
    juman = Juman(option='-s 1')
    mrphs = juman.analysis(genkei, juman_format=JUMAN_FORMAT.LATTICE_TOP_ONE)
    # check that the analysis did not split the input (i.e. it is a single morpheme)
    if len(mrphs) == 1:
        return mrphs[0].repname
    return f'{genkei}/{genkei}'
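# Hedged usage sketch (hypothetical inputs, not part of the original snippet):
# particles are mapped to "genkei/genkei" directly, everything else goes through
# Juman++ and falls back to "genkei/genkei" when the analysis splits the input.
# get_repname_using_jumanpp('走る', '動詞')  # e.g. '走る/はしる'
# get_repname_using_jumanpp('は', '助詞')    # -> 'は/は' (no Juman++ call)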
def test_juman_wrapper(self):
    try:
        juman = Juman(command=self.path_to_juman_command)
        result = juman.analysis("これはペンです。")
        logger.debug(','.join(mrph.midasi for mrph in result))
        for mrph in result.mrph_list():
            assert isinstance(mrph, pyknp.Morpheme)
            logger.debug("見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s"
                         % (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui,
                            mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname))
    except ImportError:
        print('skip test_juman_wrapper')
class JumanTokenizer:
    def __init__(self):
        self.juman = Juman()

    def parse(self, sentence: str, out=()) -> list:
        # copy so that the caller's sequence (or the default) is never mutated
        out_list = list(out)
        # Juman classifies punctuation and similar symbols as "特殊"
        if "記号" in out_list:
            out_list.append("特殊")
        return [m.midasi for m in self.juman.analysis(sentence).mrph_list()
                if m.hinsi not in out_list]
def seg2word(seg):
    len_split = 1000
    # seg = seg_in.replace(' ', '\u3000')
    # seg = seg_in.replace(' ', ' ')
    len_seg = len(seg)
    seg_splits = [seg[i:i + len_split] for i in range(0, len_seg, len_split)]
    juman_def = Juman(command="/mnt/gold/users/s18153/bin/jumanpp")
    return ' '.join([
        " ".join([mrph.midasi for mrph in juman_def.analysis(seg_part).mrph_list()])
        for seg_part in seg_splits
    ])
def bulk_predict(docs, batch_size=256):
    """Predict bert embeddings."""
    # `bc` is assumed to be a module-level bert-serving client
    jumanpp = Juman(jumanpp=False)
    for i in range(0, len(docs), batch_size):
        batch_docs = docs[i: i + batch_size]
        pre_embedding_docs = []
        for doc in batch_docs:
            for k in range(0, len(doc['question']), MAX_TXT_LENGTH):
                result = jumanpp.analysis(doc['question'][k:k + MAX_TXT_LENGTH])
                texts = [mrph.midasi for mrph in result.mrph_list()]
                pre_embedding_docs.append(" ".join(texts))
        embeddings = bc.encode(pre_embedding_docs, is_tokenized=True)
        yield embeddings
def juman_wakati(text, hinshi=(), DEBUG=False, STEM_FLAG=False):
    juman = Juman()
    output = ""
    # wakati (word segmentation)
    result = juman.analysis(text)
    for mrph in result.mrph_list():
        if STEM_FLAG and mrph.hinsi in hinshi:
            output += mrph.repname.split("/")[0] + " "
        if DEBUG:
            print("stem:", mrph.repname)
            print("midashi:", mrph.midasi)
            print("hinsi:", mrph.hinsi)
            print("yomi:", mrph.yomi)
    return output.strip()
def jumanpp():
    if request.method in ['POST'] and \
            request.headers['Content-Type'] == 'application/json':
        sentence = request.get_json()['sentence']
        juman = Juman(jumanpp=True)
        result = juman.analysis(sentence)
        words = []
        info = []
        for morph in result.mrph_list():
            words.append(morph.midasi)
            info.append('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                morph.hinsi, morph.bunrui, morph.katuyou1, morph.katuyou2,
                morph.yomi, morph.genkei, morph.repname, morph.imis))
        response = {'words': words, 'info': info}
        return jsonify(response)
    return jsonify({})
def title_clean(title_ls):
    tmp_ls = copy.deepcopy([title_ls])
    for i in range(len(tmp_ls) - 1):
        if tmp_ls[i] is None:
            del tmp_ls[i]
    for i in range(len(tmp_ls)):
        tmp_ls[i] = normalize('NFKC', tmp_ls[i])
        tmp_ls[i] = tmp_ls[i].replace(' ', '')
        tmp_ls[i] = re.sub(r'−.+?$', '', tmp_ls[i])
        tmp_ls[i] = re.sub(r'ーY.+?$', '', tmp_ls[i])
        tmp_ls[i] = re.sub(r'\|.+?$', '', tmp_ls[i])
    jumanpp = Juman()
    sep_ls = []
    for tmp in tmp_ls:
        sep_ls.append(' '.join([mrph.midasi for mrph in jumanpp.analysis(tmp)]))
    return sep_ls[0]
def read_home_timeline(session):
    print('[kazuha] - read timeline.')
    juman = Juman()
    req = session.get(twitter.API_home_timeline, params={})
    if req.status_code == 200:
        timeline = json.loads(req.text)
        for tweet in timeline:
            u_tweet_text = unicode("".join(tweet["text"].split()))
            print(u'[kazuha] - read timeline: ' + u_tweet_text)
            juman_result = juman.analysis(u_tweet_text)
            for mrph in juman_result.mrph_list():
                print u"%s - (%s, %s)" % (mrph.genkei, mrph.hinsi, mrph.bunrui)
    else:
        print('[kazuha] - read timeline: failure.')
def word_distance(s1, s2):
    juman = Juman()
    # fall back to the English metric when alphanumeric characters make up
    # more than half of the non-space text
    r = len(s1 + s2) - len(re.sub("[a-zA-Z0-9]", "", s1 + s2))
    if r > len((s1 + s2).replace(" ", "")) // 2:
        return word_distance_en(s1, s2)
    # keep only content words (nouns, verbs, adjectives, demonstratives,
    # or morphemes marked 内容語), then compare the two sets
    sss = [
        set(
            [item.midasi for item in juman.analysis(ss).mrph_list()
             if item.hinsi in {'名詞', '動詞', '形容詞', '指示詞'}
             or '内容語' in item.imis]
        )
        for ss in [s1, s2]
    ]
    if min(len(sss[0]), len(sss[1])) == 0:
        return 0
    return float(len(sss[0] & sss[1])) / min(len(sss[0]), len(sss[1]))
class Solver(object):
    def __init__(self):
        self.juman = Juman()
        self.knp = KNP()

    def Q61(self):
        u"""61. Read a sentence from stdin and segment it into words (insert spaces between morphemes)."""
        input_sentence = raw_input()
        result = self.juman.analysis(input_sentence.decode("utf8"))
        for mrph in result.mrph_list():
            sys.stdout.write("{} ".format(mrph.midasi.encode("utf8")))
        sys.stdout.write("\n")
        return

    def Q62(self):
        u"""62. Read morphological-analysis results and print only the nouns.
        Hint: check whether mrph.hinsi equals the string u"名詞".
        """
        data = u""
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line.decode("utf8")
            if line.strip() == "EOS":  # analyse once the sentence is complete
                result = self.juman.result(data)
                s = ",".join(mrph.midasi for mrph in result.mrph_list()
                             if mrph.hinsi == u"名詞")  # nouns only
                if len(s) > 0:
                    print(s)
                data = u""

    def Q63(self):
        u"""63. Read morphological-analysis results and print the base forms of the verbs.
        Hint: check whether mrph.hinsi equals the string u"動詞".
        """
        data = u""
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line.decode("utf8")
            if line.strip() == "EOS":  # analyse once the sentence is complete
                result = self.juman.result(data)
                s = ",".join(mrph.genkei for mrph in result.mrph_list()
                             if mrph.hinsi == u"動詞")  # verbs only
                if len(s) > 0:
                    print(s)
                data = u""

    def Q64(self):
        u"""64. Read morphological-analysis results and sort the base forms of the morphemes by frequency.
        Hint: use a dictionary and the sorted function.
        """
        data = u""
        hist = {}
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line.decode("utf8")
            if line.strip() == "EOS":  # analyse once the sentence is complete
                result = self.juman.result(data)
                for mrph in result.mrph_list():
                    try:
                        hist[mrph.genkei] += 1
                    except KeyError:
                        hist[mrph.genkei] = 1
                data = u""
        for key, val in sorted(hist.items(), key=lambda t: t[1], reverse=True):
            print("{},{}".format(key.encode("utf8"), val))

    def Q65(self):
        u"""65. Read morphological-analysis results and compute the ratio of predicates to the total
        number of morphemes. Here, predicates are verbs, i-adjectives (形容詞) and na-adjectives (形容動詞).
        """
        data = u""
        num = 0
        denom = 0
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line.decode("utf8")
            if line.strip() == "EOS":  # analyse once the sentence is complete
                result = self.juman.result(data)
                if verbose:
                    logger.info("denom: {}".format(denom))
                for mrph in result.mrph_list():
                    denom += 1
                    if mrph.hinsi == u"動詞":
                        num += 1
                        continue
                    if mrph.hinsi == u"形容詞" and mrph.bunrui.startswith(u"イ形容詞"):
                        num += 1
                        continue
                    if mrph.hinsi == u"形容動詞" and mrph.bunrui.startswith(u"ナ形容詞"):
                        num += 1
                        continue
                data = u""
        print("{}/{}={}".format(num, denom, float(num) / denom))

    def Q66(self):
        u"""66. Read morphological-analysis results and extract and print the pattern "sahen noun + する/できる"."""
        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line.decode("utf8")
            if line.strip() == "EOS":  # analyse once the sentence is complete
                result = self.juman.result(data)
                buff = None
                for mrph in result.mrph_list():
                    if mrph.genkei == u"できる" or mrph.genkei == u"する":
                        if buff is not None:
                            extract.add((buff.genkei.encode("utf8"), mrph.genkei.encode("utf8")))
                    if mrph.bunrui == u"サ変名詞":
                        buff = mrph
                    else:
                        buff = None
                data = u""
        for t in extract:
            print("{}+{}".format(t[0], t[1]))

    def Q67(self):
        u"""67. Read morphological-analysis results and print every expression of the form "AのB",
        where A and B are single noun morphemes.
        """
        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):  # read the input one line at a time
            data += line.decode("utf8")
            if line.strip() == "EOS":  # analyse once the sentence is complete
                result = self.juman.result(data)
                buff = []
                for mrph in result.mrph_list():
                    if mrph.genkei == u"の" and len(buff) == 1:
                        buff.append(u"の")
                        continue
                    if mrph.hinsi == u"名詞":
                        if len(buff) == 0:
                            buff.append(mrph.genkei)
                            continue
                        if len(buff) == 2:
                            extract.add((buff[0], mrph.genkei))
                    buff = []
                data = u""
        for t in extract:
            print("{}の{}".format(t[0].encode("utf8"), t[1].encode("utf8")))

    def Q68(self):
        u"""68. Read a sentence from stdin and segment it into bunsetsu (insert spaces between bunsetsu)."""
        input_sentence = raw_input()
        result = self.knp.parse(input_sentence.decode("utf8"))
        for bnst in result.bnst_list():
            sys.stdout.write("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
        sys.stdout.write("\n")
        return

    def Q69(self):
        u"""69. Read parsing results and print the bunsetsu that contain a prefix."""
        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.knp.result(data)
                for bnst in result.bnst_list():
                    if len(filter(lambda x: x.hinsi == u"接頭辞", bnst.mrph_list())) < 1:
                        continue
                    extract.add("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
                data = u""
        for bnst in extract:
            if len(bnst) > 0:
                print(bnst)
        return

    def Q70(self):
        u"""70. Read parsing results and print the bunsetsu that contain two or more nouns."""
        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.knp.result(data)
                for bnst in result.bnst_list():
                    if len(filter(lambda x: x.hinsi == u"名詞", bnst.mrph_list())) < 2:
                        continue
                    extract.add("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
                data = u""
        for bnst in extract:
            if len(bnst) > 0:
                print(bnst)
        return
# coding: utf-8
from pyknp import Juman
import sys
import codecs

juman = Juman()
input_file = "../data/sample.txt"
f = codecs.open(input_file, 'r', 'utf-8')
f_out = codecs.open(input_file + '_juman_result.txt', 'w', 'utf-8')
for line in f:
    result = juman.analysis(line[:-1].replace(" ", ""))
    # print ' '.join(mrph.midasi for mrph in result)
    f_out.write(' '.join(mrph.midasi for mrph in result) + '\n')