# coding=utf-8
import re

import mk41


def dst_srcs(f):
    data = mk41.load_cabocha(f)
    phrase_dst_ls = list()
    sentence_ls = list()
    all_ls = list()
    pattern = re.compile(r"。|、|\ ")
    # build [phrase, dst] pairs of the form [我輩は, 5]
    for sentence in data:
        for line in sentence:
            phrase_dst_ls.append(pattern.sub("", line.phrase))  # strip punctuation etc.
            phrase_dst_ls.append(line.dst)
            sentence_ls.append(phrase_dst_ls)
            phrase_dst_ls = list()
        all_ls.append(sentence_ls)
        sentence_ls = list()
    return all_ls
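All of these scripts lean on the mk41 module to parse CaboCha output, but mk41 itself is not part of this excerpt. The following is a minimal sketch of the Morph/Chunk interface the scripts appear to assume, inferred from the `chunk = [morphs=..., dst, srcs, phrase]` docstrings below; the real mk41 may differ.

# Sketch (assumption): the object shape that mk41.load_cabocha seems to yield,
# one list of Chunk per sentence. Not the original mk41 code.
class Morph:
    def __init__(self, surface, base, pos, pos1):
        self.surface = surface  # surface form
        self.base = base        # base (dictionary) form
        self.pos = pos          # part of speech, e.g. 名詞, 動詞, 助詞
        self.pos1 = pos1        # part-of-speech subtype, e.g. サ変接続


class Chunk:
    def __init__(self, morphs, dst, srcs, phrase):
        self.morphs = morphs    # list of Morph in this clause (bunsetsu)
        self.dst = dst          # index of the chunk this one depends on (-1 = none)
        self.srcs = srcs        # indices of chunks that depend on this one
        self.phrase = phrase    # concatenated surface of the clause

With objects of this shape, dst_srcs(open("neko.txt.cabocha")) would return, per sentence, a list of [phrase, dst] pairs.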
# coding=utf-8
"""
(omitted)
Save the output of this program to a file and use UNIX commands to check:
- predicates that occur frequently in the corpus (sahen-connective noun + を + verb)
- predicate and particle patterns that occur frequently in the corpus
"""
"""
chunk = [morphs=[[surface, base, pos, pos1], ...], dst, srcs, phrase]
chunk = [[[我輩, 我輩, 名詞, 代名詞], [は, は, 助詞, 助詞]], 2, [0], 我輩は]
"""
import mk41
import sys, re

data = mk41.load_cabocha(sys.stdin)
pattern = re.compile(r"。|、|「|」|\*| ")
for sentence in data:
    for chunk in sentence:
        verb = ""
        dst_ls = list()
        kou_ls = list()
        # take the base form of the first verb in the chunk as the predicate
        for morph in chunk.morphs:
            if morph.pos == "動詞":
                verb = morph.base
                break
        if verb != "" and len(chunk.srcs) > 0:
            sahen_wo = ""
            for src_index in chunk.srcs:
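The file breaks off inside the loop over chunk.srcs. As a rough sketch only, here is one way the remaining per-source logic could look, based on the task statement in the docstring; the function name, the sahen test, and the roles of the two lists are assumptions, not the original code.

# Hypothetical continuation (the original loop body is not shown): detect a
# source clause ending in sahen-connective noun + を, and gather the particles
# and phrases of the remaining source clauses.
def mine_sahen_predicate(sentence, chunk, verb, pattern):
    sahen_wo = ""
    joshi_ls = []  # particles of the other source clauses (assumed role)
    kou_ls = []    # their surface phrases (assumed role)
    for src_index in chunk.srcs:
        src = sentence[src_index]
        if (len(src.morphs) >= 2 and src.morphs[-2].pos1 == "サ変接続"
                and src.morphs[-1].surface == "を"):
            sahen_wo = src.morphs[-2].surface + "を"
        elif src.morphs[-1].pos == "助詞":
            joshi_ls.append(src.morphs[-1].base)
            kou_ls.append(pattern.sub("", src.phrase))
    if sahen_wo != "":
        # predicate TAB particles TAB argument phrases
        return sahen_wo + verb + "\t" + " ".join(joshi_ls) + "\t" + " ".join(kou_ls)
    return None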
# coding=utf-8
"""
43. Extract clauses containing a noun that depend on clauses containing a verb
When a clause containing a noun depends on a clause containing a verb, print
the pair in tab-separated form. Do not output symbols such as punctuation.
"""
"""
chunk = [morphs=[[surface, base, pos, pos1], ...], dst, srcs, phrase]
chunk = [[[我輩, 我輩, 名詞, 代名詞], [は, は, 助詞, 助詞]], 2, [0], 我輩は]
"""
import sys, re
import mk41

data = mk41.load_cabocha(sys.stdin)
pattern = re.compile(r"。|、| ")
for sentence in data:
    for chunk in sentence:
        for morpheme in chunk.morphs:
            if morpheme.pos == "名詞" and chunk.dst != -1:
                noun_src = pattern.sub("", chunk.phrase)  # strip punctuation etc.
                for is_verb_morphs in sentence[chunk.dst].morphs:
                    if is_verb_morphs.pos == "動詞":
                        verb_dst = pattern.sub(
                            "", sentence[chunk.dst].phrase)  # strip punctuation etc.
                        print noun_src + "\t" + verb_dst
                        break
                break
"""
$ python mk43.py < neko.txt.cabocha
どこで 生れたか
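A quick self-contained check of the noun-to-verb extraction above, using hand-built stand-ins for mk41's objects; the sample sentence is hypothetical test data, not corpus output.

# Sketch: feed the mk43.py loop one hand-built sentence and observe the output.
from collections import namedtuple

Morph = namedtuple("Morph", "surface base pos pos1")
Chunk = namedtuple("Chunk", "morphs dst srcs phrase")

sentence = [
    Chunk([Morph("我輩", "我輩", "名詞", "代名詞"),
           Morph("は", "は", "助詞", "係助詞")], 1, [], "我輩は"),
    Chunk([Morph("見", "見る", "動詞", "自立"),
           Morph("た", "た", "助動詞", "*")], -1, [0], "見た。"),
]
# Chunk 0 contains a noun and depends on chunk 1, which contains a verb,
# so the loop in mk43.py would print: 我輩は<TAB>見た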
# coding=utf-8
import sys, re
from itertools import combinations

import mk41


def main():
    data = mk41.load_cabocha(sys.stdin)
    pattern = re.compile(r"。|、|「|」|\*| ")
    for sentence in data:
        path_ls = list()  # reset per sentence: pairs are formed within a sentence only
        for chunk in sentence:
            noun_flag = 0
            for morph in chunk.morphs:
                if morph.pos == "名詞":  # does this clause contain a noun?
                    noun_flag = 1
                    break
            if noun_flag == 1:  # a noun was found
                all_path = list()
                all_path.append(chunk)  # add the starting clause to all_path
                dst_path = chunk.dst
                while dst_path != -1:  # follow dependencies up to the root
                    dst_chunk = sentence[dst_path]
                    all_path.append(dst_chunk)  # add each governing clause to all_path
                    dst_path = dst_chunk.dst
                path_ls.append(all_path)
        list_of_nodes = [{"chunk": path[0], "path": path} for path in path_ls]
        for node_i, node_j in combinations(list_of_nodes, 2):
            if node_j["chunk"] in node_i["path"]:
                # case: node_j lies on the path from node_i to the root
                index_j_in_i = node_i["path"].index(node_j["chunk"])
                path_ij = node_i["path"][0:index_j_in_i + 1]
                path_ij_surfaces = [
                    re.sub(pattern, "", chunk.phrase) for chunk in path_ij
                ]
                path_ij_surfaces[0] = replace_noun_phrase_in_chunk(path_ij[0], "X")
                path_ij_surfaces[-1] = "Y"
                print " -> ".join(path_ij_surfaces)
            else:
                # case: the paths from node_i and node_j to the root meet at a
                # common clause node_k; walk back from the shared root until
                # the two paths diverge
                node_k_chunk = None
                h = -1
                while node_i["path"][h] == node_j["path"][h]:
                    node_k_chunk = node_i["path"][h]
                    h = h - 1
                if node_k_chunk is not None:
                    # extract the path from node_i up to (excluding) node_k
                    index_k_in_i = node_i["path"].index(node_k_chunk)
                    path_ik = node_i["path"][0:index_k_in_i]
                    path_ik_surfaces = [
                        re.sub(pattern, "", chunk.phrase) for chunk in path_ik
                    ]
                    path_ik_surfaces[0] = replace_noun_phrase_in_chunk(
                        path_ik[0], "X")
                    # extract the path from node_j up to (excluding) node_k
                    index_k_in_j = node_j["path"].index(node_k_chunk)
                    path_jk = node_j["path"][0:index_k_in_j]
                    path_jk_surfaces = [
                        re.sub(pattern, "", chunk.phrase) for chunk in path_jk
                    ]
                    path_jk_surfaces[0] = replace_noun_phrase_in_chunk(
                        path_jk[0], "Y")
                    print " -> ".join(path_ik_surfaces) \
                        + " | " + " -> ".join(path_jk_surfaces) \
                        + " | " + re.sub(pattern, "", node_k_chunk.phrase)
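replace_noun_phrase_in_chunk is called above but not defined in this excerpt. A plausible sketch, assuming its job is to swap the noun morphemes of a clause for the given placeholder (X or Y) while keeping the remaining surface forms:

# Hypothetical helper (not the original definition): replace each run of noun
# morphemes in the clause with a single placeholder, keep everything else.
def replace_noun_phrase_in_chunk(chunk, placeholder):
    surfaces = []
    for morph in chunk.morphs:
        if morph.pos == "名詞":
            # collapse a run of consecutive nouns into one placeholder
            if not (surfaces and surfaces[-1] == placeholder):
                surfaces.append(placeholder)
        else:
            surfaces.append(morph.surface)
    return "".join(surfaces)

Under this reading, a clause such as 我輩は becomes Xは when the placeholder is "X".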