Code example #1
File: mk42.py Project: masakuri/nlp100_2016
import re
import mk41

def dst_srcs(f):
    data = mk41.load_cabocha(f)
    phrase_dst_ls = list()
    sentence_ls = list()
    all_ls = list()
    pattern = re.compile(r"。|、|\ ")

    # Build a [phrase, dst] pair for each chunk, e.g. [我輩は, 5]
    for sentence in data:
        for line in sentence:
            phrase_dst_ls.append(pattern.sub("", line.phrase))  # strip punctuation etc.
            phrase_dst_ls.append(line.dst)
            sentence_ls.append(phrase_dst_ls)
            phrase_dst_ls = list()
        all_ls.append(sentence_ls)
        sentence_ls = list()

    return all_ls
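A minimal driver for dst_srcs might look as follows. This is a sketch: it assumes, consistent with the later examples, that mk41.load_cabocha takes a file object of CaboCha output and returns sentences as lists of chunk objects with .phrase and .dst attributes, and that the function above lives in mk42.py.

import sys
import mk42

# Print each sentence as a list of [phrase, dst] pairs,
# e.g. [["我輩は", 5], ["猫で", 2], ...].
for sentence in mk42.dst_srcs(sys.stdin):
    print(sentence)

It would be run the same way mk43.py is invoked below, e.g. python driver.py < neko.txt.cabocha.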
Code example #2
File: mk47.py Project: masakuri/nlp100_2016
(beginning of the file omitted)
Save the output of this program to a file and check the following with UNIX commands:

・Predicates that occur frequently in the corpus (sahen-connecting noun + を + verb)
・Frequent combinations of predicates and particle patterns in the corpus
"""

"""
chunk = [morphs=[[surface, base, pos, pos1], ...], dst, srcs, phrase]
chunk = [[[我輩, 我輩, 名詞, 代名詞], [は, は, 助詞, 助詞]], 2, [0], 我輩は]
"""

import mk41
import sys, re

data = mk41.load_cabocha(sys.stdin)
pattern = re.compile(r"。|、|「|」|\*| ")
for sentence in data:
    for chunk in sentence:
        verb = ""
        dst_ls = list()
        kou_ls = list()

        for morph in chunk.morphs:
            if morph.pos == "動詞":
                verb = morph.base
                break

        if verb != "" and len(chunk.srcs) > 0:
            sahen_wo = ""
            for src_index in chunk.srcs:
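The excerpt is cut off mid-function. For the verification step the docstring asks for, the usual UNIX pipeline is along the lines of cut -f1 mk47.txt | sort | uniq -c | sort -nr; the same count can be done in Python. The sketch below assumes the program's tab-separated output was saved to a hypothetical file mk47.txt with the predicate in the first column.

from collections import Counter

# Count how often each predicate (first tab-separated column) appears.
with open("mk47.txt") as f:
    predicates = Counter(line.split("\t")[0] for line in f if line.strip())

# Show the ten most frequent predicates with their counts.
for predicate, count in predicates.most_common(10):
    print(predicate + "\t" + str(count))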
Code example #3
# coding=utf-8
"""
43. Extract clauses containing nouns that modify clauses containing verbs
When a clause containing a noun depends on a clause containing a verb, extract the pair in tab-separated format. Do not output symbols such as punctuation.
"""
"""
chunk = [morphs=[[surface, base, pos, pos1], ...], dst, srcs, phrase]
chunk = [[[我輩, 我輩, 名詞, 代名詞], [は, は, 助詞, 助詞]], 2, [0], 我輩は]
"""

import sys, re
import mk41

data = mk41.load_cabocha(sys.stdin)
pattern = re.compile(r"。|、| ")

for sentence in data:
    for chunk in sentence:
        for morpheme in chunk.morphs:
            if morpheme.pos == "名詞" and chunk.dst != -1:
                noun_src = pattern.sub("", chunk.phrase)  # strip punctuation etc.
                for is_verb_morphs in sentence[chunk.dst].morphs:
                    if is_verb_morphs.pos == "動詞":
                        verb_dst = pattern.sub(
                            "", sentence[chunk.dst].phrase)  # strip punctuation etc.
                        print(noun_src + "\t" + verb_dst)
                        break
                break
"""
$ python mk43.py < neko.txt.cabocha
どこで  生れたか
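The structure docstrings above describe the chunks that mk41.load_cabocha yields. mk41 itself is not among these excerpts; classes consistent with how the chunks are used in every example would look roughly like the following sketch (not the project's actual code):

class Morph(object):
    """One morpheme: surface form, base form, POS, and POS subcategory."""

    def __init__(self, surface, base, pos, pos1):
        self.surface = surface  # surface form, e.g. 我輩
        self.base = base        # base (dictionary) form
        self.pos = pos          # part of speech, e.g. 名詞
        self.pos1 = pos1        # POS subcategory, e.g. 代名詞


class Chunk(object):
    """One bunsetsu (phrase chunk) and its dependency links."""

    def __init__(self, morphs, dst, srcs, phrase):
        self.morphs = morphs  # list of Morph objects
        self.dst = dst        # index of the chunk this one modifies (-1 = root)
        self.srcs = srcs      # indices of the chunks that modify this one
        self.phrase = phrase  # concatenated surface string, e.g. 我輩は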
Code example #4
import sys, re
from itertools import combinations

import mk41
# replace_noun_phrase_in_chunk is defined elsewhere in the project;
# a sketch of it follows the example.

def main():
    data = mk41.load_cabocha(sys.stdin)
    pattern = re.compile(r"。|、|「|」|\*| ")
    path_ls = list()

    for sentence in data:
        for chunk in sentence:
            noun_flag = 0
            for morph in chunk.morphs:
                if morph.pos == "名詞":  # 文節chunkに名詞があるか
                    noun_flag = 1
                    break
            if noun_flag == 1:  # the chunk contains a noun
                all_path = list()
                all_path.append(chunk)  # start the path at the source chunk
                dst_path = chunk.dst
                while dst_path != -1:  # as long as there is a dependency destination
                    dst_chunk = sentence[dst_path]
                    all_path.append(dst_chunk)  # append the destination chunk to the path
                    dst_path = dst_chunk.dst
                path_ls.append(all_path)

    list_of_nodes = [{"chunk": l[0], "path": l} for l in path_ls]
    for node_i, node_j in combinations(list_of_nodes, 2):
        if node_j["chunk"] in node_i[
                "path"]:  # node_iから構文木の根までのパスにnode_jが存在する場合
            index_j_in_i = node_i["path"].index(node_j["chunk"])
            path_ij = node_i["path"][0:index_j_in_i + 1]
            path_ij_surfaces = [chunk.phrase for chunk in path_ij]
            path_ij_surfaces[0] = replace_noun_phrase_in_chunk(path_ij[0], "X")
            path_ij_surfaces[-1] = "Y"

            print " -> ".join(path_ij_surfaces)

        # Case 2: the paths from node_i and node_j to the root meet at a common chunk node_k.
        else:
            node_k_chunk = None
            h = -1
            # Walk backwards from the root while the two paths coincide;
            # the last matching chunk is node_k.
            while node_i["path"][h] == node_j["path"][h]:
                node_k_chunk = node_i["path"][h]
                h = h - 1

            if node_k_chunk is not None:
                # Extract the path from node_i up to node_k.
                index_k_in_i = node_i["path"].index(node_k_chunk)
                path_ik = node_i["path"][0:index_k_in_i]
                path_ik_surfaces = [
                    re.sub(pattern, "", chunk.phrase) for chunk in path_ik
                ]
                path_ik_surfaces[0] = replace_noun_phrase_in_chunk(
                    path_ik[0], "X")

                # Extract the path from node_j up to node_k.
                index_k_in_j = node_j["path"].index(node_k_chunk)
                path_jk = node_j["path"][0:index_k_in_j]
                path_jk_surfaces = [
                    re.sub(pattern, "", chunk.phrase) for chunk in path_jk
                ]
                path_jk_surfaces[0] = replace_noun_phrase_in_chunk(
                    path_jk[0], "Y")

                print " -> ".join(path_ik_surfaces) \
                    + " | " + " -> ".join(path_jk_surfaces) \
                    + " | " + re.sub(pattern, "", node_k_chunk.phrase)