Example #1
0
def load_chunk_cabocha(file="./data/neko.txt.cabocha") -> T:
    """Parse a CaboCha-analysed file into a list of sentences.

    Each sentence is a list of Chunk objects; each Chunk holds its Morph
    list, its head index (dst) and the indices of chunks that depend on
    it (srcs).

    Returns: list of sentences (type alias T, declared elsewhere).
    """
    results = []  # type: T
    chunks = []  # type: C
    current = -1  # index of the chunk currently receiving morphs
    srcs = {}  # type: Dict[int, List[int]] # key: chunk index, value: indices of dependent chunks

    # context manager so the handle is closed even on error (original leaked it)
    with open(file, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            # dependency header line: "* <idx> <dst>D ..."
            if line.startswith("*"):
                current += 1
                elem = line.split(" ")
                dst = int(elem[2][:-1])  # strip the trailing 'D'
                # append a new chunk to the sentence's chunk list
                chunks.append(Chunk([], dst, []))
                if dst != -1:
                    if dst in srcs:
                        srcs[dst].append(int(elem[1]))
                    else:
                        srcs[dst] = [int(elem[1])]
            # end of sentence
            elif line == "EOS":
                if chunks == []:
                    continue
                for key, value in srcs.items():
                    chunks[key].srcs = value
                results.append(chunks)
                # reset per-sentence state
                chunks, current, srcs = [], -1, {}
            else:
                splitted = re.split("[\t,]", line)
                chunks[current].morphs.append(
                    Morph(splitted[0], splitted[7], splitted[1], splitted[2]))

    # flush a final sentence if the file does not end with EOS
    if chunks:
        for key, value in srcs.items():
            chunks[key].srcs = value
        results.append(chunks)
    return results
Example #2
0
 def __init__(self):
     """Parse neko.txt.cabocha and store the result on self.sentences.

     Each sentence is a list of Chunk objects; every Chunk carries its
     morphemes (morphs), head index (dst) and dependent indices (srcs).
     """
     # one sentence = a list of Chunk objects
     chunks = []
     self.sentences = []
     #file = "test.cabocha"
     file = "neko.txt.cabocha"
     for line in open(file):
         line = line.strip(
             "\n"
         )  # strip only \n; plain strip would also erase whitespace we treat as characters
         if line == "EOS":
             # resolve srcs: register each chunk's index on its head chunk
             for index, item in enumerate(chunks):
                 dst = chunks[index].dst
                 if dst != -1:  # only when this chunk is not the sentence root
                     chunks[dst].srcs.append(index)
             self.sentences.append(chunks)
             # reset for the next sentence
             chunks = []
         elif line.startswith("*") == True:  # header line, e.g. "* 0 -1D 0/0 0.000000"
             parts = line.split(" ")
             item = Chunk()
             item.dst = int(parts[2].replace("D", ""))
             chunks.append(item)
         else:  # mecab: surface\tpos,pos1,pos2,pos3,conj-type,conj-form,base,reading,pronunciation
             surface, rest = line.split("\t")
             mecab = rest.split(",")
             # NOTE(review): mecab[-3] assumes a 9-field feature list; with 7
             # fields (no reading/pronunciation) -3 is not the base form — confirm
             item = Morph(surface=surface,
                          base=mecab[-3],
                          pos=mecab[0],
                          pos1=mecab[1])
             chunks[-1].morphs.append(item)  # -1: the last chunk in the sentence
Example #3
0
def get_chunks_list(path):
    """Read a CaboCha-parsed file and return a list of sentences,
    each a list of Chunk objects ordered by chunk index."""
    sentences = []
    bunsetsu = {}  # chunk index -> Chunk object
    with open(path, 'r', encoding='utf-8') as input_file:
        for raw in input_file:
            raw = raw.rstrip()
            if raw == 'EOS':
                # flush the finished sentence (skip empty ones)
                if bunsetsu:
                    sentences.append([bunsetsu[i] for i in sorted(bunsetsu)])
                    bunsetsu = {}
            elif raw[0] == '*':
                fields = raw.split()
                current_num = int(fields[1])
                head = int(fields[2][:-1])
                if current_num not in bunsetsu:
                    bunsetsu[current_num] = Chunk([], head, [])
                bunsetsu[current_num].dst = head
                if head != -1:
                    # make sure the head chunk exists, then record the back link
                    bunsetsu.setdefault(head, Chunk([], -1, [])).srcs.append(current_num)
            else:
                cols = re.split('[\t,]', raw)
                bunsetsu[current_num].morphs.append(
                    Morph(cols[0], cols[7], cols[1], cols[2]))
    return sentences
Example #4
0
def F41():
    """Read CaboCha output from stdin and return a list of sentences,
    each a list of Chunk objects.

    NOTE(review): Python 2 code — uses the `unicode` builtin; `dst`/`srcs`
    would be unbound if a morph line preceded the first '*' header.
    """
    sentenses= list()
    morphs = list()
    chunks = list()
    for line in sys.stdin:
        if line.startswith("*"):
            # a '*' header closes the previous chunk's morph buffer
            # (the very first '*' is the exception: nothing to flush yet)
            if not(len(morphs) == 0):          
                c = Chunk(morphs,dst,srcs)
                chunks.append(c)
                morphs = list()
            sp_line = line.split(" ")
            chunk_index = int(sp_line[1])
            dst = int(sp_line[2].rstrip("D"))
            srcs = GetSrcs(chunks,chunk_index)  # helper defined elsewhere; presumably collects dependents
            
        elif line.startswith("EOS"):
            # EOS also flushes the pending chunk,
            # then stores the finished chunk list as one sentence
            c = Chunk(morphs,dst,srcs)
            chunks.append(c)
            morphs = list()
            sentenses.append(chunks)
            chunks = list()
        else:
            # morpheme line: surface\tpos,pos1,...,base,...
            sp_line2 = line.split("\t")
            sp_line3 = sp_line2[1].split(",")
            surface = unicode(sp_line2[0])
            pos = sp_line3[0]
            pos1 = sp_line3[1]
            base = sp_line3[6]
            m = Morph(surface,base,pos,pos1)
            morphs.append(m)
    return(sentenses)
Example #5
0
def chunk_list(target):
    """Parse CaboCha lines from *target* into sentences of Chunks objects.

    NOTE(review): processing stops after 150 input lines (debug limit at
    the bottom of the loop); `dst` would be unbound if EOS arrived before
    any '*' header.
    """
    i = 0
    morphs = [] # morphemes of the chunk being built ('*'-delimited)
    chunks = [] # Chunks objects of the current sentence
    sentences = []# list of sentences (EOS-delimited)

    for line in target:
        line = line.strip()
        
        if line[0] == '*':
            # a header closes the previous chunk
            if len(morphs) > 0:
                chunks.append(Chunks(morphs, dst))
                morphs = []
            dst = int(line.split(' ')[2].rstrip('D'))

        elif line != 'EOS':
            morphs.append(Morph(line))

        else:
            chunks.append(Chunks(morphs, dst))
            # back-fill srcs from each chunk's dst
            for a, chunk in enumerate(chunks):
                if chunk.dst == -1 or chunk.dst == None:
                    continue
                chunks[chunk.dst].srcs.append(a)
            sentences.append(chunks)
            morphs = []
            chunks = []
            dst = None
        
        i += 1
        if i == 150: break  # debug limit: only the first 150 lines are processed

    return sentences
Example #6
0
def read_cabocha():
    """Parse neko.txt.cabocha into a list of sentences (lists of Chunk objects).

    srcs of each new chunk is computed by scanning the chunks already in the
    current sentence for ones whose dst equals the new chunk's index.
    """
    i = 0  # sentence index
    j = 0  # chunk index within the current sentence
    sentences = []
    sentences.append([])
    # context manager: guarantees the file is closed (original leaked the handle)
    with open("neko.txt.cabocha", "r") as f:
        for line in f:
            line = line.strip("\n")
            if line == "EOS":
                i += 1
                j = 0
                sentences.append([])
            elif line[0] == "*":
                chunk = line.split()
                j = int(chunk[1])
                dst = int(chunk[2].strip("D"))
                srcs = []
                # collect earlier chunks of this sentence that depend on chunk j
                for k in range(0, j):
                    if sentences[i][k].getDst() == j:
                        srcs.append(k)
                sentences[i].append(Chunk(dst, srcs))
            else:
                # morpheme line: surface\tpos,pos1,...,base,...
                line = line.replace("\t", ",")
                cabocha = line.split(",")
                surface = cabocha[0]
                base = cabocha[7]
                pos = cabocha[1]
                pos1 = cabocha[2]
                sentences[i][j].appendMorphs(Morph(surface, base, pos, pos1))

    del sentences[-1]  # drop the empty sentence created after the final EOS
    return sentences
Example #7
0
def cabocha_data():
    """Parse neko.txt.cabocha and return a list of sentences, each a list
    of Chunk objects with dst (head index) and srcs (dependent indices)."""
    with open("neko.txt.cabocha", "r") as text:
        k_list = []  # all sentences
        n_list = []  # current sentence

        my_dict = defaultdict(list)  # head index -> list of dependent chunk indices
        for line in text:
            words = line.strip("\n").replace("\t", ",").split(",")
            if words[0].startswith("*"):
                words = words[0].split()
                index = int(words[1])
                kakari = int(words[2].strip("D"))
                my_dict[kakari].append(index)
                # record this chunk as a dependent of its head
                # BUG FIX: this chunk's srcs are the chunks pointing TO it,
                # i.e. my_dict[index] (a shared list filled in as later chunks
                # arrive); the original passed my_dict[kakari] — the head's
                # dependents — which is wrong (cf. the src_dict idiom used
                # elsewhere in this file).
                chunk = Chunk(kakari, my_dict[index])
                n_list.append(chunk)
            elif line.startswith('EOS'):
                if len(n_list) > 0:
                    k_list.append(n_list)
                n_list = []
                my_dict = defaultdict(list)  # reset per sentence
            else:
                morphs = Morph(words[0], words[7], words[1], words[2])
                chunk.morphs.append(morphs)
    return k_list
Example #8
0
def load_chunk(path=r'/Users/Naoya/Downloads/ai.ja.txt.parsed'):
    """Yield one sentence at a time as a dict {chunk id (str): Chunk}.

    Generalized: the input path is now a parameter (default unchanged, so
    existing callers are unaffected). dst/srcs are kept as strings,
    matching the original behaviour.
    """
    with open(path, encoding="utf-8") as f:
        chunks = {}
        srcs_dicts = {}  # head id (str) -> list of dependent ids (str)
        chunk_id = None  # id of the chunk currently receiving morphs
        for line in f:
            col = line.split('\t')
            if col[0] == 'EOS\n' and len(chunks) != 0:
                # attach accumulated srcs, then emit the finished sentence
                for key in srcs_dicts.keys():
                    chunks[key].srcs = srcs_dicts[key]
                yield chunks
                chunks = {}
                srcs_dicts = {}
            elif line[0] == '*':
                s_line = line.split(' ')
                dst = s_line[2].rstrip('D')
                # renamed from `id` (shadowed the builtin)
                chunk_id = s_line[1]
                if chunk_id not in chunks:
                    chunks[chunk_id] = Chunk()
                chunks[chunk_id].dst = dst
                if dst != '-1':
                    srcs_dicts.setdefault(dst, []).append(chunk_id)
            elif col[0] != 'EOS\n' and line[0] != '*':
                # morpheme line: surface\tpos,pos1,...,base,...
                other = col[1].split(',')
                chunks[chunk_id].morphs.append(
                    Morph(col[0], other[6], other[0], other[1]))
Example #9
0
def func41():
    """Yield sentences (lists of Chunk objects) from neko.txt.cabocha.

    NOTE(review): a morph line before the first '*' header would hit `b`
    unbound — assumed not to occur in well-formed CaboCha output.
    """
    sentence = list()
    store = defaultdict(list)  # head index -> indices of its dependents
    with open('../data/neko.txt.cabocha','r') as neko:
        for line in neko:
            if line.startswith('*'):
                # header: "* <idx> <dst>D ..."; a plain whitespace split suffices
                # (the original called str.replace and discarded the result — no-ops removed)
                element = line.rstrip('\n').split()
                ind = int(element[1])
                num = int(element[2].strip('D'))  # dst: head chunk index
                store[num].append(ind)  # record this chunk as a dependent of its head
                # store[ind] is a shared list, filled in as later chunks name this one as head
                b = Chunk(num,ind,store[ind])
                sentence.append(b)

            elif line.startswith('EOS'):
                if len(sentence) > 0:
                    yield sentence
                sentence = list()
                store = defaultdict(list)

            else:
                # morpheme line: surface\tpos,pos1,...,base,...
                word,morphs = line.rstrip('\n').split('\t')
                morphs = morphs.split(',')
                a = Morph(word,morphs[6],morphs[0],morphs[1])
                b.morphAppending(a)
Example #10
0
def get_chunk_list():
    '''
    For each phrase of the dependency-parsed "I Am a Cat" text, build a
    Chunk object holding
     - the list of morphemes (Morph objects) (morphs)
     - the head chunk index (dst)
     - the list of dependent chunk indices (srcs)
    and represent each sentence as a list of such Chunk objects.

    Returns: a list of sentences, each a list of Chunk objects.
    '''
    with open('neko.cabocha', 'r') as neko_file:
        result = []
        chunks = {}  # chunk index -> Chunk object
        for line in neko_file:
            line = line.rstrip()  # rstrip (not strip): keep leading whitespace intact

            # dependency header line
            if line[0] == '*':

                idx = int(line.split(' ')[1])  # chunk index
                dst = int(line.split(' ')[2][:-1])  # head chunk index (-1 = no head)

                # add the Chunk object to chunks
                if idx not in chunks:
                    chunks[idx] = Chunk()
                chunks[idx].dst = dst

                # if there is a head, create its Chunk if missing and register this index in its srcs
                if dst != -1:
                    if dst not in chunks:
                        chunks[dst] = Chunk()
                    chunks[dst].srcs.append(idx)

            # morpheme line
            elif line != 'EOS' and line[:1] != '*':
                # gather the columns into one list
                columm = line.split('\t')[1].split(',')
                columm.insert(0, line.split('\t')[0])

                # surface\tpos,pos1,pos2,pos3,conj-type,conj-form,base,reading,pronunciation
                morpheme = {
                    'surface': columm[0],
                    'base': columm[7],
                    'pos': columm[1],
                    'pos1': columm[2]
                }
                chunks[idx].morphs.append(Morph(**morpheme))

            # end of sentence
            elif line == 'EOS' and len(chunks) != 0:
                sentence = [chunks[i] for i in sorted(chunks.keys())
                            ]  # sort by key and extract the values in order
                result.append(sentence[:])  # copy — chunks.clear() below would empty a shared reference
                chunks.clear()

    return result
Example #11
0
def getChunk():
    """Parse neko.txt.cabocha and return a list of sentences, each a list
    of Chunk objects (the Chunk class is defined locally below).

    NOTE(review): dst is kept as a STRING here, and srcs is computed by
    comparing each earlier chunk's dst string against the current index
    string — confirm this matches what callers expect.
    """
    class Chunk:
        def __init__(self, morphs, dst, srcs):
            self.morphs = morphs
            self.dst = dst
            self.srcs = srcs

        def isPos(self, pos):
            # True if any morpheme in this chunk has the given POS
            for x in self.morphs:
                if x.pos == pos:
                    return True
            return False

        def morph_str(self, morph):
            # concatenate surfaces or base forms, stripping sentence punctuation
            temp = ""
            for x in self.morphs:
                if (morph == "surface"):
                    temp += x.surface
                elif (morph == "base"):
                    temp += x.base
            return temp.strip("。").strip("、")

    morph_list = []
    chunks = []
    sentence = []
    morphs = []
    srcs = []
    dst = None
    for line in open("neko.txt.cabocha"):
        if line[0] != "*" and "EOS" not in line:
            # morpheme line: surface\tpos,pos1,...,base,...
            words = re.split("\t|,", line)
            morph_obj = Morph(words[0], words[7], words[1], words[2])
            morphs.append(morph_obj)
            morph_list.append(morph_obj)
        else:
            if line[0] == "*":
                # flush the chunk collected so far
                if dst != None:
                    chunks.append(Chunk(morphs, dst, srcs))
                    morphs = []
                    srcs = []
                    dst = None
                words = re.split("\s", line)
                dst = words[2][:-1]  # head index as a string (trailing 'D' removed)
                # collect earlier chunks whose dst (string) equals this chunk's index (string)
                for i, c in enumerate(chunks[:int(words[1])]):
                    if c.dst == words[1]:
                        srcs.append(i)
            if morphs != []:
                chunks.append(Chunk(morphs, dst, srcs))
                morphs = []
                srcs = []
                dst = None
            if "EOS" in line:
                sentence.append(chunks)
                chunks = []
                i = 0
    return sentence
Example #12
0
def morph2chunk(morphlists):
    """Group a flat Morph list into Chunk objects, splitting on '*' header morphs.

    Side effect: appends a sentinel Morph("*") to the input list so that
    the final chunk gets flushed.
    """
    morphlists.append(Morph("*"))
    chunks, current = [], []
    for morph in morphlists:
        if morph.show()[0] == "*":
            # a header flushes the previous chunk (if any) and starts a new
            # one seeded with the header morph itself
            if current:
                chunks.append(Chunk(current))
            current = [morph]
        else:
            current.append(morph)
    return chunks
Example #13
0
def get_chunk_list():
    """Parse sentence.cabocha and return a list of sentences, each a list
    of Chunk objects ordered by chunk index."""
    with open('sentence.cabocha', 'r') as sentence_file:
        result = []
        chunks = {}  # chunk index -> Chunk object
        for line in sentence_file:
            line = line.rstrip()  # rstrip (not strip): keep leading whitespace intact

            # dependency header line
            if line[0] == '*':

                idx = int(line.split(' ')[1])  # chunk index
                dst = int(line.split(' ')[2][:-1])  # head chunk index (-1 = no head)

                # add the Chunk object to chunks
                if idx not in chunks:
                    chunks[idx] = Chunk()
                chunks[idx].dst = dst

                # if there is a head, create its Chunk if missing and register this index in its srcs
                if dst != -1:
                    if dst not in chunks:
                        chunks[dst] = Chunk()
                    chunks[dst].srcs.append(idx)

            # morpheme line
            elif line != 'EOS' and line[:1] != '*':
                # gather the columns into one list
                columm = line.split('\t')[1].split(',')
                columm.insert(0, line.split('\t')[0])

                # surface\tpos,pos1,pos2,pos3,conj-type,conj-form,base,reading,pronunciation
                morpheme = {
                    'surface': columm[0],
                    'base': columm[7],
                    'pos': columm[1],
                    'pos1': columm[2]
                }
                chunks[idx].morphs.append(Morph(**morpheme))

            # end of sentence
            elif line == 'EOS' and len(chunks) != 0:
                sentence = [chunks[i] for i in sorted(chunks.keys())
                            ]  # sort by key and extract the values in order
                result.append(sentence[:])  # copy — chunks.clear() below would empty a shared reference
                chunks.clear()

    return result
Example #14
0
 def parse(lines) -> Sentence:
     """Build a Sentence from one sentence's worth of CaboCha output lines."""
     finished = []
     current = Chunk([], -1, [])
     dependents = defaultdict(list)  # head index -> dependent chunk indices
     for raw in lines:
         if raw[0] != '*':
             current.add_morph(Morph.parse(raw))
             continue
         # a new chunk header: flush the previous chunk first
         if current.morphs:
             finished.append(current)
         idx, head, current = Chunk.parse(raw)
         assert idx == len(finished)
         if head != -1:
             dependents[head].append(idx)
     # flush the final chunk (no header follows it)
     if current.morphs:
         finished.append(current)
     for head, idxs in dependents.items():
         finished[head].set_srcs(idxs)
     return Sentence(finished)
Example #15
0
def get_chunk_list(file_path: str):
    """Generator: yield each sentence of the CaboCha file at *file_path*
    as a list of Chunk objects sorted by chunk index."""
    with open(file_path) as f:
        chunks = {}  # chunk index -> Chunk object
        for line in f:
            line = line.rstrip()

            # dependency header line
            if line[0] == '*':
                idx = int(line.split(' ')[1])  # chunk index
                dst = int(line.split(' ')[2][:-1])  # head chunk index (-1 = no head)

                # add the Chunk object to chunks
                if idx not in chunks:
                    chunks[idx] = Chunk()
                chunks[idx].dst = dst

                # if there is a head, create its Chunk if missing and register this index in its srcs
                if dst != -1:
                    if dst not in chunks:
                        chunks[dst] = Chunk()
                    chunks[dst].srcs.append(idx)

            # morpheme line
            elif line != 'EOS' and line[:1] != '*':
                line = ','.join(line.split("\t")).split(',')

                # append a Morph to the current chunk
                chunks[idx].morphs.append(
                    Morph(
                        line[0],  # surface
                        line[7],  # base
                        line[1],  # pos
                        line[2]  # pos1
                    ))

            # end of sentence
            elif line == 'EOS':
                if len(chunks) != 0:
                    yield [chunks[i] for i in sorted(chunks.keys())]
                else:
                    yield []
                chunks = {}
Example #16
0
def load_chunk(file_path):
    """Parse the CaboCha file at *file_path* into a list of Sentence objects.

    NOTE(review): `dst` would be unbound if a morph line or EOS preceded
    the first '*' header; well-formed CaboCha output starts with one.
    """
    sentences = []
    chunks = []
    morphs = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line[0] == '*':  # header, e.g. "* 0 -1D 1/1 0.000000"
                if len(morphs) > 0:
                    chunks.append(Chunk(
                        morphs, dst))  # flush the previous chunk into the chunk list
                    morphs = []
                dst = int(line.split(' ')[2].rstrip('D'))  # head index of the chunk that follows
            elif line != 'EOS\n':
                morphs.append(Morph(line))  # build a Morph and buffer it
            else:
                chunks.append(Chunk(morphs, dst))
                sentences.append(
                    Sentence(chunks))  # wrap the chunk list in a Sentence
                morphs = []
                chunks = []
                dst = int()  # reset to 0 (note: not None)
    return sentences
Example #17
0
def get_neko_list():
    """Yield each sentence of ./neko.txt.cabocha as a list of Chunk objects."""
    sentence = list()
    src = defaultdict(list)  # head index -> dependent chunk ids
    # context manager so the handle is closed (original leaked it)
    with open('./neko.txt.cabocha') as f:
        for line in f:
            if line.startswith('*'):
                ids = line.rstrip('\n').split()
                chunk_id = int(ids[1])
                dst = int(ids[2].rstrip('D'))
                src[dst].append(chunk_id)
                # src[chunk_id] is a shared list, filled in as later chunks
                # name this one as their head
                chunk = Chunk(dst, src[chunk_id])
                sentence.append(chunk)

            elif line.startswith('EOS'):
                if len(sentence) > 0:
                    yield sentence
                sentence = list()
                src = defaultdict(list)

            else:
                # morpheme line: surface\tfeatures
                surface, morphs = line.rstrip('\n').split('\t')
                morphs = morphs.split(',')
                morph = Morph(surface, morphs[0], morphs[1], morphs[6])
                chunk.append_morph(morph)
Example #18
0
def cabocha_chunk_data(data):
    """Yield each sentence of the CaboCha lines in *data* as a list of
    Chunk objects (morphs, dst, srcs)."""
    header_re = re.compile(r'\* ')
    sentence_chunks = []
    morph_buf = []
    srcs = collections.defaultdict(list)  # head index -> dependent indices
    for line in data:
        if header_re.match(line):
            # a new header flushes the morphs of the previous chunk
            if morph_buf != []:
                sentence_chunks.append(Chunk(morph_buf, dst, srcs[num_chunk]))
                morph_buf = []
            fields = line.split()
            num_chunk = int(fields[1])
            dst = int(re.match(r'-*[0-9]+(?=D)', fields[2]).group())
            if dst != -1:
                srcs[dst].append(num_chunk)
        elif line == 'EOS\n':
            if sentence_chunks != [] or morph_buf != []:
                # flush the last chunk, then emit the sentence and reset
                sentence_chunks.append(Chunk(morph_buf, dst, srcs[num_chunk]))
                yield sentence_chunks
                sentence_chunks = []
                morph_buf = []
                srcs = collections.defaultdict(list)
        else:
            morph_buf.append(Morph(line.replace('\t', ',').split(',')))
Example #19
0
def chunk_list(text):
    """Parse the CaboCha file at path *text* into a list of sentences.

    Each sentence is a list of Chunk objects; srcs is filled with the
    indices of the chunks depending on each chunk.
    """
    sentences = []
    sentence = []
    phrase = Chunk()
    with open(text, 'r') as f:  # close the handle (original leaked it)
        for line in f:
            line = line.split()
            # exact comparisons: the original used `not in '*'` / `not in 'EOS'`
            # (substring tests), which misrouted tokens such as 'EO' or 'S'
            if line[0] != '*' and line[0] != 'EOS':
                l = line[0].split(',') + line[1].split(',')
                morph = Morph(surface=l[0], base=l[7], pos=l[1], pos1=l[2])
                phrase.morphs.append(morph)
            else:
                if len(phrase.morphs) > 0:
                    sentence.append(phrase)
                    phrase = Chunk()
                if line[0] == '*':
                    phrase.dst = int(line[2].strip('D'))
                elif line[0] == 'EOS':
                    # derive srcs from dst. Fixes two defects: the original
                    # OVERWROTE srcs with an int instead of appending, and a
                    # root chunk (dst == -1) wrongly indexed the last chunk.
                    for no, s in enumerate(sentence):
                        if s.dst != -1:
                            sentence[s.dst].srcs.append(no)
                    sentences.append(sentence)
                    sentence = []

    return sentences
Example #20
0
def cabocha_data():
    """Return all sentences of neko.txt.cabocha, each a list of Chunk objects."""
    with open('neko.txt.cabocha') as fin:
        current_sentence = []
        all_sentences = []
        dependents = defaultdict(list)  # head index -> dependent indices
        for raw in fin:
            cols = raw.strip('\n').replace('\t', ',').split(',')
            head_token = cols[0]
            if head_token.startswith('*'):
                fields = head_token.split()
                idx = int(fields[1])
                head = int(fields[2].strip('D'))
                # dependents[idx] is a shared list that later chunks append into
                chunk = Chunk(head, dependents[idx])
                dependents[head].append(idx)
                current_sentence.append(chunk)
            elif head_token == 'EOS':
                dependents = defaultdict(list)  # reset per sentence
                if current_sentence:
                    all_sentences.append(current_sentence)
                    current_sentence = []
            else:
                chunk.morphs.append(Morph(cols[0], cols[7], cols[1], cols[2]))

    return all_sentences
Example #21
0
def read_file(file_path='./ai.ja.txt.parsed'):
    """Parse a CaboCha file into a list of sentences (lists of Chunk objects).

    NOTE(review): Chunk is constructed as Chunk(dst, srcs) where 'srcs' is
    actually this chunk's OWN index (items[1]); likewise Morph receives the
    base form (parts[6]) as its second argument, locally named 'surface' —
    confirm against the Chunk/Morph definitions.
    """
    doc = []
    chunk = Chunk(None, None)
    with open(file_path) as fp:
        sentence = []
        for line in fp:
            line = line.strip()
            if line[0] == '*':
                # a new header closes the previous chunk
                if len(chunk.morphs) > 0:
                    sentence.append(chunk)
                # define chunk
                items = line.split()
                srcs = int(items[1])  # this chunk's own index
                dst = int(items[2][:-1])  # head index (trailing 'D' removed)
                chunk = Chunk(dst, srcs)

            elif line[:3] == 'EOS':
                if len(chunk.morphs) > 0:
                    sentence.append(chunk)
                if len(sentence) == 0:
                    continue
                chunk = Chunk(None, None)
                doc.append(sentence)
                sentence = []
            else:
                # define morph
                word_parts = line.split('\t')
                parts = word_parts[1].split(',')
                word = word_parts[0]
                surface = parts[6]  # NOTE(review): parts[6] is the base form in MeCab output
                pos = parts[0]
                pos1 = parts[1]
                morph = Morph(word, surface, pos, pos1)
                chunk.morphs.append(morph)
    return doc
Example #22
0
def func2():
    """Generator: parse neko.txt.cabocha, yielding the accumulated chunk
    list at each EOS.

    NOTE(review): dst/srcs/index are kept as STRINGS. The srcs wiring
    (relate[words[2]].append(words[1]); chunk.srcs = relate[words[1]])
    relies on shared defaultdict lists — verify it actually produces the
    intended dependents list.
    """
    with open('neko.txt.cabocha', 'r') as f:
        m_tmp = list()
        c_tmp = list()
        chunk = Chunk()
        flag = False
        relate = defaultdict(lambda: list())
        for line in f:
            # normalize: spaces/tabs to commas, drop the 'D' of the dst field.
            # NOTE(review): this removes EVERY 'D', which also corrupts morph
            # lines containing the letter D — confirm against the input data.
            line = line.replace(' ', ',')
            line = line.replace('D', '')
            words = (line.replace('\t', ',')).strip().split(',')
            kts = Morph()
            if (words[0] != '*' and words[0] != 'EOS'):  # morpheme line: buffer it
                kts.surface = words[0]
                kts.base = words[7]
                kts.pos = words[1]
                kts.pos1 = words[2]
                m_tmp.append(kts)

            elif (words[0] == 'EOS'):
                chunk.morphs = m_tmp  # attach the buffered morphemes
                m_tmp = list()
                c_tmp.append(chunk)
                yield c_tmp
                relate = defaultdict(lambda: list())
                c_tmp = list()
                flag = False

            elif (words[0] == '*'):
                # any header after the first flushes the previous chunk
                if flag:
                    chunk.morphs = m_tmp  # attach the buffered morphemes
                    m_tmp = list()
                    c_tmp.append(chunk)
                    chunk = Chunk()
                flag = True
                chunk.dst = words[2]
                relate[words[2]].append(words[1])
                chunk.srcs = relate[words[1]]
                chunk.index = words[1]
Example #23
0
from collections import defaultdict
from knock40 import Morph, morph2sents
from knock41 import Chunk, morph2chunk, sources

if __name__ == "__main__":
    # Read the parsed file once; `ai` is rebound from file handle to its lines.
    with open("ai.ja1.txt.parsed", "r") as ai:
        ai = ai.readlines()

    # Wrap every raw line in a Morph object.
    ai_morphs = []
    for i in range(len(ai)):
        ai_morphs.append(Morph(ai[i]))

    sents = morph2sents(ai_morphs)
    for sent in sents:
        chunks = morph2chunk(sent)
        sources(chunks)  # fills in each chunk's srcs (imported from knock41)
        for i in range(len(chunks)):
            # Find sahen-noun chunks containing 'を', then inspect the verb
            # chunk they depend on and that verb's other dependents.
            if "サ変接続" in chunks[i].show_morph_pos1():
                if "を" in chunks[i].show_only_listwords():
                    print(chunks[i].show_only_words(), end="")
                    goto = chunks[i].dst
                    verb = chunks[goto].show_base_for_X("動詞")
                    adpos = set()
                    dep = []
                    if verb != None:
                        if chunks[goto].srcs != []:
                            for head_id in chunks[goto].srcs:
                                adp = chunks[head_id].show_base_for_X("助詞")
                                if adp != None and head_id != i:
Example #24
0

morphs = []
chunks = []  # one sentence: a list of Chunk objects
sentences = []  # list of sentences
with open('ai.ja.txt.parsed') as file:
    for line in file:
        if line == '\n':
            continue

        # morpheme line: buffer a Morph
        if line != 'EOS\n' and line[0] != '*':
            line = line.replace('\n', '').split('\t')
            morph = [line[0]]
            morph.extend(line[1].split(','))
            morphs.append(Morph(morph))

        # on EOS or a dependency header, flush buffered morphs into chunks
        elif len(morphs) > 0:
            chunks.append(Chunk(morphs, dst))
            morphs = []

        # dependency header: update dst
        # (note: `line` may already be a list here; line[0] is then the first column)
        if line[0] == '*':
            line = line.replace('\n', '').split()
            dst = int(line[2].strip('D'))

        # on EOS with at least one chunk, add the sentence to sentences
        if line == 'EOS\n' and len(chunks) > 0:
            for i, chunk in enumerate(chunks):
                if chunk.dst != -1:  # has a head
Example #25
0
            chunk.dst = int(line[2][:-1])  # head index (trailing 'D' removed)
        elif line == 'EOS':
            # consecutive EOS lines mean an empty sentence: skip it
            if lines[i - 1] == "EOS":
                continue
            elif len(chunk.morphs) > 0:
                sen.append(chunk)
                sentences.append(sen)
                sen = []
            chunk = Chunk()
        else:
            # morpheme line: surface\tfeatures
            surface, info = line.split('\t')
            info = info.split(',')
            base = info[-3]  # NOTE(review): assumes a 9-field feature list — confirm
            pos = info[0]
            pos1 = info[1]
            morph = Morph(surface, base, pos, pos1)
            chunk.morphs.append(morph)
    # back-fill srcs from each chunk's dst
    for sen in sentences:
        for chunk in sen:
            dst = chunk.dst
            if not dst == -1:
                sen[dst].srcs.append(chunk.index)
if __name__ == "__main__":
    # dump every parsed chunk to chunks.txt, one '------------' block per sentence
    with open('chunks.txt', 'w') as f:
        for sen in sentences:
            print('------------', file=f)
            for chunk in sen:
                print(chunk, file=f)