Example no. 1
0
    def Triple_extract(self, path):
            """Extract (Noun, Particle, Verb) dependency triples from a report sheet.

            Each data row of the spreadsheet wrapped by ``Treport(path)`` is
            cleaned with ``delete_unnecc``, split into sentences on the Japanese
            full stop, and dependency-parsed with CaboCha via ``Language``.  For
            every (compound) noun followed by a case particle, the governing
            verb / adjective / suru-verb in the chunk it depends on is located
            and one triple is recorded.

            :param path: workbook path handed straight to ``Treport``.
            :returns: dict mapping an id tuple ``(report_id, Sentence_id,
                      Verb_id)`` to a list of ``(Noun, Particle, Verb)``
                      unicode triples.
            """
            TR = Treport(path)
            triplelist = {}
            # Morpheme categories that allow a preceding chunk to be glued onto
            # the noun phrase:
            #   Mor_con[0] - POS names: adjective, auxiliary verb, conjunction
            #   Mor_con[1] - POS subtypes: adnominalization, parallel particle,
            #                comma (touten), conjunctive particle
            Mor_con = [[u"形容詞", u"助動詞", u"接続詞"], [u"連体化", u"並立助詞", u"読点", u"接続助詞"]]
            for i in range(1, TR.s.nrows):  # row 0 is skipped (header row, presumably)
                #if i>TR.s.nrows/2: break
                # NOTE(review): debugging leftovers - only the first 10 rows are
                # processed and each row index is printed; remove for production.
                if i>10: break
                print i

                noenc = TR.delete_unnecc(i)
                #print TR.s.cell_value(i, 2).replace(u"-", u"")
                #print noenc
                # Split the cleaned row text into sentences on the ideographic
                # full stop.
                for Sentence_id, perSen in enumerate(noenc.split(u"。")):
                   # TR.s.cell_value(i, 2)
                    Lan = Language(perSen)
                    cabocha_xml = Lan.cabocha_command()
                    # chunkinfo: per-chunk attribute dicts (u"id", u"link", ...)
                    # tokinfo: per-chunk list of morpheme feature lists
                    # sentence_tok: per-chunk list of surface token strings
                    chunkinfo, tokinfo, sentence_tok = Lan.chunk_structured(cabocha_xml)
                    #triple_perR = []
                    #id_perR = []
                    for chunk in chunkinfo:
                        compnoun_tail_id = -1
                        for tok_id, tokinfo_mor in enumerate(tokinfo[int(chunk[u"id"])]):
                            #print tok_id, compnoun_tail_id
                            # Skip tokens already absorbed into the current
                            # compound noun.
                            if tok_id <= compnoun_tail_id:
                                continue
                            sentence_tok_set = sentence_tok[int(chunk[u"id"])]
                            if tokinfo_mor[0]==u"名詞":  # POS field 0 == noun
                                Noun = sentence_tok_set[tok_id]
                                compnoun_tail_id = tok_id
                                # Absorb the following run of nouns into one
                                # compound noun.
                                # NOTE(review): the range stops at len-1, so a
                                # noun sitting in the chunk's last token slot is
                                # never absorbed - confirm this is intentional.
                                for tok_id_noun in range(tok_id+1, len(tokinfo[int(chunk[u"id"])])-1):
                                    if tokinfo[int(chunk[u"id"])][tok_id_noun][0]==u"名詞" :
                                        # Hard-coded exclusion: the surface form
                                        # 濃度 ("concentration") is skipped.
                                        if sentence_tok[int(chunk[u"id"])][tok_id_noun] == u"濃度":
                                            continue
                                        Noun += sentence_tok[int(chunk[u"id"])][tok_id_noun]
                                        compnoun_tail_id = tok_id_noun
                                    else:
                                        break

                                # Compound noun runs to the end of the chunk,
                                # so no trailing particle can follow - skip.
                                if compnoun_tail_id+1 == len(tokinfo[int(chunk[u"id"])]):
                                    continue
                                chunk_id_from = int(chunk[u"id"])


                                # Walk backwards over the chunks preceding this
                                # one (i_from = -1, -2, ...): while a chunk
                                # links to chunk_id_from AND its last morpheme
                                # is in Mor_con, prepend its surface tokens to
                                # the noun phrase; stop at the first mismatch.
                                # NOTE(review): decrementing chunk_id_from here
                                # changes the link target matched on the next
                                # iteration - verify this chaining is intended.
                                for i_from in reversed(range((int(chunk["id"])+1)*-1, 0)):
                                    if int(chunkinfo[int(chunk[u"id"])+i_from]["link"])==chunk_id_from:
                                        chunk_id_from -= 1
                                        from_tail_tok = tokinfo[int(chunk[u"id"])+i_from][len(tokinfo[int(chunk[u"id"])+i_from])-1]
                                        if from_tail_tok[0] in Mor_con[0] or from_tail_tok[1] in Mor_con[1]:
                                            for sentence_tok_from in reversed(list(sentence_tok[int(chunkinfo[int(chunk[u"id"])+i_from]["id"])])):
                                                Noun = sentence_tok_from + Noun
                                        else:
                                            break
                                    else:
                                        break


                                # Token right after the compound noun must be a
                                # particle (助詞) but not a conjunctive particle
                                # (接続助詞).
                                if tokinfo[int(chunk[u"id"])][compnoun_tail_id+1][0]==u"助詞" and tokinfo[int(chunk[u"id"])][compnoun_tail_id+1][1]!=u"接続助詞":
                                    # Feature index 6 appears to be the base
                                    # (dictionary) form - TODO confirm against
                                    # the MeCab/CaboCha feature layout used.
                                    Particle = tokinfo[int(chunk[u"id"])][compnoun_tail_id+1][6]

                                    # Scan the chunk this one depends on
                                    # (u"link") for the governing predicate.
                                    Noun_suru = u""
                                    for tok_id_link, tok_link_mor in enumerate(tokinfo[int(chunk[u"link"])]):
                                        # Collect leading nouns: they form the
                                        # stem of a potential suru-verb.
                                        if tok_link_mor[0]==u"名詞" and tok_link_mor[1]!=u"形容動詞語幹":
                                            Noun_suru += sentence_tok[int(chunk[u"link"])][tok_id_link]
                                            continue
                                        # Verb, adjective, or adjectival-noun
                                        # stem (形容動詞語幹).
                                        if tok_link_mor[0]==u"動詞" or tok_link_mor[0]==u"形容詞" or tok_link_mor[1]==u"形容動詞語幹":
                                            if tok_link_mor[1]!=u"末尾":
                                                Verb = u""
                                                # する / できる after a noun stem:
                                                # record the suru-verb "<stem>する".
                                                if tok_link_mor[6]==u"する" or tok_link_mor[6]==u"できる":
                                                    Verb = Noun_suru+u"する"
                                                else:
                                                    Verb = tok_link_mor[6]

                                                Verb_id = int(chunk[u"link"])
                                                # Delete everything except digits
                                                # to build the report id.

                                                # Numeric cell: use the float
                                                # value directly as the id.
                                                if isinstance(TR.s.cell_value(i, 2), float):
                                                    id_tuple = (TR.s.cell_value(i, 2), Sentence_id, Verb_id)
                                                else:
                                                    # No digit in column 2: fall
                                                    # back to the SECOND number
                                                    # run found in column 1
                                                    # (search again past the
                                                    # first match's end).
                                                    if re.search("[0-9]", TR.s.cell_value(i, 2)) is None:
                                                        id_tuple = (re.search("\d+[-]*\d+", TR.s.cell_value(i, 1)[re.search("\d+[-]*\d+", TR.s.cell_value(i, 1)).end():]).group(0).replace(u"-", u""), Sentence_id, Verb_id)
                                                    else:
                                                        id_tuple = (re.search("\d+[-]*\d+", TR.s.cell_value(i, 2)).group(0).replace(u"-", u""), Sentence_id, Verb_id)

                                                # Append the triple under its id
                                                # key (list per key).
                                                if id_tuple not in triplelist.keys():
                                                    triplelist[id_tuple] = [(Noun, Particle, Verb)]
                                                else:
                                                    triple_tmp = triplelist[id_tuple]
                                                    triple_tmp.append((Noun, Particle, Verb))
                                                    triplelist[id_tuple] = triple_tmp
                                                #print Noun, Particle, Verb, TR.s.cell_value(i, 2).replace(u"-", u""), Sentence_id, Verb_id
                                                # Only the first predicate per
                                                # link chunk is recorded.
                                                break

            return triplelist