def Triple_extract(self, path):
    """Extract (noun, particle, verb) dependency triples from a report.

    Reads the spreadsheet wrapped by ``Treport(path)`` row by row, splits
    each cleaned row text into sentences, parses every sentence with
    CaboCha (via the project's ``Language`` helper), and records one
    triple per noun+particle pair together with the base form of the
    verb/adjective chunk it depends on.

    :param path: path handed to ``Treport`` -- presumably an Excel
        workbook (``TR.s`` is used like an xlrd sheet; confirm against
        Treport).
    :return: dict mapping ``(report_id, sentence_index, verb_chunk_index)``
        to a list of ``(Noun, Particle, Verb)`` unicode triples.
    """
    TR = Treport(path)
    triplelist = {}
    # Morphological classes that allow a preceding chunk to be glued onto
    # the noun phrase: Mor_con[0] = POS names (adjective, auxiliary verb,
    # conjunction); Mor_con[1] = particle subtypes (adnominalizer,
    # parallel marker, comma, conjunctive particle).
    Mor_con = [[u"形容詞", u"助動詞", u"接続詞"], [u"連体化", u"並立助詞", u"読点", u"接続助詞"]]
    for i in range(1, TR.s.nrows):  # row 0 is presumably a header row
        #if i>TR.s.nrows/2: break
        # NOTE(review): debug limit -- only the first 10 data rows are ever
        # processed; remove once no longer needed.
        if i>10: break
        print i
        # Row text with unnecessary parts stripped (see Treport.delete_unnecc).
        noenc = TR.delete_unnecc(i)
        #print TR.s.cell_value(i, 2).replace(u"-", u"")
        #print noenc
        # Split the row into sentences on the Japanese full stop.
        for Sentence_id, perSen in enumerate(noenc.split(u"。")):  # TR.s.cell_value(i, 2)
            Lan = Language(perSen)
            # Run CaboCha and decompose its XML into parallel per-chunk
            # structures (inferred from the indexing below -- confirm
            # against Language.chunk_structured):
            #   chunkinfo[c]    -> chunk attribute dict ("id", "link", ...)
            #   tokinfo[c]      -> list of per-token morpheme feature lists
            #   sentence_tok[c] -> list of per-token surface strings
            cabocha_xml = Lan.cabocha_command()
            chunkinfo, tokinfo, sentence_tok = Lan.chunk_structured(cabocha_xml)
            #triple_perR = []
            #id_perR = []
            for chunk in chunkinfo:
                # Index of the last token already consumed as part of a
                # compound noun; earlier tokens are skipped below.
                compnoun_tail_id = -1
                for tok_id, tokinfo_mor in enumerate(tokinfo[int(chunk[u"id"])]):
                    #print tok_id, compnoun_tail_id
                    if tok_id <= compnoun_tail_id: continue
                    sentence_tok_set = sentence_tok[int(chunk[u"id"])]
                    if tokinfo_mor[0]==u"名詞":  # token is a noun
                        Noun = sentence_tok_set[tok_id]
                        compnoun_tail_id = tok_id
                        # Absorb following noun tokens into one compound noun
                        # (the chunk's final token is excluded by the -1
                        # bound; it is examined as a particle further down).
                        for tok_id_noun in range(tok_id+1, len(tokinfo[int(chunk[u"id"])])-1):
                            if tokinfo[int(chunk[u"id"])][tok_id_noun][0]==u"名詞":
                                # Skip the literal word "濃度" (concentration).
                                if sentence_tok[int(chunk[u"id"])][tok_id_noun] == u"濃度": continue
                                Noun += sentence_tok[int(chunk[u"id"])][tok_id_noun]
                                compnoun_tail_id = tok_id_noun
                            else:
                                break
                        # Compound noun runs to the end of the chunk: no
                        # particle can follow it, so no triple here.
                        if compnoun_tail_id+1 == len(tokinfo[int(chunk[u"id"])]):
                            continue
                        # Walk the chunks to the left (i_from = -1, -2, ...)
                        # and prepend each one that (a) links to the chunk
                        # accepted last and (b) ends in one of the connective
                        # classes in Mor_con; stop at the first chunk that
                        # fails either test.
                        chunk_id_from = int(chunk[u"id"])
                        for i_from in reversed(range((int(chunk["id"])+1)*-1, 0)):
                            if int(chunkinfo[int(chunk[u"id"])+i_from]["link"])==chunk_id_from:
                                chunk_id_from -= 1
                                # Final token of the left-hand chunk.
                                from_tail_tok = tokinfo[int(chunk[u"id"])+i_from][len(tokinfo[int(chunk[u"id"])+i_from])-1]
                                if from_tail_tok[0] in Mor_con[0] or from_tail_tok[1] in Mor_con[1]:
                                    # Prepend the left chunk's surface tokens.
                                    for sentence_tok_from in reversed(list(sentence_tok[int(chunkinfo[int(chunk[u"id"])+i_from]["id"])])):
                                        Noun = sentence_tok_from + Noun
                                else:
                                    break
                            else:
                                break
                        # The token right after the compound noun must be a
                        # particle -- but not a conjunctive particle -- for
                        # the noun to take part in a triple.
                        if tokinfo[int(chunk[u"id"])][compnoun_tail_id+1][0]==u"助詞" and tokinfo[int(chunk[u"id"])][compnoun_tail_id+1][1]!=u"接続助詞":
                            # Feature column 6 looks like the base/dictionary
                            # form -- TODO confirm against the CaboCha/MeCab
                            # feature layout used by Language.
                            Particle = tokinfo[int(chunk[u"id"])][compnoun_tail_id+1][6]
                            # Noun part of a potential suru-verb, collected
                            # from the governing (link-target) chunk.
                            Noun_suru = u""
                            for tok_id_link, tok_link_mor in enumerate(tokinfo[int(chunk[u"link"])]):
                                if tok_link_mor[0]==u"名詞" and tok_link_mor[1]!=u"形容動詞語幹":
                                    Noun_suru += sentence_tok[int(chunk[u"link"])][tok_id_link]
                                    continue
                                # Verb, adjective, or adjectival-noun stem.
                                if tok_link_mor[0]==u"動詞" or tok_link_mor[0]==u"形容詞" or tok_link_mor[1]==u"形容動詞語幹":
                                    if tok_link_mor[1]!=u"末尾":  # skip suffix tokens
                                        Verb = u""
                                        # suru / dekiru: rebuild the suru-verb
                                        # from the nouns collected above.
                                        if tok_link_mor[6]==u"する" or tok_link_mor[6]==u"できる":
                                            Verb = Noun_suru+u"する"
                                        else:
                                            Verb = tok_link_mor[6]
                                        Verb_id = int(chunk[u"link"])
                                        # Build the report-id part of the key
                                        # (original comment: "delete
                                        # non-digits").
                                        if isinstance(TR.s.cell_value(i, 2), float):
                                            id_tuple = (TR.s.cell_value(i, 2), Sentence_id, Verb_id)
                                        else:
                                            if re.search("[0-9]", TR.s.cell_value(i, 2)) is None:
                                                # Column 2 has no digits: fall
                                                # back to the SECOND
                                                # "\d+-\d+"-style match found
                                                # in column 1.
                                                id_tuple = (re.search("\d+[-]*\d+", TR.s.cell_value(i, 1)[re.search("\d+[-]*\d+", TR.s.cell_value(i, 1)).end():]).group(0).replace(u"-", u""), Sentence_id, Verb_id)
                                            else:
                                                id_tuple = (re.search("\d+[-]*\d+", TR.s.cell_value(i, 2)).group(0).replace(u"-", u""), Sentence_id, Verb_id)
                                        # Append the triple under its key.
                                        # NOTE(review): `not in
                                        # triplelist.keys()` is an O(n) list
                                        # scan in Python 2; `not in
                                        # triplelist` would suffice.
                                        if id_tuple not in triplelist.keys():
                                            triplelist[id_tuple] = [(Noun, Particle, Verb)]
                                        else:
                                            triple_tmp = triplelist[id_tuple]
                                            triple_tmp.append((Noun, Particle, Verb))
                                            triplelist[id_tuple] = triple_tmp
                                        #print Noun, Particle, Verb, TR.s.cell_value(i, 2).replace(u"-", u""), Sentence_id, Verb_id
                                    # First verb-class token settles the triple
                                    # for this chunk -- stop scanning.
                                    break
    return triplelist