# NOTE: these methods assume the enclosing module imports itertools, re,
# numpy as np, collections.Counter, pandas (Series, DataFrame), and the
# local Language / TWordclass morphological-analysis helpers (Python 2).

def NV_extract(self):
    NList = []
    VList = []
    for i in range(1, self.s.nrows):
        print i  # progress: current spreadsheet row
        noenc = self.delete_unnecc(i)
        lan = Language(noenc)
        if len(lan.str) > 4000:  # skip overly long texts
            continue
        word = lan.getMorpheme()
        tmpNoun = u""
        NN = 0
        for j, line in enumerate(word):  # reuse `word`; no second analysis pass
            # verbs: keep the base form; for suru-verbs, prepend the noun stem
            if line[1] == u"動詞":
                if line[7] == u"する":
                    VList.append(word[j - 1][0] + line[7])
                else:
                    VList.append(line[7])
            # nouns: concatenate runs of consecutive nouns into compounds
            if line[1] == u"名詞" and j != len(word) - 1:
                if word[j + 1][1] == u"名詞":
                    tmpNoun += line[0]
                    NN += 1
                    if NN > 3:  # give up on compounds longer than four morphemes
                        tmpNoun = u""
                        NN = 0
                        continue
                elif word[j + 1][7] != u"する":
                    NList.append(tmpNoun + line[0])
                    tmpNoun = u""
                    NN = 0
    return NList, VList
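
# --- Minimal standalone sketch (not part of the class, assumptions flagged):
# the compound-noun logic of NV_extract, demonstrated on a hand-written
# morpheme list instead of Language/MeCab. Each toy row mimics MeCab output:
# index 0 = surface, 1 = POS, 7 = base form; the words are hypothetical.
def _demo_compound_nouns(word):
    NList, tmpNoun, NN = [], u"", 0
    for j, line in enumerate(word):
        if line[1] == u"名詞" and j != len(word) - 1:
            if word[j + 1][1] == u"名詞":
                tmpNoun += line[0]
                NN += 1
                if NN > 3:  # give up on compounds longer than four morphemes
                    tmpNoun, NN = u"", 0
            elif word[j + 1][7] != u"する":
                NList.append(tmpNoun + line[0])
                tmpNoun, NN = u"", 0
    return NList

# toy input: 潤滑 + 油 (a two-morpheme compound), then が + 漏れる
_toy = [[u"潤滑", u"名詞", u"", u"", u"", u"", u"", u"潤滑"],
        [u"油", u"名詞", u"", u"", u"", u"", u"", u"油"],
        [u"が", u"助詞", u"", u"", u"", u"", u"", u"が"],
        [u"漏れる", u"動詞", u"", u"", u"", u"", u"", u"漏れる"]]
print _demo_compound_nouns(_toy)  # -> [u"潤滑油"]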

def extract_terms(self, case_df):
    Noun_comp = u""
    wakachi = u""
    preR_id = 1
    terms = []
    documents = []
    for (Report_id, frame) in zip(
            case_df[u"報告書_id"],
            case_df.loc[:, [u"主体", u"起点", u"対象", u"状況", u"着点",
                            u"手段", u"関係", u"動詞"]].values):
        # if Report_id > 100: break
        if preR_id != Report_id:
            # report id changed: flush the accumulated tokens as one document
            documents.append(wakachi)
            print Report_id
            # print wakachi
            wakachi = u""
        # frame[7] is the verb; strip a trailing u"する" from suru-verbs
        if frame[7][-2:] != u"する":
            wakachi += frame[7] + u" "
            if frame[7] not in terms:
                terms.append(frame[7])
        else:
            wakachi += frame[7][:-2] + u" "
            if frame[7][:-2] not in terms:
                terms.append(frame[7][:-2])
        for i in range(0, 7):  # the seven case slots
            if frame[i] == u' ':
                continue
            Lan = Language(frame[i])
            outList = Lan.getMorpheme()
            Mor_1 = [m[1] for m in outList]  # POS column (avoids shadowing i)
            # if (u"接続詞" in Mor_1) | (u"記号" in Mor_1):
            #     continue
            for mi, Mor in enumerate(outList):
                if Mor_1[mi] == u"名詞" and Mor[2] != u"形容動詞語幹":
                    Noun_comp += Mor[0]
                    if mi < len(Mor_1) - 1:
                        if Mor_1[mi + 1] != u"名詞":
                            # compound noun ended: register it
                            wakachi += Noun_comp + u" "
                            if Noun_comp not in terms:
                                terms.append(Noun_comp)
                            Noun_comp = u""
                    else:
                        # last morpheme: flush the compound
                        wakachi += Noun_comp + u" "
                        if Noun_comp not in terms:
                            terms.append(Noun_comp)
                        Noun_comp = u""
                elif (Mor_1[mi] != u"助詞" and Mor_1[mi] != u"助動詞"
                      and Mor[5] != u"サ変・スル" and Mor[2] != u"接尾"):
                    wakachi += Mor[0] + u" "
                    if Mor[0] not in terms:
                        terms.append(Mor[0])
        preR_id = Report_id
    documents.append(wakachi)  # flush the final report
    return terms, documents
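
# --- Minimal standalone sketch (toy data, no Language/MeCab): how
# extract_terms groups space-separated tokens ("wakachi") into one document
# per 報告書_id. The (id, token) pairs below are hypothetical.
_rows = [(1, u"潤滑油"), (1, u"漏れる"), (2, u"軸受"), (2, u"損傷")]
_documents, _wakachi, _pre_id = [], u"", _rows[0][0]
for _rid, _tok in _rows:
    if _rid != _pre_id:          # report id changed: flush the document
        _documents.append(_wakachi.strip())
        _wakachi = u""
    _wakachi += _tok + u" "
    _pre_id = _rid
_documents.append(_wakachi.strip())  # flush the final report
print _documents  # -> [u"潤滑油 漏れる", u"軸受 損傷"]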

def to_class(self, Noun, Verb):
    # look up the full noun first, then fall back to its tail morpheme,
    # then to the u"未登録" (unregistered) marker
    if Noun in self.Nclass:
        Nclasslist = self.Nclass[Noun]
    else:
        lan = Language(Noun)
        word = lan.getMorpheme()
        Noun_tail = word[-1][0]
        if Noun_tail in self.Nclass:
            Nclasslist = self.Nclass[Noun_tail]
        else:
            Nclasslist = [u"未登録"]
    if Verb in self.Vclass:
        Vclasslist = self.Vclass[Verb]
    else:
        Vclasslist = [u"未登録"]
        # print Verb
    # pair every noun class with every verb class
    return list(itertools.product(Nclasslist, Vclasslist))
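
# --- Minimal standalone sketch (toy thesaurus dicts, hypothetical entries):
# the lookup order used by to_class -- full noun, then its tail morpheme,
# then u"未登録" -- with itertools.product pairing every noun class with every
# verb class. The tail morpheme is passed in here instead of being computed
# with Language/MeCab.
import itertools

_Nclass = {u"油": [u"物質"]}        # hypothetical noun -> classes
_Vclass = {u"漏れる": [u"移動"]}    # hypothetical verb -> classes

def _demo_to_class(noun, noun_tail, verb):
    Nclasslist = _Nclass.get(noun) or _Nclass.get(noun_tail, [u"未登録"])
    Vclasslist = _Vclass.get(verb, [u"未登録"])
    return list(itertools.product(Nclasslist, Vclasslist))

# u"潤滑油" itself is unregistered, but its tail u"油" is:
print _demo_to_class(u"潤滑油", u"油", u"漏れる")  # -> [(u"物質", u"移動")]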

def Section_div(self, case_df, VC_Dc, thresold_perD):
    Record_id = dict()  # maps (報告書_id, 文_id) -> record id
    Record_id[(case_df.ix[0, :][u"報告書_id"], case_df.ix[0, :][u"文_id"])] = 0
    tail_key = -1
    for Report_id in case_df[u"報告書_id"].drop_duplicates():
        print u"Extracting Sec_id:", Report_id
        Noun_pre = dict()  # nouns seen per sentence id
        # print Report_id
        case_df_perR = case_df[case_df[u"報告書_id"] == Report_id]
        for first_Sen, Sentence_id in enumerate(case_df_perR[u"文_id"].drop_duplicates()):
            for line in case_df_perR[case_df[u"文_id"] == Sentence_id].iterrows():
                # print line[1][1]
                # print line[1][3]
                # collect the nouns filling this row's deep-case slots
                if line[1][1] not in Noun_pre.keys():
                    Noun_pre[line[1][1]] = [l for l in line[1][4:11].values if l != u" "]
                else:
                    Noun_pre[line[1][1]] = (Noun_pre[line[1][1]]
                                            + [l for l in line[1][4:11].values if l != u" "])
                # Pronoun completion (anaphora resolution)
                for di, l in enumerate(line[1][4:11].values):
                    if l != u" ":
                        Lan = Language(l)
                        outList = Lan.getMorpheme()
                        if set([u"代名詞"]).intersection(set([m[2] for m in outList])):
                            # pick the noun whose output vector has the smallest
                            # Euclidean distance to the pronoun's output vector
                            pronoun_vec = [np.array(out_perD[1])
                                           for out_perD in self.Dc.predict(l, u"", line[1][3])
                                           if np.argmax(np.array(out_perD[1])) == di]
                            if len(pronoun_vec) == 0:
                                pronoun_vec = [np.array(out_perD[1])
                                               for out_perD in self.Dc.predict(l, u"", line[1][3])]
                            Noun_out = [{Np: [np.array(output[1])
                                              for output in self.Dc.predict(Np, u"", line[1][3])]
                                         for Np in Np_list if Np not in line[1][4:11].values}
                                        for Np_list in [Noun_pre[line[1][1] - pre_i]
                                                        for pre_i in range(0, 2)
                                                        if line[1][1] - pre_i in Noun_pre.keys()]]
                            if len(Noun_out[0]) == 0:
                                del Noun_out[0]
                            if len(Noun_out) == 0:
                                break
                            Neuclid_perS = [{n: min([np.linalg.norm(pv - vec)
                                                     for vec in No[n] for pv in pronoun_vec])
                                             for n in No.keys()}
                                            for No in Noun_out]
                            Neuclid_min_perS = [(perS.keys()[perS.values().index(min(perS.values()))],
                                                 min(perS.values()))
                                                for perS in Neuclid_perS]
                            toNoun = [N_ed[0] for N_ed in Neuclid_min_perS
                                      if min([nmp[1] for nmp in Neuclid_min_perS]) == N_ed[1]]
                            case_df.ix[line[0], u"事象"] = case_df.ix[line[0], u"事象"].replace(l, toNoun[0])
                            Noun_pre[line[1][1]][Noun_pre[line[1][1]].index(l)] = toNoun[0]
                            # Disabled alternative ('''-commented in the original):
                            # choose the candidate noun whose predicted output is
                            # maximal for this deep case (index di) rather than
                            # the one nearest to the pronoun's output vector.
                # Fill unfilled deep cases (zero pronouns)
                Deep_cand = []
                for cand in [VC_Dc[VC] for VC in self.Dc.NV_class[1][line[1][3]] if VC in VC_Dc]:
                    Deep_cand += cand
                Count_perD = [Deep_cand.count(d) for d in self.Dc.DeepCaseList]
                # deep cases whose corpus count exceeds the mean
                mean_count = sum(Count_perD) / float(len(Count_perD))
                Dc_toV = [d for d, cnt in zip(self.Dc.DeepCaseList, Count_perD) if cnt > mean_count]
                Noun_zero = dict()
                for Dc_tmp in Dc_toV:
                    if line[1][Dc_tmp] == u" ":
                        Noun_out = [{Np: max([output[1][self.Dc.DeepCaseList.index(Dc_tmp)]
                                              for output in self.Dc.predict(Np, u"", line[1][3])])
                                     for Np in Np_list if Np not in line[1][4:11].values}
                                    for Np_list in [Noun_pre[line[1][1] - pre_i]
                                                    for pre_i in range(0, 2)
                                                    if line[1][1] - pre_i in Noun_pre.keys()]]
                        while {} in Noun_out:
                            Noun_out.remove({})
                        if len(Noun_out) == 0:
                            continue
                        MaxN_perS = [No[No.keys()[No.values().index(max(No.values()))]]
                                     for No in Noun_out]
                        SSen_rec = MaxN_perS.index(max(MaxN_perS))
                        if max(MaxN_perS) > thresold_perD[self.Dc.DeepCaseList.index(Dc_tmp)]:
                            Noun_zero[Dc_tmp] = Noun_out[SSen_rec].keys()[
                                Noun_out[SSen_rec].values().index(max(MaxN_perS))]
                if len(Noun_zero.keys()) > 0:
                    # rebuild the case-frame string with the filled-in nouns
                    case_zero = u""
                    for d, Noun_perD in zip(line[1][4:11].keys(), line[1][4:11].values):
                        if d in Noun_zero.keys():
                            case_zero += u" " + Noun_zero[d]
                        else:
                            case_zero += u" " + Noun_perD
                    case_zero += u" " + line[1][3]
                    case_zero = re.sub(r" +", u" ", case_zero.strip())
                    case_df.ix[line[0], u"事象"] = case_zero
                    for Noun_zero_tmp in Noun_zero.values():
                        Noun_pre[line[1][1]].append(Noun_zero_tmp)
            # Does this sentence share a noun with a preceding sentence?
            # (`line` still holds the last row of the sentence just processed)
            if first_Sen == 0 and tail_key != -1:
                Record_id[(Report_id, Sentence_id)] = Record_id[tail_key] + 1
                continue
            for pre_i in range(3, 0, -1):
                if line[1][1] - pre_i in Noun_pre.keys():
                    if set(Noun_pre[line[1][1]]).intersection(set(Noun_pre[line[1][1] - pre_i])):
                        # shared noun: pull the intervening sentences into the same record
                        for pre_j in range(pre_i, -1, -1):
                            if line[1][1] - pre_j in Noun_pre.keys():
                                Record_id[(Report_id, Sentence_id - pre_j)] = Record_id[
                                    (Report_id, line[1][1] - pre_i)]
                        break
                    else:
                        Record_id[(Report_id, Sentence_id)] = Record_id[
                            (Report_id, line[1][1] - pre_i)] + 1
            # fallback: walk further back until an already-assigned sentence is found
            while (Report_id, Sentence_id) not in Record_id.keys():
                pre_i += 1
                if (Report_id, Sentence_id - pre_i) in Record_id.keys():
                    Record_id[(Report_id, Sentence_id)] = Record_id[
                        (Report_id, line[1][1] - pre_i)] + 1
            if first_Sen == len(case_df_perR[u"文_id"].drop_duplicates()) - 1:
                tail_key = (Report_id, Sentence_id)
    case_df[u"レコード_id"] = [(i, j) for i, j in zip(case_df[u"報告書_id"], case_df[u"文_id"])]
    case_df[u"レコード_id"] = case_df[u"レコード_id"].map(lambda x: Record_id[x])
    return case_df
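
# --- Minimal standalone sketch (numpy only, made-up vectors): the
# pronoun-resolution rule in Section_div -- replace a pronoun with the
# candidate noun whose predicted deep-case output vector lies at the smallest
# Euclidean distance from the pronoun's. Vectors and nouns are hypothetical.
import numpy as np

_pronoun_vec = np.array([0.1, 0.8, 0.1])            # stand-in for Dc.predict output
_candidates = {u"軸受": np.array([0.2, 0.7, 0.1]),   # hypothetical noun vectors
               u"歯車": np.array([0.9, 0.05, 0.05])}
_dists = {n: np.linalg.norm(_pronoun_vec - v) for n, v in _candidates.items()}
_to_noun = min(_dists, key=_dists.get)
print _to_noun  # -> 軸受 (closest in the output space)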

def bunrui_frame(self, case_df, terms, idf_Treport, dist_method, threshould_dist):
    MorList = []
    Noun_comp = u""
    Noun_weight = 2.0  # compound nouns count double
    idf_Treport = Series(idf_Treport)
    zero = min(idf_Treport)
    if zero == 0:
        # replace zero idf values with half the smallest nonzero idf
        min_idf = 1.0
        for idf in idf_Treport:
            if idf < min_idf and idf != zero:
                min_idf = idf
        idf_Treport[idf_Treport == 0] = min_idf * 0.5
    for frame in case_df[u"事象"].drop_duplicates().values:
        MorList_tmp = {}  # term -> idf weight for this case frame
        for i, words in enumerate(frame.split(u" ")):
            # print words, u":"
            if i == len(frame.split(u" ")) - 1:
                # last token is the verb; strip a trailing u"する"
                if words[-2:] != u"する":
                    MorList_tmp[words] = idf_Treport[terms.index(words)]
                else:
                    MorList_tmp[words[:-2]] = idf_Treport[terms.index(words[:-2])]
                Noun_comp = u""  # reset to avoid carry-over (error avoidance)
            else:
                Lan = Language(words)
                outList = Lan.getMorpheme()
                Mor_1 = [m[1] for m in outList]  # POS column (avoids shadowing i)
                for mi, Mor in enumerate(outList):
                    if Mor_1[mi] == u"名詞" and Mor[2] != u"形容動詞語幹":
                        Noun_comp += Mor[0]
                        if mi < len(Mor_1) - 1:
                            if Mor_1[mi + 1] != u"名詞":
                                MorList_tmp[Noun_comp] = idf_Treport[terms.index(Noun_comp)] * Noun_weight
                                Noun_comp = u""
                        else:
                            MorList_tmp[Noun_comp] = idf_Treport[terms.index(Noun_comp)] * Noun_weight
                            Noun_comp = u""
                    elif (Mor_1[mi] != u"助詞" and Mor_1[mi] != u"助動詞"
                          and Mor[5] != u"サ変・スル" and Mor[2] != u"接尾"):
                        MorList_tmp[Mor[0]] = idf_Treport[terms.index(Mor[0])]
        MorList.append(MorList_tmp)
    # caseFrame = case_df[u"主体"] + u" " + case_df[u"起点"] + u" " + case_df[u"対象"] + u" "
    #     + case_df[u"状況"] + u" " + case_df[u"着点"] + u" " + case_df[u"手段"] + u" "
    #     + case_df[u"関係"] + u" " + case_df[u"動詞"]
    cf = [i for i in case_df[u"事象"].drop_duplicates()]
    # Build the unification dictionary from the similarity coefficient
    Wdist_index = []
    Wdist_column = []
    Wdist = []
    unifyList = {}  # case frame -> case frame it is unified into
    Case_freq = Counter(case_df[u"事象"])
    # frames containing an opposite-list word (e.g. 良好/正常/低下) are only
    # merged with frames containing that same word
    oppositeList = [u"良好", u"正常", u"低下"]
    print len(cf), len(MorList)
    for i, x in enumerate(MorList):
        print u"calculating distance...", i
        x_keys_set = set(x.keys())
        for j, y in enumerate(MorList[i + 1:]):
            j = i + j + 1  # absolute index of y in MorList
            y_keys_set = set(y.keys())
            # print j
            if (set(oppositeList).intersection(x_keys_set)
                    and not y_keys_set.issuperset(set(oppositeList).intersection(x_keys_set))):
                continue
            elif (set(oppositeList).intersection(y_keys_set)
                    and not x_keys_set.issuperset(set(oppositeList).intersection(y_keys_set))):
                continue
            # skip frames already scheduled for unification (the original
            # tested `x.keys() in unifyList.keys()`, which can never match
            # the frame-string keys)
            if cf[i] in unifyList or cf[j] in unifyList:
                continue
            if x_keys_set.intersection(y_keys_set) and cf[i] != cf[j]:
                sym = x_keys_set.symmetric_difference(y_keys_set)
                if len(sym) > 3:
                    continue
                # Disabled (slow) filter from the original: the symmetric-
                # difference morphemes must not contain two or more nouns.
                xy_set = dict(x.items() + y.items())
                xy_insec = x_keys_set.intersection(y_keys_set)
                w_all = 0.00
                if dist_method == u"Jaccard":
                    # Jaccard coefficient: weight of the union
                    for mor_val in xy_set.values():
                        w_all += mor_val
                elif dist_method == u"Simpson":
                    # Simpson coefficient: weight of the smaller set
                    if len(x.keys()) < len(y.keys()):
                        for mor_val in x.values():
                            w_all += mor_val
                    else:
                        for mor_val in y.values():
                            w_all += mor_val
                w_insec = 0.00
                for mor_val in xy_insec:
                    w_insec += xy_set[mor_val]
                dist_str = w_insec / w_all
                if dist_str >= threshould_dist:
                    # Disabled alternative from the original: unify into the
                    # more frequent case frame (via Case_freq).
                    # Active rule: unify into the case frame with fewer morphemes
                    if len(x_keys_set) < len(y_keys_set) and cf[j] not in unifyList.keys():
                        unifyList[cf[j]] = cf[i]
                    elif len(x_keys_set) > len(y_keys_set) and cf[i] not in unifyList.keys():
                        unifyList[cf[i]] = cf[j]
                    elif len(cf[i]) < len(cf[j]) and cf[j] not in unifyList.keys():
                        unifyList[cf[j]] = cf[i]
                    elif len(cf[i]) >= len(cf[j]) and cf[i] not in unifyList.keys():
                        unifyList[cf[i]] = cf[j]
                # print "%d:%s" % (i, cf[i]), "%d:%s" % (j, cf[j]), dist_str, w_insec, w_all
                Wdist_index.append(cf[i])
                Wdist_column.append(cf[j])
                Wdist.append(dist_str)
    Wdist = DataFrame(Wdist, index=[Wdist_index, Wdist_column], columns=[u"Similarity"])
    # apply the unification map repeatedly until a pass changes nothing
    # (insecset is snapshotted before mapping; the original snapshotted it
    # after, which made the loop stop after a single pass)
    fnc = lambda x: unifyList.get(x, x)
    insecset = set()
    while set(case_df[u"事象"]).intersection(set(unifyList.keys())):
        if insecset == set(case_df[u"事象"]).intersection(set(unifyList.keys())):
            break
        insecset = set(case_df[u"事象"]).intersection(set(unifyList.keys()))
        case_df[u"事象"] = case_df[u"事象"].map(fnc)
    return case_df, Wdist

def TNoun_extract(self, tripleFrame, NV_class):
    TW = TWordclass()
    # normalize particles via the TW.Particle_to mapping
    unify_particle = lambda x: TW.Particle_to.get(x, x)
    tripleFrame[u"助詞"] = tripleFrame[u"助詞"].map(unify_particle)
    triple_Treport = [[] for i in range(len(tripleFrame.columns) + 1)]
    for R_id, S_id, V_id, noun, particle, verb in zip(tripleFrame[u"報告書_id"],
                                                      tripleFrame[u"文_id"],
                                                      tripleFrame[u"動詞_id"],
                                                      tripleFrame[u"名詞"],
                                                      tripleFrame[u"助詞"],
                                                      tripleFrame[u"動詞"]):
        # if R_id > 300: break
        # print noun, particle, verb
        print u"Extracting triple_Treport:", R_id, S_id, V_id
        Lan = Language(noun)
        outList = Lan.getMorpheme()
        Mor_1 = [m[1] for m in outList]  # POS
        Mor_2 = [m[2] for m in outList]  # POS subtype
        # split the noun phrase into compound nouns plus their tail morphemes
        noun_comp_tmp = u""
        noun_comp = []
        noun_tail = []
        noun_Pos1 = []
        noun_Pos2 = []
        for mi, Pos in enumerate(Mor_1):
            if mi == len(Mor_1) - 1:
                noun_comp.append(noun_comp_tmp + outList[mi][0])
                noun_tail.append(outList[mi][0])
                noun_Pos1.append(outList[mi][1])
                noun_Pos2.append(outList[mi][2])
                break
            if Pos == u"名詞":
                if Mor_1[mi + 1] == u"名詞":
                    noun_comp_tmp += outList[mi][0]
                else:
                    noun_comp.append(noun_comp_tmp + outList[mi][0])
                    noun_tail.append(outList[mi][0])
                    noun_Pos1.append(outList[mi][1])
                    noun_Pos2.append(outList[mi][2])
                    noun_comp_tmp = u""
        TNneed = False
        TVneed = False
        # Decide whether the noun is tribology-related
        for cni, nounMor in enumerate(noun_comp):
            if noun_Pos2[cni] == u"代名詞":
                TNneed = True
                break
            if nounMor in NV_class[0].keys():
                noun_target = nounMor
            elif noun_tail[cni] in NV_class[0].keys():
                noun_target = noun_tail[cni]
            else:
                continue
            for Nclass in NV_class[0][noun_target]:
                if Nclass in TW.TNounclass_all:
                    TNneed = True
                    break
                elif Nclass in TW.TNounclass_Nopart.keys():
                    TNneed = True
                    for TNoun_Nopart in TW.TNounclass_Nopart[Nclass]:
                        if TNoun_Nopart in noun_target:
                            TNneed = False
                        elif Nclass == u"様相" and noun_Pos2[cni] == u"形容動詞語幹":
                            TNneed = False
                elif Nclass in TW.TNounclass_part.keys():
                    for TNoun_part in TW.TNounclass_part[Nclass]:
                        if TNoun_part in noun_target:
                            TNneed = True
                            break
                else:
                    continue
        # Decide whether the verb is tribology-related
        if TNneed:
            if verb in NV_class[1].keys():
                for Vclass in NV_class[1][verb]:
                    if Vclass in TW.TVerbclass_all:
                        TVneed = True
                        break
                    elif Vclass in TW.TVerbclass_Nopart.keys():
                        TVneed = True
                        for TVerb_Nopart in TW.TVerbclass_Nopart[Vclass]:
                            if TVerb_Nopart in verb:
                                TVneed = False
                    elif Vclass in TW.TVerbclass_part.keys():
                        for TVerb_part in TW.TVerbclass_part[Vclass]:
                            if TVerb_part in verb:
                                TVneed = True
                                break
                    else:
                        continue
        if TNneed and TVneed:
            # Split coordinated (parallel) nouns
            Mor_connect = [[u"接続詞"], [u"読点", u"並立助詞", u"接続助詞"]]
            if (set(Mor_connect[0]).intersection(set(Mor_1))
                    or set(Mor_connect[1]).intersection(set(Mor_2))):
                noun_con = u""
                for oi, out in enumerate(outList):
                    if out[1] not in Mor_connect[0] and out[2] not in Mor_connect[1]:
                        if out[0] != u"等":
                            noun_con += out[0]
                    else:
                        # connective morpheme: flush the accumulated noun
                        # print out[0], noun_con
                        triple_Treport[0].append(R_id)
                        triple_Treport[1].append(S_id)
                        triple_Treport[2].append(V_id)
                        triple_Treport[3].append(noun_con)
                        triple_Treport[4].append(particle)
                        triple_Treport[5].append(verb)
                        noun_con = u""
                        continue
                    if oi == len(outList) - 1:
                        triple_Treport[0].append(R_id)
                        triple_Treport[1].append(S_id)
                        triple_Treport[2].append(V_id)
                        triple_Treport[3].append(noun_con)
                        triple_Treport[4].append(particle)
                        triple_Treport[5].append(verb)
                        noun_con = u""
            else:
                triple_Treport[0].append(R_id)
                triple_Treport[1].append(S_id)
                triple_Treport[2].append(V_id)
                triple_Treport[3].append(noun)
                triple_Treport[4].append(particle)
                triple_Treport[5].append(verb)
    triple_Treportdict = {
        tripleFrame.columns[0]: triple_Treport[0],
        tripleFrame.columns[1]: triple_Treport[1],
        tripleFrame.columns[2]: triple_Treport[2],
        tripleFrame.columns[3]: triple_Treport[3],
        tripleFrame.columns[4]: triple_Treport[4],
        tripleFrame.columns[5]: triple_Treport[5],
    }
    tripleFrame_Treport = DataFrame(triple_Treportdict, columns=[i for i in tripleFrame.columns])
    # normalize verbs via the TW.Verb_unify mapping
    fvu = lambda x: TW.Verb_unify.get(x, x)
    tripleFrame_Treport[u"動詞"] = tripleFrame_Treport[u"動詞"].map(fvu)
    return tripleFrame_Treport
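
# --- Minimal standalone sketch (toy mapping, pandas only): the dict-backed
# normalization used in TNoun_extract for particles (TW.Particle_to) and
# verbs (TW.Verb_unify) -- Series.map with dict.get falls back to the
# original value when a token has no entry. The table below is hypothetical.
from pandas import Series

_Particle_to = {u"へ": u"に"}        # hypothetical unification table
_particles = Series([u"へ", u"が", u"に"])
print _particles.map(lambda x: _Particle_to.get(x, x)).tolist()
# -> [u"に", u"が", u"に"]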