Example #1
    def NV_extract(self):
        NList = []
        VList = []
        for i in range(1, self.s.nrows):  # spreadsheet rows, skipping row 0
            print i
            noenc = self.delete_unnecc(i)
            lan = Language(noenc)
            if len(lan.str) > 4000:  # skip overly long texts
                continue
            word = lan.getMorpheme()
            tmpNoun = u""
            NN = 0
            for j, line in enumerate(word):  # reuse the parsed morphemes
                if line[1] == u"動詞":  # verb
                    if line[7] == u"する":  # suru-verb: prepend the preceding noun
                        VList.append(word[j - 1][0] + line[7])
                    else:
                        VList.append(line[7])  # base form

                if line[1] == u"名詞" and j != len(word) - 1:  # non-final noun
                    if word[j + 1][1] == u"名詞":
                        # accumulate compound nouns of up to four morphemes
                        tmpNoun += line[0]
                        NN += 1
                        if NN > 3:
                            tmpNoun = u""
                            NN = 0
                            continue
                    elif word[j + 1][7] != u"する":
                        NList.append(tmpNoun + line[0])
                        tmpNoun = u""
                        NN = 0
        return NList, VList
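
None of these listings show the Language class itself. Assuming it wraps a MeCab-style tokenizer whose getMorpheme() returns rows of the form [surface, POS, POS subtype, ..., base form at index 7], the verb extraction above can be reproduced on hand-built rows; the morpheme data below is made up for illustration:

    # -*- coding: utf-8 -*-
    # Hypothetical MeCab-style rows: [surface, POS, subtype, ..., base form]
    rows = [
        [u"摩耗", u"名詞", u"サ変接続", u"", u"", u"", u"", u"摩耗"],
        [u"する", u"動詞", u"自立", u"", u"", u"", u"", u"する"],
        [u"部品", u"名詞", u"一般", u"", u"", u"", u"", u"部品"],
    ]
    verbs = []
    for j, line in enumerate(rows):
        if line[1] == u"動詞":  # verb
            # suru-verbs keep the preceding noun, as in NV_extract above
            verbs.append(rows[j - 1][0] + line[7] if line[7] == u"する" else line[7])
    print(verbs)  # -> [u"摩耗する"]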
Example #2
    def extract_terms(self, case_df):
        Noun_comp = u""
        wakachi = u""
        preR_id = 1
        terms = []
        documents = []
        case_cols = [u"主体", u"起点", u"対象", u"状況", u"着点", u"手段", u"関係", u"動詞"]
        for Report_id, frame in zip(case_df[u"報告書_id"],
                                    case_df.loc[:, case_cols].values):
            if preR_id != Report_id:
                # a new report starts: flush the tokenised document so far
                documents.append(wakachi)
                print Report_id
                wakachi = u""
            if frame[7][-2:] != u"する":
                wakachi += frame[7] + u" "
                if frame[7] not in terms:
                    terms.append(frame[7])
            else:
                # suru-verb: register the stem without the trailing u"する"
                wakachi += frame[7][:-2] + u" "
                if frame[7][:-2] not in terms:
                    terms.append(frame[7][:-2])
            for i in range(0, 7):
                if frame[i] == u' ':  # empty deep case
                    continue
                Lan = Language(frame[i])
                outList = Lan.getMorpheme()
                Mor_1 = [mor[1] for mor in outList]  # POS tags (do not shadow i)
                # disabled filter: skip frames containing conjunctions or symbols
                # if (u"接続詞" in Mor_1) | (u"記号" in Mor_1):
                #    continue
                for mi, Mor in enumerate(outList):
                    if Mor_1[mi] == u"名詞" and Mor[2] != u"形容動詞語幹":
                        # noun (not an adjectival-noun stem): build compounds
                        Noun_comp += Mor[0]
                        if mi < len(Mor_1) - 1:
                            if Mor_1[mi + 1] != u"名詞":
                                wakachi += Noun_comp + u" "
                                if Noun_comp not in terms:
                                    terms.append(Noun_comp)
                                Noun_comp = u""
                        else:
                            wakachi += Noun_comp + u" "
                            if Noun_comp not in terms:
                                terms.append(Noun_comp)
                            Noun_comp = u""
                    elif (Mor_1[mi] != u"助詞" and Mor_1[mi] != u"助動詞"
                            and Mor[5] != u"サ変・スル" and Mor[2] != u"接尾"):
                        # keep everything except particles, auxiliaries,
                        # suru conjugations and suffixes
                        wakachi += Mor[0] + u" "
                        if Mor[0] not in terms:
                            terms.append(Mor[0])

            preR_id = Report_id
        documents.append(wakachi)
        return terms, documents
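
extract_terms returns the vocabulary and one space-joined token string per report. A plausible next step, and the likely origin of the idf_Treport argument used in Example #5, is an IDF over those documents; the function below is a sketch under that assumption (name and smoothing are mine, not the original author's):

    import math

    def idf_over_documents(terms, documents):
        # document frequency per term over whitespace-tokenised documents;
        # +1 smoothing and the log(1 + ...) form (kept positive) are assumptions
        doc_tokens = [set(doc.split(u" ")) for doc in documents]
        N = float(len(documents))
        return [math.log(1.0 + N / (1 + sum(1 for d in doc_tokens if t in d)))
                for t in terms]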
Example #3
    def to_class(self, Noun, Verb):
        # look the noun up in the noun-class thesaurus; if the full
        # (compound) noun is unknown, fall back to its last morpheme
        if Noun in self.Nclass:
            Nclasslist = self.Nclass[Noun]
        else:
            lan = Language(Noun)
            word = lan.getMorpheme()
            Noun_tail = word[-1][0]
            if Noun_tail in self.Nclass:
                Nclasslist = self.Nclass[Noun_tail]
            else:
                Nclasslist = [u"未登録"]  # "unregistered"
        if Verb in self.Vclass:
            Vclasslist = self.Vclass[Verb]
        else:
            Vclasslist = [u"未登録"]
        # every (noun class, verb class) pair
        return list(itertools.product(Nclasslist, Vclasslist))
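
The return value is simply the Cartesian product of the two class lists, e.g. with toy thesaurus entries (the class names below are made up):

    import itertools

    Nclasslist = [u"部品", u"金属"]  # hypothetical noun classes
    Vclasslist = [u"破損"]           # hypothetical verb class
    print(list(itertools.product(Nclasslist, Vclasslist)))
    # -> [(u"部品", u"破損"), (u"金属", u"破損")]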
Example #4
    def Section_div(self, case_df, VC_Dc, threshold_perD):

        Record_id = dict()  # (report_id, sentence_id) -> record id
        Record_id[(case_df.ix[0, :][u"報告書_id"], case_df.ix[0, :][u"文_id"])] = 0
        tail_key = -1
        for Report_id in case_df[u"報告書_id"].drop_duplicates():
            print u"Extracting Sec_id:", Report_id
            Noun_pre = dict()  # sentence_id -> nouns seen in that sentence
            case_df_perR = case_df[case_df[u"報告書_id"] == Report_id]
            for first_Sen, Sentence_id in enumerate(case_df_perR[u"文_id"].drop_duplicates()):

                for line in case_df_perR[case_df_perR[u"文_id"] == Sentence_id].iterrows():
                    # columns 4-10 hold the seven deep-case nouns
                    if line[1][1] not in Noun_pre:
                        Noun_pre[line[1][1]] = [l for l in line[1][4:11].values if l != u" "]
                    else:
                        Noun_pre[line[1][1]] += [l for l in line[1][4:11].values if l != u" "]
                    # pronoun resolution
                    for di, l in enumerate(line[1][4:11].values):
                        if l != u" ":
                            Lan = Language(l)
                            outList = Lan.getMorpheme()
                            if u"代名詞" in [out[2] for out in outList]:  # pronoun
                                # choose the noun whose output vector has the smallest
                                # Euclidean distance to the pronoun's output vector
                                pronoun_vec = [np.array(out_perD[1]) for out_perD in self.Dc.predict(l, u"", line[1][3]) if
                                               np.argmax(np.array(out_perD[1])) == di]
                                if len(pronoun_vec) == 0:
                                    pronoun_vec = [np.array(out_perD[1]) for out_perD in self.Dc.predict(l, u"", line[1][3])]
                                Noun_out = [
                                    {Np: [np.array(output[1]) for output in self.Dc.predict(Np, u"", line[1][3])] for Np in
                                     Np_list if Np not in line[1][4:11].values} for Np_list in
                                    [Noun_pre[line[1][1] - pre_i] for pre_i in range(0, 2) if
                                     line[1][1] - pre_i in Noun_pre]]

                                while {} in Noun_out:  # drop sentences with no candidates
                                    Noun_out.remove({})
                                if len(Noun_out) == 0:
                                    break

                                Neuclid_perS = [
                                    {n: min([np.linalg.norm(pv - vec) for vec in No[n] for pv in pronoun_vec]) for n in
                                     No.keys()} for No in Noun_out]
                                Neuclid_min_perS = [(perS.keys()[perS.values().index(min(perS.values()))],
                                                     min(perS.values())) for perS in Neuclid_perS]
                                toNoun = [N_ed[0] for N_ed in Neuclid_min_perS if
                                          min([nmp[1] for nmp in Neuclid_min_perS]) == N_ed[1]]
                                case_df.ix[line[0], u"事象"] = case_df.ix[line[0], u"事象"].replace(l, toNoun[0])
                                Noun_pre[line[1][1]][Noun_pre[line[1][1]].index(l)] = toNoun[0]

                                '''
                                # alternative: choose the noun with the maximum
                                # output for this deep case
                                Noun_out = [{Np: max([output[1][di] for output in self.Dc.predict(Np, u"", line[1][3])]) for Np in Np_list if Np not in line[1][4:11].values} for Np_list in [Noun_pre[line[1][1] - pre_i] for pre_i in range(0, 2) if
                                                              line[1][1] - pre_i in Noun_pre]]
                                if len(Noun_out[0]) == 0:
                                    del Noun_out[0]
                                if len(Noun_out) == 0:
                                    break
                                MaxN_perS = [No[No.keys()[No.values().index(max(No.values()))]] for No in Noun_out]
                                SSen_rec = MaxN_perS.index(max(MaxN_perS))
                                toNoun = Noun_out[SSen_rec].keys()[Noun_out[SSen_rec].values().index(max(MaxN_perS))]
                                case_df.ix[line[0], u"事象"] = case_df.ix[line[0], u"事象"].replace(l, toNoun)
                                Noun_pre[line[1][1]][Noun_pre[line[1][1]].index(l)] = toNoun
                                '''
                    # fill deep cases left empty (zero pronouns)
                    Deep_cand = []
                    for i in [VC_Dc[VC] for VC in self.Dc.NV_class[1][line[1][3]] if VC in VC_Dc]:
                        Deep_cand += i
                    Count_perD = [Deep_cand.count(d) for d in self.Dc.DeepCaseList]
                    # deep cases occurring more often than average for this verb
                    Dc_toV = [d for d, cnt in zip(self.Dc.DeepCaseList, Count_perD)
                              if sum(Count_perD) / float(len(Count_perD)) < cnt]

                    Noun_zero = dict()
                    for Dc_tmp in Dc_toV:
                        if line[1][Dc_tmp] == u" ":
                            # score candidate nouns from this and the previous sentence
                            Noun_out = [{Np: max([output[1][self.Dc.DeepCaseList.index(Dc_tmp)]
                                                  for output in self.Dc.predict(Np, u"", line[1][3])])
                                         for Np in Np_list if Np not in line[1][4:11].values}
                                        for Np_list in [Noun_pre[line[1][1] - pre_i] for pre_i in range(0, 2)
                                                        if line[1][1] - pre_i in Noun_pre]]
                            while {} in Noun_out:
                                Noun_out.remove({})
                            if len(Noun_out) == 0:
                                continue
                            MaxN_perS = [No[No.keys()[No.values().index(max(No.values()))]] for No in
                                         Noun_out]
                            SSen_rec = MaxN_perS.index(max(MaxN_perS))
                            if max(MaxN_perS) > threshold_perD[self.Dc.DeepCaseList.index(Dc_tmp)]:
                                Noun_zero[Dc_tmp] = Noun_out[SSen_rec].keys()[
                                    Noun_out[SSen_rec].values().index(max(MaxN_perS))]
                    if len(Noun_zero) > 0:
                        # rebuild the case-frame string with the filled-in nouns
                        case_zero = u""
                        for d, Noun_perD in zip(line[1][4:11].keys(), line[1][4:11].values):
                            if d in Noun_zero.keys():
                                case_zero += u" " + Noun_zero[d]
                            else:
                                case_zero += u" " + Noun_perD
                        case_zero += u" " + line[1][3]
                        case_zero = re.sub(r" +", u" ", case_zero.strip())

                        case_df.ix[line[0], u"事象"] = case_zero
                        for Noun_zero_tmp in Noun_zero.values():
                            Noun_pre[line[1][1]].append(Noun_zero_tmp)

                # does this sentence share a noun with a recent sentence?
                if first_Sen == 0 and tail_key != -1:
                    # first sentence of a new report starts a new record
                    Record_id[(Report_id, Sentence_id)] = Record_id[tail_key] + 1
                    continue
                for pre_i in range(3, 0, -1):
                    if line[1][1] - pre_i in Noun_pre:
                        if set(Noun_pre[line[1][1]]).intersection(set(Noun_pre[line[1][1] - pre_i])):
                            # shared noun: pull this sentence (and the gap) into the same record
                            for pre_j in range(pre_i, -1, -1):
                                if line[1][1] - pre_j in Noun_pre:
                                    Record_id[(Report_id, Sentence_id - pre_j)] = Record_id[
                                        (Report_id, line[1][1] - pre_i)]
                            break
                        else:
                            Record_id[(Report_id, Sentence_id)] = Record_id[(Report_id, line[1][1] - pre_i)] + 1
                # back off further until some earlier sentence of the report is found
                while (Report_id, Sentence_id) not in Record_id:
                    pre_i += 1
                    if (Report_id, Sentence_id - pre_i) in Record_id:
                        Record_id[(Report_id, Sentence_id)] = Record_id[(Report_id, line[1][1] - pre_i)] + 1
                if first_Sen == len(case_df_perR[u"文_id"].drop_duplicates()) - 1:
                    tail_key = (Report_id, Sentence_id)

        case_df[u"レコード_id"] = [(i, j) for i, j in zip(case_df[u"報告書_id"], case_df[u"文_id"])]
        case_df[u"レコード_id"] = case_df[u"レコード_id"].map(lambda x: Record_id[x])
        return case_df
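
The pronoun-resolution core of Section_div picks the candidate noun whose predicted output vectors lie closest (Euclidean) to the pronoun's. Isolated from the DataFrame plumbing, and with a made-up function name and toy vectors, that selection looks roughly like this:

    import numpy as np

    def nearest_noun(pronoun_vecs, noun_vecs):
        # noun_vecs maps each candidate noun to a list of output vectors;
        # score each noun by its closest distance to any pronoun vector
        best = min(((noun, min(np.linalg.norm(pv - v)
                               for v in vecs for pv in pronoun_vecs))
                    for noun, vecs in noun_vecs.items()),
                   key=lambda pair: pair[1])
        return best[0]

    cands = {u"軸受": [np.array([0.9, 0.1])], u"油": [np.array([0.1, 0.9])]}
    print(nearest_noun([np.array([0.8, 0.2])], cands))  # -> 軸受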
Example #5
    def bunrui_frame(self, case_df, terms, idf_Treport, dist_method, threshold_dist):
        MorList = []
        Noun_comp = u""
        Noun_weight = 2.0  # compound nouns get double idf weight
        idf_Treport = Series(idf_Treport)
        # replace zero idf values by half the smallest nonzero idf
        if min(idf_Treport) == 0:
            min_idf = 1.0
            for idf in idf_Treport:
                if idf < min_idf and idf != 0:
                    min_idf = idf
            idf_Treport[idf_Treport == 0] = min_idf * 0.5


        for frame in case_df[u"事象"].drop_duplicates().values:
            MorList_tmp = {}
            tokens = frame.split(u" ")
            for i, words in enumerate(tokens):
                if i == len(tokens) - 1:
                    # last token is the verb; strip a trailing u"する"
                    if words[-2:] != u"する":
                        MorList_tmp[words] = idf_Treport[terms.index(words)]
                    else:
                        MorList_tmp[words[:-2]] = idf_Treport[terms.index(words[:-2])]
                    Noun_comp = u""  # reset to avoid carry-over errors
                else:
                    Lan = Language(words)
                    outList = Lan.getMorpheme()
                    Mor_1 = [mor[1] for mor in outList]  # POS tags (do not shadow i)
                    for mi, Mor in enumerate(outList):
                        if Mor_1[mi] == u"名詞" and Mor[2] != u"形容動詞語幹":
                            Noun_comp += Mor[0]
                            if mi < len(Mor_1) - 1:
                                if Mor_1[mi + 1] != u"名詞":
                                    # compound noun complete: weight its idf
                                    MorList_tmp[Noun_comp] = idf_Treport[terms.index(Noun_comp)] * Noun_weight
                                    Noun_comp = u""
                            else:
                                MorList_tmp[Noun_comp] = idf_Treport[terms.index(Noun_comp)] * Noun_weight
                                Noun_comp = u""
                        elif Mor_1[mi] != u"助詞" and Mor_1[mi] != u"助動詞" and Mor[5] != u"サ変・スル" and Mor[2] != u"接尾":
                            MorList_tmp[Mor[0]] = idf_Treport[terms.index(Mor[0])]

            MorList.append(MorList_tmp)

        # caseFrame (disabled): space-joined concatenation of the eight case
        # columns; the precomputed u"事象" column is used instead
        cf = [i for i in case_df[u"事象"].drop_duplicates()]

        # build the unification dictionary via the Jaccard coefficient
        Wdist_index = []
        Wdist_column = []
        Wdist = []
        unifyList = {}
        Case_freq = Counter(case_df[u"事象"])
        # do not merge when one frame's verb is in the opposite-word list
        oppositeList = [u"良好", u"正常", u"低下"]  # "good", "normal", "decline"

        print len(cf), len(MorList)
        for i, x in enumerate(MorList):
            print u"calculating distance... ", i
            x_keys_set = set(x.keys())
            for j, y in enumerate(MorList[i + 1:]):
                j = i + j + 1
                y_keys_set = set(y.keys())
                # never merge frames whose opposite-list words differ
                if set(oppositeList).intersection(x_keys_set) and not y_keys_set.issuperset(set(oppositeList).intersection(x_keys_set)):
                    continue
                elif set(oppositeList).intersection(y_keys_set) and not x_keys_set.issuperset(set(oppositeList).intersection(y_keys_set)):
                    continue

                # skip frames already scheduled for unification
                if (cf[i] in unifyList) or (cf[j] in unifyList):
                    continue
                if x_keys_set.intersection(y_keys_set) and cf[i] != cf[j]:

                    sym = x_keys_set.symmetric_difference(y_keys_set)
                    if len(sym) > 3:
                        continue
                    # the symmetric difference must not contain two or more
                    # nouns (disabled: too slow)
                    '''
                    Mor_sd = [(Language(sdm).getMorpheme().pop()[1], Language(sdm).getMorpheme().pop()[2]) for sdm in sym]

                    for Mor_set in [(u"名詞", u"サ変接続"), (u"名詞", u"形容動詞語幹")]:
                        while Mor_set in Mor_sd:
                            Mor_sd.remove(Mor_set)

                    if [ms[0] for ms in Mor_sd].count(u"名詞") < 2:
                    '''
                    xy_set = dict(x.items() + y.items())
                    xy_insec = x_keys_set.intersection(y_keys_set)
                    w_all = 0.00
                    if dist_method == u"Jaccard":
                        # weighted Jaccard: denominator is the union's idf mass
                        for mor_val in xy_set.values():
                            w_all += mor_val
                    elif dist_method == u"Simpson":
                        # weighted Simpson: denominator is the idf mass of the
                        # frame with fewer morphemes
                        if len(x.keys()) < len(y.keys()):
                            for mor_val in x.values():
                                w_all += mor_val
                        else:
                            for mor_val in y.values():
                                w_all += mor_val


                    w_insec = 0.00
                    for mor_key in xy_insec:
                        w_insec += xy_set[mor_key]
                    dist_str = w_insec / w_all
                    if dist_str >= threshold_dist:
                        '''
                        if dist_method == u"Jaccard":
                            # unify into the more frequent case frame
                            if Case_freq[cf[j]] <= Case_freq[cf[i]] and cf[i] not in unifyList.keys():
                                unifyList[cf[i]] = cf[j]
                            elif Case_freq[cf[j]] > Case_freq[cf[i]] and cf[j] not in unifyList.keys():
                                unifyList[cf[j]] = cf[i]

                        elif dist_method == u"Simpson":
                        '''
                        # unify into the case frame with fewer morphemes
                        if len(x_keys_set) < len(y_keys_set) and cf[j] not in unifyList:
                            unifyList[cf[j]] = cf[i]
                        elif len(x_keys_set) > len(y_keys_set) and cf[i] not in unifyList:
                            unifyList[cf[i]] = cf[j]
                        elif len(cf[i]) < len(cf[j]) and cf[j] not in unifyList:
                            unifyList[cf[j]] = cf[i]
                        elif len(cf[i]) >= len(cf[j]) and cf[i] not in unifyList:
                            unifyList[cf[i]] = cf[j]
                        Wdist_index.append(cf[i])
                        Wdist_column.append(cf[j])
                        Wdist.append(dist_str)
                        Wdist_index.append(cf[i])
                        Wdist_column.append(cf[j])
                        Wdist.append(dist_str)

        Wdist = DataFrame(Wdist, index=[Wdist_index, Wdist_column], columns=[u"Similarity"])

        # apply the unification map until it reaches a fixed point
        fnc = lambda x: unifyList.get(x, x)
        insecset = set()
        while set(case_df[u"事象"]).intersection(set(unifyList.keys())):
            if insecset == set(case_df[u"事象"]).intersection(set(unifyList.keys())):
                break  # no further progress (cyclic mappings)
            case_df[u"事象"] = case_df[u"事象"].map(fnc)
            insecset = set(case_df[u"事象"]).intersection(set(unifyList.keys()))
        return case_df, Wdist
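
Both similarity options weight morphemes by idf: the Jaccard variant divides the intersection's idf mass by the union's, the Simpson variant by the mass of the frame with fewer morphemes. A worked example with made-up weights:

    # toy idf-weighted frames (hypothetical values)
    x = {u"軸受": 2.0, u"摩耗": 1.5, u"発生": 0.5}
    y = {u"軸受": 2.0, u"損傷": 1.2, u"発生": 0.5}
    xy_union = dict(list(x.items()) + list(y.items()))
    insec = set(x) & set(y)                    # {u"軸受", u"発生"}
    w_insec = sum(xy_union[k] for k in insec)  # 2.0 + 0.5 = 2.5
    jaccard = w_insec / sum(xy_union.values()) # 2.5 / 5.2 ~ 0.48
    smaller = x if len(x) < len(y) else y      # tie: falls through to y
    simpson = w_insec / sum(smaller.values())  # 2.5 / 3.7 ~ 0.68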
Example #6
    def TNoun_extract(self, tripleFrame, NV_class):
        TW = TWordclass()
        unify_particle = lambda x: TW.Particle_to.get(x, x)
        tripleFrame[u"助詞"] = tripleFrame[u"助詞"].map(unify_particle)
        triple_Treport = [[] for _ in range(len(tripleFrame.columns) + 1)]  # only indices 0-5 are used below
        for R_id, S_id, V_id, noun, particle, verb in zip(
                tripleFrame[u"報告書_id"], tripleFrame[u"文_id"], tripleFrame[u"動詞_id"],
                tripleFrame[u"名詞"], tripleFrame[u"助詞"], tripleFrame[u"動詞"]):
            print "Extracting triple_Treport:", R_id, S_id, V_id
            Lan = Language(noun)
            outList = Lan.getMorpheme()
            Mor_1 = [mor[1] for mor in outList]  # POS
            Mor_2 = [mor[2] for mor in outList]  # POS subtype
            noun_comp_tmp = u""
            noun_comp = []
            noun_tail = []
            noun_Pos1 = []
            noun_Pos2 = []
            # rebuild compound nouns; remember each compound's final morpheme and POS
            for mi, Pos in enumerate(Mor_1):
                if mi == len(Mor_1) - 1:
                    noun_comp.append(noun_comp_tmp + outList[mi][0])
                    noun_tail.append(outList[mi][0])
                    noun_Pos1.append(outList[mi][1])
                    noun_Pos2.append(outList[mi][2])
                    break
                if Pos == u"名詞":
                    if Mor_1[mi + 1] == u"名詞":
                        noun_comp_tmp += outList[mi][0]
                    else:
                        noun_comp.append(noun_comp_tmp + outList[mi][0])
                        noun_tail.append(outList[mi][0])
                        noun_Pos1.append(outList[mi][1])
                        noun_Pos2.append(outList[mi][2])
                        noun_comp_tmp = u""


            TNneed = False
            TVneed = False

            # is the noun tribology-related?
            for cni, nounMor in enumerate(noun_comp):
                if noun_Pos2[cni] == u"代名詞":  # pronouns are always kept
                    TNneed = True
                    break
                if nounMor in NV_class[0].keys():
                    noun_target = nounMor
                elif noun_tail[cni] in NV_class[0].keys():
                    noun_target = noun_tail[cni]
                else:
                    continue

                for Nclass in NV_class[0][noun_target]:
                    if Nclass in TW.TNounclass_all:
                        TNneed = True
                        break
                    elif Nclass in TW.TNounclass_Nopart.keys():
                        TNneed = True
                        for TNoun_Nopart in TW.TNounclass_Nopart[Nclass]:
                            if TNoun_Nopart in noun_target:
                                TNneed = False
                            elif Nclass == u"様相" and noun_Pos2[cni] == u"形容動詞語幹":
                                TNneed = False

                    elif Nclass in TW.TNounclass_part.keys():
                        for TNoun_part in TW.TNounclass_part[Nclass]:
                            if TNoun_part in noun_target:
                                TNneed = True
                                break
                    else:
                        continue
            # is the verb tribology-related?
            if TNneed:
                if verb in NV_class[1].keys():
                    for Vclass in NV_class[1][verb]:
                        if Vclass in TW.TVerbclass_all:
                            TVneed = True
                            break
                        elif Vclass in TW.TVerbclass_Nopart.keys():
                            TVneed = True
                            for TVerb_Nopart in TW.TVerbclass_Nopart[Vclass]:
                                if TVerb_Nopart in verb:
                                    TVneed = False
                        elif Vclass in TW.TVerbclass_part.keys():
                            for TVerb_part in TW.TVerbclass_part[Vclass]:
                                if TVerb_part in verb:
                                    TVneed = True
                                    break
                        else:
                            continue

            if TNneed and TVneed:
                # split nouns coordinated by conjunctions, commas or
                # coordinating particles
                Mor_connect = [[u"接続詞"], [u"読点", u"並立助詞", u"接続助詞"]]
                if set(Mor_connect[0]).intersection(set(Mor_1)) or set(Mor_connect[1]).intersection(set(Mor_2)):
                    noun_con = u""
                    for oi, out in enumerate(outList):
                        if out[1] not in Mor_connect[0] and out[2] not in Mor_connect[1]:
                            if out[0] != u"等":  # skip the "etc." marker
                                noun_con += out[0]
                        else:
                            # connector reached: emit the noun accumulated so far
                            triple_Treport[0].append(R_id)
                            triple_Treport[1].append(S_id)
                            triple_Treport[2].append(V_id)
                            triple_Treport[3].append(noun_con)
                            triple_Treport[4].append(particle)
                            triple_Treport[5].append(verb)
                            noun_con = u""
                            continue
                        if oi == len(outList) - 1:  # flush the trailing noun
                            triple_Treport[0].append(R_id)
                            triple_Treport[1].append(S_id)
                            triple_Treport[2].append(V_id)
                            triple_Treport[3].append(noun_con)
                            triple_Treport[4].append(particle)
                            triple_Treport[5].append(verb)
                            noun_con = u""

                else:
                    triple_Treport[0].append(R_id)
                    triple_Treport[1].append(S_id)
                    triple_Treport[2].append(V_id)
                    triple_Treport[3].append(noun)
                    triple_Treport[4].append(particle)
                    triple_Treport[5].append(verb)

        triple_Treportdict = {
            tripleFrame.columns[0]: triple_Treport[0],
            tripleFrame.columns[1]: triple_Treport[1],
            tripleFrame.columns[2]: triple_Treport[2],
            tripleFrame.columns[3]: triple_Treport[3],
            tripleFrame.columns[4]: triple_Treport[4],
            tripleFrame.columns[5]: triple_Treport[5],
        }
        tripleFrame_Treport = DataFrame(triple_Treportdict,
                                        columns=list(tripleFrame.columns))

        # unify verb spelling variants via the TWordclass dictionary
        fvu = lambda x: TW.Verb_unify.get(x, x)
        tripleFrame_Treport[u"動詞"] = tripleFrame_Treport[u"動詞"].map(fvu)
        return tripleFrame_Treport
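
Both unification steps in this example (Particle_to at the top, Verb_unify at the bottom) are the same idiom: map a column through dict.get(x, x) so listed values are replaced and everything else passes through unchanged. With a made-up mapping:

    from pandas import DataFrame

    unify = {u"が": u"は"}  # hypothetical particle mapping
    df = DataFrame({u"助詞": [u"が", u"を"]})
    df[u"助詞"] = df[u"助詞"].map(lambda x: unify.get(x, x))
    print(df[u"助詞"].tolist())  # -> [u"は", u"を"]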