Code example #1
# Assumed imports for these snippets; Help is a project-local utility module.
import math
import numpy as np
import scipy.spatial as sp

def CreateLus(self):
    # Lus: utterance-speaker similarity matrix [num_utterances x num_speakers]
    Ns = self.prep.numSpeakers
    sentences = self.segm.cleanSentences[self.iter]
    speakers = self.segm.cleanSpeakers[self.iter]
    Lus = np.zeros((len(sentences), Ns))
    for i in range(len(sentences)):
        v1 = Help.CreateSentenceVector(sentences[i], self.freqVec,
                                       self.prep.singleWords)
        for j in range(Ns):
            if j + 1 in speakers:  # speaker IDs are 1-based
                v2 = Help.CreateSpeakerVector(j, sentences, speakers,
                                              self.speakVec)
                if Help.NotValidCos(v1, v2):
                    v1, v2 = Help.ReshapeVec(v1, v2)
                cos_sim = 1 - sp.distance.cosine(v1, v2)  # cosine similarity
                Lus[i][j] = 0. if math.isnan(cos_sim) else cos_sim
    return Lus
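
The Help utilities referenced above are project-local and not shown. Purely as a hypothetical sketch (the real NotValidCos/ReshapeVec may differ), the author's own comment in example #3 ("add zeros ... to match those dimensions") suggests the mismatch handling zero-pads the shorter vector:

# Hypothetical sketch only; the actual Help module is not shown.
import numpy as np

def NotValidCos(v1, v2):
    # cosine distance is only defined for vectors of equal length
    return len(v1) != len(v2)

def ReshapeVec(v1, v2):
    # zero-pad the shorter vector to match the longer one
    n = max(len(v1), len(v2))
    v1 = np.pad(np.asarray(v1, dtype=float), (0, n - len(v1)))
    v2 = np.pad(np.asarray(v2, dtype=float), (0, n - len(v2)))
    return v1, v2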
Code example #2
def WI(self, w, s):
    WI_vec = np.zeros(self.Ns)
    if w:
        suidf_win = Help.CreateSentenceVector(w, self.freq, self.prep.singleWords)

        den = 0   # shared denominator: total suidf_win mass over all speakers
        num = []  # one numerator per speaker
        for j in range(self.Ns):
            num_t = 0  # numerator for speaker j
            for k in range(len(s)):
                if s[k] == j + 1:
                    num_t += suidf_win[k]
                    den += suidf_win[k]
            num.append(num_t)

        # WI_vec[j]: speaker j's share of the total mass (0 if the total is 0)
        for j in range(self.Ns):
            WI_vec[j] = Help.SafeDiv(num[j], den)

    return WI_vec
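
Help.SafeDiv is also not shown; from its use here and in example #3 it presumably returns 0 when the denominator is 0. A minimal sketch under that assumption:

def SafeDiv(num, den):
    # assumed behavior: avoid ZeroDivisionError, return 0 for an empty total
    return num / den if den else 0.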
Code example #3
def CreateLuu(self, top=False, lex=True):
    # top=True computes topical similarity; otherwise lex=True computes
    # lexical similarity. Luu: [num_utterances x num_utterances].
    sentences = self.segm.cleanSentences[self.iter]
    Luu = np.zeros((len(sentences), len(sentences)))

    # Luu can be based on only one kind of similarity; if the flags are
    # inconsistent, fall back to the defaults (lexical).
    if top == lex:
        top = False
        lex = True

    if top:  # topical similarity
        # prob_top_sent[x][y]: weight of topic x in sentence y
        prob_top_sent = np.zeros((len(self.topicModel['Terms']), len(sentences)))
        for x in range(len(self.topicModel['Terms'])):
            for y in range(len(sentences)):
                num = 0
                den = 0
                for w in sentences[y]:
                    freq = Help.FreqWordInSentence(w, sentences[y])
                    try:
                        tk_id = self.topicModel['Dictionary'].token2id[w]
                        num += freq * self.topicModel['Terms'][x][tk_id]
                    except KeyError:  # w not in the topic model's dictionary
                        num += freq * self.small
                    den += freq
                prob_top_sent[x][y] = Help.SafeDiv(num, den)

        for x in range(len(sentences)):
            for y in range(len(sentences)):
                LTS_sum = 0
                for w in sentences[y]:
                    # vector with the frequency of the word per each document
                    wFreq = self.ComputeTermFrequency(w)
                    if np.sum(wFreq):  # skip words absent from the dictionary
                        # sum over all topics (LTS of a single word)
                        LTS_sum += self.ComputeLTS(wFreq)
                prob = Help.SumTopics(prob_top_sent, x)  # TODO (author's note): pass x or y?
                Luu[x][y] = LTS_sum * prob

    else:  # lexical similarity
        for i in range(len(sentences)):
            v1 = Help.CreateSentenceVector(sentences[i], self.freqVec,
                                           self.prep.singleWords)
            for j in range(len(sentences)):
                v2 = Help.CreateSentenceVector(sentences[j], self.freqVec,
                                               self.prep.singleWords)
                if Help.NotValidCos(v1, v2):
                    # cosine is defined only for vectors of equal size;
                    # pad with zeros so the dimensions match
                    v1, v2 = Help.ReshapeVec(v1, v2)
                cos_sim = 1 - sp.distance.cosine(v1, v2)
                Luu[i][j] = 0. if math.isnan(cos_sim) else cos_sim

    # optionally L1-normalize before returning:
    # return norm(Luu, norm='l1')
    return Luu
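
The commented-out normalization call `norm(Luu, norm='l1')` matches the signature of scikit-learn's `normalize`. If that is the intended helper (an assumption; the import is not shown), the row-normalized variant would be:

from sklearn.preprocessing import normalize as norm  # assumed import
Luu_l1 = norm(Luu, norm='l1')  # each row sums to 1; sklearn normalizes along axis=1 by default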