def CreateLus(self):
    """Build the utterance-to-speaker similarity matrix Lus.

    Entry Lus[u][s] is the cosine similarity between the vector of
    utterance u and the vector of speaker s+1 for the current segment
    (self.íter). Entries stay 0 when that speaker never talks in the
    segment or when the cosine is undefined (NaN).
    """
    sentences = self.segm.cleanSentences[self.íter]
    speakers = self.segm.cleanSpeakers[self.íter]
    n_speakers = self.prep.numSpeakers
    Lus = np.zeros((len(sentences), n_speakers))
    for u, sentence in enumerate(sentences):
        sent_vec = Help.CreateSentenceVector(
            sentence, self.freqVec, self.prep.singleWords)
        for s in range(n_speakers):
            if s + 1 not in speakers:
                continue  # speaker absent from this segment: leave the 0
            spk_vec = Help.CreateSpeakerVector(
                s, sentences, speakers, self.speakVec)
            if Help.NotValidCos(sent_vec, spk_vec):
                # pad the vectors to matching length before the cosine
                sent_vec, spk_vec = Help.ReshapeVec(sent_vec, spk_vec)
            similarity = 1 - sp.distance.cosine(sent_vec, spk_vec)
            # cosine is NaN for all-zero vectors; record no similarity
            Lus[u][s] = 0. if math.isnan(similarity) else similarity
    return Lus
def WI(self, w, s):
    """Word-importance distribution of window w over the self.Ns speakers.

    Component j is the share of the window's suidf mass contributed by
    speaker j+1, where s[k] gives the speaker of the k-th word. Returns
    an all-zero vector when w is empty; Help.SafeDiv guards a zero
    denominator.
    """
    WI_vec = np.zeros(self.Ns)
    if w:
        suidf_win = Help.CreateSentenceVector(
            w, self.freq, self.prep.singleWords)
        # per-speaker mass: suidf weight of every word attributed to j+1
        num = [sum(suidf_win[k] for k in range(len(s)) if s[k] == j + 1)
               for j in range(self.Ns)]
        # total mass counts only words attributed to a known speaker
        den = sum(num)
        for j, num_j in enumerate(num):
            WI_vec[j] = Help.SafeDiv(num_j, den)
    return WI_vec
def CreateLuu(self, top=False, lex=True):
    """Build the utterance-to-utterance similarity matrix Luu.

    With top=True the similarity is topical (via the LDA topic model);
    with lex=True (the default) it is lexical (cosine over word-overlap
    vectors). Exactly one flag must be set: any inconsistent combination
    (both set, or neither) falls back to the lexical default.

    Returns a numpy matrix [num_utterances x num_utterances] for the
    current segment (self.íter).
    """
    sentences = self.segm.cleanSentences[self.íter]
    n_utt = len(sentences)
    Luu = np.zeros((n_utt, n_utt))
    if (top and lex) or ((not top) and (not lex)):
        # inconsistent flags: Luu can be based on only one similarity kind
        top = False
        lex = True
    if top:
        self._FillTopicalLuu(Luu, sentences)
    else:
        self._FillLexicalLuu(Luu, sentences)
    return Luu

def _FillTopicalLuu(self, Luu, sentences):
    """Fill Luu in place with topical (LDA) similarity for all pairs."""
    terms = self.topicModel['Terms']
    token2id = self.topicModel['Dictionary'].token2id
    n_utt = len(sentences)
    # prob_top_sent[x][y]: frequency-weighted topic-x weight of sentence y
    prob_top_sent = np.zeros((len(terms), n_utt))
    for x in range(len(terms)):
        for y in range(n_utt):
            num = 0
            den = 0
            for w in sentences[y]:
                freq = Help.FreqWordInSentence(w, sentences[y])
                try:
                    num += freq * terms[x][token2id[w]]
                except (KeyError, IndexError):
                    # word unknown to the topic model: back off to a
                    # small smoothing constant instead of failing
                    num += freq * self.small
                den += freq
            prob_top_sent[x][y] = Help.SafeDiv(num, den)
    for x in range(n_utt):
        for y in range(n_utt):
            LTS_sum = 0
            prob = 0
            for w in sentences[y]:
                # vector with the frequency of w per document
                wFreq = self.ComputeTermFrequency(w)
                if np.sum(wFreq):  # skip words absent from the dictionary
                    # NOTE: method name typo ("Copmpute") kept — it is
                    # declared with this spelling elsewhere in the class
                    LTS_sum += self.CopmputeLTS(wFreq)
                    # TODO(review): original left open whether this
                    # should index x or y
                    prob = Help.SumTopics(prob_top_sent, x)
            Luu[x][y] = LTS_sum * prob

def _FillLexicalLuu(self, Luu, sentences):
    """Fill Luu in place with lexical cosine similarity for all pairs."""
    for i in range(len(sentences)):
        v1 = Help.CreateSentenceVector(
            sentences[i], self.freqVec, self.prep.singleWords)
        for j in range(len(sentences)):
            v2 = Help.CreateSentenceVector(
                sentences[j], self.freqVec, self.prep.singleWords)
            if Help.NotValidCos(v1, v2):
                # pad the vectors to matching length so the cosine
                # distance is defined
                v1, v2 = Help.ReshapeVec(v1, v2)
            cos_sim = 1 - sp.distance.cosine(v1, v2)
            # cosine is NaN for all-zero vectors; record no similarity
            Luu[i][j] = 0. if math.isnan(cos_sim) else cos_sim