def calc(self, k=None):
    """Compute a pairwise score for every unordered pair of keys.

    Pairs are taken from ``self.key_list`` in list order, and each result is
    stored in ``self.calculated`` under the tag ``"<first>_<second>"``.

    Behaviour depends on ``self.sim_type``:

    * ``'lda_cosine'`` -- stores ``scipy.spatial.distance.cosine`` of the
      two corpus entries (a cosine *distance*) and additionally stores the
      concatenation of both entries in ``self.raw`` under the same tag.
    * ``'word2vec_cosine'`` -- not implemented; the original branch was
      empty.
    * ``'kernel'`` -- builds a spectrum kernel over the corpus entries with
      ``PyML.sequenceData.spectrum_data`` and stores the kernel matrix
      entry for each pair.

    Args:
        k: Second argument passed to ``spectrum_data`` for the ``'kernel'``
            type. When omitted, ``self.k`` is used if it exists.
            NOTE(review): the original code referenced an undefined ``k``;
            confirm where callers are expected to supply it.

    Raises:
        NotImplementedError: if ``self.sim_type`` is ``'word2vec_cosine'``.
        ValueError: if the ``'kernel'`` type is requested without any ``k``.
        KeyError: if ``self.sim_type`` is not one of the known types.
    """
    from itertools import combinations

    keys = self.key_list

    if self.sim_type == 'lda_cosine':
        from numpy import append
        from scipy.spatial.distance import cosine

        for first, second in combinations(keys, 2):
            vec_a = self.corpus[first]
            vec_b = self.corpus[second]
            out_tag = first + '_' + second
            self.calculated[out_tag] = cosine(vec_a, vec_b)
            self.raw[out_tag] = append(vec_a, vec_b)
    elif self.sim_type == 'word2vec_cosine':
        raise NotImplementedError(
            "similarity type 'word2vec_cosine' is not implemented yet")
    elif self.sim_type == 'kernel':
        if k is None:
            k = getattr(self, 'k', None)
        if k is None:
            raise ValueError(
                "similarity type 'kernel' needs k (pass k= or set self.k)")

        from PyML import sequenceData

        # Documents must follow the same order as the keys used for the
        # tags below; otherwise mat[i][j] would be filed under the wrong
        # pair of keys.
        docs = [self.corpus[key] for key in keys]
        kernel = sequenceData.spectrum_data(docs, k)
        mat = kernel.getKernelMatrix()
        for (i, first), (j, second) in combinations(enumerate(keys), 2):
            self.calculated[first + '_' + second] = mat[i][j]
    else:
        raise KeyError('Please check your similarity type!')
def string_kernel_sim(corpus, vals, k):
    """Return the upper-triangle kernel values for the selected documents.

    For each index in ``vals`` the element ``corpus[index][1]`` is taken as
    a document. The documents are handed to
    ``sequenceData.spectrum_data(docs, k)`` and the resulting kernel matrix
    is read strictly above the diagonal, row by row.

    ``sequenceData`` is expected to be available at module scope
    (presumably PyML's ``sequenceData`` -- confirm against the file's
    imports).

    Args:
        corpus: Indexable collection whose items expose the document at
            position ``1``.
        vals: Iterable of indices into ``corpus``.
        k: Forwarded unchanged to ``spectrum_data``.

    Returns:
        A flat list ``[m[0][1], m[0][2], ..., m[1][2], ...]`` of the kernel
        matrix entries for every pair ``i < j``.
    """
    documents = [corpus[index][1] for index in vals]
    spectrum = sequenceData.spectrum_data(documents, k)
    gram = spectrum.getKernelMatrix()

    total = len(documents)
    upper_triangle = []
    for row in range(total):
        for col in range(row + 1, total):
            upper_triangle.append(gram[row][col])
    return upper_triangle