def get_similarity_values(q1_csc, q2_csc):
    """Compute five pairwise scores for each aligned pair of items.

    ``q1_csc`` and ``q2_csc`` are iterated in lockstep with ``zip``, so
    iteration stops at the shorter of the two. Every yielded item must
    provide ``toarray()``.

    The metric callables ``cs``, ``md``, ``ed``, ``jsc`` and the object
    ``minkowski_dis`` (used through its ``pairwise`` method) are defined
    outside this function.
    NOTE(review): presumably these are the scikit-learn cosine / manhattan /
    euclidean / jaccard / minkowski helpers -- confirm against the imports.

    Args:
        q1_csc: Iterable of items for the first side of each pair.
        q2_csc: Iterable of items for the second side of each pair.

    Returns:
        A 5-tuple of lists ``(cosine_sim, manhattan_dis, eucledian_dis,
        jaccard_dis, minkowsk_dis)`` with one entry per pair. The Jaccard
        entry is ``0`` for any pair on which ``jsc`` raised an exception.
    """
    cosine_sim = []
    manhattan_dis = []
    eucledian_dis = []
    jaccard_dis = []
    # Local result list; deliberately distinct from the module-level
    # ``minkowski_dis`` object that is called below.
    minkowsk_dis = []

    for i, j in zip(q1_csc, q2_csc):
        # cs/md/ed return a 2-D result; [0][0] extracts its first element.
        cosine_sim.append(cs(i, j)[0][0])
        manhattan_dis.append(md(i, j)[0][0])
        eucledian_dis.append(ed(i, j)[0][0])

        # The remaining two metrics are fed the ``toarray()`` form.
        i_ = i.toarray()
        j_ = j.toarray()

        # Best-effort: a failing Jaccard computation contributes 0 rather than
        # aborting the batch. Catch Exception (not a bare ``except``) so that
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            jaccard = jsc(i_, j_)
        except Exception:
            jaccard = 0
        jaccard_dis.append(jaccard)

        minkowsk_dis.append(minkowski_dis.pairwise(i_, j_)[0][0])

    return cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis
def get_similarity_values(res_csc, jd_csc):
    """Score every item of ``res_csc`` against the single ``jd_csc``.

    For each item, ``cs``, ``md`` and ``ed`` (all defined outside this
    function) are called with ``(item, jd_csc)`` and the ``[0][0]`` element
    of each result is recorded.
    NOTE(review): presumably cosine similarity, manhattan distance and
    euclidean distance -- confirm against the imports.

    Args:
        res_csc: Iterable of items to score.
        jd_csc: The reference item every entry of ``res_csc`` is compared to.

    Returns:
        A 3-tuple of lists ``(cosine, manhattan, euclidean)`` with one entry
        per item of ``res_csc``, in iteration order.
    """
    cosine_scores, manhattan_scores, euclidean_scores = [], [], []
    reference = jd_csc

    # Single pass so that ``res_csc`` is consumed exactly once.
    for candidate in res_csc:
        cosine_scores.append(cs(candidate, reference)[0][0])
        manhattan_scores.append(md(candidate, reference)[0][0])
        euclidean_scores.append(ed(candidate, reference)[0][0])

    return cosine_scores, manhattan_scores, euclidean_scores
def bm25_dist(row, dist_type, bm25_model, average_idf, feature_dim):
    """Compare the BM25 score vectors of a row's two questions.

    ``row['q1_w']`` and ``row['q2_w']`` are split on whitespace, scored with
    ``bm25_model.get_scores(tokens, average_idf)``, reshaped to
    ``(-1, feature_dim)`` and passed to the metric selected by ``dist_type``.

    The metric callables ``cs``, ``ed`` and ``md`` are defined outside this
    function.
    NOTE(review): presumably cosine similarity, euclidean distance and
    manhattan distance -- confirm against the imports.

    Args:
        row: Record indexable by ``'q1_w'`` and ``'q2_w'``, each a
            whitespace-separated token string.
        dist_type: One of ``'cs'``, ``'ed'`` or ``'md'``.
        bm25_model: Object exposing ``get_scores(tokens, average_idf)``.
        average_idf: Forwarded unchanged to ``bm25_model.get_scores``.
        feature_dim: Number of columns the score vectors are reshaped to.

    Returns:
        The first element of the flattened metric result.

    Raises:
        AssertionError: If ``dist_type`` is not one of the accepted values.
    """
    # Raised explicitly instead of via an ``assert`` statement: asserts are
    # stripped under ``python -O``, which would let an invalid ``dist_type``
    # fall through to an unbound result. The exception type and message are
    # kept identical to the previous assert for backward compatibility.
    if dist_type not in ('cs', 'ed', 'md'):
        raise AssertionError('dist type error')

    q1 = row['q1_w'].split()
    q2 = row['q2_w'].split()

    q1_bm25 = np.reshape(
        np.array(bm25_model.get_scores(q1, average_idf)), (-1, feature_dim))
    q2_bm25 = np.reshape(
        np.array(bm25_model.get_scores(q2, average_idf)), (-1, feature_dim))

    if dist_type == 'cs':
        metric = cs
    elif dist_type == 'ed':
        metric = ed
    else:  # 'md' -- the only remaining value after the validation above
        metric = md

    return metric(q1_bm25, q2_bm25).flatten()[0]
def extract_tfidf_feature(self, df):
    """Add TF-IDF, SVD and LDA pairwise-score columns to ``df``.

    The ``'q1_w'`` and ``'q2_w'`` columns are vectorised with
    ``self.tfidf_vectorizer.transform``. Nine columns are then assigned on
    ``df``, in this order: ``tfidf_cs``, ``tfidf_ed``, ``tfidf_md``,
    ``svd_cs``, ``svd_ed``, ``svd_md``, ``lda_cs``, ``lda_ed``, ``lda_md``.
    Each column holds, per row, the flattened result of ``cs`` / ``ed`` /
    ``md`` applied to that row's q1 and q2 representation.

    The SVD and LDA representations each have 5 components and are fitted on
    the dense (``toarray()``) TF-IDF rows of q1 stacked on top of q2.

    ``cs``, ``ed``, ``md``, ``TruncatedSVD`` and ``LatentDirichletAllocation``
    are defined outside this method.
    NOTE(review): cs/ed/md are presumably cosine similarity, euclidean and
    manhattan distance -- confirm against the imports.

    Args:
        df: Table with ``'q1_w'`` and ``'q2_w'`` columns supporting
            ``.values.tolist()``; it is modified in place. No return
            statement is visible in this block.
    """
    n_topics = 5

    first_vecs = self.tfidf_vectorizer.transform(df['q1_w'].values.tolist())
    second_vecs = self.tfidf_vectorizer.transform(df['q2_w'].values.tolist())
    n_pairs = first_vecs.shape[0]

    def score_pairs(metric, pair_at):
        """Apply ``metric`` to every (q1, q2) pair and join the results."""
        return np.concatenate(
            [metric(*pair_at(idx)).flatten() for idx in range(n_pairs)])

    def dense_pairs(left, right):
        """Build an accessor yielding row ``idx`` of both arrays as 2-D."""
        def pair_at(idx):
            return (left[idx].reshape(-1, n_topics),
                    right[idx].reshape(-1, n_topics))
        return pair_at

    def tfidf_pair(idx):
        """Return the TF-IDF rows of q1 and q2 at ``idx``."""
        return first_vecs[idx], second_vecs[idx]

    # --- raw TF-IDF vectors -------------------------------------------------
    df['tfidf_cs'] = score_pairs(cs, tfidf_pair)
    df['tfidf_ed'] = score_pairs(ed, tfidf_pair)
    df['tfidf_md'] = score_pairs(md, tfidf_pair)

    # q1 rows first, q2 rows second, so the transformed output can be split
    # back at ``n_pairs``.
    corpus_tfidf = np.concatenate(
        [first_vecs.toarray(), second_vecs.toarray()], axis=0)

    # --- truncated SVD topics -----------------------------------------------
    # NOTE(review): no random_state is passed here, unlike the LDA model below.
    svd_model = TruncatedSVD(n_components=n_topics)
    svd_model.fit(corpus_tfidf)
    svd_topic = svd_model.transform(corpus_tfidf)
    svd_pair = dense_pairs(svd_topic[:n_pairs], svd_topic[n_pairs:])

    df['svd_cs'] = score_pairs(cs, svd_pair)
    df['svd_ed'] = score_pairs(ed, svd_pair)
    df['svd_md'] = score_pairs(md, svd_pair)

    # --- LDA topics ---------------------------------------------------------
    lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda_model.fit(corpus_tfidf)
    lda_topic = lda_model.transform(corpus_tfidf)
    lda_pair = dense_pairs(lda_topic[:n_pairs], lda_topic[n_pairs:])

    df['lda_cs'] = score_pairs(cs, lda_pair)
    df['lda_ed'] = score_pairs(ed, lda_pair)
    df['lda_md'] = score_pairs(md, lda_pair)