def get_similarity_values(q1_csc, q2_csc):
    """Compute five pairwise scores for each aligned row pair of two inputs.

    ``q1_csc`` and ``q2_csc`` are walked in lockstep with ``zip``, so any
    surplus rows in the longer input are ignored.  Every row must offer a
    ``toarray()`` method, because the last two scores are computed on the
    dense form of the row.

    ``cs``, ``md``, ``ed``, ``jsc`` and ``minkowski_dis`` are resolved from
    the enclosing module.  Judging by the result names they are presumably
    cosine similarity, Manhattan distance, Euclidean distance, a Jaccard
    score and a Minkowski distance metric -- TODO confirm against the
    file's imports.

    Returns:
        A 5-tuple of lists ``(cosine_sim, manhattan_dis, eucledian_dis,
        jaccard_dis, minkowsk_dis)`` holding one entry per row pair.  The
        Jaccard entry is ``0`` whenever ``jsc`` raises for that pair.
    """
    cosine_sim = []
    manhattan_dis = []
    eucledian_dis = []
    jaccard_dis = []
    minkowsk_dis = []

    for row1, row2 in zip(q1_csc, q2_csc):
        # These three return a 2-D result; the single score sits at [0][0].
        cosine_sim.append(cs(row1, row2)[0][0])
        manhattan_dis.append(md(row1, row2)[0][0])
        eucledian_dis.append(ed(row1, row2)[0][0])

        dense1 = row1.toarray()
        dense2 = row2.toarray()

        try:
            jaccard = jsc(dense1, dense2)
        except Exception:
            # Deliberate best effort: a pair jsc cannot score counts as 0.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
            # and SystemExit propagate instead of being silently eaten.
            jaccard = 0
        jaccard_dis.append(jaccard)

        minkowsk_dis.append(minkowski_dis.pairwise(dense1, dense2)[0][0])

    return cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis
# Example #2
0
def get_similarity_values(res_csc, jd_csc):
    """Score every row of ``res_csc`` against the single input ``jd_csc``.

    For each row the module-level callables ``cs``, ``md`` and ``ed`` are
    invoked in that order with ``(row, jd_csc)``, and element ``[0][0]`` of
    each result is recorded.  Judging by the result names these are
    presumably cosine similarity, Manhattan distance and Euclidean
    distance -- TODO confirm against the file's imports.

    Returns:
        A 3-tuple of lists ``(cosine_sim, manhattan_dis, eucledian_dis)``
        with one entry per row of ``res_csc``.
    """
    cosine_scores, manhattan_scores, euclidean_scores = [], [], []

    for row in res_csc:
        cosine_scores.append(cs(row, jd_csc)[0][0])
        manhattan_scores.append(md(row, jd_csc)[0][0])
        euclidean_scores.append(ed(row, jd_csc)[0][0])

    return cosine_scores, manhattan_scores, euclidean_scores
        def bm25_dist(row, dist_type, bm25_model, average_idf, feature_dim):
            """Return one score comparing the BM25 vectors of a row's two texts.

            ``row['q1_w']`` and ``row['q2_w']`` are split on whitespace and
            each token list is scored with
            ``bm25_model.get_scores(tokens, average_idf)``.  Both score
            vectors are reshaped to ``(-1, feature_dim)`` and handed to the
            module-level callable selected by ``dist_type``; the first
            element of its flattened result is returned.

            Args:
                row: Mapping-like object with string values under the keys
                    ``'q1_w'`` and ``'q2_w'``.
                dist_type: ``'cs'``, ``'ed'`` or ``'md'``; selects the
                    callable of the same name (presumably cosine similarity,
                    Euclidean and Manhattan distance -- TODO confirm against
                    the file's imports).
                bm25_model: Object exposing ``get_scores(tokens, average_idf)``.
                average_idf: Forwarded unchanged to ``get_scores``.
                feature_dim: Column count used when reshaping the scores.

            Raises:
                AssertionError: If ``dist_type`` is not one of the three
                    supported values.
            """
            if dist_type not in ('cs', 'ed', 'md'):
                # Raised explicitly instead of via ``assert`` so the check is
                # not stripped under ``python -O`` (where an invalid value
                # used to fall through to an unbound ``score``).  Exception
                # type and message are the same as before.
                raise AssertionError('dist type error')

            q1_tokens = row['q1_w'].split()
            q2_tokens = row['q2_w'].split()
            q1_scores = bm25_model.get_scores(q1_tokens, average_idf)
            q2_scores = bm25_model.get_scores(q2_tokens, average_idf)
            q1_bm25 = np.reshape(np.array(q1_scores), (-1, feature_dim))
            q2_bm25 = np.reshape(np.array(q2_scores), (-1, feature_dim))

            if dist_type == 'cs':
                metric = cs
            elif dist_type == 'ed':
                metric = ed
            else:
                # Only 'md' remains after the validation above.
                metric = md
            return metric(q1_bm25, q2_bm25).flatten()[0]
    def extract_tfidf_feature(self, df):
        """Add TF-IDF, SVD and LDA pairwise score columns to ``df``.

        The ``q1_w`` and ``q2_w`` columns are vectorised with
        ``self.tfidf_vectorizer.transform``.  Three representations of each
        text are then compared row by row (row ``i`` of q1 against row ``i``
        of q2):

        * ``tfidf`` -- the vectorizer output itself;
        * ``svd``   -- a 5-component ``TruncatedSVD`` fitted on the stacked
          q1 + q2 TF-IDF rows;
        * ``lda``   -- a 5-component ``LatentDirichletAllocation``
          (``random_state=0``) fitted on the same stacked rows.

        For every representation the module-level callables ``cs``, ``ed``
        and ``md`` are applied (presumably cosine similarity, Euclidean and
        Manhattan distance -- TODO confirm against the file's imports), and
        the results are stored in ``df`` as ``<repr>_cs``, ``<repr>_ed`` and
        ``<repr>_md``: nine new columns, written in that order.

        Args:
            df: DataFrame-like object with ``q1_w`` and ``q2_w`` columns of
                text; it is modified in place.
        """
        # Component count shared by both topic models.
        n_topics = 5
        metrics = (('cs', cs), ('ed', ed), ('md', md))

        def add_pairwise_columns(prefix, left, right):
            # Write <prefix>_cs/_ed/_md by scoring row i of `left` against
            # row i of `right`.  Slicing with [i:i + 1] keeps every row 2-D
            # (1 x n_features) for sparse and dense inputs alike, which is
            # the shape the metric callables are given.
            n_rows = left.shape[0]
            for suffix, metric in metrics:
                df[prefix + '_' + suffix] = np.concatenate([
                    metric(left[i:i + 1], right[i:i + 1]).flatten()
                    for i in range(n_rows)
                ])

        q1_w_vec = self.tfidf_vectorizer.transform(df['q1_w'].values.tolist())
        q2_w_vec = self.tfidf_vectorizer.transform(df['q2_w'].values.tolist())
        n_pairs = q1_w_vec.shape[0]

        add_pairwise_columns('tfidf', q1_w_vec, q2_w_vec)

        # Stack q1 rows on top of q2 rows so both topic models see every
        # text; the first n_pairs rows of each transform belong to q1.
        # NOTE(review): toarray() materialises the full dense TF-IDF matrix.
        # Both estimators are documented to accept sparse input, so a sparse
        # vstack may be possible if memory becomes an issue -- verify the
        # results before switching.
        corpus_tfidf = np.concatenate(
            [q1_w_vec.toarray(), q2_w_vec.toarray()], axis=0)

        svd_model = TruncatedSVD(n_components=n_topics)
        svd_model.fit(corpus_tfidf)
        svd_topic = svd_model.transform(corpus_tfidf)
        add_pairwise_columns('svd', svd_topic[:n_pairs], svd_topic[n_pairs:])

        lda_model = LatentDirichletAllocation(
            n_components=n_topics, random_state=0)
        lda_model.fit(corpus_tfidf)
        lda_topic = lda_model.transform(corpus_tfidf)
        add_pairwise_columns('lda', lda_topic[:n_pairs], lda_topic[n_pairs:])