def bnExtractDocSimilarity(doc1, doc2, similarity):
    """Measure the similarity between two documents using the textacy API.

    The Word Movers variants call
    ``textacy.similarity.word_movers(doc1, doc2, metric=...)``; the
    ``'word2vec'`` variant calls ``textacy.similarity.word2vec(doc1, doc2)``.
    The computed score is printed together with the names of both documents
    (as reported by ``bnGetDocName``).

    :param doc1: first document, passed through to textacy unchanged
    :param doc2: second document, passed through to textacy unchanged
    :param similarity: name of the method to use; one of ``'cosine'``,
        ``'Euclidian'`` (``'Euclidean'`` is accepted as well),
        ``'Manhattan'`` or ``'word2vec'``
    :returns: the score rounded to 5 decimal places, or ``0`` when
        ``similarity`` is not a supported method name
    """
    # Resolve the requested method before importing textacy, so that an
    # unsupported name is rejected without touching the library at all.
    # Word Movers metric can be cosine, euclidean, l1, l2, or manhattan.
    if similarity == 'cosine':
        label, metric = 'Cosine', u'cosine'
    elif similarity in ('Euclidian', 'Euclidean'):
        # The selector and the printed label keep their historical spelling;
        # the metric handed to textacy must be spelled 'euclidean'.
        label, metric = 'Euclidian', u'euclidean'
    elif similarity == 'Manhattan':
        label, metric = 'Manhattan', u'manhattan'
    elif similarity == 'word2vec':
        # No metric: this method goes through similarity.word2vec instead.
        label, metric = 'Semantic', None
    else:
        # Unsupported similarity method
        return 0

    # Import under an alias. Importing the module as plain `similarity`
    # would rebind the `similarity` parameter to the module object, making
    # every comparison against a method name False.
    from textacy import similarity as textacy_similarity

    if metric is None:
        s = textacy_similarity.word2vec(doc1, doc2)
    else:
        s = textacy_similarity.word_movers(doc1, doc2, metric=metric)

    print(" {} Similarity between docs {} and {} is: {}".format(
        label, bnGetDocName(doc1), bnGetDocName(doc2), s))
    return round(s, 5)
def calculate(self):
    """
    Score every (new doc, any doc) pairing with ``word2vec`` and collect
    the pairings whose score is inside the accepted window.

    Each entry of ``self.new_docs`` and ``self.all_docs`` is unpacked as an
    ``(id, text)`` pair. A pairing is kept only when its score, rounded to
    two decimals, is strictly greater than ``self.threshold`` and strictly
    less than 1.

    :returns: dict mapping a new doc id to a set of
        ``(compared doc id, score)`` tuples
    """
    matches = {}
    pairings = product(self.new_docs, self.all_docs)
    for (ref_id, ref_text), (comp_id, comp_text) in pairings:
        raw_score = float(word2vec(ref_text, comp_text))
        score = round(raw_score, 2)
        if not (self.threshold < score < 1):
            # Outside the (threshold, 1) window: nothing to record.
            continue
        # Create the bucket the first time this reference id qualifies.
        if ref_id not in matches:
            matches[ref_id] = set()
        matches[ref_id].add((comp_id, round(score, 3)))
    return matches
def test_word2vec_identity(doc1, doc2):
    """word2vec similarity of ``doc1`` with itself is approximately 1.0."""
    # doc2 is accepted but not used by this check.
    self_score = similarity.word2vec(doc1, doc1)
    expected = pytest.approx(1.0, rel=1e-3)
    assert self_score == expected
def test_word2vec(doc1, doc2):
    """word2vec similarity lies in [0.0, 1.0] for the full inputs and for
    their last-two-element slices."""
    cases = (
        (doc1, doc2),
        (doc1[-2:], doc2[-2:]),
    )
    for left, right in cases:
        score = similarity.word2vec(left, right)
        assert 0.0 <= score <= 1.0
def test_identity(self, doc_pairs):
    """Each document in every pair has word2vec similarity ~1.0 with itself."""
    for doc1, doc2 in doc_pairs:
        # Check the first document of the pair, then the second.
        for doc in (doc1, doc2):
            assert similarity.word2vec(doc, doc) == pytest.approx(1.0, rel=1e-3)
def test_default(self, doc_pairs):
    """word2vec similarity of every document pair lies in [0.0, 1.0]."""
    for first, second in doc_pairs:
        score = similarity.word2vec(first, second)
        assert 0.0 <= score <= 1.0