def bnExtractDocSimilarity(doc1, doc2, similarity):
    """Measure the semantic similarity between two documents using Word
    Mover's Distance.

    Uses the Textacy API
    textacy.similarity.word_movers(doc1, doc2, metric=u'cosine')

    :param doc1: first textacy/spaCy document
    :param doc2: second textacy/spaCy document
    :param similarity: similarity method name: 'cosine', 'Euclidian',
        'Manhattan' or 'word2vec'; any other value yields 0
    :return: similarity score rounded to 5 decimal places
    """
    # BUG FIX: the original `from textacy import similarity` rebound the
    # local name `similarity`, clobbering the parameter of the same name.
    # Every comparison below then tested a module against a string, so the
    # function always fell through to the unsupported branch and returned 0.
    # Importing under an alias keeps the parameter intact.
    from textacy import similarity as textacy_similarity
    #if similarity == 'Word Movers':
    if similarity == 'cosine':
        # Metric can be cosine, euclidean, l1, l2, or manhattan
        s = textacy_similarity.word_movers(doc1, doc2, metric=u'cosine')
        print(" Cosine Similarity between docs {} and {} is: {}".format( \
            bnGetDocName(doc1), bnGetDocName(doc2), s))
    elif similarity == 'Euclidian':
        # BUG FIX: textacy/scipy spell this metric 'euclidean'; the original
        # 'euclidian' is not a recognized metric name.
        s = textacy_similarity.word_movers(doc1, doc2, metric=u'euclidean')
        print(" Euclidian Similarity between docs {} and {} is: {}".format( \
            bnGetDocName(doc1), bnGetDocName(doc2), s))
    elif similarity == 'Manhattan':
        s = textacy_similarity.word_movers(doc1, doc2, metric=u'manhattan')
        print(" Manhattan Similarity between docs {} and {} is: {}".format( \
            bnGetDocName(doc1), bnGetDocName(doc2), s))
    elif similarity == 'word2vec':
        s = textacy_similarity.word2vec(doc1, doc2)
        print(" Semantic Similarity between docs {} and {} is: {}".format( \
            bnGetDocName(doc1), bnGetDocName(doc2), s))
    else:
        # Unsupported similarity method
        s = 0
    return round(s, 5)
def test_identity(self, doc_pairs):
    """A document's Word Mover's similarity with itself is ~1.0."""
    for doc1, doc2 in doc_pairs[:2]:  # HACK
        print(doc1, doc2)
        for doc in (doc1, doc2):
            assert similarity.word_movers(doc, doc) == pytest.approx(1.0, rel=1e-3)
def similar_ementas(ref, list):
    """Yield (index, cosine Word Mover's similarity) for each entry of
    *list* compared against the reference text *ref*.

    :param ref: reference text, parsed with ``nlp``
    :param list: iterable of texts to compare (NOTE: the parameter name
        shadows the ``list`` builtin; kept for backward compatibility)
    """
    referencia = nlp(ref)
    # BUG FIX: the original used list.index(reference) inside the loop —
    # an O(n) scan per item (O(n^2) overall) that also returns the index of
    # the FIRST occurrence, so duplicate entries got the wrong index.
    for idx, reference in enumerate(list):
        yield (idx, similarity.word_movers(referencia, nlp(reference),
                                           metric='cosine'))
def similar(arquivo_referencia, lista_pets_escritorio):
    """Yield (index, canberra Word Mover's similarity) for each petition in
    *lista_pets_escritorio* compared against the summarized reference file.

    :param arquivo_referencia: reference document, summarized via ``resumidor``
    :param lista_pets_escritorio: iterable of petitions to compare
    """
    referencia = nlp(resumidor(arquivo_referencia))
    # BUG FIX: replaced lista_pets_escritorio.index(...) with enumerate() —
    # .index() is an O(n) scan per item and reports the wrong position for
    # duplicate petitions.
    for idx, peticao_referencia in enumerate(lista_pets_escritorio):
        yield (idx, similarity.word_movers(referencia,
                                           nlp(resumidor(peticao_referencia)),
                                           metric='canberra'))
def get_word_movers(text_1, text_2):
    """
    Compare two texts with textacy's Word Mover's Distance.

    The resulting score lies between 0.0 (no similarity) and 1.0
    (full similarity).

    :param text_1: First text
    :param text_2: Second text
    :return: Tuple of the word-movers score and a formatted string
        containing both processed texts
    """
    first_doc, first_processed = NLPService.get_textacy_doc(text_1)
    second_doc, second_processed = NLPService.get_textacy_doc(text_2)
    score = word_movers(first_doc, second_doc, metric="cosine")
    summary = f"Zpracovaný text 1:\n{first_processed}\n\nZpracovaný text 2:\n{second_processed}"
    return score, summary
def test_word_movers_identity(doc1, doc2):
    """A document compared with itself scores ~1.0."""
    self_score = similarity.word_movers(doc1, doc1)
    assert self_score == pytest.approx(1.0, rel=1e-3)
def test_word_movers_metrics(doc1, doc2):
    """Every supported distance metric yields a score within [0.0, 1.0]."""
    for metric_name in ["cosine", "l1", "manhattan", "l2", "euclidean"]:
        score = similarity.word_movers(doc1, doc2, metric=metric_name)
        assert 0.0 <= score <= 1.0
def similar_tags(resumo_ref, tag_list):
    """Yield (index, canberra Word Mover's similarity) for each tag in
    *tag_list* compared against the reference summary.

    :param resumo_ref: reference summary text, parsed with ``nlp``
    :param tag_list: iterable of tag texts to compare
    """
    referencia = nlp(resumo_ref)
    # BUG FIX: replaced tag_list.index(ref) with enumerate() — .index() is
    # an O(n) scan per item and returns the first occurrence's index, which
    # is wrong when tags repeat.
    for idx, ref in enumerate(tag_list):
        yield (idx, similarity.word_movers(referencia, nlp(ref),
                                           metric='canberra'))
def score_summary_2(summary_text):
    """Score a summarized piece of text.

    Combines inter-paragraph similarity, Coh-Metrix-style overlap scores,
    readability distance from a baseline, subjectivity, and several
    heuristic bonuses/penalties into a single weighted score.
    """
    # Want high similarity between paragraphs
    inter_paragraph_similarities = []
    avg_similarity = None
    # Sentence texts as plain strings (spaCy pipeline `NLP` defined elsewhere).
    sentences = [i.text for i in NLP(summary_text).sents]
    # readability measures close to ebert baseline:
    # normalized absolute distance from the EBERT_READABILITY constant.
    readability = abs(
        text_stats.TextStats(NLP(summary_text)).automated_readability_index -
        EBERT_READABILITY) / EBERT_READABILITY
    # Coh Metrix Indices (helpers defined elsewhere in this file)
    anaphor_score = anaphor_overlap(summary_text)
    person_score = person_overlap(summary_text)
    # more subjective is better — mean TextBlob subjectivity over sentences.
    # NOTE(review): divides by len(sentences) — raises ZeroDivisionError on
    # an input with no sentences; confirm callers never pass empty text.
    total_subjectivity = 0
    for i in sentences:
        total_subjectivity += TextBlob(i).sentiment[1]
    subjectivity = total_subjectivity / len(sentences)
    # thesis sentence doesn't have "this", "here", "it" — penalize vague
    # openers in the first non-blank sentence.
    # NOTE(review): accessing sentences[1] assumes at least two sentences
    # when the first is blank — IndexError otherwise; verify upstream.
    if sentences[0] not in [' ', '', '\n']:
        thesis_penalty = sum(i in sentences[0]
                             for i in [" this ", " This ", " here ", " Here"])
    elif sentences[1] not in [' ', '', '\n']:
        thesis_penalty = sum(i in sentences[1]
                             for i in [" this ", " This ", " here ", " Here"])
    else:
        thesis_penalty = 0
    # Prefer expressions from the author — count first-person sentences.
    author_count = 0
    for s in sentences:
        if any(i in s for i in ["I ", "I'd", "My"]):
            author_count += 1
    # iterate through the paragraphs
    # sentiment within a paragraph is similar
    # Word Mover's similarity between each pair of adjacent non-blank paragraphs.
    paragraphs = summary_text.split('\n')
    for i in range(1, len(paragraphs)):
        if paragraphs[i - 1] not in [' ', '', '\n'] and paragraphs[i] not in [
                ' ', '', '\n'
        ]:
            inter_paragraph_similarities.append(
                similarity.word_movers(NLP(paragraphs[i - 1]),
                                       NLP(paragraphs[i])))
    # Largest within-paragraph sentiment spread across '.'-split sentences.
    max_diff = 0
    for p in paragraphs:
        p_sent_min = None
        p_sent_max = None
        for s in p.split('.'):
            sent = TextBlob(s).sentiment[0]
            if p_sent_min is None:
                p_sent_min = sent
            if p_sent_max is None:
                p_sent_max = sent
            if sent < p_sent_min:
                p_sent_min = sent
            if sent > p_sent_max:
                p_sent_max = sent
        if max_diff < abs(p_sent_max - p_sent_min):
            max_diff = abs(p_sent_max - p_sent_min)
    # Invert so that uniform sentiment within paragraphs scores high.
    max_diff = 1 - max_diff
    # NOTE(review): ZeroDivisionError when no adjacent non-blank paragraph
    # pairs exist (e.g. single-paragraph input) — confirm inputs are
    # always multi-paragraph.
    avg_similarity = sum(inter_paragraph_similarities) / len(
        inter_paragraph_similarities)
    # Make score — fixed weighted blend of the component scores.
    score = (0.25 * avg_similarity) + \
        (0.20 * person_score) + \
        (0.15 * anaphor_score) + \
        (0.1 * max_diff) + \
        (0.05 * readability) + \
        (0.25 * subjectivity)
    # boost by person count
    score = score * (1 + (0.1 * author_count))
    score = score - (0.2 * thesis_penalty)
    return score
def test_word_movers_metrics(doc1, doc2):
    """Word Mover's scores stay within [0.0, 1.0] for all supported metrics."""
    metric_names = ('cosine', 'l1', 'manhattan', 'l2', 'euclidean')
    scores = [similarity.word_movers(doc1, doc2, metric=m) for m in metric_names]
    assert all(0.0 <= s <= 1.0 for s in scores)