コード例 #1
0
def bnExtractDocSimilarity(doc1, doc2, similarity):
    """Measure the semantic similarity between two documents using
       Word Mover's Distance (or word2vec). Uses the textacy API:
       textacy.similarity.word_movers(doc1, doc2, metric='cosine')

    :param doc1: First spacy/textacy document.
    :param doc2: Second spacy/textacy document.
    :param similarity: Method name: 'cosine', 'Euclidian', 'Manhattan',
        or 'word2vec'.  Any other value yields a score of 0.
    :return: Similarity score rounded to 5 decimal places.
    """

    # FIX: alias the import -- the original `from textacy import similarity`
    # shadowed the `similarity` parameter, so every string comparison below
    # compared a module to a string, always hit the `else` branch, and the
    # function unconditionally returned 0.
    from textacy import similarity as txt_similarity

    if similarity == 'cosine':
        # Metric can be cosine, euclidean, l1, l2, or manhattan
        s = txt_similarity.word_movers(doc1, doc2, metric='cosine')
        print(" Cosine Similarity between docs {} and {} is: {}".format(
                  bnGetDocName(doc1), bnGetDocName(doc2), s))
    elif similarity == 'Euclidian':
        # FIX: textacy spells this metric 'euclidean'; the original
        # 'euclidian' is not a valid metric name.
        s = txt_similarity.word_movers(doc1, doc2, metric='euclidean')
        print(" Euclidian Similarity between docs {} and {} is: {}".format(
                  bnGetDocName(doc1), bnGetDocName(doc2), s))
    elif similarity == 'Manhattan':
        s = txt_similarity.word_movers(doc1, doc2, metric='manhattan')
        print(" Manhattan Similarity between docs {} and {} is: {}".format(
                  bnGetDocName(doc1), bnGetDocName(doc2), s))
    elif similarity == 'word2vec':
        s = txt_similarity.word2vec(doc1, doc2)
        print(" Semantic Similarity between docs {} and {} is: {}".format(
                  bnGetDocName(doc1), bnGetDocName(doc2), s))
    else:
        # Unsupported similarity method
        s = 0

    return round(s, 5)
コード例 #2
0
 def test_identity(self, doc_pairs):
     """A document compared against itself scores ~1.0 similarity."""
     for first, second in doc_pairs[:2]:  # HACK
         print(first, second)
         for doc in (first, second):
             assert similarity.word_movers(doc, doc) == pytest.approx(
                 1.0, rel=1e-3)
コード例 #3
0
def similar_ementas(ref, list):
    """Yield (index, cosine Word Mover's similarity) for each text in
    *list* compared against the reference text *ref*.

    :param ref: Reference text, processed by the module-level ``nlp``.
    :param list: Iterable of texts to compare.  (Parameter name kept for
        backward compatibility even though it shadows the builtin.)
    """
    referencia = nlp(ref)
    # FIX: enumerate() instead of list.index() -- index() does an O(n)
    # scan per item and always returns the FIRST occurrence, producing
    # wrong indices when the list contains duplicate texts.
    for idx, reference in enumerate(list):
        yield (idx,
               similarity.word_movers(referencia,
                                      nlp(reference),
                                      metric='cosine'))
コード例 #4
0
def similar(arquivo_referencia, lista_pets_escritorio):
    """Yield (index, canberra Word Mover's similarity) for each petition
    in *lista_pets_escritorio* compared against the summarized reference
    file.

    :param arquivo_referencia: Reference document, run through
        ``resumidor`` then ``nlp``.
    :param lista_pets_escritorio: Iterable of petition texts to compare.
    """
    referencia = nlp(resumidor(arquivo_referencia))
    # FIX: enumerate() instead of .index() -- index() is an O(n) scan per
    # item and returns the first occurrence, which yields wrong indices
    # for duplicate petitions.
    for idx, peticao_referencia in enumerate(lista_pets_escritorio):
        yield (idx,
               similarity.word_movers(referencia,
                                      nlp(resumidor(peticao_referencia)),
                                      metric='canberra'))
コード例 #5
0
    def get_word_movers(text_1, text_2):
        """Compare two texts via textacy's Word Mover's metric (cosine).

        The score ranges from 0.0 (no similarity) to 1.0 (full
        similarity).
        :param text_1: First text
        :param text_2: Second text
        :return: Tuple of (word-movers score, human-readable string
            showing both preprocessed texts)
        """
        doc_a, clean_a = NLPService.get_textacy_doc(text_1)
        doc_b, clean_b = NLPService.get_textacy_doc(text_2)

        score = word_movers(doc_a, doc_b, metric="cosine")
        report = f"Zpracovaný text 1:\n{clean_a}\n\nZpracovaný text 2:\n{clean_b}"

        return score, report
コード例 #6
0
ファイル: test_similarity.py プロジェクト: zf109/textacy
def test_word_movers_identity(doc1, doc2):
    """Word Mover's similarity of a document with itself is ~1.0."""
    score = similarity.word_movers(doc1, doc1)
    assert score == pytest.approx(1.0, rel=1e-3)
コード例 #7
0
ファイル: test_similarity.py プロジェクト: zf109/textacy
def test_word_movers_metrics(doc1, doc2):
    """Every supported distance metric yields a score in [0.0, 1.0]."""
    for metric in ["cosine", "l1", "manhattan", "l2", "euclidean"]:
        score = similarity.word_movers(doc1, doc2, metric=metric)
        assert 0.0 <= score <= 1.0
コード例 #8
0
def similar_tags(resumo_ref, tag_list):
    """Yield (index, canberra Word Mover's similarity) for each tag text
    in *tag_list* compared against the reference summary.

    :param resumo_ref: Reference summary text, processed by ``nlp``.
    :param tag_list: Iterable of tag texts to compare.
    """
    referencia = nlp(resumo_ref)
    # FIX: enumerate() instead of tag_list.index(ref) -- index() is an
    # O(n) scan per item and returns the first occurrence, producing
    # wrong indices when tags repeat.
    for idx, ref in enumerate(tag_list):
        yield (idx,
               similarity.word_movers(referencia, nlp(ref), metric='canberra'))
コード例 #9
0
def score_summary_2(summary_text):
    """Score a summarized piece of text.

    Combines, with fixed weights: average inter-paragraph Word Mover's
    similarity, Coh-Metrix person/anaphor overlap, readability distance
    from the Ebert baseline, average sentence subjectivity, intra-
    paragraph sentiment spread, an author-voice boost, and a penalty for
    deictic thesis sentences.

    :param summary_text: Summary text; paragraphs separated by '\\n'.
    :return: Weighted score (float).
    """
    # Want high similarity between paragraphs
    inter_paragraph_similarities = []

    sentences = [i.text for i in NLP(summary_text).sents]

    # readability measures close to ebert baseline
    readability = abs(
        text_stats.TextStats(NLP(summary_text)).automated_readability_index -
        EBERT_READABILITY) / EBERT_READABILITY

    # Coh Metrix Indices
    anaphor_score = anaphor_overlap(summary_text)
    person_score = person_overlap(summary_text)

    # more subjective is better
    # FIX: guard the division -- empty input produced no sentences and
    # the original raised ZeroDivisionError.
    total_subjectivity = 0
    for i in sentences:
        total_subjectivity += TextBlob(i).sentiment[1]
    subjectivity = total_subjectivity / len(sentences) if sentences else 0

    # thesis sentence doesn't have "this", "here", "it"
    # FIX: length-guard the indexing -- the original accessed
    # sentences[1] unconditionally and raised IndexError on summaries
    # with a single (blank-ish) sentence.
    deictic_terms = [" this ", " This ", " here ", " Here"]
    blankish = [' ', '', '\n']
    if sentences and sentences[0] not in blankish:
        thesis_penalty = sum(t in sentences[0] for t in deictic_terms)
    elif len(sentences) > 1 and sentences[1] not in blankish:
        thesis_penalty = sum(t in sentences[1] for t in deictic_terms)
    else:
        thesis_penalty = 0

    # Prefer expressions from the author
    author_count = 0
    for s in sentences:
        if any(i in s for i in ["I ", "I'd", "My"]):
            author_count += 1

    # iterate through the paragraphs
    # sentiment within a paragraph is similar
    paragraphs = summary_text.split('\n')
    for i in range(1, len(paragraphs)):
        if paragraphs[i - 1] not in blankish and paragraphs[i] not in blankish:
            inter_paragraph_similarities.append(
                similarity.word_movers(NLP(paragraphs[i - 1]),
                                       NLP(paragraphs[i])))

    # largest polarity spread inside any single paragraph
    # (p.split('.') always yields at least one element, so min/max are safe)
    max_diff = 0
    for p in paragraphs:
        polarities = [TextBlob(s).sentiment[0] for s in p.split('.')]
        spread = max(polarities) - min(polarities)
        if spread > max_diff:
            max_diff = spread
    max_diff = 1 - max_diff

    # FIX: guard the division -- a single-paragraph summary yields no
    # inter-paragraph pairs and the original raised ZeroDivisionError.
    if inter_paragraph_similarities:
        avg_similarity = (sum(inter_paragraph_similarities) /
                          len(inter_paragraph_similarities))
    else:
        avg_similarity = 0

    # Make score (weights sum to 1.0 before boost/penalty)
    score = (0.25 * avg_similarity) + \
            (0.20 * person_score) + \
            (0.15 * anaphor_score) + \
            (0.1 * max_diff) + \
            (0.05 * readability) + \
            (0.25 * subjectivity)
    # boost by person count
    score = score * (1 + (0.1 * author_count))
    score = score - (0.2 * thesis_penalty)

    return score
コード例 #10
0
def test_word_movers_metrics(doc1, doc2):
    """All supported distance metrics stay within the unit interval."""
    supported = ('cosine', 'l1', 'manhattan', 'l2', 'euclidean')
    for m in supported:
        value = similarity.word_movers(doc1, doc2, metric=m)
        assert 0.0 <= value <= 1.0