# -*- coding: utf-8 -*-
"""Test snippets collected from the sumy test suite
(https://github.com/miso-belica/sumy). The imports and shared helpers the
snippets rely on are gathered here so they can run standalone."""

import pytest

from sumy._compat import to_unicode
from sumy.models.dom import Sentence
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers import lex_rank as lex_rank_module
from sumy.summarizers import lsa as lsa_module
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.reduction import ReductionSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

from tests.utils import build_document  # document-building helper from the sumy test suite

EMPTY_STOP_WORDS = []


def _build_summarizer(stop_words):
    # Helper from the original SumBasic tests: the _compute_* methods
    # exercised below are SumBasicSummarizer internals.
    summarizer = SumBasicSummarizer()
    summarizer.stop_words = stop_words
    return summarizer


@pytest.fixture
def summarizer():
    # The snippets taking this fixture come from different modules in the
    # original suite; a TextRank instance is assumed here.
    return TextRankSummarizer()


def test_compute_ratings():
    summarizer = _build_summarizer(EMPTY_STOP_WORDS)

    s0 = Sentence("Dog cat fish.", Tokenizer("english"))
    s1 = Sentence("Dog cat camel.", Tokenizer("english"))
    s2 = Sentence("Fish frog horse.", Tokenizer("english"))
    document = build_document([s0, s1, s2])

    ratings = summarizer._compute_ratings(document.sentences)
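    # SumBasic ratings encode pick order: the first sentence chosen gets 0,
    # the next -1, and so on, so less negative means better.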
    assert ratings[s0] == 0
    assert ratings[s1] == -2
    assert ratings[s2] == -1

    # Due to the frequency discounting, after s0 is picked first,
    # s2 should be picked before s1: only one of s2's words ("three")
    # gets discounted, versus two of s1's ("one" and "two").
    s0 = Sentence("one two three", Tokenizer("english"))
    s1 = Sentence("one two four", Tokenizer("english"))
    s2 = Sentence("three five six", Tokenizer("english"))
    document = build_document([s0, s1, s2])

    ratings = summarizer._compute_ratings(document.sentences)
    assert ratings[s0] == 0
    assert ratings[s1] == -2
    assert ratings[s2] == -1
def test_the_sentences_should_be_in_different_order(summarizer):
    """https://github.com/miso-belica/sumy/issues/146"""
    paragraphs = [
        ["This is 1st sentence.", "This is 2nd sentence."],
        ["This is 3rd sentence.", "This is 4th sentence."],
        ["This is 5th sentence."],
    ]
    document = build_document(*paragraphs)
    reversed_document = build_document(*(reversed(p)
                                         for p in reversed(paragraphs)))

    sentences = summarizer(document, "100%")
    reversed_sentences = summarizer(reversed_document, "100%")
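    # Requesting "100%" keeps every sentence, so both summaries must contain
    # the same sentences, just in opposite order.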

    assert tuple(reversed(sentences)) == reversed_sentences
def test_cue_3():
    document = build_document(
        (
            "ba "*10,
            "bb "*10,
            " sa"*8 + " bb"*10,
            "bb bc ba",
        ),
        (),
        (
            "babbbc "*10,
            "na nb nc nd sa" + " bc"*10,
            " ba n"*10,
        )
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)
    summarizer.stigma_words = ("sa", "sb", "sc",)

    sentences = summarizer.cue_method(document, 5)
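    # cue_method rates each sentence by its bonus-word hits minus its
    # stigma-word hits and returns the five best in document order.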

    assert list(map(to_unicode, sentences)) == [
        ("ba "*10).strip(),
        ("bb "*10).strip(),
        "bb bc ba",
        "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc",
        ("ba n "*10).strip(),
    ]
def test_location_method_with_empty_document():
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("na", "nb", "nc",)

    sentences = summarizer.location_method(build_document(), 10)
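    # An empty document has nothing to rate, so an empty selection is
    # expected rather than an exception.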

    assert list(map(to_unicode, sentences)) == []
def test_key_empty():
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)

    sentences = summarizer.key_method(build_document(), 10)

    assert list(map(to_unicode, sentences)) == []
def test_three_sentences():
    document = build_document((
        "wa s s s wa s s s wa",
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("s", )

    returned = summarizer(document, 1)
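    # With "s" as a stop word only wa/wb/wc are significant; the wb sentence
    # scores highest because Luhn rewards the densest chunk of significant
    # words (significant_words**2 / chunk_length).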
    assert list(map(to_unicode, returned)) == [
        "wb s wb s wb s s s s s s s s s wb",
    ]

    returned = summarizer(document, 2)
    assert list(map(to_unicode, returned)) == [
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ]

    returned = summarizer(document, 3)
    assert list(map(to_unicode, returned)) == [
        "wa s s s wa s s s wa",
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ]
def test_single_sentence(summarizer):
    s = Sentence("I am one slightly longer sentence.", Tokenizer("english"))
    document = build_document([s])

    returned = summarizer(document, 10)

    assert len(returned) == 1
def test_various_words_with_significant_percentage():
    document = build_document((
        "1 a",
        "2 b b",
        "3 c c c",
        "4 d d d",
        "5 z z z z",
        "6 e e e e e",
    ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("1", "2", "3", "4", "5", "6")

    returned = summarizer(document, 1)
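    # The digits are all stop words, so each sentence is rated only by its
    # repeated letter; more repetitions yield a higher significance score.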
    assert list(map(to_unicode, returned)) == [
        "6 e e e e e",
    ]

    returned = summarizer(document, 2)
    assert list(map(to_unicode, returned)) == [
        "5 z z z z",
        "6 e e e e e",
    ]

    returned = summarizer(document, 3)
    assert list(map(to_unicode, returned)) == [
        "3 c c c",
        "5 z z z z",
        "6 e e e e e",
    ]
def test_single_sentence():
    document = build_document(("I am the sentence you like",))
    summarizer = LsaSummarizer()
    summarizer.stopwords = ("I", "am", "the",)

    sentences = summarizer(document, 10)
    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "I am the sentence you like"
def test_numpy_not_installed():
    summarizer = LsaSummarizer()

    numpy = lsa_module.numpy
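    # Simulate numpy being unavailable: stash the module-level reference,
    # clear it, and restore it after the assertion.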
    lsa_module.numpy = None

    with pytest.raises(ValueError):
        summarizer(build_document(), 10)

    lsa_module.numpy = numpy
def test_numpy_not_installed():
    summarizer = LexRankSummarizer()

    numpy = lex_rank_module.numpy
    lex_rank_module.numpy = None

    with pytest.raises(ValueError):
        summarizer(build_document(), 10)

    lex_rank_module.numpy = numpy
def test_single_sentence():
    document = build_document(("Já jsem jedna věta", ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = (
        "já",
        "jsem",
    )

    returned = summarizer(document, 10)
    assert len(returned) == 1
def test_single_sentence():
    document = build_document(("I am one sentence", ))
    summarizer = TextRankSummarizer()
    summarizer.stop_words = (
        "I",
        "am",
    )

    returned = summarizer(document, 10)
    assert len(returned) == 1
def test_three_sentences_but_second_winner():
    document = build_document([
        "I am that 1. sentence",
        "And I am 2. sentence - winning sentence",
        "And I am 3. sentence - winner is my 2nd name",
    ])
    summarizer = ReductionSummarizer()
    summarizer.stop_words = ["I", "am", "and", "that"]

    returned = summarizer(document, 1)
    assert len(returned) == 1
    assert to_unicode(returned[0]) == "And I am 2. sentence - winning sentence"
def test_cue_1():
    document = build_document(
        ("ba bb bc bb unknown ľščťžýáíé sb sc sb",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)
    summarizer.stigma_words = ("sa", "sb", "sc",)

    sentences = summarizer.cue_method(document, 10)

    assert len(sentences) == 1
def test_compute_tf():
    summarizer = _build_summarizer(EMPTY_STOP_WORDS)
    s0 = Sentence("kicking soccer balls.", Tokenizer("english"))
    s1 = Sentence("eating chicken dumplings.", Tokenizer("english"))
    document = build_document([s0, s1])
    freq = summarizer._compute_tf(document.sentences)
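    # Term frequency is normalized by the document's total word count:
    # six words, each occurring once, gives 1/6 apiece.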
    assert freq["kicking"] == 1 / 6
    assert freq["soccer"] == 1 / 6
    assert freq["balls"] == 1 / 6
    assert freq["eating"] == 1 / 6
    assert freq["chicken"] == 1 / 6
    assert freq["dumplings"] == 1 / 6

    document = build_document([s0, s0, s1])
    freq = summarizer._compute_tf(document.sentences)
    assert freq["kicking"] == 2 / 9
    assert freq["soccer"] == 2 / 9
    assert freq["balls"] == 2 / 9
    assert freq["eating"] == 1 / 9
    assert freq["chicken"] == 1 / 9
    assert freq["dumplings"] == 1 / 9
def test_key_1():
    document = build_document(
        ("wa wb wc wd", "I like music",),
        ("This is test sentence with some extra words and bonus",)
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc", "bonus",)

    sentences = summarizer.key_method(document, 1)
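    # Of the configured bonus words only "bonus" occurs in the document,
    # so the sentence containing it is the single best pick.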

    assert list(map(to_unicode, sentences)) == [
        "This is test sentence with some extra words and bonus",
    ]
def test_get_all_content_words_in_doc():
    summarizer = _build_summarizer(EMPTY_STOP_WORDS)
    s0 = Sentence("One two three.", Tokenizer("english"))
    s1 = Sentence("One two three.", Tokenizer("english"))
    document = build_document([s0, s1])

    content_words = summarizer._get_all_content_words_in_doc(
        document.sentences)
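    # Content words are lower-cased and punctuation is dropped, so the two
    # identical sentences contribute "one", "two" and "three" twice each.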
    content_words_freq = {}
    for w in content_words:
        content_words_freq[w] = content_words_freq.get(w, 0) + 1
    content_words_correct = {"one": 2, "two": 2, "three": 2}
    assert content_words_freq == content_words_correct
def test_sentences_rating():
    document = build_document([
        "a c e g",
        "a b c d e f g",
        "b d f",
    ])
    summarizer = ReductionSummarizer()
    summarizer.stop_words = ["I", "am", "and", "that"]

    ratings = summarizer.rate_sentences(document)
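    # Reduction rates each sentence by its total similarity to the others;
    # the middle sentence overlaps both neighbours and therefore ranks first.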
    assert len(ratings) == 3
    assert ratings[document.sentences[1]] > ratings[document.sentences[0]]
    assert ratings[document.sentences[0]] > ratings[document.sentences[2]]
def test_dictionary_without_stop_words():
    summarizer = LsaSummarizer()
    summarizer.stop_words = ["stop", "Halt", "SHUT", "HmMm"]

    document = build_document(
        ("stop halt shut hmmm", "Stop Halt Shut Hmmm",),
        ("StOp HaLt ShUt HmMm", "STOP HALT SHUT HMMM",),
        ("Some relevant sentence", "Some moRe releVant sentEnce",),
    )

    expected = frozenset(["some", "more", "relevant", "sentence"])
    dictionary = summarizer._create_dictionary(document)
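    # Stop words are matched case-insensitively, so every casing variant of
    # "stop halt shut hmmm" is filtered out of the dictionary.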

    assert expected == frozenset(dictionary.keys())
def test_key_2():
    document = build_document(
        ("Om nom nom nom nom", "Sure I summarize it, with bonus",),
        ("This is bonus test sentence with some extra words and bonus",)
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("nom", "bonus",)

    sentences = summarizer.key_method(document, 2)

    assert list(map(to_unicode, sentences)) == [
        "Om nom nom nom nom",
        "This is bonus test sentence with some extra words and bonus",
    ]
def test_document():
    document = build_document(
        ("I am the sentence you like", "Do you like me too",),
        ("This sentence is better than that above", "Are you kidding me",)
    )
    summarizer = LsaSummarizer()
    summarizer.stop_words = (
        "I", "am", "the", "you", "are", "me", "is", "than", "that", "this",
    )

    sentences = summarizer(document, 2)
    assert len(sentences) == 2
    assert to_unicode(sentences[0]) == "I am the sentence you like"
    assert to_unicode(sentences[1]) == "This sentence is better than that above"
def test_two_sentences_but_one_winner():
    document = build_document(
        ("Já jsem 1. vítězná ta věta", "A já ta 2. vítězná věta"))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = (
        "já",
        "jsem",
        "a",
        "ta",
    )

    returned = summarizer(document, 1)
    assert list(map(to_unicode, returned)) == [
        "A já ta 2. vítězná věta",
    ]
def test_two_sentences():
    document = build_document(
        ("I am that 1. sentence", "And I am 2. winning prize"))
    summarizer = TextRankSummarizer()
    summarizer.stop_words = (
        "I",
        "am",
        "and",
        "that",
    )

    returned = summarizer(document, 10)
    assert len(returned) == 2
    assert to_unicode(returned[0]) == "I am that 1. sentence"
    assert to_unicode(returned[1]) == "And I am 2. winning prize"
def test_title_method_without_title():
    document = build_document(
        ("This is sentence", "This is another one",),
        ("And some next sentence but no heading",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("this", "is", "some", "and",)

    sentences = summarizer.title_method(document, 10)
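    # Without any headings there are no title words, so all sentences tie
    # and come back in document order.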
    assert list(map(to_unicode, sentences)) == [
        "This is sentence",
        "This is another one",
        "And some next sentence but no heading",
    ]
def test_rating_with_zero_or_single_words_in_sentences(sentences,
                                                       expected_ratings):
    """
    Edge case: sentences containing a single word, or no words at all.
    Ensures the rating logic does not break on such input.

    The `sentences`/`expected_ratings` pairs are supplied by a
    `@pytest.mark.parametrize` decorator in the original test module,
    omitted here.
    """
    document = build_document(sentences)
    summarizer = TextRankSummarizer()

    ratings = summarizer.rate_sentences(document)

    assert ratings == {
        document.sentences[0]: pytest.approx(expected_ratings[0]),
        document.sentences[1]: pytest.approx(expected_ratings[1]),
    }
def test_sentences_rating():
    document = build_document([
        "a c e g",
        "a b c d e f g",
        "b d f",
    ])
    summarizer = TextRankSummarizer()

    ratings = summarizer.rate_sentences(document)
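    # TextRank ratings are the PageRank distribution over the
    # sentence-similarity graph, hence they sum to 1 (checked below).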

    assert ratings == {
        document.sentences[0]: pytest.approx(0.29714368215098025),
        document.sentences[1]: pytest.approx(0.42683373199392705),
        document.sentences[2]: pytest.approx(0.2760223553913001),
    }
    assert pytest.approx(sum(ratings.values())) == 1
def test_cue_letters_case():
    document = build_document(
        ("X X X", "x x x x",),
        ("w w w", "W W W W",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("X", "w",)
    summarizer.stigma_words = ("stigma",)

    sentences = summarizer.cue_method(document, 2)
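    # Bonus words are matched case-insensitively, so the longer variant of
    # each sentence collects more bonus hits and is selected.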

    assert list(map(to_unicode, sentences)) == [
        "x x x x",
        "W W W W",
    ]
def test_unique_words():
    document = build_document(
        ("Nějaký muž šel kolem naší zahrady", "Nějaký muž šel kolem vaší zahrady",),
        ("Už už abych taky šel",),
    )
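    # document.words keeps the original casing, so "Už" and "už" are counted
    # as two distinct words below.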

    assert sorted(frozenset(document.words)) == [
        "Nějaký",
        "Už",
        "abych",
        "kolem",
        "muž",
        "naší",
        "taky",
        "už",
        "vaší",
        "zahrady",
        "šel",
    ]
def test_key_3():
    document = build_document(
        ("wa", "wa wa", "wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa",),
        ("x X x X",)
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("wa", "X",)

    sentences = summarizer.key_method(document, 3)
    assert list(map(to_unicode, sentences)) == [
        "wa wa wa",
        "wa wa wa wa",
        "wa Wa Wa Wa wa",
    ]

    sentences = summarizer.key_method(document, 3, weight=0)
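    # weight=0 drops the relative-frequency threshold for significant bonus
    # words, so the rarer "X" now counts as well and "x X x X" is selected.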
    assert list(map(to_unicode, sentences)) == [
        "wa wa wa wa",
        "wa Wa Wa Wa wa",
        "x X x X",
    ]