Beispiel #1
1
def test_key_empty():
    """The key method selects nothing from a document built without paragraphs."""
    edmundson = EdmundsonSummarizer()
    edmundson.bonus_words = ("ba", "bb", "bc")

    selected = edmundson.key_method(build_document(), 10)

    assert [to_unicode(sentence) for sentence in selected] == []
def test_cue_2():
    """The cue method returns both sentences for count 10 and only the first for count 1."""
    first = "ba bb bc bb unknown ľščťžýáíé sb sc sb"
    second = "Pepek likes spinach"
    document = build_document((first,), (second,))

    edmundson = EdmundsonSummarizer()
    edmundson.bonus_words = ("ba", "bb", "bc")
    edmundson.stigma_words = ("sa", "sb", "sc")

    selected = edmundson.cue_method(document, 10)
    assert [to_unicode(s) for s in selected] == [first, second]

    selected = edmundson.cue_method(document, 1)
    assert [to_unicode(s) for s in selected] == [first]
Beispiel #3
0
    def test_cue_2(self):
        """The cue method returns both sentences for count 10 and only the first for count 1."""
        first = "ba bb bc bb unknown ľščťžýáíé sb sc sb"
        second = "Pepek likes spinach"
        document = build_document((first,), (second,))

        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("ba", "bb", "bc")
        edmundson.stigma_words = ("sa", "sb", "sc")

        # Asking for more sentences than exist yields every sentence.
        selected = edmundson.cue_method(document, 10)
        self.assertEqual(len(selected), 2)
        self.assertEqual(to_unicode(selected[0]), first)
        self.assertEqual(to_unicode(selected[1]), second)

        # Asking for a single sentence yields the first one.
        selected = edmundson.cue_method(document, 1)
        self.assertEqual(len(selected), 1)
        self.assertEqual(to_unicode(selected[0]), first)
Beispiel #4
0
    def test_cue_3(self):
        """The cue method picks the expected five sentences from a three-paragraph document."""
        first_paragraph = (
            "ba " * 10,
            "bb " * 10,
            " sa" * 8 + " bb" * 10,
            "bb bc ba",
        )
        third_paragraph = (
            "babbbc " * 10,
            "na nb nc nd sa" + " bc" * 10,
            " ba n" * 10,
        )
        # The second paragraph is intentionally empty.
        document = build_document(first_paragraph, (), third_paragraph)

        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("ba", "bb", "bc")
        edmundson.stigma_words = ("sa", "sb", "sc")

        selected = edmundson.cue_method(document, 5)
        self.assertEqual(len(selected), 5)

        expected_texts = (
            ("ba " * 10).strip(),
            ("bb " * 10).strip(),
            "bb bc ba",
            "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc",
            ("ba n " * 10).strip(),
        )
        for sentence, expected in zip(selected, expected_texts):
            self.assertEqual(to_unicode(sentence), expected)
Beispiel #5
0
    def test_key_3(self):
        """Check the key method's selection with the default weight and with weight=0."""
        document = build_document(
            ("wa", "wa wa", "wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa"),
            ("x X x X",),
        )
        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("wa", "X")

        selected = edmundson.key_method(document, 3)
        self.assertEqual(len(selected), 3)
        for sentence, expected in zip(
                selected, ("wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa")):
            self.assertEqual(to_unicode(sentence), expected)

        selected = edmundson.key_method(document, 3, weight=0)
        self.assertEqual(len(selected), 3)
        for sentence, expected in zip(
                selected, ("wa wa wa wa", "wa Wa Wa Wa wa", "x X x X")):
            self.assertEqual(to_unicode(sentence), expected)
Beispiel #6
0
    def test_mixed_cue_key(self):
        """Calling the summarizer with only cue and key weights enabled picks the expected two sentences."""
        document = build_document_from_string("""
            # This is cool heading
            Because I am sentence I like words
            And because I am string I like characters

            # blank and heading
            This is next paragraph because of blank line above
            Here is the winner because contains words like cool and heading
        """)

        # Title and location weights are set to zero.
        weights = dict(cue_weight=1, key_weight=1, title_weight=0,
                       location_weight=0)
        edmundson = EdmundsonSummarizer(**weights)
        edmundson.bonus_words = (
            "cool", "heading", "sentence", "words", "like", "because",
        )
        edmundson.stigma_words = ("this", "is", "I", "am", "and")

        selected = edmundson(document, 2)
        self.assertEqual(len(selected), 2)
        first, second = selected[0], selected[1]
        self.assertEqual(to_unicode(first),
                         "Because I am sentence I like words")
        self.assertEqual(
            to_unicode(second),
            "Here is the winner because contains words like cool and heading")
def test_cue_3():
    """The cue method picks the expected five sentences from a three-paragraph document."""
    first_paragraph = (
        "ba " * 10,
        "bb " * 10,
        " sa" * 8 + " bb" * 10,
        "bb bc ba",
    )
    third_paragraph = (
        "babbbc " * 10,
        "na nb nc nd sa" + " bc" * 10,
        " ba n" * 10,
    )
    # The second paragraph is intentionally empty.
    document = build_document(first_paragraph, (), third_paragraph)

    edmundson = EdmundsonSummarizer()
    edmundson.bonus_words = ("ba", "bb", "bc")
    edmundson.stigma_words = ("sa", "sb", "sc")

    selected = edmundson.cue_method(document, 5)

    expected = [
        ("ba " * 10).strip(),
        ("bb " * 10).strip(),
        "bb bc ba",
        "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc",
        ("ba n " * 10).strip(),
    ]
    assert [to_unicode(s) for s in selected] == expected
def test_key_3():
    """Check the key method's selection with the default weight and with weight=0."""
    document = build_document(
        ("wa", "wa wa", "wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa"),
        ("x X x X",),
    )
    edmundson = EdmundsonSummarizer()
    edmundson.bonus_words = ("wa", "X")

    with_default_weight = edmundson.key_method(document, 3)
    assert [to_unicode(s) for s in with_default_weight] == [
        "wa wa wa",
        "wa wa wa wa",
        "wa Wa Wa Wa wa",
    ]

    with_zero_weight = edmundson.key_method(document, 3, weight=0)
    assert [to_unicode(s) for s in with_zero_weight] == [
        "wa wa wa wa",
        "wa Wa Wa Wa wa",
        "x X x X",
    ]
    def edmunson(self, text):
        """Summarize German ``text``, keeping a percentage of its sentences.

        The percentage is read from the slider ``self.scale``. The number of
        sentences to keep is that percentage of the total sentence count,
        rounded with ``round``.

        Args:
            text: Plain text to summarize.

        Returns:
            The selected sentences concatenated into one string, each
            preceded by a single space (so a non-empty result starts with a
            space). A slider value of 0 requests zero sentences instead of
            raising ``ZeroDivisionError``.
        """
        # Choose the language.
        language = "german"
        # Read the percentage from the slider.
        percent = self.scale.get()

        # Tokenize the text and give the summarizer a stemmer.
        parser = PlaintextParser.from_string(text, Tokenizer(language))
        summarizer = Summarizer(Stemmer(language))

        # Define the specific word lists.
        # Bonus, stigma and null words are not meant to be used, but empty
        # input is not accepted, so placeholder tokens are assigned.
        summarizer.stop_words = get_stop_words(language)
        summarizer.bonus_words = ["nsdgdf"]
        summarizer.stigma_words = ["mtrtf"]
        summarizer.null_words = ["zngg"]

        # Count the sentences by requesting far more than any text contains.
        count = len(summarizer(parser.document, 10000000000))

        # Derive the sentence count from the percentage. Multiplying first
        # avoids the division by the slider value that failed at 0 %.
        sentence_number = round(count * percent / 100)

        # Join the selected sentences into one text.
        return "".join(
            " " + str(sentence)
            for sentence in summarizer(parser.document, sentence_number)
        )
Beispiel #10
0
    def test_cue_empty(self):
        """The cue method selects nothing from a document built without paragraphs."""
        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("ba", "bb", "bc")
        edmundson.stigma_words = ("sa", "sb", "sc")

        selected = edmundson.cue_method(build_document(), 10)

        self.assertEqual(len(selected), 0)
Beispiel #11
0
    def test_cue_3(self):
        """The cue method picks the expected five sentences from a three-paragraph document."""
        document = build_document(
            ("ba " * 10, "bb " * 10, " sa" * 8 + " bb" * 10, "bb bc ba"),
            (),  # intentionally empty paragraph
            ("babbbc " * 10, "na nb nc nd sa" + " bc" * 10, " ba n" * 10),
        )

        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("ba", "bb", "bc")
        edmundson.stigma_words = ("sa", "sb", "sc")

        selected = edmundson.cue_method(document, 5)
        self.assertEqual(len(selected), 5)

        expected_texts = [
            ("ba " * 10).strip(),
            ("bb " * 10).strip(),
            "bb bc ba",
            "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc",
            ("ba n " * 10).strip(),
        ]
        for position, expected in enumerate(expected_texts):
            self.assertEqual(to_unicode(selected[position]), expected)
Beispiel #12
0
    def test_bonus_words_property(self):
        """bonus_words starts as an empty frozenset and stores assigned words as a frozenset."""
        edmundson = EdmundsonSummarizer()
        self.assertEqual(edmundson.bonus_words, frozenset())

        assigned = ("word", "another", "and", "some", "next")
        edmundson.bonus_words = assigned

        self.assertIsInstance(edmundson.bonus_words, frozenset)
        self.assertEqual(edmundson.bonus_words, frozenset(assigned))
Beispiel #13
0
def test_bonus_words_property():
    """bonus_words starts as an empty frozenset and equals the frozenset of assigned words."""
    edmundson = EdmundsonSummarizer()
    assert edmundson.bonus_words == frozenset()

    assigned = ("word", "another", "and", "some", "next")
    edmundson.bonus_words = assigned
    assert edmundson.bonus_words == frozenset(assigned)
Beispiel #14
0
    def test_key_empty(self):
        """The key method selects nothing from a document built without paragraphs."""
        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("ba", "bb", "bc")

        selected = edmundson.key_method(build_document(), 10)

        self.assertEqual(len(selected), 0)
def test_cue_with_no_stigma_words():
    """The cue method raises ValueError when only bonus words were assigned."""
    edmundson = EdmundsonSummarizer()
    edmundson.bonus_words = ("great", "very", "beautiful")

    # No stigma words are assigned on purpose.
    with pytest.raises(ValueError):
        edmundson.cue_method(build_document(), 10)
Beispiel #16
0
    def test_cue_with_no_stigma_words(self):
        """The cue method raises ValueError when only bonus words were assigned."""
        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("great", "very", "beautiful")

        # The document is built outside the assertion so that only the
        # cue_method call itself is checked for the exception.
        cue_method = edmundson.cue_method
        document = build_document()
        with self.assertRaises(ValueError):
            cue_method(document, 10)
 def __summarize(self, parser):
     """Summarize ``parser.document`` with the Edmundson method.

     The summarizer is built with a stemmer for ``self.__language`` and asked
     for ``self.__sentences_count`` sentences.

     Args:
         parser: Object exposing the parsed text as ``parser.document``.

     Returns:
         The result of ``self.__join_sentences`` applied to the selected
         sentences.
     """
     summarizer = EdmundsonSummarizer(Stemmer(self.__language))
     # words of high importance
     summarizer.bonus_words = ('info', 'information', 'due', 'overdue',
                               'withdraw', 'balance', 'fee', 'letter',
                               'compliance', 'super')
     # Stigma and null words are given one placeholder token each. The
     # placeholder is wrapped in a tuple because the word-list properties
     # take a collection of words; a bare string would be split into single
     # letters, so ordinary one-letter words such as "a" would count as
     # stigma/null words.
     placeholder = ('zdfgthdvndadv',)
     summarizer.stigma_words = placeholder
     summarizer.null_words = placeholder
     final_sentences = summarizer(parser.document, self.__sentences_count)
     return self.__join_sentences(final_sentences)
Beispiel #18
0
    def test_cue_1(self):
        """The cue method returns the single sentence of a one-sentence document."""
        only_sentence = "ba bb bc bb unknown ľščťžýáíé sb sc sb"
        document = build_document((only_sentence,))

        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("ba", "bb", "bc")
        edmundson.stigma_words = ("sa", "sb", "sc")

        selected = edmundson.cue_method(document, 10)

        self.assertEqual(len(selected), 1)
def test_key_empty():
    """The key method selects nothing from a document built without paragraphs."""
    edmundson = EdmundsonSummarizer()
    edmundson.bonus_words = ("ba", "bb", "bc")

    selected = edmundson.key_method(build_document(), 10)
    texts = [to_unicode(sentence) for sentence in selected]

    assert texts == []
Beispiel #20
0
    def test_key_1(self):
        """The key method picks the only sentence that contains a bonus word."""
        bonus_sentence = "This is test sentence with some extra words and bonus"
        document = build_document(
            ("wa wb wc wd", "I like music"),
            (bonus_sentence,),
        )
        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("ba", "bb", "bc", "bonus")

        selected = edmundson.key_method(document, 1)

        self.assertEqual(len(selected), 1)
        self.assertEqual(to_unicode(selected[0]), bonus_sentence)
Beispiel #21
0
def summarize(text, sentence_count, bonus_words, language='english'):
    """Summarize ``text`` with the Edmundson method.

    Args:
        text: Plain text to summarize.
        sentence_count: Number of sentences requested from the summarizer.
        bonus_words: Collection of words assigned as the summarizer's bonus
            words.
        language: Language name used for the tokenizer, the stemmer and the
            stop-word lists. Defaults to ``'english'``.

    Returns:
        The value returned by the summarizer for the parsed document.
    """
    summarizer = EdmundsonSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    summarizer.bonus_words = bonus_words
    # A single placeholder token is assigned as the stigma word list.
    summarizer.stigma_words = ['zdfgthdvndadv']

    # Null words follow the requested language instead of always being
    # English. NOTE(review): NLTK is expected to raise OSError when it has no
    # stop-word list for ``language``; in that case the previous behaviour
    # (the English list) is kept - confirm against the installed NLTK version.
    try:
        null_words = stopwords.words(language)
    except OSError:
        null_words = stopwords.words('english')
    summarizer.null_words = null_words

    document = PlaintextParser(text, Tokenizer(language)).document
    return summarizer(document, sentence_count)
Beispiel #22
0
    def test_key_2(self):
        """The key method picks the two expected sentences for bonus words "nom" and "bonus"."""
        nom_sentence = "Om nom nom nom nom"
        bonus_sentence = (
            "This is bonus test sentence with some extra words and bonus")
        document = build_document(
            (nom_sentence, "Sure I summarize it, with bonus"),
            (bonus_sentence,),
        )
        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("nom", "bonus")

        selected = edmundson.key_method(document, 2)

        self.assertEqual(len(selected), 2)
        self.assertEqual(to_unicode(selected[0]), nom_sentence)
        self.assertEqual(to_unicode(selected[1]), bonus_sentence)
Beispiel #23
0
def test_key_1():
    """The key method picks the only sentence that contains a bonus word."""
    bonus_sentence = "This is test sentence with some extra words and bonus"
    document = build_document(
        ("wa wb wc wd", "I like music"),
        (bonus_sentence,),
    )
    edmundson = EdmundsonSummarizer()
    edmundson.bonus_words = ("ba", "bb", "bc", "bonus")

    selected = edmundson.key_method(document, 1)

    assert [to_unicode(s) for s in selected] == [bonus_sentence]
Beispiel #24
0
def edmundson_summarizer(text, stemmer, language, sentences_count):
    """Summarize ``text`` with the Edmundson method using fixed word lists.

    Args:
        text: Plain text to summarize.
        stemmer: Stemmer passed to ``EdmundsonSummarizer``.
        language: Language name used for the tokenizer and the stop words.
        sentences_count: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences converted with ``str`` and joined by newlines.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(language))

    edmundson = EdmundsonSummarizer(stemmer)
    edmundson.stop_words = get_stop_words(language)
    edmundson.bonus_words = ("computing", "learning", "mobile")
    # The same four words serve as both stigma and null words.
    edmundson.stigma_words = ("another", "and", "some", "next")
    edmundson.null_words = ("another", "and", "some", "next")

    selected = edmundson(parser.document, sentences_count)
    return "\n".join([str(sentence) for sentence in selected])
Beispiel #25
0
    def test_cue_letters_case(self):
        """The cue method picks sentences whose words differ from the bonus words only by case."""
        document = build_document(
            ("X X X", "x x x x"),
            ("w w w", "W W W W"),
        )

        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("X", "w")
        edmundson.stigma_words = ("stigma",)

        selected = edmundson.cue_method(document, 2)

        self.assertEqual(len(selected), 2)
        lower_x, upper_w = selected[0], selected[1]
        self.assertEqual(to_unicode(lower_x), "x x x x")
        self.assertEqual(to_unicode(upper_w), "W W W W")
Beispiel #26
0
def _read_word_list(path):
    """Return the lines of the text file at ``path``, stripped of whitespace."""
    # Read-only access is enough; no write permission on the file is needed.
    with open(path, "r") as handle:
        return [line.strip() for line in handle]


def summarize(srt_file, summarizer, n_sentences, language, bonusWords,
              stigmaWords):
    """Pick the subtitles of ``srt_file`` that a summarizer selects.

    The subtitles are converted to plain text with ``srt_to_doc`` and
    summarized. Each selected sentence is expected to contain its subtitle
    index in parentheses, which is used to look the subtitle up again.

    Args:
        srt_file: Indexable collection of subtitle items.
        summarizer: ``'ED'`` for the Edmundson summarizer, otherwise a key of
            ``SUMMARIZERS``.
        n_sentences: Number of sentences requested from the summarizer.
        language: Language name for the tokenizer, stemmer and stop words.
        bonusWords: Path of a text file with one bonus word per line
            (only read for ``'ED'``).
        stigmaWords: Path of a text file with one stigma word per line
            (only read for ``'ED'``).

    Returns:
        A tuple ``(ranges, subtitles)``: the ``srt_item_to_range`` result for
        every selected subtitle, and the selected subtitle items themselves.

    Raises:
        IndexError: If a selected sentence contains no ``(<digits>)`` marker.
    """
    # Convert the srt file to a plain text document and hand it to sumy.
    parser = PlaintextParser.from_string(srt_to_doc(srt_file),
                                         Tokenizer(language))

    if summarizer == 'ED':
        summarizer = EdmundsonSummarizer()
        bonus_words_list = _read_word_list(bonusWords)
        stigma_words_list = _read_word_list(stigmaWords)

        summarizer.bonus_words = bonus_words_list
        summarizer.stigma_words = stigma_words_list
        summarizer.null_words = get_stop_words(language)
    else:
        stemmer = Stemmer(language)
        summarizer = SUMMARIZERS[summarizer](stemmer)
        summarizer.stop_words = get_stop_words(language)

    # Raw string: "\(" is not a valid escape in a normal string literal.
    index_pattern = re.compile(r"\(([0-9]+)\)")

    ret = []
    summarizedSubtitles = []
    for sentence in summarizer(parser.document, n_sentences):
        # The first parenthesised number in the sentence is the subtitle index.
        index = int(index_pattern.findall(str(sentence))[0])
        item = srt_file[index]
        summarizedSubtitles.append(item)
        ret.append(srt_item_to_range(item))

    return ret, summarizedSubtitles
Beispiel #27
0
def test_key_2():
    """The key method picks the two expected sentences for bonus words "nom" and "bonus"."""
    nom_sentence = "Om nom nom nom nom"
    bonus_sentence = "This is bonus test sentence with some extra words and bonus"
    document = build_document(
        (nom_sentence, "Sure I summarize it, with bonus"),
        (bonus_sentence,),
    )
    edmundson = EdmundsonSummarizer()
    edmundson.bonus_words = ("nom", "bonus")

    selected = edmundson.key_method(document, 2)

    assert [to_unicode(s) for s in selected] == [nom_sentence, bonus_sentence]
def test_bonus_words_property():
    """bonus_words starts as an empty frozenset and equals the frozenset of assigned words."""
    edmundson = EdmundsonSummarizer()

    # Nothing has been assigned yet.
    assert edmundson.bonus_words == frozenset()

    assigned = ("word", "another", "and", "some", "next")
    edmundson.bonus_words = assigned

    assert edmundson.bonus_words == frozenset(assigned)
Beispiel #29
0
    def test_bonus_words_property(self):
        """bonus_words starts as an empty frozenset and stores assigned words as a frozenset."""
        edmundson = EdmundsonSummarizer()

        # Nothing has been assigned yet.
        self.assertEqual(edmundson.bonus_words, frozenset())

        assigned = ("word", "another", "and", "some", "next")
        edmundson.bonus_words = assigned

        self.assertIsInstance(edmundson.bonus_words, frozenset)
        self.assertEqual(edmundson.bonus_words, frozenset(assigned))
Beispiel #30
0
def test_cue_letters_case():
    """The cue method picks sentences whose words differ from the bonus words only by case."""
    x_paragraph = ("X X X", "x x x x")
    w_paragraph = ("w w w", "W W W W")
    document = build_document(x_paragraph, w_paragraph)

    edmundson = EdmundsonSummarizer()
    edmundson.bonus_words = ("X", "w")
    edmundson.stigma_words = ("stigma",)

    selected = edmundson.cue_method(document, 2)

    assert [to_unicode(s) for s in selected] == ["x x x x", "W W W W"]
def test_key_2():
    """The key method picks the two expected sentences for bonus words "nom" and "bonus"."""
    document = build_document(
        ("Om nom nom nom nom", "Sure I summarize it, with bonus"),
        ("This is bonus test sentence with some extra words and bonus",),
    )
    edmundson = EdmundsonSummarizer()
    edmundson.bonus_words = ("nom", "bonus")

    selected = edmundson.key_method(document, 2)
    texts = [to_unicode(sentence) for sentence in selected]

    expected = [
        "Om nom nom nom nom",
        "This is bonus test sentence with some extra words and bonus",
    ]
    assert texts == expected
Beispiel #32
0
    def test_key_2(self):
        """The key method picks the two expected sentences for bonus words "nom" and "bonus"."""
        document = build_document(
            ("Om nom nom nom nom", "Sure I summarize it, with bonus"),
            ("This is bonus test sentence with some extra words and bonus",),
        )
        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("nom", "bonus")

        selected = edmundson.key_method(document, 2)
        self.assertEqual(len(selected), 2)

        expected_texts = (
            "Om nom nom nom nom",
            "This is bonus test sentence with some extra words and bonus",
        )
        for position, expected in enumerate(expected_texts):
            self.assertEqual(to_unicode(selected[position]), expected)
Beispiel #33
0
    def test_cue_1(self):
        """The cue method returns the single sentence of a one-sentence document."""
        paragraph = ("ba bb bc bb unknown ľščťžýáíé sb sc sb",)
        document = build_document(paragraph)

        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("ba", "bb", "bc")
        edmundson.stigma_words = ("sa", "sb", "sc")

        # Ten sentences are requested although the document has only one.
        selected = edmundson.cue_method(document, 10)
        self.assertEqual(len(selected), 1)
Beispiel #34
0
    def test_key_1(self):
        """The key method picks the only sentence that contains a bonus word."""
        document = build_document(
            ("wa wb wc wd", "I like music"),
            ("This is test sentence with some extra words and bonus",),
        )
        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("ba", "bb", "bc", "bonus")

        selected = edmundson.key_method(document, 1)

        self.assertEqual(len(selected), 1)
        best = selected[0]
        self.assertEqual(
            to_unicode(best),
            "This is test sentence with some extra words and bonus")
def test_key_1():
    """The key method picks the only sentence that contains a bonus word."""
    document = build_document(
        ("wa wb wc wd", "I like music"),
        ("This is test sentence with some extra words and bonus",),
    )
    edmundson = EdmundsonSummarizer()
    edmundson.bonus_words = ("ba", "bb", "bc", "bonus")

    selected = edmundson.key_method(document, 1)
    texts = [to_unicode(sentence) for sentence in selected]

    assert texts == ["This is test sentence with some extra words and bonus"]
Beispiel #36
0
    def test_key_3(self):
        """Check the key method's selection with the default weight and with weight=0."""
        wa_paragraph = (
            "wa",
            "wa wa",
            "wa wa wa",
            "wa wa wa wa",
            "wa Wa Wa Wa wa",
        )
        document = build_document(wa_paragraph, ("x X x X",))
        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("wa", "X")

        # Default weight.
        selected = edmundson.key_method(document, 3)
        self.assertEqual(len(selected), 3)
        expected_texts = ["wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa"]
        for position, expected in enumerate(expected_texts):
            self.assertEqual(to_unicode(selected[position]), expected)

        # Explicit weight of zero.
        selected = edmundson.key_method(document, 3, weight=0)
        self.assertEqual(len(selected), 3)
        expected_texts = ["wa wa wa wa", "wa Wa Wa Wa wa", "x X x X"]
        for position, expected in enumerate(expected_texts):
            self.assertEqual(to_unicode(selected[position]), expected)
Beispiel #37
0
    def test_cue_letters_case(self):
        """The cue method picks sentences whose words differ from the bonus words only by case."""
        x_paragraph = ("X X X", "x x x x")
        w_paragraph = ("w w w", "W W W W")
        document = build_document(x_paragraph, w_paragraph)

        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("X", "w")
        edmundson.stigma_words = ("stigma",)

        selected = edmundson.cue_method(document, 2)

        self.assertEqual(len(selected), 2)
        for sentence, expected in zip(selected, ("x x x x", "W W W W")):
            self.assertEqual(to_unicode(sentence), expected)
Beispiel #38
0
def test_mixed_cue_key():
    """Calling the summarizer with only cue and key weights enabled picks the expected two sentences."""
    document = build_document_from_string("""
        # This is cool heading
        Because I am sentence I like words
        And because I am string I like characters

        # blank and heading
        This is next paragraph because of blank line above
        Here is the winner because contains words like cool and heading
    """)

    # Title and location weights are set to zero.
    edmundson = EdmundsonSummarizer(
        cue_weight=1,
        key_weight=1,
        title_weight=0,
        location_weight=0,
    )
    edmundson.bonus_words = (
        "cool",
        "heading",
        "sentence",
        "words",
        "like",
        "because",
    )
    edmundson.stigma_words = ("this", "is", "I", "am", "and")

    selected = edmundson(document, 2)

    expected = [
        "Because I am sentence I like words",
        "Here is the winner because contains words like cool and heading",
    ]
    assert [to_unicode(s) for s in selected] == expected
Beispiel #39
0
    def test_cue_2(self):
        """The cue method returns both sentences for count 10 and only the first for count 1."""
        expected_texts = (
            "ba bb bc bb unknown ľščťžýáíé sb sc sb",
            "Pepek likes spinach",
        )
        document = build_document(
            (expected_texts[0],),
            (expected_texts[1],),
        )

        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("ba", "bb", "bc")
        edmundson.stigma_words = ("sa", "sb", "sc")

        selected = edmundson.cue_method(document, 10)
        self.assertEqual(len(selected), 2)
        for position, expected in enumerate(expected_texts):
            self.assertEqual(to_unicode(selected[position]), expected)

        selected = edmundson.cue_method(document, 1)
        self.assertEqual(len(selected), 1)
        self.assertEqual(to_unicode(selected[0]), expected_texts[0])
Beispiel #40
0
def test_key_3():
    """Check the key method's selection with the default weight and with weight=0."""
    wa_paragraph = (
        "wa",
        "wa wa",
        "wa wa wa",
        "wa wa wa wa",
        "wa Wa Wa Wa wa",
    )
    document = build_document(wa_paragraph, ("x X x X",))
    edmundson = EdmundsonSummarizer()
    edmundson.bonus_words = ("wa", "X")

    # Default weight.
    selected = edmundson.key_method(document, 3)
    expected = ["wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa"]
    assert [to_unicode(s) for s in selected] == expected

    # Explicit weight of zero.
    selected = edmundson.key_method(document, 3, weight=0)
    expected = ["wa wa wa wa", "wa Wa Wa Wa wa", "x X x X"]
    assert [to_unicode(s) for s in selected] == expected
def test_cue_letters_case():
    """The cue method picks sentences whose words differ from the bonus words only by case."""
    document = build_document(
        ("X X X", "x x x x"),
        ("w w w", "W W W W"),
    )

    edmundson = EdmundsonSummarizer()
    edmundson.bonus_words = ("X", "w")
    edmundson.stigma_words = ("stigma",)

    selected = edmundson.cue_method(document, 2)
    texts = [to_unicode(sentence) for sentence in selected]

    assert texts == ["x x x x", "W W W W"]
Beispiel #42
0
def test_cue_2():
    """The cue method returns both sentences for count 10 and only the first for count 1."""
    expected = [
        "ba bb bc bb unknown ľščťžýáíé sb sc sb",
        "Pepek likes spinach",
    ]
    document = build_document((expected[0],), (expected[1],))

    edmundson = EdmundsonSummarizer()
    edmundson.bonus_words = ("ba", "bb", "bc")
    edmundson.stigma_words = ("sa", "sb", "sc")

    all_sentences = edmundson.cue_method(document, 10)
    assert [to_unicode(s) for s in all_sentences] == expected

    single_sentence = edmundson.cue_method(document, 1)
    assert [to_unicode(s) for s in single_sentence] == expected[:1]
Beispiel #43
0
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    #define summarizers for the summarizing methods being used
    summarizer_Lsa = Lsa(stemmer)
    summarizer_Lsa.stop_words = get_stop_words(LANGUAGE)
    summary_Lsa = summarizer_Lsa(parser.document, SENTENCES_COUNT)

    summarizer_LexRank = LexRank()
    summary_LexRank = summarizer_LexRank(parser.document, SENTENCES_COUNT)

    summarizer_Edmundson = Edmundson(stemmer)
    summarizer_Edmundson.null_words = get_stop_words(LANGUAGE)
    summarizer_Edmundson.bonus_words = parser.significant_words
    summarizer_Edmundson.stigma_words = parser.stigma_words
    summary_Edmundson = summarizer_Edmundson(parser.document, SENTENCES_COUNT)

    #store summaries in a text  file
    #list_of_sums = [summary_Lsa, summary_LexRank, summary_Edmundson]
    #f = open('summarized.txt', 'w')
    #for t in list_of_sums:
    #    line = ' '.join(str(x) for x in t)
    #    f.write(line + '\n')
    #f.close()

    #create new shorter summaries
    #parser = PlaintextParser.from_file("summarized.txt", Tokenizer(LANGUAGE))
    #SENTENCES_COUNT = 3
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer   #found this is the best as
from nltk.corpus import stopwords
from string import punctuation


# Language passed to the tokenizer and number of sentences to print.
LANGUAGE = "english"
SENTENCES_COUNT = 5

if __name__ == "__main__":
    # Article to summarize; it is fetched from this URL.
    url = "https://www.artsy.net/article/artsy-editorial-photographing-fading-american-dream-prefab-homes"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    print("--EdmundsonSummarizer--")
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("deep", "learning", "neural")
    # English stop words plus every punctuation character act as stigma words.
    summarizer.stigma_words = set(stopwords.words("english")) | set(punctuation)
    summarizer.null_words = ["art"]

    # Print the selected sentences one per line.
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
Beispiel #45
0
def test_cue_with_no_stigma_words():
    """The cue method raises ValueError when only bonus words were assigned."""
    edmundson = EdmundsonSummarizer()
    bonus = ("great", "very", "beautiful")
    edmundson.bonus_words = bonus

    # Stigma words are deliberately left unset.
    with pytest.raises(ValueError):
        edmundson.cue_method(build_document(), 10)
Beispiel #46
0
def _print_and_keep_long(sentences, min_length=20):
    """Print and return the sentences whose text is longer than ``min_length``."""
    kept = []
    for sentence in sentences:
        # Drop very short sentences - likely only a few words.
        if len(sentence._text) > min_length:
            print(sentence)
            kept.append(sentence)
    return kept


def summary(article_url):
    """Summarize the article at ``article_url`` with three models and return the best one.

    The page is summarized with the Lsa, LexRank and Edmundson summarizers.
    Each summary is printed, trimmed of short sentences and passed to
    ``calc_value`` together with reference sentences taken from the article.
    ``max_r_value`` then chooses the summary to return.

    Args:
        article_url: URL of the HTML article to summarize.

    Returns:
        The trimmed list of sentences of the model chosen by ``max_r_value``.

    Raises:
        TypeError: If ``max_r_value`` returns something other than 0, 1 or 2
            (the model name lookup yields ``None``).
    """
    parser = HtmlParser.from_url(article_url, Tokenizer(LANGUAGE))

    # Create a list of reference sentences to calculate ROUGE_N scores.
    ref_sentences = []
    for paragraph in parser._article.main_text:
        for sections in paragraph:
            for sentences in sections:
                try:
                    # Drop very short entries - likely only a few words.
                    if len(sentences) > 35:
                        ref_sentences.append(sentences)
                except TypeError:
                    # Type errors are caused by annotated text, i.e. h1, b, etc.
                    print("typeError")
                    continue
    trim_ref_sentences = [
        Sentence(s, Tokenizer(LANGUAGE)) for s in ref_sentences
    ]

    stemmer = Stemmer(LANGUAGE)

    # Build every summary before anything is printed.
    summarizer_Lsa = Lsa(stemmer)
    summarizer_Lsa.stop_words = get_stop_words(LANGUAGE)
    summary_Lsa = summarizer_Lsa(parser.document, SENTENCES_COUNT)

    summarizer_LexRank = LexRank()
    summary_LexRank = summarizer_LexRank(parser.document, SENTENCES_COUNT)

    summarizer_Edmundson = Edmundson(stemmer)
    summarizer_Edmundson.null_words = get_stop_words(LANGUAGE)
    summarizer_Edmundson.bonus_words = parser.significant_words
    summarizer_Edmundson.stigma_words = parser.stigma_words
    summary_Edmundson = summarizer_Edmundson(parser.document, SENTENCES_COUNT)

    # Print each summary, then calculate its rouge_n scores.
    summary_Lsa_trim = _print_and_keep_long(summary_Lsa)
    calc_value(summary_Lsa_trim, trim_ref_sentences)

    print('\n')
    summary_LexRank_trim = _print_and_keep_long(summary_LexRank)
    calc_value(summary_LexRank_trim, trim_ref_sentences)

    print('\n')
    summary_Edmundson_trim = _print_and_keep_long(summary_Edmundson)
    calc_value(summary_Edmundson_trim, trim_ref_sentences)

    # Index of the best model: 0=Edmundson, 1=Lsa, 2=LexRank.
    # NOTE(review): the summaries are passed as (Lsa, LexRank, Edmundson)
    # while the index is read as (Edmundson, Lsa, LexRank) - confirm the
    # mapping against max_r_value.
    models = {0: "Edmundson Model", 1: "Lsa Model", 2: "LexRank Model"}
    best_summary = max_r_value(summary_Lsa_trim, summary_LexRank_trim,
                               summary_Edmundson_trim, trim_ref_sentences)
    print(
        models.get(best_summary) +
        ' is the best model according to an average of the Rouge_3, 2 and 1 tests'
    )

    # Return the summary of the best model.
    trimmed_by_index = {
        0: summary_Edmundson_trim,
        1: summary_Lsa_trim,
        2: summary_LexRank_trim,
    }
    return trimmed_by_index[best_summary]
Beispiel #47
0
def _pick_image_link(images):
    """Return the link of one randomly chosen image tag, or "" if unusable.

    Pages with fewer than three images yield "". For pages with more than
    15 images the first five and last five tags are never selected.
    The ``src`` attribute is preferred, ``data-src`` is the fallback; a tag
    with neither attribute yields "".
    """
    image_count = len(images)
    if image_count < 3:
        return ""

    if image_count > 15:
        index = int(random.random() * 1000) % (image_count - 10) + 5
    else:
        index = int(random.random() * 1000) % image_count

    image = images[index]
    if image.has_attr("src"):
        return str(image['src'])
    if image.has_attr("data-src"):
        return str(image['data-src'])
    return ""


def summarize():
    """Fill in abstract, title and image for every pending ``linkTree`` entry.

    An entry is pending when its ``"abstract"`` equals ``"To be filled"``;
    any other entry is skipped. For a pending entry the page is downloaded
    and summarized with an Edmundson summarizer. The sentence count starts
    at the current global ``SENTENCES_COUNT`` and is lowered by one until the
    summary has at most 90 words. The entry then receives the summary, the
    page title and a randomly picked image link.

    Any exception while processing an entry is printed and the entry is left
    unchanged. After each processed entry (successful or not) the global
    ``SENTENCES_COUNT`` is reset to ``MAX_SENTENCES_COUNT`` and ``linkTree``
    is written to the ``"linkTree"`` database reference.
    """

    global linkTree
    global SENTENCES_COUNT

    max_length = 90  # maximum number of words allowed in an abstract

    for i in range(len(linkTree)):
        obj = linkTree[i]
        url = obj["url"]
        print("Summarizing for ", url)

        if obj["abstract"] != "To be filled":
            print("Exists for ", url)
            continue

        try:
            # Download, parse and configure once per URL: only the sentence
            # count changes between retries, so none of this needs repeating.
            parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

            response = requests.get(url)
            soup = BeautifulSoup(response.text, "html.parser")
            # A page without <title> raises here and is reported by the
            # except clause below, leaving the entry untouched.
            title = str(soup.find('title').text)
            img_link = _pick_image_link(soup.findAll('img'))

            summarizer = EdmundsonSummarizer()
            summarizer.bonus_words = keywordFetcher.fetchKeyTerms()
            filler_words = ("another", "and", "some", "next")
            summarizer.stigma_words = filler_words
            summarizer.null_words = filler_words

            # Shrink the summary one sentence at a time until it is short
            # enough.
            while True:
                summary = "".join(
                    str(sentence) + " "
                    for sentence in summarizer(parser.document,
                                               SENTENCES_COUNT))
                word_count = len(summary.split())
                if word_count <= max_length:
                    print("Found summary of appropriate length")
                    break
                print("Summary word count: " + str(word_count))
                SENTENCES_COUNT = SENTENCES_COUNT - 1

            obj["abstract"] = summary
            obj["title"] = title
            obj["img"] = img_link

            print(obj)
            linkTree[i] = obj

        except Exception as e:
            # Best effort: report the failure and move on to the next entry.
            print("Error")
            print(e)

        # Start the next entry from the full sentence budget again and
        # persist progress after every processed entry.
        SENTENCES_COUNT = MAX_SENTENCES_COUNT
        db.reference("linkTree").set(linkTree)
Beispiel #48
0
# Demo script: summarize "sampleText.txt" with four different summarizers
# and print the sentences each one selects.
SENTENCES_COUNT = 4  # number of sentences requested from every summarizer

parser = PlaintextParser.from_file("sampleText.txt", Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)

print("\n====== Luhn ======")
summarizerLuhn = LuhnSummarizer(stemmer)
summarizerLuhn.stop_words = get_stop_words(LANGUAGE)
for sentenceLuhn in summarizerLuhn(parser.document, SENTENCES_COUNT):
    print(sentenceLuhn, "\n")

print("====== TextRank ======")
summarizerTR = TextRankSummarizer(stemmer)
summarizerTR.stop_words = get_stop_words(LANGUAGE)
for sentenceTR in summarizerTR(parser.document, SENTENCES_COUNT):
    print(sentenceTR, "\n")

print("====== LSA ======")
summarizerLSA = LsaSummarizer(stemmer)
summarizerLSA.stop_words = get_stop_words(LANGUAGE)
for sentenceLSA in summarizerLSA(parser.document, SENTENCES_COUNT):
    print(sentenceLSA, "\n")

print("====== Edmonson ======")
summarizerEd = EdmundsonSummarizer(stemmer)
summarizerEd.bonus_words = ('focus', 'proposed', 'method', 'describes')
# The trailing comma matters: ('example') is just the string 'example',
# which, consumed as a collection of words, would be split into single
# characters instead of being treated as one word.
summarizerEd.stigma_words = ('example',)
summarizerEd.null_words = ('literature', 'however')
for sentenceEd in summarizerEd(parser.document, SENTENCES_COUNT):
    print(sentenceEd, "\n")
Beispiel #49
0
    def test_cue_with_no_stigma_words(self):
        """The cue method must raise ValueError when only bonus words are set."""
        edmundson = EdmundsonSummarizer()
        edmundson.bonus_words = ("great", "very", "beautiful",)
        empty_document = build_document()

        # No stigma words were configured, so the call itself has to fail.
        with self.assertRaises(ValueError):
            edmundson.cue_method(empty_document, 10)
Beispiel #50
-1
def test_cue_3():
    """Check which five sentences the cue method picks from a 3-paragraph document.

    The document has seven sentences (the middle paragraph is empty). The
    expected result keeps document order and omits the sentence mixing "sa"
    with "bb" as well as the "babbbc" sentence.
    """
    opening_paragraph = (
        "ba " * 10,
        "bb " * 10,
        " sa" * 8 + " bb" * 10,
        "bb bc ba",
    )
    empty_paragraph = ()
    closing_paragraph = (
        "babbbc " * 10,
        "na nb nc nd sa" + " bc" * 10,
        " ba n" * 10,
    )
    document = build_document(
        opening_paragraph, empty_paragraph, closing_paragraph)

    edmundson = EdmundsonSummarizer()
    edmundson.bonus_words = ("ba", "bb", "bc",)
    edmundson.stigma_words = ("sa", "sb", "sc",)

    picked = [to_unicode(sentence)
              for sentence in edmundson.cue_method(document, 5)]

    expected = [
        " ".join(["ba"] * 10),
        " ".join(["bb"] * 10),
        "bb bc ba",
        "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc",
        " ".join(["ba n"] * 10),
    ]
    assert picked == expected