def test_key_empty():
    """An empty document produces no key-method sentences."""
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc")

    selected = summarizer.key_method(build_document(), 10)

    assert [to_unicode(s) for s in selected] == []
def test_location_method_2(self):
    """Location method with both paragraph weights zeroed ranks sentences
    purely by heading-word occurrences."""
    document = build_document_from_string("""
# na nb nc ha hb
ha = 1 + 1 + 0 = 2
middle = 0
ha hb = 2 + 1 + 0 = 3
first = 1
ha hb ha = 3
last = 1

# hc hd
hb hc hd = 3 + 1 + 0 = 4
ha hb = 2 + 1 + 0 = 3
""")
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("na", "nb", "nc", "nd", "ne")

    sentences = summarizer.location_method(document, 4, w_p1=0, w_p2=0)

    expected = (
        "ha hb = 2 + 1 + 0 = 3",
        "ha hb ha = 3",
        "hb hc hd = 3 + 1 + 0 = 4",
        "ha hb = 2 + 1 + 0 = 3",
    )
    self.assertEqual(len(sentences), len(expected))
    for sentence, text in zip(sentences, expected):
        self.assertEqual(to_unicode(sentence), text)
def test_cue_3(self):
    """Cue method orders sentences by bonus- vs. stigma-word counts."""
    document = build_document(
        (
            "ba " * 10,
            "bb " * 10,
            " sa" * 8 + " bb" * 10,
            "bb bc ba",
        ),
        (),
        (
            "babbbc " * 10,
            "na nb nc nd sa" + " bc" * 10,
            " ba n" * 10,
        ),
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc")
    summarizer.stigma_words = ("sa", "sb", "sc")

    sentences = summarizer.cue_method(document, 5)

    expected = (
        ("ba " * 10).strip(),
        ("bb " * 10).strip(),
        "bb bc ba",
        "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc",
        ("ba n " * 10).strip(),
    )
    self.assertEqual(len(sentences), len(expected))
    for sentence, text in zip(sentences, expected):
        self.assertEqual(to_unicode(sentence), text)
def test_location_method_2():
    """Location method with zeroed paragraph weights ranks by heading words."""
    document = build_document_from_string("""
# na nb nc ha hb
ha = 1 + 1 + 0 = 2
middle = 0
ha hb = 2 + 1 + 0 = 3
first = 1
ha hb ha = 3
last = 1

# hc hd
hb hc hd = 3 + 1 + 0 = 4
ha hb = 2 + 1 + 0 = 3
""")
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("na", "nb", "nc", "nd", "ne")

    selected = summarizer.location_method(document, 4, w_p1=0, w_p2=0)

    assert [to_unicode(s) for s in selected] == [
        "ha hb = 2 + 1 + 0 = 3",
        "ha hb ha = 3",
        "hb hc hd = 3 + 1 + 0 = 4",
        "ha hb = 2 + 1 + 0 = 3",
    ]
def edmunson(self, text):
    """Summarize *text* with sumy's summarizer, keeping a percentage of
    the document's sentences taken from the GUI slider (``self.scale``).

    :param text: plain text to summarize (German)
    :return: the summary as a single string (leading space preserved,
        matching the original concatenation behaviour)
    """
    # Select the language.
    language = "german"
    # Percentage from the slider, expressed as a divisor
    # (e.g. scale=25 -> divisor 4 -> keep a quarter of the sentences).
    divisor = 100 / self.scale.get()

    # Tokenize the text and attach a stemmer to the summarizer.
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)

    # Specific word lists: bonus/stigma/null words are not meant to be
    # used here, but the summarizer rejects empty input, so feed it
    # junk placeholder tokens that will never match real words.
    summarizer.stop_words = get_stop_words(language)
    summarizer.bonus_words = ["nsdgdf"]
    summarizer.stigma_words = ["mtrtf"]
    summarizer.null_words = ["zngg"]

    # Count the document's sentences by requesting far more than exist.
    count = sum(1 for _ in summarizer(parser.document, 10000000000))

    # Derive the number of summary sentences from the percentage.
    sentence_number = round(count / divisor)

    # Join the selected sentences into one text (O(n) join instead of
    # the previous quadratic string concatenation).
    return "".join(
        " " + str(sentence)
        for sentence in summarizer(parser.document, sentence_number)
    )
def test_cue_3():
    """Cue method orders sentences by bonus- vs. stigma-word counts."""
    document = build_document(
        (
            "ba " * 10,
            "bb " * 10,
            " sa" * 8 + " bb" * 10,
            "bb bc ba",
        ),
        (),
        (
            "babbbc " * 10,
            "na nb nc nd sa" + " bc" * 10,
            " ba n" * 10,
        ),
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc")
    summarizer.stigma_words = ("sa", "sb", "sc")

    selected = summarizer.cue_method(document, 5)

    assert [to_unicode(s) for s in selected] == [
        ("ba " * 10).strip(),
        ("bb " * 10).strip(),
        "bb bc ba",
        "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc",
        ("ba n " * 10).strip(),
    ]
def test_cue_empty(self):
    """Cue method on an empty document selects nothing."""
    empty_document = build_document()

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc")
    summarizer.stigma_words = ("sa", "sb", "sc")

    selected = summarizer.cue_method(empty_document, 10)

    self.assertEqual(len(selected), 0)
def test_key_3():
    """Key method ranks by bonus-word frequency; weight=0 changes the order."""
    document = build_document(
        ("wa", "wa wa", "wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa"),
        ("x X x X",),
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("wa", "X")

    selected = summarizer.key_method(document, 3)
    assert [to_unicode(s) for s in selected] == [
        "wa wa wa",
        "wa wa wa wa",
        "wa Wa Wa Wa wa",
    ]

    selected = summarizer.key_method(document, 3, weight=0)
    assert [to_unicode(s) for s in selected] == [
        "wa wa wa wa",
        "wa Wa Wa Wa wa",
        "x X x X",
    ]
def test_title_method_2():
    """Title method prefers sentences that share words with their heading."""
    document = build_document_from_string("""
# This is cool heading
Because I am sentence I like words
And because I am string I like characters

# blank and heading
This is next paragraph because of blank line above
Here is the winner because contains words like cool and heading
""")
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("this", "is", "I", "am", "and")

    selected = summarizer.title_method(document, 2)

    assert [to_unicode(s) for s in selected] == [
        "This is next paragraph because of blank line above",
        "Here is the winner because contains words like cool and heading",
    ]
def test_mixed_cue_key(self):
    """Combined cue+key weighting picks the bonus-word-rich sentences."""
    document = build_document_from_string("""
# This is cool heading
Because I am sentence I like words
And because I am string I like characters

# blank and heading
This is next paragraph because of blank line above
Here is the winner because contains words like cool and heading
""")
    summarizer = EdmundsonSummarizer(
        cue_weight=1, key_weight=1, title_weight=0, location_weight=0)
    summarizer.bonus_words = (
        "cool", "heading", "sentence", "words", "like", "because")
    summarizer.stigma_words = ("this", "is", "I", "am", "and")

    sentences = summarizer(document, 2)

    expected = (
        "Because I am sentence I like words",
        "Here is the winner because contains words like cool and heading",
    )
    self.assertEqual(len(sentences), len(expected))
    for sentence, text in zip(sentences, expected):
        self.assertEqual(to_unicode(sentence), text)
def test_location_method_with_empty_document():
    """Location method over an empty document returns nothing."""
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("na", "nb", "nc")

    selected = summarizer.location_method(build_document(), 10)

    assert [to_unicode(s) for s in selected] == []
def test_location_method_2():
    """Location method with zeroed paragraph weights ranks by heading words."""
    document = build_document_from_string("""
# na nb nc ha hb
ha = 1 + 1 + 0 = 2
middle = 0
ha hb = 2 + 1 + 0 = 3
first = 1
ha hb ha = 3
last = 1

# hc hd
hb hc hd = 3 + 1 + 0 = 4
ha hb = 2 + 1 + 0 = 3
""")
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("na", "nb", "nc", "nd", "ne")

    sentences = summarizer.location_method(document, 4, w_p1=0, w_p2=0)

    expected = [
        "ha hb = 2 + 1 + 0 = 3",
        "ha hb ha = 3",
        "hb hc hd = 3 + 1 + 0 = 4",
        "ha hb = 2 + 1 + 0 = 3",
    ]
    assert [to_unicode(s) for s in sentences] == expected
def test_location_method_2(self):
    """Location method ignores paragraph-position bonuses when w_p1=w_p2=0."""
    document = build_document_from_string("""
# na nb nc ha hb
ha = 1 + 1 + 0 = 2
middle = 0
ha hb = 2 + 1 + 0 = 3
first = 1
ha hb ha = 3
last = 1

# hc hd
hb hc hd = 3 + 1 + 0 = 4
ha hb = 2 + 1 + 0 = 3
""")
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("na", "nb", "nc", "nd", "ne")

    sentences = summarizer.location_method(document, 4, w_p1=0, w_p2=0)

    self.assertEqual(
        [to_unicode(s) for s in sentences],
        [
            "ha hb = 2 + 1 + 0 = 3",
            "ha hb ha = 3",
            "hb hc hd = 3 + 1 + 0 = 4",
            "ha hb = 2 + 1 + 0 = 3",
        ],
    )
def test_key_3(self):
    """Key method ranks by bonus-word frequency; weight=0 changes the order."""
    document = build_document(
        ("wa", "wa wa", "wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa"),
        ("x X x X",),
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("wa", "X")

    sentences = summarizer.key_method(document, 3)
    self.assertEqual(
        [to_unicode(s) for s in sentences],
        ["wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa"],
    )

    sentences = summarizer.key_method(document, 3, weight=0)
    self.assertEqual(
        [to_unicode(s) for s in sentences],
        ["wa wa wa wa", "wa Wa Wa Wa wa", "x X x X"],
    )
def test_cue_3(self):
    """Cue method scores across several paragraphs of repeated cue words."""
    first_paragraph = (
        "ba " * 10,
        "bb " * 10,
        " sa" * 8 + " bb" * 10,
        "bb bc ba",
    )
    last_paragraph = (
        "babbbc " * 10,
        "na nb nc nd sa" + " bc" * 10,
        " ba n" * 10,
    )
    document = build_document(first_paragraph, (), last_paragraph)

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc")
    summarizer.stigma_words = ("sa", "sb", "sc")

    sentences = summarizer.cue_method(document, 5)

    self.assertEqual(
        [to_unicode(s) for s in sentences],
        [
            ("ba " * 10).strip(),
            ("bb " * 10).strip(),
            "bb bc ba",
            "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc",
            ("ba n " * 10).strip(),
        ],
    )
def test_title_method_3(self):
    """Requesting three sentences returns all heading-word matches in order."""
    document = build_document_from_string("""
# This is cool heading
Because I am sentence I like words
And because I am string I like characters

# blank and heading
This is next paragraph because of blank line above
Here is the winner because contains words like cool and heading
""")
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("this", "is", "I", "am", "and")

    sentences = summarizer.title_method(document, 3)

    expected = (
        "Because I am sentence I like words",
        "This is next paragraph because of blank line above",
        "Here is the winner because contains words like cool and heading",
    )
    self.assertEqual(len(sentences), len(expected))
    for sentence, text in zip(sentences, expected):
        self.assertEqual(to_unicode(sentence), text)
def test_null_words_property():
    """null_words starts empty and stores assigned words as a frozenset."""
    summarizer = EdmundsonSummarizer()
    assert summarizer.null_words == frozenset()

    words = ("word", "another", "and", "some", "next")
    summarizer.null_words = words

    assert summarizer.null_words == frozenset(words)
def test_null_words_property(self):
    """null_words starts empty and is exposed as a frozenset of the words."""
    summarizer = EdmundsonSummarizer()
    self.assertEqual(summarizer.null_words, frozenset())

    words = ("word", "another", "and", "some", "next")
    summarizer.null_words = words

    self.assertIsInstance(summarizer.null_words, frozenset)
    self.assertEqual(summarizer.null_words, frozenset(words))
def test_cue_with_no_stigma_words(self):
    """Cue method requires stigma words to be configured."""
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("great", "very", "beautiful")

    with self.assertRaises(ValueError):
        summarizer.cue_method(build_document(), 10)
def test_cue_with_no_stigma_words():
    """Cue method raises ValueError when stigma words were never set."""
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("great", "very", "beautiful")

    # Callable form of pytest.raises: same check, different spelling.
    pytest.raises(ValueError, summarizer.cue_method, build_document(), 10)
def test_location_method_with_empty_document(self):
    """Location method over an empty document selects nothing."""
    empty_document = build_document()

    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("na", "nb", "nc")

    selected = summarizer.location_method(empty_document, 10)

    self.assertEqual(len(selected), 0)
def test_cue_with_no_bonus_words():
    """Cue method raises ValueError when bonus words were never set."""
    summarizer = EdmundsonSummarizer()
    summarizer.stigma_words = ("useless", "bad", "spinach")

    # Callable form of pytest.raises: same check, different spelling.
    pytest.raises(ValueError, summarizer.cue_method, build_document(), 10)
def test_cue_with_no_bonus_words(self):
    """Cue method requires bonus words to be configured."""
    summarizer = EdmundsonSummarizer()
    summarizer.stigma_words = ("useless", "bad", "spinach")

    with self.assertRaises(ValueError):
        summarizer.cue_method(build_document(), 10)
def test_title_method_with_empty_document():
    """Title method over an empty document returns nothing."""
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("ba", "bb", "bc")

    selected = summarizer.title_method(build_document(), 10)

    assert [to_unicode(s) for s in selected] == []
def test_title_method_with_empty_document(self):
    """Title method over an empty document selects nothing."""
    empty_document = build_document()

    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("ba", "bb", "bc")

    selected = summarizer.title_method(empty_document, 10)

    self.assertEqual(len(selected), 0)
def test_key_empty(self):
    """Key method on an empty document selects nothing."""
    empty_document = build_document()

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc")

    selected = summarizer.key_method(empty_document, 10)

    self.assertEqual(len(selected), 0)
def __summarize(self, parser):
    """Run the Edmundson summarizer over *parser*'s document and join
    the selected sentences into a single string.

    :param parser: a sumy parser exposing ``.document``
    :return: the joined summary text
    """
    summarizer = EdmundsonSummarizer(Stemmer(self.__language))
    # Words of high importance.
    summarizer.bonus_words = ('info', 'information', 'due', 'overdue',
                              'withdraw', 'balance', 'fee', 'letter',
                              'compliance', 'super')
    # Stigma/null words are unused here, but the summarizer rejects an
    # empty collection, so a junk placeholder token is supplied.
    # BUG FIX: the placeholder was previously assigned as a bare string,
    # which the frozenset-converting setter split into individual
    # *characters* ('z', 'd', 'f', ...) that could accidentally match
    # real one-letter tokens; wrap it in a tuple so it stays one word.
    summarizer.stigma_words = ('zdfgthdvndadv',)
    summarizer.null_words = ('zdfgthdvndadv',)
    final_sentences = summarizer(parser.document, self.__sentences_count)
    return self.__join_sentences(final_sentences)
def test_key_empty():
    """Key method yields nothing for a document with no sentences."""
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc")

    result = summarizer.key_method(build_document(), 10)

    assert [to_unicode(sentence) for sentence in result] == []
def test_cue_1(self):
    """A single sentence containing cue words is selected."""
    document = build_document(("ba bb bc bb unknown ľščťžýáíé sb sc sb",))

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc")
    summarizer.stigma_words = ("sa", "sb", "sc")

    selected = summarizer.cue_method(document, 10)

    self.assertEqual(len(selected), 1)
def test_location_method_with_empty_document():
    """Location method yields nothing for a document with no sentences."""
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("na", "nb", "nc")

    result = summarizer.location_method(build_document(), 10)

    assert [to_unicode(sentence) for sentence in result] == []
def test_key_1(self):
    """The sentence containing a bonus word wins the key method."""
    document = build_document(
        ("wa wb wc wd", "I like music"),
        ("This is test sentence with some extra words and bonus",),
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc", "bonus")

    sentences = summarizer.key_method(document, 1)

    self.assertEqual(len(sentences), 1)
    self.assertEqual(
        to_unicode(sentences[0]),
        "This is test sentence with some extra words and bonus")
def test_key_1():
    """The sentence containing a bonus word wins the key method."""
    document = build_document(
        ("wa wb wc wd", "I like music"),
        ("This is test sentence with some extra words and bonus",),
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc", "bonus")

    selected = summarizer.key_method(document, 1)

    assert [to_unicode(s) for s in selected] == [
        "This is test sentence with some extra words and bonus",
    ]
def test_key_2(self):
    """Sentences with the most bonus-word occurrences are selected."""
    document = build_document(
        ("Om nom nom nom nom", "Sure I summarize it, with bonus"),
        ("This is bonus test sentence with some extra words and bonus",),
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("nom", "bonus")

    sentences = summarizer.key_method(document, 2)

    self.assertEqual(
        [to_unicode(s) for s in sentences],
        [
            "Om nom nom nom nom",
            "This is bonus test sentence with some extra words and bonus",
        ],
    )
def test_key_2():
    """Sentences with the most bonus-word occurrences are selected."""
    document = build_document(
        ("Om nom nom nom nom", "Sure I summarize it, with bonus"),
        ("This is bonus test sentence with some extra words and bonus",),
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("nom", "bonus")

    selected = summarizer.key_method(document, 2)

    assert [to_unicode(s) for s in selected] == [
        "Om nom nom nom nom",
        "This is bonus test sentence with some extra words and bonus",
    ]
def test_title_method_without_title(self):
    """Without headings every sentence is returned in document order."""
    document = build_document(
        ("This is sentence", "This is another one"),
        ("And some next sentence but no heading",),
    )
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("this", "is", "some", "and")

    sentences = summarizer.title_method(document, 10)

    expected = (
        "This is sentence",
        "This is another one",
        "And some next sentence but no heading",
    )
    self.assertEqual(len(sentences), len(expected))
    for sentence, text in zip(sentences, expected):
        self.assertEqual(to_unicode(sentence), text)
def test_cue_letters_case(self):
    """Bonus-word matching ignores letter case; longer sentences win."""
    document = build_document(
        ("X X X", "x x x x"),
        ("w w w", "W W W W"),
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("X", "w")
    summarizer.stigma_words = ("stigma",)

    sentences = summarizer.cue_method(document, 2)

    self.assertEqual(
        [to_unicode(s) for s in sentences],
        ["x x x x", "W W W W"],
    )
def Edmundson(rsc_file, dst_file, count):
    """Summarize a Chinese text file with sumy's Edmundson summarizer and
    write the selected sentences to *dst_file*, one per line.

    :param rsc_file: path of the UTF-8 source text file
    :param dst_file: path of the output file
    :param count: number of sentences to keep
    """
    language = "chinese"
    parser = PlaintextParser.from_file(rsc_file, Tokenizer(language), encoding='utf-8')
    stemmer = Stemmer(language)  # stemmer for the chosen language ("language container")
    # Edmundson summarizer (the original comment said "Luhn algorithm",
    # which is misleading — this is EdmundsonSummarizer).
    # NOTE(review): no bonus/stigma/null words are configured; sumy's
    # Edmundson summarizer typically requires them — confirm this call
    # does not raise at runtime.
    summarizer = EdmundsonSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    # Write each selected sentence on its own line and echo it to stdout.
    with open(dst_file, 'w', encoding='utf-8') as f:
        for sentence in summarizer(parser.document, count):
            f.write(str(sentence))
            f.write('\n')
            print(sentence)
def test_title_method_without_title():
    """Without headings every sentence is returned in document order."""
    document = build_document(
        ("This is sentence", "This is another one"),
        ("And some next sentence but no heading",),
    )
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("this", "is", "some", "and")

    selected = summarizer.title_method(document, 10)

    assert [to_unicode(s) for s in selected] == [
        "This is sentence",
        "This is another one",
        "And some next sentence but no heading",
    ]
def test_null_words_property(self):
    """Assigned null words are exposed as a frozenset; default is empty."""
    summarizer = EdmundsonSummarizer()
    self.assertEqual(summarizer.null_words, frozenset())

    assigned = ("word", "another", "and", "some", "next")
    summarizer.null_words = assigned

    self.assertIsInstance(summarizer.null_words, frozenset)
    self.assertEqual(summarizer.null_words, frozenset(assigned))
def test_null_words_property():
    """Assigned null words are exposed as a frozenset; default is empty."""
    summarizer = EdmundsonSummarizer()
    assert summarizer.null_words == frozenset()

    assigned = ("word", "another", "and", "some", "next")
    summarizer.null_words = assigned

    assert summarizer.null_words == frozenset(assigned)
def summarize_with_info(self, corpus, length, algorithm):
    """Summarize *corpus* and return the selected sentence objects.

    :param corpus: raw text to summarize
    :param length: number of sentences to select
    :param algorithm: one of "textrank", "lexrank", "luhn", "edmundson",
        "kl", "lsa", "sumbasic", "random"
    :return: sentence objects chosen by the selected sumy summarizer
    :raises NotImplementedError: for an unknown *algorithm*
    """
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))
    stemmer = Stemmer(self.LANGUAGE)

    # Dispatch to the requested sumy summarizer implementation.
    if algorithm == "textrank":
        summarizer = TextRankSummarizer(stemmer)
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(stemmer)
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(stemmer)
    elif algorithm == "edmundson":
        # Edmundson needs explicit word lists; derive them from the parser.
        summarizer = EdmundsonSummarizer(stemmer)
        summarizer.bonus_words = parser.significant_words
        summarizer.stigma_words = parser.stigma_words
    elif algorithm == "kl":
        summarizer = KLSummarizer(stemmer)
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(stemmer)
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(stemmer)
    elif algorithm == "random":
        summarizer = RandomSummarizer(stemmer)
    else:
        # BUG FIX: ``raise NotImplemented(...)`` raised a TypeError because
        # NotImplemented is a constant, not an exception class.
        raise NotImplementedError("Summary algorithm is not available")

    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    return summarizer(parser.document, length)
def summarize(self, corpus, length, algorithm):
    """Summarize *corpus* and return the summary as one joined string.

    :param corpus: raw text to summarize
    :param length: number of sentences to select
    :param algorithm: one of "textrank", "lexrank", "luhn", "edmundson",
        "kl", "lsa", "sumbasic", "random"
    :return: space-joined text of the selected sentences
    :raises NotImplementedError: for an unknown *algorithm*
    """
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))
    stemmer = Stemmer(self.LANGUAGE)

    # Dispatch to the requested sumy summarizer implementation.
    if algorithm == "textrank":
        summarizer = TextRankSummarizer(stemmer)
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(stemmer)
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(stemmer)
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(stemmer)
    elif algorithm == "kl":
        summarizer = KLSummarizer(stemmer)
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(stemmer)
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(stemmer)
    elif algorithm == "random":
        summarizer = RandomSummarizer(stemmer)
    else:
        # BUG FIX: ``raise NotImplemented(...)`` raised a TypeError because
        # NotImplemented is a constant, not an exception class.
        raise NotImplementedError("Summary algorithm is not available")

    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    # NOTE: reads the private ``_text`` attribute of sumy Sentence objects,
    # preserved from the original implementation.
    return " ".join(
        obj._text for obj in summarizer(parser.document, length))
def test_cue_letters_case():
    """Bonus-word matching ignores letter case; longer sentences win."""
    document = build_document(
        ("X X X", "x x x x"),
        ("w w w", "W W W W"),
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("X", "w")
    summarizer.stigma_words = ("stigma",)

    selected = summarizer.cue_method(document, 2)

    assert [to_unicode(s) for s in selected] == [
        "x x x x",
        "W W W W",
    ]
def EdmundsonSummary(document, sentences):
    """Summarize *document* (plain text) into *sentences* sentences using
    sumy's EdmundsonSummarizer and return the selected sentence objects."""
    parser = PlaintextParser.from_string(document, Tokenizer("english"))
    summarizer = EdmundsonSummarizer()
    return summarizer(parser.document, sentences)
def test_key_2(self):
    """Sentences richest in bonus words are chosen by the key method."""
    document = build_document(
        ("Om nom nom nom nom", "Sure I summarize it, with bonus"),
        ("This is bonus test sentence with some extra words and bonus",),
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("nom", "bonus")

    sentences = summarizer.key_method(document, 2)

    expected = (
        "Om nom nom nom nom",
        "This is bonus test sentence with some extra words and bonus",
    )
    self.assertEqual(len(sentences), len(expected))
    for sentence, text in zip(sentences, expected):
        self.assertEqual(to_unicode(sentence), text)
def test_empty_document(self):
    """With all method weights zeroed, an empty document yields nothing."""
    summarizer = EdmundsonSummarizer(
        cue_weight=0, key_weight=0, title_weight=0, location_weight=0)

    selected = summarizer(build_document(), 10)

    self.assertEqual(len(selected), 0)
def test_cue_1(self):
    """A lone sentence containing cue words is selected by the cue method."""
    document = build_document(("ba bb bc bb unknown ľščťžýáíé sb sc sb",))

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc")
    summarizer.stigma_words = ("sa", "sb", "sc")

    result = summarizer.cue_method(document, 10)

    self.assertEqual(len(result), 1)
def test_title_method_1(self):
    """Requesting one sentence returns the best heading-word match."""
    document = build_document_from_string("""
# This is cool heading
Because I am sentence I like words
And because I am string I like characters

# blank and heading
This is next paragraph because of blank line above
Here is the winner because contains words like cool and heading
""")
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("this", "is", "I", "am", "and")

    sentences = summarizer.title_method(document, 1)

    self.assertEqual(len(sentences), 1)
    self.assertEqual(
        to_unicode(sentences[0]),
        "Here is the winner because contains words like cool and heading")
def test_key_3(self):
    """Key method ranking with the default weight and with weight=0."""
    document = build_document(
        ("wa", "wa wa", "wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa"),
        ("x X x X",),
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("wa", "X")

    default_ranking = summarizer.key_method(document, 3)
    self.assertEqual(len(default_ranking), 3)
    for sentence, text in zip(
            default_ranking,
            ("wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa")):
        self.assertEqual(to_unicode(sentence), text)

    zero_weight_ranking = summarizer.key_method(document, 3, weight=0)
    self.assertEqual(len(zero_weight_ranking), 3)
    for sentence, text in zip(
            zero_weight_ranking,
            ("wa wa wa wa", "wa Wa Wa Wa wa", "x X x X")):
        self.assertEqual(to_unicode(sentence), text)
def test_cue_2(self):
    """Cue method keeps document order and honours the requested count."""
    document = build_document(
        ("ba bb bc bb unknown ľščťžýáíé sb sc sb",),
        ("Pepek likes spinach",),
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc")
    summarizer.stigma_words = ("sa", "sb", "sc")

    sentences = summarizer.cue_method(document, 10)
    self.assertEqual(
        [to_unicode(s) for s in sentences],
        ["ba bb bc bb unknown ľščťžýáíé sb sc sb", "Pepek likes spinach"],
    )

    sentences = summarizer.cue_method(document, 1)
    self.assertEqual(
        [to_unicode(s) for s in sentences],
        ["ba bb bc bb unknown ľščťžýáíé sb sc sb"],
    )
def test_mixed_cue_key():
    """Combined cue+key weighting picks the bonus-word-rich sentences."""
    document = build_document_from_string("""
# This is cool heading
Because I am sentence I like words
And because I am string I like characters

# blank and heading
This is next paragraph because of blank line above
Here is the winner because contains words like cool and heading
""")
    summarizer = EdmundsonSummarizer(
        cue_weight=1, key_weight=1, title_weight=0, location_weight=0)
    summarizer.bonus_words = (
        "cool", "heading", "sentence", "words", "like", "because")
    summarizer.stigma_words = ("this", "is", "I", "am", "and")

    selected = summarizer(document, 2)

    assert [to_unicode(s) for s in selected] == [
        "Because I am sentence I like words",
        "Here is the winner because contains words like cool and heading",
    ]
def test_key_3():
    """Key method ranking with the default weight and with weight=0."""
    document = build_document(
        ("wa", "wa wa", "wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa"),
        ("x X x X",),
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("wa", "X")

    default_ranking = summarizer.key_method(document, 3)
    assert [to_unicode(s) for s in default_ranking] == [
        "wa wa wa",
        "wa wa wa wa",
        "wa Wa Wa Wa wa",
    ]

    zero_weight_ranking = summarizer.key_method(document, 3, weight=0)
    assert [to_unicode(s) for s in zero_weight_ranking] == [
        "wa wa wa wa",
        "wa Wa Wa Wa wa",
        "x X x X",
    ]
def test_title_method_3():
    """Requesting three sentences returns all heading-word matches in order."""
    document = build_document_from_string("""
# This is cool heading
Because I am sentence I like words
And because I am string I like characters

# blank and heading
This is next paragraph because of blank line above
Here is the winner because contains words like cool and heading
""")
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("this", "is", "I", "am", "and")

    selected = summarizer.title_method(document, 3)

    assert [to_unicode(s) for s in selected] == [
        "Because I am sentence I like words",
        "This is next paragraph because of blank line above",
        "Here is the winner because contains words like cool and heading",
    ]
def test_cue_2():
    """Cue method keeps document order and honours the requested count."""
    document = build_document(
        ("ba bb bc bb unknown ľščťžýáíé sb sc sb",),
        ("Pepek likes spinach",),
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc")
    summarizer.stigma_words = ("sa", "sb", "sc")

    selected = summarizer.cue_method(document, 10)
    assert [to_unicode(s) for s in selected] == [
        "ba bb bc bb unknown ľščťžýáíé sb sc sb",
        "Pepek likes spinach",
    ]

    selected = summarizer.cue_method(document, 1)
    assert [to_unicode(s) for s in selected] == [
        "ba bb bc bb unknown ľščťžýáíé sb sc sb",
    ]
def test_cue_with_no_bonus_words(self):
    """Missing bonus words make the cue method raise ValueError."""
    summarizer = EdmundsonSummarizer()
    summarizer.stigma_words = ("useless", "bad", "spinach")

    with self.assertRaises(ValueError):
        summarizer.cue_method(build_document(), 10)
# NOTE(review): sys.setdefaultencoding is a Python 2-only hack (it does not
# exist on Python 3 and normally requires reload(sys) first) — confirm the
# intended interpreter before keeping it.
sys.setdefaultencoding('utf8')

"""
nltk.data.path.append('/home/kariminf/Data/NLTK/')

for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence)
"""

# Load "<name>,<size>" pairs from SIZE_FILE into the ``sizes`` mapping.
# FIX: use a context manager and line iteration instead of the manual
# ``while 1`` / readline loop with an unclosed handle named ``file``
# (which shadowed the builtin).
with open(SIZE_FILE, 'r') as size_file:
    for line in size_file:
        parts = line.split(",")
        sizes[parts[0]] = int(parts[1])

nltk.data.path.append('/home/kariminf/Data/NLTK/')

# Summarize each listed document to its recorded size and write the
# baseline output file.  The loop variable was renamed from ``eval``,
# which shadowed the builtin.
for doc_name in sizes:
    txt_path = "src/body/text/en/" + doc_name
    parser = PlaintextParser.from_file(txt_path, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    summary = extract(summarizer, sizes[doc_name])

    # Output name drops the trailing 9 characters of the source name.
    out_path = "baselines/EdmundsonSummarizer/en/" + doc_name[:-9] + ".txt"
    with open(out_path, "w") as fout:
        fout.write(summary)
def test_cue_with_no_stigma_words(self):
    """Missing stigma words make the cue method raise ValueError."""
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("great", "very", "beautiful")

    with self.assertRaises(ValueError):
        summarizer.cue_method(build_document(), 10)
def test_location_method_with_empty_document(self):
    """Location method selects nothing from a document with no sentences."""
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("na", "nb", "nc")

    result = summarizer.location_method(build_document(), 10)

    self.assertEqual(len(result), 0)
def test_title_method_with_empty_document(self):
    """Title method selects nothing from a document with no sentences."""
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("ba", "bb", "bc")

    result = summarizer.title_method(build_document(), 10)

    self.assertEqual(len(result), 0)
def test_cue_3():
    """Cue method scores across several paragraphs of repeated cue words."""
    first_paragraph = (
        "ba " * 10,
        "bb " * 10,
        " sa" * 8 + " bb" * 10,
        "bb bc ba",
    )
    last_paragraph = (
        "babbbc " * 10,
        "na nb nc nd sa" + " bc" * 10,
        " ba n" * 10,
    )
    document = build_document(first_paragraph, (), last_paragraph)

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc")
    summarizer.stigma_words = ("sa", "sb", "sc")

    selected = summarizer.cue_method(document, 5)

    expected = [
        ("ba " * 10).strip(),
        ("bb " * 10).strip(),
        "bb bc ba",
        "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc",
        ("ba n " * 10).strip(),
    ]
    assert [to_unicode(s) for s in selected] == expected