def test_key_empty():
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)

    sentences = summarizer.key_method(build_document(), 10)
    assert list(map(to_unicode, sentences)) == []


def test_cue_2():
    document = build_document(
        ("ba bb bc bb unknown ľščťžýáíé sb sc sb",),
        ("Pepek likes spinach",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)
    summarizer.stigma_words = ("sa", "sb", "sc",)

    sentences = summarizer.cue_method(document, 10)
    assert list(map(to_unicode, sentences)) == [
        "ba bb bc bb unknown ľščťžýáíé sb sc sb",
        "Pepek likes spinach",
    ]

    sentences = summarizer.cue_method(document, 1)
    assert list(map(to_unicode, sentences)) == [
        "ba bb bc bb unknown ľščťžýáíé sb sc sb",
    ]


def test_cue_2(self):
    document = build_document(
        ("ba bb bc bb unknown ľščťžýáíé sb sc sb",),
        ("Pepek likes spinach",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)
    summarizer.stigma_words = ("sa", "sb", "sc",)

    sentences = summarizer.cue_method(document, 10)
    self.assertEqual(len(sentences), 2)
    self.assertEqual(to_unicode(sentences[0]),
                     "ba bb bc bb unknown ľščťžýáíé sb sc sb")
    self.assertEqual(to_unicode(sentences[1]), "Pepek likes spinach")

    sentences = summarizer.cue_method(document, 1)
    self.assertEqual(len(sentences), 1)
    self.assertEqual(to_unicode(sentences[0]),
                     "ba bb bc bb unknown ľščťžýáíé sb sc sb")


def test_cue_3(self):
    document = build_document(
        (
            "ba " * 10,
            "bb " * 10,
            " sa" * 8 + " bb" * 10,
            "bb bc ba",
        ),
        (),
        (
            "babbbc " * 10,
            "na nb nc nd sa" + " bc" * 10,
            " ba n" * 10,
        )
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)
    summarizer.stigma_words = ("sa", "sb", "sc",)

    sentences = summarizer.cue_method(document, 5)
    self.assertEqual(len(sentences), 5)
    self.assertEqual(to_unicode(sentences[0]), ("ba " * 10).strip())
    self.assertEqual(to_unicode(sentences[1]), ("bb " * 10).strip())
    self.assertEqual(to_unicode(sentences[2]), "bb bc ba")
    self.assertEqual(to_unicode(sentences[3]),
                     "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc")
    self.assertEqual(to_unicode(sentences[4]), ("ba n " * 10).strip())


def test_key_3(self):
    document = build_document(
        (
            "wa",
            "wa wa",
            "wa wa wa",
            "wa wa wa wa",
            "wa Wa Wa Wa wa",
        ),
        ("x X x X",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("wa", "X",)

    sentences = summarizer.key_method(document, 3)
    self.assertEqual(len(sentences), 3)
    self.assertEqual(to_unicode(sentences[0]), "wa wa wa")
    self.assertEqual(to_unicode(sentences[1]), "wa wa wa wa")
    self.assertEqual(to_unicode(sentences[2]), "wa Wa Wa Wa wa")

    sentences = summarizer.key_method(document, 3, weight=0)
    self.assertEqual(len(sentences), 3)
    self.assertEqual(to_unicode(sentences[0]), "wa wa wa wa")
    self.assertEqual(to_unicode(sentences[1]), "wa Wa Wa Wa wa")
    self.assertEqual(to_unicode(sentences[2]), "x X x X")


def test_mixed_cue_key(self):
    document = build_document_from_string("""
        # This is cool heading
        Because I am sentence I like words
        And because I am string I like characters

        # blank and heading
        This is next paragraph because of blank line above
        Here is the winner because contains words like cool and heading
    """)

    summarizer = EdmundsonSummarizer(cue_weight=1, key_weight=1,
                                     title_weight=0, location_weight=0)
    summarizer.bonus_words = ("cool", "heading", "sentence", "words",
                              "like", "because")
    summarizer.stigma_words = ("this", "is", "I", "am", "and",)

    sentences = summarizer(document, 2)
    self.assertEqual(len(sentences), 2)
    self.assertEqual(to_unicode(sentences[0]),
                     "Because I am sentence I like words")
    self.assertEqual(to_unicode(sentences[1]),
                     "Here is the winner because contains words like cool and heading")


def test_cue_3():
    document = build_document(
        (
            "ba " * 10,
            "bb " * 10,
            " sa" * 8 + " bb" * 10,
            "bb bc ba",
        ),
        (),
        (
            "babbbc " * 10,
            "na nb nc nd sa" + " bc" * 10,
            " ba n" * 10,
        )
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)
    summarizer.stigma_words = ("sa", "sb", "sc",)

    sentences = summarizer.cue_method(document, 5)
    assert list(map(to_unicode, sentences)) == [
        ("ba " * 10).strip(),
        ("bb " * 10).strip(),
        "bb bc ba",
        "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc",
        ("ba n " * 10).strip(),
    ]


def test_key_3():
    document = build_document(
        (
            "wa",
            "wa wa",
            "wa wa wa",
            "wa wa wa wa",
            "wa Wa Wa Wa wa",
        ),
        ("x X x X",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("wa", "X",)

    sentences = summarizer.key_method(document, 3)
    assert list(map(to_unicode, sentences)) == [
        "wa wa wa",
        "wa wa wa wa",
        "wa Wa Wa Wa wa",
    ]

    sentences = summarizer.key_method(document, 3, weight=0)
    assert list(map(to_unicode, sentences)) == [
        "wa wa wa wa",
        "wa Wa Wa Wa wa",
        "x X x X",
    ]
def edmunson(self, text):
    # Choose the language
    language = "german"
    # Read the percentage from the slider
    divident = 100 / self.scale.get()
    # Tokenize the text and add a stemmer to the summarizer
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    # Define specific word lists
    # The bonus, stigma and null words are not meant to be used here,
    # but the summarizer does not accept empty input for them
    summarizer.stop_words = get_stop_words(language)
    summarizer.bonus_words = ["nsdgdf"]
    summarizer.stigma_words = ["mtrtf"]
    summarizer.null_words = ["zngg"]

    summary = ""
    count = 0
    # Count the number of sentences
    for sentence in summarizer(parser.document, 10000000000):
        count += 1
    # Derive the number of sentences from the percentage
    sentence_number = round(count / divident)
    # Join the sentences into a single text
    for sentence in summarizer(parser.document, sentence_number):
        summary += " " + str(sentence)
    return summary
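# A minimal sketch of the sentence-count arithmetic used above, assuming the
# same PlaintextParser setup; sumy's parsed document exposes its sentences via
# the `sentences` property, so the total can be read directly instead of
# running the summarizer once with a huge sentence limit. `scale_value` is a
# stand-in for `self.scale.get()`.
def sentence_count_from_slider(parser, scale_value):
    total = len(parser.document.sentences)
    return round(total * scale_value / 100)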
def test_cue_empty(self):
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)
    summarizer.stigma_words = ("sa", "sb", "sc",)

    sentences = summarizer.cue_method(build_document(), 10)
    self.assertEqual(len(sentences), 0)


def test_cue_3(self):
    document = build_document(
        (
            "ba " * 10,
            "bb " * 10,
            " sa" * 8 + " bb" * 10,
            "bb bc ba",
        ),
        (),
        (
            "babbbc " * 10,
            "na nb nc nd sa" + " bc" * 10,
            " ba n" * 10,
        )
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)
    summarizer.stigma_words = ("sa", "sb", "sc",)

    sentences = summarizer.cue_method(document, 5)
    self.assertEqual(len(sentences), 5)
    self.assertEqual(to_unicode(sentences[0]), ("ba " * 10).strip())
    self.assertEqual(to_unicode(sentences[1]), ("bb " * 10).strip())
    self.assertEqual(to_unicode(sentences[2]), "bb bc ba")
    self.assertEqual(to_unicode(sentences[3]),
                     "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc")
    self.assertEqual(to_unicode(sentences[4]), ("ba n " * 10).strip())


def test_bonus_words_property(self):
    summarizer = EdmundsonSummarizer()
    self.assertEqual(summarizer.bonus_words, frozenset())

    words = ("word", "another", "and", "some", "next",)
    summarizer.bonus_words = words
    self.assertTrue(isinstance(summarizer.bonus_words, frozenset))
    self.assertEqual(summarizer.bonus_words, frozenset(words))


def test_bonus_words_property():
    summarizer = EdmundsonSummarizer()
    assert summarizer.bonus_words == frozenset()

    words = ("word", "another", "and", "some", "next",)
    summarizer.bonus_words = words
    assert summarizer.bonus_words == frozenset(words)


def test_key_empty(self):
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)

    sentences = summarizer.key_method(build_document(), 10)
    self.assertEqual(len(sentences), 0)


def test_cue_with_no_stigma_words():
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("great", "very", "beautiful",)

    with pytest.raises(ValueError):
        summarizer.cue_method(build_document(), 10)


def test_cue_with_no_stigma_words(self):
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("great", "very", "beautiful",)

    self.assertRaises(ValueError, summarizer.cue_method, build_document(), 10)
def __summarize(self, parser):
    summarizer = EdmundsonSummarizer(Stemmer(self.__language))
    # words of high importance
    summarizer.bonus_words = ('info', 'information', 'due', 'overdue',
                              'withdraw', 'balance', 'fee', 'letter',
                              'compliance', 'super')
    # stigma and null words are required but not wanted here, so pass a
    # single-element list with a nonsense placeholder word
    summarizer.stigma_words = ['zdfgthdvndadv']
    summarizer.null_words = ['zdfgthdvndadv']
    final_sentences = summarizer(parser.document, self.__sentences_count)
    return self.__join_sentences(final_sentences)
def test_cue_1(self):
    document = build_document(
        ("ba bb bc bb unknown ľščťžýáíé sb sc sb",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)
    summarizer.stigma_words = ("sa", "sb", "sc",)

    sentences = summarizer.cue_method(document, 10)
    self.assertEqual(len(sentences), 1)


def test_key_empty():
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)

    sentences = summarizer.key_method(build_document(), 10)
    assert list(map(to_unicode, sentences)) == []


def test_key_1(self):
    document = build_document(
        ("wa wb wc wd", "I like music",),
        ("This is test sentence with some extra words and bonus",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc", "bonus",)

    sentences = summarizer.key_method(document, 1)
    self.assertEqual(len(sentences), 1)
    self.assertEqual(to_unicode(sentences[0]),
                     "This is test sentence with some extra words and bonus")
def summarize(text, sentence_count, bonus_words, language='english'):
    '''Summarize `text` to `sentence_count` sentences with the Edmundson
    method, boosting the given `bonus_words`.'''
    summarizer = EdmundsonSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    summarizer.bonus_words = bonus_words
    # stigma words are required but not used here, so pass a placeholder
    summarizer.stigma_words = ['zdfgthdvndadv']
    summarizer.null_words = stopwords.words('english')
    summary = summarizer(
        PlaintextParser(text, Tokenizer(language)).document,
        sentence_count)
    return summary
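# A possible call to the summarize() helper above; the sample text and the
# bonus word list are illustrative only.
sample_text = (
    "Solar capacity grew again this year. "
    "Battery storage costs keep falling. "
    "Grid operators are planning for more renewable energy."
)
for sentence in summarize(sample_text, 1, bonus_words=["solar", "energy"]):
    print(sentence)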
def test_key_2(self):
    document = build_document(
        ("Om nom nom nom nom", "Sure I summarize it, with bonus",),
        ("This is bonus test sentence with some extra words and bonus",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("nom", "bonus",)

    sentences = summarizer.key_method(document, 2)
    self.assertEqual(len(sentences), 2)
    self.assertEqual(to_unicode(sentences[0]), "Om nom nom nom nom")
    self.assertEqual(to_unicode(sentences[1]),
                     "This is bonus test sentence with some extra words and bonus")


def test_key_1():
    document = build_document(
        ("wa wb wc wd", "I like music",),
        ("This is test sentence with some extra words and bonus",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc", "bonus",)

    sentences = summarizer.key_method(document, 1)
    assert list(map(to_unicode, sentences)) == [
        "This is test sentence with some extra words and bonus",
    ]
def edmundson_summarizer(text, stemmer, language, sentences_count):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    # note: despite the variable name, this is an EdmundsonSummarizer
    summarizer_luhn = EdmundsonSummarizer(stemmer)
    summarizer_luhn.stop_words = get_stop_words(language)
    summarizer_luhn.bonus_words = ("computing", "learning", "mobile")
    summarizer_luhn.stigma_words = ("another", "and", "some", "next")
    summarizer_luhn.null_words = ("another", "and", "some", "next")

    sentences = []
    for sentence in summarizer_luhn(parser.document, sentences_count):
        sentences.append(str(sentence))

    return "\n".join(sentences)
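# One way edmundson_summarizer() above might be called; the input text is a
# placeholder and the language/stemmer pairing follows the other snippets here.
language = "english"
sample_text = "Mobile computing is growing. Machine learning powers many mobile apps."
print(edmundson_summarizer(sample_text, Stemmer(language), language, 1))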
def test_cue_letters_case(self):
    document = build_document(
        ("X X X", "x x x x",),
        ("w w w", "W W W W",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("X", "w",)
    summarizer.stigma_words = ("stigma",)

    sentences = summarizer.cue_method(document, 2)
    self.assertEqual(len(sentences), 2)
    self.assertEqual(to_unicode(sentences[0]), "x x x x")
    self.assertEqual(to_unicode(sentences[1]), "W W W W")
def summarize(srt_file, summarizer, n_sentences, language, bonusWords, stigmaWords):
    # Convert the srt file to a plain text document and pass it to the Sumy
    # library (the text summarization library) functions.
    parser = PlaintextParser.from_string(srt_to_doc(srt_file), Tokenizer(language))

    if summarizer == 'ED':
        summarizer = EdmundsonSummarizer()

        with open(bonusWords, "r+") as f:
            bonus_wordsList = [x.strip() for x in f.readlines()]

        with open(stigmaWords, "r+") as f:
            stigma_wordsList = [x.strip() for x in f.readlines()]

        summarizer.bonus_words = bonus_wordsList
        summarizer.stigma_words = stigma_wordsList
        summarizer.null_words = get_stop_words(language)
    else:
        stemmer = Stemmer(language)
        summarizer = SUMMARIZERS[summarizer](stemmer)
        summarizer.stop_words = get_stop_words(language)

    ret = []
    summarizedSubtitles = []

    # The document is summarized and each selected sentence is mapped back to
    # its subtitle.
    for sentence in summarizer(parser.document, n_sentences):
        # Index of the sentence
        index = int(re.findall(r"\(([0-9]+)\)", str(sentence))[0])

        # Using the index we determine the subtitle to be selected
        item = srt_file[index]
        summarizedSubtitles.append(item)

        # add the selected subtitle to the result array
        ret.append(srt_item_to_range(item))

    return ret, summarizedSubtitles
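# The regular expression in the loop above recovers the subtitle index that
# srt_to_doc() is assumed to embed in each sentence, e.g. "(42) Some subtitle
# line"; a quick illustration:
import re

sentence_text = "(42) Some subtitle line"
index = int(re.findall(r"\(([0-9]+)\)", sentence_text)[0])
assert index == 42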
def test_key_2():
    document = build_document(
        ("Om nom nom nom nom", "Sure I summarize it, with bonus",),
        ("This is bonus test sentence with some extra words and bonus",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("nom", "bonus",)

    sentences = summarizer.key_method(document, 2)
    assert list(map(to_unicode, sentences)) == [
        "Om nom nom nom nom",
        "This is bonus test sentence with some extra words and bonus",
    ]


def test_bonus_words_property():
    summarizer = EdmundsonSummarizer()
    assert summarizer.bonus_words == frozenset()

    words = ("word", "another", "and", "some", "next",)
    summarizer.bonus_words = words
    assert summarizer.bonus_words == frozenset(words)


def test_bonus_words_property(self):
    summarizer = EdmundsonSummarizer()
    self.assertEqual(summarizer.bonus_words, frozenset())

    words = ("word", "another", "and", "some", "next",)
    summarizer.bonus_words = words
    self.assertTrue(isinstance(summarizer.bonus_words, frozenset))
    self.assertEqual(summarizer.bonus_words, frozenset(words))


def test_cue_letters_case():
    document = build_document(
        ("X X X", "x x x x",),
        ("w w w", "W W W W",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("X", "w",)
    summarizer.stigma_words = ("stigma",)

    sentences = summarizer.cue_method(document, 2)
    assert list(map(to_unicode, sentences)) == [
        "x x x x",
        "W W W W",
    ]


def test_key_2():
    document = build_document(
        ("Om nom nom nom nom", "Sure I summarize it, with bonus",),
        ("This is bonus test sentence with some extra words and bonus",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("nom", "bonus",)

    sentences = summarizer.key_method(document, 2)
    assert list(map(to_unicode, sentences)) == [
        "Om nom nom nom nom",
        "This is bonus test sentence with some extra words and bonus",
    ]


def test_key_2(self):
    document = build_document(
        ("Om nom nom nom nom", "Sure I summarize it, with bonus",),
        ("This is bonus test sentence with some extra words and bonus",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("nom", "bonus",)

    sentences = summarizer.key_method(document, 2)
    self.assertEqual(len(sentences), 2)
    self.assertEqual(to_unicode(sentences[0]), "Om nom nom nom nom")
    self.assertEqual(to_unicode(sentences[1]),
                     "This is bonus test sentence with some extra words and bonus")


def test_cue_1(self):
    document = build_document(("ba bb bc bb unknown ľščťžýáíé sb sc sb",))

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)
    summarizer.stigma_words = ("sa", "sb", "sc",)

    sentences = summarizer.cue_method(document, 10)
    self.assertEqual(len(sentences), 1)


def test_key_1(self):
    document = build_document(
        ("wa wb wc wd", "I like music",),
        ("This is test sentence with some extra words and bonus",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc", "bonus",)

    sentences = summarizer.key_method(document, 1)
    self.assertEqual(len(sentences), 1)
    self.assertEqual(to_unicode(sentences[0]),
                     "This is test sentence with some extra words and bonus")


def test_key_1():
    document = build_document(
        ("wa wb wc wd", "I like music",),
        ("This is test sentence with some extra words and bonus",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc", "bonus",)

    sentences = summarizer.key_method(document, 1)
    assert list(map(to_unicode, sentences)) == [
        "This is test sentence with some extra words and bonus",
    ]
def test_key_3(self):
    document = build_document(
        ("wa", "wa wa", "wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa",),
        ("x X x X",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("wa", "X",)

    sentences = summarizer.key_method(document, 3)
    self.assertEqual(len(sentences), 3)
    self.assertEqual(to_unicode(sentences[0]), "wa wa wa")
    self.assertEqual(to_unicode(sentences[1]), "wa wa wa wa")
    self.assertEqual(to_unicode(sentences[2]), "wa Wa Wa Wa wa")

    sentences = summarizer.key_method(document, 3, weight=0)
    self.assertEqual(len(sentences), 3)
    self.assertEqual(to_unicode(sentences[0]), "wa wa wa wa")
    self.assertEqual(to_unicode(sentences[1]), "wa Wa Wa Wa wa")
    self.assertEqual(to_unicode(sentences[2]), "x X x X")


def test_cue_letters_case(self):
    document = build_document(
        ("X X X", "x x x x",),
        ("w w w", "W W W W",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("X", "w",)
    summarizer.stigma_words = ("stigma",)

    sentences = summarizer.cue_method(document, 2)
    self.assertEqual(len(sentences), 2)
    self.assertEqual(to_unicode(sentences[0]), "x x x x")
    self.assertEqual(to_unicode(sentences[1]), "W W W W")


def test_mixed_cue_key():
    document = build_document_from_string("""
        # This is cool heading
        Because I am sentence I like words
        And because I am string I like characters

        # blank and heading
        This is next paragraph because of blank line above
        Here is the winner because contains words like cool and heading
    """)

    summarizer = EdmundsonSummarizer(cue_weight=1, key_weight=1,
                                     title_weight=0, location_weight=0)
    summarizer.bonus_words = ("cool", "heading", "sentence", "words",
                              "like", "because")
    summarizer.stigma_words = ("this", "is", "I", "am", "and",)

    sentences = summarizer(document, 2)
    assert list(map(to_unicode, sentences)) == [
        "Because I am sentence I like words",
        "Here is the winner because contains words like cool and heading",
    ]


def test_cue_2(self):
    document = build_document(
        ("ba bb bc bb unknown ľščťžýáíé sb sc sb",),
        ("Pepek likes spinach",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)
    summarizer.stigma_words = ("sa", "sb", "sc",)

    sentences = summarizer.cue_method(document, 10)
    self.assertEqual(len(sentences), 2)
    self.assertEqual(to_unicode(sentences[0]),
                     "ba bb bc bb unknown ľščťžýáíé sb sc sb")
    self.assertEqual(to_unicode(sentences[1]), "Pepek likes spinach")

    sentences = summarizer.cue_method(document, 1)
    self.assertEqual(len(sentences), 1)
    self.assertEqual(to_unicode(sentences[0]),
                     "ba bb bc bb unknown ľščťžýáíé sb sc sb")


def test_key_3():
    document = build_document(
        ("wa", "wa wa", "wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa",),
        ("x X x X",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("wa", "X",)

    sentences = summarizer.key_method(document, 3)
    assert list(map(to_unicode, sentences)) == [
        "wa wa wa",
        "wa wa wa wa",
        "wa Wa Wa Wa wa",
    ]

    sentences = summarizer.key_method(document, 3, weight=0)
    assert list(map(to_unicode, sentences)) == [
        "wa wa wa wa",
        "wa Wa Wa Wa wa",
        "x X x X",
    ]


def test_cue_letters_case():
    document = build_document(
        ("X X X", "x x x x",),
        ("w w w", "W W W W",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("X", "w",)
    summarizer.stigma_words = ("stigma",)

    sentences = summarizer.cue_method(document, 2)
    assert list(map(to_unicode, sentences)) == [
        "x x x x",
        "W W W W",
    ]


def test_cue_2():
    document = build_document(
        ("ba bb bc bb unknown ľščťžýáíé sb sc sb",),
        ("Pepek likes spinach",)
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)
    summarizer.stigma_words = ("sa", "sb", "sc",)

    sentences = summarizer.cue_method(document, 10)
    assert list(map(to_unicode, sentences)) == [
        "ba bb bc bb unknown ľščťžýáíé sb sc sb",
        "Pepek likes spinach",
    ]

    sentences = summarizer.cue_method(document, 1)
    assert list(map(to_unicode, sentences)) == [
        "ba bb bc bb unknown ľščťžýáíé sb sc sb",
    ]


# or for plain text files
# parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))

stemmer = Stemmer(LANGUAGE)

# define summarizers for the summarizing methods being used
summarizer_Lsa = Lsa(stemmer)
summarizer_Lsa.stop_words = get_stop_words(LANGUAGE)
summary_Lsa = summarizer_Lsa(parser.document, SENTENCES_COUNT)

summarizer_LexRank = LexRank()
summary_LexRank = summarizer_LexRank(parser.document, SENTENCES_COUNT)

summarizer_Edmundson = Edmundson(stemmer)
summarizer_Edmundson.null_words = get_stop_words(LANGUAGE)
summarizer_Edmundson.bonus_words = parser.significant_words
summarizer_Edmundson.stigma_words = parser.stigma_words
summary_Edmundson = summarizer_Edmundson(parser.document, SENTENCES_COUNT)

# store summaries in a text file
# list_of_sums = [summary_Lsa, summary_LexRank, summary_Edmundson]
# f = open('summarized.txt', 'w')
# for t in list_of_sums:
#     line = ' '.join(str(x) for x in t)
#     f.write(line + '\n')
# f.close()

# create new shorter summaries
# parser = PlaintextParser.from_file("summarized.txt", Tokenizer(LANGUAGE))
# SENTENCES_COUNT = 3
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer  # found this to be the best
from nltk.corpus import stopwords
from string import punctuation

LANGUAGE = "english"
SENTENCES_COUNT = 5

if __name__ == "__main__":
    url = "https://www.artsy.net/article/artsy-editorial-photographing-fading-american-dream-prefab-homes"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    print("--EdmundsonSummarizer--")
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("deep", "learning", "neural")
    summarizer.stigma_words = set(stopwords.words('english') + list(punctuation))
    summarizer.null_words = ["art"]
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
def test_cue_with_no_stigma_words():
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("great", "very", "beautiful",)

    with pytest.raises(ValueError):
        summarizer.cue_method(build_document(), 10)
def summary(article_url):
    url = article_url
    # url = "http://www.encyclopedia.com/plants-and-animals/plants/plants/potato"
    # url = "http://www.encyclopedia.com/plants-and-animals/plants/plants/cabbage"
    # url = "http://www.encyclopedia.com/medicine/diseases-and-conditions/pathology/accident"
    # url = "http://www.encyclopedia.com/earth-and-environment/atmosphere-and-weather/atmospheric-and-space-sciences-atmosphere/air"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    # create a list of reference sentences to calculate ROUGE_N scores
    ref_sentences = []
    trim_ref_sentences = []
    for paragraph in parser._article.main_text:
        for sections in paragraph:
            for sentences in sections:
                try:
                    if len(sentences) > 35:
                        # trim off super short - likely a few word - sentences
                        ref_sentences.append(sentences)
                except TypeError:
                    # catch type errors caused by annotated text ie h1, b, etc
                    print("typeError")
                    continue
    trim_ref_sentences.extend(
        Sentence(s, Tokenizer(LANGUAGE)) for s in ref_sentences)

    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    # define summarizers for the summarizing methods being used
    summarizer_Lsa = Lsa(stemmer)
    summarizer_Lsa.stop_words = get_stop_words(LANGUAGE)
    summary_Lsa = summarizer_Lsa(parser.document, SENTENCES_COUNT)

    summarizer_LexRank = LexRank()
    summary_LexRank = summarizer_LexRank(parser.document, SENTENCES_COUNT)

    summarizer_Edmundson = Edmundson(stemmer)
    summarizer_Edmundson.null_words = get_stop_words(LANGUAGE)
    summarizer_Edmundson.bonus_words = parser.significant_words
    summarizer_Edmundson.stigma_words = parser.stigma_words
    summary_Edmundson = summarizer_Edmundson(parser.document, SENTENCES_COUNT)

    # print summaries
    summary_Lsa_trim = []
    for sentence in summary_Lsa:
        # trim off super short - likely a few word - sentences
        if len(sentence._text) > 20:
            print(sentence)
            summary_Lsa_trim.append(sentence)
    # calc rouge_n scores
    calc_value(summary_Lsa_trim, trim_ref_sentences)
    print('\n')

    summary_LexRank_trim = []
    for sentence in summary_LexRank:
        # trim off super short - likely a few word - sentences
        if len(sentence._text) > 20:
            print(sentence)
            summary_LexRank_trim.append(sentence)
    # calc rouge_n scores
    calc_value(summary_LexRank_trim, trim_ref_sentences)
    print('\n')

    summary_Edmundson_trim = []
    for sentence in summary_Edmundson:
        # trim off super short - likely a few word - sentences
        if len(sentence._text) > 20:
            print(sentence)
            summary_Edmundson_trim.append(sentence)
    # calc rouge_n scores
    calc_value(summary_Edmundson_trim, trim_ref_sentences)

    # returns index of max: 0=Ed, 1=Lsa, 2=Lex
    models = {0: "Edmundson Model", 1: "Lsa Model", 2: "LexRank Model"}
    best_summary = max_r_value(summary_Lsa_trim, summary_LexRank_trim,
                               summary_Edmundson_trim, trim_ref_sentences)
    print(models.get(best_summary) +
          ' is the best model according to an average of the Rouge_3, 2 and 1 tests')

    # return the summary of the best model
    if best_summary == 0:
        return summary_Edmundson_trim
    elif best_summary == 1:
        return summary_Lsa_trim
    elif best_summary == 2:
        return summary_LexRank_trim
def summarize():
    """Summarize contents of urls

    This function will generate a summary for the contents of each url from
    the database, extract a random reference image for it and also store its
    title.
    """
    global linkTree
    global SENTENCES_COUNT

    for i in range(len(linkTree)):
        length_is_appropriate = False
        max_length = 90
        obj = linkTree[i]
        print("Summarizing for ", obj["url"])
        if not obj["abstract"] == "To be filled":
            print("Exists for ", obj["url"])
            continue
        while not length_is_appropriate:
            url = obj["url"]
            try:
                parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
                response = requests.get(url)
                soup = BeautifulSoup(response.text, "html.parser")
                title = str(soup.find('title').text)

                rand = 0
                img_link = ""
                images = soup.findAll('img')
                image_count = len(images)
                if image_count < 3:
                    img_link = ""
                else:
                    if image_count > 15:
                        rand = int(random.random() * 1000) % (image_count - 10)
                        rand = rand + 5
                    elif image_count >= 3:
                        rand = int(int(random.random() * 1000) % (image_count))
                    if images[rand].has_attr("src"):
                        img_link = str(images[rand]['src'])
                    elif images[rand].has_attr("data-src"):
                        img_link = str(images[rand]['data-src'])

                summary = ""
                summarizer = EdmundsonSummarizer()
                words = keywordFetcher.fetchKeyTerms()
                summarizer.bonus_words = words
                words = ("another", "and", "some", "next")
                summarizer.stigma_words = words
                words = ("another", "and", "some", "next")
                summarizer.null_words = words
                for sentence in summarizer(parser.document, SENTENCES_COUNT):
                    summary += str(sentence)
                    summary += " "

                if len(summary.split()) <= max_length:
                    length_is_appropriate = True
                    # reset the sentence count for the next url
                    SENTENCES_COUNT = MAX_SENTENCES_COUNT
                    print("Found summary of appropriate length")
                else:
                    print("Summary word count: " + str(len(summary.split())))
                    SENTENCES_COUNT = SENTENCES_COUNT - 1
                    continue

                obj["abstract"] = summary
                obj["title"] = title
                obj["img"] = img_link
                print(obj)
                linkTree[i] = obj
            except Exception as e:
                print("Error")
                print(e)
                length_is_appropriate = True
                continue

    SENTENCES_COUNT = MAX_SENTENCES_COUNT
    db.reference("linkTree").set(linkTree)
SENTENCES_COUNT = 4
parser = PlaintextParser.from_file("sampleText.txt", Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)

print("\n====== Luhn ======")
summarizerLuhn = LuhnSummarizer(stemmer)
summarizerLuhn.stop_words = get_stop_words(LANGUAGE)
for sentenceLuhn in summarizerLuhn(parser.document, SENTENCES_COUNT):
    print(sentenceLuhn, "\n")

print("====== TextRank ======")
summarizerTR = TextRankSummarizer(stemmer)
summarizerTR.stop_words = get_stop_words(LANGUAGE)
for sentenceTR in summarizerTR(parser.document, SENTENCES_COUNT):
    print(sentenceTR, "\n")

print("====== LSA ======")
summarizerLSA = LsaSummarizer(stemmer)
summarizerLSA.stop_words = get_stop_words(LANGUAGE)
for sentenceLSA in summarizerLSA(parser.document, SENTENCES_COUNT):
    print(sentenceLSA, "\n")

print("====== Edmundson ======")
summarizerEd = EdmundsonSummarizer(stemmer)
summarizerEd.bonus_words = ('focus', 'proposed', 'method', 'describes')
summarizerEd.stigma_words = ('example',)
summarizerEd.null_words = ('literature', 'however')
for sentenceEd in summarizerEd(parser.document, SENTENCES_COUNT):
    print(sentenceEd, "\n")
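# Note on the stigma_words line above: in Python, ('example') is just the
# string "example", while ('example',) is a one-element tuple, which is what
# the Edmundson word-list properties expect.
assert isinstance(('example'), str)
assert isinstance(('example',), tuple)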
def test_cue_with_no_stigma_words(self):
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("great", "very", "beautiful",)

    self.assertRaises(ValueError, summarizer.cue_method, build_document(), 10)


def test_cue_3():
    document = build_document(
        (
            "ba " * 10,
            "bb " * 10,
            " sa" * 8 + " bb" * 10,
            "bb bc ba",
        ),
        (),
        (
            "babbbc " * 10,
            "na nb nc nd sa" + " bc" * 10,
            " ba n" * 10,
        )
    )

    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)
    summarizer.stigma_words = ("sa", "sb", "sc",)

    sentences = summarizer.cue_method(document, 5)
    assert list(map(to_unicode, sentences)) == [
        ("ba " * 10).strip(),
        ("bb " * 10).strip(),
        "bb bc ba",
        "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc",
        ("ba n " * 10).strip(),
    ]