def test_single_sentence():
    document = build_document(("Já jsem jedna věta",))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("já", "jsem",)

    returned = summarizer(document, 10)
    assert len(returned) == 1
def summarize(final_transcript, askuser=False):
    print('Summarizing transcript...')
    parser = PlaintextParser.from_file(final_transcript, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    if askuser:
        summtype = input('Summarizer type? [1: Luhn, 2: Lex-Rank, 3: Text-Rank] ')
    else:
        summtype = SUMMMETHOD
    if summtype == '1':
        summarizer = LuhnSummarizer(stemmer)
        typename = 'luhn'
    elif summtype == '2':
        summarizer = LexRankSummarizer(stemmer)
        typename = 'lex'
    elif summtype == '3':
        summarizer = TextRankSummarizer(stemmer)
        typename = 'tex'
    else:
        # Default to Luhn so an unrecognised choice cannot leave `summarizer` undefined.
        summarizer = LuhnSummarizer(stemmer)
        typename = 'luhn'
    summarizer.stop_words = get_stop_words(LANGUAGE)
    count = SENTENCES_COUNT
    summaryfile = str(final_transcript)[:-4] + '_summ_' + typename + '.txt'
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        sentence_out = str(SENTENCES_COUNT - count + 1) + ':\n' + str(sentence) + '\n--------------\n'
        with open(summaryfile, 'a') as f:
            f.write(sentence_out)
        print(sentence_out)
        count -= 1
    return summaryfile
def models_LUHN_LEX_LSA_2(dataframe):
    LANGUAGE = "english"
    stop = get_stop_words(LANGUAGE)
    size = len(dataframe)
    stemmer = Stemmer(LANGUAGE)
    for i in range(size):
        article = dataframe.loc[i, "post_content"]
        parser = PlaintextParser.from_string(article, Tokenizer(LANGUAGE))

        summarizerLUHN = LUHN(stemmer)
        summarizerLUHN.stop_words = stop

        summarizerLEX = LEX(stemmer)
        summarizerLEX.stop_words = stop

        summarizerLSA = LSA(stemmer)
        summarizerLSA.stop_words = stop

        # Summarize the document with one sentence per algorithm.
        LUHNsentence = summarizerLUHN(parser.document, 1)
        LEXsentence = summarizerLEX(parser.document, 1)
        LSAsentence = summarizerLSA(parser.document, 1)

        # Each summary holds a single sentence, so the loop keeps the only item.
        for sentence1 in LUHNsentence:
            LUHNsummary = sentence1
        for sentence2 in LEXsentence:
            LEXsummary = sentence2
        for sentence3 in LSAsentence:
            LSAsummary = sentence3

        dataframe.loc[i, "LUHN"] = LUHNsummary
        dataframe.loc[i, "LEX"] = LEXsummary
        dataframe.loc[i, "LSA"] = LSAsummary
def test_three_sentences():
    document = build_document((
        "wa s s s wa s s s wa",
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("s",)

    returned = summarizer(document, 1)
    assert list(map(to_unicode, returned)) == [
        "wb s wb s wb s s s s s s s s s wb",
    ]

    returned = summarizer(document, 2)
    assert list(map(to_unicode, returned)) == [
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ]

    returned = summarizer(document, 3)
    assert list(map(to_unicode, returned)) == [
        "wa s s s wa s s s wa",
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ]
def test_various_words_with_significant_percentage(self):
    document = build_document((
        "1 a",
        "2 b b",
        "3 c c c",
        "4 d d d",
        "5 z z z z",
        "6 e e e e e",
    ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("1", "2", "3", "4", "5", "6")

    returned = summarizer(document, 1)
    self.assertEqual(len(returned), 1)
    self.assertEqual(to_unicode(returned[0]), "6 e e e e e")

    returned = summarizer(document, 2)
    self.assertEqual(len(returned), 2)
    self.assertEqual(to_unicode(returned[0]), "5 z z z z")
    self.assertEqual(to_unicode(returned[1]), "6 e e e e e")

    returned = summarizer(document, 3)
    self.assertEqual(len(returned), 3)
    self.assertEqual(to_unicode(returned[0]), "3 c c c")
    self.assertEqual(to_unicode(returned[1]), "5 z z z z")
    self.assertEqual(to_unicode(returned[2]), "6 e e e e e")
def get_data_list(URL, file_type=""): SUMMARY_SENTENCES_COUNT = 5 sentences = [] try: LANGUAGE = "english" # parser = None if file_type == "txt": parser = HtmlParser.from_string(URL, None, Tokenizer(LANGUAGE)) elif file_type == "pdf": content = read_pdf(URL) parser = HtmlParser.from_string(content, None, Tokenizer(LANGUAGE)) else: parser = HtmlParser.from_url(URL, Tokenizer(LANGUAGE)) document = parser.document stemmer = Stemmer(LANGUAGE) from sumy.summarizers.luhn import LuhnSummarizer LHS = LuhnSummarizer(stemmer) LHS.stop_words = get_stop_words(LANGUAGE) print("\nSummary using Luhn Summarizer") print("*******************************") for sentence in LHS(document, SUMMARY_SENTENCES_COUNT): sentences.append(str(sentence)) except Exception as e: print(str(e)) finally: return sentences
def test_various_words_with_significant_percentage():
    document = build_document((
        "1 a",
        "2 b b",
        "3 c c c",
        "4 d d d",
        "5 z z z z",
        "6 e e e e e",
    ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("1", "2", "3", "4", "5", "6")

    returned = summarizer(document, 1)
    assert list(map(to_unicode, returned)) == [
        "6 e e e e e",
    ]

    returned = summarizer(document, 2)
    assert list(map(to_unicode, returned)) == [
        "5 z z z z",
        "6 e e e e e",
    ]

    returned = summarizer(document, 3)
    assert list(map(to_unicode, returned)) == [
        "3 c c c",
        "5 z z z z",
        "6 e e e e e",
    ]
def test_three_sentences(self):
    document = build_document((
        "wa s s s wa s s s wa",
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("s",)

    returned = summarizer(document, 1)
    self.assertEqual(len(returned), 1)
    self.assertEqual(to_unicode(returned[0]),
                     "wb s wb s wb s s s s s s s s s wb")

    returned = summarizer(document, 2)
    self.assertEqual(len(returned), 2)
    self.assertEqual(to_unicode(returned[0]),
                     "wb s wb s wb s s s s s s s s s wb")
    self.assertEqual(to_unicode(returned[1]), "wc s s wc s s wc")

    returned = summarizer(document, 3)
    self.assertEqual(len(returned), 3)
    self.assertEqual(to_unicode(returned[0]), "wa s s s wa s s s wa")
    self.assertEqual(to_unicode(returned[1]),
                     "wb s wb s wb s s s s s s s s s wb")
    self.assertEqual(to_unicode(returned[2]), "wc s s wc s s wc")
def test_single_sentence(self):
    document = build_document(("Já jsem jedna věta",))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("já", "jsem",)

    returned = summarizer(document, 10)
    self.assertEqual(len(returned), 1)
def luhn(parser, sentence_count):
    summarizer_1 = LuhnSummarizer(Stemmer(language))
    summarizer_1.stop_words = get_stop_words(language)
    summary_1 = summarizer_1(parser.document, sentence_count)
    temp = ''
    for sentence in summary_1:
        temp = temp + str(sentence)
    return temp
def luhn_summarizer(text, stemmer, LANGUAGE, SENTENCES_COUNT):
    parser = PlaintextParser.from_string(text, sumytoken(LANGUAGE))
    summarizer_luhn = LuhnSummarizer(stemmer)
    summarizer_luhn.stop_words = get_stop_words(LANGUAGE)
    sentences = []
    for sentence in summarizer_luhn(parser.document, SENTENCES_COUNT):
        sentences.append(str(sentence))
    return " ".join(sentences)
def luhn_summarizer(text, stemmer, language, sentences_count):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer_luhn = LuhnSummarizer(stemmer)
    summarizer_luhn.stop_words = get_stop_words(language)
    sentences = []
    for sentence in summarizer_luhn(parser.document, sentences_count):
        sentences.append(str(sentence))
    return "\n".join(sentences)
def test_two_sentences(self):
    document = build_document(("Já jsem 1. věta", "A já ta 2. vítězná výhra"))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("já", "jsem", "a", "ta",)

    returned = summarizer(document, 10)
    self.assertEqual(len(returned), 2)
    self.assertEqual(to_unicode(returned[0]), "Já jsem 1. věta")
    self.assertEqual(to_unicode(returned[1]), "A já ta 2. vítězná výhra")
def summarize_all_variations(text):
    LANGUAGE = "english"
    SENTENCES_COUNT = 1
    stemmer = Stemmer(LANGUAGE)

    lsaSummarizer = Lsa(stemmer)
    lsaSummarizer.stop_words = get_stop_words(LANGUAGE)
    luhnSummarizer = Luhn(stemmer)
    luhnSummarizer.stop_words = get_stop_words(LANGUAGE)
    # edmundsonSummarizer.bonus_words = get_bonus_words
    lexrankSummarizer = LexRank(stemmer)
    lexrankSummarizer.stop_words = get_stop_words(LANGUAGE)
    textrankSummarizer = TxtRank(stemmer)
    textrankSummarizer.stop_words = get_stop_words(LANGUAGE)
    sumbasicSummarizer = SumBasic(stemmer)
    sumbasicSummarizer.stop_words = get_stop_words(LANGUAGE)
    klSummarizer = KL(stemmer)
    klSummarizer.stop_words = get_stop_words(LANGUAGE)

    parser = HtmlParser.from_string(text, 0, Tokenizer(LANGUAGE))

    allvariations = []
    for sentence in lsaSummarizer(parser.document, SENTENCES_COUNT):
        # Summarizing text via LSA
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in luhnSummarizer(parser.document, SENTENCES_COUNT):
        # Summarizing text via Luhn
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in lexrankSummarizer(parser.document, SENTENCES_COUNT):
        # Summarizing text via LexRank
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in textrankSummarizer(parser.document, SENTENCES_COUNT):
        # Summarizing text via TextRank
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in sumbasicSummarizer(parser.document, SENTENCES_COUNT):
        # Summarizing text via SumBasic
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in klSummarizer(parser.document, SENTENCES_COUNT):
        # Summarizing text via KLSum
        print(str(sentence))
        allvariations.append(sentence)
    return allvariations
def luhn(self, text_parser):
    assert isinstance(text_parser, plaintext.PlaintextParser)

    summarizer = Luhn()  # EnglishStemmer())
    # summarizer.stop_words = stopwords.words("english")
    summarizer.stop_words = get_stop_words(settings.SUMMARIZER_LANGUAGE)
    return summarizer(text_parser.document, settings.SUMMARIZER_TOP_X_SENTENCES)
def test_two_sentences():
    document = build_document(("Já jsem 1. věta", "A já ta 2. vítězná výhra"))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("já", "jsem", "a", "ta",)

    returned = summarizer(document, 10)
    assert list(map(to_unicode, returned)) == [
        "Já jsem 1. věta",
        "A já ta 2. vítězná výhra",
    ]
def sumy_luhn_summarizer(docx):
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    luhn_summarizer = LuhnSummarizer(Stemmer("english"))
    luhn_summarizer.stop_words = get_stop_words("english")
    # Summarize the document with 2 sentences
    summary = luhn_summarizer(parser.document, 2)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result
def summarize_url(url, summarizer):
    # E.g. url = "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html"
    print('Summarizing ', url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    if summarizer == 'luhn':
        summarizer = LuhnSummarizer(stemmer)
    elif summarizer == 'edmundson':
        summarizer = ESummarizer(stemmer)
    elif summarizer == 'lsa':
        summarizer = LsaSummarizer(stemmer)
    elif summarizer == 'lex':
        summarizer = LexSummarizer(stemmer)
    elif summarizer == 'text':
        summarizer = TextSummarizer(stemmer)
    elif summarizer == 'sb':
        summarizer = SumBasicSummarizer(stemmer)
    else:
        summarizer = KLSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(summarizer)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))
    return sentences
def luhn_summarizer(data):
    text = data
    parser = PlaintextParser.from_string(text, sumytoken(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    print("\n", "*" * 30, "LUHN SUMMARIZER", "*" * 30)
    summarizer_luhn = LuhnSummarizer(stemmer)
    summarizer_luhn.stop_words = get_stop_words(LANGUAGE)
    result = ''
    for sentence in summarizer_luhn(parser.document, SENTENCES_COUNT):
        result += str(sentence)
    return result
def searchGoogle(querystring):
    # TODO: handle exceptions by re-querying Google and moving to the next
    # link, and do the same for PDF and PPT results.
    num_page = 1
    linkno = 0
    while True:  # keep searching for the answer to querystring until it is found
        try:
            print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
            print("QUERY --> " + str(querystring))
            searchresult = google.search(querystring, num_page)
            # Always follow the first usable link of the Google search results.
            searchlink = searchresult[linkno].link
            print("Search Link --> " + str(searchlink))
            if searchlink[-4:] == '.pdf' or searchlink[-4:] == '.ppt':
                # Go to the next link if the current one is a PPT or PDF.
                print("Can't include ppts or pdfs, trying next link on Google")
                linkno += 1
                if linkno > 9:  # if the links on this page are exhausted, go to the next results page
                    num_page += 1
                    linkno = 0
            else:
                LANGUAGE = "english"
                SENTENCES_COUNT = 10
                parser = HtmlParser.from_url(searchlink, Tokenizer(LANGUAGE))
                # Summarisation using the Luhn summarizer.
                stopwords1 = set(stopwords.words('english'))
                datastring = ''
                summarizer = LuhnSummarizer()
                summarizer.stop_words = stopwords1
                for sentence in summarizer(parser.document, SENTENCES_COUNT):
                    datastring += str(sentence)
                return datastring
        except Exception:
            linkno += 1
            if linkno > 9:  # if the links on this page are exhausted, go to the next results page
                num_page += 1
                linkno = 0
def summarize_text(text):
    language = "english"
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LuhnSummarizer(Stemmer(language))
    summarizer.stop_words = sumy.utils.get_stop_words(language)

    summary_text = ""
    for sentence in summarizer(parser.document, 5):
        summary_text += str(sentence) + " "
    return summary_text
def luhnReferenceSummary(path):
    sentencesList = []
    parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = LuhnSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # print(sentence._text)
        sentencesList.append(sentence._text)
    return sentencesList
def _get_summary(document):
    parser = PlaintextParser.from_string(document, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = STOP_WORDS

    summary = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # Separate sentences with a space so they do not run together.
        summary += " " + " ".join(sentence.words)
    return summary
def Summarize_Content_Custom(Audio_Text, sentences_count, Summarize_Method):
    # Aim for a summary about half as long as the source text.
    actual_sentences_count = int(len(sent_tokenize(Audio_Text)) * 0.5)
    parser = PlaintextParser.from_string(Audio_Text, Tokenizer("english"))
    stemmer = Stemmer("english")

    if Summarize_Method == "Gensim":
        # ratio: define length of the summary as a proportion of the text
        temp = summarize(Audio_Text, ratio=0.5)
        sen = sent_tokenize(temp)
        sen = Counter(sen)
        temp = sen.most_common(sentences_count)
        for value in temp:
            print(value[0])
    elif Summarize_Method == "LexRankSummarizer":
        # LexRank: sentence ranking based on similarity to other sentences
        summarizer_Lex = LexRankSummarizer(stemmer)
        summarizer_Lex.stop_words = get_stop_words("english")
        summary = summarizer_Lex(parser.document, actual_sentences_count)
        sen = Counter(summary)
        temp = sen.most_common(sentences_count)
        for value in temp:
            print(value[0])
    elif Summarize_Method == "LuhnSummarizer":
        # Luhn: sentence ranking based on the frequency of the most important words
        summarizer_luhn = LuhnSummarizer(stemmer)
        summarizer_luhn.stop_words = get_stop_words("english")
        summary_1 = summarizer_luhn(parser.document, actual_sentences_count)
        sen = Counter(summary_1)
        temp = sen.most_common(sentences_count)
        for value in temp:
            print(value[0])
    elif Summarize_Method == "LsaSummarizer":
        # LSA: latent semantic analysis of term-sentence relationships
        summarizer_lsa2 = LsaSummarizer(stemmer)
        summarizer_lsa2.stop_words = get_stop_words("english")
        summary = summarizer_lsa2(parser.document, actual_sentences_count)
        sen = Counter(summary)
        temp = sen.most_common(sentences_count)
        for value in temp:
            print(value[0])
    elif Summarize_Method == "TextRankSummarizer":
        # TextRank: graph-based sentence ranking
        summarizer_text = TextRankSummarizer(stemmer)
        summarizer_text.stop_words = get_stop_words("english")
        summary = summarizer_text(parser.document, actual_sentences_count)
        sen = Counter(summary)
        temp = sen.most_common(sentences_count)
        for value in temp:
            print(value[0])
def test_two_sentences_but_one_winner(self):
    document = build_document(
        ("Já jsem 1. vítězná ta věta", "A já ta 2. vítězná věta"))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("já", "jsem", "a", "ta",)

    returned = summarizer(document, 1)
    self.assertEqual(len(returned), 1)
    self.assertEqual(to_unicode(returned[0]), "A já ta 2. vítězná věta")
def Luhn(rsc_file, dst_file, count):
    language = "chinese"
    parser = PlaintextParser.from_file(rsc_file, Tokenizer(language), encoding='utf-8')
    stemmer = Stemmer(language)  # stemmer for the language
    summarizer = LuhnSummarizer(stemmer)  # Luhn algorithm
    summarizer.stop_words = get_stop_words(language)
    with open(dst_file, 'w', encoding='utf-8') as f:
        for sentence in summarizer(parser.document, count):
            f.write(str(sentence))
            f.write('\n')
            print(sentence)
def webBrowse():
    SENTENCES_COUNT = numOfSent.get()
    parser = HtmlParser.from_url(url.get(), Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizerurl(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    outputFile = open("C://Users//rakesh chandra//Desktop//ATS//outputU.txt", 'w')
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        outputFile.write("-> ")
        outputFile.write(str(sentence))
        outputFile.write("\n \n")
    # Close the file before handing it to the OS viewer.
    outputFile.close()
    os.startfile("C://Users//rakesh chandra//Desktop//ATS//outputU.txt")
def find_relevant_quote(book_id, chapter, num_sentences=1, technique='luhn'):
    """
    Create an extractive summary for a chapter of the book.

    Parameters:
        book_id: (str) the book identifier
        chapter: the chapter number to summarize
        num_sentences: how many sentences to extract
        technique: which summarizer to use (defaults to Luhn)

    Returns:
        sentences: the extracted sentences
    """
    chapter_filename = get_data_filename(book_id, 'book_chapters', chapter)
    parser = PlaintextParser.from_file(chapter_filename, Tokenizer("english"))
    if technique == 'lsa':
        summarizer = LsaSummarizer()
    elif technique == 'lexrank':
        summarizer = LexRankSummarizer()
    elif technique == 'textrank':
        summarizer = TextRankSummarizer()
    elif technique == 'kl':
        summarizer = KLSummarizer()
    elif technique == 'random':
        summarizer = RandomSummarizer()
    elif technique == 'reduction':
        summarizer = ReductionSummarizer()
    elif technique == 'sumbasic':
        summarizer = SumBasicSummarizer()
    else:
        summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return summary
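A minimal usage sketch for the function above. The book id and chapter number are hypothetical, and the exact data layout behind get_data_filename is an assumption; only the signature and return type come from the code itself.

# Hypothetical call; assumes a book '11' whose chapters have been extracted
# under the 'book_chapters' data directory used by get_data_filename.
quote_sentences = find_relevant_quote('11', chapter=3, num_sentences=2, technique='luhn')
for sentence in quote_sentences:
    print(sentence)  # sumy yields Sentence objects; str() gives their text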
def luhn(docx, x):
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    summarizer_1 = LuhnSummarizer()
    summary = summarizer_1(parser.document, x)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result
def summarize_with_info(self, corpus, length, algorithm):
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))

    if algorithm == "textrank":
        summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
        summarizer.bonus_words = parser.significant_words
        summarizer.stigma_words = parser.stigma_words
    elif algorithm == "kl":
        summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "random":
        summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
    else:
        raise NotImplementedError("Summary algorithm is not available")

    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    return summarizer(parser.document, length)
def createSummary (text, language="english", num_sentences=3, method="lexrank"): #LANGUAGE = "english" #SENTENCES_COUNT = 5 # url = "https://en.wikipedia.org/wiki/Automatic_summarization" # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE)) # or for plain text files # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE)) # Language tokenizer tokenizer = Tokenizer(language) parser = PlaintextParser.from_string(text, tokenizer) # word stemming stemmer = Stemmer(language) if (method == "lexrank"): summarizer = LexRankSummarizer(stemmer) elif (method == "lsa"): summarizer = LSASummarizer(stemmer) elif (method == "luhn"): summarizer = LuhnSummarizer(stemmer) elif (method == "kl"): summarizer = KLSummarizer(stemmer) else: raise Exception (f'Unknown summarization method: ${method}') summarizer.stop_words = get_stop_words(language) result = [] for sentence in summarizer(parser.document, num_sentences): result.append (str(sentence)) return result
def Luhn_summarizer(docx):
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    luhn_summarizer = LuhnSummarizer()
    summary_4 = luhn_summarizer(parser.document, 3)
    summary_list_4 = [str(sentence) for sentence in summary_4]
    result_4 = ' '.join(summary_list_4)
    return result_4
def LuhnSummary(document, sentences):
    parser = PlaintextParser.from_string(document, Tokenizer("english"))
    summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, sentences)
    # for sentence in summary:
    #     print(sentence)
    return summary
def test_real_example():
    parser = PlaintextParser.from_string(
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
        "Přerostly až v reparát z jazyka na konci školního roku. "
        "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
        "o rok mladších dětí budoval vedoucí pozici. "
        "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce.",
        Tokenizer("czech")
    )
    summarizer = LuhnSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")

    returned = summarizer(parser.document, 2)
    assert list(map(to_unicode, returned)) == [
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením.",
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě o rok mladších dětí budoval vedoucí pozici.",
    ]
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

import sys

LANGUAGE = "english"
SENTENCES_COUNT = int(sys.argv[2])
text_file = sys.argv[1]

if __name__ == "__main__":
    parser = PlaintextParser.from_file(text_file, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = LuhnSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
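A quick usage sketch for the command-line script above. The file name is an assumption; only the two positional arguments (input file, then sentence count) come from the code.

# Usage sketch (assuming the script is saved as luhn_cli.py):
#
#     python luhn_cli.py document.txt 5
#
# reads document.txt, ranks its sentences with the Luhn summarizer, and
# prints the 5 highest-rated sentences, one per line.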
class TestSentenceRating(unittest.TestCase):
    def setUp(self):
        self.summarizer = LuhnSummarizer()
        self.sentence = build_sentence(
            "Nějaký muž šel kolem naší zahrady a žil pěkný život samotáře")

    def test_significant_words(self):
        self.summarizer.significant_percentage = 1/5
        words = self.summarizer._get_significant_words((
            "wa", "wb", "wc", "wd", "we", "wf", "wg", "wh", "wi", "wj",
            "wa", "wb",
        ))
        self.assertEqual(tuple(sorted(words)), ("wa", "wb"))

    def test_stop_words_not_in_significant_words(self):
        self.summarizer.stop_words = ["stop", "Halt", "SHUT", "HmMm"]
        words = self.summarizer._get_significant_words([
            "stop", "Stop", "StOp", "STOP",
            "halt", "Halt", "HaLt", "HALT",
            "shut", "Shut", "ShUt", "SHUT",
            "hmmm", "Hmmm", "HmMm", "HMMM",
            "some", "relevant", "word", "some", "more", "relevant", "word",
        ])
        self.assertEqual(tuple(sorted(words)), ("relevant", "some", "word"))

    def test_zero_rating(self):
        significant_stems = ()
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 0)

    def test_single_word(self):
        significant_stems = ("muž",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 0)

    def test_single_word_before_end(self):
        significant_stems = ("život",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 0)

    def test_single_word_at_end(self):
        significant_stems = ("samotáře",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 0)

    def test_two_chunks_too_far(self):
        significant_stems = ("šel", "žil",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 0)

    def test_two_chunks_at_begin(self):
        significant_stems = ("muž", "šel",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 2)

    def test_two_chunks_before_end(self):
        significant_stems = ("pěkný", "život",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 2)

    def test_two_chunks_at_end(self):
        significant_stems = ("pěkný", "samotáře",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 4/3)

    def test_three_chunks_at_begin(self):
        significant_stems = ("nějaký", "muž", "šel",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 3)

    def test_three_chunks_at_end(self):
        significant_stems = ("pěkný", "život", "samotáře",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 3)

    def test_three_chunks_with_gaps(self):
        significant_stems = ("muž", "šel", "zahrady",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 9/5)

    def test_chunks_with_user_gap(self):
        self.summarizer.max_gap_size = 6
        significant_stems = ("muž", "šel", "pěkný",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 9/8)

    def test_three_chunks_with_1_gap(self):
        sentence = build_sentence("w s w s w")
        significant_stems = ("w",)
        self.assertEqual(self.summarizer.rate_sentence(sentence, significant_stems), 9/5)

    def test_three_chunks_with_2_gap(self):
        sentence = build_sentence("w s s w s s w")
        significant_stems = ("w",)
        self.assertEqual(self.summarizer.rate_sentence(sentence, significant_stems), 9/7)

    def test_three_chunks_with_3_gap(self):
        sentence = build_sentence("w s s s w s s s w")
        significant_stems = ("w",)
        self.assertEqual(self.summarizer.rate_sentence(sentence, significant_stems), 1)
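The fractional expectations in TestSentenceRating all follow Luhn's chunk-density idea: a sentence is rated by its densest run ("chunk") of significant words, where gaps of non-significant words inside a chunk are bounded by max_gap_size, the score of a chunk is the squared count of its significant words divided by its total length, and isolated single significant words rate 0. The helper below is a hedged illustration of that arithmetic, not sumy's implementation; it merely reproduces the expected values from the tests above.

# Sketch of the chunk-rating rule the tests encode (Luhn, 1958):
#   rating = (significant words in chunk) ** 2 / (total words in chunk)
def chunk_rating(significant_count, chunk_length):
    return significant_count ** 2 / chunk_length

assert chunk_rating(2, 2) == 2      # "muž šel" in test_two_chunks_at_begin
assert chunk_rating(2, 3) == 4/3    # "pěkný život samotáře" in test_two_chunks_at_end
assert chunk_rating(3, 5) == 9/5    # "w s w s w" in test_three_chunks_with_1_gap
assert chunk_rating(3, 7) == 9/7    # "w s s w s s w" in test_three_chunks_with_2_gap
assert chunk_rating(3, 9) == 1      # "w s s s w s s s w" in test_three_chunks_with_3_gap
assert chunk_rating(3, 8) == 9/8    # max_gap_size=6 case in test_chunks_with_user_gap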