def test_single_sentence():
    """Asking for 10 sentences from a one-sentence document returns one."""
    single_sentence_doc = build_document(("Já jsem jedna věta",))

    luhn = LuhnSummarizer()
    luhn.stop_words = ("já", "jsem",)

    summary = luhn(single_sentence_doc, 10)
    assert len(summary) == 1
def models_LUHN_LEX_LSA_2(dataframe):
    """Write one-sentence LUHN, LEX and LSA summaries into ``dataframe``.

    For every row, the text in the ``post_content`` column is parsed and
    summarised to a single sentence by each of the three summarizers. The
    resulting sentence objects are stored in place in the ``LUHN``, ``LEX``
    and ``LSA`` columns of the same row.

    Args:
        dataframe: table with a ``post_content`` column; modified in place.

    Returns:
        None.
    """
    LANGUAGE = "english"
    stop = get_stop_words(LANGUAGE)
    stemmer = Stemmer(LANGUAGE)
    tokenizer = Tokenizer(LANGUAGE)

    # The summarizers do not depend on the row, so build them once.
    summarizers = (
        ("LUHN", LUHN(stemmer)),
        ("LEX", LEX(stemmer)),
        ("LSA", LSA(stemmer)),
    )
    for _column, summarizer in summarizers:
        summarizer.stop_words = stop

    # Iterate over the real row labels so a non-default index also works.
    for row_label in dataframe.index:
        article = dataframe.loc[row_label, "post_content"]
        parser = PlaintextParser.from_string(article, tokenizer)

        # Compute all three summaries before touching the row.
        row_summaries = []
        for column, summarizer in summarizers:
            # Reset per row and per summarizer: an empty summary must not
            # reuse the previous row's sentence or hit an unbound name.
            summary = None
            for sentence in summarizer(parser.document, 1):
                summary = sentence
            row_summaries.append((column, summary))

        for column, summary in row_summaries:
            dataframe.loc[row_label, column] = summary
def test_single_sentence(self):
    """Asking for 10 sentences from a one-sentence document returns one."""
    single_sentence_doc = build_document(("Já jsem jedna věta",))

    luhn = LuhnSummarizer()
    luhn.stop_words = ("já", "jsem",)

    summary = luhn(single_sentence_doc, 10)
    self.assertEqual(len(summary), 1)
def summarize(final_transcript, askuser=False):
    """Summarise a transcript file and append the summary to a text file.

    The summary file name is the transcript path minus its last four
    characters, plus ``_summ_<type>.txt``. Each sentence is written as a
    numbered entry followed by a dashed separator, and is also printed.

    Args:
        final_transcript: path of the transcript to summarise.
        askuser: when true, prompt for the summarizer type; otherwise use
            the module-level ``SUMMMETHOD``.

    Returns:
        The path of the summary file.

    Raises:
        ValueError: if the summarizer type is not '1', '2' or '3'.
    """
    print('Summarizing transcript...')
    parser = PlaintextParser.from_file(final_transcript, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    if askuser:
        summtype = input('Summarizer type? [1: Luhn, 2: Lex-Rank, 3: Text-Rank] ')
    else:
        summtype = SUMMMETHOD

    if summtype == '1':
        summarizer = LuhnSummarizer(stemmer)
        typename = 'luhn'
    elif summtype == '2':
        summarizer = LexRankSummarizer(stemmer)
        typename = 'lex'
    elif summtype == '3':
        summarizer = TextRankSummarizer(stemmer)
        typename = 'tex'
    else:
        # Without this branch an unknown type left `summarizer` unbound and
        # failed later with an unrelated NameError.
        raise ValueError(
            "Unknown summarizer type %r; expected '1', '2' or '3'" % (summtype,)
        )

    summarizer.stop_words = get_stop_words(LANGUAGE)
    summaryfile = str(final_transcript)[:-4] + '_summ_' + typename + '.txt'

    # Summarise first so a summarizer failure does not leave an empty file.
    sentences = summarizer(parser.document, SENTENCES_COUNT)

    # Open once (still in append mode) instead of once per sentence.
    with open(summaryfile, 'a') as f:
        for number, sentence in enumerate(sentences, start=1):
            sentence_out = str(number) + ':\n' + str(sentence) + '\n--------------\n'
            f.write(sentence_out)
            print(sentence_out)
    return summaryfile
def test_various_words_with_significant_percentage():
    """Check the sentences returned for summary sizes 1, 2 and 3."""
    document = build_document((
        "1 a",
        "2 b b",
        "3 c c c",
        "4 d d d",
        "5 z z z z",
        "6 e e e e e",
    ))
    luhn = LuhnSummarizer()
    luhn.stop_words = ("1", "2", "3", "4", "5", "6")

    # (requested sentence count, expected summary)
    cases = (
        (1, ["6 e e e e e"]),
        (2, ["5 z z z z", "6 e e e e e"]),
        (3, ["3 c c c", "5 z z z z", "6 e e e e e"]),
    )
    for count, expected in cases:
        summary = luhn(document, count)
        assert [to_unicode(sentence) for sentence in summary] == expected
def test_three_sentences():
    """Check the sentences returned for summary sizes 1, 2 and 3."""
    first = "wa s s s wa s s s wa"
    second = "wb s wb s wb s s s s s s s s s wb"
    third = "wc s s wc s s wc"

    document = build_document((first, second, third))
    luhn = LuhnSummarizer()
    luhn.stop_words = ("s",)

    # (requested sentence count, expected summary)
    cases = (
        (1, [second]),
        (2, [second, third]),
        (3, [first, second, third]),
    )
    for count, expected in cases:
        summary = luhn(document, count)
        assert [to_unicode(sentence) for sentence in summary] == expected
def test_three_sentences():
    """Check the sentences returned for summary sizes 1, 2 and 3."""
    first = "wa s s s wa s s s wa"
    second = "wb s wb s wb s s s s s s s s s wb"
    third = "wc s s wc s s wc"

    document = build_document((first, second, third))
    luhn = LuhnSummarizer()
    luhn.stop_words = ("s",)

    # (requested sentence count, expected summary)
    cases = (
        (1, [second]),
        (2, [second, third]),
        (3, [first, second, third]),
    )
    for count, expected in cases:
        summary = luhn(document, count)
        assert [to_unicode(sentence) for sentence in summary] == expected
def summarize_url(url, summarizer):
    """Download ``url`` and return its summary as a list of strings.

    Progress, the chosen summarizer object and every selected sentence are
    printed along the way.

    Args:
        url: address of the HTML page to summarise, e.g.
            "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html".
        summarizer: algorithm name: 'luhn', 'edmundson', 'lsa', 'lex',
            'text' or 'sb'. Any other value selects the KL summarizer.

    Returns:
        list of str: the summary sentences, requested with the module-level
        ``SENTENCES_COUNT``.
    """
    # One pre-formatted argument prints the same text whether `print` is the
    # Python 2 statement or the Python 3 function.
    print('Summarizing  %s' % (url,))
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # For plain text files use instead:
    #   PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    # Keep the chosen object in its own name rather than rebinding the
    # `summarizer` argument from a string to an instance.
    if summarizer == 'luhn':
        engine = LuhnSummarizer(stemmer)
    elif summarizer == 'edmundson':
        engine = ESummarizer(stemmer)
    elif summarizer == 'lsa':
        engine = LsaSummarizer(stemmer)
    elif summarizer == 'lex':
        engine = LexSummarizer(stemmer)
    elif summarizer == 'text':
        engine = TextSummarizer(stemmer)
    elif summarizer == 'sb':
        engine = SumBasicSummarizer(stemmer)
    else:
        engine = KLSummarizer(stemmer)
    engine.stop_words = get_stop_words(LANGUAGE)
    print(engine)

    sentences = []
    for sentence in engine(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))
    return sentences
def get_data_list(URL, file_type=""):
    """Return up to five Luhn summary sentences for the given source.

    Args:
        URL: the source to summarise. With ``file_type == "txt"`` it is
            passed to the parser as the content string itself; with
            ``"pdf"`` it is handed to ``read_pdf``; otherwise it is fetched
            as a web address.
        file_type: ``"txt"``, ``"pdf"`` or anything else for a URL.

    Returns:
        list of str: the summary sentences. Any ``Exception`` is printed
        and the sentences collected so far (possibly none) are returned.
    """
    SUMMARY_SENTENCES_COUNT = 5
    sentences = []
    try:
        LANGUAGE = "english"
        if file_type == "txt":
            parser = HtmlParser.from_string(URL, None, Tokenizer(LANGUAGE))
        elif file_type == "pdf":
            content = read_pdf(URL)
            parser = HtmlParser.from_string(content, None, Tokenizer(LANGUAGE))
        else:
            parser = HtmlParser.from_url(URL, Tokenizer(LANGUAGE))

        document = parser.document
        stemmer = Stemmer(LANGUAGE)

        from sumy.summarizers.luhn import LuhnSummarizer

        LHS = LuhnSummarizer(stemmer)
        LHS.stop_words = get_stop_words(LANGUAGE)
        print("\nSummary using Luhn Summarizer")
        print("*******************************")
        for sentence in LHS(document, SUMMARY_SENTENCES_COUNT):
            sentences.append(str(sentence))
    except Exception as e:
        # Deliberate best effort: report the problem, return what we have.
        print(str(e))
    # Returning here rather than from a `finally` clause lets
    # KeyboardInterrupt and SystemExit propagate instead of being discarded.
    return sentences
def summarize_url(url, summarizer):
    """Download ``url`` and return its summary as a list of strings.

    Progress, the chosen summarizer object and every selected sentence are
    printed along the way.

    Args:
        url: address of the HTML page to summarise, e.g.
            "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html".
        summarizer: algorithm name: 'luhn', 'edmundson', 'lsa', 'lex',
            'text' or 'sb'. Any other value selects the KL summarizer.

    Returns:
        list of str: the summary sentences, requested with the module-level
        ``SENTENCES_COUNT``.
    """
    # One pre-formatted argument prints the same text whether `print` is the
    # Python 2 statement or the Python 3 function.
    print('Summarizing  %s' % (url,))
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # For plain text files use instead:
    #   PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    # Keep the chosen object in its own name rather than rebinding the
    # `summarizer` argument from a string to an instance.
    if summarizer == 'luhn':
        engine = LuhnSummarizer(stemmer)
    elif summarizer == 'edmundson':
        engine = ESummarizer(stemmer)
    elif summarizer == 'lsa':
        engine = LsaSummarizer(stemmer)
    elif summarizer == 'lex':
        engine = LexSummarizer(stemmer)
    elif summarizer == 'text':
        engine = TextSummarizer(stemmer)
    elif summarizer == 'sb':
        engine = SumBasicSummarizer(stemmer)
    else:
        engine = KLSummarizer(stemmer)
    engine.stop_words = get_stop_words(LANGUAGE)
    print(engine)

    sentences = []
    for sentence in engine(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))
    return sentences
def test_various_words_with_significant_percentage(self):
    """Check the sentences returned for summary sizes 1, 2 and 3."""
    document = build_document((
        "1 a",
        "2 b b",
        "3 c c c",
        "4 d d d",
        "5 z z z z",
        "6 e e e e e",
    ))
    luhn = LuhnSummarizer()
    luhn.stop_words = ("1", "2", "3", "4", "5", "6")

    # One entry per requested size; the size equals the entry's length.
    expected_summaries = (
        ("6 e e e e e",),
        ("5 z z z z", "6 e e e e e"),
        ("3 c c c", "5 z z z z", "6 e e e e e"),
    )
    for expected in expected_summaries:
        summary = luhn(document, len(expected))
        self.assertEqual(len(summary), len(expected))
        for position, text in enumerate(expected):
            self.assertEqual(to_unicode(summary[position]), text)
def test_three_sentences(self):
    """Check the sentences returned for summary sizes 1, 2 and 3."""
    first = "wa s s s wa s s s wa"
    second = "wb s wb s wb s s s s s s s s s wb"
    third = "wc s s wc s s wc"

    document = build_document((first, second, third))
    luhn = LuhnSummarizer()
    luhn.stop_words = ("s", )

    # One entry per requested size; the size equals the entry's length.
    expected_summaries = (
        (second,),
        (second, third),
        (first, second, third),
    )
    for expected in expected_summaries:
        summary = luhn(document, len(expected))
        self.assertEqual(len(summary), len(expected))
        for position, text in enumerate(expected):
            self.assertEqual(to_unicode(summary[position]), text)
def test_various_words_with_significant_percentage(self):
    """Check the sentences returned for summary sizes 1, 2 and 3."""
    document = build_document((
        "1 a",
        "2 b b",
        "3 c c c",
        "4 d d d",
        "5 z z z z",
        "6 e e e e e",
    ))
    luhn = LuhnSummarizer()
    luhn.stop_words = ("1", "2", "3", "4", "5", "6")

    # One entry per requested size; the size equals the entry's length.
    expected_summaries = (
        ("6 e e e e e",),
        ("5 z z z z", "6 e e e e e"),
        ("3 c c c", "5 z z z z", "6 e e e e e"),
    )
    for expected in expected_summaries:
        summary = luhn(document, len(expected))
        self.assertEqual(len(summary), len(expected))
        for position, text in enumerate(expected):
            self.assertEqual(to_unicode(summary[position]), text)
def test_various_words_with_significant_percentage():
    """Check the sentences returned for summary sizes 1, 2 and 3."""
    document = build_document((
        "1 a",
        "2 b b",
        "3 c c c",
        "4 d d d",
        "5 z z z z",
        "6 e e e e e",
    ))
    luhn = LuhnSummarizer()
    luhn.stop_words = ("1", "2", "3", "4", "5", "6")

    # (requested sentence count, expected summary)
    cases = (
        (1, ["6 e e e e e"]),
        (2, ["5 z z z z", "6 e e e e e"]),
        (3, ["3 c c c", "5 z z z z", "6 e e e e e"]),
    )
    for count, expected in cases:
        summary = luhn(document, count)
        assert [to_unicode(sentence) for sentence in summary] == expected
def luhn(parser, sentence_count):
    """Return the Luhn summary of ``parser.document`` as a single string.

    The selected sentences are concatenated with no separator. The language
    comes from the module-level ``language`` name.

    Args:
        parser: object exposing the parsed text as ``.document``.
        sentence_count: number of sentences to request.

    Returns:
        str: the concatenated summary sentences.
    """
    luhn_engine = LuhnSummarizer(Stemmer(language))
    luhn_engine.stop_words = get_stop_words(language)
    picked = luhn_engine(parser.document, sentence_count)
    return ''.join(str(sentence) for sentence in picked)
def luhn_summarizer(text, stemmer, language, sentences_count):
    """Return the Luhn summary of ``text``, one sentence per line.

    Args:
        text: plain text to summarise.
        stemmer: stemmer handed to ``LuhnSummarizer``.
        language: language used for the tokenizer and the stop words.
        sentences_count: number of sentences to request.

    Returns:
        str: the summary sentences joined with newlines.
    """
    document = PlaintextParser.from_string(text, Tokenizer(language)).document
    luhn_engine = LuhnSummarizer(stemmer)
    luhn_engine.stop_words = get_stop_words(language)
    picked = luhn_engine(document, sentences_count)
    return "\n".join([str(sentence) for sentence in picked])
def test_two_sentences(self):
    """Both sentences come back, in document order, when 10 are requested."""
    source_sentences = ("Já jsem 1. věta", "A já ta 2. vítězná výhra")
    document = build_document(source_sentences)

    luhn = LuhnSummarizer()
    luhn.stop_words = ("já", "jsem", "a", "ta",)
    summary = luhn(document, 10)

    self.assertEqual(len(summary), 2)
    for position, text in enumerate(source_sentences):
        self.assertEqual(to_unicode(summary[position]), text)
def __init__():
    """Summarise ``text`` with six summarizers and collect every sentence.

    Each summarizer (LSA, Luhn, LexRank, TextRank, SumBasic, KL, in that
    order) is asked for one sentence; each sentence is printed and appended
    to the result.

    NOTE(review): ``text`` is not a parameter or a local, so it is presumably
    a module-level name; confirm against the enclosing file.

    Returns:
        list: the sentence objects produced by the summarizers, in order.
    """
    LANGUAGE = "english"
    SENTENCES_COUNT = 1
    stemmer = Stemmer(LANGUAGE)

    # Build and configure the summarizers in the order they are run.
    engines = []
    for factory in (Lsa, Luhn, LexRank, TxtRank, SumBasic, KL):
        engine = factory(stemmer)
        engine.stop_words = get_stop_words(LANGUAGE)
        engines.append(engine)

    parser = HtmlParser.from_string(text, 0, Tokenizer(LANGUAGE))

    allvariations = []
    for engine in engines:
        for sentence in engine(parser.document, SENTENCES_COUNT):
            print(str(sentence))
            allvariations.append(sentence)
    return allvariations
def luhn_summarizer(text, stemmer, LANGUAGE, SENTENCES_COUNT):
    """Return the Luhn summary of ``text`` as one space-separated string.

    Args:
        text: plain text to summarise.
        stemmer: stemmer handed to ``LuhnSummarizer``.
        LANGUAGE: language used for the tokenizer and the stop words.
        SENTENCES_COUNT: number of sentences to request.

    Returns:
        str: the summary sentences joined with single spaces.
    """
    document = PlaintextParser.from_string(text, sumytoken(LANGUAGE)).document
    luhn_engine = LuhnSummarizer(stemmer)
    luhn_engine.stop_words = get_stop_words(LANGUAGE)
    picked = luhn_engine(document, SENTENCES_COUNT)
    return " ".join([str(sentence) for sentence in picked])
def luhn(self, text_parser):
    """Summarise the parsed document with the Luhn summarizer.

    The stop-word language and the sentence limit are read from
    ``settings.SUMMARIZER_LANGUAGE`` and ``settings.SUMMARIZER_TOP_X_SENTENCES``.

    Args:
        text_parser: a ``plaintext.PlaintextParser`` (checked with an assert).

    Returns:
        Whatever the summarizer call returns for the document.
    """
    assert isinstance(text_parser, plaintext.PlaintextParser)

    luhn_engine = Luhn()
    luhn_engine.stop_words = get_stop_words(settings.SUMMARIZER_LANGUAGE)

    document = text_parser.document
    return luhn_engine(document, settings.SUMMARIZER_TOP_X_SENTENCES)
def test_two_sentences():
    """Both sentences come back, in document order, when 10 are requested."""
    source_sentences = ("Já jsem 1. věta", "A já ta 2. vítězná výhra")
    document = build_document(source_sentences)

    luhn = LuhnSummarizer()
    luhn.stop_words = ("já", "jsem", "a", "ta",)
    summary = luhn(document, 10)

    assert [to_unicode(sentence) for sentence in summary] == [
        "Já jsem 1. věta",
        "A já ta 2. vítězná výhra",
    ]
def test_two_sentences():
    """Both sentences come back, in document order, when 10 are requested."""
    source_sentences = ("Já jsem 1. věta", "A já ta 2. vítězná výhra")
    document = build_document(source_sentences)

    luhn = LuhnSummarizer()
    luhn.stop_words = ("já", "jsem", "a", "ta",)
    summary = luhn(document, 10)

    assert [to_unicode(sentence) for sentence in summary] == [
        "Já jsem 1. věta",
        "A já ta 2. vítězná výhra",
    ]
def sumy_luhn_summarizer(docx):
    """Return a two-sentence English Luhn summary of ``docx``.

    Args:
        docx: plain text to summarise.

    Returns:
        str: the summary sentences joined with single spaces.
    """
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    # Only the stemmer-backed summarizer is built; the original also created
    # a bare LuhnSummarizer() that was overwritten before use.
    luhn_summarizer = LuhnSummarizer(Stemmer("english"))
    luhn_summarizer.stop_words = get_stop_words("english")

    # Summarize the document with 2 sentences.
    summary = luhn_summarizer(parser.document, 2)
    return ' '.join(str(sentence) for sentence in summary)
def _get_summary(document):
    """Return the summary of ``document`` as one space-separated string.

    Each selected sentence is rebuilt from its words, and the sentences are
    joined with single spaces so the last word of one sentence no longer
    fuses with the first word of the next. The result keeps the original
    leading space.

    Args:
        document: plain text to summarise.

    Returns:
        str: a leading space followed by the summary words.
    """
    parser = PlaintextParser.from_string(document, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = STOP_WORDS

    sentence_texts = [
        " ".join(sentence.words)
        for sentence in summarizer(parser.document, SENTENCES_COUNT)
    ]
    # Leading space kept so zero- and one-sentence results are unchanged.
    return " " + " ".join(sentence_texts)
def summarize_text(text):
    """Return a five-sentence English Luhn summary of ``text``.

    Args:
        text: plain text to summarise.

    Returns:
        str: the summary sentences, each followed by a single space.
    """
    language = "english"
    document = PlaintextParser.from_string(text, Tokenizer(language)).document

    luhn_engine = LuhnSummarizer(Stemmer(language))
    luhn_engine.stop_words = sumy.utils.get_stop_words(language)

    return "".join(str(sentence) + " " for sentence in luhn_engine(document, 5))
def searchGoogle(querystring):
    """Search Google for ``querystring`` and summarise the first usable hit.

    Results are tried in order. Links ending in ``.pdf`` or ``.ppt`` and
    links whose download or summarisation raises are skipped. After the
    tenth link of a page the search moves to the next results page. The
    loop runs until one link yields a summary.

    Args:
        querystring: the search query.

    Returns:
        str: up to ten Luhn summary sentences concatenated with no separator.
    """
    num_page = 1
    linkno = 0
    while True:
        try:
            print(
                '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
            )
            print("QUERY --> " + str(querystring))
            searchresult = google.search(querystring, num_page)
            searchlink = searchresult[linkno].link
            print("Search Link --> " + str(searchlink))

            if not searchlink.endswith(('.pdf', '.ppt')):
                LANGUAGE = "english"
                SENTENCES_COUNT = 10
                parser = HtmlParser.from_url(searchlink, Tokenizer(LANGUAGE))

                # Summarisation using the Luhn summarizer.
                stopwords1 = set(stopwords.words('english'))
                summarizer = LuhnSummarizer()
                summarizer.stop_words = stopwords1
                return ''.join(
                    str(sentence)
                    for sentence in summarizer(parser.document, SENTENCES_COUNT)
                )

            print("Can't include ppts or pdfs, trying next link on Google")
        except Exception:
            # Deliberate best effort: any failure on this link just moves on
            # to the next one. `Exception` rather than a bare `except` keeps
            # Ctrl-C and SystemExit able to stop this unbounded loop.
            pass

        # Reached only when the current link was skipped or failed.
        linkno += 1
        if linkno > 9:
            # Past the last link of this page: go to the next results page.
            num_page += 1
            linkno = 0
def luhn_summarizer(data):
    """Return the Luhn summary of ``data`` as one string, after a banner.

    The language and sentence count come from the module-level ``LANGUAGE``
    and ``SENTENCES_COUNT``.

    Args:
        data: plain text to summarise.

    Returns:
        str: the summary sentences concatenated with no separator.
    """
    parser = PlaintextParser.from_string(data, sumytoken(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    print("\n", "*" * 30, "LUHN SUMMARIZER", "*" * 30)

    luhn_engine = LuhnSummarizer(stemmer)
    luhn_engine.stop_words = get_stop_words(LANGUAGE)

    picked = luhn_engine(parser.document, SENTENCES_COUNT)
    return ''.join(str(sentence) for sentence in picked)
def luhnReferenceSummary(path):
    """Return the Luhn summary of the text file at ``path``.

    The language and sentence count come from the module-level ``LANGUAGE``
    and ``SENTENCES_COUNT``.

    Args:
        path: path of the plain-text file to summarise.

    Returns:
        list: the ``_text`` attribute of every selected sentence.
    """
    document = PlaintextParser.from_file(path, Tokenizer(LANGUAGE)).document

    luhn_engine = LuhnSummarizer(Stemmer(LANGUAGE))
    luhn_engine.stop_words = get_stop_words(LANGUAGE)

    return [picked._text for picked in luhn_engine(document, SENTENCES_COUNT)]
def luhnReferenceSummary(path):
    """Return the Luhn summary of the text file at ``path``.

    The language and sentence count come from the module-level ``LANGUAGE``
    and ``SENTENCES_COUNT``.

    Args:
        path: path of the plain-text file to summarise.

    Returns:
        list: the ``_text`` attribute of every selected sentence.
    """
    document = PlaintextParser.from_file(path, Tokenizer(LANGUAGE)).document

    luhn_engine = LuhnSummarizer(Stemmer(LANGUAGE))
    luhn_engine.stop_words = get_stop_words(LANGUAGE)

    return [picked._text for picked in luhn_engine(document, SENTENCES_COUNT)]
def _print_most_common(items, limit):
    """Print the ``limit`` most common entries of ``items``, one per line."""
    for value, _occurrences in Counter(items).most_common(limit):
        print(value)


def Summarize_Content_Custom(Audio_Text, sentences_count, Summarize_Method):
    """Print a summary of ``Audio_Text`` produced by the chosen method.

    The chosen summarizer is asked for half as many sentences as the text
    contains; the ``sentences_count`` most common of those are then printed.
    Nothing is returned.

    Args:
        Audio_Text: plain text to summarise.
        sentences_count: maximum number of sentences to print.
        Summarize_Method: one of "Gensim", "LexRankSummarizer",
            "LuhnSummarizer", "LsaSummarizer" or "TextRankSummarizer".
            Any other value prints nothing.
    """
    actual_sentences_count = float(len(sent_tokenize(Audio_Text))) * 0.5
    parser = PlaintextParser.from_string(Audio_Text, Tokenizer("english"))
    stemmer = Stemmer("english")

    if Summarize_Method == "Gensim":
        # ratio: length of the summary as a proportion of the text.
        condensed = summarize(Audio_Text, ratio=0.5)
        _print_most_common(sent_tokenize(condensed), sentences_count)
        return

    if Summarize_Method == "LexRankSummarizer":
        summarizer_class = LexRankSummarizer
    elif Summarize_Method == "LuhnSummarizer":
        summarizer_class = LuhnSummarizer
    elif Summarize_Method == "LsaSummarizer":
        summarizer_class = LsaSummarizer
    elif Summarize_Method == "TextRankSummarizer":
        summarizer_class = TextRankSummarizer
    else:
        # Unknown method: print nothing, as before.
        return

    # One shared path replaces the per-method copies, which also built a
    # throw-away stemmer-less summarizer for LSA and TextRank.
    summarizer = summarizer_class(stemmer)
    summarizer.stop_words = get_stop_words("english")
    summary = summarizer(parser.document, actual_sentences_count)
    _print_most_common(summary, sentences_count)
def Luhn(rsc_file, dst_file, count):
    """Summarise a Chinese UTF-8 text file and write the summary to a file.

    Every selected sentence is written on its own line and also printed.

    Args:
        rsc_file: path of the source text file.
        dst_file: path of the file to write (overwritten).
        count: number of sentences to request.
    """
    language = "chinese"
    source = PlaintextParser.from_file(
        rsc_file, Tokenizer(language), encoding='utf-8')

    # Language-specific stemmer.
    stemmer = Stemmer(language)
    # Luhn algorithm.
    luhn_engine = LuhnSummarizer(stemmer)
    luhn_engine.stop_words = get_stop_words(language)

    with open(dst_file, 'w', encoding='utf-8') as out:
        for picked in luhn_engine(source.document, count):
            out.write(str(picked) + '\n')
            print(picked)
def test_two_sentences_but_one_winner(self):
    """With one sentence requested, the second sentence is the one returned."""
    loser = "Já jsem 1. vítězná ta věta"
    winner = "A já ta 2. vítězná věta"
    document = build_document((loser, winner))

    luhn = LuhnSummarizer()
    luhn.stop_words = ("já", "jsem", "a", "ta",)
    summary = luhn(document, 1)

    self.assertEqual(len(summary), 1)
    self.assertEqual(to_unicode(summary[0]), winner)
def webBrowse():
    """Summarise the page at the entered URL, save it and open the file.

    The URL and sentence count are read from the ``url`` and ``numOfSent``
    objects via ``.get()`` (presumably GUI input variables; confirm against
    the enclosing file). Each sentence is printed and written to the output
    file prefixed with ``-> ``; the file is then opened with
    ``os.startfile``.
    """
    SENTENCES_COUNT = numOfSent.get()
    parser = HtmlParser.from_url(url.get(), Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizerurl(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    output_path = "C://Users//rakesh chandra//Desktop//ATS//outputU.txt"
    # Close (and so flush) the file before handing it to the viewer;
    # otherwise the viewer can open an empty or partial file.
    with open(output_path, 'w') as outputFile:
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
            outputFile.write("-> ")
            outputFile.write(str(sentence))
            outputFile.write("\n \n")
    os.startfile(output_path)
def get_content(url_name, extras):
    """Fetch a web page and return a five-sentence summary of it.

    Args:
        url_name: address of the page; converted with ``str``.
        extras: stop words assigned to the summarizer.

    Returns:
        On success, ``{'result': <summary text>, 'source': <url>}`` with the
        sentences joined by single spaces. If summarising raises, the error
        is printed and the tuple ``(extras, sentence)`` is returned, where
        ``sentence`` is the last sentence processed, or ``None`` if the
        failure happened before any sentence was produced.
    """
    LANGUAGE = "english"
    SENTENCES_COUNT = 5
    url = str(url_name)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = extras

    # Bound up front: the handler below returns it, and it was previously
    # unbound (NameError inside the handler) when the summarizer failed
    # before yielding a first sentence.
    sentence = None
    try:
        result = []
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            result.append(str(sentence))
        result = ' '.join(result)
        return {'result': result, 'source': url}
    except Exception as e:
        print('fail: ', e)
        return extras, sentence
def test_real_example():
    """Summarise a real Czech paragraph to two sentences and check them."""
    czech_text = (
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
        "Přerostly až v reparát z jazyka na konci školního roku. "
        "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
        "o rok mladších dětí budoval vedoucí pozici. "
        "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce."
    )
    parser = PlaintextParser.from_string(czech_text, Tokenizer("czech"))

    luhn = LuhnSummarizer(stem_word)
    luhn.stop_words = get_stop_words("czech")
    summary = luhn(parser.document, 2)

    assert [to_unicode(sentence) for sentence in summary] == [
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením.",
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě o rok mladších dětí budoval vedoucí pozici.",
    ]
def models_LUHN_LEX_LSA(article):
    """Return one-sentence LUHN, LEX and LSA summaries of ``article``.

    Args:
        article: plain English text to summarise.

    Returns:
        list: three entries in the order LUHN, LEX, LSA. Each entry is the
        sentence produced by that summarizer, or ``None`` when it produced
        no sentence (previously that case raised on an unbound variable).
    """
    LANGUAGE = "english"
    stop = get_stop_words(LANGUAGE)
    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser.from_string(article, Tokenizer(LANGUAGE))

    result = []
    for summarizer_class in (LUHN, LEX, LSA):
        summarizer = summarizer_class(stemmer)
        summarizer.stop_words = stop

        # Summarize the document with one sentence; keep the last one
        # yielded, and keep the slot filled even when nothing is yielded.
        summary = None
        for sentence in summarizer(parser.document, 1):
            summary = sentence
        result.append(summary)
    return result
def test_real_example():
    """Summarise a real Czech paragraph to two sentences and check them."""
    czech_text = (
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
        "Přerostly až v reparát z jazyka na konci školního roku. "
        "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
        "o rok mladších dětí budoval vedoucí pozici. "
        "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce."
    )
    parser = PlaintextParser.from_string(czech_text, Tokenizer("czech"))

    luhn = LuhnSummarizer(stem_word)
    luhn.stop_words = get_stop_words("czech")
    summary = luhn(parser.document, 2)

    assert [to_unicode(sentence) for sentence in summary] == [
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením.",
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě o rok mladších dětí budoval vedoucí pozici.",
    ]
def simple_check():
    """Print a Luhn summary of a fixed article, then its prettified HTML.

    The article is first summarised to five sentences, which are printed
    under a banner. The page is then downloaded again, parsed with
    BeautifulSoup and printed with ``prettify()``.
    """
    SUMMARY_SENTENCES_COUNT = 5
    LANGUAGE = "english"
    URL = "https://qz.com/1367800/ubernomics-is-ubers-semi-secret-internal-economics-department/"

    parser = HtmlParser.from_url(URL, Tokenizer(LANGUAGE))
    document = parser.document
    stemmer = Stemmer(LANGUAGE)

    from sumy.summarizers.luhn import LuhnSummarizer

    LHS = LuhnSummarizer(stemmer)
    LHS.stop_words = get_stop_words(LANGUAGE)
    print("\nSummary using Luhn Summarizer")
    print("*******************************")
    for sentence in LHS(document, SUMMARY_SENTENCES_COUNT):
        print(sentence)

    # Close the HTTP response once it is read instead of leaking it.
    with urllib.request.urlopen(URL) as response:
        html = response.read()
    soup = BeautifulSoup(html, features='html.parser')
    print(soup.prettify())
def main(req: func.HttpRequest) -> func.HttpResponse:
    """Summarise the HTML in the request body and return it as the response.

    The body is stripped to plain text with BeautifulSoup and summarised to
    ``log2(number of '.' characters)`` sentences, which are concatenated
    with no separator. Backslash escapes of the form ``\\xyz`` are removed
    from the result before it is returned.

    Args:
        req: the incoming HTTP request; its body is the HTML to summarise.

    Returns:
        An HTTP response whose body is the summary text (empty when the
        text contains no period).
    """
    logging.info('Python HTTP trigger function processed a request.')

    # Decode the raw bytes. str() on bytes yields the "b'...'" repr, with
    # every newline and non-ASCII character turned into a backslash escape.
    text = req.get_body().decode('utf-8', errors='replace')
    soup = BeautifulSoup(text, features="lxml")
    souped = soup.get_text()

    # log2 is undefined for 0: a body without any period used to raise
    # ValueError. Treat it as "nothing to summarise".
    period_count = souped.count('.')
    SENTENCES_COUNT = math.log2(period_count) if period_count > 0 else 0

    parser = PlaintextParser.from_string(souped, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    ret = "".join(
        str(sentence)
        for sentence in summarizer(parser.document, SENTENCES_COUNT)
    )
    # Kept for bodies that already contain literal escape sequences.
    return func.HttpResponse(re.sub(r'\\\w{3}', '', ret))
def get_summary_per_section_luhn(cur_sents, each_summ_num):
    """Summarise one section and return the text of the chosen sentences.

    NOTE(review): despite "luhn" in the name, the summarizer in effect is
    ``LsaSummarizer``. The original built a ``LuhnSummarizer()`` and
    overwrote it on the next line. That dead instantiation is removed and
    the effective (LSA) behaviour kept; confirm which algorithm was intended.

    Args:
        cur_sents: text of the section, handed to ``PlaintextParser``.
        each_summ_num: number of sentences to request.

    Returns:
        list: the ``_text`` attribute of every selected sentence.
    """
    summarizer = LsaSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = (
        "I",
        "am",
        "the",
        "you",
        "are",
        "me",
        "is",
        "than",
        "that",
        "this",
    )
    parser = PlaintextParser(cur_sents, Tokenizer(LANGUAGE))
    summ = summarizer(parser.document, each_summ_num)
    return [line._text for line in summ]
def test_three_sentences(self):
    """Check the sentences returned for summary sizes 1, 2 and 3."""
    first = "wa s s s wa s s s wa"
    second = "wb s wb s wb s s s s s s s s s wb"
    third = "wc s s wc s s wc"

    document = build_document((first, second, third))
    luhn = LuhnSummarizer()
    luhn.stop_words = ("s",)

    # One entry per requested size; the size equals the entry's length.
    expected_summaries = (
        (second,),
        (second, third),
        (first, second, third),
    )
    for expected in expected_summaries:
        summary = luhn(document, len(expected))
        self.assertEqual(len(summary), len(expected))
        for position, text in enumerate(expected):
            self.assertEqual(to_unicode(summary[position]), text)
from __future__ import absolute_import from __future__ import division, print_function, unicode_literals from sumy.parsers.html import HtmlParser from sumy.parsers.plaintext import PlaintextParser from sumy.nlp.tokenizers import Tokenizer from sumy.summarizers.lsa import LsaSummarizer from sumy.summarizers.luhn import LuhnSummarizer from sumy.nlp.stemmers import Stemmer from sumy.utils import get_stop_words import sys LANGUAGE = "english" SENTENCES_COUNT = int(sys.argv[2]) text_file = sys.argv[1] if __name__ == "__main__": parser = PlaintextParser.from_file(text_file, Tokenizer(LANGUAGE)) stemmer = Stemmer(LANGUAGE) summarizer = LuhnSummarizer(stemmer) summarizer.stop_words = get_stop_words(LANGUAGE) for sentence in summarizer(parser.document, SENTENCES_COUNT): print(sentence)