def summarize(text):
    if not isvalid(text):
        return ''
    # An all-capitals input would tokenize into an empty summary, so
    # lowercase it here and restore the uppercase at the end.
    all_capital = False
    if text.upper() == text:
        text = text.lower()
        all_capital = True
    if PY2:  # PY2 flag is assumed to be defined elsewhere (e.g. six.PY2)
        parser = PlaintextParser.from_string(
            text.decode('ascii', errors='ignore'), Tokenizer(LANGUAGE))
    else:
        parser = PlaintextParser.from_string(
            text.encode().decode('ascii', errors='ignore'), Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = [
        str(s)
        for s in summarizer(parser.document, sentences_count=n_sentences)
    ]
    output_sentences = ' '.join(sentences)
    if all_capital:
        output_sentences = output_sentences.upper()
    return output_sentences
def get_doc_summary(html, url):
    '''
    Parse document text and extract a summary with summarization algorithms.
    This is helpful when the meta-description tag is not available.
    '''
    from sumy.parsers.html import HtmlParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    LANGUAGE = "english"
    SENTENCES_COUNT = 3

    parser = HtmlParser.from_string(html, url, Tokenizer(LANGUAGE))
    # or for plain text files:
    # from sumy.parsers.plaintext import PlaintextParser
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # Join with spaces so the selected sentences do not run together.
    return " ".join(str(sentence)
                    for sentence in summarizer(parser.document, SENTENCES_COUNT))
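# A minimal usage sketch for get_doc_summary. The page fetch via `requests`
# and the URL below are assumptions for illustration, not part of the
# original snippet.
import requests

page_url = "https://example.com/article"  # placeholder
page_html = requests.get(page_url, timeout=10).text
print(get_doc_summary(page_html, page_url))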
def test_single_sentence(self):
    document = build_document(("I am one sentence",))
    summarizer = TextRankSummarizer()
    summarizer.stop_words = ("I", "am",)

    returned = summarizer(document, 10)

    self.assertEqual(len(returned), 1)
def __init__(self, modelfn=None, classnames=None, language="english",
             explainer=None, summarizer=None, fm=962,
             topfeaturescount=100, sentencescount=6, logger=None):
    self.fm = fm
    self.modelfn = modelfn
    self.classnames = classnames
    self.topfeaturescount = topfeaturescount
    self.language = language
    self.sentencescount = sentencescount

    if explainer is not None:
        self.explainer = explainer
    else:
        self.explainer = lime_text.LimeTextExplainer(class_names=self.classnames)

    if summarizer is not None:
        self.summarizer = summarizer
    else:
        self.summarizer = TextRankSummarizer(Stemmer(self.language))
        self.summarizer.stop_words = get_stop_words(self.language)

    if logger is not None:
        self.log = logger
    else:
        self.log = logging.getLogger()
def articleSummerization(self, article, length):
    parser = PlaintextParser.from_string(article, Tokenizer("english"))
    stemmer = Stemmer("english")
    summarizer = TextRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words("english")
    return ' '.join([str(i) for i in summarizer(parser.document, length)])
def summarize(self, corpus, length, algorithm):
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))
    if algorithm == "textrank":
        summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "kl":
        summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "random":
        summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
    else:
        # `raise NotImplemented(...)` was a bug: NotImplemented is not callable
        raise NotImplementedError("Summary algorithm is not available")
    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    summary = " ".join(
        [obj._text for obj in summarizer(parser.document, length)])
    return summary
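# The if/elif ladder above works but is verbose. A more compact alternative,
# sketched here under the assumption that the same sumy classes are already
# imported, is a dispatch table from algorithm name to summarizer class:

SUMMARIZERS = {
    "textrank": TextRankSummarizer,
    "lexrank": LexRankSummarizer,
    "luhn": LuhnSummarizer,
    "edmundson": EdmundsonSummarizer,
    "kl": KLSummarizer,
    "lsa": LsaSummarizer,
    "sumbasic": SumBasicSummarizer,
    "random": RandomSummarizer,
}

def make_summarizer(algorithm, language):
    # Look up the summarizer class, mirroring the error path above.
    try:
        cls = SUMMARIZERS[algorithm]
    except KeyError:
        raise NotImplementedError("Summary algorithm is not available")
    summarizer = cls(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    return summarizer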
def post(self):
    """
    Extract summary (key sentences) from text
    """
    # data = api.payload
    data = request.json
    text = data['text']
    num_sentences = data['num_sentences']
    num_sentences = num_sentences if isinstance(num_sentences, int) else DEFAULT_NUM_SENTENCES
    log.debug('num_sentences={}'.format(num_sentences))
    # log.debug('text: {}'.format(text))

    # TODO: check for minimum number of sentences in text?
    summary_sentences = []
    if text:
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = TextRankSummarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        summary = summarizer(parser.document, num_sentences)
        summary_sentences = [sentence._text for sentence in summary]

    log.debug('response body:\n{}'.format(summary_sentences))
    return summary_sentences, 200, {'Access-Control-Allow-Origin': '*'}
def sum_from_string(string, language="english", sentences_count=100):
    parser = PlaintextParser.from_string(string, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    # The original assigned `summarizer.stem_words`, a nonexistent attribute
    # that silently did nothing; the correct attribute is `stop_words`.
    summarizer.stop_words = get_stop_words(language)
    sentences = summarizer(parser.document, sentences_count)
    return sentences
def textrank(parser, sentence_count):
    summarizer_3 = TextRankSummarizer(Stemmer(language))
    summarizer_3.stop_words = get_stop_words(language)
    summary_3 = summarizer_3(parser.document, sentence_count)
    temp = ''
    for sentence in summary_3:
        temp = temp + str(sentence)
    return temp
def textrank_summarizer(text, stemmer, language, sentences_count):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    # Renamed from `summarizer_luhn`: this is TextRank, not Luhn.
    summarizer = TextRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    sentences = []
    for sentence in summarizer(parser.document, sentences_count):
        sentences.append(str(sentence))
    return "\n".join(sentences)
def __init__(self, text):
    # `self` and `text` were missing from the original signature, and
    # `__init__` cannot return a value, so the results are stored on the
    # instance instead.
    LANGUAGE = "english"
    SENTENCES_COUNT = 1

    stemmer = Stemmer(LANGUAGE)

    lsaSummarizer = Lsa(stemmer)
    lsaSummarizer.stop_words = get_stop_words(LANGUAGE)
    luhnSummarizer = Luhn(stemmer)
    luhnSummarizer.stop_words = get_stop_words(LANGUAGE)
    lexrankSummarizer = LexRank(stemmer)
    lexrankSummarizer.stop_words = get_stop_words(LANGUAGE)
    textrankSummarizer = TxtRank(stemmer)
    textrankSummarizer.stop_words = get_stop_words(LANGUAGE)
    sumbasicSummarizer = SumBasic(stemmer)
    sumbasicSummarizer.stop_words = get_stop_words(LANGUAGE)
    klSummarizer = KL(stemmer)
    klSummarizer.stop_words = get_stop_words(LANGUAGE)
    # EdmundsonSummarizer is skipped: it would need bonus/stigma words configured.

    parser = HtmlParser.from_string(text, 0, Tokenizer(LANGUAGE))

    # Collect a one-sentence summary from each algorithm.
    allvariations = []
    for summarizer in (lsaSummarizer, luhnSummarizer, lexrankSummarizer,
                       textrankSummarizer, sumbasicSummarizer, klSummarizer):
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(str(sentence))
            allvariations.append(sentence)
    self.allvariations = allvariations
def test_two_sentences():
    document = build_document(("I am that 1. sentence", "And I am 2. winning prize"))
    summarizer = TextRankSummarizer()
    summarizer.stop_words = ("I", "am", "and", "that",)

    returned = summarizer(document, 10)

    assert len(returned) == 2
    assert to_unicode(returned[0]) == "I am that 1. sentence"
    assert to_unicode(returned[1]) == "And I am 2. winning prize"
def test_two_sentences(self):
    document = build_document(("I am that 1. sentence", "And I am 2. winning prize"))
    summarizer = TextRankSummarizer()
    summarizer.stop_words = ("I", "am", "and", "that",)

    returned = summarizer(document, 10)

    self.assertEqual(len(returned), 2)
    self.assertEqual(to_unicode(returned[0]), "I am that 1. sentence")
    self.assertEqual(to_unicode(returned[1]), "And I am 2. winning prize")
def sumy_tr_summarizer(docx):
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    # The original instantiated TextRankSummarizer() without a stemmer and
    # immediately discarded it; only the stemmed instance is kept here.
    tr_summarizer = TextRankSummarizer(Stemmer("english"))
    tr_summarizer.stop_words = get_stop_words("english")
    # Summarize the document with 2 sentences
    summary = tr_summarizer(parser.document, 2)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result
def summarize(corpus, length, algorithm):
    summarizer = None
    summary = ("No compatible summarizer was selected, please use one of these: "
               "textrank, lexrank, luhn, edmundson*, kl, lsa, sumbasic, random "
               "(* doesn't work yet)")
    algorithm = algorithm.lower()
    try:
        parser = PlaintextParser.from_string(corpus, Tokenizer(LANGUAGE))
        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(LANGUAGE))

        if summarizer:
            summarizer.stop_words = get_stop_words(LANGUAGE)
            summary = " ".join([obj._text for obj in summarizer(parser.document, length)])
        return summary
    except Exception as e:
        return str(e)
def test_stop_words_correctly_removed(self):
    summarizer = TextRankSummarizer()
    summarizer.stop_words = ["stop", "Halt", "SHUT", "HmMm"]

    document = build_document(
        ("stop halt shut hmmm", "Stop Halt Shut Hmmm",),
        ("StOp HaLt ShUt HmMm", "STOP HALT SHUT HMMM",),
        ("Some relevant sentence", "Some moRe releVant sentEnce",),
    )
    sentences = document.sentences

    expected = []
    returned = summarizer._to_words_set(sentences[0])
    self.assertEqual(expected, returned)
    returned = summarizer._to_words_set(sentences[1])
    self.assertEqual(expected, returned)
    returned = summarizer._to_words_set(sentences[2])
    self.assertEqual(expected, returned)
    returned = summarizer._to_words_set(sentences[3])
    self.assertEqual(expected, returned)

    expected = ["some", "relevant", "sentence"]
    returned = summarizer._to_words_set(sentences[4])
    self.assertEqual(expected, returned)

    expected = ["some", "more", "relevant", "sentence"]
    returned = summarizer._to_words_set(sentences[5])
    self.assertEqual(expected, returned)
def textSummary(data, SENTENCES_COUNT):
    LANGUAGE = "english"
    parser = PlaintextParser.from_string(data, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    x = ''
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        x += ' {}'.format(str(sentence))
    return x
def summarize(url, sent_count=10):
    """Automatic text summarizer

    https://pypi.python.org/pypi/sumy
    """
    lang = "english"
    parser = HtmlParser.from_url(url, Tokenizer(lang))
    stemmer = Stemmer(lang)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(lang)
    summary = [str(sent) for sent in summarizer(parser.document, sent_count)]
    return summary
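# Hypothetical usage of the function above; the URL is a placeholder and the
# sumy imports (HtmlParser, Tokenizer, Stemmer, Summarizer, get_stop_words)
# are assumed to be in scope.
for sent in summarize("https://en.wikipedia.org/wiki/Automatic_summarization", sent_count=3):
    print(sent)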
def test_three_sentences_but_second_winner(self):
    document = build_document([
        "I am that 1. sentence",
        "And I am 2. sentence - winning sentence",
        "And I am 3. sentence - winner is my 2nd name",
    ])
    summarizer = TextRankSummarizer()
    summarizer.stop_words = ["I", "am", "and", "that"]

    returned = summarizer(document, 1)

    self.assertEqual(len(returned), 1)
    self.assertEqual(to_unicode(returned[0]), "And I am 2. sentence - winning sentence")
def test_sentences_rating():
    document = build_document([
        "a c e g",
        "a b c d e f g",
        "b d f",
    ])
    summarizer = TextRankSummarizer()
    summarizer.stop_words = ["I", "am", "and", "that"]

    ratings = summarizer.rate_sentences(document)

    assert len(ratings) == 3
    assert pytest.approx(sum(ratings.values())) == 1
def summarize(text):
    parser = PlaintextParser.from_string(
        text.decode('ascii', errors='ignore'), Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = [str(s) for s in summarizer(parser.document, sentences_count=n_sentences)]
    return ' '.join(sentences)
def textrankReferenceSummary(path):
    sentencesList = []
    parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = TextRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        sentencesList.append(sentence._text)
    return sentencesList
def sumy(text, LANGUAGE='english', COUNT=2):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    summary_text = ""
    for sentence in summarizer(parser.document, COUNT):
        summary_text = summary_text + " " + str(sentence)
    summary_text = summary_text.strip()
    return summary_text
def test_sentences_rating(self):
    document = build_document([
        "a c e g",
        "a b c d e f g",
        "b d f",
    ])
    summarizer = TextRankSummarizer()
    summarizer.stop_words = ["I", "am", "and", "that"]

    ratings = summarizer.rate_sentences(document)

    self.assertEqual(len(ratings), 3)
    self.assertTrue(ratings[document.sentences[1]] > ratings[document.sentences[0]])
    self.assertTrue(ratings[document.sentences[0]] > ratings[document.sentences[2]])
def run_summarizer(parser, sentences, language='english'):
    """
    :params parser: Parser for the selected document type.
    :params sentences: Maximum number of sentences in the summary.
    :returns summary: Summarized page.
    """
    summarizer = Summarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    return [str(sentence) for sentence in summarizer(parser.document, sentences)]
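# Hypothetical usage sketch: build a sumy parser and hand it to run_summarizer.
# The imports assume `Summarizer` above also comes from sumy.
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

parser = PlaintextParser.from_string("Some long text to condense...", Tokenizer('english'))
for line in run_summarizer(parser, sentences=2):
    print(line)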
def Summarize_Content_Custom(Audio_Text, sentences_count, Summarize_Method):
    # Summarize to roughly half the original number of sentences, then print
    # the most common `sentences_count` of them. The count is cast to int:
    # a float count may not be handled by all sumy versions.
    actual_sentences_count = int(len(sent_tokenize(Audio_Text)) * 0.5)
    parser = PlaintextParser.from_string(Audio_Text, Tokenizer("english"))
    stemmer = Stemmer("english")

    if Summarize_Method == "Gensim":
        # ratio defines the length of the summary as a proportion of the text
        temp = summarize(Audio_Text, ratio=0.5)
        sen = Counter(sent_tokenize(temp))
        for value in sen.most_common(sentences_count):
            print(value[0])
    elif Summarize_Method == "LexRankSummarizer":
        # LexRank: graph-based ranking built on sentence similarity
        summarizer_Lex = LexRankSummarizer(stemmer)
        summarizer_Lex.stop_words = get_stop_words("english")
        summary = summarizer_Lex(parser.document, actual_sentences_count)
        sen = Counter(summary)
        for value in sen.most_common(sentences_count):
            print(value[0])
    elif Summarize_Method == "LuhnSummarizer":
        # Luhn: ranks sentences by frequency of the most important words
        summarizer_luhn = LuhnSummarizer(stemmer)
        summarizer_luhn.stop_words = get_stop_words("english")
        summary_1 = summarizer_luhn(parser.document, actual_sentences_count)
        sen = Counter(summary_1)
        for value in sen.most_common(sentences_count):
            print(value[0])
    elif Summarize_Method == "LsaSummarizer":
        # LSA: latent semantic analysis over the term-sentence matrix
        summarizer_lsa2 = LsaSummarizer(stemmer)
        summarizer_lsa2.stop_words = get_stop_words("english")
        summary = summarizer_lsa2(parser.document, actual_sentences_count)
        sen = Counter(summary)
        for value in sen.most_common(sentences_count):
            print(value[0])
    elif Summarize_Method == "TextRankSummarizer":
        # TextRank: PageRank-style ranking over the sentence graph
        # (the original comment mislabeled this branch as LSA)
        summarizer_text = TextRankSummarizer(stemmer)
        summarizer_text.stop_words = get_stop_words("english")
        summary = summarizer_text(parser.document, actual_sentences_count)
        sen = Counter(summary)
        for value in sen.most_common(sentences_count):
            print(value[0])
def TextRank(rsc_file, dst_file, count):
    language = "chinese"
    parser = PlaintextParser.from_file(rsc_file, Tokenizer(language), encoding='utf-8')
    stemmer = Stemmer(language)  # stemmer for the target language
    # TextRank algorithm (the original comment mislabeled it as Luhn)
    summarizer = TextRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    with open(dst_file, 'w', encoding='utf-8') as f:
        for sentence in summarizer(parser.document, count):
            f.write(str(sentence))
            f.write('\n')
            print(sentence)
def summarizer(parser, sentences, language='english'):
    """
    :params parser: Parser for the selected document type.
    :params sentences: Maximum number of sentences in the summary.
    :returns summary: Summarized page.
    """
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    output = [str(sentence) for sentence in summarizer(parser.document, sentences)]
    return ' '.join(output)
def test_rating_with_zero_or_single_words_in_sentences(sentences, expected_ratings):
    """
    Edge-case test for sentences with only one word, or even zero words.
    It ensures the rating logic does not break when such a case is encountered.
    """
    document = build_document(sentences)
    summarizer = TextRankSummarizer()

    ratings = summarizer.rate_sentences(document)

    assert ratings == {
        document.sentences[0]: pytest.approx(expected_ratings[0]),
        document.sentences[1]: pytest.approx(expected_ratings[1]),
    }
def TextRankSummary(document, sentences):
    parser = PlaintextParser.from_string(document, Tokenizer("english"))
    summarizer = TextRankSummarizer()
    summary = summarizer(parser.document, sentences)
    return summary
def test_sentences_rating():
    document = build_document([
        "a c e g",
        "a b c d e f g",
        "b d f",
    ])
    summarizer = TextRankSummarizer()

    ratings = summarizer.rate_sentences(document)

    assert ratings == {
        document.sentences[0]: pytest.approx(0.29714368215098025),
        document.sentences[1]: pytest.approx(0.42683373199392705),
        document.sentences[2]: pytest.approx(0.2760223553913001),
    }
    assert pytest.approx(sum(ratings.values())) == 1
def find_relevant_quote(book_id, chapter, num_sentences=1, technique='luhn'):
    """
    Create an extractive summary for a chapter of the book.

    Parameters:
        book_id: (str) the book identifier
        chapter: the chapter number to summarize
        num_sentences: how many sentences to extract
        technique: which summarization algorithm to use (defaults to Luhn)

    Returns:
        sentences: the extracted sentences
    """
    chapter_filename = get_data_filename(book_id, 'book_chapters', chapter)
    parser = PlaintextParser.from_file(chapter_filename, Tokenizer("english"))
    if technique == 'lsa':
        summarizer = LsaSummarizer()
    elif technique == 'lexrank':
        summarizer = LexRankSummarizer()
    elif technique == 'textrank':
        summarizer = TextRankSummarizer()
    elif technique == 'kl':
        summarizer = KLSummarizer()
    elif technique == 'random':
        summarizer = RandomSummarizer()
    elif technique == 'reduction':
        summarizer = ReductionSummarizer()
    elif technique == 'sumbasic':
        summarizer = SumBasicSummarizer()
    else:
        summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return summary
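# Hypothetical usage: get_data_filename and the data layout behind it are
# project-specific, so the book identifier below is a placeholder.
for sentence in find_relevant_quote("moby_dick", chapter=3, num_sentences=2, technique='textrank'):
    print(sentence)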
def _summ_score(storyName, highlightName):
    parser = PlaintextParser.from_file(storyName, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    geneSen = summarizer(parser.document, SENTENCES_COUNT)
    refSen = PlaintextParser.from_file(highlightName, Tokenizer(LANGUAGE)).document.sentences
    try:
        return evaluate(geneSen, refSen)
    except Exception as e:
        # Python 2 print statements converted to the print() function;
        # bare `raise` preserves the original traceback.
        print(storyName)
        print(e)
        raise
def summarize():
    final = []

    # Validate the `url` query parameter
    url = request.args.get('url')
    if not url:
        return abort(400)

    # Validate the `num` query parameter and clamp it into range
    try:
        num = int(request.args.get('num'))
        num = max(MIN_SENTENCES_COUNT, min(num, MAX_SENTENCES_COUNT))
    except (ValueError, TypeError):
        num = MIN_SENTENCES_COUNT

    # Handle the case where `url` is not a valid URL
    try:
        parser = Parser.from_url(url, Tokenizer(LANGUAGE))
    except (requests.exceptions.MissingSchema, requests.exceptions.HTTPError):
        try:
            parser = Parser.from_url("http://" + url, Tokenizer(LANGUAGE))
        except Exception:
            return "URL is not valid.", 403

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # unidecode converts unicode characters to their closest ASCII equivalents
    for sentence in summarizer(parser.document, num):
        final.append(unidecode(str(sentence)))

    return json.dumps({"title": parser.get_title(), "content": final})
def summarize_with_info(self, corpus, length, algorithm):
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))
    if algorithm == "textrank":
        summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
        summarizer.bonus_words = parser.significant_words
        summarizer.stigma_words = parser.stigma_words
    elif algorithm == "kl":
        summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "random":
        summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
    else:
        # `raise NotImplemented(...)` was a bug: NotImplemented is not callable
        raise NotImplementedError("Summary algorithm is not available")
    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    return summarizer(parser.document, length)
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

import sys

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"

if __name__ == "__main__":
    # Command-line arguments are read inside the main guard so the module
    # can also be imported without side effects.
    text_file = sys.argv[1]
    SENTENCES_COUNT = int(sys.argv[2])

    parser = PlaintextParser.from_file(text_file, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = TextRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
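# Hypothetical invocation (the script filename is assumed): the script takes
# the input file and the number of summary sentences as positional arguments,
# for example:
#
#     python textrank_file.py document.txt 3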