def sumbasic(parser, sentence_count):
    summarizer = SumBasicSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    # Use the requested sentence count rather than a hard-coded 5.
    summary = summarizer(parser.document, sentence_count)
    return "".join(str(sentence) for sentence in summary)
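# The snippets in this collection assume the usual sumy imports at module
# level. A minimal shared preamble (an assumption, not part of any original
# snippet) would be:
from sumy.parsers.plaintext import PlaintextParser
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.utils import get_stop_words

language = "english"  # module-level default referenced by sumbasic() above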
def summarize_all(text):
    # Originally declared as __init__, but __init__ cannot return a value,
    # so this is a plain function taking the source text as a parameter.
    LANGUAGE = "english"
    SENTENCES_COUNT = 1
    stemmer = Stemmer(LANGUAGE)

    # One summarizer per algorithm, all sharing the same stop-word list.
    summarizers = [Lsa(stemmer), Luhn(stemmer), LexRank(stemmer),
                   TxtRank(stemmer), SumBasic(stemmer), KL(stemmer)]
    # edmundsonSummarizer.bonus_words = get_bonus_words
    for summarizer in summarizers:
        summarizer.stop_words = get_stop_words(LANGUAGE)

    parser = HtmlParser.from_string(text, None, Tokenizer(LANGUAGE))
    allvariations = []
    for summarizer in summarizers:
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(str(sentence))
            allvariations.append(sentence)
    return allvariations
def sumbasic_summarizer(text, stemmer, language, sentences_count):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    sentences = [str(sentence)
                 for sentence in summarizer(parser.document, sentences_count)]
    return "\n".join(sentences)
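# Example call of sumbasic_summarizer() above; "document.txt" is an
# illustrative file name, not from the original snippet:
from sumy.nlp.stemmers import Stemmer

with open("document.txt") as f:
    print(sumbasic_summarizer(f.read(), Stemmer("english"), "english", 3))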
def sumbasicReferenceSummary(path):
    sentencesList = []
    parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        sentencesList.append(sentence._text)
    return sentencesList
def SumBasic(rsc_file, dst_file, count):
    language = "chinese"
    parser = PlaintextParser.from_file(rsc_file, Tokenizer(language), encoding='utf-8')
    stemmer = Stemmer(language)  # language-specific stemmer
    summarizer = SumBasicSummarizer(stemmer)  # SumBasic algorithm
    summarizer.stop_words = get_stop_words(language)
    with open(dst_file, 'w', encoding='utf-8') as f:
        for sentence in summarizer(parser.document, count):
            f.write(str(sentence))
            f.write('\n')
            print(sentence)
def summary(url):
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    res = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        res.append(sentence._text)
    return res
def summarize(test_path, decoder_path):
    summarizers = {
        'lexrank': LexRankSummarizer(),
        'lsa': LsaSummarizer(),
        'sumbasic': SumBasicSummarizer(),
        'textrank': TextRankSummarizer()
    }
    for each in ['lexrank', 'lsa', 'sumbasic', 'textrank']:
        print("###################### %s #######################" % each)
        files = list(iter_files(test_path))
        dec_dir = join(decoder_path, each, 'output')
        if not os.path.exists(dec_dir):
            os.makedirs(dec_dir)
        summarizer = summarizers[each]
        for file in tqdm(files):
            name, _ = os.path.splitext(os.path.basename(file))
            save_path = join(dec_dir, name + '.dec')
            with open(file) as f:
                article = ' '.join(json.load(f)['article'])
            parser = PlaintextParser.from_string(article, Tokenizer('english'))
            output = summarizer(parser.document, sentences_count=5)
            output = [sent._text for sent in output]
            with open(save_path, 'w') as f:
                f.write('\n'.join(output))
def summarize_url(url, summarizer):
    # e.g. url = "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html"
    print('Summarizing', url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files:
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    if summarizer == 'luhn':
        summarizer = LuhnSummarizer(stemmer)
    elif summarizer == 'edmundson':
        summarizer = ESummarizer(stemmer)
    elif summarizer == 'lsa':
        summarizer = LsaSummarizer(stemmer)
    elif summarizer == 'lex':
        summarizer = LexSummarizer(stemmer)
    elif summarizer == 'text':
        summarizer = TextSummarizer(stemmer)
    elif summarizer == 'sb':
        summarizer = SumBasicSummarizer(stemmer)
    else:
        summarizer = KLSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(summarizer)
    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))
    return sentences
def find_relevant_quote(book_id, chapter, num_sentences=1, technique='luhn'):
    """Create an extractive summary for a chapter of the book.

    Parameters:
        book_id: (str) the book identifier
        chapter: the chapter number to summarize
        num_sentences: how many sentences to extract
        technique: which sumy summarizer to use

    Returns:
        summary: the extracted sentences
    """
    chapter_filename = get_data_filename(book_id, 'book_chapters', chapter)
    parser = PlaintextParser.from_file(chapter_filename, Tokenizer("english"))
    if technique == 'lsa':
        summarizer = LsaSummarizer()
    elif technique == 'lexrank':
        summarizer = LexRankSummarizer()
    elif technique == 'textrank':
        summarizer = TextRankSummarizer()
    elif technique == 'kl':
        summarizer = KLSummarizer()
    elif technique == 'random':
        summarizer = RandomSummarizer()
    elif technique == 'reduction':
        summarizer = ReductionSummarizer()
    elif technique == 'sumbasic':
        summarizer = SumBasicSummarizer()
    else:
        summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return summary
def summarize(self, corpus, length, algorithm):
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))
    if algorithm == "textrank":
        summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "kl":
        summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "random":
        summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
    else:
        # NotImplemented is a constant, not an exception type; raise the error.
        raise NotImplementedError("Summary algorithm is not available")
    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    summary = " ".join(
        [obj._text for obj in summarizer(parser.document, length)])
    return summary
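# A minimal, self-contained sketch of the dispatch pattern used by
# summarize() above, written as a dictionary lookup instead of an if/elif
# chain. The function name, the two-algorithm table, and the sample defaults
# are illustrative assumptions, not part of the original snippet.
from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.utils import get_stop_words

LANGUAGE = "english"
ALGORITHMS = {"sumbasic": SumBasicSummarizer, "lexrank": LexRankSummarizer}

def summarize_text(corpus, length=2, algorithm="sumbasic"):
    try:
        summarizer = ALGORITHMS[algorithm](Stemmer(LANGUAGE))
    except KeyError:
        raise NotImplementedError("Summary algorithm is not available")
    summarizer.stop_words = get_stop_words(LANGUAGE)
    parser = PlaintextParser.from_string(corpus, Tokenizer(LANGUAGE))
    return " ".join(str(s) for s in summarizer(parser.document, length))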
def SumBasicSummary(document, sentences):
    parser = PlaintextParser.from_string(document, Tokenizer("english"))
    summarizer = SumBasicSummarizer()
    summary = summarizer(parser.document, sentences)
    return summary
def summarize_with_info(self, corpus, length, algorithm):
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))
    if algorithm == "textrank":
        summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
        summarizer.bonus_words = parser.significant_words
        summarizer.stigma_words = parser.stigma_words
    elif algorithm == "kl":
        summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "random":
        summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
    else:
        raise NotImplementedError("Summary algorithm is not available")
    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    return summarizer(parser.document, length)
def SumBasicSummary(document, sentences):
    parser = PlaintextParser.from_string(document, Tokenizer("english"))
    summarizer = SumBasicSummarizer()
    summary = summarizer(parser.document, sentences)
    return [str(sentence) for sentence in summary]
def __init__(self):
    """
    The Oracle summariser is not an actual, usable summariser. It extracts
    the best sentences possible from the paper by comparing them to the gold
    summaries, and so represents the high-water mark in ROUGE score that any
    summariser could achieve.
    """
    self.summary_length = 10
    self.summariser = SumBasicSummarizer()
def generate_benchmark_summary(filename, num_summary):
    parser = PlaintextParser.from_file(
        'data/text_summary/' + filename + '.txt', Tokenizer("english"))

    print('=========== Basic Sum ============')
    Basic_Sum_sentences = []
    summarizer = SumBasicSummarizer()
    # Summarize the document with num_summary sentences.
    summary = summarizer(parser.document, num_summary)
    for sentence in summary:
        print(sentence)
        Basic_Sum_sentences.append(str(sentence))

    print('=========== LSA ============')
    LSA_sentences = []
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, num_summary)
    for sentence in summary:
        print(sentence)
        LSA_sentences.append(str(sentence))

    print('=========== LexRank ============')
    LexRank_sentences = []
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, num_summary)
    for sentence in summary:
        print(sentence)
        LexRank_sentences.append(str(sentence))

    print('=========== KL Divergence ============')
    KL_sentences = []
    summarizer = KLSummarizer()
    summary = summarizer(parser.document, num_summary)
    for sentence in summary:
        print(sentence)
        KL_sentences.append(str(sentence))

    print('=========== Luhn ============')
    Luhn_sentences = []
    summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, num_summary)
    for sentence in summary:
        print(sentence)
        Luhn_sentences.append(str(sentence))

    return Basic_Sum_sentences, LSA_sentences, LexRank_sentences, KL_sentences, Luhn_sentences
def __init__(self, num_sentence, trim_len=5000):
    self.num_sentence = num_sentence
    self.trim_len = trim_len
    self.tokenizer = Tokenizer('english')
    self.summarizers = [
        LexRankSummarizer(),
        LsaSummarizer(),
        SumBasicSummarizer()
    ]
    self.num_summarizers = len(self.summarizers)
def basic_sum(file, test_ratio=0.10, israndom=True):
    # Extract test files.
    file_lines = file.read().splitlines()
    nsamples = len(file_lines)
    ntests = int(nsamples * test_ratio)
    if israndom:
        seq = np.random.permutation(nsamples)
    else:
        seq = np.arange(nsamples)

    # Summarizer.
    stemmer = Stemmer(_language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(_language)

    # ROUGE scorer.
    rouge = Rouge155()
    scores = defaultdict(list)
    for i in range(ntests):
        line = file_lines[seq[i]]
        sample = json.loads(line)
        content = sample['content']
        title = sample['title']
        ref_text = {'A': title}
        doc = ' '.join(content)
        parser = PlaintextParser.from_string(doc, Tokenizer(_language))
        sum_sents = summarizer(parser.document, _sent_count)
        if len(sum_sents) != _sent_count:
            continue
        summary = str(sum_sents[0])
        score = rouge.score_summary(summary, ref_text)
        for k, v in score.items():
            scores[k].append(v)
        print('{} / {} processed.'.format(i, ntests), end='\r')

    result = {}
    for k, v in scores.items():
        result[k] = mean(v)
    return result
def evaluate_summary(file_name, input_dir, sent_count, lingua_franca_summary, show_summaries):
    method_name = inspect.stack()[0][3]
    try:
        process_logger.debug("in " + method_name + " method")
        with open(input_dir + file_name + ".model", "r") as file_model_summary:
            model_summary = file_model_summary.read()
        rouge_scores_dict = {}
        rouge_scores = rouge_evaluation(lingua_franca_summary, model_summary)
        rouge_scores_dict[">>LINGUA FRANCA"] = rouge_scores
        with open("Test System Summary/" + file_name + "-" + "LINGUA FRANCA" + ".txt", "w") as file_summary:
            file_summary.write(lingua_franca_summary)

        LANGUAGE = "english"
        parser = PlaintextParser.from_file(input_dir + file_name + ".txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        lsa_summarizer = LsaSummarizer(stemmer)
        rouge_scores_dict["LSA"] = sumy_summarizers("LSA", lsa_summarizer, parser.document, sent_count, model_summary, show_summaries, file_name)

        lex_summarizer = LexRankSummarizer(stemmer)
        rouge_scores_dict["LEX RANK"] = sumy_summarizers("LEX RANK", lex_summarizer, parser.document, sent_count, model_summary, show_summaries, file_name)

        luhn_summarizer = LuhnSummarizer(stemmer)
        rouge_scores_dict["LUHN"] = sumy_summarizers("LUHN", luhn_summarizer, parser.document, sent_count, model_summary, show_summaries, file_name)

        text_rank_summarizer = TextRankSummarizer(stemmer)
        rouge_scores_dict["TEXT RANK"] = sumy_summarizers("TEXT RANK", text_rank_summarizer, parser.document, sent_count, model_summary, show_summaries, file_name)

        sum_basic_summarizer = SumBasicSummarizer(stemmer)
        rouge_scores_dict["SUM BASIC"] = sumy_summarizers("SUM BASIC", sum_basic_summarizer, parser.document, sent_count, model_summary, show_summaries, file_name)

        kl_summarizer = KLSummarizer(stemmer)
        rouge_scores_dict["KL SUM"] = sumy_summarizers("KL SUM", kl_summarizer, parser.document, sent_count, model_summary, show_summaries, file_name)

        # score_reader(rouge_scores_dict)
        df_rouge, summarizer_list = process_rouge_scores(rouge_scores_dict)
        return df_rouge, summarizer_list
    except Exception as Ex:
        error_logger.error("Exception occurred in " + method_name + "| Exception:" + str(Ex))
        return None
def sum_basic(nb_sentences):
    """
    SumBasic function for automatic summarization.

    INPUT:
        nb_sentences: the number of sentences for the final event summaries.

    OUTPUT:
        "Done" once the method has finished; the result is available in the
        folder called 'SumBasicResult'.
    """
    # Create the final folder that will contain the event summaries.
    try:
        path = "SumBasicResult/" + str(nb_sentences)
        os.makedirs(path)
    except OSError:
        print("Folder SumBasicResult already created!")

    # Construct the paths to the post-processed events collection.
    event_collection = glob.glob1("FinalCollection", "*.txt")
    event_collection = ["FinalCollection/" + event for event in event_collection]

    for event in event_collection:
        # Plaintext parser for the event text file.
        parser = PlaintextParser.from_file(event, Tokenizer("english"))
        # Get the event id.
        var = os.path.splitext(event)[0]
        id_event = var.replace("FinalCollection/", "")
        # Get the event's total number of lines (sentences).
        with open("FinalCollection/" + id_event + ".txt", encoding="utf-8") as file:
            nb_lines = len(file.readlines())
        # Summarize the document with SumBasic, keeping at most nb_sentences sentences.
        summarizer = SumBasicSummarizer()
        summary = summarizer(parser.document, min(nb_sentences, nb_lines))
        with open('SumBasicResult/' + str(nb_sentences) + '/' + id_event + '.txt', "wb") as event_summary:
            for sentence in summary:
                event_summary.write((str(sentence) + "\r\n").encode('utf-8', 'ignore'))
    return "Done"
def choose_summarizer(self, summarizer_string: str):
    logging.debug("Changing summarizer to: {}".format(summarizer_string))
    if summarizer_string == "LexRank":
        self.summarizer = LexRankSummarizer(stemmer)
    elif summarizer_string == "TextRank":
        self.summarizer = TextRankSummarizer(stemmer)
    elif summarizer_string == "Luhn":
        self.summarizer = LuhnSummarizer(stemmer)
    elif summarizer_string == "LSA":
        self.summarizer = LsaSummarizer(stemmer)
    elif summarizer_string == "SumBasic":
        self.summarizer = SumBasicSummarizer(stemmer)
    # Allow the summarizer to take stop words into account.
    self.summarizer.stop_words = get_stop_words(LANGUAGE)
def get_summarizers(self, names):
    """Retrieve sumy summarizer algorithms.

    Parameters:
        names (list): list of summarizer algorithm names

    Returns:
        dict: summarizers keyed by name
    """
    summarizers = {}
    for name in names:
        if name == "random":
            from sumy.summarizers.random import RandomSummarizer
            summarizers["random"] = RandomSummarizer(null_stemmer)
        elif name == "luhn":
            from sumy.summarizers.luhn import LuhnSummarizer
            summarizers["luhn"] = LuhnSummarizer(stemmer=null_stemmer)
        elif name == "lsa":
            from sumy.summarizers.lsa import LsaSummarizer
            summarizers["lsa"] = LsaSummarizer(stemmer=null_stemmer)
        elif name == "lexrank":
            from sumy.summarizers.lex_rank import LexRankSummarizer
            summarizers["lexrank"] = LexRankSummarizer(null_stemmer)
        elif name == "textrank":
            from sumy.summarizers.text_rank import TextRankSummarizer
            summarizers["textrank"] = TextRankSummarizer(null_stemmer)
        elif name == "sumbasic":
            from sumy.summarizers.sum_basic import SumBasicSummarizer
            summarizers["sumbasic"] = SumBasicSummarizer(null_stemmer)
        elif name == "kl-sum":
            from sumy.summarizers.kl import KLSummarizer
            summarizers["kl-sum"] = KLSummarizer(null_stemmer)
        elif name == "reduction":
            from sumy.summarizers.reduction import ReductionSummarizer
            summarizers["reduction"] = ReductionSummarizer(null_stemmer)
    for _, summarizer in summarizers.items():
        summarizer.stop_words = frozenset(
            self.stop_words._get_stop_words(custom_stop_words=[]))
    return summarizers
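# A standalone sketch of the lazy-import pattern used by get_summarizers()
# above, trimmed to two algorithms so it runs without the host class. The
# function name, the algorithm subset, and the sample text are illustrative
# assumptions; null_stemmer is sumy's identity stemmer, as in the original.
from sumy.nlp.stemmers import null_stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.utils import get_stop_words

def build_summarizers(names):
    summarizers = {}
    for name in names:
        if name == "sumbasic":
            from sumy.summarizers.sum_basic import SumBasicSummarizer
            summarizers["sumbasic"] = SumBasicSummarizer(null_stemmer)
        elif name == "kl-sum":
            from sumy.summarizers.kl import KLSummarizer
            summarizers["kl-sum"] = KLSummarizer(null_stemmer)
    for summarizer in summarizers.values():
        summarizer.stop_words = get_stop_words("english")
    return summarizers

parser = PlaintextParser.from_string(
    "SumBasic favors frequent words. It picks high-frequency sentences.",
    Tokenizer("english"))
for name, summarizer in build_summarizers(["sumbasic", "kl-sum"]).items():
    print(name, [str(s) for s in summarizer(parser.document, 1)])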
def __init__(self, method=None, nltk_directory=None, language=None):
    if language:
        logger.info("Setting language to " + language)
        LANGUAGE = language
    else:
        LANGUAGE = "english"

    # Set the location of the nltk data directory for tokenizers, etc.
    if nltk_directory:
        nltk.data.path.append(nltk_directory)
        logger.info(nltk.data.path)

    try:
        self.stemmer = Stemmer(LANGUAGE)
    except Exception:
        logger.exception("Error loading nltk stemmer")
        raise Exception("Error loading nltk stemmer")

    self.summarizer = Summarizer(self.stemmer)  # default
    if method:
        if method == 'luhn':
            logger.info("Using the Luhn summarizer!")
            self.summarizer = LuhnSummarizer(self.stemmer)
        elif method == 'edmundson':
            logger.info("Using the Edmundson summarizer!")
            self.summarizer = EdmundsonSummarizer(self.stemmer)
        elif method == 'lsa':
            logger.info("Using the LSA summarizer!")
            self.summarizer = LsaSummarizer(self.stemmer)
        elif method == 'text_rank':
            logger.info("Using the Text Rank summarizer!")
            self.summarizer = TextRankSummarizer(self.stemmer)
        elif method == 'sum_basic':
            logger.info("Using the Sum Basic summarizer!")
            self.summarizer = SumBasicSummarizer(self.stemmer)
        elif method == 'kl':
            logger.info("Using the KL summarizer!")
            self.summarizer = KLSummarizer(self.stemmer)
        elif method == 'lex_rank':
            logger.info("Using the LexRank summarizer!")
            self.summarizer = LexRankSummarizer(self.stemmer)
    self.summarizer.stop_words = get_stop_words(LANGUAGE)
def summary_benchmarks(sentences_string):
    """
    :param sentences_string: all sentences as one string, already tokenized
    """
    parser = PlaintextParser.from_string(sentences_string, Tokenizer("english"))

    print('=========== Basic Sum ============')
    summarizer = SumBasicSummarizer()
    # Summarize the document with 3 sentences.
    summary = summarizer(parser.document, 3)
    for sentence in summary:
        print(sentence)

    print('=========== LSA ============')
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, 3)
    for sentence in summary:
        print(sentence)

    print('=========== LexRank ============')
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, 3)
    for sentence in summary:
        print(sentence)

    print('=========== KL Divergence ============')
    summarizer = KLSummarizer()
    summary = summarizer(parser.document, 3)
    for sentence in summary:
        print(sentence)

    print('=========== Luhn ============')
    summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, 3)
    for sentence in summary:
        print(sentence)
def __init__(self, name):
    self.stemmer = Stemmer('english')
    self.name = name
    if name == "TextRankSummarizer":
        self.summarizer = TextRankSummarizer(self.stemmer)
    elif name == "LsaSummarizer":
        self.summarizer = LsaSummarizer(self.stemmer)
    elif name == "LuhnSummarizer":
        self.summarizer = LuhnSummarizer(self.stemmer)
    elif name == "LexRankSummarizer":
        setattr(LexRankSummarizer, 'rate_sentences', rate_sentences)
        self.summarizer = LexRankSummarizer(self.stemmer)
    elif name == "SumBasicSummarizer":
        self.summarizer = SumBasicSummarizer(self.stemmer)
    elif name == "KLSummarizer":
        self.summarizer = KLSummarizer(self.stemmer)
    # summarizer = EdmundsonSummarizer(stemmer)
    self.summarizer.stop_words = get_stop_words('english')
def _build_summarizer(self, stop_words):
    summarizer = SumBasicSummarizer()
    summarizer.stop_words = stop_words
    return summarizer
def _build_summarizer(stop_words, stemmer=None):
    summarizer = SumBasicSummarizer() if stemmer is None else SumBasicSummarizer(stemmer)
    summarizer.stop_words = stop_words
    return summarizer
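# Example calls covering both branches of _build_summarizer() above; the
# English stop-word list and stemmer are illustrative choices:
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

plain = _build_summarizer(get_stop_words("english"))
stemmed = _build_summarizer(get_stop_words("english"), Stemmer("english"))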
import os

# Create a folder if it does not already exist.
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error: Creating directory. ' + directory)

LANGUAGE = "bangla"
SENTENCES_COUNT = 2

if __name__ == "__main__":
    createFolder('Dataset/NCTB/SumBasicSummary/')
    for i in range(1, 140):
        serial_no = str(i)
        path = "Dataset/NCTB/Source/" + serial_no + ".txt"
        parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        summary = ""
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            summary = summary + " " + str(sentence)
        with open('Dataset/NCTB/SumBasicSummary/' + serial_no + '.txt', 'w') as fi:
            fi.write(summary)
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

import sys

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = int(sys.argv[2])
text_file = sys.argv[1]

if __name__ == "__main__":
    parser = PlaintextParser.from_file(text_file, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
def run_SumBasic(stemmer, document, n):
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("SumBasic: {}".format(n))
    res = summarizer(document, SENTENCES_COUNT)
    # Truncate the first extracted sentence to n words.
    return " ".join(str(res[0]).split()[:n])
from sumy.nlp.tokenizers import Tokenizer
import sys

def leadSummariser(document, no_of_sents):
    for sent in document.sentences[:no_of_sents]:
        yield str(sent)

summarisers = {
    "lead": leadSummariser,
    "luhn": LuhnSummarizer(),
    "lsa": LsaSummarizer(),
    "lex_rank": LexRankSummarizer(),
    "text_rank": TextRankSummarizer(),
    "sum_basic": SumBasicSummarizer(),
    "kl": KLSummarizer()
}

tokenizer = Tokenizer("english")

def to_words(text):
    return text.split(" ")

def extractive(article, title=None):
    raw = article.replace(' <sb>', '').strip()
    parser = PlaintextParser.from_string(raw, tokenizer)
def __summarize(self, parser):
    summarizer = SumBasicSummarizer(Stemmer(self.__language))
    summarizer.stop_words = get_stop_words(self.__language)
    final_sentences = summarizer(parser.document, self.__sentences_count)
    return self.__join_sentences(final_sentences)