def Random(rsc_file, dst_file, count):
    language = "chinese"
    parser = PlaintextParser.from_file(rsc_file, Tokenizer(language), encoding='utf-8')
    stemmer = Stemmer(language)  # stemmer for the target language
    summarizer = RandomSummarizer(stemmer)  # random sentence selection algorithm
    summarizer.stop_words = get_stop_words(language)
    with open(dst_file, 'w', encoding='utf-8') as f:
        for sentence in summarizer(parser.document, count):
            f.write(str(sentence))
            f.write('\n')
            print(sentence)
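# A minimal, self-contained sketch of the same sumy pattern used throughout these
# snippets, run on an in-memory string instead of a file (assumption: sumy is
# installed; the sample text and sentence count are made up for illustration).
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.random import RandomSummarizer
from sumy.utils import get_stop_words

text = "First sentence. Second sentence. Third sentence. Fourth sentence."
parser = PlaintextParser.from_string(text, Tokenizer("english"))
summarizer = RandomSummarizer(Stemmer("english"))
summarizer.stop_words = get_stop_words("english")
for sentence in summarizer(parser.document, 2):  # pick 2 sentences at random
    print(sentence)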
def find_relevant_quote(book_id, chapter, num_sentences=1, technique='luhn'):
    """
    Create an extractive summary for a chapter of the book.

    Parameters:
        book_id: (str) the book identifier
        chapter: (int) the chapter number to summarize
        num_sentences: (int) how many sentences to extract
        technique: (str) the sumy summarization algorithm to use

    Returns:
        summary: the extracted sentences
    """
    chapter_filename = get_data_filename(book_id, 'book_chapters', chapter)
    parser = PlaintextParser.from_file(chapter_filename, Tokenizer("english"))
    if technique == 'lsa':
        summarizer = LsaSummarizer()
    elif technique == 'lexrank':
        summarizer = LexRankSummarizer()
    elif technique == 'textrank':
        summarizer = TextRankSummarizer()
    elif technique == 'kl':
        summarizer = KLSummarizer()
    elif technique == 'random':
        summarizer = RandomSummarizer()
    elif technique == 'reduction':
        summarizer = ReductionSummarizer()
    elif technique == 'sumbasic':
        summarizer = SumBasicSummarizer()
    else:
        summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return summary
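# Hypothetical call to the function above (the book identifier, chapter number and
# technique are made-up values; get_data_filename must resolve to an existing
# plain-text chapter file for this to run).
quote_sentences = find_relevant_quote("example_book", 3, num_sentences=2, technique="random")
for sentence in quote_sentences:
    print(sentence)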
def summarize_with_info(self, corpus, length, algorithm):
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))
    if algorithm == "textrank":
        summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
        summarizer.bonus_words = parser.significant_words
        summarizer.stigma_words = parser.stigma_words
    elif algorithm == "kl":
        summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "random":
        summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
    else:
        raise NotImplementedError("Summary algorithm is not available")
    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    return summarizer(parser.document, length)
def test_less_than_10_words_should_be_returned():
    """https://github.com/miso-belica/sumy/issues/159"""
    document = build_document_from_string("""
        # Heading one
        First sentence.
        Second sentence.
        Third sentence.

        # Heading two
        I like sentences but this one is really long.
        They are so wordy
        And have many many letters
        And are green in my editor
        But someone doesn't like them :(
    """)
    summarizer = RandomSummarizer()

    def count(max_words, sentence_infos):
        results = []
        words_count = 0
        for info in sentence_infos:
            words_count += len(info.sentence.words)
            if words_count > max_words:
                return results
            else:
                results.append(info)
        return results

    sentences = summarizer(document, partial(count, 10))

    assert 0 < sum(len(s.words) for s in sentences) <= 10
def summarize(self, corpus, length, algorithm):
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))
    if algorithm == "textrank":
        summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "kl":
        summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "random":
        summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
    else:
        raise NotImplementedError("Summary algorithm is not available")
    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    summary = " ".join(
        [obj._text for obj in summarizer(parser.document, length)])
    return summary
def test_less_sentences_than_requested():
    document = build_document_from_string("""
        This is only one sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 10)

    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "This is only one sentence."
def test_less_sentences_than_requested(self):
    document = build_document_from_string("""
        This is only one sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 10)

    self.assertEqual(len(sentences), 1)
    self.assertEqual(to_unicode(sentences[0]), "This is only one sentence.")
def test_sentences_in_right_order(self):
    document = build_document_from_string("""
        # Heading one
        First sentence.
        Second sentence.
        Third sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 4)

    self.assertEqual(len(sentences), 3)
    self.assertEqual(to_unicode(sentences[0]), "First sentence.")
    self.assertEqual(to_unicode(sentences[1]), "Second sentence.")
    self.assertEqual(to_unicode(sentences[2]), "Third sentence.")
def test_sentences_in_right_order():
    document = build_document_from_string("""
        # Heading one
        First sentence.
        Second sentence.
        Third sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 4)

    assert len(sentences) == 3
    assert to_unicode(sentences[0]) == "First sentence."
    assert to_unicode(sentences[1]) == "Second sentence."
    assert to_unicode(sentences[2]) == "Third sentence."
def test_more_sentences_than_requested(self):
    document = build_document_from_string("""
        # Heading one
        First sentence.
        Second sentence.
        Third sentence.

        # Heading two
        I like sentences
        They are so wordy
        And have many many letters
        And are green in my editor
        But someone doesn't like them :(
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 4)

    self.assertEqual(len(sentences), 4)
def get_summarizers(self, names):
    """Retrieves sumy summarizer algorithms.

    Parameters:
        names (list): list of summarizer algorithm names

    Returns:
        dict: summarizers
    """
    summarizers = {}
    for name in names:
        if name == "random":
            from sumy.summarizers.random import RandomSummarizer
            summarizers["random"] = RandomSummarizer(null_stemmer)
        elif name == "luhn":
            from sumy.summarizers.luhn import LuhnSummarizer
            summarizers["luhn"] = LuhnSummarizer(stemmer=null_stemmer)
        elif name == "lsa":
            from sumy.summarizers.lsa import LsaSummarizer
            summarizers["lsa"] = LsaSummarizer(stemmer=null_stemmer)
        elif name == "lexrank":
            from sumy.summarizers.lex_rank import LexRankSummarizer
            summarizers["lexrank"] = LexRankSummarizer(null_stemmer)
        elif name == "textrank":
            from sumy.summarizers.text_rank import TextRankSummarizer
            summarizers["textrank"] = TextRankSummarizer(null_stemmer)
        elif name == "sumbasic":
            from sumy.summarizers.sum_basic import SumBasicSummarizer
            summarizers["sumbasic"] = SumBasicSummarizer(null_stemmer)
        elif name == "kl-sum":
            from sumy.summarizers.kl import KLSummarizer
            summarizers["kl-sum"] = KLSummarizer(null_stemmer)
        elif name == "reduction":
            from sumy.summarizers.reduction import ReductionSummarizer
            summarizers["reduction"] = ReductionSummarizer(null_stemmer)

    for _, summarizer in summarizers.items():
        summarizer.stop_words = frozenset(
            self.stop_words._get_stop_words(custom_stop_words=[]))

    return summarizers
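# Hypothetical use of the method above (assumption: "helper" is an instance of the
# class that owns get_summarizers, and "document" is an already-parsed sumy
# document; both names and the algorithm list are made up for illustration).
summarizers = helper.get_summarizers(["random", "luhn", "lexrank"])
for name, summarizer in summarizers.items():
    print(name, [str(sentence) for sentence in summarizer(document, 2)])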
    logger.info('Loading standardized docs')
    with open(base_dir + '/std_docs.dump', mode='rb') as f:
        std_docs = pickle.load(f)
    with open(base_dir + '/std_sums.dump', mode='rb') as f:
        std_sums = pickle.load(f)
else:
    logger.error('You must first generate standardized docs and refs')
    exit(1)

rouge = Rouge()

summarizers = {'Luhn': LuhnSummarizer(),
               'LexRank': LexRankSummarizer(),
               # 'Lsa': LsaSummarizer(),
               'TextRank': TextRankSummarizer(),
               'Random': RandomSummarizer(),
               # 'KLSum': KLSummarizer(),
               'SumBasic': SumBasicSummarizer()
               }

text_sums = []
for i, summary in enumerate(std_sums):
    text_sums.append(' '.join([' '.join(line) for line in summary]))

# Length of each document, used to guess the size of the summary
x = []
for doc in std_docs:
    words = 0
    for line in doc:
        words += len(line)
    x.append(words)
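# Hedged sketch of how the Rouge scorer above might be applied once candidate
# summaries exist (assumption: "Rouge" is the scorer from the PyPI "rouge"
# package, whose get_scores takes hypothesis and reference strings; the two
# strings below are placeholders, not values from this script).
example_scores = rouge.get_scores("generated summary text", "reference summary text")
print(example_scores)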
def test_empty_document(self):
    document = build_document()
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 10)

    self.assertEqual(len(sentences), 0)
def test_empty_document():
    document = build_document()
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 10)

    assert len(sentences) == 0
def main():
    print("\n\t\t SUMMARIZATION REVIEW\t\t\n")
    print('[INFO] Loading configuration')
    with open("./config.yml", 'r') as file:
        config_var = safe_load(file)["main"]
    data = load_clean_data(path_to_file=str(config_var['dataset_folder'])
                           + "/" + str(config_var['data_to_use']))
    # # #
    print("[INFO] Training sentence tokenizer for summary on all articles.")
    punkt_tokenizer = PunktSentenceTokenizer(
        train_text="\n".join([sent for sent in data["Paragraphs_as_string"]])
    )
    # # #
    len_sum = np.mean(data["Summary"].apply(lambda x: len(punkt_tokenizer.tokenize(x))))
    print("[INFO] Average number of sentences in article summaries", len_sum)
    print("[COMMENT] Considering this value as reference to generate the automatic summaries.")
    len_sum = int(len_sum)
    # # #
    print("[INFO] Using " + str(config_var['language']) + " stemmer")
    stemmer = Stemmer(config_var['language'])
    print("[INFO] Preparing summarizers")
    summarizer_dict = {"LSA": LsaSummarizer(stemmer),
                       "Luhn": LuhnSummarizer(stemmer),
                       "LexRank": LexRankSummarizer(stemmer),
                       "SumBasics": SumBasicSummarizer(stemmer),
                       "Random": RandomSummarizer(stemmer),
                       "Reduction": ReductionSummarizer(stemmer)}
    print("[INFO] Preparing stopwords.")
    for summarizer in summarizer_dict.values():
        summarizer.stop_words = get_stop_words('english')
    print("[INFO] Summaries preparation")
    dict_res = {}
    dict_summs = {}
    for name, summarizer in summarizer_dict.items():
        print("[INFO] Method:", name)
        results_rouge_1 = []
        results_rouge_2 = []
        results_rouge_l_1 = []
        results_rouge_l_2 = []
        sums = {}
        for i in progressbar.progressbar(range(len(data))):
            (article, summary) = (data["Paragraphs_as_string"][i], data["Summary"][i])
            parser = PlaintextParser.from_string(
                article, tokenizer=Tokenizer('english'))
            summaries = [
                sentence for sentence in summarizer(parser.document, len_sum)
            ]
            summaries_str = [
                str(sentence) for sentence in summarizer(parser.document, len_sum)
            ]
            # Append current summary results.
            # Since there are problems with some documents being skipped,
            # I need to save the index as well.
            sums[i] = (" ".join(summaries_str))
            # To use sumy's evaluation functions, I need to have the text in
            # Sentence objects.
            reference_sentences = [
                Sentence(sent, tokenizer=Tokenizer("english"))
                for sent in punkt_tokenizer.tokenize(summary)
            ]
            try:
                results_rouge_1.append(
                    rouge_1(evaluated_sentences=summaries,
                            reference_sentences=reference_sentences))
            except:
                results_rouge_1.append(np.nan)
            try:
                results_rouge_2.append(
                    rouge_2(evaluated_sentences=summaries,
                            reference_sentences=reference_sentences))
            except:
                # print("[ERROR] Some problem occurred in the rouge_2 calculation. This is most likely caused by sentences too short in the summary. No workaround has been found for this: the value will be set to NA.")
                results_rouge_2.append(np.nan)
            try:
                results_rouge_l_1.append(
                    rouge_l_sentence_level(evaluated_sentences=summaries,
                                           reference_sentences=reference_sentences))
            except:
                # print("[ERROR] Some problem occurred in the rouge_L (sentence level) calculation. This is most likely caused by sentences too short in the summary. No workaround has been found for this: the value will be set to NA.")
                results_rouge_l_1.append(np.nan)
            try:
                results_rouge_l_2.append(
                    rouge_l_summary_level(evaluated_sentences=summaries,
                                          reference_sentences=reference_sentences))
            except:
                # print("[ERROR] Some problem occurred in the rouge_L (summary level) calculation. This is most likely caused by sentences too short in the summary. No workaround has been found for this: the value will be set to NA.")
                results_rouge_l_2.append(np.nan)
        # Save results and progress to the next summarizer
        dict_res[name] = {
            "Rouge_1": results_rouge_1,
            "Rouge_2": results_rouge_2,
            "Rouge_L_sentence_level": results_rouge_l_1,
            "Rouge_L_summary_level": results_rouge_l_2
        }
        # Save summaries to dictionary
        dict_summs[name] = sums
    print("[INFO] Summaries and evaluations completed.")
    print("[INFO] Saving data to output.")
    # Create pandas dataframe for mean of results
    res_mean = pd.DataFrame(columns=dict_res.keys())
    # Dataframe for std of results
    res_se = pd.DataFrame(columns=dict_res.keys())
    for col in res_mean:
        res_mean[col] = pd.Series(
            {key: np.nanmean(value) for key, value in dict_res[col].items()})
        res_se[col] = pd.Series(
            {key: np.nanstd(value) / np.sqrt(len(value)) for key, value in dict_res[col].items()})
    print("[INFO] Saving evaluation averages.")
    with open(config_var['output_folder'] + "/avgs.csv", 'w') as file:
        res_mean.to_csv(file)
    print("[INFO] Saving evaluations standard errors.")
    with open(config_var['output_folder'] + "/ses.csv", 'w') as file:
        res_se.to_csv(file)
    print("[INFO] Saving to json all produced summaries.")
    with open(config_var['output_folder'] + "/summaries.json", 'w') as file:
        json.dump(dict_summs, file)
    print("[INFO] Program completed successfully.")
          stemming=True, stopwords=False, word_level=True,
          length_limit=True, length=600, use_cf=False, cf=95,
          scoring_formula='best', resampling=True, samples=1,
          favor=True, p=0.5)
score6 = r6.calc_score()

summarizer7 = RandomSummarizer()
summary7 = summarizer7(parser.document, 10)
for sentence in summary7:
    generated7 = generated7 + " " + sentence._text
candidate7 = generated7.split(" ")
b7 = []
b7.append(sentence_bleu(ref_bleu, candidate7, weights=(1, 0, 0, 0)))  # 1 gram
b7.append(sentence_bleu(ref_bleu, candidate7, weights=(0, 1, 0, 0)))  # 2 gram
b7.append(sentence_bleu(ref_bleu, candidate7, weights=(0, 0, 1, 0)))  # 3 gram
b7.append(sentence_bleu(ref_bleu, candidate7, weights=(0, 0, 0, 1)))  # 4 gram
import os

# Imports needed by the script below; "Summarizer" is assumed to be sumy's
# RandomSummarizer, matching the RandomSummary output folder.
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.random import RandomSummarizer as Summarizer
from sumy.utils import get_stop_words


# Create a folder if it does not already exist
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error: Creating directory. ' + directory)


LANGUAGE = "bangla"
SENTENCES_COUNT = 2

if __name__ == "__main__":
    createFolder('Dataset/NCTB/RandomSummary/')
    for i in range(1, 140):
        serial_no = str(i)
        path = "Dataset/NCTB/Source/" + serial_no + ".txt"
        parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        summary = ""
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            summary = summary + " " + str(sentence)
        with open('Dataset/NCTB/RandomSummary/' + serial_no + '.txt', 'w', encoding='utf-8') as fi:
            fi.write(summary)