class LeadOracle():
    """Oracle baseline that returns the best-scoring *lead* summary.

    For each article, a candidate summary is built from its leading sentences
    (taken in order until the length budget would be exceeded). Every
    candidate is scored against the reference with ``compute_rouge_n`` and the
    highest-scoring candidate is returned.
    """

    def __init__(self, rouge_n=1, metric='f'):
        """Store the scoring configuration.

        Args:
            rouge_n: N-gram order forwarded to ``compute_rouge_n``.
            metric: Key used to pick a value out of the mapping returned by
                ``compute_rouge_n`` (presumably 'f' selects the F-score --
                confirm against ``compute_rouge_n``).
        """
        self.rouge_n = rouge_n
        self.metric = metric
        self.summarizer = Summarizer()

    def summarize(self, ref, articles, max_len=40, len_type='words',
                  in_titles=False, out_titles=False,
                  min_sent_tokens=7, max_sent_tokens=40):
        """Return the lead summary that scores best against ``ref``.

        Args:
            ref: Reference summary text; passed to ``compute_rouge_n`` with
                ``tokenize=True``.
            articles: Input articles, handed to ``Summarizer._preprocess``.
                The preprocessed articles must expose ``sents``; each
                sentence must expose ``text`` and ``is_title``.
            max_len: Length budget of a candidate, in the unit selected by
                ``len_type``.
            len_type: Length unit, forwarded to ``Summarizer._sent_len``.
            in_titles: Expected to be a bool. Title sentences are kept only
                when both ``in_titles`` and ``out_titles`` are true.
            out_titles: See ``in_titles``.
            min_sent_tokens: Unused here; kept for interface compatibility.
            max_sent_tokens: Unused here; kept for interface compatibility.

        Returns:
            The best-scoring lead summary (sentences joined by single
            spaces), or ``''`` when no article yields a non-empty lead that
            fits within ``max_len``.
        """
        articles = self.summarizer._preprocess(articles)
        keep_titles = in_titles and out_titles

        scored_summaries = []
        for article in articles:
            sents = article.sents
            if not keep_titles:
                sents = [s for s in sents if not s.is_title]

            lead_texts = []
            current_len = 0
            for sent in sents:
                new_len = current_len + self.summarizer._sent_len(sent, len_type)
                # A lead is a contiguous prefix: stop at the first sentence
                # that does not fit instead of skipping ahead to shorter ones.
                if new_len > max_len:
                    break
                lead_texts.append(sent.text)
                current_len = new_len

            if not lead_texts:
                continue

            summary = ' '.join(lead_texts)
            rouge_scores = compute_rouge_n(
                summary, ref, self.rouge_n, tokenize=True)
            scored_summaries.append((summary, rouge_scores[self.metric]))

        # No usable candidate (no articles, only titles, or the first sentence
        # already exceeds the budget): return an empty summary rather than
        # failing on an empty list, matching Oracle.summarize.
        if not scored_summaries:
            return ''

        # max() returns the first maximal element, i.e. ties go to the
        # earliest article -- same outcome as a stable descending sort.
        best_summary, _ = max(scored_summaries, key=lambda pair: pair[1])
        return best_summary
class Oracle():
    """Greedy oracle for extractive summarization.

    Sentences from all articles are pooled and deduplicated. The summary is
    then grown one sentence at a time: each round adds the remaining sentence
    that gives the highest ``compute_rouge_n`` score against the reference
    while staying within the length budget.
    """

    def __init__(self, rouge_n=1, metric='f', early_stopping=True):
        """Store the scoring configuration.

        Args:
            rouge_n: N-gram order forwarded to ``compute_rouge_n``.
            metric: Key used to pick a value out of the mapping returned by
                ``compute_rouge_n`` (presumably 'f' selects the F-score --
                confirm against ``compute_rouge_n``).
            early_stopping: When true, the best-scoring intermediate
                selection is returned, which may be shorter than the budget
                allows. When false, only the longest selection reached by
                the greedy search is considered.
        """
        self.rouge_n = rouge_n
        self.metric = metric
        self.early_stopping = early_stopping
        self.summarizer = Summarizer()

    def summarize(self, ref, articles, max_len=40, len_type='words',
                  in_titles=False, out_titles=False,
                  min_sent_tokens=7, max_sent_tokens=40):
        """Greedily select sentences that maximise the score against ``ref``.

        Args:
            ref: Reference summary text; tokenized with ``word_tokenize``.
            articles: Input articles, handed to ``Summarizer._preprocess``.
                The preprocessed articles must expose ``sents``; each
                sentence must expose ``text``, ``words`` and ``is_title``.
            max_len: Length budget of the summary, in the unit selected by
                ``len_type``.
            len_type: Length unit, forwarded to ``Summarizer._sent_len``.
            in_titles: Expected to be a bool. Title sentences are kept only
                when both ``in_titles`` and ``out_titles`` are true.
            out_titles: See ``in_titles``.
            min_sent_tokens: Unused here; kept for interface compatibility.
            max_sent_tokens: Unused here; kept for interface compatibility.

        Returns:
            The selected sentences joined by single spaces, in the order the
            greedy search picked them, or ``''`` if nothing could be selected.
        """
        articles = self.summarizer._preprocess(articles)
        sents = [s for a in articles for s in a.sents]
        sents = self.summarizer._deduplicate(sents)
        if not (in_titles and out_titles):
            sents = [s for s in sents if not s.is_title]
        if not sents:
            # Nothing to choose from; skip tokenizing and scoring entirely.
            return ''

        sent_lens = [self.summarizer._sent_len(s, len_type) for s in sents]
        ref_words = word_tokenize(ref)

        current_len = 0
        remaining = list(range(len(sents)))
        selected = []
        # Tokens of the summary built so far, kept in sync with `selected`.
        current_words = []
        # One entry per greedy round: (indices selected so far, score).
        scored_selections = []

        while current_len < max_len and remaining:
            scored = []
            for i in remaining:
                if current_len + sent_lens[i] > max_len:
                    continue
                try:
                    rouge_scores = compute_rouge_n(
                        current_words + sents[i].words, ref_words,
                        rouge_n=self.rouge_n, tokenize=False)
                    score = rouge_scores[self.metric]
                except Exception:
                    # Best effort: a candidate that cannot be scored is
                    # skipped. Unlike a bare `except`, this lets
                    # KeyboardInterrupt/SystemExit propagate.
                    continue
                scored.append((i, score))

            if not scored:
                break

            # First maximal element wins ties, like a stable descending sort.
            best_idx, best_score = max(scored, key=lambda pair: pair[1])
            selected.append(best_idx)
            remaining.remove(best_idx)
            current_len += sent_lens[best_idx]
            current_words.extend(sents[best_idx].words)
            scored_selections.append((list(selected), best_score))

        if not scored_selections:
            return ''

        if not self.early_stopping:
            # Without early stopping only the longest selection is eligible:
            # drop the shorter intermediate ones.
            max_sents = max(len(sel) for sel, _ in scored_selections)
            scored_selections = [
                entry for entry in scored_selections
                if len(entry[0]) == max_sents
            ]

        best_selection, _ = max(scored_selections, key=lambda pair: pair[1])
        return ' '.join(sents[i].text for i in best_selection)