def cal_rouge_gain(src_txt, tgt_txt, c_idxs, rouge2_ratio=0.5, do_norm=True):
    '''Score how much each unselected sentence would add to the current selection.

    Args:
        src_txt: source sentences; each one is split on whitespace.
        tgt_txt: reference summary string; "<q>" markers are treated as spaces.
        c_idxs: indices already selected (may be None or empty). Entries
            below zero are ignored when the context is built.
        rouge2_ratio: weight of the bigram 'f' score; the unigram 'f' score
            is weighted by ``1 - rouge2_ratio``.
        do_norm: when true and the gains are not all identical, min-max
            rescale the gains.

    Returns:
        A numpy array with one entry per source sentence: the change in the
        blended score if that sentence were added to the selection.
        Sentences found in ``c_idxs`` keep ``-inf``.
    '''
    tokenized = [sentence.split() for sentence in src_txt]
    reference = tgt_txt.replace("<q>", " ").split()

    evaluated_1grams = [
        _get_word_ngrams(1, [tokens], do_count=True) for tokens in tokenized
    ]
    reference_1grams = _get_word_ngrams(1, [reference], do_count=True)
    evaluated_2grams = [
        _get_word_ngrams(2, [tokens], do_count=True) for tokens in tokenized
    ]
    reference_2grams = _get_word_ngrams(2, [reference], do_count=True)

    def _blended_f(unigrams, bigrams):
        # Weighted mix of the unigram and bigram 'f' entries of cal_rouge.
        f_1 = cal_rouge(unigrams, reference_1grams)['f']
        f_2 = cal_rouge(bigrams, reference_2grams)['f']
        return f_1 * (1 - rouge2_ratio) + f_2 * rouge2_ratio

    has_context = c_idxs is not None and len(c_idxs) != 0
    if has_context:
        # Negative indices are skipped so they never index the n-gram lists.
        context_1 = union_dic(
            [evaluated_1grams[idx] for idx in c_idxs if idx > -1])
        context_2 = union_dic(
            [evaluated_2grams[idx] for idx in c_idxs if idx > -1])
        base_score = _blended_f(context_1, context_2)
    else:
        context_1 = coll.defaultdict(int)
        context_2 = coll.defaultdict(int)
        base_score = 0.0

    lowest = float('inf')
    highest = float('-inf')
    gains = [float('-inf')] * len(tokenized)
    for pos in range(len(tokenized)):
        if c_idxs is not None and pos in c_idxs:
            # Already selected: leave the -inf placeholder in place.
            continue
        gain = _blended_f(
            union_dic([context_1, evaluated_1grams[pos]]),
            union_dic([context_2, evaluated_2grams[pos]]),
        ) - base_score
        gains[pos] = gain
        if gain > highest:
            highest = gain
        if gain < lowest:
            lowest = gain

    spread = highest - lowest
    gains = np.asarray(gains)
    if do_norm and spread > 0:
        # Min-max scaling; the -inf placeholders stay -inf.
        gains = (gains - lowest) / spread
    return gains
def combination_selection(doc_sent_list, abstract_sent_list, summary_size):
    """Exhaustively search sentence combinations for the best summed 'f' score.

    Args:
        doc_sent_list: document sentences, each a list of token strings.
        abstract_sent_list: abstract sentences, each a list of token strings.
        summary_size: combinations of 1 up to ``summary_size + 1`` sentences
            are tried.

    Returns:
        The sorted indices of the best-scoring combination. If no combination
        scores above zero the initial placeholder ``[0, 0]`` is returned.
    """

    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    reference_tokens = _rouge_clean(
        ' '.join(sum(abstract_sent_list, []))).split()
    sents = [_rouge_clean(' '.join(s)).split() for s in doc_sent_list]

    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]
    reference_1grams = _get_word_ngrams(1, [reference_tokens])
    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]
    reference_2grams = _get_word_ngrams(2, [reference_tokens])

    best_score = 0.0
    best_combo = (0, 0)
    zero_scoring = []
    for extra in range(summary_size + 1):
        # The pool is fixed before this round starts, so sentences flagged
        # during the single-sentence round only drop out of later rounds.
        pool = [i for i in range(len(sents)) if i not in zero_scoring]
        for combo in itertools.combinations(pool, extra + 1):
            unigrams = set().union(*(evaluated_1grams[idx] for idx in combo))
            bigrams = set().union(*(evaluated_2grams[idx] for idx in combo))
            score = (cal_rouge(unigrams, reference_1grams)['f'] +
                     cal_rouge(bigrams, reference_2grams)['f'])
            if extra == 0 and score == 0:
                # A sentence with no overlap on its own is pruned from
                # every larger combination.
                zero_scoring.append(combo[0])
            if score > best_score:
                best_combo = combo
                best_score = score
    return sorted(best_combo)
def cal_rouge_doc(src_txt, tgt_txt, c_idxs, c_masks=None):
    """Score one document's selected sentences against its reference summary.

    Args:
        src_txt: source sentences; each one is split on whitespace.
        tgt_txt: reference summary string; "<q>" markers are treated as spaces.
        c_idxs: indices of the selected sentences; -1 is the padding id.
        c_masks: optional per-position flags. When given, position ``i`` of
            ``c_idxs`` is used only if ``c_masks[i]`` is truthy; otherwise
            negative indices are dropped instead.

    Returns:
        The sum of the unigram and bigram 'f' entries returned by cal_rouge.
    """
    reference = tgt_txt.replace("<q>", " ").split()
    reference_1grams = _get_word_ngrams(1, [reference], do_count=True)
    reference_2grams = _get_word_ngrams(2, [reference], do_count=True)

    if c_masks is not None:
        chosen = [
            src_txt[idx].split()
            for pos, idx in enumerate(c_idxs)
            if c_masks[pos]
        ]
    else:
        chosen = [src_txt[idx].split() for idx in c_idxs if idx > -1]

    per_sent_1grams = [
        _get_word_ngrams(1, [tokens], do_count=True) for tokens in chosen
    ]
    per_sent_2grams = [
        _get_word_ngrams(2, [tokens], do_count=True) for tokens in chosen
    ]

    merged_1grams = union_dic(per_sent_1grams)
    merged_2grams = union_dic(per_sent_2grams)
    unigram_result = cal_rouge(merged_1grams, reference_1grams)
    bigram_result = cal_rouge(merged_2grams, reference_2grams)
    return unigram_result['f'] + bigram_result['f']
def presumm_reward3(doc_sent_list, abstract_sent_list, mode='f'):
    """Compute a reward per document sentence from consecutive score changes.

    Each sentence is scored on its own (no earlier sentences are accumulated)
    as the sum of the unigram and bigram ``mode`` entries of cal_rouge. The
    reward is that score minus the previous sentence's score, plus 1.

    Args:
        doc_sent_list: document sentences, each a list of token strings.
        abstract_sent_list: abstract sentences, each a list of token strings.
        mode: key read from the cal_rouge result (default 'f').

    Returns:
        A list with one reward per document sentence, in document order.
    """

    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    reference_tokens = _rouge_clean(
        ' '.join(sum(abstract_sent_list, []))).split()
    sents = [_rouge_clean(' '.join(s)).split() for s in doc_sent_list]

    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]
    reference_1grams = _get_word_ngrams(1, [reference_tokens])
    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]
    reference_2grams = _get_word_ngrams(2, [reference_tokens])

    rewards = []
    previous_total = 0
    for unigrams, bigrams in zip(evaluated_1grams, evaluated_2grams):
        total = (cal_rouge(set(unigrams), reference_1grams)[mode] +
                 cal_rouge(set(bigrams), reference_2grams)[mode])
        # The +1 offset shifts every reward upward by a constant.
        rewards.append(total - previous_total + 1)
        previous_total = total
    return rewards
def get_label_orders(batch_src_sents, batch_tgt_str, batch_labels):
    '''Order each example's labelled sentences by greedy marginal gain.

    Args:
        batch_src_sents: per example, the source sentences; each sentence is
            split on whitespace.
        batch_tgt_str: per example, the ground-truth summary string; "<q>"
            markers are treated as spaces.
        batch_labels: per example, labels such as [0,0,0,1,0] where 1 marks a
            ground-truth sentence.

    Returns:
        One list per example holding the labelled sentence indices in the
        order they were greedily picked, e.g. [[2,1,0],[6,9]] (no padding).
    '''
    ordered_batch = []
    for b in range(len(batch_src_sents)):
        labels = batch_labels[b]
        positives = [pos for pos in range(len(labels)) if labels[pos] == 1]
        src_txt, tgt_txt = batch_src_sents[b], batch_tgt_str[b]

        # Only the labelled sentences compete; n-gram lists are indexed by
        # position within `positives`, not by document position.
        tokenized = [src_txt[pos].split() for pos in positives]
        reference = tgt_txt.replace("<q>", " ").split()
        evaluated_1grams = [
            _get_word_ngrams(1, [tokens], do_count=True) for tokens in tokenized
        ]
        reference_1grams = _get_word_ngrams(1, [reference], do_count=True)
        evaluated_2grams = [
            _get_word_ngrams(2, [tokens], do_count=True) for tokens in tokenized
        ]
        reference_2grams = _get_word_ngrams(2, [reference], do_count=True)

        remaining = list(range(len(positives)))
        picked = []
        while remaining:
            # Start below any real score so that a candidate is always
            # chosen, even if every score is zero or negative.
            best_local = -1
            best_score = float('-inf')
            for cand in remaining:
                members = picked + [cand]
                unigrams = union_dic([evaluated_1grams[m] for m in members])
                bigrams = union_dic([evaluated_2grams[m] for m in members])
                score = (cal_rouge(unigrams, reference_1grams)['f'] +
                         cal_rouge(bigrams, reference_2grams)['f'])
                if score > best_score:
                    best_score = score
                    best_local = cand
            assert best_local > -1
            remaining.remove(best_local)
            picked.append(best_local)

        # Map positions within `positives` back to document indices.
        ordered_batch.append([positives[p] for p in picked])
    return ordered_batch
def greedy_selection_given_context(src_txt, tgt_txt, c_idxs, summary_size=3):
    """Greedily extend an existing selection up to ``summary_size`` sentences.

    Args:
        src_txt: source sentences; each one is split on whitespace.
        tgt_txt: reference summary string; "<q>" markers are treated as spaces.
        c_idxs: indices already selected, or None. Entries below zero are
            ignored.
        summary_size: total number of sentences allowed, counting the context.

    Returns:
        The newly picked indices only (context excluded), in pick order.
        Selection stops early when no sentence raises the combined score
        above the best score reached so far (initially 0.0).
    """
    tokenized = [sentence.split() for sentence in src_txt]
    reference = tgt_txt.replace("<q>", " ").split()
    evaluated_1grams = [
        _get_word_ngrams(1, [tokens], do_count=True) for tokens in tokenized
    ]
    reference_1grams = _get_word_ngrams(1, [reference], do_count=True)
    evaluated_2grams = [
        _get_word_ngrams(2, [tokens], do_count=True) for tokens in tokenized
    ]
    reference_2grams = _get_word_ngrams(2, [reference], do_count=True)

    chosen = []
    if c_idxs is not None:
        chosen = [cid for cid in c_idxs if cid > -1]
    context_len = len(chosen)

    best_score = 0.0
    for _ in range(summary_size - context_len):
        round_best = best_score
        round_pick = -1
        for i in range(len(tokenized)):
            if i in chosen:
                continue
            members = chosen + [i]
            unigrams = union_dic([evaluated_1grams[m] for m in members])
            bigrams = union_dic([evaluated_2grams[m] for m in members])
            score = (cal_rouge(unigrams, reference_1grams)['f'] +
                     cal_rouge(bigrams, reference_2grams)['f'])
            if score > round_best:
                round_best = score
                round_pick = i
        if round_pick == -1:
            # Nothing improved on the running best; stop extending.
            break
        chosen.append(round_pick)
        best_score = round_best
    return chosen[context_len:]
def cal_rouge_score(pred_list, truth_list):
    """Compute unigram and bigram cal_rouge results for paired texts.

    Args:
        pred_list: predicted summary strings.
        truth_list: reference summary strings, paired with ``pred_list`` by
            position. "<q>" markers in either are treated as spaces.

    Returns:
        A tuple ``(rouge1_arr, rouge2_arr)`` of lists holding, per pair, the
        value returned by cal_rouge for unigrams and bigrams respectively.
    """

    def _tokens(text):
        return text.replace("<q>", " ").split()

    rouge1_arr = []
    rouge2_arr = []
    for prediction, truth in zip(pred_list, truth_list):
        gold_tokens = _tokens(truth)
        gold_1grams = _get_word_ngrams(1, [gold_tokens], do_count=True)
        gold_2grams = _get_word_ngrams(2, [gold_tokens], do_count=True)

        pred_tokens = _tokens(prediction)
        pred_1grams = _get_word_ngrams(1, [pred_tokens], do_count=True)
        pred_2grams = _get_word_ngrams(2, [pred_tokens], do_count=True)

        rouge1_arr.append(cal_rouge(pred_1grams, gold_1grams))
        rouge2_arr.append(cal_rouge(pred_2grams, gold_2grams))
    return rouge1_arr, rouge2_arr
def each_selection(doc_sent_list, abstract_sent_list):
    """Pick, for every abstract sentence, the best-matching document sentence.

    Each document sentence is scored against one abstract sentence at a time
    as the sum of the unigram and bigram 'f' entries of cal_rouge. The
    highest-scoring document sentence (first one on ties) is recorded.

    Args:
        doc_sent_list: document sentences, each a list of token strings.
        abstract_sent_list: abstract content; it is flattened by one level
            and every resulting element is then space-joined.

    Returns:
        The sorted list of distinct document sentence indices that were the
        best match for at least one abstract sentence.
    """

    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    # NOTE(review): after flattening one level, each element is joined with
    # spaces, so abstract_sent_list presumably nests one level deeper than
    # doc_sent_list (lists of sentences, each a token list). If it holds
    # plain token lists, single tokens are split into characters -- confirm
    # against the caller.
    abstract = sum(abstract_sent_list, [])
    abstracts = [_rouge_clean(' '.join(s)).split() for s in abstract]
    sents = [_rouge_clean(' '.join(s)).split() for s in doc_sent_list]

    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]
    reference_1grams = [_get_word_ngrams(1, [sent]) for sent in abstracts]
    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]
    reference_2grams = [_get_word_ngrams(2, [sent]) for sent in abstracts]

    selected = []
    for ref_1grams, ref_2grams in zip(reference_1grams, reference_2grams):
        reference_1 = set(ref_1grams)
        reference_2 = set(ref_2grams)

        # One score per document sentence, so a position in `scores` is the
        # sentence index. The previous code skipped sentence `k` whenever an
        # earlier score happened to equal `k` (e.g. 1 == 1.0), which shifted
        # all later positions and could select the wrong sentence.
        scores = [
            cal_rouge(set(cand_1grams), reference_1)['f'] +
            cal_rouge(set(cand_2grams), reference_2)['f']
            for cand_1grams, cand_2grams in zip(evaluated_1grams,
                                                evaluated_2grams)
        ]
        if not scores:
            continue
        best_index = scores.index(max(scores))
        if best_index not in selected:
            selected.append(best_index)
    return sorted(selected)
def greedy_selection(doc_sent_list, abstract_sent_list, summary_size):
    """Build gold extractive labels from a hand-written abstract.

    In each round the sentence whose addition gives the highest combined
    unigram + bigram 'f' score against the abstract is added to the
    selection. The search stops after ``summary_size`` sentences, or earlier
    when no remaining sentence improves the score.

    Input:
        doc_sent_list: the document, one list of token strings per sentence.
        abstract_sent_list: the hand-written abstract, same layout.
        summary_size: maximum number of sentences to select.
    Output:
        Indices of the selected sentences in ascending order (at most
        ``summary_size`` of them, possibly none).
    """

    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    abstract = sum(abstract_sent_list, [])
    abstract = _rouge_clean(' '.join(abstract)).split()
    sents = [_rouge_clean(' '.join(s)).split() for s in doc_sent_list]
    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]
    reference_1grams = _get_word_ngrams(1, [abstract])
    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]
    reference_2grams = _get_word_ngrams(2, [abstract])

    max_rouge = 0.0
    selected = []
    for _ in range(summary_size):
        # The n-grams of the already selected sentences are identical for
        # every candidate of this round, so merge them once.
        chosen_1grams = set().union(*(evaluated_1grams[idx] for idx in selected))
        chosen_2grams = set().union(*(evaluated_2grams[idx] for idx in selected))

        cur_max_rouge = max_rouge
        cur_id = -1
        for i in range(len(sents)):
            if i in selected:
                continue  # never pick the same sentence twice
            rouge_1 = cal_rouge(chosen_1grams.union(evaluated_1grams[i]),
                                reference_1grams)['f']
            rouge_2 = cal_rouge(chosen_2grams.union(evaluated_2grams[i]),
                                reference_2grams)['f']
            rouge_score = rouge_1 + rouge_2
            if rouge_score > cur_max_rouge:
                cur_max_rouge = rouge_score
                cur_id = i
        if cur_id == -1:
            # No candidate improved the score. Fall through to the common
            # return so this path is sorted as well (it used to return the
            # indices in pick order).
            break
        selected.append(cur_id)
        max_rouge = cur_max_rouge
    return sorted(selected)
def greedy_selection2(doc_sent_list, abstract_sent_list, summary_size):
    """Greedy sentence selection for Japanese text given as raw strings.

    The document string is split into sentences on '。', every sentence and
    the abstract are tokenized with the module-level ``tokenizer``, and
    sentences are then added one at a time while the combined unigram +
    bigram 'f' score against the abstract keeps improving.

    Args:
        doc_sent_list: the whole document as one string.
        abstract_sent_list: the abstract, passed as-is to
            ``tokenizer.tokenize``.
        summary_size: maximum number of sentences to select.

    Returns:
        Indices of the selected sentences in ascending order.
    """
    # For Japanese text: split on the full stop and re-attach it to each piece.
    pieces = [piece + '。' for piece in doc_sent_list.split('。')]
    # The final piece is discarded -- presumably the empty remainder after a
    # trailing '。'. NOTE(review): a document that does not end with '。'
    # loses its last sentence here; confirm this is intended.
    pieces = pieces[:-1]

    abstract = tokenizer.tokenize(abstract_sent_list)
    sents = [tokenizer.tokenize(piece) for piece in pieces]

    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]
    reference_1grams = _get_word_ngrams(1, [abstract])
    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]
    reference_2grams = _get_word_ngrams(2, [abstract])

    max_rouge = 0.0
    selected = []
    for _ in range(summary_size):
        cur_max_rouge = max_rouge
        cur_id = -1
        for i in range(len(sents)):
            if i in selected:
                continue
            members = selected + [i]
            candidates_1 = set().union(*(evaluated_1grams[idx] for idx in members))
            candidates_2 = set().union(*(evaluated_2grams[idx] for idx in members))
            rouge_score = (cal_rouge(candidates_1, reference_1grams)['f'] +
                           cal_rouge(candidates_2, reference_2grams)['f'])
            if rouge_score > cur_max_rouge:
                cur_max_rouge = rouge_score
                cur_id = i
        if cur_id == -1:
            # No improvement possible. Use the common sorted return instead
            # of handing back the indices in pick order.
            break
        selected.append(cur_id)
        max_rouge = cur_max_rouge
    return sorted(selected)
def greedy_selection(doc_sent_list, abstract_sent_list, summary_size):
    """Greedily select sentences and return them in the order they were picked.

    Args:
        doc_sent_list: document sentences, each a list of token strings.
        abstract_sent_list: abstract sentences, each a list of token strings.
        summary_size: maximum number of sentences to select.

    Returns:
        The selected sentence indices in pick order (not sorted). The list
        may be shorter than ``summary_size``, or empty, when no sentence
        improves the combined unigram + bigram 'f' score.
    """

    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    reference_tokens = _rouge_clean(
        ' '.join(sum(abstract_sent_list, []))).split()
    sents = [_rouge_clean(' '.join(s)).split() for s in doc_sent_list]

    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]
    reference_1grams = _get_word_ngrams(1, [reference_tokens])
    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]
    reference_2grams = _get_word_ngrams(2, [reference_tokens])

    best_so_far = 0.0
    picked = []
    for _ in range(summary_size):
        round_best = best_so_far
        round_pick = -1
        for i in range(len(sents)):
            if i in picked:
                continue
            members = picked + [i]
            unigrams = set().union(*(evaluated_1grams[idx] for idx in members))
            bigrams = set().union(*(evaluated_2grams[idx] for idx in members))
            score = (cal_rouge(unigrams, reference_1grams)['f'] +
                     cal_rouge(bigrams, reference_2grams)['f'])
            if score > round_best:
                round_best = score
                round_pick = i
        if round_pick == -1:
            # No remaining sentence raises the score any further.
            break
        picked.append(round_pick)
        best_so_far = round_best
    return picked
def sentence_rouge(doc_sent_list, abstract_sent_list):
    """Score every document sentence individually against the whole abstract.

    Args:
        doc_sent_list: document sentences, each a list of token strings.
        abstract_sent_list: abstract sentences, each a list of token strings.

    Returns:
        One dict per document sentence with keys 'rouge_1' and 'rouge_2',
        each holding the full result returned by cal_rouge for that
        sentence's unigrams / bigrams.
    """

    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    reference_tokens = _rouge_clean(
        ' '.join(sum(abstract_sent_list, []))).split()
    sents = [_rouge_clean(' '.join(s)).split() for s in doc_sent_list]

    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]
    reference_1grams = _get_word_ngrams(1, [reference_tokens])
    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]
    reference_2grams = _get_word_ngrams(2, [reference_tokens])

    return [
        {
            'rouge_1': cal_rouge(set(unigrams), reference_1grams),
            'rouge_2': cal_rouge(set(bigrams), reference_2grams),
        }
        for unigrams, bigrams in zip(evaluated_1grams, evaluated_2grams)
    ]
def greedy_selection(doc_sent_list, abstract_sent_list, summary_size):
    """Greedily select the sentences that best cover the abstract.

    Sentences are added one per round; the sentence chosen is the one that
    yields the highest combined unigram + bigram "f" score when merged with
    the sentences selected so far. Selection ends after ``summary_size``
    rounds or as soon as no sentence improves the score.

    Args:
        doc_sent_list: document sentences, each a list of token strings.
        abstract_sent_list: abstract sentences, each a list of token strings.
        summary_size: maximum number of sentences to select.

    Returns:
        The selected sentence indices in ascending order.
    """

    def _rouge_clean(s):
        return re.sub(r"[^a-zA-Z0-9 ]", "", s)

    abstract = sum(abstract_sent_list, [])
    abstract = _rouge_clean(" ".join(abstract)).split()
    sents = [_rouge_clean(" ".join(s)).split() for s in doc_sent_list]
    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]
    reference_1grams = _get_word_ngrams(1, [abstract])
    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]
    reference_2grams = _get_word_ngrams(2, [abstract])

    max_rouge = 0.0
    selected = []
    # Running union of the n-grams of all selected sentences, so the inner
    # loop only has to merge in the single candidate sentence.
    chosen_1grams = set()
    chosen_2grams = set()
    for _ in range(summary_size):
        cur_max_rouge = max_rouge
        cur_id = -1
        for i in range(len(sents)):
            if i in selected:
                continue
            rouge_1 = cal_rouge(chosen_1grams | set(evaluated_1grams[i]),
                                reference_1grams)["f"]
            rouge_2 = cal_rouge(chosen_2grams | set(evaluated_2grams[i]),
                                reference_2grams)["f"]
            rouge_score = rouge_1 + rouge_2
            if rouge_score > cur_max_rouge:
                cur_max_rouge = rouge_score
                cur_id = i
        if cur_id == -1:
            # Nothing improved the score. Leave the loop and return sorted,
            # like the normal exit (previously this path was unsorted).
            break
        selected.append(cur_id)
        chosen_1grams.update(evaluated_1grams[cur_id])
        chosen_2grams.update(evaluated_2grams[cur_id])
        max_rouge = cur_max_rouge
    return sorted(selected)
def presumm_reward4(doc_sent_list, abstract_sent_list, n_sents=10):
    """Return a bigram-overlap reward for each of the leading sentences.

    The reward of sentence ``i`` is the 'f' entry of cal_rouge computed on
    that sentence's bigrams against the abstract's bigrams when
    ``i < n_sents``, and 0 otherwise. Unigram overlap is not used.

    Args:
        doc_sent_list: document sentences, each a list of token strings.
        abstract_sent_list: abstract sentences, each a list of token strings.
        n_sents: number of leading sentences that receive a computed reward.

    Returns:
        A list with one reward per document sentence, in document order.
    """

    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    abstract = sum(abstract_sent_list, [])
    abstract = _rouge_clean(' '.join(abstract)).split()
    reference_2grams = _get_word_ngrams(2, [abstract])

    rwds = []
    for i, sentence in enumerate(doc_sent_list):
        if i < n_sents:
            # Only sentences that are actually scored get cleaned and
            # turned into bigrams.
            tokens = _rouge_clean(' '.join(sentence)).split()
            bigrams = _get_word_ngrams(2, [tokens])
            rwds.append(cal_rouge(bigrams, reference_2grams)['f'])
        else:
            rwds.append(0)
    return rwds
def greedy_selection(doc_sent_list, abstract_sent_list, summary_size):
    """
    Create pseudo extraction labels.

    Args:
        doc_sent_list (list[list[str]]): source text to be processed.
        abstract_sent_list (list[list[str]]): target text to be processed.
        summary_size (int): maximum number of extracted sentences.

    Returns:
        A list of extracted sentence indices in ascending order. The list
        holds fewer than ``summary_size`` indices (possibly none) when no
        further sentence improves the score.
    """

    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    # Clean and concat all target sentences
    abstract = sum(abstract_sent_list, [])
    abstract = _rouge_clean(' '.join(abstract)).split()

    # Clean all source sentences
    sents = [_rouge_clean(' '.join(s)).split() for s in doc_sent_list]

    # Get 1 grams and 2 grams from source and target
    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]
    reference_1grams = _get_word_ngrams(1, [abstract])
    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]
    reference_2grams = _get_word_ngrams(2, [abstract])

    def _combined_f(indices):
        # ROUGE-1 'f' + ROUGE-2 'f' of the given sentences taken together.
        grams_1 = set().union(*(evaluated_1grams[idx] for idx in indices))
        grams_2 = set().union(*(evaluated_2grams[idx] for idx in indices))
        return (cal_rouge(grams_1, reference_1grams)['f'] +
                cal_rouge(grams_2, reference_2grams)['f'])

    max_rouge = 0.0
    selected = []
    for _ in range(summary_size):
        cur_max_rouge = max_rouge
        cur_id = -1

        # Iterate through all sentences not selected yet
        for i in range(len(sents)):
            if i in selected:
                continue

            # Consider selected and candidate sentences together
            rouge_score = _combined_f(selected + [i])
            if rouge_score > cur_max_rouge:
                cur_max_rouge = rouge_score
                cur_id = i

        # If no sentence exceeds the current max score then stop. Breaking
        # (rather than returning `selected` directly) keeps the documented
        # ascending order on this path too.
        if cur_id == -1:
            break

        # Record currently chosen sentence and score
        selected.append(cur_id)
        max_rouge = cur_max_rouge

    return sorted(selected)