def CallRougeForOneSum(hyp_sents, ref_sents, _ngram=1, _l=False, _w=False, _su4=False,
                       _stem=True, _stopw=True, _lenlmt=False, _len=200):
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=[hyp_sents], reference=[ref_sents],
                        xml_dir=rouge_xml_dir,
                        n_gram=_ngram, ROUGE_L=_l, ROUGE_W=_w, ROUGE_SU4=_su4,
                        stemming=_stem, stopwords=_stopw,
                        length_limit=_lenlmt, length=_len)
    score = rouge.calc_score()
    return score

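# Hedged usage sketch for the helper above (the sentences are made-up
# placeholders and rouge_xml_dir is assumed to be defined elsewhere).
# hyp_sents is a flat list of system sentences; ref_sents is a list of
# reference summaries, each itself a list of sentences.
hyp_sents = ["the cat sat on the mat ."]
ref_sents = [["a cat was sitting on the mat ."]]
print(CallRougeForOneSum(hyp_sents, ref_sents, _ngram=2, _l=True))
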
def test_rouge_with_length_limit(self):
    data = self.load_test_data()
    rouge = RougeCalculator(stopwords=True, length_limit=50)
    for eval_id in data:
        summaries = data[eval_id]["summaries"]
        references = data[eval_id]["references"]
        for n in [1, 2]:
            for s in summaries:
                baseline = Pythonrouge(summary_file_exist=False,
                                       summary=[[s]],
                                       reference=[[[r] for r in references]],
                                       n_gram=n, recall_only=False,
                                       length_limit=True, length=50,
                                       stemming=False, stopwords=True)
                b1_v = baseline.calc_score()
                b2_v = rouge_n(rouge.preprocess(s),
                               [rouge.preprocess(r) for r in references],
                               n, 0.5)
                v = rouge.rouge_n(s, references, n)
                self.assertLess(abs(b2_v - v), 1e-5)
                self.assertLess(abs(b1_v["ROUGE-{}-F".format(n)] - v), 1e-5)  # noqa

def lead3_baseline(path_file):
    # Lead-3 baseline: use the first three sentences of each document as the predicted summary.
    with open(path_file, 'r') as fd:
        lines = fd.read().splitlines()
    pred_str_bag, ref_str_bag = [], []
    for l in lines:
        name, doc, abst, span_info, gold = l.split('\t')
        doc = doc.split()
        indices = [i for i, x in enumerate(doc) if x == "@@SS@@"]
        abs_str = abst.replace("@@SS@@", "\n").split("\n")
        abs_str = [x for x in abs_str if len(x) > 1]
        if len(indices) > 2:
            sent1 = ' '.join(doc[:indices[0]]).replace("@@SS@@", "")
            sent2 = ' '.join(doc[indices[0]:indices[1]]).replace("@@SS@@", "")
            sent3 = ' '.join(doc[indices[1]:indices[2]]).replace("@@SS@@", "")
            lead3 = [sent1, sent2, sent3]
        else:
            lead3 = ' '.join(doc).replace("@@SS@@", "\n").split("\n")
            lead3 = [x for x in lead3 if len(x) > 1]
        pred_str_bag.append(lead3)
        ref_str_bag.append([abs_str])
    print('Finish reading')
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=pred_str_bag, reference=ref_str_bag,
                        n_gram=2, ROUGE_SU4=True, ROUGE_L=True,
                        ROUGE_W=True, ROUGE_W_Weight=1.2,
                        recall_only=False, stemming=True, stopwords=False,
                        word_level=True, length_limit=False, length=50,
                        use_cf=False, cf=95, scoring_formula='average',
                        resampling=True, samples=1000, favor=True, p=0.5,
                        default_conf=True)
    score = rouge.calc_score()
    print(score)

def compute_rouge(candidate: typing.List[int]) -> float:
    hypothesis = [doc_sents[sid] for sid in sorted(candidate)]
    rouge = Pythonrouge(
        summary_file_exist=False,
        summary=[hypothesis],
        reference=[[ref_sents]],
        stemming=False,
        ROUGE_SU4=False)
    score = rouge.calc_score()
    return score['ROUGE-1-F']

def evaluate(gen_summary, ref_summary, genref_summaries):
    references = []
    ref_subbed_sentences = re.sub(r'(@)', r'_\1_', ref_summary)
    ref_sentences = nltk.word_tokenize(ref_subbed_sentences)
    references.append(ref_sentences)
    for summary in genref_summaries:
        ref_subbed_sentences = re.sub(r'(@)', r'_\1_', summary)
        ref_sentences = nltk.word_tokenize(ref_subbed_sentences)
        references.append(ref_sentences)
    gen_subbed_sentences = re.sub(r'(@)', r'_\1_', gen_summary)
    gen_sentences = nltk.word_tokenize(gen_subbed_sentences)
    bleu_score = corpus_bleu([references], [gen_sentences], weights=(1, 0))
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=[gen_sentences], reference=[references],
                        n_gram=1, ROUGE_SU4=True, ROUGE_L=False,
                        recall_only=False, stemming=True, stopwords=True,
                        word_level=True, use_cf=False, cf=95,
                        scoring_formula='best', resampling=True,
                        samples=1000, favor=True, p=0.5)
    rouge_score = rouge.calc_score()
    return bleu_score, rouge_score

def RougeEvaluation(refFile, summary_sentences_list):
    with open(refFile, "r") as file_open:
        gold_standard = tokenizer.tokenize(file_open.read())
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=[summary_sentences_list],
                        reference=[[gold_standard]],
                        n_gram=2, ROUGE_SU4=False, ROUGE_L=True,
                        recall_only=False, stemming=True, stopwords=False,
                        word_level=True, length_limit=True, length=5000,
                        use_cf=False, cf=95, scoring_formula='average',
                        resampling=True, samples=1000, favor=True, p=0.5)
    score = rouge.calc_score()
    print(score)
    print('\n')

def sentence_rouge(reflex, genlex):
    rouge = Pythonrouge(n_gram=2, ROUGE_SU4=True, ROUGE_L=True,
                        stemming=True, stopwords=True, word_level=True,
                        length_limit=True, length=50, use_cf=False, cf=95,
                        scoring_formula="average", resampling=True,
                        samples=1000, favor=True, p=0.5)
    genlex = [[genlex, ]]
    reflex = [[[reflex, ]]]
    setting_file = rouge.setting(files=False, summary=genlex, reference=reflex)
    result = rouge.eval_rouge(setting_file, recall_only=False,
                              ROUGE_path=ROUGE_path, data_path=data_path)
    return result['ROUGE-L-F']

def similarity_rouge(self, s1, s2):
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=s1, reference=s2,
                        n_gram=2, ROUGE_SU4=True, ROUGE_L=True,
                        recall_only=True, stemming=False, stopwords=False,
                        word_level=True, length_limit=True, length=50,
                        use_cf=False, cf=95, scoring_formula='average',
                        resampling=True, samples=1000, favor=True, p=0.5)
    ROUGE_score = rouge.calc_score()
    return list(ROUGE_score.values())[2]

def eval_indiv_rouge(hyp_dir, ref_dir):
    print(hyp_dir, 'vs', ref_dir)
    # create tmp dirs with common cases
    hyp_set = set(os.listdir(hyp_dir))
    ref_set = set(os.listdir(ref_dir))
    common_set = hyp_set.intersection(ref_set)
    scores = collections.defaultdict(list)
    for casefile in common_set:
        tmp_hyp = './tmp_hyp'
        tmp_ref = './tmp_ref'
        os.mkdir(tmp_hyp)
        os.mkdir(tmp_ref)
        # copy to tmp dirs
        shutil.copyfile(os.path.join(hyp_dir, casefile), os.path.join(tmp_hyp, casefile))
        shutil.copyfile(os.path.join(ref_dir, casefile), os.path.join(tmp_ref, casefile))
        assert os.listdir(tmp_hyp) == os.listdir(tmp_ref)
        rouge = Pythonrouge(summary_file_exist=True,
                            peer_path=tmp_hyp, model_path=tmp_ref,
                            n_gram=2, ROUGE_SU4=True, ROUGE_L=True,
                            recall_only=True, stemming=True, stopwords=True,
                            word_level=True, length_limit=True, length=50,
                            use_cf=False, cf=95, scoring_formula='average',
                            resampling=True, samples=1000, favor=True, p=0.5)
        score = rouge.calc_score()
        print(casefile, score)
        for key in score:
            scores[key].append(score[key])
        shutil.rmtree(tmp_hyp)
        shutil.rmtree(tmp_ref)
    print(["%s: mean %.3f std %.3f" % (key,
                                       round(sum(scores[key]) / len(scores[key]), 3),
                                       round(statistics.stdev(scores[key]), 3))
           for key in scores])

def rouge_sent(predict_path, golden_path):
    import spacy
    nlp = spacy.load('en_core_web_sm')
    with open(predict_path, 'r') as rf:
        predict_summary = rf.read().replace('\n', ' ')
    with open(golden_path, 'r') as rf:
        golden_summary = rf.read().replace('\n', ' ')
    doc = nlp(predict_summary)
    predict_sent_list = [sent.text for sent in doc.sents]
    doc = nlp(golden_summary)
    golden_sent_list = [sent.text for sent in doc.sents]
    max_rouge_1 = -99999
    max_rouge_2 = -99999
    for p_idx, predict_sent in enumerate(predict_sent_list):
        for g_idx, golden_sent in enumerate(golden_sent_list):
            # initialize setting of ROUGE to eval ROUGE-1, 2, SU4
            # if you evaluate ROUGE by sentence list as above, set summary_file_exist=False
            # if recall_only=True, you can get recall scores of ROUGE
            rouge = Pythonrouge(summary_file_exist=False,
                                summary=[[predict_sent]],
                                reference=[[[golden_sent]]],
                                n_gram=2, ROUGE_SU4=True, ROUGE_L=True,
                                recall_only=True, stemming=True, stopwords=True,
                                word_level=True, length_limit=True, length=50,
                                use_cf=False, cf=95, scoring_formula='average',
                                resampling=True, samples=1000, favor=True, p=0.5)
            score = rouge.calc_score()
            if score['ROUGE-1'] > max_rouge_1:
                max_rouge_1 = score['ROUGE-1']
                p_idx_rouge_1_save = p_idx
                g_idx_rouge_1_save = g_idx
            if score['ROUGE-2'] > max_rouge_2:
                max_rouge_2 = score['ROUGE-2']
                p_idx_rouge_2_save = p_idx
                g_idx_rouge_2_save = g_idx
    print("Max ROUGE-1 score among sentences: %.5f" % max_rouge_1)
    print("[Predicted Sentence]\n%s" % predict_sent_list[p_idx_rouge_1_save])
    print("[Golden Sentence]\n%s" % golden_sent_list[g_idx_rouge_1_save])
    print()
    print("Max ROUGE-2 score among sentences: %.5f" % max_rouge_2)
    print("[Predicted Sentence]\n%s" % predict_sent_list[p_idx_rouge_2_save])
    print("[Golden Sentence]\n%s" % golden_sent_list[g_idx_rouge_2_save])

def calculate_rouge_scores(summaries, references, max_length, root=None, global_step=None):
    # command to install pythonrouge: pip install git+https://github.com/tagucci/pythonrouge.git
    from pythonrouge.pythonrouge import Pythonrouge
    logging.info('calculate ROUGE scores of %d summaries', len(summaries))
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=summaries, reference=references,
                        n_gram=2, ROUGE_SU4=False, ROUGE_L=True,
                        recall_only=False, stemming=True, stopwords=False,
                        word_level=True,
                        length_limit=max_length is not None, length=max_length,
                        use_cf=False, cf=95, scoring_formula='average',
                        resampling=True, samples=1000, favor=True, p=0.5)
    score = rouge.calc_score()
    logging.info('ROUGE(1/2/L) Scores:')
    logging.info('> ROUGE-1-R/F1: %f / %f', score['ROUGE-1-R'], score['ROUGE-1-F'])
    logging.info('> ROUGE-2-R/F1: %f / %f', score['ROUGE-2-R'], score['ROUGE-2-F'])
    logging.info('> ROUGE-L-R/F1: %f / %f', score['ROUGE-L-R'], score['ROUGE-L-F'])
    avg_token_count = sum(
        len(' '.join(summary).split()) for summary in summaries) / len(summaries)
    avg_token_count_ref = sum(
        len(' '.join(summary[0]).split()) for summary in references) / len(references)
    logging.info('> averageToken: %f / %f', avg_token_count, avg_token_count_ref)
    if root is not None and global_step is not None:
        for key in ['ROUGE-1-F', 'ROUGE-2-F']:
            swriter = tf.summary.FileWriter(os.path.join(root, key))
            summary = tf.Summary(value=[
                tf.Summary.Value(tag='ROUGE(F1)', simple_value=score[key])
            ])
            swriter.add_summary(summary, global_step)
            swriter.close()

def get_rouge(input_sentences, summary, references_dir, order=None, verbose=0):
    """
    Calculate ROUGE scores for generated summaries and references.

    :param input_sentences: unmodified input sentences to calculate extractiveness
    :param summary: list of summaries
    :param references_dir: contains reference files
    :param order: order during loading input
    :param verbose: verbosity
    :return: ROUGE scores
    """
    if order is None:
        order = range(len(summary))
    reference_filenames = os.listdir(references_dir)
    references_all = [[] for _ in reference_filenames]
    for i, reference_filename in enumerate(reference_filenames):
        with open(os.path.join(references_dir, reference_filename)) as f:
            for line in f:
                references_all[i].append([line.rstrip()])
    references = [[references_list[o] for references_list in references_all] for o in order]
    if verbose > 0:
        for i, s, r_list in zip(input_sentences, summary, references):
            print('input, generated sentence and references:')
            print('{}'.format(i))
            print('{}'.format(s))
            for r in r_list:
                print('{}'.format(r))
            print('')
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=summary, reference=references,
                        n_gram=2, ROUGE_SU4=False, ROUGE_L=True,
                        recall_only=True, stemming=True, stopwords=False,
                        word_level=False, length_limit=True, length=75,
                        use_cf=True, cf=95, scoring_formula='average',
                        resampling=True, samples=1000, favor=True, p=0.5)
    scores = rouge.calc_score()
    logger.info('ROUGE-1: {ROUGE-1:.4f} ROUGE-2: {ROUGE-2:.4f} ROUGE-L: {ROUGE-L:.4f}'.format(**scores))
    words_total = 0
    words_ext = 0
    for i, s in zip(input_sentences, summary):
        words_total += len(set(s[0].split(' ')))
        words_ext += len(set(s[0].split(' ')) & set(i.split(' ')))
    logger.info('{0:.2f}% extractive'.format(words_ext / float(words_total)))
    plt.hist([len(s[0]) for s in summary], bins=30)
    plt.xlabel('output characters', fontsize=11)
    plt.savefig('plot.png')
    return scores

def eval_rouge(hyp_dir, ref_dir):
    print(hyp_dir, 'vs', ref_dir)
    # create tmp dirs with common cases
    hyp_set = set(os.listdir(hyp_dir))
    ref_set = set(os.listdir(ref_dir))
    common_set = hyp_set.intersection(ref_set)
    # print(len(hyp_set))
    # print(len(ref_set))
    # print(len(common_set))
    tmp_hyp = './tmp_hyp'
    tmp_ref = './tmp_ref'
    if not os.path.isdir(tmp_hyp):
        os.mkdir(tmp_hyp)
    if not os.path.isdir(tmp_ref):
        os.mkdir(tmp_ref)
    # copy to tmp dirs
    for casefile in common_set:
        shutil.copyfile(os.path.join(hyp_dir, casefile), os.path.join(tmp_hyp, casefile))
        shutil.copyfile(os.path.join(ref_dir, casefile), os.path.join(tmp_ref, casefile))
    assert os.listdir(tmp_hyp) == os.listdir(tmp_ref)
    rouge = Pythonrouge(summary_file_exist=True,
                        peer_path=tmp_hyp, model_path=tmp_ref,
                        n_gram=2, ROUGE_SU4=True, ROUGE_L=True,
                        recall_only=True, stemming=True, stopwords=True,
                        word_level=True, length_limit=True, length=50,
                        use_cf=False, cf=95, scoring_formula='average',
                        resampling=True, samples=1000, favor=True, p=0.5)
    score = rouge.calc_score()
    print(score)
    shutil.rmtree(tmp_hyp)
    shutil.rmtree(tmp_ref)

def rouge_sent(predict_summary, golden_summary):
    import spacy
    nlp = spacy.load('en_core_web_sm')
    predict_summary = predict_summary.replace('\n', ' ')
    golden_summary = golden_summary.replace('\n', ' ')
    doc = nlp(predict_summary)
    predict_sent_list = [sent.text for sent in doc.sents]
    doc = nlp(golden_summary)
    golden_sent_list = [sent.text for sent in doc.sents]
    max_rouge_1 = -99999
    max_rouge_2 = -99999
    for p_idx, predict_sent in enumerate(predict_sent_list):
        for g_idx, golden_sent in enumerate(golden_sent_list):
            # initialize setting of ROUGE to eval ROUGE-1, 2, SU4
            # if you evaluate ROUGE by sentence list as above, set summary_file_exist=False
            # if recall_only=True, you can get recall scores of ROUGE
            rouge = Pythonrouge(summary_file_exist=False,
                                summary=[[predict_sent]],
                                reference=[[[golden_sent]]],
                                n_gram=2, ROUGE_SU4=True, ROUGE_L=True,
                                recall_only=True, stemming=True, stopwords=True,
                                word_level=True, length_limit=True, length=50,
                                use_cf=False, cf=95, scoring_formula='average',
                                resampling=True, samples=1000, favor=True, p=0.5)
            score = rouge.calc_score()
            if score['ROUGE-1'] > max_rouge_1:
                max_rouge_1 = score['ROUGE-1']
            if score['ROUGE-2'] > max_rouge_2:
                max_rouge_2 = score['ROUGE-2']
    return {"ROUGE-1": max_rouge_1, "ROUGE-2": max_rouge_2}

def eval_str(cs, ref):
    sci_text = str(cs).replace("\n", " ")
    gold_text = str(ref).replace("\n", " ")
    ref_summary = gold_text
    ref_bleu = []
    ref_bleu.append(gold_text.split(" "))
    reference = []
    reference.append([[gold_text]])
    cs_bleu = sci_text.split(" ")
    b = []
    b.append(sentence_bleu(ref_bleu, cs_bleu, weights=(1, 0, 0, 0)))  # 1-gram
    b.append(sentence_bleu(ref_bleu, cs_bleu, weights=(0, 1, 0, 0)))  # 2-gram
    b.append(sentence_bleu(ref_bleu, cs_bleu, weights=(0, 0, 1, 0)))  # 3-gram
    b.append(sentence_bleu(ref_bleu, cs_bleu, weights=(0, 0, 0, 1)))  # 4-gram
    answer = []
    answer.append([sci_text])
    r = Pythonrouge(summary_file_exist=False,
                    summary=answer, reference=reference,
                    n_gram=2, ROUGE_SU4=False, ROUGE_L=True,
                    recall_only=False, stemming=True, stopwords=False,
                    word_level=True, length_limit=True, length=600,
                    use_cf=False, cf=95, scoring_formula='best',
                    resampling=True, samples=1, favor=True, p=0.5)
    score = r.calc_score()
    return (b[0], b[1], b[2], b[3],
            score["ROUGE-1-P"], score["ROUGE-1-R"], score["ROUGE-1-F"],
            score["ROUGE-2-P"], score["ROUGE-2-R"], score["ROUGE-2-F"],
            score["ROUGE-L-P"], score["ROUGE-L-R"], score["ROUGE-L-F"])

def __init__(self, csv_file, type_reward):
    self.csv_file = csv_file
    self.type_reward = type_reward  # \in {rouge-1, rouge-2, rouge-l, rouge-avg}
    # self.evaluator = Pythonrouge(summary_file_exist=False, delete_xml=True,
    #                              summary=[], reference=[], n_gram=2,
    #                              ROUGE_SU4=False, ROUGE_L=True,
    #                              f_measure_only=True, stemming=True,
    #                              stopwords=False, word_level=True,
    #                              length_limit=False)

def rouge_para(predict_path, golden_path):
    # initialize setting of ROUGE, eval ROUGE-1, 2, SU4
    # if summary_file_exist=True, you should specify the predicted summary (peer_path)
    # and golden summary (model_path) paths
    rouge = Pythonrouge(summary_file_exist=True,
                        peer_path=predict_path, model_path=golden_path,
                        n_gram=2, ROUGE_SU4=True, ROUGE_L=True,
                        recall_only=True, stemming=True, stopwords=True,
                        word_level=True, length_limit=True, length=50,
                        use_cf=False, cf=95, scoring_formula='average',
                        resampling=True, samples=1000, favor=True, p=0.5)
    score = rouge.calc_score()
    print(score)

def evaluate_rouge_scores(evaluation_file_name):
    summaries = []   # model-generated
    references = []  # human-generated
    # articles = {}
    with gzip.open(evaluation_file_name) as json_file:
        json_data = json_file.read()
        data = json.loads(json_data)
    print("%d entries..." % len(data))
    for example in data:
        # datum = example['data']
        # if not datum in articles:
        #     articles[datum] = True
        summaries.append(
            remove_tags(example['prediction']).encode('utf-8').split())
        references.append([
            remove_tags(example).encode('utf-8').split()
            for example in example['label']
        ])
    print("%d entries are used for evaluation." % len(summaries))
    # DEBUG: print a couple examples and their respective ROUGE scores
    # print(zip(summaries[5:10], references[5:10]))
    # rouge = Pythonrouge(n_gram=2, ROUGE_SU4=False, ROUGE_L=True, stemming=False,
    #                     stopwords=False, word_level=True, length_limit=False, length=50,
    #                     use_cf=True, cf=95, scoring_formula="average", resampling=False,
    #                     samples=500, favor=False, p=0.5)
    # setting_file = rouge.setting(files=False, summary=summaries[5:10], reference=references[5:10])
    # print(rouge.eval_rouge(setting_file, recall_only=False, ROUGE_path=ROUGE_PATH,
    #                        data_path=ROUGE_DATA, f_measure_only=False))
    rouge = Pythonrouge(n_gram=2, ROUGE_SU4=False, ROUGE_L=True,
                        stemming=False, stopwords=False, word_level=True,
                        length_limit=False, length=50, use_cf=True, cf=95,
                        scoring_formula="average", resampling=False,
                        samples=500, favor=False, p=0.5)
    setting_file = rouge.setting(files=False, summary=summaries, reference=references)
    result = rouge.eval_rouge(setting_file, recall_only=False,
                              ROUGE_path=ROUGE_PATH, data_path=ROUGE_DATA,
                              f_measure_only=False)
    return result

def rouge(hyp, ref, n=None):  # 1 - 4, L
    hyp, ref = " ".join(hyp), " ".join(ref)
    ret = Pythonrouge(summary_file_exist=False,
                      summary=[[hyp]], reference=[[[ref]]],
                      n_gram=4, ROUGE_SU4=True, ROUGE_L=True,
                      recall_only=True, stemming=True, stopwords=True,
                      word_level=True, length_limit=True, length=50,
                      use_cf=False, cf=95, scoring_formula='average',
                      resampling=True, samples=1000, favor=True, p=0.5).calc_score()
    if n is None:
        return ret
    else:
        # with recall_only=True the score dict is keyed 'ROUGE-1' ... 'ROUGE-4', 'ROUGE-L'
        return ret["ROUGE-" + str(n)]

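# Illustrative call of the helper above (the token lists are placeholders);
# with recall_only=True, passing n="2" (or n="L") selects the corresponding
# recall value from the score dict.
r2 = rouge("the cat sat on the mat .".split(),
           "a cat was on the mat .".split(), n="2")
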
def compute_perl_rouge(hyp_list, ref_list):
    def clean(x):
        return re.sub(r"-lrb-|-rrb-|-lcb-|-rcb-|-lsb-|-rsb-|``|''",
                      lambda m: perl_remap.get(m.group()), x)

    def preprocess(doc_list, reference=False):
        doc_list_pre = []
        if reference:
            doc_list = doc_list[0]
        for sent_str in doc_list:
            sent_str = clean(sent_str.lower())
            doc_list_pre.append(sent_str)
        if reference:
            doc_list_pre = [doc_list_pre]
        return doc_list_pre

    rouge = Pythonrouge(
        summary_file_exist=False,
        summary=[preprocess(hyp) for hyp in hyp_list],
        reference=[preprocess(ref, reference=True) for ref in ref_list],
        n_gram=2, ROUGE_SU4=False, ROUGE_L=True,
        ROUGE_W=False, ROUGE_W_Weight=1.2,
        recall_only=False, f_measure_only=False,
        stemming=True, stopwords=False, word_level=False,
        length_limit=False, length=50,
        use_cf=True, cf=95, scoring_formula='average',
        resampling=True, samples=1000, favor=False, p=0.5,
    )
    scores = rouge.calc_score()
    rouge_1_f = round(scores['ROUGE-1-F'] * 100., 2)
    rouge_2_f = round(scores['ROUGE-2-F'] * 100., 2)
    rouge_l_f = round(scores['ROUGE-L-F'] * 100., 2)
    return (rouge_1_f, rouge_2_f, rouge_l_f)

def oracle_baseline(path_file):
    # Oracle baseline: build the extractive summary from the gold span indices.
    with open(path_file, 'r') as fd:
        lines = fd.read().splitlines()
    pred_str_bag, ref_str_bag = [], []
    for l in lines:
        name, doc, abst, span_info, gold = l.split('\t')
        doc = doc.split()
        span_info = [int(w) for w in span_info.split()]
        idx_in_span = list(zip(span_info[0::2], span_info[1::2]))
        gold_label = [int(l) for l in gold.split()]
        abs_str = abst.replace("@@SS@@", "\n").split("\n")
        abs_str = [x for x in abs_str if len(x) > 1]
        _buff = []
        for g in gold_label:
            content = doc[idx_in_span[g][0]:idx_in_span[g][1] + 1]
            _buff.append(' '.join(content).replace("@@SS@@", ""))
        pred_str_bag.append(_buff)
        ref_str_bag.append([abs_str])
    print('Finish reading')
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=pred_str_bag, reference=ref_str_bag,
                        n_gram=2, ROUGE_SU4=True, ROUGE_L=True,
                        ROUGE_W=True, ROUGE_W_Weight=1.2,
                        recall_only=False, stemming=True, stopwords=False,
                        word_level=True, length_limit=False, length=50,
                        use_cf=False, cf=95, scoring_formula='average',
                        resampling=True, samples=1000, favor=True, p=0.5,
                        default_conf=True)
    score = rouge.calc_score()
    print(score)

def liPaperEvaluation(pred_y, true_y):
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=pred_y, reference=true_y,
                        f_measure_only=True, n_gram=2,
                        ROUGE_SU4=False, ROUGE_L=True,
                        stemming=False, stopwords=False,
                        word_level=False, length_limit=False,
                        use_cf=False, cf=95, scoring_formula="average",
                        resampling=True, samples=1000)
    result = rouge.calc_score()
    print(result)

def __get_score(self, predicted_summ, gold_summ, **kwargs):
    summary = gold_summ
    reference = predicted_summ
    self.rouge = Pythonrouge(summary_file_exist=False,
                             summary=summary, reference=reference,
                             n_gram=3, ROUGE_SU4=False, ROUGE_L=True,
                             recall_only=True, stemming=True, stopwords=True,
                             word_level=True, length_limit=True, length=50,
                             use_cf=False, cf=95, scoring_formula='average',
                             resampling=True, samples=1000, favor=True, p=0.5)
    self.rouge.summary = gold_summ
    self.rouge.reference = predicted_summ
    score = self.rouge.calc_score()
    score = score['ROUGE-1'] + score['ROUGE-2'] * 5 + score['ROUGE-3'] * 2 + score['ROUGE-L'] * 2
    config = kwargs['config']
    score = kwargs['prev_score'] - score
    if score > config.dqn_options.ERROR_THRESH:
        return score
    return -1

def evaluate(system_summary, reference_summaries, stemming=False, stopwords=False,
             use_cf=False, ngram=2):
    ROUGE_path = "rouge_files/ROUGE-1.5.5/ROUGE-1.5.5.pl"
    data_path = "rouge_files/ROUGE-1.5.5/data/"
    # initialize setting of ROUGE, eval ROUGE-1, 2
    rouge = Pythonrouge(n_gram=ngram, ROUGE_SU4=False, ROUGE_L=False,
                        stemming=stemming, stopwords=stopwords,
                        word_level=True, length_limit=True, length=100,
                        use_cf=use_cf, cf=95, scoring_formula="average",
                        resampling=True, samples=1000, favor=True, p=0.5)
    # system summary: list of summaries, where each summary is a list of sentences
    summary = [system_summary]
    # reference summaries: list of (list of summaries per article), where each summary is a list of sentences
    reference = [[[summary] for summary in reference_summaries]]
    setting_file = rouge.setting(files=False, summary=summary, reference=reference, temp_root='')
    result = rouge.eval_rouge(setting_file, ROUGE_path=ROUGE_path, data_path=data_path)
    return result

def rouge_protocol(list_of_pred, list_of_reference):
    """
    # summary: double list
    summary = [[summaryA_sent1, summaryA_sent2],
               [summaryB_sent1, summaryB_sent2]]
    # reference: triple list
    reference = [[[summaryA_ref1_sent1, summaryA_ref1_sent2],
                  [summaryA_ref2_sent1, summaryA_ref2_sent2]],
                 [[summaryB_ref1_sent1, summaryB_ref1_sent2],
                  [summaryB_ref2_sent1, summaryB_ref2_sent2]]]
    :param list_of_pred: [[]]
    :param list_of_reference: [[[]]]
    :return:
    """
    # print(list_of_pred, list_of_reference)
    if (not isinstance(list_of_pred, List)) or (not isinstance(list_of_reference, List)):
        raise TypeError("Input should be list.")
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=list_of_pred, reference=list_of_reference,
                        n_gram=2, ROUGE_SU4=True, ROUGE_L=True,
                        ROUGE_W=True, ROUGE_W_Weight=1.2,
                        recall_only=False, stemming=True, stopwords=False,
                        word_level=True, length_limit=False, length=50,
                        use_cf=False, cf=95, scoring_formula='average',
                        resampling=True, samples=1000, favor=True, p=0.5,
                        default_conf=True)
    score = rouge.calc_score()
    return score

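# Minimal sketch of the nesting rouge_protocol() expects (all sentences are
# placeholders): one system summary with two sentences, scored against a
# single reference summary for the same document.
pred = [["first system sentence .", "second system sentence ."]]
refs = [[["first reference sentence .", "second reference sentence ."]]]
print(rouge_protocol(pred, refs))
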
def eval_summaries(summaries, docs, logger=None, encoding='utf-8', delete_temps=True):
    if logger is None:
        logger = logging.getLogger(__name__)
    references = []
    hypotheses = []
    for summary, doc in zip(summaries, docs):
        refs = [[' '.join(sent) for sent in doc.summary]]
        hyp = [' '.join(doc.sentences[idx].words) for idx in summary]
        references.append(refs)
        hypotheses.append(hyp)
    assert len(references) == len(hypotheses), 'Number of references and hypotheses mismatch'
    ref_dirname = tempfile.mkdtemp()
    logger.info('References directory: %s', ref_dirname)
    hyp_dirname = tempfile.mkdtemp()
    logger.info('Hypotheses directory: %s', hyp_dirname)
    for doc_id, (refs, hyp) in enumerate(zip(references, hypotheses)):
        # Write references
        for rid, ref in enumerate(refs):
            ref_filename = os.path.join(ref_dirname, f'{doc_id}.{rid}.txt')
            with open(ref_filename, 'w', encoding=encoding) as f:
                print('\n'.join(ref), file=f)
        # Write hypothesis
        hyp_filename = os.path.join(hyp_dirname, f'{doc_id}.txt')
        with open(hyp_filename, 'w', encoding=encoding) as f:
            print('\n'.join(hyp), file=f)
    rouge = Pythonrouge(
        peer_path=hyp_dirname, model_path=ref_dirname,
        stemming=False, ROUGE_L=True, ROUGE_SU4=False)
    score = rouge.calc_score()
    logger.info('ROUGE scores: %s', score)
    if delete_temps:
        logger.info('Deleting temporary files and directories')
        shutil.rmtree(ref_dirname)
        shutil.rmtree(hyp_dirname)
    return score

def _calculate_rouge_scores(self, summaries, references, max_length=None):
    # command to install pythonrouge: pip install git+https://github.com/tagucci/pythonrouge.git
    from pythonrouge.pythonrouge import Pythonrouge
    logging.info('calculate ROUGE scores of %d summaries', len(summaries))
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=summaries, reference=references,
                        n_gram=2, ROUGE_SU4=False, ROUGE_L=True,
                        recall_only=False, stemming=True, stopwords=False,
                        word_level=True,
                        length_limit=max_length is not None, length=max_length,
                        use_cf=False, cf=95, scoring_formula='average',
                        resampling=True, samples=1000, favor=True, p=0.5)
    scores = rouge.calc_score()
    logging.info('ROUGE(1/2/L) Scores:')
    logging.info('> ROUGE-1-R/F1: %f / %f', scores['ROUGE-1-R'], scores['ROUGE-1-F'])
    logging.info('> ROUGE-2-R/F1: %f / %f', scores['ROUGE-2-R'], scores['ROUGE-2-F'])
    logging.info('> ROUGE-L-R/F1: %f / %f', scores['ROUGE-L-R'], scores['ROUGE-L-F'])
    avg_token_count = sum(
        len(' '.join(summary).split()) for summary in summaries) / len(summaries)
    avg_token_count_ref = sum(
        len(' '.join(summary[0]).split()) for summary in references) / len(references)
    logging.info('> averageToken: %f / %f', avg_token_count, avg_token_count_ref)
    return scores

def calc_item_rouge(item):
    """
    Compute ROUGE for a single item.
    :return:
    """
    para = item['data']
    # print(para)
    labels = item['label']
    pattern = '<s>([^<]*)</s>'
    sentences = re.findall(pattern, para)
    # print(len(sentences))
    reference = [re.findall(pattern, i)[0] for i in labels]
    # print(len(labels))
    all_sentences = sentences + reference
    ref = [[reference]]
    res = []
    for i in all_sentences:
        summary = [[i]]
        rouge = Pythonrouge(summary_file_exist=False,
                            summary=summary, reference=ref,
                            n_gram=2, ROUGE_SU4=True, ROUGE_L=False,
                            recall_only=True, stemming=True, stopwords=True,
                            word_level=True, length_limit=True, length=50,
                            use_cf=False, cf=95, scoring_formula='average',
                            resampling=True, samples=1000, favor=True, p=0.5)
        score = rouge.calc_score()
        print(i, score)
        res.append((i, score))
    return res

def evaluation(model, data_loader, word_to_idx, idx_to_word):
    summary = []
    reference = []
    count = 0
    for data in data_loader:
        reference.append([])
        reference[count].append(data[1])
        output = sample(model, data[0], data[2], word_to_idx, idx_to_word)
        summary.append([output])
        count += 1
        print(output)
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=summary, reference=reference,
                        n_gram=2, ROUGE_SU4=True, ROUGE_L=False,
                        recall_only=True, stemming=True, stopwords=True,
                        word_level=True, length_limit=True, length=50,
                        use_cf=False, cf=95, scoring_formula='average',
                        resampling=True, samples=1000, favor=True, p=0.5)
    score = rouge.calc_score()
    print(score)

def compute_perl_scores(reference, summary, stem, remove_stop):
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=[[summary]], reference=[[[reference]]],
                        n_gram=3, ROUGE_SU4=False, ROUGE_L=True,
                        recall_only=False, stemming=stem, stopwords=remove_stop,
                        word_level=True, length_limit=False, length=150,
                        use_cf=False, cf=95, scoring_formula='average',
                        resampling=False, samples=1000, favor=True, p=0.5)
    return rouge.calc_score()

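# Illustrative call of compute_perl_scores() with placeholder strings; stem
# and remove_stop toggle Porter stemming and stopword removal in the
# underlying ROUGE-1.5.5 run.
scores = compute_perl_scores("a reference sentence .",
                             "a system sentence .",
                             stem=True, remove_stop=False)
print(scores)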