from nltk.translate import bleu_score
from rouge import Rouge


def generation_metrics(hypothesis_list, reference_list):
    rouge = Rouge()
    # Score every (hypothesis, reference) pair: each hypothesis is repeated
    # len(reference_list) times against the cycled reference list. This
    # replaces the original's opaque reduce/zip construction with an
    # equivalent comprehension.
    hyps = [h for h in hypothesis_list for _ in reference_list]
    refs = reference_list * len(hypothesis_list)
    rouge_scores = rouge.get_scores(hyps=hyps, refs=refs, avg=True)
    bleu_scores = {}
    for i in range(1, 6):
        bleu_scores['bleu%s' % i] = bleu_score.corpus_bleu(
            list_of_references=[reference_list] * len(hypothesis_list),
            hypotheses=hypothesis_list,
            weights=[1.0 / i] * i)
    return rouge_scores, bleu_scores
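# Usage sketch (illustrative, not from the original snippet). Note that
# nltk's corpus_bleu expects token lists; with the raw strings that
# rouge.get_scores requires, the BLEU side degrades to character n-grams.
hyps = ["the cat sat on the mat", "a dog barked loudly"]
refs = ["the cat is on the mat", "the dog barked"]
rouge_scores, bleu_scores = generation_metrics(hyps, refs)
print(rouge_scores["rouge-l"]["f"], bleu_scores["bleu1"])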
import numpy as np
from rouge import Rouge


def rouge_eval(hyps, refer):
    rouge = Rouge()
    try:
        score = rouge.get_scores(hyps, refer)[0]
        mean_score = np.mean([score["rouge-1"]["f"],
                              score["rouge-2"]["f"],
                              score["rouge-l"]["f"]])
    except ValueError:
        # rouge raises ValueError on an empty hypothesis or reference;
        # the original used a bare except.
        mean_score = 0.0
    return mean_score
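# Quick check (illustrative): rouge_eval returns the mean of the three
# F-scores for a single pair, and 0.0 when scoring fails.
print(rouge_eval("the cat sat on the mat", "the cat is on the mat"))
print(rouge_eval("", "the cat is on the mat"))  # empty hypothesis -> 0.0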
from rouge import Rouge


def get_rouge_scores(generated_summary, ref_summary):
    rouge = Rouge()
    r_scores = rouge.get_scores(generated_summary, ref_summary)
    scores = {}
    # Iterate as `metric` rather than reusing `rouge`, which shadowed the
    # Rouge instance in the original.
    for metric, outputs in r_scores[0].items():
        temp_output = {}
        for output, value in outputs.items():
            temp_output[output] = round(value, 3)
        scores[metric] = temp_output
    return scores
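# Example (illustrative): each metric maps to precision/recall/F rounded to
# three decimals, e.g. {'rouge-1': {'r': ..., 'p': ..., 'f': ...}, ...}.
print(get_rouge_scores("the cat sat on the mat", "the cat is on the mat"))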
def score(ref, hypo):
    # Assumes the pycocoevalcap-style Rouge, whose compute_score() returns
    # (average_score, per-instance scores) -- not the `rouge` pip package.
    scorer = Rouge()
    method = "ROUGE_L"
    final_scores = {}
    score, scores = scorer.compute_score(ref, hypo)
    if isinstance(score, list):
        # Defensive branch for scorers that return one score per method
        # (e.g. BLEU); ROUGE_L returns a single float, so this path is
        # normally unused. The original zipped `score` with itself.
        for m, s in zip([method], score):
            final_scores[m] = s
    else:
        final_scores[method] = score
    return final_scores
def predict(self, test_data, beam_search=True):
    """
    :param test_data: test data loader
    :param beam_search: whether to use beam search; otherwise greedy search
    :return:
    """
    rouge = Rouge()
    f1_r1, f1_r2, f1_rl = [], [], []
    print("Generating summaries...")
    sleep(0.5)
    self.model.eval()
    count = 0
    with torch.no_grad():  # batch_sz = 1
        for batch_data in tqdm(test_data):
            # (batch_sz, src_len), (batch_sz, src_len, 1), (batch_sz, tgt_len)
            src_vec, src_mask, tgt_vec = [
                data.to(self.device) for data in batch_data[:-1]
            ]
            oov_dict = batch_data[-1]
            # (batch_sz, src_len, 2*hidden_sz), (batch_sz, 2*hidden_sz)
            enc_output, enc_hidden = self.model.encoder(
                replace_oov(src_vec, self.vocab, self.device))
            # (2, batch_sz, hidden_sz) -> (1, batch_sz, hidden_sz)
            context_vec = self.model.reducer(enc_hidden)
            # (batch_sz, )
            dec_input = tgt_vec[:, 0].to(self.device)
            # (batch_sz, src_len, 1)
            coverage_vec = torch.zeros_like(src_vec,
                                            dtype=torch.float).unsqueeze(2)
            # Greedy search is beam search with beam width 1.
            width = self.beam_width if beam_search else 1
            summary = self.beam_search(dec_input, context_vec, coverage_vec,
                                       enc_output, src_mask, src_vec,
                                       oov_dict, width)
            sentence = id2output(summary, self.vocab, oov_dict[0])
            ref = id2output(tgt_vec.squeeze().cpu().tolist(), self.vocab,
                            oov_dict[0])
            score = rouge.get_scores(sentence, ref)[0]
            f1_r1.append(score['rouge-1']['f'])
            f1_r2.append(score['rouge-2']['f'])
            f1_rl.append(score['rouge-l']['f'])
            if (count + 1) % 100 == 0:
                print('\r', ref)
                print(sentence, '\n')
            count += 1
    return map(np.mean, (f1_r1, f1_r2, f1_rl))
def _eval_epoch(sess, mode, epoch_no):
    """
    This function is the same as _eval_epoch() in
    baseline_seq2seq_attn_main.py.
    """
    if mode == 'val':
        data_iterator.switch_to_val_data(sess)
    else:
        data_iterator.switch_to_test_data(sess)
    refs, hypos = [], []
    while True:
        try:
            fetches = [
                batch['target_text'][:, 1:],
                infer_outputs.predicted_ids[:, :, 0]
            ]
            feed_dict = {tx.global_mode(): tf.estimator.ModeKeys.EVAL}
            target_texts_ori, output_ids = sess.run(fetches,
                                                    feed_dict=feed_dict)
            target_texts = tx.utils.strip_special_tokens(
                target_texts_ori.tolist(), is_token_list=True)
            target_texts = tx.utils.str_join(target_texts)
            output_texts = tx.utils.map_ids_to_strs(
                ids=output_ids, vocab=val_data.target_vocab)
            tx.utils.write_paired_text(
                target_texts, output_texts,
                log_dir + mode + '_results' + str(epoch_no) + '.txt',
                append=True, mode='h', sep=' ||| ')
            for hypo, ref in zip(output_texts, target_texts):
                if config_data.eval_metric == 'bleu':
                    hypos.append(hypo)
                    refs.append([ref])
                elif config_data.eval_metric == 'rouge':
                    hypos.append(tx.utils.compat_as_text(hypo))
                    refs.append(tx.utils.compat_as_text(ref))
        except tf.errors.OutOfRangeError:
            break
    if config_data.eval_metric == 'bleu':
        return tx.evals.corpus_bleu_moses(list_of_references=refs,
                                          hypotheses=hypos)
    elif config_data.eval_metric == 'rouge':
        rouge = Rouge()
        return rouge.get_scores(hyps=hypos, refs=refs, avg=True)
import os
import statistics as sta

from rouge import Rouge


def main():
    hyp = 'hyp'
    raw_ref = 'abstracts'
    FJoin = os.path.join
    files_hyp = [FJoin(hyp, f) for f in os.listdir(hyp)]
    # Reference files are matched to hypotheses by filename.
    files_raw_ref = [FJoin(raw_ref, f) for f in os.listdir(hyp)]
    f_hyp = []
    f_raw_ref = []
    print("number of documents: ", len(files_hyp))
    for file in files_hyp:
        with open(file) as f:
            f_hyp.append(f.read())
    for file in files_raw_ref:
        with open(file) as f:
            f_raw_ref.append(f.read())
    rouge_1_tmp = []
    rouge_2_tmp = []
    rouge_L_tmp = []
    rouge = Rouge()
    for hyp, ref in zip(f_hyp, f_raw_ref):
        try:
            scores = rouge.get_scores(hyp, ref, avg=True)
            # F-measure; switch "f" to "r" for recall-only scores.
            rouge_1_tmp.append(scores["rouge-1"]["f"])
            rouge_2_tmp.append(scores["rouge-2"]["f"])
            rouge_L_tmp.append(scores["rouge-l"]["f"])
            print(scores)
        except Exception:
            # Skip pairs that rouge cannot score (e.g. empty text).
            pass
    rouge_1_avg = sta.mean(rouge_1_tmp)
    rouge_2_avg = sta.mean(rouge_2_tmp)
    rouge_L_avg = sta.mean(rouge_L_tmp)
    print("total files: ", len(f_hyp))
    print(len(rouge_1_tmp))
    print('Rouge-1')
    print(rouge_1_avg)
    print('Rouge-2')
    print(rouge_2_avg)
    print('Rouge-L')
    print(rouge_L_avg)
def compute_rouge_python(cand, ref, is_input_files=False, language="en"):
    """
    Computes ROUGE scores using the python package
    (https://pypi.org/project/py-rouge/).

    Args:
        cand (list or str): If `is_input_files` is `False`, `cand` is a list
            of strings containing predicted summaries. If `is_input_files`
            is `True`, `cand` is the path to the file containing the
            predicted summaries.
        ref (list or str): If `is_input_files` is `False`, `ref` is a list
            of strings containing reference summaries. If `is_input_files`
            is `True`, `ref` is the path to the file containing the
            reference summaries.
        is_input_files (bool, optional): If True, inputs are file names.
            Otherwise, inputs are lists of predicted and reference
            summaries. Defaults to False.
        language (str, optional): Language of the input text. Supported
            values are "en" and "hi". Defaults to "en".

    Returns:
        dict: Dictionary of ROUGE scores.
    """
    supported_languages = ["en", "hi"]
    if language not in supported_languages:
        raise Exception(
            "Language {0} is not supported. Supported languages are: {1}.".format(
                language, supported_languages))

    if is_input_files:
        candidates = [line.strip() for line in open(cand, encoding="utf-8")]
        references = [line.strip() for line in open(ref, encoding="utf-8")]
    else:
        candidates = cand
        references = ref

    print("Number of candidates: {}".format(len(candidates)))
    print("Number of references: {}".format(len(references)))
    assert len(candidates) == len(references)

    if language == "en":
        evaluator = Rouge()
    else:
        evaluator = RougeExt(
            metrics=["rouge-n", "rouge-l"],
            max_n=2,
            limit_length=False,
            apply_avg=True,
            language=language,
        )
    scores = evaluator.get_scores(candidates, references)
    return scores
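# Usage sketch (illustrative; file names are hypothetical): list mode takes
# parallel lists of summaries, file mode takes one summary per line.
scores = compute_rouge_python(
    ["the cat sat on the mat"], ["the cat is on the mat"])
# scores = compute_rouge_python("cand.txt", "ref.txt", is_input_files=True)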
def evaluate(model, data, model_name='trs', ty='valid', verbose=True):
    hyp_g, ref, r1, r2, rl, r_avg = [], [], [], [], [], []
    t = Translator(model)
    rouge = Rouge()
    l, loss = [], None
    pbar = tqdm(enumerate(data), total=len(data))
    for j, batch in pbar:
        if ty != "test":
            loss = model.train_one_batch(batch, train=False)
            l.append(loss.item())
        if (j <= 1 and ty != "test") or ty == "test":
            if ty != 'test':
                # 1-decoder generation, for testing.
                sent_g = model.decoder_greedy(batch)
            else:
                # 2-decoder generation.
                sent_g = model.eval_one_batch(batch)
            # sent_b, _ = t.translate_batch(batch)  # beam search
            for i, sent in enumerate(sent_g):
                hyp_g.append(sent)
                ref.append(batch["target_txt"][i])
                rouges = rouge.get_scores(sent, batch["target_txt"][i])[0]  # (hyp, ref)
                r1_val = rouges['rouge-1']["f"]
                r2_val = rouges['rouge-2']["f"]
                rl_val = rouges['rouge-l']["f"]
                r1.append(r1_val)
                r2.append(r2_val)
                rl.append(rl_val)
                r_avg.append(np.mean([r1_val, r2_val, rl_val]))
            pbar.set_description("EVAL loss:{:.4f} r_avg:{:.2f}".format(
                np.mean(l), np.mean(r_avg)))
        if j > 1 and ty == "train":
            break
    if l:
        loss = np.mean(l)
    r_avg = np.mean(r_avg)
    r1 = np.mean(r1)
    r2 = np.mean(r2)
    rl = np.mean(rl)
    if verbose:
        print("\nEVAL loss: {:.4f} r_avg: [{:.2f}] r1: {:.2f} r2: {:.2f} rl: {:.2f}"
              .format(loss, r_avg, r1, r2, rl))
        for hyp, gold in zip(hyp_g, ref):
            print("HYP: ")
            print(hyp)
            print("GOLD: ")
            print(gold)
    return loss, r_avg
def eval(gt_text, arg_text, non_arg_text=None):
    rouge = Rouge()
    if non_arg_text:
        length_arg = len(arg_text)
        length_no_arg = len(non_arg_text)
        fpr_values = []
        # Sweep the argumentative share of a ~660-character budget.
        for arg_length in [220, 330, 440]:
            ratio_arg = arg_length / length_arg
            ratio_no_arg = (660 - arg_length) / length_no_arg
            if ratio_arg > 0.3:
                summary_arg = extractive_summary(arg_text, min(ratio_arg, 1))
            else:
                summary_arg = extractive_summary(arg_text, ratio_arg, 20, 200)
            summary_no_arg = extractive_summary(non_arg_text, ratio_no_arg,
                                                20, 200)
            summary = summary_no_arg + summary_arg
            score = rouge.get_scores(summary, gt_text)
            print(summary)
            print(score[0]['rouge-1'])
            sco = score[0]['rouge-1']
            fpr_values.append(sco['f'])
            fpr_values.append(sco['p'])
            fpr_values.append(sco['r'])
        return fpr_values
    else:
        summary = arg_text
        # length = len(arg_text)
        # ratio = 665 / length
        # if ratio > 0.3:
        #     summary = extractive_summary(arg_text, min(ratio, 1))
        # else:
        #     summary = extractive_summary(arg_text, ratio, 20, 200)
        print(summary)
        score = rouge.get_scores(summary, gt_text)
        print(score)
        sco = score[0]['rouge-1']
        return sco['f'], sco['p'], sco['r']
from rouge import Rouge


def get_rouge_score(result_rouge_list):
    # Per line: index 0 is the input sentence, index 1 the output sentence,
    # indices 2-11 the generated sentences, index 12 the ground-truth NLL.
    # The F-measure ("f") is generally used.
    rouge_1_f = 0.0
    rouge_2_f = 0.0
    rouge_l_f = 0.0
    rouge = Rouge()
    for line in result_rouge_list:
        rouge_score = rouge.get_scores(hyps=line[2], refs=line[1])
        rouge_1_f += rouge_score[0]["rouge-1"]['f']
        rouge_2_f += rouge_score[0]["rouge-2"]['f']
        rouge_l_f += rouge_score[0]["rouge-l"]['f']
    print("rouge_1_f : ", rouge_1_f / len(result_rouge_list))
    print("rouge_2_f : ", rouge_2_f / len(result_rouge_list))
    print("rouge_l_f : ", rouge_l_f / len(result_rouge_list))
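# Example (illustrative): each line is a list of strings where index 1 holds
# the reference and index 2 the first generated sentence.
get_rouge_score([
    ["some input", "the cat is on the mat", "the cat sat on the mat"],
])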
def __init__(self, model, test_data, key, batch=100, device=-1,
             max_length=100):
    self.model = model
    self.test_data = test_data
    self.key = key
    self.batch = batch
    self.device = device
    self.max_length = max_length
    self.rouge = Rouge()
from rouge import Rouge


def calculate_rouge(hypothesis, reference):
    """Eval method; supports list[str] inputs (token lists)."""
    rouge = Rouge()
    # The rouge package expects whitespace-separated strings, so join the
    # token lists before scoring.
    hypothesis = [" ".join(h) for h in hypothesis]
    reference = [" ".join(r) for r in reference]
    scores = rouge.get_scores(hypothesis, reference, avg=True)
    return scores
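# Example (illustrative): inputs are token lists, which calculate_rouge
# joins into the whitespace-separated strings the rouge package expects.
hyp_tokens = [["the", "cat", "sat", "on", "the", "mat"]]
ref_tokens = [["the", "cat", "is", "on", "the", "mat"]]
print(calculate_rouge(hyp_tokens, ref_tokens)["rouge-1"]["f"])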
import argparse
import json
import os

from rouge import FilesRouge, Rouge


def main():
    parser = argparse.ArgumentParser(description='Rouge Metric Calculator')
    parser.add_argument('-f', '--file', help="File mode", action='store_true')
    parser.add_argument('-a', '--avg', help="Average mode",
                        action='store_true')
    parser.add_argument('--ignore_empty', action='store_true',
                        help="Ignore empty hypothesis")
    parser.add_argument('hypothesis', type=str, help='Text or file path')
    parser.add_argument('reference', type=str, help='Text or file path')
    parser.add_argument("--metrics", nargs="+", type=str.upper,
                        choices=METRICS_CHOICES.keys(),
                        help="Metrics to use (default=all)")
    parser.add_argument("--stats", nargs="+", type=str.upper,
                        choices=STATS_CHOICES,
                        help="Stats to use (default=all)")

    args = parser.parse_args()

    metrics = args.metrics
    stats = args.stats

    if metrics is not None:
        metrics = [METRICS_CHOICES[m] for m in args.metrics]

    if args.file:
        hyp, ref = args.hypothesis, args.reference
        assert os.path.isfile(hyp)
        assert os.path.isfile(ref)

        files_rouge = FilesRouge(hyp, ref, metrics, stats)
        scores = files_rouge.get_scores(avg=args.avg,
                                        ignore_empty=args.ignore_empty)
        print(json.dumps(scores, indent=2))
    else:
        hyp, ref = args.hypothesis, args.reference
        assert type(hyp) == str
        assert type(ref) == str

        rouge = Rouge(metrics, stats)
        scores = rouge.get_scores(hyp, ref, avg=args.avg)
        print(json.dumps(scores, indent=2))
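# Invocation sketch (illustrative; the script name and file paths are
# hypothetical):
#   python rouge_cli.py "hypothesis text" "reference text" --avg
#   python rouge_cli.py -f hyp.txt ref.txt --avg --ignore_empty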
def __init__(self, embedding_sz=5):
    self.encoder_model = SentenceTransformer('bert-base-nli-mean-tokens')
    self.rouge = Rouge()
    self.cluster_n = 5
    self.embedding_sz = embedding_sz
    self.kmeans = KMeans(n_clusters=self.cluster_n)
    self.cv = CountVectorizer(strip_accents='ascii',
                              token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b',
                              lowercase=True,
                              stop_words='english')
    self.legal_classes = {}
    self.stop_words = set(stopwords.words('english'))
    self.naive_bayes_model = MultinomialNB()
    self.legal_class_list = []
def test_rouge(temp_dir, cand, ref):
    candidates = [line.strip() for line in open(cand, encoding="utf-8")]
    references = [line.strip() for line in open(ref, encoding="utf-8")]
    assert len(candidates) == len(references)

    cnt = len(candidates)
    current_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
    tmp_dir = os.path.join(temp_dir, "rouge-tmp-{}".format(current_time))
    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir)
        os.mkdir(tmp_dir + "/candidate")
        os.mkdir(tmp_dir + "/reference")
    try:
        # The candidate/reference files written here are never read back;
        # scoring below uses the in-memory lists.
        for i in range(cnt):
            if len(references[i]) < 1:
                continue
            with open(tmp_dir + "/candidate/cand.{}.txt".format(i), "w",
                      encoding="utf-8") as f:
                f.write(candidates[i])
            with open(tmp_dir + "/reference/ref.{}.txt".format(i), "w",
                      encoding="utf-8") as f:
                f.write(references[i])
        candidates_, references_ = [], []
        for cand, ref in zip(candidates, references):
            if len(cand) * len(ref) != 0:
                # "<q>" separates sentences; replace it with plain spaces.
                cand = " ".join(cand.split("<q>"))
                ref = " ".join(ref.split("<q>"))
                candidates_.append(cand)
                references_.append(ref)
        # Calculate the ROUGE scores.
        logger.info(
            "| Calculating rouge score on {} candidate-reference pairs ...".format(
                len(candidates_)))
        rouge = Rouge()
        rouge_result_dict = rouge.get_scores(candidates_, references_,
                                             avg=True)
    finally:
        if os.path.isdir(tmp_dir):
            shutil.rmtree(tmp_dir)
    return rouge_result_dict
def eval(model, iter):
    model.eval()
    total_loss = 0
    rouge_scores = []
    loss_list = []
    r = Rouge()
    step = 0
    with torch.no_grad():
        for i, b in enumerate(iter):
            a = b.ARTICLE
            s = b.SUMMARY
            output, mu, logvar, z = model(a, s)
            out = output[1:].view(-1, output.shape[-1])
            summary = s[1:].view(-1)
            NLL, KL, w = model.KL_loss(out.float(), mu, logvar, summary, step)
            step += 1
            loss = NLL + w * KL
            total_loss += loss.item()
            if i % 10 == 0:
                # print("Loss at iteration " + str(i) + ": ", loss.item())
                loss_list.append(loss.item())
        # ROUGE is computed on the last batch only; a superseded draft of
        # this block (commented out in the original) has been dropped.
        preds = convert_to_text(output, model.text.vocab)
        labels = convert_to_text(s, model.text.vocab, False)
        preds, labels = detokenize(preds, labels)
        for i in range(len(preds)):
            rouge_scores.append(
                [i, preds[i], labels[i], r.get_scores(preds[i], labels[i])])
    return total_loss / len(iter), loss_list, rouge_scores
def calc_metrics(refs, hyps, metric):
    print("Count:", len(hyps))
    print("Ref:", refs[-1])
    print("Hyp:", hyps[-1])
    if metric in ("bleu", "all"):
        from nltk.translate.bleu_score import corpus_bleu
        # Wrap single references in a list. The original's `r is not list`
        # was an identity check against the `list` type and never matched.
        print("BLEU: ", corpus_bleu(
            [[r] if not isinstance(r, list) else r for r in refs], hyps))
    if metric == "legacy_rouge":
        print(calc_legacy_rouge(refs, hyps))
    if metric in ("rouge", "all"):
        rouge = Rouge()
        scores = rouge.get_scores(hyps, refs, avg=True)
        print("ROUGE: ", scores)
def cnndm_eval(ratio):
    """Evaluation for the CNN/DailyMail dataset."""
    for func_name in BASELINES:
        print("Evaluating \"{}\"".format(func_name))
        print("=" * 20)
        with open(str(RESULTS_DIR / "cnndm_baseline_{}_{}.pred".format(
                func_name, int(ratio * 100)))) as fin:
            predictions = fin.read().split("\n")[:-1]
        with open(str(DATA_DIR / "cnndm" / "test.txt.tgt.tagged")) as fin:
            references = fin.read().replace("<t> ", "").replace(
                "</t> ", "").split("\n")[:-1]
        assert all([len(x) > 0 for x in predictions])
        scores = Rouge().get_scores(predictions, references, avg=True)
        pprint(scores)
def evaluate(val_data, train_data, model, encoder_layer, decoder_layer,
             n_output_tokens):
    rouge = Rouge()
    input_texts, target_texts, input_sequences, target_sequences = train_data
    # Unpack target texts/sequences from positions 1 and 3; the original
    # grabbed positions 0 and 2, i.e. the input side.
    _, train_target_texts, _, train_target_sequences = train_data
    max_decoder_seq_length = max([len(txt) for txt in train_target_texts])
    target_token_index = dict(
        [(char, i) for i, char in enumerate(train_target_sequences)])
    reverse_target_token_index = dict(
        (i, char) for char, i in target_token_index.items())
    X1_test, X2_test, y_test = preprocessing(train_data)

    # Example runs.
    for idx in range(3):
        input_seq = X1_test[idx:idx + 1]
        predicted = predict_sequence(
            encoder_layer,
            decoder_layer,
            input_seq,
            n_output_tokens,
            reverse_target_token_index,
            max_decoder_seq_length,
        )
        print("- -- -")
        # print("Input sentence:", "\n".join(input_texts[idx]))
        print("Ground truth sentence:", "\n".join(target_texts[idx]))
        print("Decoded sentence:", predicted)
        print("- -- -")

    # Evaluate the LSTM.
    total = len(X1_test)
    refs = []
    hyps = []
    for idx in range(total):
        input_seq = X1_test[idx:idx + 1]
        predicted_summ = predict_sequence(
            encoder_layer,
            decoder_layer,
            input_seq,
            n_output_tokens,
            reverse_target_token_index,
            max_decoder_seq_length,
        )
        # Score against the target summaries; the original appended the
        # *input* texts, scoring summaries against the source articles.
        refs.append("\n".join(target_texts[idx]))
        hyps.append(predicted_summ)
    rouge_scores = rouge.get_scores(hyps, refs, avg=True)
    print("Average ROUGE Score: {}".format(rouge_scores))
import os
import statistics as sta

from rouge import Rouge


def main():
    hyp = 'hyp'
    raw_ref = 'duc2002_summaries_I'
    FJoin = os.path.join
    files_hyp = [FJoin(hyp, f) for f in os.listdir(hyp)]
    # Reference files are matched to hypotheses by filename.
    files_raw_ref = [FJoin(raw_ref, f) for f in os.listdir(hyp)]
    f_hyp = []
    f_raw_ref = []
    print("number of documents: ", len(files_hyp))
    for file in files_hyp:
        with open(file) as f:
            f_hyp.append(f.read())
    for file in files_raw_ref:
        with open(file) as f:
            f_raw_ref.append(f.read())
    rouge_1_tmp = []
    rouge_2_tmp = []
    rouge_L_tmp = []
    rouge = Rouge()
    for hyp, ref in zip(f_hyp, f_raw_ref):
        try:
            scores = rouge.get_scores(hyp, ref, avg=True)
            # Recall; this script reports "r" rather than "f".
            rouge_1_tmp.append(scores["rouge-1"]["r"])
            rouge_2_tmp.append(scores["rouge-2"]["r"])
            rouge_L_tmp.append(scores["rouge-l"]["r"])
            print(scores)
        except Exception:
            # Skip pairs that rouge cannot score (e.g. empty text).
            pass
    rouge_1_avg = sta.mean(rouge_1_tmp)
    rouge_2_avg = sta.mean(rouge_2_tmp)
    rouge_L_avg = sta.mean(rouge_L_tmp)
    print('Rouge-1')
    print(rouge_1_avg)
    print('Rouge-2')
    print(rouge_2_avg)
    print('Rouge-L')
    print(rouge_L_avg)
def _eval_epoch(mode):
    if mode == 'val':
        data_iterator.switch_to_val_data()
    else:
        data_iterator.switch_to_test_data()
    model.eval()
    refs, hypos = [], []
    evalStart = 0
    for batch in data_iterator:
        infer_outputs = model(batch, mode="val")
        output_ids = infer_outputs["sample_id"][:, :, 0].cpu()
        target_texts_ori = [text[1:] for text in batch['target_text']]
        target_texts = tx.utils.strip_special_tokens(target_texts_ori,
                                                     is_token_list=True)
        output_texts = tx.data.vocabulary.map_ids_to_strs(
            ids=output_ids, vocab=val_data.target_vocab)
        if evalStart == 0:
            # Print a sanity check on the first batch only.
            src_words = model.source_embedder(batch['source_text_ids'])
            print('src wrds', src_words[0, 0:2, 0:10].flatten(),
                  target_texts[0])
            evalStart = 1
        for hypo, ref in zip(output_texts, target_texts):
            # ADD ROUGE
            if config_data.eval_metric == 'bleu':
                hypos.append(hypo)
                refs.append([ref])
            elif config_data.eval_metric == 'rouge':
                # rouge.get_scores() rejects empty hypotheses, so substitute
                # a single space.
                hh = str(hypo)
                if len(hh) == 0:
                    hh = " "
                rr = ' '.join(ref)
                hypos.append(hh)
                refs.append(rr)
    # ADD ROUGE
    if config_data.eval_metric == 'bleu':
        return tx.evals.corpus_bleu_moses(list_of_references=refs,
                                          hypotheses=hypos)
    elif config_data.eval_metric == 'rouge':
        rouge = Rouge()
        print('HH', type(hypos), type(hypos[0]), hypos[0])
        print('RR', type(refs), type(refs[0]), refs[0])
        return rouge.get_scores(hyps=hypos, refs=refs, avg=True)
import glob

from rouge import Rouge


def rouge_eval2(ref_dir, dec_dir):
    # Pair reference and decoded files by their sorted directory order.
    output = []
    for f in sorted(glob.glob(ref_dir + '/*')):
        with open(f, 'r') as file:
            output.append({'ref': ' '.join(file.readlines())})
    for idx, f in enumerate(sorted(glob.glob(dec_dir + '/*'))):
        with open(f, 'r') as file:
            output[idx]['hyp'] = ' '.join(file.readlines())
    hyps, refs = map(list, zip(*[[d['hyp'], d['ref']] for d in output]))
    rouge = Rouge()
    return rouge.get_scores(hyps, refs, avg=True)
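# Usage sketch (illustrative; directory names are hypothetical). Both
# directories must contain the same number of files, paired by sort order.
# scores = rouge_eval2("data/reference", "data/decoded")
# print(scores["rouge-l"]["f"])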
from rouge import Rouge


def _get_rouge_score(sentence_a, sentence_b):
    rouge = Rouge()
    # Relative weights for rouge-1, rouge-2, and rouge-l respectively.
    weight = [0.33, 0.33, 0.33]
    # Insert spaces between characters so unsegmented (e.g. Chinese) text
    # tokenizes.
    sentence_a = " ".join(sentence_a)
    sentence_b = " ".join(sentence_b)
    try:
        scores = rouge.get_scores(sentence_a, sentence_b, avg=True)
    except Exception:
        return 0
    rouge_1 = scores["rouge-1"]
    rouge_2 = scores["rouge-2"]
    rouge_l = scores["rouge-l"]
    # Weight each metric by its own entry; the original multiplied all
    # three by weight[0].
    f = (rouge_1["f"] * weight[0] + rouge_2["f"] * weight[1]
         + rouge_l["f"] * weight[2])
    return f
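# Example (illustrative): with equal weights this is just a scaled mean of
# the three F-scores.
print(_get_rouge_score("abcdef", "abcdef"))  # identical strings -> ~0.99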
from rouge import Rouge


def cal_Rouge(results, examples):
    rouge = Rouge(metrics=["rouge-l"])
    pred_answers = []
    answers = []
    for example in examples:
        pred_answer = results[example.qid][1]
        answers.append(example.answer)
        pred_answers.append(pred_answer)
    scores = rouge.get_scores(pred_answers, answers, avg=True)
    Rouge_L = scores["rouge-l"]["f"]
    P = scores["rouge-l"]["p"]
    R = scores["rouge-l"]["r"]
    return Rouge_L, P, R
from rouge import Rouge


def eval(preds, targets, avg=True):
    rouge = Rouge()
    scores = rouge.get_scores(preds, targets, avg)
    rouge2_f_metric = scores['rouge-2']['f']
    rouge2_p_metric = scores['rouge-2']['p']
    rouge2_r_metric = scores['rouge-2']['r']
    rougel_f_metric = scores['rouge-l']['f']
    rougel_p_metric = scores['rouge-l']['p']
    rougel_r_metric = scores['rouge-l']['r']
    # Return the six stats in a consistent (f, p, r) order per metric; the
    # original return repeated rouge2_f and dropped rougel_r entirely.
    return (rouge2_f_metric, rouge2_p_metric, rouge2_r_metric,
            rougel_f_metric, rougel_p_metric, rougel_r_metric)
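# Usage sketch (illustrative): unpack in the fixed (f, p, r) order per metric.
r2_f, r2_p, r2_r, rl_f, rl_p, rl_r = eval(
    ["the cat sat on the mat"], ["the cat is on the mat"])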
def get_metric(self, reset=True):
    # Note: `referece` (sic) is the attribute name used elsewhere in this
    # class, so it is kept as-is.
    logger.info("[INFO] Hyps and Refer number is %d, %d",
                len(self.prediction), len(self.referece))
    if len(self.prediction) == 0 or len(self.referece) == 0:
        logger.error("During testing, no hyps or refs were selected!")
        return
    rouge = Rouge()
    scores_all = rouge.get_scores(self.prediction, self.referece, avg=True)
    if reset:
        self.prediction = []
        self.referece = []
    logger.info(scores_all)
    scores_all = remend_score(scores_all)
    return scores_all
from rouge import Rouge


def rouge_score(data):
    rouge = Rouge()
    scores = {'rouge-1': [], 'rouge-2': [], 'rouge-l': []}
    for refs, sys in data:
        # Score the system output against every reference for this example.
        for score in rouge.get_scores([sys] * len(refs), refs):
            scores['rouge-1'].append(score['rouge-1']['f'])
            scores['rouge-2'].append(score['rouge-2']['f'])
            scores['rouge-l'].append(score['rouge-l']['f'])
    n = len(scores['rouge-1'])
    if n > 0:
        return (sum(scores['rouge-1']) / n,
                sum(scores['rouge-2']) / n,
                sum(scores['rouge-l']) / n)
    return 0, 0, 0
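# Example (illustrative): `data` is an iterable of (references, system)
# pairs; the system output is scored against each of its references.
data = [(["the cat is on the mat", "a cat sits on the mat"],
         "the cat sat on the mat")]
print(rouge_score(data))  # -> (mean rouge-1 F, mean rouge-2 F, mean rouge-l F)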
import pprint

from rouge import Rouge
from tqdm import tqdm


def evaluate(params):
    gen = test(params)
    reals = []
    preds = []
    with tqdm(total=params["max_num_to_eval"], position=0, leave=True) as pbar:
        for i in range(params["max_num_to_eval"]):
            trial = next(gen)
            reals.append(trial.real_abstract)
            preds.append(trial.abstract)
            pbar.update(1)
    r = Rouge()
    scores = r.get_scores(preds, reals, avg=True)
    print("\n\n")
    pprint.pprint(scores)
from rouge import Rouge


def compute_rouge_scores(pred_seq, target_seq):
    """
    :param pred_seq: Predicted sequence
    :param target_seq: Target sequence
    :return: a pair (rouge_2, rouge_l) containing the rouge-2 and rouge-l
        scores for pred_seq and target_seq
    """
    rouge = Rouge()
    pred_seq_str = ' '.join([str(x) for x in pred_seq])
    # Keep the original parameter intact rather than overwriting it.
    target_seq_str = ' '.join([str(x) for x in target_seq])
    scores = rouge.get_scores(pred_seq_str, target_seq_str)
    return scores[0]['rouge-2']['f'], scores[0]['rouge-l']['f']
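# Example (illustrative): sequences may be token ids; str() plus a space
# join turns them into the strings rouge.get_scores expects.
print(compute_rouge_scores([3, 8, 2, 15], [3, 8, 2, 9]))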