# rouge_n / rouge_l appear to come from the rougescore package;
# Rouge is the ROUGE-L scorer from pycocoevalcap (coco-caption).
def cal_ROUGE(generated, reference, is_corpus=False):
    # ref and sample are both dict
    # scorers = [
    #     (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
    #     (Meteor(), "METEOR"),
    #     (Rouge(), "ROUGE_L"),
    #     (Cider(), "CIDEr")
    # ]
    # returns ROUGE-1..4, ROUGE-L, and ROUGE-L as computed by pycocoevalcap
    ROUGEscore = [0.0] * 6
    for idx, g in enumerate(generated):
        score = [0.0] * 6
        if is_corpus:
            # score each generated sentence against every reference in the corpus
            for order in range(4):
                score[order] = rouge_n(g.split(), [x.split() for x in reference[0]], order + 1, 0.5)
            score[4] = rouge_l(g.split(), [x.split() for x in reference[0]], 0.5)
            score[5], _ = Rouge().compute_score(reference, {0: [g]})
        else:
            # score each generated sentence against its aligned reference only
            for order in range(4):
                score[order] = rouge_n(g.split(), [reference[0][idx].split()], order + 1, 0.5)
            score[4] = rouge_l(g.split(), [reference[0][idx].split()], 0.5)
            score[5], _ = Rouge().compute_score({0: [reference[0][idx]]}, {0: [g]})
        # accumulate per-sentence scores without shadowing the loop variable `idx`
        ROUGEscore = [r + s for r, s in zip(ROUGEscore, score)]
    # average over all generated sentences
    ROUGEscore = [r / len(generated) for r in ROUGEscore]
    return ROUGEscore
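# A minimal usage sketch (not part of the original code); the data is
# illustrative and mirrors how cal_ROUGE indexes its arguments:
# reference[0][i] is the reference string for generated[i].
if __name__ == "__main__":
    generated = ["the cat sat on the mat", "a dog runs in the park"]
    reference = {0: ["the cat is sitting on the mat", "a dog is running in the park"]}
    print(cal_ROUGE(generated, reference, is_corpus=False))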
def run_eval(sess, inputs, model):
    sess.run(inputs.initializer)
    total_losses = []
    total_sizes = []
    references = []
    candidates = []
    while True:
        try:
            loss, true_word, sample_word = sess.run(
                [model.loss, model.true_word, model.sample_word])
            total_losses.append(loss)
            total_sizes.append(len(sample_word))
            candidates.append(decode_for_human(sample_word, EOS, join=False))
            references.append(decode_for_human(true_word, EOS, join=False))
        except tf.errors.OutOfRangeError:
            # input iterator exhausted: aggregate the loss and report metrics
            avg_loss = get_avg_loss(total_losses, total_sizes)
            print("Evaluation done. Avg eval loss: %f" % avg_loss)
            candidates = list(chain(*candidates))
            references = list(chain(*references))
            # TODO: add BLEU
            rouge_1_recall, rouge_1_F1 = list(
                np.mean([
                    rouge_1(c, [r], [0, 0.5])
                    for c, r in zip(candidates, references)
                ], axis=0))
            rouge_2_recall, rouge_2_F1 = list(
                np.mean([
                    rouge_2(c, [r], [0, 0.5])
                    for c, r in zip(candidates, references)
                ], axis=0))
            rouge_L_recall, rouge_L_F1 = list(
                np.mean([
                    rouge_l(c, [r], [0, 0.5])
                    for c, r in zip(candidates, references)
                ], axis=0))
            print("rouge-1 recall: %.5f \t F1: %.5f" % (rouge_1_recall, rouge_1_F1))
            print("rouge-2 recall: %.5f \t F1: %.5f" % (rouge_2_recall, rouge_2_F1))
            print("rouge-L recall: %.5f \t F1: %.5f" % (rouge_L_recall, rouge_L_F1))
            # print a random sample of candidate/reference pairs for inspection
            for c, r in random.sample(list(zip(candidates, references)),
                                      min(20, len(candidates))):
                print('\n'.join([
                    "candidate:" + ''.join(c),
                    "reference:" + ''.join(r),
                    '------'
                ]))
            break
    return avg_loss
def _rouge_evaluation(self, predicts, answers):
    rouge_1s = []
    rouge_2s = []
    rouge_ls = []
    for predict, answer in zip(predicts, answers):
        # rename the '_UNK' placeholder in the reference so it cannot
        # match '_UNK' tokens in the prediction
        answer = [w.replace('_UNK', '_UNKNOWN') for w in answer]
        rouge_1 = rougescore.rouge_1(predict, [answer], 0.5)
        rouge_2 = rougescore.rouge_2(predict, [answer], 0.5)
        rouge_l = rougescore.rouge_l(predict, [answer], 0.5)
        rouge_1s.append(rouge_1)
        rouge_2s.append(rouge_2)
        rouge_ls.append(rouge_l)
    return {
        "rouge_1": np.mean(rouge_1s),
        "rouge_2": np.mean(rouge_2s),
        "rouge_l": np.mean(rouge_ls)
    }
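# A minimal sketch (assumption, not from the original code) of the token-list
# inputs that _rouge_evaluation expects, calling rougescore directly with the
# same pattern (peer tokens, list of reference token lists, alpha=0.5).
import rougescore

predict = ["the", "cat", "sat", "on", "the", "mat"]
answer = ["the", "cat", "is", "sitting", "on", "the", "mat"]
print(rougescore.rouge_1(predict, [answer], 0.5))
print(rougescore.rouge_l(predict, [answer], 0.5))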
def test_rouge_l(self):
    data = self.load_test_data()
    rouge = RougeCalculator(stopwords=True)
    for eval_id in data:
        summaries = data[eval_id]["summaries"]
        references = data[eval_id]["references"]
        for s in summaries:
            baseline = Pythonrouge(summary_file_exist=False,
                                   summary=[[s]],
                                   reference=[[[r] for r in references]],
                                   n_gram=1,
                                   recall_only=False,
                                   ROUGE_L=True,
                                   length_limit=True,
                                   length=50,
                                   stemming=False,
                                   stopwords=True)
            b1_v = baseline.calc_score()
            b2_v = rouge_l(rouge.tokenize(s),
                           [rouge.tokenize(r) for r in references],
                           0.5)
            v = rouge.rouge_l(s, references)
            self.assertLess(abs(b2_v - v), 1e-5)
            self.assertLess(abs(b1_v["ROUGE-L-F"] - v), 1e-5)