Example #1
from rougescore import rouge_n, rouge_l
from pycocoevalcap.rouge.rouge import Rouge


def cal_ROUGE(generated, reference, is_corpus=False):
    # `generated` is a list of hypothesis strings; `reference` is a
    # pycocoevalcap-style dict mapping an id to a list of reference strings.
    # scorers = [
    #     (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
    #     (Meteor(), "METEOR"),
    #     (Rouge(), "ROUGE_L"),
    #     (Cider(), "CIDEr")
    # ]
    # Outputs ROUGE-1 through ROUGE-4, ROUGE-L, and ROUGE-L as computed
    # by pycocoevalcap.

    ROUGEscore = [0.0] * 6
    for idx, g in enumerate(generated):
        score = [0.0] * 6
        if is_corpus:
            for order in range(4):
                score[order] = rouge_n(g.split(),
                                       [x.split() for x in reference[0]],
                                       order + 1, 0.5)
            score[4] = rouge_l(g.split(), [x.split() for x in reference[0]],
                               0.5)
            score[5], _ = Rouge().compute_score(reference, {0: [g]})

        else:
            for order in range(4):
                score[order] = rouge_n(g.split(), [reference[0][idx].split()],
                                       order + 1, 0.5)
            score[4] = rouge_l(g.split(), [reference[0][idx].split()], 0.5)
            score[5], _ = Rouge().compute_score({0: [reference[0][idx]]},
                                                {0: [g]})
        # Accumulate per-sentence scores; averaged below.
        ROUGEscore = [r + s for r, s in zip(ROUGEscore, score)]
    ROUGEscore = [r / len(generated) for r in ROUGEscore]
    return ROUGEscore
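
A hedged usage sketch for cal_ROUGE, with shapes inferred from the body (generated is a flat list of hypothesis strings; reference is a pycocoevalcap-style dict whose key 0 holds one reference string per hypothesis). The names and sentences are illustrative, not from the original project:

# Usage sketch (shapes inferred from the function body; the original
# project may structure `reference` differently):
generated = ["the cat sat on the mat", "a dog barked loudly"]
reference = {0: ["the cat is sitting on the mat", "the dog barked"]}

# Sentence-level mode: hypothesis i is scored against reference[0][i].
scores = cal_ROUGE(generated, reference)
# scores = [ROUGE-1, ROUGE-2, ROUGE-3, ROUGE-4, ROUGE-L,
#           pycocoevalcap ROUGE-L], each averaged over the hypotheses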
Example #2
import random
from itertools import chain

import numpy as np
import tensorflow as tf

# decode_for_human, get_avg_loss, EOS, and the list-alpha rouge_1 /
# rouge_2 / rouge_l helpers are project-local and assumed to be in scope.


def run_eval(sess, inputs, model):
    sess.run(inputs.initializer)
    total_losses = []
    total_sizes = []
    references = []
    candidates = []

    while True:
        try:
            loss, true_word, sample_word = sess.run(
                [model.loss, model.true_word, model.sample_word])
            total_losses.append(loss)
            total_sizes.append(len(sample_word))
            candidates.append(decode_for_human(sample_word, EOS, join=False))
            references.append(decode_for_human(true_word, EOS, join=False))

        except tf.errors.OutOfRangeError:
            avg_loss = get_avg_loss(total_losses, total_sizes)
            print("Evaluation done. Avg eval loss: %f" % avg_loss)

            candidates = list(chain(*candidates))
            references = list(chain(*references))
            # TODO: add BLEU
            # The rouge_* helpers take a list of alpha values and return one
            # score per alpha: alpha=0 gives recall, alpha=0.5 gives F1.
            print(rouge_1(candidates[1], [references[1]], [0, 0.5]))  # spot check
            rouge_1_recall, rouge_1_F1 = list(
                np.mean([
                    rouge_1(c, [r], [0, 0.5])
                    for c, r in zip(candidates, references)
                ],
                        axis=0))
            rouge_2_recall, rouge_2_F1 = list(
                np.mean([
                    rouge_2(c, [r], [0, 0.5])
                    for c, r in zip(candidates, references)
                ],
                        axis=0))
            rouge_L_recall, rouge_L_F1 = list(
                np.mean([
                    rouge_l(c, [r], [0, 0.5])
                    for c, r in zip(candidates, references)
                ],
                        axis=0))

            print("rouge-1 recall: %.5f \t F1: %.5f" %
                  (rouge_1_recall, rouge_1_F1))
            print("rouge-2 recall: %.5f \t F1: %.5f" %
                  (rouge_2_recall, rouge_2_F1))
            print("rouge-L recall: %.5f \t F1: %.5f" %
                  (rouge_L_recall, rouge_L_F1))

            # Show a few (candidate, reference) pairs for manual inspection;
            # guard against eval sets with fewer than 20 examples.
            pairs = list(zip(candidates, references))
            for c, r in random.sample(pairs, min(20, len(pairs))):
                print('\n'.join([
                    "candidate:" + ''.join(c), "reference:" + ''.join(r),
                    '------'
                ]))

            break

    return avg_loss
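
The rouge_1/rouge_2/rouge_l used above take a list of alpha values and return one score per alpha, which is why [0, 0.5] unpacks into a recall and an F1. The stock rougescore functions instead take a scalar alpha that weights recall against precision in the F-measure: alpha=0 reduces to recall, alpha=1 to precision, alpha=0.5 to the balanced F1. A minimal sketch of that convention with the scalar API (sentences are illustrative):

# Minimal sketch of the alpha convention, using the stock scalar-alpha
# rougescore API (the list-alpha variants above look project-local):
from rougescore import rouge_1

peer = "the cat sat".split()                 # hypothesis tokens
models = ["the cat is on the mat".split()]   # reference token lists

recall = rouge_1(peer, models, 0.0)  # alpha=0   -> pure recall
f1 = rouge_1(peer, models, 0.5)      # alpha=0.5 -> balanced F1
# Two of three hypothesis unigrams match ("the", "cat") against the
# six-token reference: precision = 2/3, recall = 2/6 = 1/3, and
# F1 = p*r / (0.5*p + 0.5*r) = 4/9, roughly 0.444.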
Example #3
    # Assumes module-level: import numpy as np; import rougescore
    def _rouge_evaluation(self, predicts, answers):
        # `predicts` and `answers` are parallel lists of token lists.
        rouge_1s = []
        rouge_2s = []
        rouge_ls = []
        for predict, answer in zip(predicts, answers):
            # Rename the reference's unknown-word token, presumably so it
            # matches the token the model actually emits.
            answer = [w.replace('_UNK', '_UNKNOWN') for w in answer]

            rouge_1 = rougescore.rouge_1(predict, [answer], 0.5)
            rouge_2 = rougescore.rouge_2(predict, [answer], 0.5)
            rouge_l = rougescore.rouge_l(predict, [answer], 0.5)

            rouge_1s.append(rouge_1)
            rouge_2s.append(rouge_2)
            rouge_ls.append(rouge_l)

        return {
            "rouge_1": np.mean(rouge_1s),
            "rouge_2": np.mean(rouge_2s),
            "rouge_l": np.mean(rouge_ls)
        }
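
A hedged usage sketch for the method above. From the body, predicts and answers are parallel lists of token lists (the per-token '_UNK' replacement implies pre-tokenized input); `evaluator` is a hypothetical instance of the enclosing class:

# Usage sketch (input shapes inferred from the body; names are
# illustrative, not from the original project):
predicts = [["the", "cat", "sat"], ["a", "_UNKNOWN", "barked"]]
answers = [["the", "cat", "is", "here"], ["a", "_UNK", "barked"]]

metrics = evaluator._rouge_evaluation(predicts, answers)
# {"rouge_1": ..., "rouge_2": ..., "rouge_l": ...} -- each value is the
# mean sentence-level score over all (predict, answer) pairs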
Example #4
    def test_rouge_l(self):
        data = self.load_test_data()
        rouge = RougeCalculator(stopwords=True)
        for eval_id in data:
            summaries = data[eval_id]["summaries"]
            references = data[eval_id]["references"]
            for s in summaries:
                baseline = Pythonrouge(summary_file_exist=False,
                                       summary=[[s]],
                                       reference=[[[r] for r in references]],
                                       n_gram=1,
                                       recall_only=False,
                                       ROUGE_L=True,
                                       length_limit=True,
                                       length=50,
                                       stemming=False,
                                       stopwords=True)
                b1_v = baseline.calc_score()
                b2_v = rouge_l(rouge.tokenize(s),
                               [rouge.tokenize(r) for r in references], 0.5)
                v = rouge.rouge_l(s, references)
                self.assertLess(abs(b2_v - v), 1e-5)
                self.assertLess(abs(b1_v["ROUGE-L-F"] - v), 1e-5)