import numpy as np

# Metric helpers (Meteor, compute_bleu, compute_rouge_L, diverse_bleu,
# compute_edit_distance_v2, compute_emotion_distance, get_emotion_prob) are
# provided elsewhere in this repo.


def evaluate_simple(eval_file, answer_dict):
    """Score the top-ranked hypothesis per example with BLEU, METEOR, and ROUGE-L."""
    reference_corpus = []
    translation_corpus = []
    rouges = []
    meteor = Meteor()
    res, gts = [], []
    for key, answers in answer_dict.items():
        # Keep only the highest-scoring hypothesis for this example.
        answers = sorted(answers, key=lambda x: x[0], reverse=True)
        # Strip the boundary (GO/EOS) tokens from the reference question.
        ground_truths = [
            list(map(lambda x: x.lower(), eval_file[key]["question"][1:-1]))
        ]
        prediction = answers[0][1].lower().split()
        translation_corpus.append(prediction)
        reference_corpus.append(ground_truths)
        rouge = compute_rouge_L(prediction, ground_truths)
        rouges.append(rouge)
        res.append(' '.join(prediction))
        gts.append([' '.join(ground_truth) for ground_truth in ground_truths])
    bleu = compute_bleu(reference_corpus, translation_corpus)
    mete = meteor.compute_score(gts, res)
    return {
        "bleu": bleu[0] * 100,
        "meteor": mete[0] * 100,
        "rougeL": np.mean(rouges) * 100
    }
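
# A minimal usage sketch for evaluate_simple. The data layout below is an
# assumption inferred from how the function indexes eval_file and answer_dict
# (boundary tokens around the question, (score, text) hypothesis tuples); the
# real preprocessing pipeline may build these dicts differently. evaluate()
# below accepts the same eval_file/answer_dict layout.
def _example_evaluate_simple():
    eval_file = {
        "0": {"question": ["<GO>", "what", "is", "the", "capital", "?", "<EOS>"]},
    }
    answer_dict = {
        "0": [(0.9, "what is the capital"), (0.4, "where is it located")],
    }
    scores = evaluate_simple(eval_file, answer_dict)
    print(scores)  # {"bleu": ..., "meteor": ..., "rougeL": ...}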
def evaluate_full(gts, gen_res, gen_dict, gts_arc, emo_feat, method=None):
    """Corpus BLEU/METEOR plus an optional reward term ('comet', 'clf', or 'clf_prob')."""
    assert (len(gts) == len(gen_res))
    reward = 0
    if method == "comet":
        # Reward from the edit distance between generated and reference arcs.
        assert (len(gen_dict) == len(gts_arc))
        reward = compute_edit_distance_v2(gen_dict, gts_arc)
    elif method == "clf":
        # Reward from the distance between predicted and gold emotion distributions.
        pred_dist = np.zeros_like(emo_feat)
        for k, v in gen_dict.items():
            pred_dist[k, :] = v
        reward = compute_emotion_distance(pred_dist,
                                          emo_feat,
                                          batch_normalize=True,
                                          method=method)
    elif method == 'clf_prob':
        # Reward from the probability assigned to the reference arc.
        pred_dist = np.zeros_like(emo_feat)
        for k, v in gen_dict.items():
            pred_dist[k, :] = v
        reward = get_emotion_prob(pred_dist, gts_arc, batch_normalize=True)
    translation_corpus = gen_res
    reference_corpus = gts
    meteor = Meteor()
    bleu = compute_bleu([[j.split()] for j in reference_corpus],
                        [i.split() for i in translation_corpus])
    mete = meteor.compute_score([[r] for r in reference_corpus],
                                translation_corpus)
    return {
        "bleu": bleu[0] * 100,
        "meteor": mete[0] * 100,
        "best_reward": reward
    }
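
# A minimal usage sketch for evaluate_full with method=None, i.e. text metrics
# only and a zero reward. The gen_dict/gts_arc/emo_feat values below are
# placeholders shaped after how the function indexes them; the real emotion
# features and arcs come from the rest of the pipeline.
def _example_evaluate_full():
    gts = ["why did she leave ?", "where is he going ?"]
    gen_res = ["why did she go ?", "where is he headed ?"]
    emo_feat = np.zeros((2, 8), dtype=np.float32)        # [num_examples, num_emotions], placeholder
    gen_dict = {i: np.ones(8, dtype=np.float32) / 8 for i in range(2)}
    gts_arc = [None, None]                               # unused when method is None
    scores = evaluate_full(gts, gen_res, gen_dict, gts_arc, emo_feat, method=None)
    print(scores)  # {"bleu": ..., "meteor": ..., "best_reward": 0}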
def evaluate(eval_file, answer_dict):
    """Score the top-ranked hypothesis, per-metric oracle hypotheses, and hypothesis-set diversity."""
    reference_corpus = []
    translation_corpus = []
    translation_corpus_rouge_oracle = []
    translation_corpus_bleu_oracle = []
    rouges = []
    div_bleus = []
    rouges_oracle = []
    meteor = Meteor()
    res, res_oracle, gts = [], [], []
    for key, answers in answer_dict.items():
        answers = sorted(answers, key=lambda x: x[0], reverse=True)
        # Strip the boundary (GO/EOS) tokens from the reference question.
        ground_truths = [
            list(map(lambda x: x.lower(), eval_file[key]["question"][1:-1]))
        ]
        prediction = answers[0][1].lower().split()
        # Score every hypothesis so per-metric oracles can be selected below.
        answers_tmp = []
        for answer in answers:
            rouge = compute_rouge_L(answer[1].lower().split(), ground_truths)
            mete = meteor.compute_score(
                [[' '.join(ground_truth) for ground_truth in ground_truths]],
                [' '.join(answer[1].lower().split())])
            bleu = compute_bleu([ground_truths], [answer[1].lower().split()],
                                smooth=True)
            answers_tmp.append((rouge, mete[0], bleu[0], answer[0], answer[1]))
        answers_rouge = sorted(answers_tmp, key=lambda x: x[0], reverse=True)
        answers_mete = sorted(answers_tmp, key=lambda x: x[1], reverse=True)
        answers_bleu = sorted(answers_tmp, key=lambda x: x[2], reverse=True)
        prediction_rouge_oracle = answers_rouge[0][4].lower().split()
        prediction_mete_oracle = answers_mete[0][4].lower().split()
        prediction_bleu_oracle = answers_bleu[0][4].lower().split()
        translation_corpus.append(prediction)
        translation_corpus_rouge_oracle.append(prediction_rouge_oracle)
        translation_corpus_bleu_oracle.append(prediction_bleu_oracle)
        reference_corpus.append(ground_truths)
        rouge = compute_rouge_L(prediction, ground_truths)
        rouge_oracle = compute_rouge_L(prediction_rouge_oracle, ground_truths)
        rouges.append(rouge)
        rouges_oracle.append(rouge_oracle)
        res.append(' '.join(prediction))
        res_oracle.append(' '.join(prediction_mete_oracle))
        gts.append([' '.join(ground_truth) for ground_truth in ground_truths])
        # diverse_bleu scores the diversity of the hypothesis set for this example.
        div_bleus.append(diverse_bleu(answers))
    bleu = compute_bleu(reference_corpus, translation_corpus)
    bleu_oracle = compute_bleu(reference_corpus, translation_corpus_bleu_oracle)
    mete = meteor.compute_score(gts, res)
    mete_oracle = meteor.compute_score(gts, res_oracle)
    return {
        "bleu": bleu[0] * 100,
        "meteor": mete[0] * 100,
        "rougeL": np.mean(rouges) * 100,
        "bleu_oracle": bleu_oracle[0] * 100,
        "meteor_oracle": mete_oracle[0] * 100,
        "rougeL_oracle": np.mean(rouges_oracle) * 100,
        "diverse_bleu": np.mean(div_bleus) * 100
    }
def evaluate_rl(eval_file, qa_id, symbols, symbols_rl, id2word, metric="rouge"):
    """Per-example reward difference between the RL-sampled decode and the baseline decode."""
    meteor = Meteor()
    ques_limit = len(symbols)
    batch_size, _ = symbols[0].shape
    rewards = np.zeros([batch_size], dtype=np.float32)
    rewards_rl = np.zeros([batch_size], dtype=np.float32)
    rewards_base = np.zeros([batch_size], dtype=np.float32)
    ques_rl = np.zeros([batch_size, ques_limit], dtype=np.int32)
    for i, (qid, syms, syms_rl) in enumerate(
            zip(qa_id, zip(*symbols), zip(*symbols_rl))):
        syms = list(np.reshape(syms, [-1]))
        syms_rl = list(syms_rl)
        ground_truths = [
            list(
                map(lambda x: x.lower(),
                    eval_file[str(qid)]["question"][1:-1]))
        ]
        context_tokens = eval_file[str(qid)]["paragraph"]
        # Truncate both decodes at the first EOS token (id 3).
        if 3 in syms:
            syms = syms[:syms.index(3)]
        if 3 in syms_rl:
            syms_rl = syms_rl[:syms_rl.index(3)]
        # Map ids back to words; ids beyond the vocabulary index into the
        # paragraph (copied tokens).
        prediction = [
            id2word[sym] if sym in id2word else
            context_tokens[sym - len(id2word)] for sym in syms
        ]
        prediction_rl = [
            id2word[sym] if sym in id2word else
            context_tokens[sym - len(id2word)] for sym in syms_rl
        ]
        if metric == "rouge":
            reward_base = compute_rouge_L(prediction, ground_truths)
            reward_rl = compute_rouge_L(prediction_rl, ground_truths)
        elif metric == "bleu":
            reward_base, _ = compute_bleu([ground_truths], [prediction],
                                          smooth=True)
            reward_rl, _ = compute_bleu([ground_truths], [prediction_rl],
                                        smooth=True)
        elif metric == "meteor":
            reward_base = meteor.compute_score(
                [[' '.join(ground_truth) for ground_truth in ground_truths]],
                [' '.join(prediction)])[0]
            reward_rl = meteor.compute_score(
                [[' '.join(ground_truth) for ground_truth in ground_truths]],
                [' '.join(prediction_rl)])[0]
        else:
            raise ValueError("Unsupported metric: {}".format(metric))
        rewards[i] = reward_rl - reward_base
        rewards_rl[i] = reward_rl
        rewards_base[i] = reward_base
        # Add GO (id 2) and EOS (id 3) around the sampled sequence, clipped to ques_limit.
        syms_rl = [2] + syms_rl[:ques_limit - 2] + [3]
        for j, sym_rl in enumerate(syms_rl):
            ques_rl[i, j] = sym_rl
    return rewards, rewards_rl, rewards_base, ques_rl
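
# A minimal usage sketch for evaluate_rl with batch size 1 and a toy vocabulary.
# It assumes (as the function itself does) that id 2 is GO, id 3 is EOS, and that
# ids >= len(id2word) copy tokens from the paragraph; the toy eval_file layout and
# the decode shapes are assumptions consistent with how the function unpacks them,
# not a description of the real training pipeline.
def _example_evaluate_rl():
    id2word = {0: "<PAD>", 1: "<UNK>", 2: "<GO>", 3: "<EOS>",
               4: "what", 5: "is", 6: "the"}
    eval_file = {"7": {"question": ["<GO>", "what", "is", "paris", "<EOS>"],
                       "paragraph": ["paris", "is", "a", "city"]}}
    qa_id = [7]
    # One decoding step per list entry: baseline arrays are [batch, 1],
    # sampled arrays are [batch].
    symbols = [np.array([[4]]), np.array([[5]]), np.array([[7]]), np.array([[3]])]
    symbols_rl = [np.array([4]), np.array([5]), np.array([7]), np.array([3])]
    rewards, rewards_rl, rewards_base, ques_rl = evaluate_rl(
        eval_file, qa_id, symbols, symbols_rl, id2word, metric="rouge")
    print(rewards, rewards_rl, rewards_base)
    print(ques_rl)  # sampled question ids re-wrapped with GO/EOS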