def test_rouge(self):
    evaluator = language_evaluation.RougeEvaluator(num_parallel_calls=5)
    sample_predictions = SAMPLE_PREDICTIONS * 5000
    sample_answers = SAMPLE_ANSWERS * 5000
    results = evaluator.run_evaluation(sample_predictions, sample_answers)
    # results = evaluator.run_evaluation(SAMPLE_PREDICTIONS, SAMPLE_ANSWERS)
    pprint(results)
def test_rouge(self):
    evaluator = language_evaluation.RougeEvaluator()
    results = evaluator.run_evaluation(SAMPLE_PREDICTIONS, SAMPLE_ANSWERS)
    pprint(results)
def run_wow_evaluation(results_dict, checkpoint_dir, mode):
    global_step = int(tf.compat.v1.train.get_global_step())

    if 'episode_mask' in results_dict:
        episode_mask = results_dict['episode_mask']
    else:
        episode_mask = None

    trim_fn = _trim_after_eos
    knowledge_separator = BERT_KNOWLEDGE_SEPARATOR

    predictions = trim_fn(results_dict['predictions'], mask=episode_mask)
    answers = trim_fn(results_dict['answers'], mask=episode_mask)
    contexts = trim_fn(results_dict['context'], mask=episode_mask)
    knowledge_sent_gts = trim_fn(results_dict['knowledge_sent_gt'], mask=episode_mask)
    knowledge_sent_preds = trim_fn(results_dict['knowledge_sent_pred'], mask=episode_mask)

    # XXX: Dump outputs

    # Show examples
    show_indices = random.sample(range(len(predictions)), 10)
    for index in show_indices:
        prediction = predictions[index]
        answer = answers[index]
        knowledge_sent_gt = knowledge_sent_gts[index]
        knowledge_sent_pred = knowledge_sent_preds[index]

        tqdm.write(f"{index} ({mode}).")
        tqdm.write(f"(knowledge_gt) {knowledge_sent_gt}")
        tqdm.write(f"(knowledge_pred) {knowledge_sent_pred}")
        tqdm.write(f"(gt) {answer}")
        tqdm.write(f"(pred) {prediction}\n\n")

    # Evaluation
    rouge_evaluator = language_evaluation.RougeEvaluator(
        num_parallel_calls=1, tokenization_fn=normalize_answer)

    perplexity = np.exp(np.mean(results_dict['gen_loss']))
    total_loss = np.mean(results_dict['loss'])
    knowledge_accuracy = accuracy_score(
        np.zeros(results_dict['knowledge_predictions'].shape, dtype=np.int32),
        results_dict['knowledge_predictions'],
        sample_weight=episode_mask)

    rouge_result = rouge_evaluator.run_evaluation(predictions, answers)
    loss_result = {
        'perplexity': perplexity,
        'total_loss': total_loss,
        'accuracy': knowledge_accuracy,
    }

    # Optional metrics
    if 'knowledge_loss' in results_dict:
        knowledge_loss = np.mean(results_dict['knowledge_loss'])
        loss_result['knowledge_loss'] = knowledge_loss
    if 'kl_loss' in results_dict:
        kl_loss = np.mean(results_dict['kl_loss'])
        loss_result['kl_loss'] = kl_loss

    log_dict = {}
    log_dict.update(rouge_result)
    log_dict.update(loss_result)

    summaries = {
        f"{mode}_test_loss": loss_result,
        f"{mode}_rouge": rouge_result
    }

    return summaries, log_dict
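# A minimal, self-contained sketch (hypothetical toy values, not project data) of the
# two scalar metrics computed in run_wow_evaluation. The all-zeros target mirrors the
# accuracy_score call above and assumes the gold knowledge sentence sits at index 0 of
# each candidate list; episode_mask gives padded episodes zero weight.
def _sketch_scalar_metrics():
    import numpy as np
    from sklearn.metrics import accuracy_score

    gen_loss = np.array([2.1, 1.8, 2.4])            # per-example generation NLL
    knowledge_predictions = np.array([0, 2, 0, 1])  # predicted knowledge indices
    episode_mask = np.array([1, 1, 1, 0])           # 0 = padded episode

    perplexity = np.exp(np.mean(gen_loss))          # ppl = exp(mean NLL)
    knowledge_accuracy = accuracy_score(
        np.zeros_like(knowledge_predictions),       # gold index assumed to be 0
        knowledge_predictions,
        sample_weight=episode_mask)
    return perplexity, knowledge_accuracy           # ~8.17 and 2/3 for these values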
def add_multi_results(results_dict, rouge_result, loss_result, predictions,
                      episode_mask, trim_fn):
    multi_responses = results_dict['multi_responses']
    num_responses = results_dict['num_responses'][episode_mask]
    multi_gt_knowledge_sentences = results_dict['multi_gt_knowledge_sentences']
    knowledge_sent_preds = results_dict['knowledge_sent_pred'][episode_mask]

    multi_rouge_evaluator = language_evaluation.RougeEvaluator(
        num_parallel_calls=1, tokenization_fn=normalize_answer, average=False)

    multi_rouge_results_list = []
    multi_accuracy_list = []
    for i in range(multi_responses.shape[1]):
        # choose best rouge scores among multi responses
        responses = trim_fn(multi_responses[:, i], mask=episode_mask)
        multi_rouge_result = multi_rouge_evaluator.run_evaluation(
            predictions, responses)
        multi_rouge_result['rouge1'][0] = multi_rouge_result['rouge1'][0] * (
            num_responses > i)
        multi_rouge_result['rouge2'][0] = multi_rouge_result['rouge2'][0] * (
            num_responses > i)
        multi_rouge_result['rougeL'][0] = multi_rouge_result['rougeL'][0] * (
            num_responses > i)
        multi_rouge_results_list.append(multi_rouge_result)

        # knowledge accuracy
        gt_knowledge_sentences = multi_gt_knowledge_sentences[:, i][episode_mask]
        knowledge_min_length = min(gt_knowledge_sentences.shape[-1],
                                   knowledge_sent_preds.shape[-1])
        multi_accuracy_list.append(np.logical_not(np.logical_not(
            gt_knowledge_sentences[:, :knowledge_min_length] ==
            knowledge_sent_preds[:, :knowledge_min_length]).sum(axis=1)))

    multi_rouge1_results = np.stack(
        [x['rouge1'][0] for x in multi_rouge_results_list], axis=0)
    multi_rouge2_results = np.stack(
        [x['rouge2'][0] for x in multi_rouge_results_list], axis=0)
    multi_rougeL_results = np.stack(
        [x['rougeL'][0] for x in multi_rouge_results_list], axis=0)

    multi_rouge1_results = np.transpose(multi_rouge1_results, [1, 0])
    multi_rouge2_results = np.transpose(multi_rouge2_results, [1, 0])
    multi_rougeL_results = np.transpose(multi_rougeL_results, [1, 0])

    multi_rouge1_max_indices = np.argmax(multi_rouge1_results, axis=1)
    max_multi_rouge1_results = np.max(multi_rouge1_results, axis=1)
    range_indices = np.arange(len(multi_rouge1_max_indices))
    max_multi_rouge2_results = multi_rouge2_results[range_indices,
                                                    multi_rouge1_max_indices]
    max_multi_rougeL_results = multi_rougeL_results[range_indices,
                                                    multi_rouge1_max_indices]

    multi_rouge1 = sum(max_multi_rouge1_results) / len(max_multi_rouge1_results)
    multi_rouge2 = sum(max_multi_rouge2_results) / len(max_multi_rouge2_results)
    multi_rougeL = sum(max_multi_rougeL_results) / len(max_multi_rougeL_results)

    rouge_result['rouge1_multi_responses'] = multi_rouge1
    rouge_result['rouge2_multi_responses'] = multi_rouge2
    rouge_result['rougeL_multi_responses'] = multi_rougeL

    # accuracy
    multi_accuracies = np.transpose(np.stack(multi_accuracy_list, axis=0), [1, 0])
    multi_accuracies = multi_accuracies.sum(axis=1).astype(bool)
    multi_accuracy = sum(multi_accuracies) / len(multi_accuracies)
    loss_result['accuracy_multi_responses'] = multi_accuracy

    # perplexity
    multi_perplexity = np.exp(np.mean(results_dict['multi_gen_loss']))
    loss_result['perplexity_multi_responses'] = multi_perplexity

    return rouge_result, loss_result
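# A minimal numpy sketch (hypothetical toy values) of the "best reference by ROUGE-1"
# selection performed in add_multi_results: reference slots beyond an example's
# num_responses are zeroed, the reference with the highest ROUGE-1 is chosen per
# example, and ROUGE-2 / ROUGE-L are read off at that same index before averaging.
def _sketch_best_reference_rouge():
    import numpy as np

    num_responses = np.array([2, 1])        # valid references per example
    rouge1 = np.array([[0.40, 0.10],        # shape [num_refs, num_examples]
                       [0.55, 0.30]])
    rouge2 = np.array([[0.20, 0.05],
                       [0.35, 0.25]])
    rougeL = np.array([[0.38, 0.09],
                       [0.50, 0.28]])

    # zero out reference slots beyond each example's number of references
    valid = np.stack([num_responses > i for i in range(rouge1.shape[0])], axis=0)
    rouge1, rouge2, rougeL = rouge1 * valid, rouge2 * valid, rougeL * valid

    # pick the best reference by ROUGE-1, gather the other metrics at that index
    r1, r2, rL = rouge1.T, rouge2.T, rougeL.T       # -> [num_examples, num_refs]
    best = np.argmax(r1, axis=1)
    rows = np.arange(len(best))
    return r1[rows, best].mean(), r2[rows, best].mean(), rL[rows, best].mean()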