def main(args):
    err = None
    metrics = {}
    bleu4, rouge_l = 0.0, 0.0
    alpha = args.alpha
    beta = args.beta
    bleu_eval = BLEUWithBonus(4, alpha=alpha, beta=beta)
    rouge_eval = RougeLWithBonus(alpha=alpha, beta=beta, gamma=1.2)
    try:
        pred_result = read_file(args.pred_file)
        ref_result = read_file(args.ref_file, is_ref=True)
        for qid, results in ref_result.items():
            cand_result = pred_result.get(qid, {})
            # pred_answers = cand_result.get('answers', [EMPTY])[0]
            pred_answers = cand_result.get('answers', [])
            if not pred_answers:
                pred_answers = EMPTY
            else:
                pred_answers = pred_answers[0]
            pred_yn_label = None
            ref_entities = None
            ref_answers = results.get('answers', [])
            if not ref_answers:
                continue
            if results['question_type'] == 'ENTITY':
                ref_entities = set(
                    itertools.chain(*results.get('entity_answers', [[]])))
                if not ref_entities:
                    ref_entities = None
            if results['question_type'] == 'YES_NO':
                cand_yesno = cand_result.get('yesno_answers', [])
                pred_yn_label = None if len(cand_yesno) == 0 \
                    else cand_yesno[0]
            bleu_eval.add_inst(
                pred_answers,
                ref_answers,
                yn_label=pred_yn_label,
                yn_ref=results['yesno_answers'],
                entity_ref=ref_entities)
            rouge_eval.add_inst(
                pred_answers,
                ref_answers,
                yn_label=pred_yn_label,
                yn_ref=results['yesno_answers'],
                entity_ref=ref_entities)
        bleu4 = bleu_eval.score()[-1]
        rouge_l = rouge_eval.score()
    except ValueError as ve:
        err = ve
    except AssertionError as ae:
        err = ae
    # keep the output compatible with the leaderboard evaluation format
    metrics['errorMsg'] = 'success' if err is None else str(err)
    metrics['errorCode'] = 0 if err is None else 1
    metrics['data'] = [
        {'type': 'BOTH', 'name': 'ROUGE-L', 'value': round(rouge_l * 100, 2)},
        {'type': 'BOTH', 'name': 'BLEU-4', 'value': round(bleu4 * 100, 2)},
    ]
    print json.dumps(metrics, ensure_ascii=False).encode('utf8')
def main(args):
    alpha = args.alpha  # default 1.0
    beta = args.beta  # default 1.0
    bleu_eval = BLEUWithBonus(4, alpha=alpha, beta=beta)
    rouge_eval = RougeLWithBonus(alpha=alpha, beta=beta, gamma=1.2)
    pred_result = read_file(args.pred_file)
    ref_result = read_file(args.ref_file, is_ref=True)
    bleu4, rouge_l = calc_metrics(pred_result, ref_result,
                                  bleu_eval, rouge_eval)
    metrics = {
        'ROUGE-L': round(rouge_l * 100, 2),
        'BLEU-4': round(bleu4 * 100, 2),
    }
    print json.dumps(metrics, ensure_ascii=False).encode('utf8')
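# calc_metrics is not shown in this excerpt. The sketch below is an assumption:
# it factors out the per-question accumulation loop from the first main() above
# (argument names, the EMPTY fallback, and the add_inst/score calls are taken
# from that loop); the real implementation may differ in details.
def calc_metrics(pred_result, ref_result, bleu_eval, rouge_eval):
    for qid, results in ref_result.items():
        cand_result = pred_result.get(qid, {})
        pred_answers = cand_result.get('answers', [])
        pred_answers = pred_answers[0] if pred_answers else EMPTY
        ref_answers = results.get('answers', [])
        if not ref_answers:
            continue
        ref_entities = None
        if results['question_type'] == 'ENTITY':
            # an empty entity set falls back to None, as in the loop above
            ref_entities = set(
                itertools.chain(*results.get('entity_answers', [[]]))) or None
        pred_yn_label = None
        if results['question_type'] == 'YES_NO':
            cand_yesno = cand_result.get('yesno_answers', [])
            pred_yn_label = cand_yesno[0] if cand_yesno else None
        bleu_eval.add_inst(
            pred_answers, ref_answers,
            yn_label=pred_yn_label,
            yn_ref=results['yesno_answers'],
            entity_ref=ref_entities)
        rouge_eval.add_inst(
            pred_answers, ref_answers,
            yn_label=pred_yn_label,
            yn_ref=results['yesno_answers'],
            entity_ref=ref_entities)
    return bleu_eval.score()[-1], rouge_eval.score()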
def evaluate_batch(model, num_batches, eval_file, sess, data_type,
                   handle, str_handle, args, logger, result_prefix=None):
    losses = []
    pred_answers, ref_answers = [], []
    padded_p_len = args.max_p_len
    for i in range(num_batches):
        qa_id, loss, start_probs, end_probs = sess.run(
            [model.qa_id, model.loss, model.logits1, model.logits2],
            feed_dict={handle: str_handle} if handle is not None else None)
        losses.append(loss)
        start, end = 0, 0
        for qid, start_prob, end_prob in zip(qa_id, start_probs, end_probs):
            best_p_idx, best_span, best_score = None, None, 0
            sample = eval_file[str(qid)]
            for p_idx, passage_len in enumerate(sample['passages_len']):
                if p_idx >= args.max_p_num:
                    continue
                # find the best answer span within each passage
                end = start + passage_len
                answer_span, score = find_best_answer_for_passage(
                    start_prob[start:end], end_prob[start:end],
                    passage_len, args.max_a_len)
                answer_span[0] += start
                answer_span[1] += start
                # keep the highest-scoring span across passages
                if score > best_score:
                    best_score = score
                    best_p_idx = p_idx
                    best_span = answer_span
                end = start
            # best_span = [start_prob, end_prob]
            # best_answer = sample['passages'][best_span[0]: best_span[1] + 1]
            # recover the answer tokens from the selected span
            if best_p_idx is None or best_span is None:
                best_answer = ''
            else:
                best_answer = ''.join(
                    sample['passages'][best_span[0]:best_span[1] + 1])
            # TODO: include question tokens
            pred_answers.append({
                'question_id': sample['question_id'],
                'question_type': sample['question_type'],
                'answers': [best_answer],
                'yesno_answers': []
            })
            # reference (gold) answers
            # if 'answers' in sample and len(sample['answers']) > 0:
            if 'answers' in sample:
                ref_answers.append({
                    'question_id': sample['question_id'],
                    'question_type': sample['question_type'],
                    'answers': sample['answers'],
                    'yesno_answers': []
                })
    if result_prefix is not None:
        result_file = os.path.join(args.result_dir, result_prefix + '.json')
        with open(result_file, 'w') as fout:
            for pred_answer in pred_answers:
                fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n')
        logger.info('Saving {} results to {}'.format(result_prefix, result_file))
    avg_loss = np.mean(losses)
    bleu4, rouge_l = 0, 0
    if len(ref_answers) > 0:
        # map question ID -> answers for both predictions and references
        pred_dict, ref_dict, bleu_rouge = {}, {}, {}
        for pred, ref in zip(pred_answers, ref_answers):
            question_id = ref['question_id']
            if len(ref['answers']) > 0:
                # convert answer tokens into a single space-joined string
                pred_dict[question_id] = {
                    'answers': mrc_eval.normalize(pred['answers']),
                    'yesno_answers': []
                }
                ref_dict[question_id] = {
                    'question_type': ref['question_type'],
                    'answers': mrc_eval.normalize(ref['answers']),
                    'yesno_answers': []
                }
        bleu_eval = BLEUWithBonus(4, alpha=1.0, beta=1.0)
        rouge_eval = RougeLWithBonus(alpha=1.0, beta=1.0, gamma=1.2)
        bleu4, rouge_l = mrc_eval.calc_metrics(pred_dict, ref_dict,
                                               bleu_eval, rouge_eval)
        bleu_rouge['Bleu-4'] = bleu4
        bleu_rouge['Rouge-L'] = rouge_l
    else:
        bleu_rouge = None
    loss_sum = tf.Summary(value=[
        tf.Summary.Value(tag="{}/loss".format(data_type), simple_value=avg_loss),
    ])
    bleu_sum = tf.Summary(value=[
        tf.Summary.Value(tag="{}/bleu_4".format(data_type), simple_value=bleu4),
    ])
    rouge_sum = tf.Summary(value=[
        tf.Summary.Value(tag="{}/rouge_l".format(data_type), simple_value=rouge_l),
    ])
    return avg_loss, bleu_rouge, [loss_sum, bleu_sum, rouge_sum]
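# find_best_answer_for_passage is not shown in this excerpt. Below is a minimal
# sketch of the usual span-selection step, assuming the function scores each
# candidate span by start_probs[i] * end_probs[j] with j - i < max_a_len and
# returns a mutable [start, end] span plus its score, which is how the result
# is used in evaluate_batch above; the actual implementation may differ.
def find_best_answer_for_passage(start_probs, end_probs, passage_len, max_a_len):
    passage_len = min(passage_len, len(start_probs))
    best_start, best_end, best_score = -1, -1, 0.0
    for start_idx in range(passage_len):
        for ans_len in range(max_a_len):
            end_idx = start_idx + ans_len
            if end_idx >= passage_len:
                continue
            score = start_probs[start_idx] * end_probs[end_idx]
            if score > best_score:
                best_start, best_end, best_score = start_idx, end_idx, score
    return [best_start, best_end], best_score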