def score_long_answer(gold_label_list, pred_label): """Scores a long answer as correct or not. 1) First decide if there is a gold long answer with LONG_NO_NULL_THRESHOLD. 2) The prediction will get a match if: a. There is a gold long answer. b. The prediction span match exactly with *one* of the non-null gold long answer span. Args: gold_label_list: A list of NQLabel, could be None. pred_label: A single NQLabel, could be None. Returns: gold_has_answer, pred_has_answer, is_correct, score """ gold_has_answer = util.gold_has_long_answer(gold_label_list) pred_has_answer = pred_label and ( not pred_label.long_answer_span.is_null_span()) is_correct = False score = pred_label.long_score # Both sides are non-null spans. if gold_has_answer and pred_has_answer: for gold_label in gold_label_list: # while the voting results indicate there is an long answer, each # annotator might still say there is no long answer. if gold_label.long_answer_span.is_null_span(): continue if util.nonnull_span_equal(gold_label.long_answer_span, pred_label.long_answer_span): is_correct = True break return gold_has_answer, pred_has_answer, is_correct, score
def score_long_answer(gold_label_list, pred_label): """Scores a long answer as correct or not. 1) First decide if there is a gold long answer with LONG_NO_NULL_THRESHOLD. 2) The prediction will get a match if: a. There is a gold long answer. b. The prediction span match exactly with *one* of the non-null gold long answer span. Args: gold_label_list: A list of NQLabel, could be None. pred_label: A single NQLabel, could be None. Returns: gold_has_answer, pred_has_answer, f1, score """ gold_has_answer = util.gold_has_long_answer(gold_label_list) pred_has_answer = pred_label and ( not pred_label.long_answer_span.is_null_span()) f1 = 0 p = 0 r = 0 score = pred_label.long_score # Both sides are non-null spans. if gold_has_answer and pred_has_answer: for gold_label in gold_label_list: # while the voting results indicate there is an long answer, each # annotator might still say there is no long answer. if gold_label.long_answer_span.is_null_span(): continue # Span: start_token_idx, end_token_idx # if util.nonnull_span_equal(gold_label.long_answer_span, # pred_label.long_answer_span): gold_span = gold_label.long_answer_span.start_token_idx, gold_label.long_answer_span.end_token_idx pred_span = pred_label.long_answer_span.start_token_idx, pred_label.long_answer_span.end_token_idx overlap_len = max( min(gold_span[1], pred_span[1]) - max(gold_span[0], pred_span[0]) + 1, 0) precision = overlap_len / (pred_span[1] - pred_span[0] + 1) recall = overlap_len / (gold_span[1] - gold_span[0] + 1) if safe_divide(2 * precision * recall, precision + recall) > f1: f1 = safe_divide(2 * precision * recall, precision + recall) p = precision r = recall elif not gold_has_answer and not pred_has_answer: f1 = 1 p = 1 r = 1 return gold_has_answer, pred_has_answer, f1, p, r, score
def label_to_pred(labels): """Convert a list of gold human annotations to a perfect prediction.""" gold_has_short_answer = util.gold_has_short_answer(labels) gold_has_long_answer = util.gold_has_long_answer(labels) # We did not put `long_answer` and `yes_no_answer`, and they should be # considered as null when loading from input. pred = { 'example_id': labels[0].example_id, 'short_answers': [], 'short_answers_score': random.random(), 'long_answer_score': random.random() } keep_answer = random.random() <= FLAGS.desired_recall for label in labels: if gold_has_short_answer and keep_answer: pred['short_answers_score'] *= 2 if not util.is_null_span_list(label.short_answer_span_list): pred['short_answers'] = ([{ 'start_token': span.start_token_idx, 'end_token': span.end_token_idx, 'start_byte': span.start_byte, 'end_byte': span.end_byte } for span in label.short_answer_span_list]) pred['yes_no_answer'] = 'none' elif label.yes_no_answer != 'none': pred['short_answers'] = [] pred['yes_no_answer'] = label.yes_no_answer if (gold_has_long_answer and not label.long_answer_span.is_null_span() and keep_answer): pred['long_answer'] = { 'start_token': label.long_answer_span.start_token_idx, 'end_token': label.long_answer_span.end_token_idx, 'start_byte': label.long_answer_span.start_byte, 'end_byte': label.long_answer_span.end_byte } pred['long_answer_score'] *= 2 if FLAGS.generate_false_positives: if not gold_has_short_answer: pred['short_answers'] = [{ 'start_token': 0, 'end_token': 1, 'start_byte': -1, 'end_byte': -1 }] if not gold_has_long_answer: pred['long_answer_start_token'] = 0 pred['long_answer_end_token'] = 1 return pred