Exemple #1
0
def score_long_answer(gold_label_list, pred_label):
    """Scores a long answer as correct or not.

    1) First decide if there is a gold long answer (via
       `util.gold_has_long_answer`; the original notes name
       LONG_NO_NULL_THRESHOLD as the voting threshold).
    2) The prediction will get a match if:
       a. There is a gold long answer.
       b. The prediction span matches exactly *one* of the non-null gold
          long answer spans.

    Args:
      gold_label_list: A list of NQLabel, could be None.
      pred_label: A single NQLabel, could be None.

    Returns:
      A tuple (gold_has_answer, pred_has_answer, is_correct, score):
        gold_has_answer: result of `util.gold_has_long_answer`.
        pred_has_answer: bool, True iff a prediction was given and its long
          answer span is not null.
        is_correct: bool, True iff both sides have an answer and the predicted
          span equals one of the non-null gold spans.
        score: `pred_label.long_score`, or 0 when `pred_label` is None.
    """
    gold_has_answer = util.gold_has_long_answer(gold_label_list)

    # Coerce to bool: a bare `pred_label and (...)` would leak None (or the
    # label object itself) into the returned tuple.
    pred_has_answer = bool(
        pred_label and not pred_label.long_answer_span.is_null_span())

    # A missing prediction has no score to read.
    # NOTE(review): 0 is a placeholder for "no prediction" -- confirm it suits
    # whatever consumes the score downstream.
    score = pred_label.long_score if pred_label is not None else 0

    is_correct = False

    # Both sides are non-null spans.
    if gold_has_answer and pred_has_answer:
        pred_span = pred_label.long_answer_span
        # While the voting results indicate there is a long answer, each
        # annotator might still say there is no long answer; those null
        # annotations are skipped before comparing spans.
        is_correct = any(
            not gold_label.long_answer_span.is_null_span()
            and util.nonnull_span_equal(gold_label.long_answer_span, pred_span)
            for gold_label in gold_label_list)

    return gold_has_answer, pred_has_answer, is_correct, score
Exemple #2
0
def score_long_answer(gold_label_list, pred_label):
    """Scores a long answer by its best token-span overlap with the gold spans.

    1) First decide if there is a gold long answer (via
       `util.gold_has_long_answer`; the original notes name
       LONG_NO_NULL_THRESHOLD as the voting threshold).
    2) If both gold and prediction have a non-null long answer, the predicted
       span is compared against every non-null gold long answer span and the
       precision/recall of the gold span giving the highest F1 are kept.
    3) If neither side has a long answer, the prediction counts as a perfect
       match (f1 = p = r = 1).
    4) Otherwise f1 = p = r = 0.

    Args:
      gold_label_list: A list of NQLabel, could be None.
      pred_label: A single NQLabel, could be None.

    Returns:
      A tuple (gold_has_answer, pred_has_answer, f1, p, r, score), where
      score is `pred_label.long_score`, or 0 when `pred_label` is None.
    """
    gold_has_answer = util.gold_has_long_answer(gold_label_list)

    # Coerce to bool: a bare `pred_label and (...)` would leak None (or the
    # label object itself) into the returned tuple.
    pred_has_answer = bool(
        pred_label and not pred_label.long_answer_span.is_null_span())

    f1 = 0
    p = 0
    r = 0

    # A missing prediction has no score to read.
    # NOTE(review): 0 is a placeholder for "no prediction" -- confirm it suits
    # whatever consumes the score downstream.
    score = pred_label.long_score if pred_label is not None else 0

    # Both sides are non-null spans.
    if gold_has_answer and pred_has_answer:
        pred_start = pred_label.long_answer_span.start_token_idx
        pred_end = pred_label.long_answer_span.end_token_idx
        # The `+ 1` treats end_token_idx as inclusive.
        # NOTE(review): confirm this matches the span convention in use.
        pred_len = pred_end - pred_start + 1

        for gold_label in gold_label_list:
            # While the voting results indicate there is a long answer, each
            # annotator might still say there is no long answer.
            if gold_label.long_answer_span.is_null_span():
                continue

            gold_start = gold_label.long_answer_span.start_token_idx
            gold_end = gold_label.long_answer_span.end_token_idx

            # Number of token positions shared by both spans, clamped at 0
            # for disjoint spans.
            overlap_len = max(
                min(gold_end, pred_end) - max(gold_start, pred_start) + 1, 0)
            precision = overlap_len / pred_len
            recall = overlap_len / (gold_end - gold_start + 1)

            # Keep the statistics of the best-matching gold annotation.
            candidate_f1 = safe_divide(2 * precision * recall,
                                       precision + recall)
            if candidate_f1 > f1:
                f1 = candidate_f1
                p = precision
                r = recall

    elif not gold_has_answer and not pred_has_answer:
        # Correctly predicting "no long answer" is a perfect match.
        f1 = 1
        p = 1
        r = 1

    return gold_has_answer, pred_has_answer, f1, p, r, score
    def label_to_pred(labels):
        """Build a synthetic prediction dict from one example's gold labels.

        The prediction copies gold short/long answer spans (later labels
        overwrite earlier ones), but is deliberately noisy: both scores start
        as random numbers, the gold answers are only kept when a random draw
        is at most `FLAGS.desired_recall`, and placeholder answers are added
        for examples without gold answers when
        `FLAGS.generate_false_positives` is set.

        Args:
          labels: Non-empty list of gold annotations for a single example.

        Returns:
          A dict with `example_id`, `short_answers`, `short_answers_score`
          and `long_answer_score`, plus optional `yes_no_answer`,
          `long_answer`, `long_answer_start_token` and
          `long_answer_end_token` entries.
        """

        def span_fields(span):
            """Copy a span's token and byte offsets into a plain dict."""
            return {
                'start_token': span.start_token_idx,
                'end_token': span.end_token_idx,
                'start_byte': span.start_byte,
                'end_byte': span.end_byte
            }

        has_gold_short = util.gold_has_short_answer(labels)
        has_gold_long = util.gold_has_long_answer(labels)

        # `long_answer` and `yes_no_answer` are intentionally absent here; per
        # the original author they should be considered null when the
        # prediction is loaded from input.
        # The two scores and the keep flag are drawn in this fixed order.
        pred = {}
        pred['example_id'] = labels[0].example_id
        pred['short_answers'] = []
        pred['short_answers_score'] = random.random()
        pred['long_answer_score'] = random.random()

        keep_answer = random.random() <= FLAGS.desired_recall

        for annotation in labels:
            if has_gold_short and keep_answer:
                # Doubled once per annotation visited, not once per example.
                pred['short_answers_score'] *= 2
                short_spans = annotation.short_answer_span_list
                if not util.is_null_span_list(short_spans):
                    pred['short_answers'] = [
                        span_fields(short_span) for short_span in short_spans
                    ]
                    pred['yes_no_answer'] = 'none'
                elif annotation.yes_no_answer != 'none':
                    pred['short_answers'] = []
                    pred['yes_no_answer'] = annotation.yes_no_answer

            if not has_gold_long:
                continue
            long_span = annotation.long_answer_span
            if long_span.is_null_span() or not keep_answer:
                continue
            pred['long_answer'] = span_fields(long_span)
            pred['long_answer_score'] *= 2

        if FLAGS.generate_false_positives:
            if not has_gold_short:
                # Placeholder short answer for an example that has none.
                pred['short_answers'] = [{
                    'start_token': 0,
                    'end_token': 1,
                    'start_byte': -1,
                    'end_byte': -1
                }]

            if not has_gold_long:
                # NOTE(review): these flat keys differ from the nested
                # 'long_answer' dict written above -- confirm the loader
                # reads them.
                pred.update(long_answer_start_token=0,
                            long_answer_end_token=1)

        return pred