Example #1
import collections

import eval_utils  # Evaluation helper module that provides safe_divide().


def compute_macro_f1(answer_stats, prefix=''):
    """Computes F1, precision, recall for a list of answer scores.

  This computes the *language-wise macro F1*. For minimal answers,
  we also compute a partial match score that uses F1, which would be
  included in this computation via `answer_stats`.

  Args:
    answer_stats: List of per-example scores.
    prefix (''): Prefix to prepend to score dictionary.

  Returns:
    Dictionary mapping measurement names to scores.
  """

    # Unpack per-example columns: gold-answer indicator, prediction indicator,
    # (partial-match) F1, and system score (unused here).
    has_gold, has_pred, f1, _ = list(zip(*answer_stats))

    macro_precision = eval_utils.safe_divide(sum(f1), sum(has_pred))
    macro_recall = eval_utils.safe_divide(sum(f1), sum(has_gold))
    macro_f1 = eval_utils.safe_divide(2 * macro_precision * macro_recall,
                                      macro_precision + macro_recall)

    return collections.OrderedDict({
        prefix + 'n': len(answer_stats),
        prefix + 'f1': macro_f1,
        prefix + 'precision': macro_precision,
        prefix + 'recall': macro_recall
    })
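A minimal usage sketch follows. The rows below are made up for illustration and simply mirror the (has_gold, has_pred, f1, score) tuples the function unpacks; it also assumes eval_utils.safe_divide returns 0 on a zero denominator.

example_stats = [
    (1, 1, 1.0, 0.90),  # gold answer, exact-match prediction
    (1, 1, 0.5, 0.75),  # gold answer, partial-match prediction (F1 = 0.5)
    (1, 0, 0.0, 0.10),  # gold answer, but no prediction made
    (0, 1, 0.0, 0.60),  # no gold answer, spurious prediction
]

scores = compute_macro_f1(example_stats, prefix='minimal-')
# macro precision = sum(f1) / sum(has_pred) = 1.5 / 3 = 0.5
# macro recall    = sum(f1) / sum(has_gold) = 1.5 / 3 = 0.5
# macro F1        = 2 * 0.5 * 0.5 / (0.5 + 0.5) = 0.5
# scores == OrderedDict([('minimal-n', 4), ('minimal-f1', 0.5),
#                        ('minimal-precision', 0.5), ('minimal-recall', 0.5)])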
Example #2
import collections

import eval_utils  # Evaluation helper module that provides safe_divide().


def compute_pr_curves(answer_stats, targets=None):
    """Computes PR curve and returns R@P for specific targets.

  The values are computed as follows: find the (precision, recall) point
  with maximum recall and where precision > target.

  This is only relevant if you return the system scores in your predictions.
  You may find this useful when attempting to tune the threshold for your
  system on the dev set before requesting an evaluation on the test set
  via the leaderboard.

  Arguments:
    answer_stats: List of statistic tuples from the answer scores.
    targets (None): List of precision thresholds to target.

  Returns:
    List of table with rows: [target, r, p, score].
  """
    total_f1 = 0
    total_has_pred = 0
    total_has_gold = 0

    # Count the number of gold annotations.
    for has_gold, _, _, _ in answer_stats:
        total_has_gold += has_gold

    # Treat a missing targets list as empty so the loops below are well defined.
    if targets is None:
        targets = []

    # Keep track of the point of maximum recall for each target.
    max_recall = [0 for _ in targets]
    max_precision = [0 for _ in targets]
    max_scores = [None for _ in targets]

    # Only keep track of unique thresholds in this dictionary.
    scores_to_stats = collections.OrderedDict()

    # Sweep the thresholds in score order (highest first) and compute the
    # running precision + recall at each one.
    for has_gold, has_pred, is_correct_or_f1, score in answer_stats:
        # The third field is either a scalar (0/1 correctness or partial-match
        # F1) or a tuple whose last element is the F1; reduce it to a scalar.
        if isinstance(is_correct_or_f1, tuple):
            _, _, f1 = is_correct_or_f1
        else:
            f1 = is_correct_or_f1
        total_f1 += f1
        total_has_pred += has_pred

        precision = eval_utils.safe_divide(total_f1, total_has_pred)
        recall = eval_utils.safe_divide(total_f1, total_has_gold)

        # If there are any ties, this will be updated multiple times until the
        # ties are all counted.
        scores_to_stats[score] = [precision, recall]

    best_f1 = 0.0
    best_precision = 0.0
    best_recall = 0.0
    best_threshold = 0.0

    for threshold, (precision, recall) in scores_to_stats.items():
        # For each target, keep the point of maximum recall whose precision is
        # at or above that target.
        for t, target in enumerate(targets):
            if precision >= target and recall > max_recall[t]:
                max_recall[t] = recall
                max_precision[t] = precision
                max_scores[t] = threshold

        # Compute optimal threshold.
        f1 = eval_utils.safe_divide(2 * precision * recall, precision + recall)
        if f1 > best_f1:
            best_f1 = f1
            best_precision = precision
            best_recall = recall
            best_threshold = threshold

    return ((best_f1, best_precision, best_recall, best_threshold),
            list(zip(targets, max_recall, max_precision, max_scores)))
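A usage sketch for the threshold sweep (the stats and the precision targets below are illustrative). The sort step matters: the running totals only trace out a valid PR curve when answer_stats is ordered by score, highest first.

answer_stats = [
    (1, 1, 1.0, 0.95),  # confident, correct prediction
    (0, 1, 0.0, 0.80),  # confident, spurious prediction
    (1, 1, 1.0, 0.70),
    (1, 1, 0.0, 0.40),  # low-confidence, wrong prediction
    (1, 0, 0.0, 0.00),  # gold answer the system never predicted
]
# Sort by system score, highest first, so the threshold sweep is valid.
answer_stats.sort(key=lambda row: row[3], reverse=True)

(opt_f1, opt_precision, opt_recall, opt_threshold), r_at_p = compute_pr_curves(
    answer_stats, targets=[0.5, 0.75, 0.9])

print('Best threshold %.2f: F1 %.3f (P %.3f / R %.3f)'
      % (opt_threshold, opt_f1, opt_precision, opt_recall))
for target, recall, precision, score in r_at_p:
    print('R@P>=%.2f: recall %.3f at precision %.3f (threshold %s)'
          % (target, recall, precision, score))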