Example #1
def get_metrics_input(examples,
                      preds,
                      no_answer_probs=None,
                      no_answer_probability_threshold=1.0):
    """"""

    pairs_answers = []
    pairs_predictions = []

    qas_id_to_has_answer = {
        example.qas_id: bool(example.answers)
        for example in examples
    }
    has_answer_qids = [
        qas_id for qas_id, has_answer in qas_id_to_has_answer.items()
        if has_answer
    ]
    no_answer_qids = [
        qas_id for qas_id, has_answer in qas_id_to_has_answer.items()
        if not has_answer
    ]

    if no_answer_probs is None:
        no_answer_probs = {k: 0.0 for k in preds}

    for example in examples:
        qas_id = example.qas_id
        gold_answers = [
            answer["text"] for answer in example.answers
            if normalize_answer(answer["text"])
        ]

        if not gold_answers:
            # For unanswerable questions, only correct answer is empty string
            gold_answers = [""]

        if qas_id not in preds:
            print("Missing prediction for %s" % qas_id)
            continue

        prediction = preds[qas_id]

        pairs_answers.append(
            ([normalize_answer(a) if a else prediction
              for a in gold_answers], qas_id))
        pairs_predictions.append(
            normalize_answer(prediction) if prediction else prediction)

    answers_data = [
        pairs_answers, qas_id_to_has_answer, has_answer_qids, no_answer_qids,
        no_answer_probs, no_answer_probability_threshold
    ]

    return answers_data, pairs_predictions
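A minimal usage sketch for the function above, assuming it sits in a module where normalize_answer resolves to transformers.data.metrics.squad_metrics.normalize_answer (or an equivalent local helper); the data path and placeholder predictions are illustrative only.

# Hypothetical usage sketch (paths and predictions are placeholders).
from transformers.data.processors.squad import SquadV2Processor

processor = SquadV2Processor()
examples = processor.get_dev_examples("data/", filename="dev-v2.0.json")
preds = {ex.qas_id: "" for ex in examples}  # replace with real model output

answers_data, predictions = get_metrics_input(examples, preds)
(pairs_answers, qas_id_to_has_answer, has_answer_qids,
 no_answer_qids, no_answer_probs, threshold) = answers_data
print(len(pairs_answers), "gold-answer tuples,", len(predictions), "predictions")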
Example #2
def get_answer_choice(self, raw_text):
    # Maps an answer string to one of the CoQA answer categories.
    # ~ 1/5 of the CoQA answers are Yes/No.
    # ~ 2/3 of the CoQA answers are span-based
    # (the answer overlaps with the passage, ignoring punctuation and case).
    if raw_text == "unknown":
        return '0'
    if squad_metrics.normalize_answer(raw_text) == "yes":
        return '1'
    if squad_metrics.normalize_answer(raw_text) == "no":
        return '2'
    return '3'  # Not a yes/no question
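A quick spot-check of the mapping, restated outside its (unshown) class so it runs standalone; it assumes squad_metrics refers to transformers.data.metrics.squad_metrics.

# Hypothetical spot-check; mirrors get_answer_choice without the class context.
from transformers.data.metrics import squad_metrics

for raw, expected in [("unknown", '0'), ("Yes.", '1'), ("no", '2'), ("in the garden", '3')]:
    if raw == "unknown":
        category = '0'
    elif squad_metrics.normalize_answer(raw) == "yes":
        category = '1'
    elif squad_metrics.normalize_answer(raw) == "no":
        category = '2'
    else:
        category = '3'
    assert category == expected, (raw, category)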
Example #3
def gf_squad_evaluate(examples, preds):
    """
    Computes the exact and f1 scores from the examples and the model predictions
    """
    exact_scores = {}
    f1_scores = {}

    for example in examples:
        qas_id = example.qas_id
        gold_answers = [
            answer["text"] for answer in example.answers
            if normalize_answer(answer["text"])
        ]

        if not gold_answers:
            # For unanswerable questions, only correct answer is empty string
            gold_answers = [""]

        if qas_id not in preds:
            print("Missing prediction for %s" % qas_id)
            continue

        prediction = preds[qas_id]
        exact_scores[qas_id] = max(
            compute_exact(a, prediction) for a in gold_answers)
        f1_scores[qas_id] = max(
            ONE_METRIC_ROUGE_L(a, prediction) for a in gold_answers)

    return make_eval_dict(exact_scores, f1_scores)
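A usage sketch, assuming normalize_answer, compute_exact, and make_eval_dict come from transformers.data.metrics.squad_metrics (or equivalent local helpers) and that ONE_METRIC_ROUGE_L is supplied by the surrounding project; the path and oracle-style predictions are placeholders.

# Hypothetical usage sketch; ONE_METRIC_ROUGE_L is not part of transformers.
from transformers.data.processors.squad import SquadV2Processor

processor = SquadV2Processor()
examples = processor.get_dev_examples("data/", filename="dev-v2.0.json")
preds = {ex.qas_id: (ex.answers[0]["text"] if ex.answers else "")
         for ex in examples}  # oracle-style placeholder predictions
results = gf_squad_evaluate(examples, preds)
print(results["exact"], results["f1"], results["total"])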
Example #4
def load_nbest(file_path, metrics, qid2answers):
    """Load (and clean) an n-best predictions file."""
    with open(file_path, "r") as f:
        data = json.load(f)

    all_examples = {}
    max_labels = 0
    for qid, nbest in data.items():
        # Skip if not in qid2answers.
        if qid not in qid2answers:
            continue

        # Resolve spans from multiple overlapping windows.
        text2label = {}
        for label in nbest:
            text = squad_metrics.normalize_answer(label["text"])
            label["text"] = text
            if text in text2label:
                if label["rerank_logit"] > text2label[text]["rerank_logit"]:
                    text2label[text] = label
            else:
                text2label[text] = label

        # Deduplicate and sort by descending probability.
        labels = sorted(text2label.values(), key=lambda y: -y["probability"])

        # Skip if no candidate matches a gold answer.
        if not any(label["text"] in qid2answers[qid] for label in labels):
            logging.info("Skipping example with no answer")
            continue

        # Renormalize probability over top-k and add rank.
        total_p = sum(y["probability"] for y in labels)
        start_p = scipy.special.softmax([y["start_logit"] for y in labels])
        end_p = scipy.special.softmax([y["end_logit"] for y in labels])
        for rank, y in enumerate(labels):
            y["probability"] /= total_p
            y["rank"] = rank
            y["sum"] = y["start_logit"] + y["end_logit"]
            y["start_prob"] = start_p[rank]
            y["end_prob"] = end_p[rank]
        all_examples[qid] = labels
        max_labels = max(len(labels), max_labels)

    logging.info("Maximum labels = %d", max_labels)
    return all_examples
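A usage sketch for the loader above; the file name is a placeholder, the per-candidate fields (text, probability, start_logit, end_logit, rerank_logit) are inferred from what the code reads, and json, logging, scipy, and squad_metrics are assumed to be imported at module level. Note that the metrics argument is unused in the snippet shown.

# Hypothetical usage sketch with a toy gold map and placeholder file name.
qid2answers = {"q1": {"denver broncos", "broncos"}}  # already-normalized gold answers
nbest = load_nbest("nbest_predictions.json", metrics=None, qid2answers=qid2answers)
for qid, labels in nbest.items():
    top = labels[0]  # highest renormalized probability after deduplication
    print(qid, repr(top["text"]), top["rank"], round(top["probability"], 3))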
Example #5
def get_squad_gold_answers(file_path):
    """Return a mapping of question ids to normalized gold answers."""
    qid2answers = {}
    with open(file_path, "r") as f:
        data = json.load(f)["data"]
    for article in data:
        for paragraph in article["paragraphs"]:
            for question in paragraph["qas"]:
                qid = question["id"]
                answers = set([
                    squad_metrics.normalize_answer(a["text"])
                    for a in question["answers"]
                ])
                if question.get("is_impossible"):
                    answers.add("")
                qid2answers[qid] = answers
    return qid2answers
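A usage sketch against a SQuAD v2-style file; the path is a placeholder, and json plus squad_metrics are assumed to be imported at module level.

# Hypothetical usage sketch (path is a placeholder).
qid2answers = get_squad_gold_answers("dev-v2.0.json")
unanswerable = sum(1 for answers in qid2answers.values() if "" in answers)
print(len(qid2answers), "questions,", unanswerable, "marked unanswerable")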
Example #6
def get_raw_scores(answers, pred):
    """Compute per-question exact-match, precision, recall, and F1 scores."""
    exact_scores = {}
    precision_scores = {}
    recall_scores = {}
    f1_scores = {}

    for qas_id in answers.keys():
        gold_answers = [answer for answer in answers[qas_id] if squad_metrics.normalize_answer(answer)]

        if not gold_answers:
            gold_answers = [""]

        if qas_id not in pred:
            print("Missing prediction for %s" % qas_id)
            continue

        prediction = pred[qas_id]
        exact_scores[qas_id] = max(squad_metrics.compute_exact(a, prediction) for a in gold_answers)

        max_f1_score = None
        final_precision_score = None
        final_recall_score = None

        for a in gold_answers:
            precision_score, recall_score, f1_score = compute_f1(a, prediction)

            if max_f1_score is None or f1_score > max_f1_score:
                max_f1_score = f1_score
                final_precision_score = precision_score
                final_recall_score = recall_score

        precision_scores[qas_id] = final_precision_score
        recall_scores[qas_id] = final_recall_score
        f1_scores[qas_id] = max_f1_score

    return exact_scores, precision_scores, recall_scores, f1_scores
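A usage sketch with toy inputs. get_raw_scores expects a compute_f1 that returns a (precision, recall, f1) triple, unlike transformers' squad_metrics.compute_f1, which returns only the F1 value; the stand-in below, built on squad_metrics.get_tokens, is an assumption for illustration, not the project's own implementation.

# Hypothetical usage sketch; compute_f1 here is an illustrative stand-in.
import collections
from transformers.data.metrics import squad_metrics

def compute_f1(a_gold, a_pred):
    # Token-level precision/recall/F1 over normalized answers (stand-in).
    gold_toks = squad_metrics.get_tokens(a_gold)
    pred_toks = squad_metrics.get_tokens(a_pred)
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        score = float(gold_toks == pred_toks)
        return score, score, score
    if num_same == 0:
        return 0.0, 0.0, 0.0
    precision = num_same / len(pred_toks)
    recall = num_same / len(gold_toks)
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1

answers = {"q1": ["Barack Obama", "Obama"], "q2": []}  # toy gold answers
pred = {"q1": "barack obama", "q2": ""}                # toy predictions
exact, precision, recall, f1 = get_raw_scores(answers, pred)
print(exact["q1"], precision["q1"], recall["q1"], f1["q1"])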