Example #1
 def test_overlap_in_correct_cases(self):
     assert get_metrics(["Green bay packers"],
                        ["Green bay packers"]) == (1.0, 1.0)
     assert get_metrics(["Green bay", "packers"],
                        ["Green bay", "packers"]) == (1.0, 1.0)
     assert get_metrics(["Green", "bay", "packers"],
                        ["Green", "bay", "packers"]) == (1.0, 1.0)
Example #2
    def test_multi_span_overlap_in_incorrect_cases(self):
        # only consider bags with matching numbers if they are present
        assert get_metrics(["78-yard", "56", "28", "40", "44", "touchdown"],
                           ["78-yard", "56 yard", "1 yard touchdown"]) == (0.0, 0.56)

        # two copies of the same value account for only one match (greedy 1-1 bag alignment)
        assert get_metrics(["23", "23 yard"],
                           ["23-yard", "56 yards"]) == (0.0, 0.5)

        # matching is done at the individual-span level, not pooled into one global bag
        assert get_metrics(["John Karman", "Joe Hardy"],
                           ["Joe Karman", "John Hardy"]) == (0.0, 0.5)

        # macro-averaging F1 over spans
        assert get_metrics(["ottoman", "Kantakouzenous"],
                           ["ottoman", "army of Kantakouzenous"]) == (0.0, 0.75)
Example #3
 def evaluate_logical_form(self, logical_form: str,
                           answer_json: JsonDict) -> Tuple[float, float]:
     """
     Takes a logical form and the answer dict from the original dataset, and returns exact-match
     and F1 measures according to the two official metrics.
     """
     answer_string, _ = drop_eval.answer_json_to_strings(answer_json)
     if not self.verbose:
         executor_logger = logging.getLogger(
             'semparse.language.drop_executor')
         executor_logger.setLevel(logging.ERROR)
     try:
         denotation = self.execute(logical_form)
     except Exception:  # pylint: disable=broad-except
         if self.verbose:
             LOGGER.warning(f'Failed to execute: {logical_form}')
         return 0.0, 0.0
     if isinstance(denotation, list):
         denotation_list = [
             str(denotation_item) for denotation_item in denotation
         ]
     else:
         denotation_list = [str(denotation)]
     em_score, f1_score = drop_eval.get_metrics(denotation_list,
                                                answer_string)
     return em_score, f1_score
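
For context, the tail end of this method reduces to two calls that are visible above: drop_eval.answer_json_to_strings turns the dataset's answer dict into gold strings, and drop_eval.get_metrics scores the stringified denotation against them. A minimal sketch of that tail end follows; the denotation value and the answer dict are invented, and the import path is an assumption about where the drop_eval module used above lives.

from allennlp.tools import drop_eval  # assumed location of the drop_eval module referenced above

denotation = [3]  # made-up executor output standing in for self.execute(logical_form)
answer_json = {"number": "3", "spans": [], "date": {"day": "", "month": "", "year": ""}}  # DROP-style answer dict
answer_string, _ = drop_eval.answer_json_to_strings(answer_json)
print(drop_eval.get_metrics([str(item) for item in denotation], answer_string))  # should be (1.0, 1.0)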
Example #4
 def test_simple_overlap_in_incorrect_cases(self):
     assert get_metrics([""], ["army"]) == (0.0, 0.0)
     assert get_metrics(["packers"], ["Green bay packers"]) == (0.0, 0.5)
     assert get_metrics(["packers"], ["Green bay"]) == (0.0, 0.0)
     # if the numbers in the span don't match, F1 is 0
     assert get_metrics(["yard"], ["36 yard td"]) == (0.0, 0.0)
     assert get_metrics(["23 yards"], ["43 yards"]) == (0.0, 0.0)
     # however, if the number matches, it's not given extra weight over the non-number words
     assert get_metrics(["56 yards"], ["56 yd"]) == (0.0, 0.5)
     assert get_metrics(["26"], ["26 yard td"]) == (0.0, 0.5)
Example #5
    def test_metric_is_length_aware(self):
        # Overall F1 should be mean([1.0, 0.0]) = 0.5
        assert get_metrics(predicted=["td"], gold=["td", "td"]) == (0.0, 0.5)
        assert get_metrics("td", ["td", "td"]) == (0.0, 0.5)
        # Overall F1 should be mean([1.0, 0.0]) = 0.5
        assert get_metrics(predicted=["td", "td"], gold=["td"]) == (0.0, 0.5)
        assert get_metrics(predicted=["td", "td"], gold="td") == (0.0, 0.5)

        # F1 score is mean([0.0, 0.0, 1.0, 0.0, 0.0, 0.0]) = 0.17 (rounded)
        assert get_metrics(predicted=["the", "fat", "cat", "the fat", "fat cat", "the fat cat"],
                           gold=["cat"]) == (0.0, 0.17)
        assert get_metrics(predicted=["cat"],
                           gold=["the", "fat", "cat", "the fat", "fat cat", "the fat cat"]) == (0.0, 0.17)
        # F1 score is mean([1.0, 0.5, 0.0, 0.0, 0.0, 0.0]) = 0.25
        assert get_metrics(predicted=["the", "fat", "cat", "the fat", "fat cat", "the fat cat"],
                           gold=["cat", "cat dog"]) == (0.0, 0.25)
Example #6
def evaluate_json(annotations: Dict[str, Any],
                  predicted_answers: Dict[str, Any]) -> Tuple[float, float]:
    """
    Takes gold annotations and predicted answers and evaluates the predictions for each question
    in the gold annotations.  Both JSON dictionaries must have query_id keys, which are used to
    match predictions to gold annotations.

    The ``predicted_answers`` JSON must be a dictionary keyed by query id, where the value is a
    list of strings (or just one string) that is the answer.
    The ``annotations`` are assumed to have either the format of the dev set in the Quoref data
    release, or the same format as the predicted answers file.
    """
    instance_exact_match = []
    instance_f1 = []
    if "data" in annotations:
        # We're looking at annotations in the original data format. Let's extract the answers.
        annotated_answers = _get_answers_from_data(annotations)
    else:
        annotated_answers = annotations
    for query_id, candidate_answers in annotated_answers.items():
        max_em_score = 0.0
        max_f1_score = 0.0
        if query_id in predicted_answers:
            predicted = predicted_answers[query_id]
            gold_answer = tuple(candidate_answers)
            em_score, f1_score = drop_eval.get_metrics(predicted, gold_answer)
            if gold_answer[0].strip() != "":
                max_em_score = max(max_em_score, em_score)
                max_f1_score = max(max_f1_score, f1_score)
        else:
            print("Missing prediction for question: {}".format(query_id))
            max_em_score = 0.0
            max_f1_score = 0.0
        instance_exact_match.append(max_em_score)
        instance_f1.append(max_f1_score)

    global_em = np.mean(instance_exact_match)
    global_f1 = np.mean(instance_f1)
    print("Exact-match accuracy {0:.2f}".format(global_em * 100))
    print("F1 score {0:.2f}".format(global_f1 * 100))
    print("{0:.2f}   &   {1:.2f}".format(global_em * 100, global_f1 * 100))
    return global_em, global_f1
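
A minimal, hedged sketch of calling evaluate_json with annotations already in the "same format as the predicted answers file" mentioned in the docstring; the query ids and answer strings are invented for illustration.

# Hypothetical inputs; query ids and answers are invented.
predicted = {"q1": "Green Bay Packers", "q2": ["56 yards"]}
gold = {"q1": ["Green Bay Packers"], "q2": ["56 yd"]}  # same shape as a predictions file
global_em, global_f1 = evaluate_json(gold, predicted)
# Given the behaviours asserted in the tests above, this should print and return EM 0.5 and F1 0.75.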
Example #7
def get_instance_metrics(annotations: Dict[str, Any],
                         predicted_answers: Dict[str, Any]
                         ) -> Tuple[Dict[str, Tuple[float, float]], Optional[Dict[str, Any]]]:
    """
    Takes gold annotations and predicted answers and evaluates the predictions for each question
    in the gold annotations.  Both JSON dictionaries must have query_id keys, which are used to
    match predictions to gold annotations.

    The ``predicted_answers`` JSON must be a dictionary keyed by query id, where the value is a
    list of strings (or just one string) that is the answer.
    The ``annotations`` are assumed to have either the format of the dev set in the Quoref data
    release, or the same format as the predicted answers file.

    Returns per-question (exact match, F1) scores keyed by query id, together with the questions
    dict produced by ``_get_questions_and_answers_from_data`` when the original data format was
    passed (``None`` otherwise).
    """
    instance_metrics: Dict[str, Tuple[float, float]] = {}
    if "data" in annotations:
        # We're looking at annotations in the original data format. Let's extract the answers.
        annotated_answers, questions_dict = _get_questions_and_answers_from_data(annotations)
    else:
        questions_dict = None
        annotated_answers = annotations
    for query_id, candidate_answers in annotated_answers.items():
        max_em_score = 0.0
        max_f1_score = 0.0
        if query_id in predicted_answers:
            predicted = predicted_answers[query_id]
            gold_answer = tuple(candidate_answers)
            em_score, f1_score = drop_eval.get_metrics(predicted, gold_answer)
            if gold_answer[0].strip() != "":
                max_em_score = max(max_em_score, em_score)
                max_f1_score = max(max_f1_score, f1_score)
        else:
            print("Missing prediction for question: {}".format(query_id))
            max_em_score = 0.0
            max_f1_score = 0.0
        instance_metrics[query_id] = max_em_score, max_f1_score

    return instance_metrics, questions_dict
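
A matching hedged sketch for the per-question variant, reusing the invented gold and predicted dicts from the evaluate_json sketch above; when the annotations are not in the raw Quoref data format, the second return value is None.

# Hypothetical usage; gold and predicted are the invented dicts from the sketch above.
per_question, questions = get_instance_metrics(gold, predicted)
for query_id, (em, f1) in per_question.items():
    question_text = questions[query_id] if questions else ""
    print(query_id, em, f1, question_text)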
Example #8
 def test_casing_is_ignored(self):
     assert get_metrics(["This was a triumph"],
                        ["tHIS Was A TRIUMPH"]) == (1.0, 1.0)
Example #9
 def test_splitting_on_hyphens(self):
     assert get_metrics(["78-yard"], ["78 yard"]) == (1.0, 1.0)
     assert get_metrics(["78 yard"], ["78-yard"]) == (1.0, 1.0)
     assert get_metrics(["78"], ["78-yard"]) == (0.0, 0.67)
     assert get_metrics(["78-yard"], ["78"]) == (0.0, 0.67)
Example #10
 def test_periods_commas_and_spaces_are_ignored(self):
     assert get_metrics(["Per.i.o.d...."],
                        [".P....e.r,,i;;;o...d,,"]) == (1.0, 1.0)
     assert get_metrics(["Spa     c  e   s     "],
                        ["    Spa c     e s"]) == (1.0, 1.0)
Example #11
 def test_f1_ignores_word_order(self):
     assert get_metrics(["John Elton"], ["Elton John"]) == (0.0, 1.0)
     assert get_metrics(["50 yard"], ["yard 50"]) == (0.0, 1.0)
     assert get_metrics(["order word right"],
                        ["right word order"]) == (0.0, 1.0)
Example #12
 def test_articles_are_ignored(self):
     assert get_metrics(["td"], ["the td"]) == (1.0, 1.0)
     assert get_metrics(["the a NOT an ARTICLE the an a"],
                        ["NOT ARTICLE"]) == (1.0, 1.0)
Example #13
 def test_float_numbers(self):
     assert get_metrics(["78"], ["78.0"]) == (1.0, 1.0)
Example #14
 def test_order_invariance(self):
     assert get_metrics(["a"], ["a", "b"]) == (0, 0.5)
     assert get_metrics(["b"], ["a", "b"]) == (0, 0.5)
     assert get_metrics(["b"], ["b", "a"]) == (0, 0.5)