def remove_exact_duplicates(self, input_list):
    """Drop QA entries whose normalized question has already been seen.

    Questions are compared after being passed through ``normalize``. For
    every distinct normalized question only the entry that appears first
    in ``input_list`` is kept.

    Args:
        input_list: iterable of mappings, each providing at least the
            ``'question'`` and ``'id'`` keys.

    Returns:
        A new list of the surviving entries, ordered by their ``'id'``.
    """
    first_by_question = {}
    for entry in input_list:
        # setdefault never overwrites, so the earliest entry for a key wins.
        first_by_question.setdefault(normalize(entry['question']), entry)

    return sorted(first_by_question.values(), key=lambda entry: entry['id'])
Beispiel #2
0
    def remove_exact_duplicate_questions(self, input_list):
        """Keep one QA entry per distinct normalized question.

        Two entries count as duplicates when ``normalize`` maps their
        ``'question'`` values to the same key. The first entry seen for a
        key is retained and later ones are discarded; per the original
        author's note, the answering model sees the same
        (paragraph, question) input for duplicates, so their answers would
        be the same anyway.

        Args:
            input_list: iterable of mappings, each providing at least the
                ``'question'`` and ``'id'`` keys.

        Returns:
            A new list of the retained entries, ordered by their ``'id'``.
        """
        seen_questions = set()
        kept = []

        for entry in input_list:
            question_key = normalize(entry['question'])
            if question_key in seen_questions:
                # A previous entry already represents this question.
                continue
            seen_questions.add(question_key)
            kept.append(entry)

        kept.sort(key=lambda entry: entry['id'])
        return kept
Beispiel #3
0
    def remove_exact_duplicate_answers(self,
                                       input_list,
                                       question_type="specific"):
        """Ensure every GENERAL / SPECIFIC predicted answer is unique.

        Entries of the type selected by ``question_type`` are grouped by
        their normalized ``'predicted_answer'``; from each group only the
        best-scoring entry is kept. Entries of the other type, and entries
        flagged as unanswerable, are passed through untouched.

        NOTE(review): the original docstring says this operates "within each
        paragraph"; no per-paragraph grouping happens here, so presumably
        the caller passes the entries of a single paragraph -- confirm.

        Args:
            input_list: iterable of mappings providing the ``'unanswerable'``,
                ``'algorithm'``, ``'predicted_answer'`` and ``'id'`` keys, plus
                the score key selected for their algorithm
                (``'recall_match'`` or ``'precision_match'``).
            question_type: ``"specific"`` to de-duplicate entries while
                passing through those whose ``'algorithm'`` contains
                ``"general"``; any other value passes through those whose
                ``'algorithm'`` contains ``"specific"`` instead.

        Returns:
            A new list of the retained entries, ordered by their ``'id'``.

        Raises:
            KeyError: if a de-duplicated entry has an ``'algorithm'`` value
                other than ``general_sent``, ``specific_sent`` or
                ``specific_entity``.
        """
        # Which score decides the winner among duplicates, per algorithm.
        score_field = {
            "general_sent": "recall_match",
            "specific_sent": "precision_match",
            "specific_entity": "recall_match"
        }
        passthrough_type = "general" if question_type == "specific" else "specific"

        kept = []
        by_answer = defaultdict(list)
        for entry in input_list:
            # Unanswerable questions are deliberately not de-duplicated here:
            # the answering module could have made mistakes, and they are
            # handled later in the filtering phase depending on user settings.
            if entry['unanswerable'] is True or passthrough_type in entry["algorithm"]:
                kept.append(entry)
            else:
                by_answer[normalize(entry['predicted_answer'])].append(entry)

        def rank(entry):
            # Highest score first; ties are broken by the smaller id.
            return (-entry[score_field[entry['algorithm']]], entry['id'])

        for group in by_answer.values():
            kept.append(sorted(group, key=rank)[0])

        kept.sort(key=lambda entry: entry['id'])
        return kept