    def write_sentence_predictions(all_examples,
                                   all_features: List[QAFullInputFeatures],
                                   all_results: List[WeightResult],
                                   output_prediction_file=None,
                                   null_score_diff_threshold=0.0):
        """Write final predictions to the json file and log-odds of null if needed."""
        logger.info("Writing predictions to: %s" % output_prediction_file)

        sentence_pred = utils.AverageMeter()

        # Group features (doc spans) by the example they come from.
        example_index_to_features = collections.defaultdict(list)
        for feature in all_features:
            example_index_to_features[feature.example_index].append(feature)

        # Index model outputs by feature unique_id.
        unique_id_to_result = {}
        for result in all_results:
            unique_id_to_result[result.unique_id] = result

        all_predictions = collections.OrderedDict()

        for (example_index, example) in enumerate(all_examples):
            features = example_index_to_features[example_index]

            # Pick the doc-span feature whose strongest sentence weight is largest.
            max_weight_logit = 0
            max_weight_feature_index = -1
            max_weight_index = -1
            for (feature_index, feature) in enumerate(features):
                result = unique_id_to_result[feature.unique_id]
                feature_max_weight = result.max_weight
                if feature_max_weight > max_weight_logit:
                    max_weight_logit = feature_max_weight
                    max_weight_feature_index = feature_index
                    max_weight_index = result.max_weight_index

            target_feature = features[max_weight_feature_index]
            sentence_id = target_feature.meta_data['span_sen_to_orig_sen_map'][
                max_weight_index]
            target_result = unique_id_to_result[target_feature.unique_id]
            all_predictions[example.qas_id] = {
                'sentence_id': sentence_id,
                'max_weight': max_weight_logit,
                'max_weight_index': max_weight_index,
                'sentence_logits': target_result.sentence_logits,
                'feature_gold_sentence': target_feature.sentence_id
            }

            if sentence_id == example.sentence_id:
                sentence_pred.update(1, 1)
            else:
                sentence_pred.update(0, 1)

        logger.info('Sentence Prediction Result: {}'.format(sentence_pred.avg))

        if output_prediction_file is not None:
            with open(output_prediction_file, 'w') as f:
                json.dump(all_predictions, f, indent=2)
        return {'predictions': all_predictions, 'acc': sentence_pred.avg}
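
    # A minimal sketch of the running-accuracy helper assumed by the prediction writers and the
    # reader in this listing.  utils.AverageMeter's actual implementation is not shown here; the
    # calls above only rely on an update / avg / sum / count interface along these lines.
    class _AverageMeterSketch:
        """Keeps a running mean over values reported in batches of size n."""

        def __init__(self):
            self.sum = 0.0
            self.count = 0
            self.avg = 0.0

        def update(self, val, n=1):
            # `val` is the (mean) value for a batch of `n` items.
            self.sum += val * n
            self.count += n
            self.avg = self.sum / self.count
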
    def read(self,
             input_file,
             dialog_turns: int = 2,
             remove_evidence=False,
             remove_question=False,
             remove_passage=False,
             remove_dict=None):
        """
        :param input_file: input file to load data. The format is in CoQA style
        :param dialog_turns:  Decide how many turns' questions and answers will be appended before current question.
        """
        logger.info('Reading data set from {}...'.format(input_file))
        logger.info('Read parameters:')
        logger.info('Dialog turns: {}'.format(dialog_turns))
        logger.info('Remove evidence during test: {}'.format(remove_evidence))
        logger.info('Remove dict: {}'.format(remove_dict))
        with open(input_file, "r", encoding='utf-8') as reader:
            input_data = json.load(reader)['data']

        def is_whitespace(ch):
            if ch == " " or ch == "\t" or ch == "\r" or ch == "\n" or ord(
                    ch) == 0x202F:
                return True
            return False

        if remove_dict is None:
            remove_dict = {}
        else:
            with open(remove_dict, 'r') as f:
                remove_dict = json.load(f)
            logger.info('Loaded remove_dict covering {} stories.'.format(len(remove_dict)))

        rule_labels_acc = utils.AverageMeter()

        examples = []
        for paragraph in input_data:
            paragraph_text = paragraph["story"]
            story_id = paragraph['id']
            # Whitespace-tokenize the passage and record, for each character, the index of its token.
            doc_tokens = []
            prev_is_whitespace = True
            char_to_word_offset = []
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            # Split context into sentences
            sentence_start_list, sentence_end_list = utils.split_sentence(
                paragraph_text, self.sentence_tokenizer)
            sentence_span_list = []
            for c_start, c_end in zip(sentence_start_list, sentence_end_list):
                t_start = char_to_word_offset[c_start]
                t_end = char_to_word_offset[c_end]
                sentence_span_list.append((t_start, t_end))

            questions = paragraph['questions']
            answers = paragraph['answers']
            for i, (question, answer) in enumerate(zip(questions, answers)):
                question_text = question['input_text']

                # We are only concerned about questions with Yes/No as answers
                answer_type = utils.normalize_answer(answer['input_text'])
                if answer_type not in ['yes', 'no']:
                    continue

                if story_id in remove_dict and str(i + 1) in remove_dict[story_id]:
                    continue

                if answer_type == 'yes':
                    answer_choice = 0
                else:
                    answer_choice = 1

                # Prepend the previous `dialog_turns` turns' questions and answers to the current question.
                for j in range(dialog_turns):
                    pre_idx = i - (j + 1)
                    if pre_idx >= 0:
                        question_text = questions[pre_idx]['input_text'] + '<Q>' + \
                            answers[pre_idx]['input_text'] + '<A>' + question_text

                qas_id = story_id + '--' + str(i + 1)

                # Add rationale start and end as extra supervision labels.
                rationale_start_position = char_to_word_offset[answer['span_start']]
                rationale_end_position = char_to_word_offset[answer['span_end'] - 1]

                sentence_id = utils.find_evidence_sentence(
                    sentence_span_list, rationale_start_position,
                    rationale_end_position)

                # Remove the evidence sentence (ablation experiment); only applied during evaluation.
                if remove_evidence and sentence_id != -1 and 'train' not in input_file:
                    evi_token_s, evi_token_e = sentence_span_list[sentence_id]
                    new_doc_tokens = doc_tokens[:evi_token_s] + doc_tokens[(evi_token_e + 1):]
                    rationale_start_position = rationale_end_position = -1
                    reduce_offset = evi_token_e - evi_token_s + 1
                    # Shift the spans of the sentences that follow the removed one.
                    new_sentence_span_list = sentence_span_list[:sentence_id] + [
                        (s - reduce_offset, e - reduce_offset)
                        for s, e in sentence_span_list[(sentence_id + 1):]
                    ]
                    sentence_id = -1
                else:
                    new_doc_tokens = doc_tokens
                    new_sentence_span_list = sentence_span_list

                # If a rule-generated (pseudo) evidence label is provided, track its agreement
                # with the heuristic label and use it as the supervision signal instead.
                if 'sentence_id' in question:
                    pseudo_sentence_id = question['sentence_id']
                    if pseudo_sentence_id == sentence_id:
                        rule_labels_acc.update(1)
                    else:
                        rule_labels_acc.update(0)
                    sentence_id = pseudo_sentence_id

                # example = SQuADFullExample(
                #     qas_id=qas_id,
                #     question_text=question_text,
                #     doc_tokens=doc_tokens,
                #     sentence_span_list=sentence_span_list,
                #     orig_answer_text="",
                #     start_position=None,
                #     end_position=None,
                #     sentence_id=sentence_id,
                #     is_impossible=answer_choice,
                #     ral_start_position=rationale_start_position,
                #     ral_end_position=rationale_end_position)
                example = SQuADFullExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=new_doc_tokens,
                    sentence_span_list=new_sentence_span_list,
                    orig_answer_text="",
                    start_position=None,
                    end_position=None,
                    sentence_id=sentence_id,
                    is_impossible=answer_choice,
                    ral_start_position=rationale_start_position,
                    ral_end_position=rationale_end_position)
                examples.append(example)

        if rule_labels_acc.count > 0:
            logger.info('Read labels generated by rules.')
            logger.info(f'Accuracy of labels: {rule_labels_acc.avg}')
        return examples
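
    # A minimal, hypothetical example of the CoQA-style record that read() above expects.  Only
    # the fields the reader actually touches are shown, with made-up values; real CoQA files carry
    # additional fields.  An optional 'sentence_id' on a question is the rule-generated pseudo
    # evidence label consumed near the end of read().
    _EXAMPLE_COQA_RECORD = {
        "data": [
            {
                "id": "story-001",
                "story": "Tom has a black dog. The dog likes to run in the park.",
                "questions": [
                    {"input_text": "Does Tom have a dog?"}
                ],
                "answers": [
                    # span_start / span_end are character offsets of the rationale inside "story".
                    {"input_text": "yes", "span_start": 0, "span_end": 20}
                ]
            }
        ]
    }
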
    def write_sentence_predictions(all_examples,
                                   all_features: List[QAFullInputFeatures],
                                   all_results: List[WeightResult],
                                   output_prediction_file=None,
                                   null_score_diff_threshold=0.0):
        logger.info("Writing evidence id predictions to: %s" %
                    output_prediction_file)

        sentence_pred = utils.AverageMeter()
        yesno_pred = utils.AverageMeter()

        example_index_to_features = collections.defaultdict(list)
        for feature in all_features:
            example_index_to_features[feature.example_index].append(feature)

        unique_id_to_result = {}
        for result in all_results:
            unique_id_to_result[result.unique_id] = result

        all_predictions = collections.OrderedDict()

        for (example_index, example) in enumerate(all_examples):
            features = example_index_to_features[example_index]

            # Choose the doc span whose non-null choice logits most exceed the null logit.
            max_diff = -1000000
            max_diff_feature_index = 0
            for (feature_index, feature) in enumerate(features):
                result = unique_id_to_result[feature.unique_id]
                choice_logits = result.choice_logits
                non_null_logit = choice_logits[1] + choice_logits[2]
                null_logit = choice_logits[0]
                diff = non_null_logit - null_logit
                if diff > max_diff:
                    max_diff = diff
                    max_diff_feature_index = feature_index

            target_feature = features[max_diff_feature_index]
            target_result = unique_id_to_result[target_feature.unique_id]
            sentence_id_pred = target_feature.meta_data[
                'span_sen_to_orig_sen_map'][target_result.max_weight_index]

            # Softmax over the two non-null choice logits (choice_logits[0] is the null logit).
            target_choice_prob = utils.compute_softmax([
                target_result.choice_logits[1], target_result.choice_logits[2]
            ])
            if target_choice_prob[0] > target_choice_prob[1]:
                target_choice = 0
            else:
                target_choice = 1

            if target_choice == example.is_impossible:
                yesno_pred.update(1, 1)
            else:
                yesno_pred.update(0, 1)

            if sentence_id_pred == example.sentence_id:
                sentence_pred.update(1, 1)
            else:
                sentence_pred.update(0, 1)

            # Find top 5 sentences
            sentence_logits = target_result.sentence_logits
            sentence_num = len(target_feature.sentence_span_list)
            sentence_logits_item = [
                (i, x) for i, x in enumerate(sentence_logits[:sentence_num])
            ]

            # Locate the gold sentence inside this doc span (if present) to record its logit.
            gold_span_sen_id = -1
            for p_sen_idx, orig_sen_id in enumerate(
                    target_feature.meta_data["span_sen_to_orig_sen_map"]):
                if orig_sen_id == example.sentence_id:
                    gold_span_sen_id = p_sen_idx
            if gold_span_sen_id != -1:
                gold_span_sen_logit = target_result.sentence_logits[
                    gold_span_sen_id]
            else:
                gold_span_sen_logit = 0

            sorted_sentence_logits_item = sorted(sentence_logits_item,
                                                 key=lambda x: x[1],
                                                 reverse=True)
            top_5_sentences = [
                target_feature.meta_data["span_sen_to_orig_sen_map"][x[0]]
                for x in sorted_sentence_logits_item[:5]
            ]

            assert sentence_id_pred in top_5_sentences

            all_predictions[example.qas_id] = {
                'sentence_id': sentence_id_pred,
                'gold_sentence_logit': gold_span_sen_logit,
                'max_weight': target_result.max_weight,
                'max_weight_index': target_result.max_weight_index,
                'yesno_pred': target_choice,
                'yesno_gold': example.is_impossible,
                'sentence_logits': target_result.sentence_logits,
                'feature_gold_sentence': target_feature.sentence_id,
                'gold_sentence_id': example.sentence_id,
                'top_5_sentence_ids': top_5_sentences,
                'yesno_pred_prob': target_choice_prob,
                'doc_span_index': target_feature.doc_span_index,
                'span_sen_to_orig_sen_map': target_feature.meta_data['span_sen_to_orig_sen_map']
            }

        logger.info(f'Yesno Prediction Result: {yesno_pred.avg}')
        logger.info('Sentence Prediction Result: {}'.format(sentence_pred.avg))

        if output_prediction_file is not None:
            with open(output_prediction_file, 'w') as f:
                json.dump(all_predictions, f, indent=2)
        return {'predictions': all_predictions, 'acc': sentence_pred.avg}
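
    # utils.compute_softmax is not shown in this listing.  The call sites above pass a list of
    # logits and treat the return value as a list of probabilities, so a numerically stable
    # implementation along these lines is assumed (a sketch, not the project's own code).
    def _compute_softmax_sketch(scores):
        import math
        if not scores:
            return []
        max_score = max(scores)
        exp_scores = [math.exp(s - max_score) for s in scores]
        total = sum(exp_scores)
        return [x / total for x in exp_scores]
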
    def write_sentence_predictions(all_examples, all_features: List[QAFullInputFeatures], all_results: List[FullResult],
                                   output_prediction_file=None, weight_threshold: float = 0.0, label_threshold: float = 0.0):
        logger.info("Writing evidence id predictions to: %s" % output_prediction_file)

        sentence_pred_cnt = utils.AverageMeter()

        example_index_to_features = collections.defaultdict(list)
        for feature in all_features:
            example_index_to_features[feature.example_index].append(feature)

        unique_id_to_result = {}
        for result in all_results:
            unique_id_to_result[result.unique_id] = result

        all_predictions = collections.OrderedDict()

        for (example_index, example) in enumerate(all_examples):
            features = example_index_to_features[example_index]

            max_diff = -1000000
            max_diff_feature_index = 0
            for (feature_index, feature) in enumerate(features):
                result = unique_id_to_result[feature.unique_id]
                choice_logits = result.choice_logits
                non_null_logit = choice_logits[1] + choice_logits[2]
                null_logit = choice_logits[0]
                diff = non_null_logit - null_logit
                if diff > max_diff:
                    max_diff = diff
                    max_diff_feature_index = feature_index

            target_feature = features[max_diff_feature_index]
            target_result = unique_id_to_result[target_feature.unique_id]
            sentence_id_pred = target_feature.meta_data['span_sen_to_orig_sen_map'][target_result.max_weight_index]
            # Discard the evidence prediction if its weight does not exceed the threshold.
            if target_result.max_weight <= weight_threshold:
                sentence_id_pred = -1

            target_choice_prob = utils.compute_softmax([target_result.choice_logits[1], target_result.choice_logits[2]])
            if target_choice_prob[0] > target_choice_prob[1]:
                target_choice = 0
            else:
                target_choice = 1
            # Keep the evidence prediction only if the yes/no prediction is correct and confident enough.
            if target_choice == example.is_impossible:
                if target_choice_prob[target_choice] <= label_threshold:
                    sentence_id_pred = -1
            else:
                sentence_id_pred = -1

            # Count how many examples received a pseudo label; unlabeled ones are skipped.
            if sentence_id_pred != -1:
                sentence_pred_cnt.update(val=1, n=1)
            else:
                sentence_pred_cnt.update(val=0, n=1)
                continue

            all_predictions[example.qas_id] = {
                'sentence_id': sentence_id_pred,
                'max_weight': target_result.max_weight,
                'doc_span_index': target_feature.doc_span_index,
                'doc_span_sentence_id': target_result.max_weight_index
            }

        assert sentence_pred_cnt.sum == len(all_predictions)
        logger.info(f'Labeled evidence ids {sentence_pred_cnt.sum}/{sentence_pred_cnt.count} = {sentence_pred_cnt.avg} in total.')

        if output_prediction_file is not None:
            with open(output_prediction_file, 'w') as f:
                json.dump(all_predictions, f, indent=2)
        return all_predictions
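
    # Hypothetical glue code (names and file paths are illustrative only): the thresholded
    # predictions written above are keyed by qas_id = story_id + '--' + str(turn + 1), and read()
    # earlier in this listing picks up pseudo evidence labels from question['sentence_id'].  A
    # sketch of merging the two for a self-training round could look like this.
    def _inject_pseudo_sentence_ids(coqa_file, prediction_file, output_file):
        import json
        with open(coqa_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        with open(prediction_file, 'r', encoding='utf-8') as f:
            predictions = json.load(f)
        for paragraph in data['data']:
            story_id = paragraph['id']
            for turn, question in enumerate(paragraph['questions']):
                qas_id = story_id + '--' + str(turn + 1)
                if qas_id in predictions:
                    # Attach the predicted evidence sentence as a pseudo label for read().
                    question['sentence_id'] = predictions[qas_id]['sentence_id']
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)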