# Imports inferred from the usage below; the feature/result classes, `logger`, and the
# `utils` helpers come from the surrounding codebase and are not shown in this snippet.
import collections
import json
from typing import List

def predict_sentence_ids(all_examples,
                         all_features: List[SQuADFullInputFeatures],
                         all_results: List[WeightResultChoice],
                         output_prediction_file=None,
                         weight_threshold: float = 0.0,
                         only_correct: bool = False,
                         label_threshold: float = 0.0):
        """
        :param all_results:
        :param all_examples:
        :param all_features:
        :param output_prediction_file:
        :param weight_threshold: The threshold for attention weights, only id predictions with a higher weight than this will
               be added.
        :param only_correct: If true, only id predictions with final choices predicted which are correct will be added.
                Otherwise, all the id predictions will be added no matter if the yes/no prediction is correct.
        :param label_threshold: Only make sense while only_correct=True, which means that only the id predictions with true
               yes/no prediction and the probability for yes/no is higher than this will be added.
        :return:
        """
        logger.info("Predicting sentence id to: %s" % output_prediction_file)
        logger.info("Weight threshold: {}".format(weight_threshold))
        logger.info("Use ids with true yes/no prediction only: {}".format(only_correct))
        logger.info("Yes/No prediction probability threshold : {}, only make sense while only_correct=True.".format(label_threshold))

        example_index_to_features = collections.defaultdict(list)
        for feature in all_features:
            example_index_to_features[feature.example_index].append(feature)

        unique_id_to_result = {}
        for result in all_results:
            unique_id_to_result[result.unique_id] = result

        all_predictions = collections.OrderedDict()

        total = 0
        no_label = 0
        correct = 0
        for (example_index, example) in enumerate(all_examples):
            features = example_index_to_features[example_index]

            max_diff = -1000000
            max_diff_feature_index = 0
            max_yes_logit = 0
            max_no_logit = 0
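            # Select the doc span whose non-null (yes + no) logit mass exceeds the null
            # logit by the largest margin; choice_logits is assumed to be ordered
            # [null, yes, no], as the indexing below implies.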
            for (feature_index, feature) in enumerate(features):
                result = unique_id_to_result[feature.unique_id]
                choice_logits = result.choice_logits
                non_null_logit = choice_logits[1] + choice_logits[2]
                null_logit = choice_logits[0]
                diff = non_null_logit - null_logit
                if diff > max_diff:
                    max_diff = diff
                    max_diff_feature_index = feature_index
                    max_yes_logit = choice_logits[1]
                    max_no_logit = choice_logits[2]

            # No null_score_diff_threshold filtering here ('unknown' is never emitted):
            # only questions with Yes/No answers are considered.
            target_feature = features[max_diff_feature_index]
            sentence_id = unique_id_to_result[target_feature.unique_id].max_weight_index
            max_weight = unique_id_to_result[target_feature.unique_id].max_weight

            # Yes/No prediction restriction
            yesno_scores = utils.compute_softmax([max_yes_logit, max_no_logit])
            if max_yes_logit > max_no_logit:
                final_choice = 0
                prediction_prob = yesno_scores[0]
            else:
                final_choice = 1
                prediction_prob = yesno_scores[1]

            if only_correct:
                if (final_choice + 1) != target_feature.is_impossible:
                    sentence_id = -1
                if prediction_prob <= label_threshold:
                    sentence_id = -1
            if max_weight <= weight_threshold:
                sentence_id = -1

            feature_doc_span_index = target_feature.doc_span_index
            all_predictions[example.qas_id] = {'sentence_id': sentence_id,
                                               'doc_span_index': feature_doc_span_index,
                                               'weight': max_weight,
                                               'choice_prediction': 'yes' if final_choice == 0 else 'no',
                                               'prediction_prob': prediction_prob}

            total += 1
            if sentence_id == -1:
                no_label += 1
            else:
                if sentence_id == target_feature.sentence_id:
                    correct += 1

        logger.info("Labeling {} instances of {} in total".format(total - no_label, len(all_examples)))
        logger.info(
            "Labeling accuracy: Correct / Total: {} / {},  {}".format(correct, total - no_label, correct * 1.0 / (total - no_label + 1e-15)))

        if output_prediction_file is not None:
            with open(output_prediction_file, 'w') as f:
                json.dump(all_predictions, f, indent=2)
        return all_predictions
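
# utils.compute_softmax is called above but not defined in this snippet. A minimal
# sketch of what it presumably computes (a numerically stable softmax over a list of
# scores); the signature follows the call sites above, the body is an assumption.
import math

def compute_softmax(scores):
    # Subtract the max before exponentiating to avoid overflow, then normalize.
    max_score = max(scores)
    exp_scores = [math.exp(s - max_score) for s in scores]
    total = sum(exp_scores)
    return [e / total for e in exp_scores]
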
    def write_sentence_predictions(all_examples,
                                   all_features: List[QAFullInputFeatures],
                                   all_results: List[WeightResult],
                                   output_prediction_file=None,
                                   null_score_diff_threshold=0.0):
        logger.info("Writing evidence id predictions to: %s" %
                    output_prediction_file)

        sentence_pred = utils.AverageMeter()
        yesno_pred = utils.AverageMeter()

        example_index_to_features = collections.defaultdict(list)
        for feature in all_features:
            example_index_to_features[feature.example_index].append(feature)

        unique_id_to_result = {}
        for result in all_results:
            unique_id_to_result[result.unique_id] = result

        all_predictions = collections.OrderedDict()

        for (example_index, example) in enumerate(all_examples):
            features = example_index_to_features[example_index]

            max_diff = -1000000
            max_diff_feature_index = 0
            for (feature_index, feature) in enumerate(features):
                result = unique_id_to_result[feature.unique_id]
                choice_logits = result.choice_logits
                non_null_logit = choice_logits[1] + choice_logits[2]
                null_logit = choice_logits[0]
                diff = non_null_logit - null_logit
                if diff > max_diff:
                    max_diff = diff
                    max_diff_feature_index = feature_index

            target_feature = features[max_diff_feature_index]
            target_result = unique_id_to_result[target_feature.unique_id]
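            # span_sen_to_orig_sen_map maps a sentence index within this doc span back to
            # the sentence index in the original document (inferred from its usage here).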
            sentence_id_pred = target_feature.meta_data[
                'span_sen_to_orig_sen_map'][target_result.max_weight_index]

            target_choice_prob = utils.compute_softmax([
                target_result.choice_logits[1], target_result.choice_logits[2]
            ])
            if target_choice_prob[0] > target_choice_prob[1]:
                target_choice = 0
            else:
                target_choice = 1

            if target_choice == example.is_impossible:
                yesno_pred.update(1, 1)
            else:
                yesno_pred.update(0, 1)

            if sentence_id_pred == example.sentence_id:
                sentence_pred.update(1, 1)
            else:
                sentence_pred.update(0, 1)

            # Find top 5 sentences
            sentence_logits = target_result.sentence_logits
            sentence_num = len(target_feature.sentence_span_list)
            sentence_logits_item = list(enumerate(sentence_logits[:sentence_num]))

            # Locate the gold sentence inside this doc span and record its logit.
            gold_span_sen_id = -1
            for p_sen_idx, orig_sen_id in enumerate(
                    target_feature.meta_data["span_sen_to_orig_sen_map"]):
                if orig_sen_id == example.sentence_id:
                    gold_span_sen_id = p_sen_idx
            if gold_span_sen_id != -1:
                gold_span_sen_logit = target_result.sentence_logits[
                    gold_span_sen_id]
            else:
                gold_span_sen_logit = 0

            sorted_sentence_logits_item = sorted(sentence_logits_item,
                                                 key=lambda x: x[1],
                                                 reverse=True)
            top_5_sentences = [
                target_feature.meta_data["span_sen_to_orig_sen_map"][x[0]]
                for x in sorted_sentence_logits_item[:5]
            ]

            assert sentence_id_pred in top_5_sentences

            all_predictions[example.qas_id] = {
                'sentence_id': sentence_id_pred,
                'gold_sentence_logit': gold_span_sen_logit,
                'max_weight': target_result.max_weight,
                'max_weight_index': target_result.max_weight_index,
                'yesno_pred': target_choice,
                'yesno_gold': example.is_impossible,
                'sentence_logits': target_result.sentence_logits,
                'feature_gold_sentence': target_feature.sentence_id,
                'gold_sentence_id': example.sentence_id,
                'top_5_sentence_ids': top_5_sentences,
                'yesno_pred_prob': target_choice_prob,
                'doc_span_index': target_feature.doc_span_index,
                'span_sen_to_orig_sen_map': target_feature.meta_data['span_sen_to_orig_sen_map']
            }

        logger.info(f'Yes/No prediction accuracy: {yesno_pred.avg}')
        logger.info(f'Sentence prediction accuracy: {sentence_pred.avg}')

        if output_prediction_file is not None:
            with open(output_prediction_file, 'w') as f:
                json.dump(all_predictions, f, indent=2)
        return {'predictions': all_predictions, 'acc': sentence_pred.avg}
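
# utils.AverageMeter is used above but not defined in this snippet. A minimal sketch,
# assuming it is the common running-average helper; the attribute names (sum, count,
# avg) and the update(val, n) signature mirror the call sites above.
class AverageMeter:
    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # Accumulate `val` with weight `n` and refresh the running average.
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
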
    def predict_sentence_ids(all_examples,
                             all_features: List[SQuADFullInputFeatures],
                             all_results: List[RawOutput],
                             output_prediction_file=None,
                             weight_threshold: float = 0.0,
                             only_correct: bool = False,
                             label_threshold: float = 0.0):
        """
        :param all_results:
        :param all_examples:
        :param all_features:
        :param output_prediction_file:
        :param weight_threshold: The threshold for attention weights, only id predictions with a higher weight than this will
               be added.
        :param only_correct: If true, only id predictions with final choices predicted which are correct will be added.
               Otherwise, all the id predictions will be added no matter if the yes/no prediction is correct.
        :param label_threshold: Only make sense while only_correct=True, which means that only the id predictions with true
               yes/no prediction and the probability for yes/no is higher than this will be added.
        :return:
        """
        logger.info("Predicting sentence id to: %s" % output_prediction_file)
        logger.info("Weight threshold: {}".format(weight_threshold))
        logger.info("Use ids with true yes/no prediction only: {}".format(
            only_correct))
        logger.info(
            "Yes/No prediction probability threshold : {}, only make sense while only_correct=True."
            .format(label_threshold))

        example_index_to_features = collections.defaultdict(list)
        for feature in all_features:
            example_index_to_features[feature.example_index].append(feature)

        unique_id_to_result = {}
        for result in all_results:
            unique_id_to_result[result.unique_id] = result

        all_predictions = collections.OrderedDict()

        no_label = 0
        total = 0
        for (example_index, example) in enumerate(all_examples):
            features = example_index_to_features[example_index]

            max_diff = -1000000
            max_diff_feature_index = 0
            max_yes_logit = 0
            max_no_logit = 0
            for (feature_index, feature) in enumerate(features):
                result = unique_id_to_result[feature.unique_id]
                choice_logits = result.model_output['choice_logits']
                non_null_logit = choice_logits[1] + choice_logits[2]
                null_logit = choice_logits[0]
                diff = non_null_logit - null_logit
                if diff > max_diff:
                    max_diff = diff
                    max_diff_feature_index = feature_index
                    max_yes_logit = choice_logits[1]
                    max_no_logit = choice_logits[2]

            # Yes/No prediction restriction
            yesno_scores = utils.compute_softmax([max_yes_logit, max_no_logit])
            if max_yes_logit > max_no_logit:
                final_choice = 0
                prediction_prob = yesno_scores[0]
            else:
                final_choice = 1
                prediction_prob = yesno_scores[1]

            target_feature = features[max_diff_feature_index]
            evidence = unique_id_to_result[
                target_feature.unique_id].model_output['evidence']
            evidence_value = evidence['value']
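            # Keep the evidence only when all predictions are accepted (only_correct=False),
            # or when the yes/no choice is both correct and confident enough; in either
            # case the evidence weight must also clear weight_threshold.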
            if (only_correct and final_choice + 1 == target_feature.is_impossible and prediction_prob > label_threshold) \
                    or not only_correct:
                if evidence_value > weight_threshold:
                    sentence_id = evidence['sentences']
                else:
                    sentence_id = []
            else:
                sentence_id = []
            if not sentence_id:
                no_label += 1

            feature_doc_span_index = target_feature.doc_span_index
            all_predictions[example.qas_id] = {
                'sentence_id': sentence_id,
                'doc_span_index': feature_doc_span_index,
                'weight': evidence_value,
                'choice_prediction': 'yes' if final_choice == 0 else 'no',
                'prediction_prob': prediction_prob
            }

            total += 1

        logger.info("Labeling {} instances of {} in total".format(
            total - no_label, len(all_examples)))

        if output_prediction_file is not None:
            with open(output_prediction_file, 'w') as f:
                json.dump(all_predictions, f, indent=2)
        return all_predictions
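
# The function above reads result.model_output['choice_logits'] and
# result.model_output['evidence']['value'/'sentences']. A hypothetical stand-in for
# RawOutput, inferred only from those field accesses; the real class may differ.
_RawOutputSketch = collections.namedtuple('_RawOutputSketch', ['unique_id', 'model_output'])

example_result = _RawOutputSketch(
    unique_id=1000000001,
    model_output={
        'choice_logits': [0.1, 2.3, -0.5],  # assumed order: [null, yes, no]
        'evidence': {
            'value': 0.87,        # evidence weight compared against weight_threshold
            'sentences': [2, 5],  # sentence ids kept as evidence
        },
    },
)
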
    def write_sentence_predictions(all_examples, all_features: List[QAFullInputFeatures], all_results: List[FullResult],
                                   output_prediction_file=None, weight_threshold: float = 0.0, label_threshold: float = 0.0):
        logger.info("Writing evidence id predictions to: %s" % output_prediction_file)

        sentence_pred_cnt = utils.AverageMeter()

        example_index_to_features = collections.defaultdict(list)
        for feature in all_features:
            example_index_to_features[feature.example_index].append(feature)

        unique_id_to_result = {}
        for result in all_results:
            unique_id_to_result[result.unique_id] = result

        all_predictions = collections.OrderedDict()

        for (example_index, example) in enumerate(all_examples):
            features = example_index_to_features[example_index]

            max_diff = -1000000
            max_diff_feature_index = 0
            for (feature_index, feature) in enumerate(features):
                result = unique_id_to_result[feature.unique_id]
                choice_logits = result.choice_logits
                non_null_logit = choice_logits[1] + choice_logits[2]
                null_logit = choice_logits[0]
                diff = non_null_logit - null_logit
                if diff > max_diff:
                    max_diff = diff
                    max_diff_feature_index = feature_index

            target_feature = features[max_diff_feature_index]
            target_result = unique_id_to_result[target_feature.unique_id]
            sentence_id_pred = target_feature.meta_data['span_sen_to_orig_sen_map'][target_result.max_weight_index]
            if target_result.max_weight <= weight_threshold:
                sentence_id_pred = -1

            target_choice_prob = utils.compute_softmax([target_result.choice_logits[1], target_result.choice_logits[2]])
            if target_choice_prob[0] > target_choice_prob[1]:
                target_choice = 0
            else:
                target_choice = 1
            if target_choice == example.is_impossible:
                if target_choice_prob[target_choice] <= label_threshold:
                    sentence_id_pred = -1
            else:
                sentence_id_pred = -1

            if sentence_id_pred != -1:
                sentence_pred_cnt.update(val=1, n=1)
            else:
                sentence_pred_cnt.update(val=0, n=1)
                continue

            all_predictions[example.qas_id] = {
                'sentence_id': sentence_id_pred,
                'max_weight': target_result.max_weight,
                'doc_span_index': target_feature.doc_span_index,
                'doc_span_sentence_id': target_result.max_weight_index
            }

        assert sentence_pred_cnt.sum == len(all_predictions)
        logger.info(f'Labeled evidence ids for {sentence_pred_cnt.sum}/{sentence_pred_cnt.count} examples ({sentence_pred_cnt.avg}).')

        if output_prediction_file is not None:
            with open(output_prediction_file, 'w') as f:
                json.dump(all_predictions, f, indent=2)
        return all_predictions
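
# A hedged usage sketch for the write_sentence_predictions variant above. The variables
# examples/features/results and both threshold values are illustrative assumptions; they
# would come from this codebase's feature-conversion and inference pipeline (not shown).
predictions = write_sentence_predictions(
    examples,
    features,
    results,
    output_prediction_file='evidence_predictions.json',
    weight_threshold=0.5,   # drop evidence whose max weight is <= 0.5
    label_threshold=0.9,    # keep only confident, correct yes/no predictions
)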