Example #1
def qa(targets, predictions):
    """Computes question answering metrics, maximizing over answers per question.

  Args:
    targets: list of lists of strings
    predictions: list of strings

  Returns:
    dict with score_key: squad score across all targets and predictions
  """
    assert len(targets) == len(predictions)
    targets = [[tf.compat.as_text(t) for t in u] for u in targets]
    predictions = [tf.compat.as_text(p) for p in predictions]
    em = np.mean([
        squad_eval.metric_max_over_ground_truths(  # pylint:disable=g-complex-comprehension
            squad_eval.exact_match_score, p, t)
        for p, t in zip(predictions, targets)
    ])
    f1 = np.mean([
        squad_eval.metric_max_over_ground_truths(squad_eval.f1_score, p, t)
        for p, t in zip(predictions, targets)
    ])
    em *= 100
    f1 *= 100
    logging.info("EM = %.2f, F1 = %.2f", em, f1)
    return {"em": em, "f1": f1}
Example #2
def calc_em_and_f1(best_span_string, answer_strings):
    exact_match = squad_eval.metric_max_over_ground_truths(
            squad_eval.exact_match_score,
            best_span_string,
            answer_strings)
    f1_score = squad_eval.metric_max_over_ground_truths(
            squad_eval.f1_score,
            best_span_string,
            answer_strings)
    return exact_match, f1_score
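For reference, metric_max_over_ground_truths (the helper every snippet on this page calls) is defined in the official SQuAD evaluation script as a simple maximum over all acceptable answers:

def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    # Score the prediction against every gold answer and keep the best score.
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        scores_for_ground_truths.append(metric_fn(prediction, ground_truth))
    return max(scores_for_ground_truths)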
Example #3
 def __call__(self, best_span_string, answer_strings):
     """
     Parameters
     ----------
     value : ``float``
         The value to average.
     """
     exact_match = squad_eval.metric_max_over_ground_truths(
         squad_eval.exact_match_score, best_span_string, answer_strings)
     f1_score = squad_eval.metric_max_over_ground_truths(
         squad_eval.f1_score, best_span_string, answer_strings)
     self._total_em += exact_match
     self._total_f1 += f1_score
     self._count += 1
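The class above follows the AllenNLP Metric pattern; a companion get_metric would average the running totals and optionally reset them. A sketch, mirroring AllenNLP's SquadEmAndF1:

 def get_metric(self, reset: bool = False):
     # Average the accumulated EM and F1 over the number of observed predictions.
     exact_match = self._total_em / self._count if self._count > 0 else 0.0
     f1_score = self._total_f1 / self._count if self._count > 0 else 0.0
     if reset:
         self._total_em = 0.0
         self._total_f1 = 0.0
         self._count = 0
     return exact_match, f1_score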
Example #4
 def _compute_f1_score(self, pred_tokens, target_tokens):
     for i in range(len(target_tokens)):
         pred_str = self._get_string_from_tokens(pred_tokens[i])
         target_str = self._get_string_from_tokens(target_tokens[i])
         f1_score = squad_eval.metric_max_over_ground_truths(
             squad_eval.f1_score, pred_str, [target_str])
         self._official_f1(100 * f1_score)
Example #5
            def max_prec(st_gold, en_gold, st_pred, en_pred, cannotanswer, cannotanswer_pred, ctx_text):
                batch_size = ans_mask.size(0)
                max_turns = batch_size // len(ctx_text)

                prec = []

                cannotanswer_mask = cannotanswer_pred.cpu()

                for idx, (sg, eg, sp, ep, cm) in enumerate(zip(st_gold.cpu(), en_gold.cpu(), st_pred.cpu(), en_pred.cpu(), cannotanswer_mask)):
                    if cm:
                        prec.append(1)
                        continue
                    ct = ctx_text[idx // max_turns]
                    p = ct[sp:ep+1]

                    golds = []
                    for j in range(ans_mask.size(-1)):
                        if eg[j] < 0:
                            break
                        golds.append(' '.join(ct[sg[j]:eg[j]+1]))

                    if len(golds) == 0:
                        prec.append(0)
                    else:
                        prec.append(squad_eval.metric_max_over_ground_truths(cached_prec, ' '.join(p), golds))

                prec = torch.Tensor(prec).to(ans_mask.device)

                return prec
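cached_prec is not shown in this excerpt; any scorer with the (prediction, ground_truth) signature works with metric_max_over_ground_truths. A hypothetical token-level precision in that shape:

def cached_prec(prediction, ground_truth):
    # Hypothetical stand-in: fraction of predicted tokens found in the gold answer.
    pred_tokens = prediction.split()
    gold_tokens = set(ground_truth.split())
    if not pred_tokens:
        return 0.0
    return sum(tok in gold_tokens for tok in pred_tokens) / len(pred_tokens)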
Example #6
    def search_qa(self, token_inflections, orig_tokenized, original_loss, question_dict, context, conservative=True, backward=False):
        perturbed_tokenized = orig_tokenized.copy()

        max_loss = original_loss
        num_queries = 0
        max_predicted = ''

        if backward:
            token_inflections = reversed(token_inflections)

        detokenizer = MosesDetokenizer(lang='en')
        
        for curr_token in token_inflections:
            max_infl = orig_tokenized[curr_token[0]]
            for infl in curr_token[1]:
                perturbed_tokenized[curr_token[0]] = infl
                perturbed = detokenizer.detokenize(perturbed_tokenized)
                loss, predicted = self.get_loss(perturbed, question_dict, context)
                num_queries += 1
                if loss > max_loss:
                    max_loss = loss
                    max_infl = infl
                    max_predicted = predicted
                if conservative and metric_max_over_ground_truths(compute_f1, predicted, question_dict['gold_texts']) == 0:
                    break
            perturbed_tokenized[curr_token[0]] = max_infl
        return perturbed_tokenized, max_loss, max_predicted, num_queries
Example #7
    def attack_one(self, question_dict, context, constrain_pos=True):
        original = question_dict['question']

        gold_starts = [ans['answer_start'] for ans in question_dict['answers']]
        gold_texts = [ans['text'] for ans in question_dict['answers']]
        gold_ends = [
            gold_starts[i] + len(text) for i, text in enumerate(gold_texts)
        ]
        question_dict['gold_char_spans'] = list(zip(gold_starts, gold_ends))
        question_dict['gold_texts'] = gold_texts

        orig_tokenized = MosesTokenizer(lang='en').tokenize(original)

        pos_tagged = [
            (tagged[0], '.') if '&' in tagged[0] else tagged
            for tagged in nltk.pos_tag(orig_tokenized, tagset='universal')
        ]

        token_inflections = super(LocalSearchQA, self).get_inflections(
            orig_tokenized, pos_tagged, constrain_pos)

        original_loss, init_predicted = self.get_loss(original, question_dict,
                                                      context)

        # skip questions that are too long or too short
        if len(orig_tokenized) < 10 or len(orig_tokenized) > 100:
            return original, 0, None, None

        # skip questions the model already answers incorrectly
        if metric_max_over_ground_truths(compute_f1, init_predicted,
                                         question_dict['gold_texts']) == 0:
            return original, 1, None, None

        _perturbed, _loss, _predicted, _num_queries = self.local_search_qa(
            token_inflections, orig_tokenized, original_loss, question_dict,
            context)
        is_attack_success = False
        if metric_max_over_ground_truths(compute_f1, _predicted,
                                         question_dict['gold_texts']) == 0:
            is_attack_success = True

        num_queries = 1 + _num_queries
        modif_rate = self.get_modif_rate(orig_tokenized, _perturbed)
        return MosesDetokenizer(lang='en').detokenize(
            _perturbed), num_queries, modif_rate, is_attack_success
Example #8
 def __call__(self, best_span_string, answer_strings):
     """
     Parameters
     ----------
     value : ``float``
         The value to average.
     """
     exact_match = squad_eval.metric_max_over_ground_truths(
             squad_eval.exact_match_score,
             best_span_string,
             answer_strings)
     f1_score = squad_eval.metric_max_over_ground_truths(
             squad_eval.f1_score,
             best_span_string,
             answer_strings)
     self._total_em += exact_match
     self._total_f1 += f1_score
     self._count += 1
Example #9
def f1metric(prediction: Union[str, List],
             ground_truths: List):  # type: ignore
    """
    Parameters
    ----------
    prediction: ``Union[str, List]``
        The predicted answer from the model evaluated. This could be a string, or a list of strings
        when multiple spans are predicted as the answer.
    ground_truths: ``List``
        All the ground truth answer annotations.
    """
    # If you wanted to split this out by answer type, you could look at [1] here and group by
    # that, instead of only keeping [0].
    ground_truth_answer_strings = [
        answer_json_to_strings(annotation)[0] for annotation in ground_truths
    ]
    exact_match, f1_score = metric_max_over_ground_truths(
        drop_em_and_f1, prediction, ground_truth_answer_strings)

    return (exact_match, f1_score)
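Note that drop_em_and_f1 returns an (em, f1) tuple (the snippet unpacks one), so the max inside metric_max_over_ground_truths compares tuples lexicographically: exact match dominates, with F1 as the tie-breaker. A quick illustration of that behavior:

scores = [(0.0, 0.5), (1.0, 0.2), (0.0, 0.9)]
print(max(scores))  # (1.0, 0.2): EM wins first, F1 only breaks ties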
Example #10
    def morph(self, question_dict, context, constrain_pos=False, conservative=False):
        original = question_dict['question']

        gold_starts = [ans['answer_start'] for ans in question_dict['answers']]
        gold_texts = [ans['text'] for ans in question_dict['answers']]
        gold_ends = [gold_starts[i]+len(text) for i, text in enumerate(gold_texts)]
        question_dict['gold_char_spans'] = list(zip(gold_starts, gold_ends))
        question_dict['gold_texts'] = gold_texts

        orig_tokenized = MosesTokenizer(lang='en').tokenize(original)
        
        pos_tagged = [(tagged[0], '.') if '&' in tagged[0] else tagged for tagged in nltk.pos_tag(orig_tokenized,tagset='universal')]

        token_inflections = super(MorpheusQA, self).get_inflections(orig_tokenized, pos_tagged, constrain_pos)

        original_loss, _ = self.get_loss(original, question_dict, context)
        
        forward_perturbed, forward_loss, forward_predicted, num_queries_forward = self.search_qa(token_inflections,
                                                                           orig_tokenized,
                                                                           original_loss,
                                                                           question_dict,
                                                                           context,
                                                                           conservative)
        
        if conservative and metric_max_over_ground_truths(compute_f1, forward_predicted, question_dict['gold_texts']) == 0:
            return MosesDetokenizer(lang='en').detokenize(forward_perturbed), num_queries_forward + 1
        
        backward_perturbed, backward_loss, __, num_queries_backward = self.search_qa(token_inflections,
                                                                              orig_tokenized,
                                                                              original_loss,
                                                                              question_dict,
                                                                              context,
                                                                              conservative,
                                                                              backward=True)

        
        num_queries = 1 + num_queries_forward + num_queries_backward
        if forward_loss > backward_loss:
            return MosesDetokenizer(lang='en').detokenize(forward_perturbed), num_queries
        else:
            return MosesDetokenizer(lang='en').detokenize(backward_perturbed), num_queries
Example #11
 def __call__(self, prediction: Union[str, List], ground_truths: List):
     """
     Parameters
     ----------
     prediction: ``Union[str, List]``
         The predicted answer from the model evaluated. This could be a string, or a list of strings
         when multiple spans are predicted as the answer.
     ground_truths: ``List``
         All the ground truth answer annotations.
     """
     ground_truth_answer_strings = [
         convert_annotation_to_string(annotation)[0]
         for annotation in ground_truths
     ]
     # pylint: disable=unused-variable
     ground_truth_answer_types = [
         convert_annotation_to_string(annotation)[1]
         for annotation in ground_truths
     ]
     exact_match, f1_score = metric_max_over_ground_truths(
         drop_em_and_f1, prediction, ground_truth_answer_strings)
     self._total_em += exact_match
     self._total_f1 += f1_score
     self._count += 1
Example #12
    def local_search_qa(self, token_inflections, orig_tokenized, original_loss,
                        question_dict, context):
        perturbed_tokenized = orig_tokenized.copy()  # token list (list of str)

        max_loss = original_loss
        num_queries = 0
        max_predicted = ''

        detokenizer = MosesDetokenizer(lang='en')

        while True:
            new_tokenized_list = []
            # new_text_list = []
            new_loss_list = []
            new_predicted_list = []

            for position, candidates in token_inflections:  # (position, candidates) pairs; candidates is a list of tokens
                # add or swap
                for infl in candidates:

                    if perturbed_tokenized[position] == infl:
                        continue

                    # do replace
                    new_tokenized = perturbed_tokenized.copy()
                    new_tokenized[position] = infl
                    # form text and eval
                    new_text = detokenizer.detokenize(new_tokenized)
                    new_loss, new_predicted = self.get_loss(
                        new_text, question_dict, context)
                    num_queries += 1

                    # record
                    new_tokenized_list.append(new_tokenized)
                    new_loss_list.append(new_loss)
                    new_predicted_list.append(new_predicted)

                # remove
                if perturbed_tokenized[position] != orig_tokenized[position]:
                    # do replace
                    new_tokenized = perturbed_tokenized.copy()
                    new_tokenized[position] = orig_tokenized[position]

                    # form text and eval
                    new_text = detokenizer.detokenize(new_tokenized)
                    new_loss, new_predicted = self.get_loss(
                        new_text, question_dict, context)
                    num_queries += 1

                    # record
                    new_tokenized_list.append(new_tokenized)
                    new_loss_list.append(new_loss)
                    new_predicted_list.append(new_predicted)

            if len(new_loss_list) == 0:  # no candidate perturbations left; stop
                break

            cur_max_idx = int(np.argmax(new_loss_list))  # index of the highest loss
            cur_max_loss = new_loss_list[cur_max_idx]
            cur_max_predicted = new_predicted_list[cur_max_idx]
            # cur_max_text = new_text_list[cur_max_idx]
            cur_max_tokenized = new_tokenized_list[cur_max_idx]

            # check stop criteria
            if metric_max_over_ground_truths(compute_f1, cur_max_predicted,
                                             question_dict['gold_texts']) == 0:
                perturbed_tokenized = cur_max_tokenized
                max_loss = cur_max_loss
                max_predicted = cur_max_predicted
                break

            if cur_max_loss > max_loss + EPSILON:
                perturbed_tokenized = cur_max_tokenized
                max_loss = cur_max_loss
                max_predicted = cur_max_predicted
            else:
                break

        # =============== check supplement set ======================
        # form supplement set
        supplement_inflections_by_position = {
            position: []
            for position, _ in token_inflections
        }
        for position, candidates in token_inflections:
            for infl in candidates:
                if perturbed_tokenized[position] != infl:
                    supplement_inflections_by_position[position].append(infl)

        is_sup_valid = True
        valid_positions = []
        for position, _ in token_inflections:
            if len(supplement_inflections_by_position[position]) > 1:
                is_sup_valid = False
                break
            if len(supplement_inflections_by_position[position]) == 1:
                valid_positions.append(position)

        if len(valid_positions) == 0:
            is_sup_valid = False

        if is_sup_valid:
            print('check supplement')
            supplement_tokenized = perturbed_tokenized.copy()
            for position in valid_positions:
                supplement_tokenized[
                    position] = supplement_inflections_by_position[position][0]

            # form text and eval
            supp_text = detokenizer.detokenize(supplement_tokenized)
            supp_loss, supp_predicted = self.get_loss(supp_text, question_dict,
                                                      context)
            num_queries += 1

            if supp_loss > max_loss:
                max_loss = supp_loss
                max_predicted = supp_predicted
                perturbed_tokenized = supplement_tokenized

        return perturbed_tokenized, max_loss, max_predicted, num_queries
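EPSILON is referenced but never defined in this excerpt; presumably it is a small module-level constant that keeps floating-point noise from counting as a genuine loss improvement, for example:

EPSILON = 1e-8  # hypothetical value guarding the `cur_max_loss > max_loss + EPSILON` test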
Example #13
    def forward(
            self,  # type: ignore
            question: Dict[str, torch.LongTensor],
            passage: Dict[str, torch.LongTensor],
            span_start: torch.IntTensor = None,
            span_end: torch.IntTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:

        batch_size, num_of_passage_tokens = passage['bert'].size()

        # BERT for QA is a fully connected linear layer on top of BERT producing 2 vectors of
        # start and end spans.
        embedded_passage = self._text_field_embedder(passage)
        passage_length = embedded_passage.size(1)
        logits = self.qa_outputs(embedded_passage)
        start_logits, end_logits = logits.split(1, dim=-1)
        span_start_logits = start_logits.squeeze(-1)
        span_end_logits = end_logits.squeeze(-1)

        # Adding some masks with numerically stable values
        passage_mask = util.get_text_field_mask(passage).float()
        repeated_passage_mask = passage_mask.unsqueeze(1).repeat(1, 1, 1)
        repeated_passage_mask = repeated_passage_mask.view(
            batch_size, passage_length)
        span_start_logits = util.replace_masked_values(span_start_logits,
                                                       repeated_passage_mask,
                                                       -1e7)
        span_end_logits = util.replace_masked_values(span_end_logits,
                                                     repeated_passage_mask,
                                                     -1e7)

        # add span start and end logits for knowledge distillation
        output_dict: Dict[str, Any] = {
            "span_start_logits": span_start_logits,
            "span_end_logits": span_end_logits,
        }

        # We may have multiple instances per question; regroup them per question.
        instances_question_id = [
            instance_meta['question_id'] for instance_meta in metadata
        ]
        question_instances_split_inds = np.cumsum(
            np.unique(instances_question_id, return_counts=True)[1])[:-1]
        per_question_inds = np.split(range(batch_size),
                                     question_instances_split_inds)
        metadata = np.split(metadata, question_instances_split_inds)

        # Compute the loss.
        # if span_start is not None and len(np.argwhere(span_start.squeeze().cpu() >= 0)) > 0:
        if span_start is not None and len(
                np.argwhere(
                    span_start.squeeze(-1).squeeze(-1).cpu() >= 0)) > 0:
            # in evaluation some instances may not contain the gold answer, so we need to compute
            # loss only on those that do.
            inds_with_gold_answer = np.argwhere(
                span_start.view(-1).cpu().numpy() >= 0)
            inds_with_gold_answer = inds_with_gold_answer.squeeze(
            ) if len(inds_with_gold_answer) > 1 else inds_with_gold_answer
            if len(inds_with_gold_answer) > 0:
                loss = nll_loss(util.masked_log_softmax(span_start_logits[inds_with_gold_answer], \
                                                    repeated_passage_mask[inds_with_gold_answer]),\
                                span_start.view(-1)[inds_with_gold_answer], ignore_index=-1)
                output_dict["loss_start"] = loss
                loss += nll_loss(util.masked_log_softmax(span_end_logits[inds_with_gold_answer], \
                                                    repeated_passage_mask[inds_with_gold_answer]),\
                                span_end.view(-1)[inds_with_gold_answer], ignore_index=-1)
                output_dict["loss"] = loss
                output_dict["loss_end"] = loss - output_dict["loss_start"]

        # This is a hack for cases in which gold answer is not provided so we cannot compute loss...
        if 'loss' not in output_dict:
            output_dict["loss"] = torch.cuda.FloatTensor([0], device=span_end_logits.device) \
                if torch.cuda.is_available() else torch.FloatTensor([0])

        # Compute F1 and preparing the output dictionary.
        output_dict['best_span_str'] = []
        output_dict['qid'] = []
        output_dict["start_bias_weight"] = []
        output_dict["end_bias_weight"] = []

        # getting the best span prediction for each instance
        best_span = self._get_example_predications(span_start_logits,
                                                   span_end_logits,
                                                   self._max_span_length)
        best_span_cpu = best_span.detach().cpu().numpy()

        span_start_logits_numpy = span_start_logits.data.cpu().numpy()
        span_end_logits_numpy = span_end_logits.data.cpu().numpy()
        # Iterating over every question (which may contain multiple instances, one per chunk)
        for question_inds, question_instances_metadata in zip(
                per_question_inds, metadata):
            best_span_ind = np.argmax(
                span_start_logits_numpy[question_inds,
                                        best_span_cpu[question_inds][:, 0]] +
                span_end_logits_numpy[question_inds,
                                      best_span_cpu[question_inds][:, 1]])
            best_span_logit = np.max(
                span_start_logits_numpy[question_inds,
                                        best_span_cpu[question_inds][:, 0]] +
                span_end_logits_numpy[question_inds,
                                      best_span_cpu[question_inds][:, 1]])

            passage_str = question_instances_metadata[best_span_ind][
                'original_passage']
            offsets = question_instances_metadata[best_span_ind][
                'token_offsets']

            predicted_span = best_span_cpu[question_inds[best_span_ind]]
            start_offset = offsets[predicted_span[0]][0]
            end_offset = offsets[predicted_span[1]][1]
            best_span_string = passage_str[start_offset:end_offset]

            # Note: this is a hack, because AllenNLP, when predicting, expects a value for each instance.
            # But we may have more than one chunk per question, and thus fewer output strings than instances.
            for i in range(len(question_inds)):
                output_dict['best_span_str'].append(best_span_string)
                output_dict['qid'].append(
                    question_instances_metadata[best_span_ind]['question_id'])

                # get the scalar logit value of the predicted span start and end index as bias weight.
                output_dict["start_bias_weight"].append(
                    util.masked_softmax(span_start_logits[best_span_ind],
                                        repeated_passage_mask[best_span_ind])[
                                            best_span_cpu[best_span_ind][0]])
                output_dict["end_bias_weight"].append(
                    util.masked_softmax(span_end_logits[best_span_ind],
                                        repeated_passage_mask[best_span_ind])[
                                            best_span_cpu[best_span_ind][1]])

            f1_score = 0.0
            EM_score = 0.0
            gold_answer_texts = question_instances_metadata[best_span_ind][
                'answer_texts_list']
            if gold_answer_texts:
                f1_score = squad_eval.metric_max_over_ground_truths(
                    squad_eval.f1_score, best_span_string, gold_answer_texts)
                EM_score = squad_eval.metric_max_over_ground_truths(
                    squad_eval.exact_match_score, best_span_string,
                    gold_answer_texts)
            self._official_f1(100 * f1_score)
            self._official_EM(100 * EM_score)

            # TODO move to predict
            if self._predictions_file is not None:
                with open(self._predictions_file, 'a') as f:
                    f.write(json.dumps({'question_id':question_instances_metadata[best_span_ind]['question_id'], \
                                'best_span_logit':float(best_span_logit), \
                                'f1':100 * f1_score,
                                'EM':100 * EM_score,
                                'best_span_string':best_span_string,\
                                'gold_answer_texts':gold_answer_texts, \
                                'qas_used_fraction':1.0}) + '\n')

        return output_dict
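The per-question grouping in the snippet above relies on a compact numpy idiom: assuming instances arrive grouped (and sorted) by question id, np.unique(..., return_counts=True) yields the run length of each question, np.cumsum(...)[:-1] turns those counts into split points, and np.split carves the batch accordingly. A self-contained illustration:

import numpy as np

ids = ["q1", "q1", "q2", "q3", "q3", "q3"]  # one entry per instance
split_inds = np.cumsum(np.unique(ids, return_counts=True)[1])[:-1]
print(np.split(np.arange(len(ids)), split_inds))
# [array([0, 1]), array([2]), array([3, 4, 5])]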
Example #14
    def forward(self,  # type: ignore
                question: Dict[str, torch.LongTensor],
                passage: Dict[str, torch.LongTensor],
                span_starts: torch.IntTensor = None,
                span_ends: torch.IntTensor = None,
                yesno_labels : torch.IntTensor = None,
                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:

        batch_size, num_of_passage_tokens = passage['bert'].size()

        # Executing the BERT model on the word piece ids (input_ids)
        input_ids = passage['bert']
        token_type_ids = torch.zeros_like(input_ids)
        mask = (input_ids != 0).long()
        embedded_chunk, pooled_output = \
            self._text_field_embedder.token_embedder_bert.bert_model(input_ids=util.combine_initial_dims(input_ids),
                                                         token_type_ids=util.combine_initial_dims(token_type_ids),
                                                         attention_mask=util.combine_initial_dims(mask),
                                                         output_all_encoded_layers=False)

        # Just measuring some lengths and offsets to handle the conversion between tokens and word-pieces
        passage_length = embedded_chunk.size(1)
        mask_min_values, wordpiece_passage_lens = torch.min(mask, dim=1)
        wordpiece_passage_lens[mask_min_values == 1] = mask.shape[1]
        offset_min_values, token_passage_lens = torch.min(passage['bert-offsets'], dim=1)
        token_passage_lens[offset_min_values != 0] = passage['bert-offsets'].shape[1]
        bert_offsets = passage['bert-offsets'].cpu().numpy()

        # BERT for QA is a fully connected linear layer on top of BERT producing 2 vectors of
        # start and end spans.
        logits = self.qa_outputs(embedded_chunk)
        start_logits, end_logits = logits.split(1, dim=-1)
        span_start_logits = start_logits.squeeze(-1)
        span_end_logits = end_logits.squeeze(-1)

        # All input is preprocessed before forward is run; counting the yesno vocabulary
        # indicates whether yesno support is needed at all.
        if self.vocab.get_vocab_size("yesno_labels") > 1:
            yesno_logits = self.qa_yesno(torch.max(embedded_chunk, 1)[0])

        span_starts.clamp_(0, passage_length)
        span_ends.clamp_(0, passage_length)

        # moving to word piece indexes from token indexes of start and end span
        span_starts_list = [bert_offsets[i, span_starts[i]] if span_starts[i] != 0 else 0 for i in range(batch_size)]
        span_ends_list = [bert_offsets[i, span_ends[i]] if span_ends[i] != 0 else 0 for i in range(batch_size)]
        span_starts = torch.cuda.LongTensor(span_starts_list, device=span_end_logits.device) \
            if torch.cuda.is_available() else torch.LongTensor(span_starts_list)
        span_ends = torch.cuda.LongTensor(span_ends_list, device=span_end_logits.device) \
            if torch.cuda.is_available() else torch.LongTensor(span_ends_list)

        loss_fct = CrossEntropyLoss(ignore_index=passage_length)
        start_loss = loss_fct(start_logits.squeeze(-1), span_starts)
        end_loss = loss_fct(end_logits.squeeze(-1), span_ends)

        if self.vocab.get_vocab_size("yesno_labels") > 1 and yesno_labels is not None:
            yesno_loss = loss_fct(yesno_logits, yesno_labels)
            loss = (start_loss + end_loss + yesno_loss) / 3
        else:
            loss = (start_loss + end_loss) / 2

        output_dict: Dict[str, Any] = {}
        if loss == 0:
            # For evaluation purposes only!
            output_dict["loss"] = torch.cuda.FloatTensor([0], device=span_end_logits.device) \
                if torch.cuda.is_available() else torch.FloatTensor([0])
        else:
            output_dict["loss"] = loss

        # Compute F1 and preparing the output dictionary.
        output_dict['best_span_str'] = []
        output_dict['best_span_logit'] = []
        output_dict['cannot_answer_logit'] = []
        output_dict['yesno'] = []
        output_dict['yesno_logit'] = []
        output_dict['qid'] = []
        if span_starts is not None:
            output_dict['EM'] = []
            output_dict['f1'] = []

        # getting the best span prediction for each instance
        best_span = self._get_example_predications(span_start_logits, span_end_logits, self._max_span_length)
        best_span_cpu = best_span.detach().cpu().numpy()

        for instance_ind, instance_metadata in zip(range(batch_size), metadata):
            best_span_logit = span_start_logits.data.cpu().numpy()[instance_ind, best_span_cpu[instance_ind][0]] + \
                              span_end_logits.data.cpu().numpy()[instance_ind, best_span_cpu[instance_ind][1]]
            cannot_answer_logit = span_start_logits.data.cpu().numpy()[instance_ind, 0] + \
                              span_end_logits.data.cpu().numpy()[instance_ind, 0]

            if self.vocab.get_vocab_size("yesno_labels") > 1:
                yesno_maxind = np.argmax(yesno_logits[instance_ind].data.cpu().numpy())
                yesno_logit = yesno_logits[instance_ind, yesno_maxind].data.cpu().numpy()
                yesno_pred = self.vocab.get_token_from_index(yesno_maxind, namespace="yesno_labels")
            else:
                yesno_pred = 'no_yesno'
                yesno_logit = -30.0

            passage_str = instance_metadata['original_passage']
            offsets = instance_metadata['token_offsets']

            predicted_span = best_span_cpu[instance_ind]
            # In this version, a yesno prediction other than "no_yesno" is regarded as the final answer before the spans are considered.
            if yesno_pred != 'no_yesno':
                best_span_string = yesno_pred
            else:
                if cannot_answer_logit + 0.9 > best_span_logit :
                    best_span_string = 'cannot_answer'
                else:
                    wordpiece_offsets = self.bert_offsets_to_wordpiece_offsets(bert_offsets[instance_ind][0:len(offsets)])
                    start_offset = offsets[wordpiece_offsets[predicted_span[0] if predicted_span[0] < len(wordpiece_offsets) \
                        else len(wordpiece_offsets)-1]][0]
                    end_offset = offsets[wordpiece_offsets[predicted_span[1] if predicted_span[1] < len(wordpiece_offsets) \
                        else len(wordpiece_offsets)-1]][1]
                    best_span_string = passage_str[start_offset:end_offset]

            output_dict['best_span_str'].append(best_span_string)
            output_dict['cannot_answer_logit'].append(cannot_answer_logit)
            output_dict['best_span_logit'].append(best_span_logit)
            output_dict['yesno'].append(yesno_pred)
            output_dict['yesno_logit'].append(yesno_logit)
            output_dict['qid'].append(instance_metadata['question_id'])

            # In AllenNLP prediction mode we have no gold answers, so only compute metrics when gold spans exist
            if span_starts is not None:
                yesno_label_ind = yesno_labels.data.cpu().numpy()[instance_ind]
                yesno_label = self.vocab.get_token_from_index(yesno_label_ind, namespace="yesno_labels")

                if yesno_label != 'no_yesno':
                    gold_answer_texts = [yesno_label]
                elif instance_metadata['cannot_answer']:
                    gold_answer_texts = ['cannot_answer']
                else:
                    gold_answer_texts = instance_metadata['answer_texts_list']

                f1_score = squad_eval.metric_max_over_ground_truths(squad_eval.f1_score, best_span_string, gold_answer_texts)
                EM_score = squad_eval.metric_max_over_ground_truths(squad_eval.exact_match_score, best_span_string, gold_answer_texts)
                self._official_f1(100 * f1_score)
                self._official_EM(100 * EM_score)
                output_dict['EM'].append(100 * EM_score)
                output_dict['f1'].append(100 * f1_score)


        return output_dict
Example #15
def predict(args):
    file_path = cached_path(args.model)
    archive = load_archive(file_path, cuda_device=args.cuda_device)
    predictor = Predictor.from_archive(archive, 'multiqa_predictor')
    all_predictions = {}
    all_full_predictions = []
    contexts = []
    single_file_path_cached = cached_path(args.dataset)
    with gzip.open(single_file_path_cached, 'rb') as myzip:
        for example in myzip:
            context = json.loads(example)
            if 'header' in context:
                continue
            contexts.append(context)

            if args.sample_size != -1 and \
                    sum([len(context['qas']) for context in contexts]) >= args.sample_size:
                break

    # predict
    answers = {}
    all_scores = {}
    for context in Tqdm.tqdm(contexts, total=len(contexts)):
        curr_pred, full_predictions = predictor.predict_json(context)
        all_predictions.update(curr_pred)
        all_full_predictions += full_predictions

        # saving official answers for this context
        for qa in context['qas']:
            qid = qa['qid'].split('_q_')[1]
            if qid not in answers:
                answers[qid] = []

            if 'annotators_answer_candidates' in qa['answers']['open-ended']:
                for ans_cand in qa['answers']['open-ended'][
                        'annotators_answer_candidates']:
                    if 'single_answer' in ans_cand and 'extractive' in ans_cand[
                            'single_answer']:
                        answers[qid] += [
                            (ans_cand['single_answer']['extractive']['answer'])
                        ]
                        if 'aliases' in ans_cand['single_answer'][
                                'extractive']:
                            answers[qid] += ans_cand['single_answer'][
                                'extractive']['aliases']
                    elif 'single_answer' in ans_cand and 'yesno' in ans_cand[
                            'single_answer']:
                        answers[qid] += [(ans_cand['single_answer']['yesno'])]

            elif 'cannot_answer' in qa['answers']['open-ended']:
                answers[qid] += ['cannot_answer']

            f1_score = squad_eval.metric_max_over_ground_truths(
                squad_eval.f1_score, all_predictions[qid], answers[qid])
            EM_score = squad_eval.metric_max_over_ground_truths(
                squad_eval.exact_match_score, all_predictions[qid],
                answers[qid])
            all_scores[qid] = {'EM': EM_score * 100, 'f1': f1_score * 100}

    metrics = {}
    metrics['EM'] = sum([all_scores[q]['EM'] for q in all_scores.keys()]) / \
                    len(all_scores.keys())
    metrics['f1'] = sum([all_scores[q]['f1'] for q in all_scores.keys()]) / \
                    len(all_scores.keys())
    print(json.dumps(metrics))

    # running the official evaluation script:
    metrics = evaluate(answers, all_predictions, True)
    print(json.dumps(metrics))

    # automatic filename generation / or manual
    if args.prediction_filepath is None:
        if not os.path.exists('results/' + args.dataset_name):
            os.makedirs('results/' + args.dataset_name)
        output_filepath = 'results/' + args.dataset_name + '/' + '_'.join(args.model.split('/')[-2:]).split('.')[0] + '__on__' + \
                          args.dataset.split('/')[-1].split('.')[0]
    else:
        output_filepath = args.prediction_filepath

    # formatting the predictions in the specific dataset format in order to run the official eval_script
    factory = MultiQAFactory()
    all_predictions = factory.format_predictions(args.dataset_name,
                                                 all_predictions)

    # running dataset specific eval script

    # saving predictions
    with open(output_filepath + '_predictions.json', 'w') as f:
        json.dump(all_predictions, f)
    with open(output_filepath + '_fullpredictions.json', 'w') as f:
        json.dump(all_full_predictions, f)

    # storing results
    with open(output_filepath + '_eval_results.json', 'w') as f:
        json.dump(metrics, f)
Example #16
                    if 'extractive' in ans_cand and 'single_answer' in ans_cand[
                            'extractive']:
                        answers[qid] += [
                            (ans_cand['extractive']['single_answer']['answer'])
                        ]
                        if 'aliases' in ans_cand['extractive'][
                                'single_answer']:
                            answers[qid] += ans_cand['extractive'][
                                'single_answer']['aliases']
                    elif 'yesno' in ans_cand and 'single_answer' in ans_cand[
                            'yesno']:
                        answers[qid] += [(ans_cand['yesno']['single_answer'])]

            elif 'cannot_answer' in qa['answers']['open-ended']:
                answers[qid] += ['cannot_answer']

            f1_score = squad_eval.metric_max_over_ground_truths(
                squad_eval.f1_score, all_predictions[qid], answers[qid])
            EM_score = squad_eval.metric_max_over_ground_truths(
                squad_eval.exact_match_score, all_predictions[qid],
                answers[qid])
            all_scores[qid] = {'EM': EM_score * 100, 'f1': f1_score * 100}

    metrics = {}
    metrics['EM'] = sum([all_scores[q]['EM'] for q in all_scores.keys()]) / \
                    len(all_scores.keys())
    metrics['f1'] = sum([all_scores[q]['f1'] for q in all_scores.keys()]) / \
                    len(all_scores.keys())
    print(json.dumps(metrics))

    # running the official evaluation script:
    metrics = evaluate(answers, all_predictions, True)
    print(json.dumps(metrics))
Example #17
    def forward(
            self,  # type: ignore
            question: Dict[str, torch.LongTensor],
            passage: Dict[str, torch.LongTensor],
            span_start: torch.IntTensor = None,
            span_end: torch.IntTensor = None,
            p1_answer_marker: torch.IntTensor = None,
            p2_answer_marker: torch.IntTensor = None,
            p3_answer_marker: torch.IntTensor = None,
            yesno_list: torch.IntTensor = None,
            followup_list: torch.IntTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        question : Dict[str, torch.LongTensor]
            From a ``TextField``.
        passage : Dict[str, torch.LongTensor]
            From a ``TextField``.  The model assumes that this passage contains the answer to the
            question, and predicts the beginning and ending positions of the answer within the
            passage.
        span_start : ``torch.IntTensor``, optional
            From an ``IndexField``.  This is one of the things we are trying to predict - the
            beginning position of the answer within the passage.  This is an `inclusive` token index.
            If this is given, we will compute a loss that gets included in the output dictionary.
        span_end : ``torch.IntTensor``, optional
            From an ``IndexField``.  This is one of the things we are trying to predict - the
            ending position of the answer within the passage.  This is an `inclusive` token index.
            If this is given, we will compute a loss that gets included in the output dictionary.
        p1_answer_marker : ``torch.IntTensor``, optional
            This is one of the inputs, but only when num_context_answers > 0.
            This is a tensor that has a shape [batch_size, max_qa_count, max_passage_length].
            Most passage tokens will be assigned 'O', except the passage tokens that belong to the previous answer
            in the dialog, which will be assigned labels such as <1_start>, <1_in>, <1_end>.
            For more details, look into dataset_readers/util/make_reading_comprehension_instance_quac
        p2_answer_marker :  ``torch.IntTensor``, optional
            This is one of the inputs, but only when num_context_answers > 1.
            It is similar to p1_answer_marker, but marks the answer from two turns back in the dialog.
        p3_answer_marker :  ``torch.IntTensor``, optional
            This is one of the inputs, but only when num_context_answers > 2.
            It is similar to p1_answer_marker, but marks the answer from three turns back in the dialog.
        yesno_list :  ``torch.IntTensor``, optional
            This is one of the outputs that we are trying to predict.
            Three way classification (the yes/no/not a yes no question).
        followup_list :  ``torch.IntTensor``, optional
            This is one of the outputs that we are trying to predict.
            Three way classification (followup / maybe followup / don't followup).
        metadata : ``List[Dict[str, Any]]``, optional
            If present, this should contain the question ID, original passage text, and token
            offsets into the passage for each instance in the batch.  We use this for computing
            official metrics using the official SQuAD evaluation script.  The length of this list
            should be the batch size, and each dictionary should have the keys ``id``,
            ``original_passage``, and ``token_offsets``.  If you only want the best span string and
            don't care about official metrics, you can omit the ``id`` key.

        Returns
        -------
        An output dictionary consisting of the following. Each entry is a nested list: the outer
        list iterates over dialogs, the inner list over the questions in each dialog.

        qid : List[List[str]]
            A list of list, consisting of question ids.
        followup : List[List[int]]
            A list of list, consisting of continuation marker prediction index.
            (y: yes, m: maybe follow up, n: don't follow up)
        yesno : List[List[int]]
            A list of list, consisting of affirmation marker prediction index.
            (y: yes, x: not a yes/no question, n: no)
        best_span_str : List[List[str]]
            If sufficient metadata was provided for the instances in the batch, we also return the
            string from the original passage that the model thinks is the best answer to the
            question.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        """
        batch_size, max_qa_count, max_q_len, _ = question[
            'token_characters'].size()
        total_qa_count = batch_size * max_qa_count
        qa_mask = torch.ge(followup_list, 0).view(total_qa_count)
        embedded_question = self._text_field_embedder(question,
                                                      num_wrapping_dims=1)
        embedded_question = embedded_question.reshape(
            total_qa_count, max_q_len,
            self._text_field_embedder.get_output_dim())
        embedded_question = self._variational_dropout(embedded_question)
        embedded_passage = self._variational_dropout(
            self._text_field_embedder(passage))
        passage_length = embedded_passage.size(1)

        question_mask = util.get_text_field_mask(question,
                                                 num_wrapping_dims=1).float()
        question_mask = question_mask.reshape(total_qa_count, max_q_len)
        passage_mask = util.get_text_field_mask(passage).float()

        repeated_passage_mask = passage_mask.unsqueeze(1).repeat(
            1, max_qa_count, 1)
        repeated_passage_mask = repeated_passage_mask.view(
            total_qa_count, passage_length)

        if self._num_context_answers > 0:
            # Encode question turn number inside the dialog into question embedding.
            question_num_ind = util.get_range_vector(
                max_qa_count, util.get_device_of(embedded_question))
            question_num_ind = question_num_ind.unsqueeze(-1).repeat(
                1, max_q_len)
            question_num_ind = question_num_ind.unsqueeze(0).repeat(
                batch_size, 1, 1)
            question_num_ind = question_num_ind.reshape(
                total_qa_count, max_q_len)
            question_num_marker_emb = self._question_num_marker(
                question_num_ind)
            embedded_question = torch.cat(
                [embedded_question, question_num_marker_emb], dim=-1)

            # Encode the previous answers in passage embedding.
            repeated_embedded_passage = embedded_passage.unsqueeze(1).repeat(1, max_qa_count, 1, 1). \
                view(total_qa_count, passage_length, self._text_field_embedder.get_output_dim())
            # batch_size * max_qa_count, passage_length, word_embed_dim
            p1_answer_marker = p1_answer_marker.view(total_qa_count,
                                                     passage_length)
            p1_answer_marker_emb = self._prev_ans_marker(p1_answer_marker)
            repeated_embedded_passage = torch.cat(
                [repeated_embedded_passage, p1_answer_marker_emb], dim=-1)
            if self._num_context_answers > 1:
                p2_answer_marker = p2_answer_marker.view(
                    total_qa_count, passage_length)
                p2_answer_marker_emb = self._prev_ans_marker(p2_answer_marker)
                repeated_embedded_passage = torch.cat(
                    [repeated_embedded_passage, p2_answer_marker_emb], dim=-1)
                if self._num_context_answers > 2:
                    p3_answer_marker = p3_answer_marker.view(
                        total_qa_count, passage_length)
                    p3_answer_marker_emb = self._prev_ans_marker(
                        p3_answer_marker)
                    repeated_embedded_passage = torch.cat(
                        [repeated_embedded_passage, p3_answer_marker_emb],
                        dim=-1)

            repeated_encoded_passage = self._variational_dropout(
                self._phrase_layer(repeated_embedded_passage,
                                   repeated_passage_mask))
        else:
            encoded_passage = self._variational_dropout(
                self._phrase_layer(embedded_passage, passage_mask))
            repeated_encoded_passage = encoded_passage.unsqueeze(1).repeat(
                1, max_qa_count, 1, 1)
            repeated_encoded_passage = repeated_encoded_passage.view(
                total_qa_count, passage_length, self._encoding_dim)

        encoded_question = self._variational_dropout(
            self._phrase_layer(embedded_question, question_mask))

        # Shape: (batch_size * max_qa_count, passage_length, question_length)
        passage_question_similarity = self._matrix_attention(
            repeated_encoded_passage, encoded_question)
        # Shape: (batch_size * max_qa_count, passage_length, question_length)
        passage_question_attention = util.masked_softmax(
            passage_question_similarity, question_mask)
        # Shape: (batch_size * max_qa_count, passage_length, encoding_dim)
        passage_question_vectors = util.weighted_sum(
            encoded_question, passage_question_attention)

        # We replace masked values with something really negative here, so they don't affect the
        # max below.
        masked_similarity = util.replace_masked_values(
            passage_question_similarity, question_mask.unsqueeze(1), -1e7)

        question_passage_similarity = masked_similarity.max(
            dim=-1)[0].squeeze(-1)
        question_passage_attention = util.masked_softmax(
            question_passage_similarity, repeated_passage_mask)
        # Shape: (batch_size * max_qa_count, encoding_dim)
        question_passage_vector = util.weighted_sum(
            repeated_encoded_passage, question_passage_attention)
        tiled_question_passage_vector = question_passage_vector.unsqueeze(
            1).expand(total_qa_count, passage_length, self._encoding_dim)

        # Shape: (batch_size * max_qa_count, passage_length, encoding_dim * 4)
        final_merged_passage = torch.cat([
            repeated_encoded_passage, passage_question_vectors,
            repeated_encoded_passage * passage_question_vectors,
            repeated_encoded_passage * tiled_question_passage_vector
        ],
                                         dim=-1)

        final_merged_passage = F.relu(self._merge_atten(final_merged_passage))

        residual_layer = self._variational_dropout(
            self._residual_encoder(final_merged_passage,
                                   repeated_passage_mask))
        self_attention_matrix = self._self_attention(residual_layer,
                                                     residual_layer)

        mask = repeated_passage_mask.reshape(total_qa_count, passage_length, 1) \
               * repeated_passage_mask.reshape(total_qa_count, 1, passage_length)
        self_mask = torch.eye(passage_length,
                              passage_length,
                              device=self_attention_matrix.device)
        self_mask = self_mask.reshape(1, passage_length, passage_length)
        mask = mask * (1 - self_mask)

        self_attention_probs = util.masked_softmax(self_attention_matrix, mask)

        # (batch, passage_len, passage_len) * (batch, passage_len, dim) -> (batch, passage_len, dim)
        self_attention_vecs = torch.matmul(self_attention_probs,
                                           residual_layer)
        self_attention_vecs = torch.cat([
            self_attention_vecs, residual_layer,
            residual_layer * self_attention_vecs
        ],
                                        dim=-1)
        residual_layer = F.relu(
            self._merge_self_attention(self_attention_vecs))

        final_merged_passage = final_merged_passage + residual_layer
        # batch_size * maxqa_pair_len * max_passage_len * 200
        final_merged_passage = self._variational_dropout(final_merged_passage)
        start_rep = self._span_start_encoder(final_merged_passage,
                                             repeated_passage_mask)
        span_start_logits = self._span_start_predictor(start_rep).squeeze(-1)

        end_rep = self._span_end_encoder(
            torch.cat([final_merged_passage, start_rep], dim=-1),
            repeated_passage_mask)
        span_end_logits = self._span_end_predictor(end_rep).squeeze(-1)

        span_yesno_logits = self._span_yesno_predictor(end_rep).squeeze(-1)
        span_followup_logits = self._span_followup_predictor(end_rep).squeeze(
            -1)

        span_start_logits = util.replace_masked_values(span_start_logits,
                                                       repeated_passage_mask,
                                                       -1e7)
        # batch_size * maxqa_len_pair, max_document_len
        span_end_logits = util.replace_masked_values(span_end_logits,
                                                     repeated_passage_mask,
                                                     -1e7)

        best_span = self._get_best_span_yesno_followup(span_start_logits,
                                                       span_end_logits,
                                                       span_yesno_logits,
                                                       span_followup_logits,
                                                       self._max_span_length)

        output_dict: Dict[str, Any] = {}

        # Compute the loss.
        if span_start is not None:
            loss = nll_loss(util.masked_log_softmax(span_start_logits,
                                                    repeated_passage_mask),
                            span_start.view(-1),
                            ignore_index=-1)
            self._span_start_accuracy(span_start_logits,
                                      span_start.view(-1),
                                      mask=qa_mask)
            loss += nll_loss(util.masked_log_softmax(span_end_logits,
                                                     repeated_passage_mask),
                             span_end.view(-1),
                             ignore_index=-1)
            self._span_end_accuracy(span_end_logits,
                                    span_end.view(-1),
                                    mask=qa_mask)
            self._span_accuracy(best_span[:, 0:2],
                                torch.stack([span_start, span_end],
                                            -1).view(total_qa_count, 2),
                                mask=qa_mask.unsqueeze(1).expand(-1, 2).long())
            # add a select for the right span to compute loss
            gold_span_end_loc = []
            span_end = span_end.view(
                total_qa_count).squeeze().data.cpu().numpy()
            for i in range(0, total_qa_count):
                gold_span_end_loc.append(
                    max(span_end[i] * 3 + i * passage_length * 3, 0))
                gold_span_end_loc.append(
                    max(span_end[i] * 3 + i * passage_length * 3 + 1, 0))
                gold_span_end_loc.append(
                    max(span_end[i] * 3 + i * passage_length * 3 + 2, 0))
            gold_span_end_loc = span_start.new(gold_span_end_loc)

            pred_span_end_loc = []
            for i in range(0, total_qa_count):
                pred_span_end_loc.append(
                    max(best_span[i][1] * 3 + i * passage_length * 3, 0))
                pred_span_end_loc.append(
                    max(best_span[i][1] * 3 + i * passage_length * 3 + 1, 0))
                pred_span_end_loc.append(
                    max(best_span[i][1] * 3 + i * passage_length * 3 + 2, 0))
            predicted_end = span_start.new(pred_span_end_loc)

            _yesno = span_yesno_logits.view(-1).index_select(
                0, gold_span_end_loc).view(-1, 3)
            _followup = span_followup_logits.view(-1).index_select(
                0, gold_span_end_loc).view(-1, 3)
            loss += nll_loss(F.log_softmax(_yesno, dim=-1),
                             yesno_list.view(-1),
                             ignore_index=-1)
            loss += nll_loss(F.log_softmax(_followup, dim=-1),
                             followup_list.view(-1),
                             ignore_index=-1)

            _yesno = span_yesno_logits.view(-1).index_select(
                0, predicted_end).view(-1, 3)
            _followup = span_followup_logits.view(-1).index_select(
                0, predicted_end).view(-1, 3)
            self._span_yesno_accuracy(_yesno,
                                      yesno_list.view(-1),
                                      mask=qa_mask)
            self._span_followup_accuracy(_followup,
                                         followup_list.view(-1),
                                         mask=qa_mask)
            output_dict["loss"] = loss

        # Compute F1 and preparing the output dictionary.
        output_dict['best_span_str'] = []
        output_dict['qid'] = []
        output_dict['followup'] = []
        output_dict['yesno'] = []
        best_span_cpu = best_span.detach().cpu().numpy()
        for i in range(batch_size):
            passage_str = metadata[i]['original_passage']
            offsets = metadata[i]['token_offsets']
            f1_score = 0.0
            per_dialog_best_span_list = []
            per_dialog_yesno_list = []
            per_dialog_followup_list = []
            per_dialog_query_id_list = []
            for per_dialog_query_index, (iid, answer_texts) in enumerate(
                    zip(metadata[i]["instance_id"],
                        metadata[i]["answer_texts_list"])):
                predicted_span = tuple(best_span_cpu[i * max_qa_count +
                                                     per_dialog_query_index])

                start_offset = offsets[predicted_span[0]][0]
                end_offset = offsets[predicted_span[1]][1]

                yesno_pred = predicted_span[2]
                followup_pred = predicted_span[3]
                per_dialog_yesno_list.append(yesno_pred)
                per_dialog_followup_list.append(followup_pred)
                per_dialog_query_id_list.append(iid)

                best_span_string = passage_str[start_offset:end_offset]
                per_dialog_best_span_list.append(best_span_string)
                if answer_texts:
                    if len(answer_texts) > 1:
                        t_f1 = []
                        # Compute max-F1 against each leave-one-out set of N-1 human references and average the scores.
                        for answer_index in range(len(answer_texts)):
                            idxes = list(range(len(answer_texts)))
                            idxes.pop(answer_index)
                            refs = [answer_texts[z] for z in idxes]
                            t_f1.append(
                                squad_eval.metric_max_over_ground_truths(
                                    squad_eval.f1_score, best_span_string,
                                    refs))
                        f1_score = 1.0 * sum(t_f1) / len(t_f1)
                    else:
                        f1_score = squad_eval.metric_max_over_ground_truths(
                            squad_eval.f1_score, best_span_string,
                            answer_texts)
                self._official_f1(100 * f1_score)
            output_dict['qid'].append(per_dialog_query_id_list)
            output_dict['best_span_str'].append(per_dialog_best_span_list)
            output_dict['yesno'].append(per_dialog_yesno_list)
            output_dict['followup'].append(per_dialog_followup_list)
        return output_dict
Example #18
    def forward(
            self,
            question: Dict[str, torch.LongTensor],
            passage: Dict[str, torch.LongTensor],
            span_start: torch.IntTensor = None,
            span_end: torch.IntTensor = None,
            yesno_list: torch.IntTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:

        batch_size, max_qa_count, max_q_len, _ = question[
            'token_characters'].size()
        total_qa_count = batch_size * max_qa_count
        qa_mask = torch.ge(yesno_list, 0).view(total_qa_count)
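        # Padded QA pairs carry a label of -1, so qa_mask keeps only real turns.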

        embedded_question = self._text_field_embedder(question,
                                                      num_wrapping_dims=1)
        # total_qa_count * max_q_len * encoding_dim
        embedded_question = embedded_question.reshape(
            total_qa_count, max_q_len,
            self._text_field_embedder.get_output_dim())
        embedded_passage = self._text_field_embedder(passage)

        # Split the embedded tensors into word/char embeddings, ELMo embeddings, and manual feature embeddings.
        word_emb_ques, elmo_ques, ques_feat = torch.split(embedded_question,
                                                          [200, 1024, 40],
                                                          dim=2)
        word_emb_pass, elmo_pass, pass_feat = torch.split(embedded_passage,
                                                          [200, 1024, 40],
                                                          dim=2)
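        # The split sizes are fixed by this model: 200-d word/char, 1024-d ELMo, 40-d manual features.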
        # Recombine the word/char embeddings with ELMo and apply variational dropout.
        embedded_question = self._variational_dropout(
            torch.cat([word_emb_ques, elmo_ques], dim=2))
        embedded_passage = self._variational_dropout(
            torch.cat([word_emb_pass, elmo_pass], dim=2))
        passage_length = embedded_passage.size(1)

        question_mask = util.get_text_field_mask(question,
                                                 num_wrapping_dims=1).float()
        question_mask = question_mask.reshape(total_qa_count, max_q_len)
        passage_mask = util.get_text_field_mask(passage).float()

        repeated_passage_mask = passage_mask.unsqueeze(1).repeat(
            1, max_qa_count, 1)
        repeated_passage_mask = repeated_passage_mask.view(
            total_qa_count, passage_length)

        encode_passage = self._phrase_layer(embedded_passage, passage_mask)
        projected_passage = self.relu(
            self.projected_layer(torch.cat([encode_passage, elmo_pass],
                                           dim=2)))

        encode_question = self._phrase_layer(embedded_question, question_mask)
        projected_question = self.relu(
            self.projected_layer(torch.cat([encode_question, elmo_ques],
                                           dim=2)))

        encoded_passage = self._variational_dropout(projected_passage)
        repeated_encoded_passage = encoded_passage.unsqueeze(1).repeat(
            1, max_qa_count, 1, 1)
        repeated_encoded_passage = repeated_encoded_passage.view(
            total_qa_count, passage_length, self._encoding_dim)
        repeated_pass_feat = (pass_feat.unsqueeze(1).repeat(
            1, max_qa_count, 1, 1)).view(total_qa_count, passage_length, 40)
        encoded_question = self._variational_dropout(projected_question)

        # total_qa_count * max_q_len * passage_length
        # cnt * m * n
        s = torch.bmm(encoded_question,
                      repeated_encoded_passage.transpose(2, 1))
        alpha = util.masked_softmax(s,
                                    question_mask.unsqueeze(2).expand(
                                        s.size()),
                                    dim=1)
        # cnt * n * h
        aligned_p = torch.bmm(alpha.transpose(2, 1), encoded_question)

        # cnt * m * n
        beta = util.masked_softmax(s,
                                   repeated_passage_mask.unsqueeze(1).expand(
                                       s.size()),
                                   dim=2)
        # cnt * m * h
        aligned_q = torch.bmm(beta, repeated_encoded_passage)

        fused_p = self.fuse_p(repeated_encoded_passage, aligned_p)
        fused_q = self.fuse_q(encoded_question, aligned_q)

        # add manual features here
        q_aware_p = self.projected_lstm(
            torch.cat([fused_p, repeated_pass_feat], dim=2),
            repeated_passage_mask)

        # cnt * n * n
        # self_p = torch.bmm(q_aware_p, q_aware_p.transpose(2, 1))
        # self_p = self.bilinear_self_align(q_aware_p)
        self_p = self._self_attention(q_aware_p, q_aware_p)
        # for i in range(passage_length):
        #     self_p[:, i, i] = 0
        mask = repeated_passage_mask.reshape(
            total_qa_count, passage_length, 1) * repeated_passage_mask.reshape(
                total_qa_count, 1, passage_length)
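        # Build an identity mask and zero the diagonal so a token cannot attend to itself.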
        self_mask = torch.eye(passage_length,
                              passage_length,
                              device=self_p.device)
        self_mask = self_mask.reshape(1, passage_length, passage_length)
        mask = mask * (1 - self_mask)

        lamb = util.masked_softmax(self_p, mask, dim=2)
        # lamb = util.masked_softmax(self_p, repeated_passage_mask, dim=2)
        # cnt * n * h
        self_aligned_p = torch.bmm(lamb, q_aware_p)

        # cnt * n * h
        fused_self_p = self.fuse_s(q_aware_p, self_aligned_p)
        # contextual_p = self._variational_dropout(self.contextual_layer_p(fused_self_p, repeated_passage_mask))
        contextual_p = self.contextual_layer_p(fused_self_p,
                                               repeated_passage_mask)

        # contextual_q = self._variational_dropout(self.contextual_layer_q(fused_q, question_mask))
        contextual_q = self.contextual_layer_q(fused_q, question_mask)
        # cnt * m
        gamma = util.masked_softmax(
            self.linear_self_align(contextual_q).squeeze(2),
            question_mask,
            dim=1)
        # cnt * h
        weighted_q = torch.bmm(gamma.unsqueeze(1), contextual_q).squeeze(1)

        span_start_logits = self.bilinear_layer_s(weighted_q, contextual_p)
        span_end_logits = self.bilinear_layer_e(weighted_q, contextual_p)

        # Outer product: (cnt, n, 1) x (cnt, 1, h) -> (cnt, n, h), fed to the yes/no predictor.
        span_yesno_logits = self.yesno_predictor(
            torch.bmm(span_end_logits.unsqueeze(2), weighted_q.unsqueeze(1)))
        # span_yesno_logits = self.yesno_predictor(contextual_p)

        span_start_logits = util.replace_masked_values(span_start_logits,
                                                       repeated_passage_mask,
                                                       -1e7)
        span_end_logits = util.replace_masked_values(span_end_logits,
                                                     repeated_passage_mask,
                                                     -1e7)

        best_span = self._get_best_span_yesno_followup(span_start_logits,
                                                       span_end_logits,
                                                       span_yesno_logits,
                                                       self._max_span_length)
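        # Each row of best_span is (span_start, span_end, yesno) for one QA pair.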

        output_dict: Dict[str, Any] = {}

        # Compute the loss for training

        if span_start is not None:
            loss = nll_loss(util.masked_log_softmax(span_start_logits,
                                                    repeated_passage_mask),
                            span_start.view(-1),
                            ignore_index=-1)
            self._span_start_accuracy(span_start_logits,
                                      span_start.view(-1),
                                      mask=qa_mask)
            loss += nll_loss(util.masked_log_softmax(span_end_logits,
                                                     repeated_passage_mask),
                             span_end.view(-1),
                             ignore_index=-1)
            self._span_end_accuracy(span_end_logits,
                                    span_end.view(-1),
                                    mask=qa_mask)
            self._span_accuracy(best_span[:, 0:2],
                                torch.stack([span_start, span_end],
                                            -1).view(total_qa_count, 2),
                                mask=qa_mask.unsqueeze(1).expand(-1, 2).long())
            # Select the yes/no logits at the gold span-end positions (same flattened-index trick as in the previous example).
            gold_span_end_loc = []
            span_end = span_end.view(
                total_qa_count).squeeze().data.cpu().numpy()
            for i in range(0, total_qa_count):
                gold_span_end_loc.append(
                    max(span_end[i] * 3 + i * passage_length * 3, 0))
                gold_span_end_loc.append(
                    max(span_end[i] * 3 + i * passage_length * 3 + 1, 0))
                gold_span_end_loc.append(
                    max(span_end[i] * 3 + i * passage_length * 3 + 2, 0))
            gold_span_end_loc = span_start.new(gold_span_end_loc)
            pred_span_end_loc = []
            for i in range(0, total_qa_count):
                pred_span_end_loc.append(
                    max(best_span[i][1] * 3 + i * passage_length * 3, 0))
                pred_span_end_loc.append(
                    max(best_span[i][1] * 3 + i * passage_length * 3 + 1, 0))
                pred_span_end_loc.append(
                    max(best_span[i][1] * 3 + i * passage_length * 3 + 2, 0))
            predicted_end = span_start.new(pred_span_end_loc)

            _yesno = span_yesno_logits.view(-1).index_select(
                0, gold_span_end_loc).view(-1, 3)
            loss += nll_loss(torch.nn.functional.log_softmax(_yesno, dim=-1),
                             yesno_list.view(-1),
                             ignore_index=-1)

            _yesno = span_yesno_logits.view(-1).index_select(
                0, predicted_end).view(-1, 3)
            self._span_yesno_accuracy(_yesno,
                                      yesno_list.view(-1),
                                      mask=qa_mask)

            output_dict["loss"] = loss

        # Compute the official F1 and prepare the output dictionary.
        output_dict['best_span_str'] = []
        output_dict['qid'] = []
        output_dict['yesno'] = []
        best_span_cpu = best_span.detach().cpu().numpy()
        for i in range(batch_size):
            passage_str = metadata[i]['original_passage']
            offsets = metadata[i]['token_offsets']
            f1_score = 0.0
            per_dialog_best_span_list = []
            per_dialog_yesno_list = []
            per_dialog_query_id_list = []
            for per_dialog_query_index, (iid, answer_texts) in enumerate(
                    zip(metadata[i]["instance_id"],
                        metadata[i]["answer_texts_list"])):
                predicted_span = tuple(best_span_cpu[i * max_qa_count +
                                                     per_dialog_query_index])
                start_offset = offsets[predicted_span[0]][0]
                end_offset = offsets[predicted_span[1]][1]
                yesno_pred = predicted_span[2]
                per_dialog_yesno_list.append(yesno_pred)
                per_dialog_query_id_list.append(iid)
                best_span_string = passage_str[start_offset:end_offset]
                per_dialog_best_span_list.append(best_span_string)
                if answer_texts:
                    if len(answer_texts) > 1:
                        t_f1 = []
                        # Compute max-F1 against each leave-one-out set of N-1 human references and average the scores.
                        for answer_index in range(len(answer_texts)):
                            idxes = list(range(len(answer_texts)))
                            idxes.pop(answer_index)
                            refs = [answer_texts[z] for z in idxes]
                            t_f1.append(
                                squad_eval.metric_max_over_ground_truths(
                                    squad_eval.f1_score, best_span_string,
                                    refs))
                        f1_score = 1.0 * sum(t_f1) / len(t_f1)
                    else:
                        f1_score = squad_eval.metric_max_over_ground_truths(
                            squad_eval.f1_score, best_span_string,
                            answer_texts)
                self._official_f1(100 * f1_score)
            output_dict['qid'].append(per_dialog_query_id_list)
            output_dict['best_span_str'].append(per_dialog_best_span_list)
            output_dict['yesno'].append(per_dialog_yesno_list)
        return output_dict
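Both of the examples above, and the one that follows, pick per-class logits at a span end by flattening a 3-D logits tensor and computing raw offsets by hand. The following minimal standalone sketch (not one of the scraped examples; all names and sizes are illustrative) shows the same indexing trick next to the equivalent, easier-to-read advanced-indexing form.

import torch

total_qa_count, passage_length = 2, 5
# Per-position yes/no logits: (total_qa_count, passage_length, 3)
span_yesno_logits = torch.randn(total_qa_count, passage_length, 3)
span_end = torch.tensor([3, 1])  # gold end position for each QA pair

# After view(-1), the logit for (qa i, position j, class k) sits at
# flat index i * passage_length * 3 + j * 3 + k.
flat_idx = []
for i in range(total_qa_count):
    base = i * passage_length * 3 + span_end[i].item() * 3
    flat_idx.extend([base, base + 1, base + 2])
flat_idx = torch.tensor(flat_idx)

picked = span_yesno_logits.view(-1).index_select(0, flat_idx).view(-1, 3)

# Equivalent gather with advanced indexing:
direct = span_yesno_logits[torch.arange(total_qa_count), span_end]
assert torch.equal(picked, direct)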
Example #19
    def forward(self,  # type: ignore
                question: Dict[str, torch.LongTensor],
                passage: Dict[str, torch.LongTensor],
                span_start: torch.IntTensor = None,
                span_end: torch.IntTensor = None,
                p1_answer_marker: torch.IntTensor = None,
                p2_answer_marker: torch.IntTensor = None,
                p3_answer_marker: torch.IntTensor = None,
                yesno_list: torch.IntTensor = None,
                followup_list: torch.IntTensor = None,
                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        question : Dict[str, torch.LongTensor]
            From a ``TextField``.
        passage : Dict[str, torch.LongTensor]
            From a ``TextField``.  The model assumes that this passage contains the answer to the
            question, and predicts the beginning and ending positions of the answer within the
            passage.
        span_start : ``torch.IntTensor``, optional
            From an ``IndexField``.  This is one of the things we are trying to predict - the
            beginning position of the answer within the passage.  This is an `inclusive` token index.
            If this is given, we will compute a loss that gets included in the output dictionary.
        span_end : ``torch.IntTensor``, optional
            From an ``IndexField``.  This is one of the things we are trying to predict - the
            ending position of the answer within the passage.  This is an `inclusive` token index.
            If this is given, we will compute a loss that gets included in the output dictionary.
        p1_answer_marker : ``torch.IntTensor``, optional
            This is one of the inputs, but only when num_context_answers > 0.
            This is a tensor that has a shape [batch_size, max_qa_count, max_passage_length].
            Most passage tokens will be assigned 'O', except the passage tokens that belong to the previous
            answer in the dialog, which will be assigned labels such as <1_start>, <1_in>, <1_end>.
            For more details, look into dataset_readers/util/make_reading_comprehension_instance_quac
        p2_answer_marker :  ``torch.IntTensor``, optional
            This is one of the inputs, but only when num_context_answers > 1.
            It is similar to p1_answer_marker, but marks the answer from two turns ago in the passage.
        p3_answer_marker :  ``torch.IntTensor``, optional
            This is one of the inputs, but only when num_context_answers > 2.
            It is similar to p1_answer_marker, but marks the answer from three turns ago in the passage.
        yesno_list :  ``torch.IntTensor``, optional
            This is one of the outputs that we are trying to predict.
            Three-way classification (yes / no / not a yes-no question).
        followup_list :  ``torch.IntTensor``, optional
            This is one of the outputs that we are trying to predict.
            Three-way classification (follow up / maybe follow up / don't follow up).
        metadata : ``List[Dict[str, Any]]``, optional
            If present, this should contain the question ID, original passage text, and token
            offsets into the passage for each instance in the batch.  We use this for computing
            official metrics using the official SQuAD evaluation script.  The length of this list
            should be the batch size, and each dictionary should have the keys ``id``,
            ``original_passage``, and ``token_offsets``.  If you only want the best span string and
            don't care about official metrics, you can omit the ``id`` key.

        Returns
        -------
        An output dictionary consisting of the following entries.
        Each entry is a nested list: the outer list iterates over dialogs, the inner list over the questions in each dialog.

        qid : List[List[str]]
            A list of lists of question ids.
        followup : List[List[int]]
            A list of lists of follow-up marker prediction indices
            (y: yes, m: maybe follow up, n: don't follow up).
        yesno : List[List[int]]
            A list of lists of affirmation marker prediction indices
            (y: yes, x: not a yes/no question, n: no).
        best_span_str : List[List[str]]
            If sufficient metadata was provided for the instances in the batch, we also return the
            string from the original passage that the model thinks is the best answer to the
            question.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        """
        batch_size, max_qa_count, max_q_len, _ = question['token_characters'].size()
        total_qa_count = batch_size * max_qa_count
        qa_mask = torch.ge(followup_list, 0).view(total_qa_count)
        embedded_question = self._text_field_embedder(question, num_wrapping_dims=1)
        embedded_question = embedded_question.reshape(total_qa_count, max_q_len,
                                                      self._text_field_embedder.get_output_dim())
        embedded_question = self._variational_dropout(embedded_question)
        embedded_passage = self._variational_dropout(self._text_field_embedder(passage))
        passage_length = embedded_passage.size(1)

        question_mask = util.get_text_field_mask(question, num_wrapping_dims=1).float()
        question_mask = question_mask.reshape(total_qa_count, max_q_len)
        passage_mask = util.get_text_field_mask(passage).float()

        repeated_passage_mask = passage_mask.unsqueeze(1).repeat(1, max_qa_count, 1)
        repeated_passage_mask = repeated_passage_mask.view(total_qa_count, passage_length)

        if self._num_context_answers > 0:
            # Encode the question turn number within the dialog into the question embedding.
            question_num_ind = util.get_range_vector(max_qa_count, util.get_device_of(embedded_question))
            question_num_ind = question_num_ind.unsqueeze(-1).repeat(1, max_q_len)
            question_num_ind = question_num_ind.unsqueeze(0).repeat(batch_size, 1, 1)
            question_num_ind = question_num_ind.reshape(total_qa_count, max_q_len)
            question_num_marker_emb = self._question_num_marker(question_num_ind)
            embedded_question = torch.cat([embedded_question, question_num_marker_emb], dim=-1)

            # Encode the previous answers into the passage embedding.
            repeated_embedded_passage = embedded_passage.unsqueeze(1).repeat(1, max_qa_count, 1, 1). \
                view(total_qa_count, passage_length, self._text_field_embedder.get_output_dim())
            # batch_size * max_qa_count, passage_length, word_embed_dim
            p1_answer_marker = p1_answer_marker.view(total_qa_count, passage_length)
            p1_answer_marker_emb = self._prev_ans_marker(p1_answer_marker)
            repeated_embedded_passage = torch.cat([repeated_embedded_passage, p1_answer_marker_emb], dim=-1)
            if self._num_context_answers > 1:
                p2_answer_marker = p2_answer_marker.view(total_qa_count, passage_length)
                p2_answer_marker_emb = self._prev_ans_marker(p2_answer_marker)
                repeated_embedded_passage = torch.cat([repeated_embedded_passage, p2_answer_marker_emb], dim=-1)
                if self._num_context_answers > 2:
                    p3_answer_marker = p3_answer_marker.view(total_qa_count, passage_length)
                    p3_answer_marker_emb = self._prev_ans_marker(p3_answer_marker)
                    repeated_embedded_passage = torch.cat([repeated_embedded_passage, p3_answer_marker_emb],
                                                          dim=-1)

            repeated_encoded_passage = self._variational_dropout(self._phrase_layer(repeated_embedded_passage,
                                                                                    repeated_passage_mask))
        else:
            encoded_passage = self._variational_dropout(self._phrase_layer(embedded_passage, passage_mask))
            repeated_encoded_passage = encoded_passage.unsqueeze(1).repeat(1, max_qa_count, 1, 1)
            repeated_encoded_passage = repeated_encoded_passage.view(total_qa_count,
                                                                     passage_length,
                                                                     self._encoding_dim)

        encoded_question = self._variational_dropout(self._phrase_layer(embedded_question, question_mask))

        # Shape: (batch_size * max_qa_count, passage_length, question_length)
        passage_question_similarity = self._matrix_attention(repeated_encoded_passage, encoded_question)
        # Shape: (batch_size * max_qa_count, passage_length, question_length)
        passage_question_attention = util.masked_softmax(passage_question_similarity, question_mask)
        # Shape: (batch_size * max_qa_count, passage_length, encoding_dim)
        passage_question_vectors = util.weighted_sum(encoded_question, passage_question_attention)

        # We replace masked values with something really negative here, so they don't affect the
        # max below.
        masked_similarity = util.replace_masked_values(passage_question_similarity,
                                                       question_mask.unsqueeze(1),
                                                       -1e7)

        question_passage_similarity = masked_similarity.max(dim=-1)[0].squeeze(-1)
        question_passage_attention = util.masked_softmax(question_passage_similarity, repeated_passage_mask)
        # Shape: (batch_size * max_qa_count, encoding_dim)
        question_passage_vector = util.weighted_sum(repeated_encoded_passage, question_passage_attention)
        tiled_question_passage_vector = question_passage_vector.unsqueeze(1).expand(total_qa_count,
                                                                                    passage_length,
                                                                                    self._encoding_dim)

        # Shape: (batch_size * max_qa_count, passage_length, encoding_dim * 4)
        final_merged_passage = torch.cat([repeated_encoded_passage,
                                          passage_question_vectors,
                                          repeated_encoded_passage * passage_question_vectors,
                                          repeated_encoded_passage * tiled_question_passage_vector],
                                         dim=-1)

        final_merged_passage = F.relu(self._merge_atten(final_merged_passage))

        residual_layer = self._variational_dropout(self._residual_encoder(final_merged_passage,
                                                                          repeated_passage_mask))
        self_attention_matrix = self._self_attention(residual_layer, residual_layer)

        mask = repeated_passage_mask.reshape(total_qa_count, passage_length, 1) \
               * repeated_passage_mask.reshape(total_qa_count, 1, passage_length)
        self_mask = torch.eye(passage_length, passage_length, device=self_attention_matrix.device)
        self_mask = self_mask.reshape(1, passage_length, passage_length)
        mask = mask * (1 - self_mask)

        self_attention_probs = util.masked_softmax(self_attention_matrix, mask)

        # (batch, passage_len, passage_len) * (batch, passage_len, dim) -> (batch, passage_len, dim)
        self_attention_vecs = torch.matmul(self_attention_probs, residual_layer)
        self_attention_vecs = torch.cat([self_attention_vecs, residual_layer,
                                         residual_layer * self_attention_vecs],
                                        dim=-1)
        residual_layer = F.relu(self._merge_self_attention(self_attention_vecs))

        final_merged_passage = final_merged_passage + residual_layer
        # batch_size * max_qa_count, max_passage_len, 200
        final_merged_passage = self._variational_dropout(final_merged_passage)
        start_rep = self._span_start_encoder(final_merged_passage, repeated_passage_mask)
        span_start_logits = self._span_start_predictor(start_rep).squeeze(-1)

        end_rep = self._span_end_encoder(torch.cat([final_merged_passage, start_rep], dim=-1),
                                         repeated_passage_mask)
        span_end_logits = self._span_end_predictor(end_rep).squeeze(-1)

        span_yesno_logits = self._span_yesno_predictor(end_rep).squeeze(-1)
        span_followup_logits = self._span_followup_predictor(end_rep).squeeze(-1)
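        # Yes/no and follow-up logits are predicted at every passage position
        # from the span-end representation: total_qa_count, passage_length, 3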

        span_start_logits = util.replace_masked_values(span_start_logits, repeated_passage_mask, -1e7)
        # batch_size * max_qa_count, max_document_len
        span_end_logits = util.replace_masked_values(span_end_logits, repeated_passage_mask, -1e7)

        best_span = self._get_best_span_yesno_followup(span_start_logits, span_end_logits,
                                                       span_yesno_logits, span_followup_logits,
                                                       self._max_span_length)
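        # Each row of best_span is (span_start, span_end, yesno, followup) for one QA pair.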

        output_dict: Dict[str, Any] = {}

        # Compute the loss.
        if span_start is not None:
            loss = nll_loss(util.masked_log_softmax(span_start_logits, repeated_passage_mask), span_start.view(-1),
                            ignore_index=-1)
            self._span_start_accuracy(span_start_logits, span_start.view(-1), mask=qa_mask)
            loss += nll_loss(util.masked_log_softmax(span_end_logits,
                                                     repeated_passage_mask), span_end.view(-1), ignore_index=-1)
            self._span_end_accuracy(span_end_logits, span_end.view(-1), mask=qa_mask)
            self._span_accuracy(best_span[:, 0:2],
                                torch.stack([span_start, span_end], -1).view(total_qa_count, 2),
                                mask=qa_mask.unsqueeze(1).expand(-1, 2).long())
            # Select the yes/no and follow-up logits at the gold span-end positions to compute their losses.
            gold_span_end_loc = []
            span_end = span_end.view(total_qa_count).squeeze().data.cpu().numpy()
            for i in range(0, total_qa_count):
                gold_span_end_loc.append(max(span_end[i] * 3 + i * passage_length * 3, 0))
                gold_span_end_loc.append(max(span_end[i] * 3 + i * passage_length * 3 + 1, 0))
                gold_span_end_loc.append(max(span_end[i] * 3 + i * passage_length * 3 + 2, 0))
            gold_span_end_loc = span_start.new(gold_span_end_loc)

            pred_span_end_loc = []
            for i in range(0, total_qa_count):
                pred_span_end_loc.append(max(best_span[i][1] * 3 + i * passage_length * 3, 0))
                pred_span_end_loc.append(max(best_span[i][1] * 3 + i * passage_length * 3 + 1, 0))
                pred_span_end_loc.append(max(best_span[i][1] * 3 + i * passage_length * 3 + 2, 0))
            predicted_end = span_start.new(pred_span_end_loc)

            _yesno = span_yesno_logits.view(-1).index_select(0, gold_span_end_loc).view(-1, 3)
            _followup = span_followup_logits.view(-1).index_select(0, gold_span_end_loc).view(-1, 3)
            loss += nll_loss(F.log_softmax(_yesno, dim=-1), yesno_list.view(-1), ignore_index=-1)
            loss += nll_loss(F.log_softmax(_followup, dim=-1), followup_list.view(-1), ignore_index=-1)

            _yesno = span_yesno_logits.view(-1).index_select(0, predicted_end).view(-1, 3)
            _followup = span_followup_logits.view(-1).index_select(0, predicted_end).view(-1, 3)
            self._span_yesno_accuracy(_yesno, yesno_list.view(-1), mask=qa_mask)
            self._span_followup_accuracy(_followup, followup_list.view(-1), mask=qa_mask)
            output_dict["loss"] = loss

        # Compute F1 and prepare the output dictionary.
        output_dict['best_span_str'] = []
        output_dict['qid'] = []
        output_dict['followup'] = []
        output_dict['yesno'] = []
        best_span_cpu = best_span.detach().cpu().numpy()
        for i in range(batch_size):
            passage_str = metadata[i]['original_passage']
            offsets = metadata[i]['token_offsets']
            f1_score = 0.0
            per_dialog_best_span_list = []
            per_dialog_yesno_list = []
            per_dialog_followup_list = []
            per_dialog_query_id_list = []
            for per_dialog_query_index, (iid, answer_texts) in enumerate(
                    zip(metadata[i]["instance_id"], metadata[i]["answer_texts_list"])):
                predicted_span = tuple(best_span_cpu[i * max_qa_count + per_dialog_query_index])

                start_offset = offsets[predicted_span[0]][0]
                end_offset = offsets[predicted_span[1]][1]

                yesno_pred = predicted_span[2]
                followup_pred = predicted_span[3]
                per_dialog_yesno_list.append(yesno_pred)
                per_dialog_followup_list.append(followup_pred)
                per_dialog_query_id_list.append(iid)

                best_span_string = passage_str[start_offset:end_offset]
                per_dialog_best_span_list.append(best_span_string)
                if answer_texts:
                    if len(answer_texts) > 1:
                        t_f1 = []
                        # Compute max-F1 against each leave-one-out set of N-1 human references and average the scores.
                        for answer_index in range(len(answer_texts)):
                            idxes = list(range(len(answer_texts)))
                            idxes.pop(answer_index)
                            refs = [answer_texts[z] for z in idxes]
                            t_f1.append(squad_eval.metric_max_over_ground_truths(squad_eval.f1_score,
                                                                                 best_span_string,
                                                                                 refs))
                        f1_score = 1.0 * sum(t_f1) / len(t_f1)
                    else:
                        f1_score = squad_eval.metric_max_over_ground_truths(squad_eval.f1_score,
                                                                            best_span_string,
                                                                            answer_texts)
                self._official_f1(100 * f1_score)
            output_dict['qid'].append(per_dialog_query_id_list)
            output_dict['best_span_str'].append(per_dialog_best_span_list)
            output_dict['yesno'].append(per_dialog_yesno_list)
            output_dict['followup'].append(per_dialog_followup_list)
        return output_dict
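All three examples share the QuAC-style leave-one-out scoring: when a question has several human references, each reference is held out in turn, the prediction is scored against the remaining N-1 references with metric_max_over_ground_truths, and the per-held-out scores are averaged. Below is a minimal self-contained sketch of that protocol; the function name leave_one_out_f1 is our own, and the import path is an assumption (any module exposing the official squad_eval functions would do).

from typing import List

from allennlp.tools import squad_eval  # assumed import path for squad_eval


def leave_one_out_f1(prediction: str, answer_texts: List[str]) -> float:
    """Average max-F1 against each leave-one-out set of references."""
    if not answer_texts:
        return 0.0
    if len(answer_texts) == 1:
        return squad_eval.metric_max_over_ground_truths(
            squad_eval.f1_score, prediction, answer_texts)
    scores = []
    for held_out in range(len(answer_texts)):
        # Score against the N-1 references that remain after holding one out.
        refs = [a for z, a in enumerate(answer_texts) if z != held_out]
        scores.append(squad_eval.metric_max_over_ground_truths(
            squad_eval.f1_score, prediction, refs))
    return sum(scores) / len(scores)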