def qa(targets, predictions):
    """Scores predictions with SQuAD exact match and F1, best target per question.

    Args:
      targets: list of lists of strings; the acceptable answers for each question.
      predictions: list of strings; one predicted answer per question.

    Returns:
      dict with keys "em" and "f1": the mean per-question score, scaled by 100.
    """
    assert len(targets) == len(predictions)
    gold_answers = [[tf.compat.as_text(answer) for answer in group]
                    for group in targets]
    guesses = [tf.compat.as_text(guess) for guess in predictions]

    def _mean_percent(metric_fn):
        # Best score over each question's gold answers, averaged, as a percentage.
        per_question = []
        for guess, golds in zip(guesses, gold_answers):
            per_question.append(
                squad_eval.metric_max_over_ground_truths(metric_fn, guess, golds))
        return np.mean(per_question) * 100

    em = _mean_percent(squad_eval.exact_match_score)
    f1 = _mean_percent(squad_eval.f1_score)
    logging.info("EM = %.2f, F1 = %.2f", em, f1)
    return {"em": em, "f1": f1}
def calc_em_and_f1(best_span_string, answer_strings):
    """Return ``(exact_match, f1)`` for a predicted span.

    Each score is computed with the corresponding ``squad_eval`` metric and
    maximised over ``answer_strings`` by
    ``squad_eval.metric_max_over_ground_truths``.
    """
    best_over_golds = squad_eval.metric_max_over_ground_truths
    em = best_over_golds(
        squad_eval.exact_match_score, best_span_string, answer_strings)
    f1 = best_over_golds(
        squad_eval.f1_score, best_span_string, answer_strings)
    return em, f1
def __call__(self, best_span_string, answer_strings):
    """
    Score one prediction and fold it into the running totals.

    Parameters
    ----------
    best_span_string :
        The predicted answer, passed to the ``squad_eval`` metrics as the
        prediction.
    answer_strings :
        The gold answers the prediction is scored against; the best score
        over them is kept.
    """
    best_over_golds = squad_eval.metric_max_over_ground_truths
    em = best_over_golds(
        squad_eval.exact_match_score, best_span_string, answer_strings)
    f1 = best_over_golds(
        squad_eval.f1_score, best_span_string, answer_strings)

    # Accumulate only after both scores were computed successfully.
    self._total_em += em
    self._total_f1 += f1
    self._count += 1
def _compute_f1_score(self, pred_tokens, target_tokens):
    """Feed the F1 of every (prediction, target) pair into ``self._official_f1``.

    Both token sequences are turned into strings with
    ``self._get_string_from_tokens``; each prediction is scored against its
    single target and the score is reported scaled by 100.
    """
    position = 0
    total = len(target_tokens)
    while position < total:
        predicted_text = self._get_string_from_tokens(pred_tokens[position])
        gold_text = self._get_string_from_tokens(target_tokens[position])
        score = squad_eval.metric_max_over_ground_truths(
            squad_eval.f1_score, predicted_text, [gold_text])
        self._official_f1(100 * score)
        position += 1
def max_prec(st_gold, en_gold, st_pred, en_pred, cannotanswer, cannotanswer_pred, ctx_text):
    """Score each predicted span with ``cached_prec``, taking the best over its gold spans.

    For every row, the predicted tokens ``ctx_text[...][sp:ep+1]`` and each gold
    span ``ctx_text[...][sg[j]:eg[j]+1]`` are joined with spaces and compared via
    ``squad_eval.metric_max_over_ground_truths``. Rows whose ``cannotanswer_pred``
    entry is truthy get a score of 1; rows with no gold span get 0.

    Returns a tensor of per-row scores placed on ``ans_mask.device``.

    NOTE(review): ``ans_mask`` is neither a parameter nor a local here; it has to
    resolve from an enclosing/module scope, otherwise this raises ``NameError`` —
    confirm. ``cannotanswer`` is accepted but never read.
    """
    batch_size = ans_mask.size(0)
    # Rows are grouped per context: consecutive blocks of ``max_turns`` rows share one ctx_text entry.
    max_turns = batch_size // len(ctx_text)
    prec = []
    cannotanswer_mask = (cannotanswer_pred).cpu()
    for idx, (sg, eg, sp, ep, cm) in enumerate(zip(st_gold.cpu(), en_gold.cpu(), st_pred.cpu(), en_pred.cpu(), cannotanswer_mask)):
        if cm:
            # Predicted "cannot answer": scored as 1 without looking at the spans.
            prec.append(1)
            continue
        ct = ctx_text[idx // max_turns]
        p = ct[sp:ep+1]
        golds = []
        for j in range(ans_mask.size(-1)):
            # A negative gold end index ends the list of gold spans for this row.
            if eg[j] < 0:
                break
            golds.append(' '.join(ct[sg[j]:eg[j]+1]))
        if len(golds) == 0:
            prec.append(0)
        else:
            prec.append(squad_eval.metric_max_over_ground_truths(cached_prec, ' '.join(p), golds))
    prec = torch.Tensor(prec).to(ans_mask.device)
    return prec
def search_qa(self, token_inflections, orig_tokenized, original_loss, question_dict, context, conservative=True, backward=False):
    """Greedily pick, token by token, the inflection that raises the model loss most.

    ``token_inflections`` is walked in order (or reversed when ``backward`` is
    set). Each entry is indexed as ``entry[0]`` (token position) and
    ``entry[1]`` (candidate inflections). Every candidate is substituted,
    the sentence is detokenized and scored with ``self.get_loss``; the
    candidate with the highest loss seen so far is kept for that position,
    otherwise the original token is restored.

    With ``conservative`` set, the scan of a position's candidates stops as
    soon as a candidate's prediction has zero F1 against
    ``question_dict['gold_texts']``.

    Returns ``(tokens, max_loss, max_predicted, num_queries)``.
    """
    working_tokens = orig_tokenized.copy()
    best_loss = original_loss
    query_count = 0
    best_prediction = ''

    ordered_inflections = reversed(token_inflections) if backward else token_inflections
    detok = MosesDetokenizer(lang='en')

    for entry in ordered_inflections:
        position = entry[0]
        # Fall back to the original token unless a candidate beats the best loss.
        best_choice = orig_tokenized[position]
        for candidate in entry[1]:
            working_tokens[position] = candidate
            candidate_text = detok.detokenize(working_tokens)
            candidate_loss, candidate_prediction = self.get_loss(
                candidate_text, question_dict, context)
            query_count += 1

            if candidate_loss > best_loss:
                best_loss = candidate_loss
                best_choice = candidate
                best_prediction = candidate_prediction

            if not conservative:
                continue
            if metric_max_over_ground_truths(
                    compute_f1, candidate_prediction,
                    question_dict['gold_texts']) == 0:
                break
        working_tokens[position] = best_choice

    return working_tokens, best_loss, best_prediction, query_count
def attack_one(self, question_dict, context, constrain_pos=True):
    """Attack a single question with local search over token inflections.

    Side effect: stores ``gold_char_spans`` and ``gold_texts`` on
    ``question_dict`` (derived from ``question_dict['answers']``).

    Parameters
    ----------
    question_dict :
        Must provide ``'question'`` and ``'answers'``; each answer provides
        ``'answer_start'`` and ``'text'``.
    context :
        Passed through unchanged to ``self.get_loss`` / ``self.local_search_qa``.
    constrain_pos :
        Passed through to ``get_inflections``.

    Returns
    -------
    A 4-tuple ``(question_text, num_queries, modif_rate, is_attack_success)``.
    Skipped questions return the original text with ``None`` for the last two
    items: ``num_queries`` is 0 when the question has fewer than 10 or more
    than 100 tokens, and 1 when the unperturbed prediction already has zero
    F1 against the gold texts.
    """
    original = question_dict['question']
    gold_starts = [ans['answer_start'] for ans in question_dict['answers']]
    gold_texts = [ans['text'] for ans in question_dict['answers']]
    gold_ends = [start + len(text) for start, text in zip(gold_starts, gold_texts)]
    question_dict['gold_char_spans'] = list(zip(gold_starts, gold_ends))
    question_dict['gold_texts'] = gold_texts

    orig_tokenized = MosesTokenizer(lang='en').tokenize(original)
    # Tokens containing '&' are re-tagged as punctuation ('.').
    pos_tagged = [
        (tagged[0], '.') if '&' in tagged[0] else tagged
        for tagged in nltk.pos_tag(orig_tokenized, tagset='universal')
    ]
    token_inflections = super(LocalSearchQA, self).get_inflections(
        orig_tokenized, pos_tagged, constrain_pos)

    # skip too long or too short question.
    # Checked before the first model query so the reported query count of 0
    # is accurate and no model call is spent on a skipped question.
    if not 10 <= len(orig_tokenized) <= 100:
        return original, 0, None, None

    original_loss, init_predicted = self.get_loss(original, question_dict, context)

    # skip wrong predict question (exactly one query was made)
    if metric_max_over_ground_truths(compute_f1, init_predicted,
                                     question_dict['gold_texts']) == 0:
        return original, 1, None, None

    _perturbed, _loss, _predicted, _num_queries = self.local_search_qa(
        token_inflections, orig_tokenized, original_loss, question_dict, context)

    # The attack succeeds when the perturbed prediction has zero F1.
    is_attack_success = bool(
        metric_max_over_ground_truths(compute_f1, _predicted,
                                      question_dict['gold_texts']) == 0)

    num_queries = 1 + _num_queries  # +1 for the query on the original question
    modif_rate = self.get_modif_rate(orig_tokenized, _perturbed)
    return MosesDetokenizer(lang='en').detokenize(
        _perturbed), num_queries, modif_rate, is_attack_success
def f1metric(prediction: Union[str, List], ground_truths: List):  # type: ignore
    """
    Score one prediction against all of its ground-truth annotations.

    Parameters
    ----------
    prediction: ``Union[str, List]``
        The predicted answer from the model evaluated. This could be a string,
        or a list of string when multiple spans are predicted as answer.
    ground_truths: ``List``
        All the ground truth answer annotations.

    Returns
    -------
    The ``(exact_match, f1_score)`` pair selected by
    ``metric_max_over_ground_truths`` using ``drop_em_and_f1``.
    """
    # Only element [0] of each converted annotation is kept. To split results
    # by answer type, element [1] could be collected here and grouped on.
    gold_strings = []
    for annotation in ground_truths:
        gold_strings.append(answer_json_to_strings(annotation)[0])

    exact_match, f1_score = metric_max_over_ground_truths(
        drop_em_and_f1, prediction, gold_strings)
    return (exact_match, f1_score)
def morph(self, question_dict, context, constrain_pos=False, conservative=False):
    """Perturb a question's inflections with a forward and a backward greedy search.

    Side effect: stores ``gold_char_spans`` and ``gold_texts`` on
    ``question_dict``.

    The forward pass of ``self.search_qa`` runs first. With ``conservative``
    set, if its prediction already has zero F1 against the gold texts, that
    result is returned immediately. Otherwise the backward pass runs as well
    and the pass with the higher loss wins (the backward pass on ties).

    Returns ``(perturbed_question_text, num_queries)``; the query count
    includes the one query on the unperturbed question.
    """
    original = question_dict['question']
    gold_starts = [ans['answer_start'] for ans in question_dict['answers']]
    gold_texts = [ans['text'] for ans in question_dict['answers']]
    gold_ends = [start + len(text) for start, text in zip(gold_starts, gold_texts)]
    question_dict['gold_char_spans'] = list(zip(gold_starts, gold_ends))
    question_dict['gold_texts'] = gold_texts

    orig_tokenized = MosesTokenizer(lang='en').tokenize(original)

    # Tokens containing '&' are re-tagged as punctuation ('.').
    pos_tagged = []
    for tagged in nltk.pos_tag(orig_tokenized, tagset='universal'):
        pos_tagged.append((tagged[0], '.') if '&' in tagged[0] else tagged)

    token_inflections = super(MorpheusQA, self).get_inflections(
        orig_tokenized, pos_tagged, constrain_pos)

    original_loss, _ = self.get_loss(original, question_dict, context)

    def _as_text(tokens):
        return MosesDetokenizer(lang='en').detokenize(tokens)

    forward_tokens, forward_loss, forward_predicted, forward_queries = self.search_qa(
        token_inflections, orig_tokenized, original_loss, question_dict,
        context, conservative)

    if conservative and metric_max_over_ground_truths(
            compute_f1, forward_predicted, question_dict['gold_texts']) == 0:
        return _as_text(forward_tokens), forward_queries + 1

    backward_tokens, backward_loss, __, backward_queries = self.search_qa(
        token_inflections, orig_tokenized, original_loss, question_dict,
        context, conservative, backward=True)

    total_queries = 1 + forward_queries + backward_queries
    winner = forward_tokens if forward_loss > backward_loss else backward_tokens
    return _as_text(winner), total_queries
def __call__(self, prediction: Union[str, List], ground_truths: List):
    """
    Score one prediction against its annotations and update the running totals.

    Parameters
    ----------
    prediction: ``Union[str, List]``
        The predicted answer from the model evaluated. This could be a string,
        or a list of string when multiple spans are predicted as answer.
    ground_truths: ``List``
        All the ground truth answer annotations.
    """
    # Only element [0] of each converted annotation is needed for scoring.
    # Element [1] (presumably the answer type) used to be collected into an
    # unused local, which converted every annotation a second time for nothing.
    ground_truth_answer_strings = [
        convert_annotation_to_string(annotation)[0]
        for annotation in ground_truths
    ]
    exact_match, f1_score = metric_max_over_ground_truths(
        drop_em_and_f1, prediction, ground_truth_answer_strings)
    self._total_em += exact_match
    self._total_f1 += f1_score
    self._count += 1
def local_search_qa(self, token_inflections, orig_tokenized, original_loss, question_dict, context):
    """Hill-climb over single-token inflection edits to maximise the model loss.

    Each round builds every token list that differs from the current one at a
    single position (swapping in another candidate inflection, or restoring
    the original token at an already-edited position), scores each with
    ``self.get_loss``, and looks at the highest-loss neighbour:

    * if its prediction has zero F1 against ``question_dict['gold_texts']``,
      it is adopted and the search stops;
    * else if its loss beats the current loss by more than ``EPSILON``
      (defined outside this function), it is adopted and the search continues;
    * otherwise the search stops.

    After the loop a single "supplement" candidate may be tried (see below).

    Parameters
    ----------
    token_inflections :
        Iterable of ``(position, candidates)`` pairs. It is iterated several
        times, so it must be re-iterable.
    orig_tokenized :
        Original token list; copied, never mutated here.
    original_loss :
        Loss of the unperturbed question; the starting value of ``max_loss``.
    question_dict, context :
        Passed through to ``self.get_loss``; ``question_dict['gold_texts']``
        is read for the F1 check.

    Returns
    -------
    ``(perturbed_tokenized, max_loss, max_predicted, num_queries)``.
    ``max_predicted`` stays ``''`` if no candidate was ever adopted.
    """
    perturbed_tokenized = orig_tokenized.copy()  # token list (list of str)

    max_loss = original_loss
    num_queries = 0
    max_predicted = ''

    detokenizer = MosesDetokenizer(lang='en')

    while True:
        # Parallel lists describing every neighbour evaluated in this round.
        new_tokenized_list = []
        # new_text_list = []
        new_loss_list = []
        new_predicted_list = []
        for position, candidates in token_inflections:  # list of pairs (position, candidates) candidates: list of token
            # add or swap
            for infl in candidates:
                # Skip the inflection that is already in place.
                if perturbed_tokenized[position] == infl:
                    continue
                # do replace
                new_tokenized = perturbed_tokenized.copy()
                new_tokenized[position] = infl
                # form text and eval
                new_text = detokenizer.detokenize(new_tokenized)
                new_loss, new_predicted = self.get_loss(
                    new_text, question_dict, context)
                num_queries += 1
                # record
                new_tokenized_list.append(new_tokenized)
                new_loss_list.append(new_loss)
                new_predicted_list.append(new_predicted)
            # remove
            # Only applies when this position currently differs from the original token.
            if perturbed_tokenized[position] != orig_tokenized[position]:
                # do replace
                new_tokenized = perturbed_tokenized.copy()
                new_tokenized[position] = orig_tokenized[position]
                # form text and eval
                new_text = detokenizer.detokenize(new_tokenized)
                new_loss, new_predicted = self.get_loss(
                    new_text, question_dict, context)
                num_queries += 1
                # record
                new_tokenized_list.append(new_tokenized)
                new_loss_list.append(new_loss)
                new_predicted_list.append(new_predicted)

        if len(new_loss_list) == 0:
            # no improve
            break

        # Index of the neighbour with the largest loss (last element of the ascending argsort).
        cur_max_idx = np.argsort(new_loss_list)[-1]
        cur_max_loss = new_loss_list[cur_max_idx]
        cur_max_predicted = new_predicted_list[cur_max_idx]
        # cur_max_text = new_text_list[cur_max_idx]
        cur_max_tokenized = new_tokenized_list[cur_max_idx]

        # check stop criteria
        # Zero F1 against the gold texts: adopt this neighbour and stop searching.
        if metric_max_over_ground_truths(compute_f1, cur_max_predicted,
                                         question_dict['gold_texts']) == 0:
            perturbed_tokenized = cur_max_tokenized
            max_loss = cur_max_loss
            max_predicted = cur_max_predicted
            break
        # Otherwise continue only while the loss improves by more than EPSILON.
        if cur_max_loss > max_loss + EPSILON:
            perturbed_tokenized = cur_max_tokenized
            max_loss = cur_max_loss
            max_predicted = cur_max_predicted
        else:
            break

    # =============== check supplement set ======================
    # form supplement set
    # Per position: the candidate inflections that differ from the token currently in place.
    supplement_inflections_by_position = {
        position: []
        for position, _ in token_inflections
    }
    for position, candidates in token_inflections:
        for infl in candidates:
            if perturbed_tokenized[position] != infl:
                supplement_inflections_by_position[position].append(infl)
    # The supplement is tried only if no position has more than one such
    # alternative and at least one position has exactly one.
    is_sup_valid = True
    valid_positions = []
    for position, _ in token_inflections:
        if len(supplement_inflections_by_position[position]) > 1:
            is_sup_valid = False
            break
        if len(supplement_inflections_by_position[position]) == 1:
            valid_positions.append(position)
    if len(valid_positions) == 0:
        is_sup_valid = False
    if is_sup_valid:
        print('check supplement')
        # Swap in the single alternative at every valid position at once.
        supplement_tokenized = perturbed_tokenized.copy()
        for position in valid_positions:
            supplement_tokenized[
                position] = supplement_inflections_by_position[position][0]
        # form text and eval
        supp_text = detokenizer.detokenize(supplement_tokenized)
        supp_loss, supp_predicted = self.get_loss(supp_text, question_dict,
                                                  context)
        num_queries += 1
        # Keep the supplement only if it strictly increases the loss (no EPSILON margin here).
        if supp_loss > max_loss:
            max_loss = supp_loss
            max_predicted = supp_predicted
            perturbed_tokenized = supplement_tokenized

    return perturbed_tokenized, max_loss, max_predicted, num_queries
def forward(
        self,  # type: ignore
        question: Dict[str, torch.LongTensor],
        passage: Dict[str, torch.LongTensor],
        span_start: torch.IntTensor = None,
        span_end: torch.IntTensor = None,
        metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    """Predict answer spans over passage chunks and score one answer per question.

    Instances in the batch are grouped by ``metadata[i]['question_id']``; for
    each group the chunk whose best span has the highest start+end logit
    provides the answer string.

    Parameters
    ----------
    question :
        Not read in this method.
    passage :
        Embedded with ``self._text_field_embedder``; ``passage['bert']`` must be
        2-D (it is unpacked into ``batch_size, num_of_passage_tokens``).
    span_start, span_end : optional
        Gold span indices. Entries ``< 0`` mark instances without a gold answer
        and are excluded from the loss.
    metadata :
        One dict per instance. Keys read here: ``question_id``,
        ``original_passage``, ``token_offsets``, ``answer_texts_list``.
        Despite the ``None`` default it is iterated unconditionally.

    Returns
    -------
    A dict with ``span_start_logits``, ``span_end_logits``, ``loss`` (plus
    ``loss_start`` / ``loss_end`` when a gold answer is available), and the
    lists ``best_span_str``, ``qid``, ``start_bias_weight``, ``end_bias_weight``.

    Side effects: updates ``self._official_f1`` / ``self._official_EM`` once per
    question and, when ``self._predictions_file`` is set, appends one JSON line
    per question to that file.
    """
    batch_size, num_of_passage_tokens = passage['bert'].size()

    # BERT for QA is a fully connected linear layer on top of BERT producing 2 vectors of
    # start and end spans.
    embedded_passage = self._text_field_embedder(passage)
    passage_length = embedded_passage.size(1)
    logits = self.qa_outputs(embedded_passage)
    start_logits, end_logits = logits.split(1, dim=-1)
    span_start_logits = start_logits.squeeze(-1)
    span_end_logits = end_logits.squeeze(-1)

    # Adding some masks with numerically stable values
    passage_mask = util.get_text_field_mask(passage).float()
    repeated_passage_mask = passage_mask.unsqueeze(1).repeat(1, 1, 1)
    repeated_passage_mask = repeated_passage_mask.view(
        batch_size, passage_length)
    # Masked (padding) positions get a large negative logit so they are never selected.
    span_start_logits = util.replace_masked_values(span_start_logits,
                                                   repeated_passage_mask,
                                                   -1e7)
    span_end_logits = util.replace_masked_values(span_end_logits,
                                                 repeated_passage_mask,
                                                 -1e7)

    # NOTE(review): this empty dict is immediately replaced by the assignment below.
    output_dict: Dict[str, Any] = {}

    # add span start and end logits for knowledge distillation
    output_dict: Dict[str, Any] = {
        "span_start_logits": span_start_logits,
        "span_end_logits": span_end_logits,
    }

    # We may have multiple instances per questions, moving to per-question
    intances_question_id = [
        insta_meta['question_id'] for insta_meta in metadata
    ]
    # NOTE(review): np.unique returns its counts in sorted-id order, so these split
    # points match the batch only if instances of a question are contiguous and the
    # questions appear in sorted id order — confirm against the batching code.
    question_instances_split_inds = np.cumsum(
        np.unique(intances_question_id, return_counts=True)[1])[:-1]
    per_question_inds = np.split(range(batch_size),
                                 question_instances_split_inds)
    metadata = np.split(metadata, question_instances_split_inds)

    # Compute the loss.
    # if span_start is not None and len(np.argwhere(span_start.squeeze().cpu() >= 0)) > 0:
    if span_start is not None and len(
            np.argwhere(
                span_start.squeeze(-1).squeeze(-1).cpu() >= 0)) > 0:
        # in evaluation some instances may not contain the gold answer, so we need to compute
        # loss only on those that do.
        inds_with_gold_answer = np.argwhere(
            span_start.view(-1).cpu().numpy() >= 0)
        inds_with_gold_answer = inds_with_gold_answer.squeeze(
        ) if len(inds_with_gold_answer) > 1 else inds_with_gold_answer
        if len(inds_with_gold_answer) > 0:
            loss = nll_loss(util.masked_log_softmax(span_start_logits[inds_with_gold_answer], \
                                                    repeated_passage_mask[inds_with_gold_answer]),\
                            span_start.view(-1)[inds_with_gold_answer], ignore_index=-1)
            output_dict["loss_start"] = loss
            # NOTE(review): if ``loss`` is a torch tensor, ``+=`` updates it in place, so
            # ``loss_start`` would alias the summed loss and ``loss_end`` would be 0 — confirm.
            loss += nll_loss(util.masked_log_softmax(span_end_logits[inds_with_gold_answer], \
                                                     repeated_passage_mask[inds_with_gold_answer]),\
                             span_end.view(-1)[inds_with_gold_answer], ignore_index=-1)
            output_dict["loss"] = loss
            output_dict["loss_end"] = loss - output_dict["loss_start"]

    # This is a hack for cases in which gold answer is not provided so we cannot compute loss...
    if 'loss' not in output_dict:
        output_dict["loss"] = torch.cuda.FloatTensor([0], device=span_end_logits.device) \
            if torch.cuda.is_available() else torch.FloatTensor([0])

    # Compute F1 and preparing the output dictionary.
    output_dict['best_span_str'] = []
    output_dict['qid'] = []
    output_dict["start_bias_weight"] = []
    output_dict["end_bias_weight"] = []

    # getting the best span prediction for every instance
    best_span = self._get_example_predications(span_start_logits,
                                               span_end_logits,
                                               self._max_span_length)
    best_span_cpu = best_span.detach().cpu().numpy()

    span_start_logits_numpy = span_start_logits.data.cpu().numpy()
    span_end_logits_numpy = span_end_logits.data.cpu().numpy()
    # Iterating over every question (which may contain multiple instances, one per chunk)
    for question_inds, question_instances_metadata in zip(
            per_question_inds, metadata):
        # Position, within this question's instances, of the chunk whose best span
        # has the highest start+end logit.
        best_span_ind = np.argmax(
            span_start_logits_numpy[question_inds,
                                    best_span_cpu[question_inds][:, 0]] +
            span_end_logits_numpy[question_inds,
                                  best_span_cpu[question_inds][:, 1]])
        best_span_logit = np.max(
            span_start_logits_numpy[question_inds,
                                    best_span_cpu[question_inds][:, 0]] +
            span_end_logits_numpy[question_inds,
                                  best_span_cpu[question_inds][:, 1]])

        passage_str = question_instances_metadata[best_span_ind][
            'original_passage']
        offsets = question_instances_metadata[best_span_ind][
            'token_offsets']

        predicted_span = best_span_cpu[question_inds[best_span_ind]]
        # Each offsets entry is indexed as (start, end); slice the passage text with them.
        start_offset = offsets[predicted_span[0]][0]
        end_offset = offsets[predicted_span[1]][1]
        best_span_string = passage_str[start_offset:end_offset]

        # Note: this is a hack, because AllenNLP, when predicting, expects a value for each instance.
        # But we may have more than 1 chunk per question, and thus less output strings than instances
        for i in range(len(question_inds)):
            output_dict['best_span_str'].append(best_span_string)
            output_dict['qid'].append(
                question_instances_metadata[best_span_ind]['question_id'])

            # get the scalar logit value of the predicted span start and end index as bias weight.
            # NOTE(review): ``best_span_ind`` is a position within this question's instances,
            # but here it indexes the whole batch (cf. ``question_inds[best_span_ind]`` above) —
            # confirm. Also assumed to run once per instance, inside this loop — confirm.
            output_dict["start_bias_weight"].append(
                util.masked_softmax(span_start_logits[best_span_ind],
                                    repeated_passage_mask[best_span_ind])[
                                        best_span_cpu[best_span_ind][0]])
            output_dict["end_bias_weight"].append(
                util.masked_softmax(span_end_logits[best_span_ind],
                                    repeated_passage_mask[best_span_ind])[
                                        best_span_cpu[best_span_ind][1]])

        # Scores stay 0.0 when no gold answer texts are available.
        f1_score = 0.0
        EM_score = 0.0
        gold_answer_texts = question_instances_metadata[best_span_ind][
            'answer_texts_list']
        if gold_answer_texts:
            f1_score = squad_eval.metric_max_over_ground_truths(
                squad_eval.f1_score, best_span_string, gold_answer_texts)
            EM_score = squad_eval.metric_max_over_ground_truths(
                squad_eval.exact_match_score, best_span_string,
                gold_answer_texts)
        self._official_f1(100 * f1_score)
        self._official_EM(100 * EM_score)

        # TODO move to predict
        if self._predictions_file is not None:
            # Opened in append mode: one JSON line per question.
            with open(self._predictions_file, 'a') as f:
                f.write(json.dumps({'question_id':question_instances_metadata[best_span_ind]['question_id'], \
                                    'best_span_logit':float(best_span_logit), \
                                    'f1':100 * f1_score, 'EM':100 * EM_score, 'best_span_string':best_span_string,\
                                    'gold_answer_texts':gold_answer_texts, \
                                    'qas_used_fraction':1.0}) + '\n')

    return output_dict
def forward(self,  # type: ignore
            question: Dict[str, torch.LongTensor],
            passage: Dict[str, torch.LongTensor],
            span_starts: torch.IntTensor = None,
            span_ends: torch.IntTensor = None,
            yesno_labels : torch.IntTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    """Run BERT span prediction (with optional yes/no head) and score each instance.

    Parameters
    ----------
    question :
        Not read in this method.
    passage :
        ``passage['bert']`` holds the word-piece ids (2-D; id 0 is treated as
        padding when building the attention mask) and ``passage['bert-offsets']``
        the offsets used to map token indices to word-piece indices.
    span_starts, span_ends :
        Gold span token indices; clamped in place to ``[0, passage_length]`` and
        then converted to word-piece indices.
    yesno_labels : optional
        Gold yes/no label indices in the ``"yesno_labels"`` vocabulary namespace.
    metadata :
        One dict per instance. Keys read here: ``original_passage``,
        ``token_offsets``, ``question_id``, ``cannot_answer``,
        ``answer_texts_list``.

    Returns
    -------
    A dict with ``loss`` and the per-instance lists ``best_span_str``,
    ``best_span_logit``, ``cannot_answer_logit``, ``yesno``, ``yesno_logit``,
    ``qid``, ``EM`` and ``f1``.

    Side effects: mutates ``span_starts`` / ``span_ends`` in place (``clamp_``)
    and updates ``self._official_f1`` / ``self._official_EM`` per instance.

    NOTE(review): ``span_starts`` / ``span_ends`` default to ``None`` but are
    used unconditionally (``clamp_``), so the later ``span_starts is not None``
    checks can never be false once execution gets that far — confirm whether
    prediction without gold spans is meant to be supported.
    """
    batch_size, num_of_passage_tokens = passage['bert'].size()

    # Executing the BERT model on the word piece ids (input_ids)
    input_ids = passage['bert']
    token_type_ids = torch.zeros_like(input_ids)
    mask = (input_ids != 0).long()
    embedded_chunk, pooled_output = \
        self._text_field_embedder.token_embedder_bert.bert_model(input_ids=util.combine_initial_dims(input_ids),
                                                                 token_type_ids=util.combine_initial_dims(token_type_ids),
                                                                 attention_mask=util.combine_initial_dims(mask),
                                                                 output_all_encoded_layers=False)

    # Just measuring some lengths and offsets to handle the conversion between tokens and word-pieces
    passage_length = embedded_chunk.size(1)
    # The argmin of the 0/1 mask locates a padding position; rows with no padding
    # (minimum == 1) get the full width instead.
    mask_min_values, wordpiece_passage_lens = torch.min(mask, dim=1)
    wordpiece_passage_lens[mask_min_values == 1] = mask.shape[1]
    offset_min_values, token_passage_lens = torch.min(passage['bert-offsets'], dim=1)
    token_passage_lens[offset_min_values != 0] = passage['bert-offsets'].shape[1]
    bert_offsets = passage['bert-offsets'].cpu().numpy()

    # BERT for QA is a fully connected linear layer on top of BERT producing 2 vectors of
    # start and end spans.
    logits = self.qa_outputs(embedded_chunk)
    start_logits, end_logits = logits.split(1, dim=-1)
    span_start_logits = start_logits.squeeze(-1)
    span_end_logits = end_logits.squeeze(-1)

    # all input is preprocessed before forward is run, counting the yesno vocabulary
    # will indicate if yesno support is at all needed.
    if self.vocab.get_vocab_size("yesno_labels") > 1:
        # Max over dim 1 of the embedded chunk feeds the yes/no classifier.
        yesno_logits = self.qa_yesno(torch.max(embedded_chunk, 1)[0])

    span_starts.clamp_(0, passage_length)
    span_ends.clamp_(0, passage_length)

    # moving to word piece indexes from token indexes of start and end span
    # (index 0 is kept as 0 rather than being mapped through the offsets)
    span_starts_list = [bert_offsets[i, span_starts[i]] if span_starts[i] != 0 else 0 for i in range(batch_size)]
    span_ends_list = [bert_offsets[i, span_ends[i]] if span_ends[i] != 0 else 0 for i in range(batch_size)]

    span_starts = torch.cuda.LongTensor(span_starts_list, device=span_end_logits.device) \
        if torch.cuda.is_available() else torch.LongTensor(span_starts_list)
    span_ends = torch.cuda.LongTensor(span_ends_list, device=span_end_logits.device) \
        if torch.cuda.is_available() else torch.LongTensor(span_ends_list)

    # Targets equal to passage_length are ignored by the loss.
    loss_fct = CrossEntropyLoss(ignore_index=passage_length)
    start_loss = loss_fct(start_logits.squeeze(-1), span_starts)
    end_loss = loss_fct(end_logits.squeeze(-1), span_ends)

    # The loss is the plain average of the component losses that are available.
    if self.vocab.get_vocab_size("yesno_labels") > 1 and yesno_labels is not None:
        yesno_loss = loss_fct(yesno_logits, yesno_labels)
        loss = (start_loss + end_loss + yesno_loss) / 3
    else:
        loss = (start_loss + end_loss) / 2

    output_dict: Dict[str, Any] = {}
    if loss == 0:
        # For evaluation purposes only!
        output_dict["loss"] = torch.cuda.FloatTensor([0], device=span_end_logits.device) \
            if torch.cuda.is_available() else torch.FloatTensor([0])
    else:
        output_dict["loss"] = loss

    # Compute F1 and preparing the output dictionary.
    output_dict['best_span_str'] = []
    output_dict['best_span_logit'] = []
    output_dict['cannot_answer_logit'] = []
    output_dict['yesno'] = []
    output_dict['yesno_logit'] = []
    output_dict['qid'] = []
    if span_starts is not None:
        output_dict['EM'] = []
        output_dict['f1'] = []

    # getting the best span prediction for every instance
    best_span = self._get_example_predications(span_start_logits, span_end_logits, self._max_span_length)
    best_span_cpu = best_span.detach().cpu().numpy()

    for instance_ind, instance_metadata in zip(range(batch_size), metadata):
        best_span_logit = span_start_logits.data.cpu().numpy()[instance_ind, best_span_cpu[instance_ind][0]] + \
                          span_end_logits.data.cpu().numpy()[instance_ind, best_span_cpu[instance_ind][1]]
        # Start and end logits at position 0 together act as the "cannot answer" score.
        cannot_answer_logit = span_start_logits.data.cpu().numpy()[instance_ind, 0] + \
                              span_end_logits.data.cpu().numpy()[instance_ind, 0]

        if self.vocab.get_vocab_size("yesno_labels") > 1:
            yesno_maxind = np.argmax(yesno_logits[instance_ind].data.cpu().numpy())
            yesno_logit = yesno_logits[instance_ind, yesno_maxind].data.cpu().numpy()
            yesno_pred = self.vocab.get_token_from_index(yesno_maxind, namespace="yesno_labels")
        else:
            # No yes/no head in use: placeholder prediction and logit.
            yesno_pred = 'no_yesno'
            yesno_logit = -30.0

        passage_str = instance_metadata['original_passage']
        offsets = instance_metadata['token_offsets']
        predicted_span = best_span_cpu[instance_ind]

        # In this version yesno if not "no_yesno" will be regarded as final answer before the spans are considered.
        if yesno_pred != 'no_yesno':
            best_span_string = yesno_pred
        else:
            # 'cannot_answer' wins unless the best span beats it by at least the fixed 0.9 margin.
            if cannot_answer_logit + 0.9 > best_span_logit :
                best_span_string = 'cannot_answer'
            else:
                wordpiece_offsets = self.bert_offsets_to_wordpiece_offsets(bert_offsets[instance_ind][0:len(offsets)])
                # Predicted indices past the end of wordpiece_offsets fall back to its last entry.
                start_offset = offsets[wordpiece_offsets[predicted_span[0] if predicted_span[0] < len(wordpiece_offsets) \
                    else len(wordpiece_offsets)-1]][0]
                end_offset = offsets[wordpiece_offsets[predicted_span[1] if predicted_span[1] < len(wordpiece_offsets) \
                    else len(wordpiece_offsets)-1]][1]
                best_span_string = passage_str[start_offset:end_offset]

        output_dict['best_span_str'].append(best_span_string)
        output_dict['cannot_answer_logit'].append(cannot_answer_logit)
        output_dict['best_span_logit'].append(best_span_logit)
        output_dict['yesno'].append(yesno_pred)
        output_dict['yesno_logit'].append(yesno_logit)
        output_dict['qid'].append(instance_metadata['question_id'])

        # In AllenNLP prediction mode we have no gold answers, so let's check
        if span_starts is not None:
            # NOTE(review): ``yesno_labels`` defaults to None but is dereferenced here — confirm
            # it is always supplied together with the gold spans.
            yesno_label_ind = yesno_labels.data.cpu().numpy()[instance_ind]
            yesno_label = self.vocab.get_token_from_index(yesno_label_ind, namespace="yesno_labels")

            # Gold answer priority: yes/no label, then 'cannot_answer', then the answer texts.
            if yesno_label != 'no_yesno':
                gold_answer_texts = [yesno_label]
            elif instance_metadata['cannot_answer']:
                gold_answer_texts = ['cannot_answer']
            else:
                gold_answer_texts = instance_metadata['answer_texts_list']

            f1_score = squad_eval.metric_max_over_ground_truths(squad_eval.f1_score, best_span_string, gold_answer_texts)
            EM_score = squad_eval.metric_max_over_ground_truths(squad_eval.exact_match_score, best_span_string, gold_answer_texts)
            self._official_f1(100 * f1_score)
            self._official_EM(100 * EM_score)
            output_dict['EM'].append(100 * EM_score)
            output_dict['f1'].append(100 * f1_score)

    return output_dict
def predict(args):
    """Run a trained model over a gzipped JSON-lines dataset and save predictions and scores.

    Reads from ``args``: ``model``, ``cuda_device``, ``dataset``,
    ``sample_size`` (``-1`` disables sampling), ``dataset_name`` and
    ``prediction_filepath``.

    Steps:
      1. Load the archive and build the ``'multiqa_predictor'`` predictor.
      2. Read contexts from the dataset, skipping lines that contain a
         ``'header'`` key, until at least ``sample_size`` questions are read.
      3. Predict every context, collect the gold answers per question id and
         compute per-question EM / F1; print their averages.
      4. Run ``evaluate`` on the same answers and predictions, print its
         metrics, and write predictions, full predictions and those metrics
         to ``<output_filepath>_predictions.json``,
         ``<output_filepath>_fullpredictions.json`` and
         ``<output_filepath>_eval_results.json``.

    Raises ``ZeroDivisionError`` if no question was scored.
    """
    file_path = cached_path(args.model)
    archive = load_archive(file_path, cuda_device=args.cuda_device)
    predictor = Predictor.from_archive(archive, 'multiqa_predictor')

    all_predictions = {}
    all_full_predictions = []
    contexts = []
    num_questions = 0
    single_file_path_cached = cached_path(args.dataset)
    with gzip.open(single_file_path_cached, 'rb') as myzip:
        for example in myzip:
            context = json.loads(example)
            if 'header' in context:
                continue
            contexts.append(context)
            if args.sample_size != -1:
                # Running total instead of re-summing every context on each line.
                num_questions += len(context['qas'])
                if num_questions >= args.sample_size:
                    break

    # predict
    answers = {}
    all_scores = {}
    for context in Tqdm.tqdm(contexts, total=len(contexts)):
        curr_pred, full_predictions = predictor.predict_json(context)
        all_predictions.update(curr_pred)
        all_full_predictions += full_predictions

        # saving official answers for this context
        for qa in context['qas']:
            # The part of the qid after '_q_' is used as the lookup key.
            qid = qa['qid'].split('_q_')[1]
            gold = answers.setdefault(qid, [])
            open_ended = qa['answers']['open-ended']
            if 'annotators_answer_candidates' in open_ended:
                for ans_cand in open_ended['annotators_answer_candidates']:
                    single_answer = ans_cand.get('single_answer') \
                        if 'single_answer' in ans_cand else None
                    if single_answer is not None and 'extractive' in single_answer:
                        extractive = single_answer['extractive']
                        gold.append(extractive['answer'])
                        if 'aliases' in extractive:
                            # Fixed: aliases were read from
                            # ans_cand['extractive']['single_answer'], which does
                            # not match the schema checked above (KeyError).
                            gold += extractive['aliases']
                    elif single_answer is not None and 'yesno' in single_answer:
                        gold.append(single_answer['yesno'])
            elif 'cannot_answer' in open_ended:
                gold.append('cannot_answer')

            f1_score = squad_eval.metric_max_over_ground_truths(
                squad_eval.f1_score, all_predictions[qid], answers[qid])
            EM_score = squad_eval.metric_max_over_ground_truths(
                squad_eval.exact_match_score, all_predictions[qid], answers[qid])
            all_scores[qid] = {'EM': EM_score * 100, 'f1': f1_score * 100}

    metrics = {}
    metrics['EM'] = sum(scores['EM'] for scores in all_scores.values()) / \
        len(all_scores)
    metrics['f1'] = sum(scores['f1'] for scores in all_scores.values()) / \
        len(all_scores)
    print(json.dumps(metrics))

    # running the official evaluation script:
    # (these metrics replace the ones above and are the ones stored on disk)
    metrics = evaluate(answers, all_predictions, True)
    print(json.dumps(metrics))

    # automatic filename generation / or manual
    if args.prediction_filepath is None:
        results_dir = 'results/' + args.dataset_name
        os.makedirs(results_dir, exist_ok=True)
        model_tag = '_'.join(args.model.split('/')[-2:]).split('.')[0]
        dataset_tag = args.dataset.split('/')[-1].split('.')[0]
        output_filepath = results_dir + '/' + model_tag + '__on__' + dataset_tag
    else:
        # NOTE(review): the condition tests ``args.prediction_filepath`` but this
        # branch reads ``args.output_filepath`` — confirm which attribute is intended.
        output_filepath = args.output_filepath

    # formatting the predictions in the specific dataset format in order to run the official eval_script
    factory = MultiQAFactory()
    all_predictions = factory.format_predictions(args.dataset_name, all_predictions)

    # saving predictions
    with open(output_filepath + '_predictions.json', 'w') as f:
        json.dump(all_predictions, f)

    with open(output_filepath + '_fullpredictions.json', 'w') as f:
        json.dump(all_full_predictions, f)

    # storing results
    with open(output_filepath + '_eval_results.json', 'w') as f:
        json.dump(metrics, f)
'extractive']: answers[qid] += [ (ans_cand['extractive']['single_answer']['answer']) ] if 'aliases' in ans_cand['extractive'][ 'single_answer']: answers[qid] += ans_cand['extractive'][ 'single_answer']['aliases'] elif 'yesno' in ans_cand and 'single_answer' in ans_cand[ 'yesno']: answers[qid] += [(ans_cand['yesno']['single_answer'])] elif 'cannot_answer' in qa['answers']['open-ended']: answers[qid] += ['cannot_answer'] f1_score = squad_eval.metric_max_over_ground_truths( squad_eval.f1_score, all_predictions[qid], answers[qid]) EM_score = squad_eval.metric_max_over_ground_truths( squad_eval.exact_match_score, all_predictions[qid], answers[qid]) all_scores[qid] = {'EM': EM_score * 100, 'f1': f1_score * 100} metrics = {} metrics['EM'] = sum([all_scores[q]['EM'] for q in all_scores.keys()]) / \ len(all_scores.keys()) metrics['f1'] = sum([all_scores[q]['f1'] for q in all_scores.keys()]) / \ len(all_scores.keys()) print(json.dumps(metrics)) # running the official evaluation script: metrics = evaluate(answers, all_predictions, True) print(json.dumps(metrics))
def forward(
        self,  # type: ignore
        question: Dict[str, torch.LongTensor],
        passage: Dict[str, torch.LongTensor],
        span_start: torch.IntTensor = None,
        span_end: torch.IntTensor = None,
        p1_answer_marker: torch.IntTensor = None,
        p2_answer_marker: torch.IntTensor = None,
        p3_answer_marker: torch.IntTensor = None,
        yesno_list: torch.IntTensor = None,
        followup_list: torch.IntTensor = None,
        metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Predict answer spans plus yes/no and follow-up labels for every question of every dialog.

    The questions of the whole batch are flattened into
    ``total_qa_count = batch_size * max_qa_count`` rows and the passage is repeated once per
    question, so every intermediate tensor below has ``total_qa_count`` as its first dimension.

    Parameters
    ----------
    question : Dict[str, torch.LongTensor]
        From a ``TextField``. ``question['token_characters']`` must be 4-dimensional; its first
        three sizes are read as ``(batch_size, max_qa_count, max_q_len)``.
    passage : Dict[str, torch.LongTensor]
        From a ``TextField``. The model assumes that this passage contains the answer to the
        question, and predicts the beginning and ending positions of the answer within the passage.
    span_start : ``torch.IntTensor``, optional
        From an ``IndexField``. This is one of the things we are trying to predict - the
        beginning position of the answer with the passage. This is an `inclusive` token index.
        If this is given, we will compute a loss that gets included in the output dictionary.
        Entries equal to ``-1`` are ignored by the loss.
    span_end : ``torch.IntTensor``, optional
        From an ``IndexField``. This is one of the things we are trying to predict - the
        ending position of the answer with the passage. This is an `inclusive` token index.
        Only read when ``span_start`` is given. Entries equal to ``-1`` are ignored by the loss.
    p1_answer_marker : ``torch.IntTensor``, optional
        This is one of the inputs, but only when num_context_answers > 0.
        This is a tensor that has a shape [batch_size, max_qa_count, max_passage_length].
        Most passage token will have assigned 'O', except the passage tokens belongs to the
        previous answer in the dialog, which will be assigned labels such as
        <1_start>, <1_in>, <1_end>.
        For more details, look into dataset_readers/util/make_reading_comprehension_instance_quac
    p2_answer_marker : ``torch.IntTensor``, optional
        This is one of the inputs, but only when num_context_answers > 1.
        It is similar to p1_answer_marker, but marking previous previous answer in passage.
    p3_answer_marker : ``torch.IntTensor``, optional
        This is one of the inputs, but only when num_context_answers > 2.
        It is similar to p1_answer_marker, but marking previous previous previous answer in passage.
    yesno_list : ``torch.IntTensor``, optional
        This is one of the outputs that we are trying to predict.
        Three way classification (the yes/no/not a yes no question).
        Only read when ``span_start`` is given.
    followup_list : ``torch.IntTensor``, optional
        This is one of the outputs that we are trying to predict.
        Three way classification (followup / maybe followup / don't followup).
        Despite the ``None`` default it is read unconditionally: questions whose entry is
        ``>= 0`` form the mask passed to the accuracy metrics.
    metadata : ``List[Dict[str, Any]]``, optional
        One dictionary per dialog in the batch. Despite the ``None`` default it is read
        unconditionally, and each dictionary must provide ``original_passage``,
        ``token_offsets``, ``instance_id`` (one id per question) and ``answer_texts_list``
        (the reference answers of each question; an empty entry skips the F1 computation
        for that question).

    Returns
    -------
    An output dictionary consisting of the followings.
    Each of the followings is a nested list because first iterates over dialog, then questions in dialog.

    qid : List[List[str]]
        A list of list, consisting of question ids.
    followup : List[List[int]]
        A list of list, consisting of continuation marker prediction index.
        (y :yes, m: maybe follow up, n: don't follow up)
    yesno : List[List[int]]
        A list of list, consisting of affirmation marker prediction index.
        (y :yes, x: not a yes/no question, n: no)
    best_span_str : List[List[str]]
        The substring of the original passage that the model thinks is the best answer
        to each question.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised. Only present when ``span_start`` is given.
    """
    batch_size, max_qa_count, max_q_len, _ = question['token_characters'].size()
    total_qa_count = batch_size * max_qa_count
    qa_mask = torch.ge(followup_list, 0).view(total_qa_count)

    embedded_question = self._text_field_embedder(question, num_wrapping_dims=1)
    embedded_question = embedded_question.reshape(
        total_qa_count, max_q_len, self._text_field_embedder.get_output_dim())
    embedded_question = self._variational_dropout(embedded_question)
    embedded_passage = self._variational_dropout(self._text_field_embedder(passage))
    passage_length = embedded_passage.size(1)

    question_mask = util.get_text_field_mask(question, num_wrapping_dims=1).float()
    question_mask = question_mask.reshape(total_qa_count, max_q_len)
    passage_mask = util.get_text_field_mask(passage).float()

    # One copy of the passage mask per question of the dialog.
    repeated_passage_mask = passage_mask.unsqueeze(1).repeat(1, max_qa_count, 1)
    repeated_passage_mask = repeated_passage_mask.view(total_qa_count, passage_length)

    if self._num_context_answers > 0:
        # Encode question turn number inside the dialog into question embedding.
        question_num_ind = util.get_range_vector(
            max_qa_count, util.get_device_of(embedded_question))
        question_num_ind = question_num_ind.unsqueeze(-1).repeat(1, max_q_len)
        question_num_ind = question_num_ind.unsqueeze(0).repeat(batch_size, 1, 1)
        question_num_ind = question_num_ind.reshape(total_qa_count, max_q_len)
        question_num_marker_emb = self._question_num_marker(question_num_ind)
        embedded_question = torch.cat([embedded_question, question_num_marker_emb], dim=-1)

        # Encode the previous answers in passage embedding.
        # batch_size * max_qa_count, passage_length, word_embed_dim
        repeated_embedded_passage = embedded_passage.unsqueeze(1).repeat(1, max_qa_count, 1, 1).view(
            total_qa_count, passage_length, self._text_field_embedder.get_output_dim())
        p1_answer_marker = p1_answer_marker.view(total_qa_count, passage_length)
        p1_answer_marker_emb = self._prev_ans_marker(p1_answer_marker)
        repeated_embedded_passage = torch.cat(
            [repeated_embedded_passage, p1_answer_marker_emb], dim=-1)
        if self._num_context_answers > 1:
            p2_answer_marker = p2_answer_marker.view(total_qa_count, passage_length)
            p2_answer_marker_emb = self._prev_ans_marker(p2_answer_marker)
            repeated_embedded_passage = torch.cat(
                [repeated_embedded_passage, p2_answer_marker_emb], dim=-1)
            if self._num_context_answers > 2:
                p3_answer_marker = p3_answer_marker.view(total_qa_count, passage_length)
                p3_answer_marker_emb = self._prev_ans_marker(p3_answer_marker)
                repeated_embedded_passage = torch.cat(
                    [repeated_embedded_passage, p3_answer_marker_emb], dim=-1)

        repeated_encoded_passage = self._variational_dropout(
            self._phrase_layer(repeated_embedded_passage, repeated_passage_mask))
    else:
        # Without answer markers the passage is identical for every question, so it is
        # encoded once and then repeated.
        encoded_passage = self._variational_dropout(
            self._phrase_layer(embedded_passage, passage_mask))
        repeated_encoded_passage = encoded_passage.unsqueeze(1).repeat(1, max_qa_count, 1, 1)
        repeated_encoded_passage = repeated_encoded_passage.view(
            total_qa_count, passage_length, self._encoding_dim)

    encoded_question = self._variational_dropout(
        self._phrase_layer(embedded_question, question_mask))

    # Shape: (batch_size * max_qa_count, passage_length, question_length)
    passage_question_similarity = self._matrix_attention(
        repeated_encoded_passage, encoded_question)
    # Shape: (batch_size * max_qa_count, passage_length, question_length)
    passage_question_attention = util.masked_softmax(
        passage_question_similarity, question_mask)
    # Shape: (batch_size * max_qa_count, passage_length, encoding_dim)
    passage_question_vectors = util.weighted_sum(encoded_question, passage_question_attention)

    # We replace masked values with something really negative here, so they don't affect the
    # max below.
    masked_similarity = util.replace_masked_values(
        passage_question_similarity, question_mask.unsqueeze(1), -1e7)

    question_passage_similarity = masked_similarity.max(dim=-1)[0].squeeze(-1)
    question_passage_attention = util.masked_softmax(
        question_passage_similarity, repeated_passage_mask)
    # Shape: (batch_size * max_qa_count, encoding_dim)
    question_passage_vector = util.weighted_sum(
        repeated_encoded_passage, question_passage_attention)
    tiled_question_passage_vector = question_passage_vector.unsqueeze(1).expand(
        total_qa_count, passage_length, self._encoding_dim)

    # Shape: (batch_size * max_qa_count, passage_length, encoding_dim * 4)
    final_merged_passage = torch.cat([
        repeated_encoded_passage,
        passage_question_vectors,
        repeated_encoded_passage * passage_question_vectors,
        repeated_encoded_passage * tiled_question_passage_vector,
    ], dim=-1)
    final_merged_passage = F.relu(self._merge_atten(final_merged_passage))

    residual_layer = self._variational_dropout(
        self._residual_encoder(final_merged_passage, repeated_passage_mask))
    self_attention_matrix = self._self_attention(residual_layer, residual_layer)

    # Pairwise validity mask with the diagonal zeroed, so that no passage token attends
    # to itself.
    mask = repeated_passage_mask.reshape(total_qa_count, passage_length, 1) \
        * repeated_passage_mask.reshape(total_qa_count, 1, passage_length)
    self_mask = torch.eye(passage_length, passage_length, device=self_attention_matrix.device)
    self_mask = self_mask.reshape(1, passage_length, passage_length)
    mask = mask * (1 - self_mask)

    self_attention_probs = util.masked_softmax(self_attention_matrix, mask)

    # (batch, passage_len, passage_len) * (batch, passage_len, dim) -> (batch, passage_len, dim)
    self_attention_vecs = torch.matmul(self_attention_probs, residual_layer)
    self_attention_vecs = torch.cat([
        self_attention_vecs,
        residual_layer,
        residual_layer * self_attention_vecs,
    ], dim=-1)
    residual_layer = F.relu(self._merge_self_attention(self_attention_vecs))

    final_merged_passage = final_merged_passage + residual_layer
    # batch_size * maxqa_pair_len * max_passage_len * 200
    final_merged_passage = self._variational_dropout(final_merged_passage)
    start_rep = self._span_start_encoder(final_merged_passage, repeated_passage_mask)
    span_start_logits = self._span_start_predictor(start_rep).squeeze(-1)

    # The end encoder also sees the start representation.
    end_rep = self._span_end_encoder(
        torch.cat([final_merged_passage, start_rep], dim=-1), repeated_passage_mask)
    span_end_logits = self._span_end_predictor(end_rep).squeeze(-1)

    span_yesno_logits = self._span_yesno_predictor(end_rep).squeeze(-1)
    span_followup_logits = self._span_followup_predictor(end_rep).squeeze(-1)

    span_start_logits = util.replace_masked_values(
        span_start_logits, repeated_passage_mask, -1e7)
    # batch_size * maxqa_len_pair, max_document_len
    span_end_logits = util.replace_masked_values(
        span_end_logits, repeated_passage_mask, -1e7)

    best_span = self._get_best_span_yesno_followup(span_start_logits,
                                                   span_end_logits,
                                                   span_yesno_logits,
                                                   span_followup_logits,
                                                   self._max_span_length)

    output_dict: Dict[str, Any] = {}

    # Compute the loss.
    if span_start is not None:
        loss = nll_loss(util.masked_log_softmax(span_start_logits, repeated_passage_mask),
                        span_start.view(-1), ignore_index=-1)
        self._span_start_accuracy(span_start_logits, span_start.view(-1), mask=qa_mask)
        loss += nll_loss(util.masked_log_softmax(span_end_logits, repeated_passage_mask),
                         span_end.view(-1), ignore_index=-1)
        self._span_end_accuracy(span_end_logits, span_end.view(-1), mask=qa_mask)
        self._span_accuracy(best_span[:, 0:2],
                            torch.stack([span_start, span_end], -1).view(total_qa_count, 2),
                            mask=qa_mask.unsqueeze(1).expand(-1, 2).long())

        # The yes/no and follow-up logits are read through a flat view in which every
        # passage token of every question owns three consecutive entries. For question
        # row ``i`` and end token ``e`` those entries are ``(i * passage_length + e) * 3 + k``
        # with ``k`` in 0..2; negative positions (from the ``-1`` padding label) are
        # clamped to 0 and the corresponding targets are ignored by ``ignore_index``.
        # The indices are built with tensor ops rather than via a squeezed numpy array,
        # which would be 0-d (and not indexable) when the batch holds a single question.
        index_device = span_start.device
        row_base = torch.arange(total_qa_count, device=index_device) * (passage_length * 3)
        class_offsets = torch.arange(3, device=index_device)

        gold_end = span_end.view(total_qa_count).long()
        gold_span_end_loc = (
            (gold_end * 3 + row_base).unsqueeze(-1) + class_offsets).clamp(min=0).view(-1)

        pred_end = best_span[:, 1].long().to(index_device)
        predicted_end = (
            (pred_end * 3 + row_base).unsqueeze(-1) + class_offsets).clamp(min=0).view(-1)

        # The loss uses the logits at the gold end position ...
        _yesno = span_yesno_logits.view(-1).index_select(0, gold_span_end_loc).view(-1, 3)
        _followup = span_followup_logits.view(-1).index_select(0, gold_span_end_loc).view(-1, 3)
        loss += nll_loss(F.log_softmax(_yesno, dim=-1), yesno_list.view(-1), ignore_index=-1)
        loss += nll_loss(F.log_softmax(_followup, dim=-1), followup_list.view(-1), ignore_index=-1)

        # ... while the accuracy metrics use the logits at the predicted end position.
        _yesno = span_yesno_logits.view(-1).index_select(0, predicted_end).view(-1, 3)
        _followup = span_followup_logits.view(-1).index_select(0, predicted_end).view(-1, 3)
        self._span_yesno_accuracy(_yesno, yesno_list.view(-1), mask=qa_mask)
        self._span_followup_accuracy(_followup, followup_list.view(-1), mask=qa_mask)
        output_dict["loss"] = loss

    # Compute F1 and preparing the output dictionary.
    output_dict['best_span_str'] = []
    output_dict['qid'] = []
    output_dict['followup'] = []
    output_dict['yesno'] = []
    best_span_cpu = best_span.detach().cpu().numpy()
    for i in range(batch_size):
        passage_str = metadata[i]['original_passage']
        offsets = metadata[i]['token_offsets']
        f1_score = 0.0
        per_dialog_best_span_list = []
        per_dialog_yesno_list = []
        per_dialog_followup_list = []
        per_dialog_query_id_list = []
        for per_dialog_query_index, (iid, answer_texts) in enumerate(
                zip(metadata[i]["instance_id"], metadata[i]["answer_texts_list"])):
            # Each row of best_span is (start token, end token, yes/no index, follow-up index).
            predicted_span = tuple(best_span_cpu[i * max_qa_count + per_dialog_query_index])

            # Map the inclusive token span back to character offsets in the raw passage.
            start_offset = offsets[predicted_span[0]][0]
            end_offset = offsets[predicted_span[1]][1]

            yesno_pred = predicted_span[2]
            followup_pred = predicted_span[3]
            per_dialog_yesno_list.append(yesno_pred)
            per_dialog_followup_list.append(followup_pred)
            per_dialog_query_id_list.append(iid)

            best_span_string = passage_str[start_offset:end_offset]
            per_dialog_best_span_list.append(best_span_string)
            if answer_texts:
                if len(answer_texts) > 1:
                    t_f1 = []
                    # Compute F1 over N-1 human references and averages the scores.
                    for answer_index in range(len(answer_texts)):
                        refs = [ref for ref_index, ref in enumerate(answer_texts)
                                if ref_index != answer_index]
                        t_f1.append(squad_eval.metric_max_over_ground_truths(
                            squad_eval.f1_score, best_span_string, refs))
                    f1_score = 1.0 * sum(t_f1) / len(t_f1)
                else:
                    f1_score = squad_eval.metric_max_over_ground_truths(
                        squad_eval.f1_score, best_span_string, answer_texts)
            self._official_f1(100 * f1_score)
        output_dict['qid'].append(per_dialog_query_id_list)
        output_dict['best_span_str'].append(per_dialog_best_span_list)
        output_dict['yesno'].append(per_dialog_yesno_list)
        output_dict['followup'].append(per_dialog_followup_list)
    return output_dict
def forward(
        self,
        question: Dict[str, torch.LongTensor],
        passage: Dict[str, torch.LongTensor],
        span_start: torch.IntTensor = None,
        span_end: torch.IntTensor = None,
        yesno_list: torch.IntTensor = None,
        metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    """
    Predict an answer span and a yes/no label for every question of every dialog.

    The questions of the whole batch are flattened into
    ``total_qa_count = batch_size * max_qa_count`` rows (abbreviated ``cnt`` in the shape
    comments below, with ``m`` the question length, ``n`` the passage length and ``h`` the
    hidden size) and the encoded passage is repeated once per question.

    Parameters
    ----------
    question : Dict[str, torch.LongTensor]
        ``question['token_characters']`` must be 4-dimensional; its first three sizes are
        read as ``(batch_size, max_qa_count, max_q_len)``.
    passage : Dict[str, torch.LongTensor]
        The passage shared by all questions of a dialog.
    span_start : ``torch.IntTensor``, optional
        Gold start token index per question. If given, a loss is computed and returned.
        Entries equal to ``-1`` are ignored by the loss.
    span_end : ``torch.IntTensor``, optional
        Gold end token index per question. Only read when ``span_start`` is given.
        Entries equal to ``-1`` are ignored by the loss.
    yesno_list : ``torch.IntTensor``, optional
        Gold yes/no label per question. Despite the ``None`` default it is read
        unconditionally: questions whose entry is ``>= 0`` form the mask passed to the
        accuracy metrics.
    metadata : ``List[Dict[str, Any]]``, optional
        One dictionary per dialog in the batch. Despite the ``None`` default it is read
        unconditionally, and each dictionary must provide ``original_passage``,
        ``token_offsets``, ``instance_id`` (one id per question) and ``answer_texts_list``
        (the reference answers of each question; an empty entry skips the F1 computation
        for that question).

    Returns
    -------
    An output dictionary whose list entries are nested: first by dialog, then by question.

    qid : List[List[str]]
        The question ids taken from ``instance_id``.
    yesno : List[List[int]]
        The predicted yes/no index of each question.
    best_span_str : List[List[str]]
        The substring of the original passage selected as the answer to each question.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised. Only present when ``span_start`` is given.
    """
    batch_size, max_qa_count, max_q_len, _ = question['token_characters'].size()
    total_qa_count = batch_size * max_qa_count
    qa_mask = torch.ge(yesno_list, 0).view(total_qa_count)

    embedded_question = self._text_field_embedder(question, num_wrapping_dims=1)
    # total_qa_count * max_q_len * encoding_dim
    embedded_question = embedded_question.reshape(
        total_qa_count, max_q_len, self._text_field_embedder.get_output_dim())
    embedded_passage = self._text_field_embedder(passage)

    # split the embedded tensors to get the word embedding and char embedding, elmo embedding
    # and features embedding
    # NOTE(review): the 200 / 1024 / 40 widths are hard-coded and must add up to the
    # embedder's output dimension - confirm against the embedder configuration.
    # The question-side feature slice is not used below.
    word_emb_ques, elmo_ques, _ = torch.split(embedded_question, [200, 1024, 40], dim=2)
    word_emb_pass, elmo_pass, pass_feat = torch.split(embedded_passage, [200, 1024, 40], dim=2)

    # word embedding and char embedding
    embedded_question = self._variational_dropout(
        torch.cat([word_emb_ques, elmo_ques], dim=2))
    embedded_passage = self._variational_dropout(
        torch.cat([word_emb_pass, elmo_pass], dim=2))
    passage_length = embedded_passage.size(1)

    question_mask = util.get_text_field_mask(question, num_wrapping_dims=1).float()
    question_mask = question_mask.reshape(total_qa_count, max_q_len)
    passage_mask = util.get_text_field_mask(passage).float()

    # One copy of the passage mask per question of the dialog.
    repeated_passage_mask = passage_mask.unsqueeze(1).repeat(1, max_qa_count, 1)
    repeated_passage_mask = repeated_passage_mask.view(total_qa_count, passage_length)

    # The same phrase layer and projection are applied to passage and question; the ELMo
    # slice is concatenated again onto the encoder output before projecting.
    encode_passage = self._phrase_layer(embedded_passage, passage_mask)
    projected_passage = self.relu(
        self.projected_layer(torch.cat([encode_passage, elmo_pass], dim=2)))

    encode_question = self._phrase_layer(embedded_question, question_mask)
    projected_question = self.relu(
        self.projected_layer(torch.cat([encode_question, elmo_ques], dim=2)))

    encoded_passage = self._variational_dropout(projected_passage)
    repeated_encoded_passage = encoded_passage.unsqueeze(1).repeat(1, max_qa_count, 1, 1)
    repeated_encoded_passage = repeated_encoded_passage.view(
        total_qa_count, passage_length, self._encoding_dim)
    repeated_pass_feat = (pass_feat.unsqueeze(1).repeat(1, max_qa_count, 1, 1)).view(
        total_qa_count, passage_length, 40)
    encoded_question = self._variational_dropout(projected_question)

    # total_qa_count * max_q_len * passage_length
    # cnt * m * n
    s = torch.bmm(encoded_question, repeated_encoded_passage.transpose(2, 1))
    # Normalise over question tokens (dim=1) to align the question to each passage token.
    alpha = util.masked_softmax(s, question_mask.unsqueeze(2).expand(s.size()), dim=1)
    # cnt * n * h
    aligned_p = torch.bmm(alpha.transpose(2, 1), encoded_question)
    # Normalise over passage tokens (dim=2) to align the passage to each question token.
    # cnt * m * n
    beta = util.masked_softmax(s, repeated_passage_mask.unsqueeze(1).expand(s.size()), dim=2)
    # cnt * m * h
    aligned_q = torch.bmm(beta, repeated_encoded_passage)

    fused_p = self.fuse_p(repeated_encoded_passage, aligned_p)
    fused_q = self.fuse_q(encoded_question, aligned_q)

    # add manual features here
    q_aware_p = self.projected_lstm(
        torch.cat([fused_p, repeated_pass_feat], dim=2), repeated_passage_mask)

    # cnt * n * n
    self_p = self._self_attention(q_aware_p, q_aware_p)

    # Pairwise validity mask with the diagonal zeroed, so that no passage token attends
    # to itself.
    mask = repeated_passage_mask.reshape(total_qa_count, passage_length, 1) \
        * repeated_passage_mask.reshape(total_qa_count, 1, passage_length)
    self_mask = torch.eye(passage_length, passage_length, device=self_p.device)
    self_mask = self_mask.reshape(1, passage_length, passage_length)
    mask = mask * (1 - self_mask)

    lamb = util.masked_softmax(self_p, mask, dim=2)
    # cnt * n * h
    self_aligned_p = torch.bmm(lamb, q_aware_p)

    # cnt * n * h
    fused_self_p = self.fuse_s(q_aware_p, self_aligned_p)

    contextual_p = self.contextual_layer_p(fused_self_p, repeated_passage_mask)
    contextual_q = self.contextual_layer_q(fused_q, question_mask)

    # cnt * m
    gamma = util.masked_softmax(
        self.linear_self_align(contextual_q).squeeze(2), question_mask, dim=1)
    # cnt * h
    weighted_q = torch.bmm(gamma.unsqueeze(1), contextual_q).squeeze(1)

    span_start_logits = self.bilinear_layer_s(weighted_q, contextual_p)
    span_end_logits = self.bilinear_layer_e(weighted_q, contextual_p)

    # cnt * n * 1    cnt * 1 * h
    span_yesno_logits = self.yesno_predictor(
        torch.bmm(span_end_logits.unsqueeze(2), weighted_q.unsqueeze(1)))

    span_start_logits = util.replace_masked_values(
        span_start_logits, repeated_passage_mask, -1e7)
    span_end_logits = util.replace_masked_values(
        span_end_logits, repeated_passage_mask, -1e7)

    best_span = self._get_best_span_yesno_followup(span_start_logits,
                                                   span_end_logits,
                                                   span_yesno_logits,
                                                   self._max_span_length)

    output_dict: Dict[str, Any] = {}

    # Compute the loss for training
    if span_start is not None:
        loss = nll_loss(util.masked_log_softmax(span_start_logits, repeated_passage_mask),
                        span_start.view(-1), ignore_index=-1)
        self._span_start_accuracy(span_start_logits, span_start.view(-1), mask=qa_mask)
        loss += nll_loss(util.masked_log_softmax(span_end_logits, repeated_passage_mask),
                         span_end.view(-1), ignore_index=-1)
        self._span_end_accuracy(span_end_logits, span_end.view(-1), mask=qa_mask)
        self._span_accuracy(best_span[:, 0:2],
                            torch.stack([span_start, span_end], -1).view(total_qa_count, 2),
                            mask=qa_mask.unsqueeze(1).expand(-1, 2).long())

        # The yes/no logits are read through a flat view in which every passage token of
        # every question owns three consecutive entries. For question row ``i`` and end
        # token ``e`` those entries are ``(i * passage_length + e) * 3 + k`` with ``k`` in
        # 0..2; negative positions (from the ``-1`` padding label) are clamped to 0 and the
        # corresponding targets are ignored by ``ignore_index``.
        # The indices are built with tensor ops rather than via a squeezed numpy array,
        # which would be 0-d (and not indexable) when the batch holds a single question.
        index_device = span_start.device
        row_base = torch.arange(total_qa_count, device=index_device) * (passage_length * 3)
        class_offsets = torch.arange(3, device=index_device)

        gold_end = span_end.view(total_qa_count).long()
        gold_span_end_loc = (
            (gold_end * 3 + row_base).unsqueeze(-1) + class_offsets).clamp(min=0).view(-1)

        pred_end = best_span[:, 1].long().to(index_device)
        predicted_end = (
            (pred_end * 3 + row_base).unsqueeze(-1) + class_offsets).clamp(min=0).view(-1)

        # The loss uses the logits at the gold end position ...
        _yesno = span_yesno_logits.view(-1).index_select(0, gold_span_end_loc).view(-1, 3)
        loss += nll_loss(torch.nn.functional.log_softmax(_yesno, dim=-1),
                         yesno_list.view(-1), ignore_index=-1)

        # ... while the accuracy metric uses the logits at the predicted end position.
        _yesno = span_yesno_logits.view(-1).index_select(0, predicted_end).view(-1, 3)
        self._span_yesno_accuracy(_yesno, yesno_list.view(-1), mask=qa_mask)
        output_dict["loss"] = loss

    # Compute the F1 and add the predicted answer strings to the output.
    output_dict['best_span_str'] = []
    output_dict['qid'] = []
    output_dict['yesno'] = []
    best_span_cpu = best_span.detach().cpu().numpy()
    for i in range(batch_size):
        passage_str = metadata[i]['original_passage']
        offsets = metadata[i]['token_offsets']
        f1_score = 0.0
        per_dialog_best_span_list = []
        per_dialog_yesno_list = []
        per_dialog_query_id_list = []
        for per_dialog_query_index, (iid, answer_texts) in enumerate(
                zip(metadata[i]["instance_id"], metadata[i]["answer_texts_list"])):
            # Each row of best_span is (start token, end token, yes/no index).
            predicted_span = tuple(best_span_cpu[i * max_qa_count + per_dialog_query_index])

            # Map the inclusive token span back to character offsets in the raw passage.
            start_offset = offsets[predicted_span[0]][0]
            end_offset = offsets[predicted_span[1]][1]

            yesno_pred = predicted_span[2]
            per_dialog_yesno_list.append(yesno_pred)
            per_dialog_query_id_list.append(iid)

            best_span_string = passage_str[start_offset:end_offset]
            per_dialog_best_span_list.append(best_span_string)
            if answer_texts:
                if len(answer_texts) > 1:
                    t_f1 = []
                    # Compute F1 over N-1 human references and averages the scores.
                    for answer_index in range(len(answer_texts)):
                        refs = [ref for ref_index, ref in enumerate(answer_texts)
                                if ref_index != answer_index]
                        t_f1.append(squad_eval.metric_max_over_ground_truths(
                            squad_eval.f1_score, best_span_string, refs))
                    f1_score = 1.0 * sum(t_f1) / len(t_f1)
                else:
                    f1_score = squad_eval.metric_max_over_ground_truths(
                        squad_eval.f1_score, best_span_string, answer_texts)
            self._official_f1(100 * f1_score)
        output_dict['qid'].append(per_dialog_query_id_list)
        output_dict['best_span_str'].append(per_dialog_best_span_list)
        output_dict['yesno'].append(per_dialog_yesno_list)
    return output_dict
def forward(self, # type: ignore question: Dict[str, torch.LongTensor], passage: Dict[str, torch.LongTensor], span_start: torch.IntTensor = None, span_end: torch.IntTensor = None, p1_answer_marker: torch.IntTensor = None, p2_answer_marker: torch.IntTensor = None, p3_answer_marker: torch.IntTensor = None, yesno_list: torch.IntTensor = None, followup_list: torch.IntTensor = None, metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]: # pylint: disable=arguments-differ """ Parameters ---------- question : Dict[str, torch.LongTensor] From a ``TextField``. passage : Dict[str, torch.LongTensor] From a ``TextField``. The model assumes that this passage contains the answer to the question, and predicts the beginning and ending positions of the answer within the passage. span_start : ``torch.IntTensor``, optional From an ``IndexField``. This is one of the things we are trying to predict - the beginning position of the answer with the passage. This is an `inclusive` token index. If this is given, we will compute a loss that gets included in the output dictionary. span_end : ``torch.IntTensor``, optional From an ``IndexField``. This is one of the things we are trying to predict - the ending position of the answer with the passage. This is an `inclusive` token index. If this is given, we will compute a loss that gets included in the output dictionary. p1_answer_marker : ``torch.IntTensor``, optional This is one of the inputs, but only when num_context_answers > 0. This is a tensor that has a shape [batch_size, max_qa_count, max_passage_length]. Most passage token will have assigned 'O', except the passage tokens belongs to the previous answer in the dialog, which will be assigned labels such as <1_start>, <1_in>, <1_end>. For more details, look into dataset_readers/util/make_reading_comprehension_instance_quac p2_answer_marker : ``torch.IntTensor``, optional This is one of the inputs, but only when num_context_answers > 1. 
It is similar to p1_answer_marker, but marking previous previous answer in passage. p3_answer_marker : ``torch.IntTensor``, optional This is one of the inputs, but only when num_context_answers > 2. It is similar to p1_answer_marker, but marking previous previous previous answer in passage. yesno_list : ``torch.IntTensor``, optional This is one of the outputs that we are trying to predict. Three way classification (the yes/no/not a yes no question). followup_list : ``torch.IntTensor``, optional This is one of the outputs that we are trying to predict. Three way classification (followup / maybe followup / don't followup). metadata : ``List[Dict[str, Any]]``, optional If present, this should contain the question ID, original passage text, and token offsets into the passage for each instance in the batch. We use this for computing official metrics using the official SQuAD evaluation script. The length of this list should be the batch size, and each dictionary should have the keys ``id``, ``original_passage``, and ``token_offsets``. If you only want the best span string and don't care about official metrics, you can omit the ``id`` key. Returns ------- An output dictionary consisting of the followings. Each of the followings is a nested list because first iterates over dialog, then questions in dialog. qid : List[List[str]] A list of list, consisting of question ids. followup : List[List[int]] A list of list, consisting of continuation marker prediction index. (y :yes, m: maybe follow up, n: don't follow up) yesno : List[List[int]] A list of list, consisting of affirmation marker prediction index. (y :yes, x: not a yes/no question, n: np) best_span_str : List[List[str]] If sufficient metadata was provided for the instances in the batch, we also return the string from the original passage that the model thinks is the best answer to the question. loss : torch.FloatTensor, optional A scalar loss to be optimised. 
""" batch_size, max_qa_count, max_q_len, _ = question['token_characters'].size() total_qa_count = batch_size * max_qa_count qa_mask = torch.ge(followup_list, 0).view(total_qa_count) embedded_question = self._text_field_embedder(question, num_wrapping_dims=1) embedded_question = embedded_question.reshape(total_qa_count, max_q_len, self._text_field_embedder.get_output_dim()) embedded_question = self._variational_dropout(embedded_question) embedded_passage = self._variational_dropout(self._text_field_embedder(passage)) passage_length = embedded_passage.size(1) question_mask = util.get_text_field_mask(question, num_wrapping_dims=1).float() question_mask = question_mask.reshape(total_qa_count, max_q_len) passage_mask = util.get_text_field_mask(passage).float() repeated_passage_mask = passage_mask.unsqueeze(1).repeat(1, max_qa_count, 1) repeated_passage_mask = repeated_passage_mask.view(total_qa_count, passage_length) if self._num_context_answers > 0: # Encode question turn number inside the dialog into question embedding. question_num_ind = util.get_range_vector(max_qa_count, util.get_device_of(embedded_question)) question_num_ind = question_num_ind.unsqueeze(-1).repeat(1, max_q_len) question_num_ind = question_num_ind.unsqueeze(0).repeat(batch_size, 1, 1) question_num_ind = question_num_ind.reshape(total_qa_count, max_q_len) question_num_marker_emb = self._question_num_marker(question_num_ind) embedded_question = torch.cat([embedded_question, question_num_marker_emb], dim=-1) # Encode the previous answers in passage embedding. repeated_embedded_passage = embedded_passage.unsqueeze(1).repeat(1, max_qa_count, 1, 1). 
\ view(total_qa_count, passage_length, self._text_field_embedder.get_output_dim()) # batch_size * max_qa_count, passage_length, word_embed_dim p1_answer_marker = p1_answer_marker.view(total_qa_count, passage_length) p1_answer_marker_emb = self._prev_ans_marker(p1_answer_marker) repeated_embedded_passage = torch.cat([repeated_embedded_passage, p1_answer_marker_emb], dim=-1) if self._num_context_answers > 1: p2_answer_marker = p2_answer_marker.view(total_qa_count, passage_length) p2_answer_marker_emb = self._prev_ans_marker(p2_answer_marker) repeated_embedded_passage = torch.cat([repeated_embedded_passage, p2_answer_marker_emb], dim=-1) if self._num_context_answers > 2: p3_answer_marker = p3_answer_marker.view(total_qa_count, passage_length) p3_answer_marker_emb = self._prev_ans_marker(p3_answer_marker) repeated_embedded_passage = torch.cat([repeated_embedded_passage, p3_answer_marker_emb], dim=-1) repeated_encoded_passage = self._variational_dropout(self._phrase_layer(repeated_embedded_passage, repeated_passage_mask)) else: encoded_passage = self._variational_dropout(self._phrase_layer(embedded_passage, passage_mask)) repeated_encoded_passage = encoded_passage.unsqueeze(1).repeat(1, max_qa_count, 1, 1) repeated_encoded_passage = repeated_encoded_passage.view(total_qa_count, passage_length, self._encoding_dim) encoded_question = self._variational_dropout(self._phrase_layer(embedded_question, question_mask)) # Shape: (batch_size * max_qa_count, passage_length, question_length) passage_question_similarity = self._matrix_attention(repeated_encoded_passage, encoded_question) # Shape: (batch_size * max_qa_count, passage_length, question_length) passage_question_attention = util.masked_softmax(passage_question_similarity, question_mask) # Shape: (batch_size * max_qa_count, passage_length, encoding_dim) passage_question_vectors = util.weighted_sum(encoded_question, passage_question_attention) # We replace masked values with something really negative here, so they don't 
affect the # max below. masked_similarity = util.replace_masked_values(passage_question_similarity, question_mask.unsqueeze(1), -1e7) question_passage_similarity = masked_similarity.max(dim=-1)[0].squeeze(-1) question_passage_attention = util.masked_softmax(question_passage_similarity, repeated_passage_mask) # Shape: (batch_size * max_qa_count, encoding_dim) question_passage_vector = util.weighted_sum(repeated_encoded_passage, question_passage_attention) tiled_question_passage_vector = question_passage_vector.unsqueeze(1).expand(total_qa_count, passage_length, self._encoding_dim) # Shape: (batch_size * max_qa_count, passage_length, encoding_dim * 4) final_merged_passage = torch.cat([repeated_encoded_passage, passage_question_vectors, repeated_encoded_passage * passage_question_vectors, repeated_encoded_passage * tiled_question_passage_vector], dim=-1) final_merged_passage = F.relu(self._merge_atten(final_merged_passage)) residual_layer = self._variational_dropout(self._residual_encoder(final_merged_passage, repeated_passage_mask)) self_attention_matrix = self._self_attention(residual_layer, residual_layer) mask = repeated_passage_mask.reshape(total_qa_count, passage_length, 1) \ * repeated_passage_mask.reshape(total_qa_count, 1, passage_length) self_mask = torch.eye(passage_length, passage_length, device=self_attention_matrix.device) self_mask = self_mask.reshape(1, passage_length, passage_length) mask = mask * (1 - self_mask) self_attention_probs = util.masked_softmax(self_attention_matrix, mask) # (batch, passage_len, passage_len) * (batch, passage_len, dim) -> (batch, passage_len, dim) self_attention_vecs = torch.matmul(self_attention_probs, residual_layer) self_attention_vecs = torch.cat([self_attention_vecs, residual_layer, residual_layer * self_attention_vecs], dim=-1) residual_layer = F.relu(self._merge_self_attention(self_attention_vecs)) final_merged_passage = final_merged_passage + residual_layer # batch_size * maxqa_pair_len * max_passage_len * 200 
final_merged_passage = self._variational_dropout(final_merged_passage) start_rep = self._span_start_encoder(final_merged_passage, repeated_passage_mask) span_start_logits = self._span_start_predictor(start_rep).squeeze(-1) end_rep = self._span_end_encoder(torch.cat([final_merged_passage, start_rep], dim=-1), repeated_passage_mask) span_end_logits = self._span_end_predictor(end_rep).squeeze(-1) span_yesno_logits = self._span_yesno_predictor(end_rep).squeeze(-1) span_followup_logits = self._span_followup_predictor(end_rep).squeeze(-1) span_start_logits = util.replace_masked_values(span_start_logits, repeated_passage_mask, -1e7) # batch_size * maxqa_len_pair, max_document_len span_end_logits = util.replace_masked_values(span_end_logits, repeated_passage_mask, -1e7) best_span = self._get_best_span_yesno_followup(span_start_logits, span_end_logits, span_yesno_logits, span_followup_logits, self._max_span_length) output_dict: Dict[str, Any] = {} # Compute the loss. if span_start is not None: loss = nll_loss(util.masked_log_softmax(span_start_logits, repeated_passage_mask), span_start.view(-1), ignore_index=-1) self._span_start_accuracy(span_start_logits, span_start.view(-1), mask=qa_mask) loss += nll_loss(util.masked_log_softmax(span_end_logits, repeated_passage_mask), span_end.view(-1), ignore_index=-1) self._span_end_accuracy(span_end_logits, span_end.view(-1), mask=qa_mask) self._span_accuracy(best_span[:, 0:2], torch.stack([span_start, span_end], -1).view(total_qa_count, 2), mask=qa_mask.unsqueeze(1).expand(-1, 2).long()) # add a select for the right span to compute loss gold_span_end_loc = [] span_end = span_end.view(total_qa_count).squeeze().data.cpu().numpy() for i in range(0, total_qa_count): gold_span_end_loc.append(max(span_end[i] * 3 + i * passage_length * 3, 0)) gold_span_end_loc.append(max(span_end[i] * 3 + i * passage_length * 3 + 1, 0)) gold_span_end_loc.append(max(span_end[i] * 3 + i * passage_length * 3 + 2, 0)) gold_span_end_loc = 
span_start.new(gold_span_end_loc) pred_span_end_loc = [] for i in range(0, total_qa_count): pred_span_end_loc.append(max(best_span[i][1] * 3 + i * passage_length * 3, 0)) pred_span_end_loc.append(max(best_span[i][1] * 3 + i * passage_length * 3 + 1, 0)) pred_span_end_loc.append(max(best_span[i][1] * 3 + i * passage_length * 3 + 2, 0)) predicted_end = span_start.new(pred_span_end_loc) _yesno = span_yesno_logits.view(-1).index_select(0, gold_span_end_loc).view(-1, 3) _followup = span_followup_logits.view(-1).index_select(0, gold_span_end_loc).view(-1, 3) loss += nll_loss(F.log_softmax(_yesno, dim=-1), yesno_list.view(-1), ignore_index=-1) loss += nll_loss(F.log_softmax(_followup, dim=-1), followup_list.view(-1), ignore_index=-1) _yesno = span_yesno_logits.view(-1).index_select(0, predicted_end).view(-1, 3) _followup = span_followup_logits.view(-1).index_select(0, predicted_end).view(-1, 3) self._span_yesno_accuracy(_yesno, yesno_list.view(-1), mask=qa_mask) self._span_followup_accuracy(_followup, followup_list.view(-1), mask=qa_mask) output_dict["loss"] = loss # Compute F1 and preparing the output dictionary. 
output_dict['best_span_str'] = [] output_dict['qid'] = [] output_dict['followup'] = [] output_dict['yesno'] = [] best_span_cpu = best_span.detach().cpu().numpy() for i in range(batch_size): passage_str = metadata[i]['original_passage'] offsets = metadata[i]['token_offsets'] f1_score = 0.0 per_dialog_best_span_list = [] per_dialog_yesno_list = [] per_dialog_followup_list = [] per_dialog_query_id_list = [] for per_dialog_query_index, (iid, answer_texts) in enumerate( zip(metadata[i]["instance_id"], metadata[i]["answer_texts_list"])): predicted_span = tuple(best_span_cpu[i * max_qa_count + per_dialog_query_index]) start_offset = offsets[predicted_span[0]][0] end_offset = offsets[predicted_span[1]][1] yesno_pred = predicted_span[2] followup_pred = predicted_span[3] per_dialog_yesno_list.append(yesno_pred) per_dialog_followup_list.append(followup_pred) per_dialog_query_id_list.append(iid) best_span_string = passage_str[start_offset:end_offset] per_dialog_best_span_list.append(best_span_string) if answer_texts: if len(answer_texts) > 1: t_f1 = [] # Compute F1 over N-1 human references and averages the scores. for answer_index in range(len(answer_texts)): idxes = list(range(len(answer_texts))) idxes.pop(answer_index) refs = [answer_texts[z] for z in idxes] t_f1.append(squad_eval.metric_max_over_ground_truths(squad_eval.f1_score, best_span_string, refs)) f1_score = 1.0 * sum(t_f1) / len(t_f1) else: f1_score = squad_eval.metric_max_over_ground_truths(squad_eval.f1_score, best_span_string, answer_texts) self._official_f1(100 * f1_score) output_dict['qid'].append(per_dialog_query_id_list) output_dict['best_span_str'].append(per_dialog_best_span_list) output_dict['yesno'].append(per_dialog_yesno_list) output_dict['followup'].append(per_dialog_followup_list) return output_dict