def calculate(self, sample_list, model_output, *args, **kwargs):
    # `registry` and `torch` are expected to be module-level imports in this file
    answer_processor = registry.get(
        sample_list.dataset_name + "_answer_processor"
    )

    batch_size = sample_list.context_tokens.size(0)
    pred_answers = model_output["scores"].argmax(dim=-1)
    context_tokens = sample_list.context_tokens.cpu().numpy()
    answers = sample_list.get(self.gt_key).cpu().numpy()
    answer_space_size = answer_processor.get_true_vocab_size()

    predictions = []
    from mmf.utils.distributed import byte_tensor_to_object
    from mmf.utils.text import word_tokenize

    for idx in range(batch_size):
        tokens = byte_tensor_to_object(context_tokens[idx])
        answer_words = []
        for answer_id in pred_answers[idx].tolist():
            if answer_id >= answer_space_size:
                # id falls in the dynamic (copy) range: map it back to an
                # OCR token from this sample's context
                answer_id -= answer_space_size
                answer_words.append(word_tokenize(tokens[answer_id]))
            else:
                if answer_id == answer_processor.EOS_IDX:
                    break
                answer_words.append(
                    answer_processor.answer_vocab.idx2word(answer_id)
                )

        pred_answer = " ".join(answer_words).replace(" 's", "'s")
        gt_answers = byte_tensor_to_object(answers[idx])
        predictions.append({"pred_answer": pred_answer, "gt_answers": gt_answers})

    accuracy = self.evaluator.eval_pred_list(predictions)
    accuracy = torch.tensor(accuracy).to(sample_list.context_tokens.device)

    return accuracy
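
# A minimal, self-contained sketch of the decoding scheme that `calculate`
# relies on, using toy values (the names here are illustrative and not part
# of the MMF API): prediction ids below the fixed vocabulary size index a
# vocabulary answer, ids at or above it copy an OCR token detected in the
# image, and decoding stops at the first EOS from the fixed vocabulary.
def demo_decode(pred_ids, fixed_vocab, ocr_tokens, eos_idx):
    answer_words = []
    for answer_id in pred_ids:
        if answer_id >= len(fixed_vocab):
            # copy branch: index into this sample's OCR tokens
            answer_words.append(ocr_tokens[answer_id - len(fixed_vocab)])
        elif answer_id == eos_idx:
            break
        else:
            answer_words.append(fixed_vocab[answer_id])
    return " ".join(answer_words).replace(" 's", "'s")


# With fixed_vocab = ["<pad>", "</s>", "yes", "no"] (EOS at index 1) and
# ocr_tokens = ["stop", "sign"], the id sequence [4, 5, 1] decodes to
# "stop sign": ids 4 and 5 fall in the copy range, 1 is EOS.
assert demo_decode([4, 5, 1], ["<pad>", "</s>", "yes", "no"],
                   ["stop", "sign"], eos_idx=1) == "stop sign"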
def format_for_evalai(self, batch, answers):
    # word_tokenize is assumed to come from pythia.utils.text_utils in this
    # (pre-MMF Pythia) codebase
    from pythia.utils.text_utils import word_tokenize

    answers = answers.argmax(dim=1)

    predictions = []
    for idx, question_id in enumerate(batch["question_id"]):
        answer_id = answers[idx]

        if answer_id >= self.answer_space_size:
            # id in the copy range: recover the OCR token. The collated
            # batch["ocr_tokens"] is indexed [token_position][batch_index],
            # hence [answer_id][idx] rather than [idx][answer_id].
            answer_id -= self.answer_space_size
            answer = word_tokenize(batch["ocr_tokens"][answer_id][idx])
        else:
            answer = self.answer_dict.idx2word(answer_id)

        predictions.append({
            "question_id": question_id.item(),
            "answer": answer
        })

    return predictions
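
# Why `format_for_evalai` indexes batch["ocr_tokens"][answer_id][idx] and not
# [idx][answer_id]: PyTorch's default_collate transposes a batch of
# equal-length string lists into a list over token positions, each entry a
# tuple over the batch. A minimal sketch with toy data (assuming the batch
# was built by a standard DataLoader using default_collate):
from torch.utils.data.dataloader import default_collate

samples = [
    {"ocr_tokens": ["stop", "sign"]},  # sample 0
    {"ocr_tokens": ["coca", "cola"]},  # sample 1
]
collated = default_collate(samples)
# token position 1 of sample 0:
assert collated["ocr_tokens"][1][0] == "sign"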
def format_for_prediction(self, report):
    from mmf.utils.distributed import byte_tensor_to_object
    from mmf.utils.text import word_tokenize

    answer_processor = self.answer_processor

    batch_size = len(report.question_id)
    pred_answers = report.scores.argmax(dim=-1).view(batch_size, -1)
    answer_space_size = answer_processor.get_true_vocab_size()

    image_ids = report.image_id.cpu().numpy()
    context_tokens = report.context_tokens.cpu().numpy()
    predictions = []
    for idx, question_id in enumerate(report.question_id):
        # collect VQA answers
        image_id = byte_tensor_to_object(image_ids[idx])
        tokens = byte_tensor_to_object(context_tokens[idx])
        answer_words = []
        pred_source = []
        for answer_id in pred_answers[idx].tolist():
            if answer_id >= answer_space_size:
                # id in the dynamic range: copy the OCR token
                answer_id -= answer_space_size
                answer_words.append(word_tokenize(tokens[answer_id]))
                pred_source.append("OCR")
            else:
                if answer_id == answer_processor.EOS_IDX:
                    break
                answer_words.append(
                    answer_processor.answer_vocab.idx2word(answer_id)
                )
                pred_source.append("VOCAB")

        # join all the answer tokens with space
        # (this should be correct for almost all cases)
        pred_answer = " ".join(answer_words).replace(" 's", "'s")
        entry = {
            "question_id": question_id.item(),
            "image_id": image_id,
            "answer": pred_answer,
            "pred_source": pred_source,
        }
        entry = self.postprocess_evalai_entry(entry)

        predictions.append(entry)

    return predictions
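
# Why the byte-tensor round trip in `format_for_prediction`: variable-length
# Python objects (OCR token lists, string image ids) are pickled into
# fixed-size uint8 tensors so they can be batched and gathered like any other
# tensor, then recovered per sample. A minimal round-trip sketch using the
# MMF helpers (requires mmf to be installed; the toy values are illustrative):
from mmf.utils.distributed import byte_tensor_to_object, object_to_byte_tensor

tokens = ["stop", "sign"]
packed = object_to_byte_tensor(tokens)  # 1-D padded uint8 tensor
assert byte_tensor_to_object(packed) == tokens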