def calculate(self, sample_list, model_output, *args, **kwargs):
    """Decode the predicted answers of a batch and score them.

    For every sample, the argmax over ``model_output["scores"]`` yields a
    sequence of ids. Ids below the answer processor's true vocab size are
    looked up in the answer vocab, and decoding stops at ``EOS_IDX``. Ids at
    or above that size are shifted down by it and used to index the sample's
    decoded context tokens. The decoded words are joined with spaces and
    paired with the ground-truth answers read from ``self.gt_key``.

    Args:
        sample_list: Batch providing ``dataset_name``, ``context_tokens`` and
            the ground-truth entry stored under ``self.gt_key``.
        model_output: Mapping holding the ``"scores"`` tensor.

    Returns:
        The result of ``self.evaluator.eval_pred_list`` wrapped in a tensor
        on the same device as ``sample_list.context_tokens``.
    """
    from mmf.utils.distributed import byte_tensor_to_object
    from mmf.utils.text import word_tokenize

    processor = registry.get(sample_list.dataset_name + "_answer_processor")

    num_samples = sample_list.context_tokens.size(0)
    predicted_ids = model_output["scores"].argmax(dim=-1)
    encoded_context = sample_list.context_tokens.cpu().numpy()
    encoded_answers = sample_list.get(self.gt_key).cpu().numpy()
    vocab_size = processor.get_true_vocab_size()

    predictions = []
    for row in range(num_samples):
        context_words = byte_tensor_to_object(encoded_context[row])

        words = []
        for token_id in predicted_ids[row].tolist():
            if token_id < vocab_size:
                # Fixed-vocab id: EOS terminates the answer.
                if token_id == processor.EOS_IDX:
                    break
                words.append(processor.answer_vocab.idx2word(token_id))
                continue
            # Ids past the vocab point into this sample's context tokens.
            words.append(word_tokenize(context_words[token_id - vocab_size]))

        predictions.append(
            {
                "pred_answer": " ".join(words).replace(" 's", "'s"),
                "gt_answers": byte_tensor_to_object(encoded_answers[row]),
            }
        )

    score = self.evaluator.eval_pred_list(predictions)
    return torch.tensor(score).to(sample_list.context_tokens.device)
def format_for_prediction(self, report):
    """Turn a model report into a list of per-question prediction entries.

    Each entry holds the question id, the decoded image id, the decoded
    answer string and, per answer word, whether it came from the fixed
    answer vocab (``"VOCAB"``) or from the sample's context tokens
    (``"OCR"``). Every entry is passed through
    ``self.postprocess_evalai_entry`` before being collected.

    Args:
        report: Object exposing ``question_id``, ``scores``, ``image_id`` and
            ``context_tokens``.

    Returns:
        A list with one post-processed entry per question.
    """
    processor = self.answer_processor

    num_questions = len(report.question_id)
    predicted_ids = report.scores.argmax(dim=-1).view(num_questions, -1)
    vocab_size = processor.get_true_vocab_size()

    encoded_image_ids = report.image_id.cpu().numpy()
    encoded_context = report.context_tokens.cpu().numpy()

    results = []
    for row, question_id in enumerate(report.question_id):
        # collect VQA answers
        image_id = byte_tensor_to_object(encoded_image_ids[row])
        context_words = byte_tensor_to_object(encoded_context[row])

        words = []
        sources = []
        for token_id in predicted_ids[row].tolist():
            if token_id < vocab_size:
                # Fixed-vocab id: EOS terminates the answer.
                if token_id == processor.EOS_IDX:
                    break
                words.append(processor.answer_vocab.idx2word(token_id))
                sources.append("VOCAB")
            else:
                # Ids past the vocab point into this sample's context tokens.
                words.append(word_tokenize(context_words[token_id - vocab_size]))
                sources.append("OCR")

        # join all the answer tokens with space
        # (this should be correct for almost all cases)
        answer = " ".join(words).replace(" 's", "'s")
        raw_entry = {
            "question_id": question_id.item(),
            "image_id": image_id,
            "answer": answer,
            "pred_source": sources,
        }
        results.append(self.postprocess_evalai_entry(raw_entry))

    return results
def format_for_prediction(self, report):
    """Convert multi-label scores into per-item lists of class labels.

    Scores are passed through a sigmoid and compared against
    ``self.prediction_threshold``. Every index that exceeds the threshold is
    mapped to a label through ``self.idx_to_class``.

    Args:
        report: Object exposing ``scores`` and an iterable ``id`` of encoded
            item ids.

    Returns:
        A list of ``{"id": ..., "labels": [...]}`` dicts, one per item.
    """
    probabilities = torch.sigmoid(report.scores)
    above_threshold = probabilities > self.prediction_threshold

    predictions = []
    for row, encoded_id in enumerate(report.id):
        decoded_id = byte_tensor_to_object(encoded_id)
        hits = above_threshold[row].nonzero(as_tuple=False)
        predictions.append(
            {
                "id": decoded_id,
                "labels": [self.idx_to_class[hit.item()] for hit in hits],
            }
        )
    return predictions
def detection_loss_calculation(self, detr_outputs: Dict[str, Tensor], sample_list):
    """Attach detection predictions, losses and attribute logits to the outputs.

    The dataset-specific class and box heads are applied to
    ``detr_outputs["hidden_states"]``; the last element of each result is
    stored under ``"pred_logits"`` / ``"pred_boxes"`` and the last hidden
    state under ``"hs_for_attr"``. Unless the dataset type is ``"test"``, the
    targets are decoded from ``sample_list.targets_enc``, moved to the hidden
    states' device, stored on ``sample_list.targets`` and used to compute the
    weighted losses saved under ``"losses"``. When enabled by the config,
    attribute logits are stored under ``"attr_logits"``.

    Args:
        detr_outputs: Mapping containing ``"hidden_states"``; updated in place.
        sample_list: Batch providing ``dataset_name``, ``dataset_type`` and
            ``targets_enc``.

    Returns:
        The same ``detr_outputs`` mapping with the new entries added.
    """
    hidden = detr_outputs["hidden_states"]
    dataset_name = sample_list.dataset_name

    class_logits = self.class_embeds[dataset_name](hidden)
    box_coords = self.bbox_embeds[dataset_name](hidden).sigmoid()

    detr_outputs["pred_logits"] = class_logits[-1]
    detr_outputs["pred_boxes"] = box_coords[-1]
    detr_outputs["hs_for_attr"] = hidden[-1]

    # skip loss computation on test set (which usually doesn't contain labels)
    if sample_list.dataset_type != "test":
        if self.config.base_args.aux_loss:
            # One auxiliary output per entry before the last one.
            aux_outputs = []
            for logits, boxes, states in zip(
                class_logits[:-1], box_coords[:-1], hidden[:-1]
            ):
                aux_outputs.append(
                    {"pred_logits": logits, "pred_boxes": boxes, "hs_for_attr": states}
                )
            detr_outputs["aux_outputs"] = aux_outputs

        criterion = self.det_losses[dataset_name]

        decoded_targets = [
            byte_tensor_to_object(encoded) for encoded in sample_list.targets_enc
        ]
        moved_targets = []
        for target in decoded_targets:
            moved_targets.append(
                {key: value.to(hidden.device) for key, value in target.items()}
            )
        sample_list.targets = moved_targets

        loss_dict = criterion(detr_outputs, sample_list.targets)
        weight_dict = criterion.weight_dict
        prefix = f"{sample_list.dataset_type}/{dataset_name}/"

        # Only losses that have an entry in the criterion's weight dict are kept.
        weighted_losses = {}
        for loss_name in loss_dict.keys():
            if loss_name not in weight_dict:
                continue
            weighted_losses[f"{prefix}{loss_name}"] = (
                loss_dict[loss_name]
                * weight_dict[loss_name]
                * self.config.detection_loss_weight
            )
        detr_outputs["losses"] = weighted_losses

    # Optional attribute head, enabled per dataset and globally via config.
    head_config = self.config.heads["detection"][dataset_name]
    if head_config["use_attr"] and self.config.predict_attributes:
        attr_features = detr_outputs["hs_for_attr"]
        # The last logit column is excluded before picking the top class.
        top_classes = detr_outputs["pred_logits"][..., :-1].argmax(dim=-1)
        attribute_head = self.det_losses[dataset_name].attribute_head
        detr_outputs["attr_logits"] = attribute_head(attr_features, top_classes)

    return detr_outputs
def format_for_prediction(self, report):
    """Convert a report of generated captions into prediction entries.

    Each caption id sequence is decoded with ``self.caption_processor`` and
    paired with its decoded image id. When the config flag
    ``remove_unk_from_caption_prediction`` is set (it defaults to ``False``
    when absent), ``"<unk>"`` markers are dropped from the caption and the
    whitespace they leave behind is normalised.

    Args:
        report: Object exposing ``captions`` (convertible with ``tolist()``)
            and an iterable ``image_id`` of encoded ids.

    Returns:
        A list of ``{"image_id": ..., "caption": ...}`` dicts, one per image.
    """
    captions = report.captions.tolist()
    remove_unk_from_caption_prediction = getattr(
        self.config, "remove_unk_from_caption_prediction", False
    )

    predictions = []
    for idx, image_id in enumerate(report.image_id):
        image_id = byte_tensor_to_object(image_id)
        caption = self.caption_processor(captions[idx])["caption"]

        if remove_unk_from_caption_prediction:
            # Dropping "<unk>" leaves runs of spaces behind (several in a row
            # for consecutive markers). split()/join collapses every run to a
            # single space and trims both ends, which a single replace() pass
            # cannot guarantee.
            caption = " ".join(caption.replace("<unk>", "").split())

        # Decoded ids may still be tensors; unwrap them to plain Python values.
        if isinstance(image_id, torch.Tensor):
            image_id = image_id.item()

        predictions.append({"image_id": image_id, "caption": caption})

    return predictions
def format_for_prediction(self, report):
    """Build prediction entries holding captions and their cross attention.

    For every image, the caption ids are converted to tokens with
    ``caption_processor.id2tokens`` (then split on whitespace) and to raw
    tokens with ``caption_processor.id2rawtoken``. The matching
    cross-attention values are included unchanged.

    Args:
        report: Object exposing ``captions``, ``cross_attention`` and an
            iterable ``image_id`` of encoded ids.

    Returns:
        A list of dicts with the keys ``"image_id"``, ``"caption"``,
        ``"cross_attention"`` and ``"raw_caption"``.
    """
    caption_ids = report.captions.tolist()
    attention_rows = report.cross_attention.tolist()
    processor = self.caption_processor

    results = []
    for row, encoded_id in enumerate(report.image_id):
        decoded_id = byte_tensor_to_object(encoded_id)
        attention = attention_rows[row]
        tokens = processor.id2tokens(caption_ids[row]).split()
        raw_tokens = processor.id2rawtoken(caption_ids[row])

        # Decoded ids may still be tensors; unwrap them to plain Python values.
        if isinstance(decoded_id, torch.Tensor):
            decoded_id = decoded_id.item()

        entry = {
            "image_id": decoded_id,
            "caption": tokens,
            "cross_attention": attention,
            "raw_caption": raw_tokens,
        }
        results.append(entry)

    return results
def test_object_byte_tensor_conversion(self):
    """An object encoded to a byte tensor decodes back to an equal object."""
    original = [1, "2", {3: 4}, [5]]
    encoded = distributed.object_to_byte_tensor(original)
    decoded = distributed.byte_tensor_to_object(encoded)
    self.assertEqual(decoded, original)