Example #1
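This metric computes TextVQA-style accuracy. Predicted answer ids are decoded one token at a time: ids past the fixed answer vocabulary index into the sample's OCR tokens (a copy-style lookup), in-vocabulary ids are resolved through the answer vocabulary, and decoding stops at the EOS token. The decoded strings are then scored against the ground-truth answers by self.evaluator.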
    def calculate(self, sample_list, model_output, *args, **kwargs):
        # Imports are local in the original source; registry and torch are
        # added here so the snippet is self-contained.
        import torch
        from mmf.common.registry import registry
        from mmf.utils.distributed import byte_tensor_to_object
        from mmf.utils.text import word_tokenize

        answer_processor = registry.get(sample_list.dataset_name + "_answer_processor")

        batch_size = sample_list.context_tokens.size(0)
        pred_answers = model_output["scores"].argmax(dim=-1)
        context_tokens = sample_list.context_tokens.cpu().numpy()
        answers = sample_list.get(self.gt_key).cpu().numpy()
        answer_space_size = answer_processor.get_true_vocab_size()

        predictions = []

        for idx in range(batch_size):
            tokens = byte_tensor_to_object(context_tokens[idx])
            answer_words = []
            for answer_id in pred_answers[idx].tolist():
                if answer_id >= answer_space_size:
                    # ids past the vocabulary size index into this sample's OCR tokens
                    answer_id -= answer_space_size
                    answer_words.append(word_tokenize(tokens[answer_id]))
                else:
                    if answer_id == answer_processor.EOS_IDX:
                        break
                    answer_words.append(
                        answer_processor.answer_vocab.idx2word(answer_id)
                    )

            pred_answer = " ".join(answer_words).replace(" 's", "'s")
            gt_answers = byte_tensor_to_object(answers[idx])
            predictions.append({"pred_answer": pred_answer, "gt_answers": gt_answers})

        accuracy = self.evaluator.eval_pred_list(predictions)
        accuracy = torch.tensor(accuracy).to(sample_list.context_tokens.device)

        return accuracy
Example #2
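The same decoding loop, reused to format predictions for submission; the entries are passed through postprocess_evalai_entry, which suggests an EvalAI-style output. Besides the answer string, each entry records whether every token came from the OCR tokens ("OCR") or the fixed vocabulary ("VOCAB"). byte_tensor_to_object and word_tokenize are assumed to be imported at module level, as in Example #1.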
    def format_for_prediction(self, report):
        answer_processor = self.answer_processor

        batch_size = len(report.question_id)
        pred_answers = report.scores.argmax(dim=-1).view(batch_size, -1)
        answer_space_size = answer_processor.get_true_vocab_size()

        image_ids = report.image_id.cpu().numpy()
        context_tokens = report.context_tokens.cpu().numpy()
        predictions = []
        for idx, question_id in enumerate(report.question_id):
            # collect VQA answers
            image_id = byte_tensor_to_object(image_ids[idx])
            tokens = byte_tensor_to_object(context_tokens[idx])
            answer_words = []
            pred_source = []
            for answer_id in pred_answers[idx].tolist():
                if answer_id >= answer_space_size:
                    answer_id -= answer_space_size
                    answer_words.append(word_tokenize(tokens[answer_id]))
                    pred_source.append("OCR")
                else:
                    if answer_id == answer_processor.EOS_IDX:
                        break
                    answer_words.append(
                        answer_processor.answer_vocab.idx2word(answer_id)
                    )
                    pred_source.append("VOCAB")
            # join all the answer tokens with space
            # (this should be correct for almost all cases)
            pred_answer = " ".join(answer_words).replace(" 's", "'s")
            entry = {
                "question_id": question_id.item(),
                "image_id": image_id,
                "answer": pred_answer,
                "pred_source": pred_source,
            }
            entry = self.postprocess_evalai_entry(entry)

            predictions.append(entry)

        return predictions
Example #3
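Formatting for multi-label prediction: scores go through a sigmoid and are thresholded against self.prediction_threshold; every class index that clears the threshold is mapped to its label name through self.idx_to_class.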
    def format_for_prediction(self, report):
        scores = torch.sigmoid(report.scores)
        binary_scores = scores > self.prediction_threshold
        predictions = []

        for idx, item_id in enumerate(report.id):
            item_id = byte_tensor_to_object(item_id)
            # indices of all classes whose score clears the threshold
            label_indices = binary_scores[idx].nonzero(as_tuple=False)
            labels = [self.idx_to_class[i.item()] for i in label_indices]
            predictions.append({"id": item_id, "labels": labels})

        return predictions
Example #4
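A DETR-style detection loss. Per-dataset class and box heads run on the decoder hidden states; with aux_loss enabled, auxiliary outputs are collected from the intermediate decoder layers; a per-dataset criterion then computes the weighted losses against targets decoded from byte tensors. Dict and Tensor are assumed to come from typing and torch, respectively.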
    def detection_loss_calculation(
        self, detr_outputs: Dict[str, Tensor], sample_list
    ):
        hs = detr_outputs["hidden_states"]

        # per-dataset classification and box-regression heads on the
        # decoder hidden states
        outputs_class = self.class_embeds[sample_list.dataset_name](hs)
        outputs_coord = self.bbox_embeds[sample_list.dataset_name](hs).sigmoid()
        detr_outputs.update({
            "pred_logits": outputs_class[-1],
            "pred_boxes": outputs_coord[-1],
            "hs_for_attr": hs[-1],
        })
        # skip loss computation on the test set (which usually doesn't contain labels)
        if sample_list.dataset_type != "test":
            if self.config.base_args.aux_loss:
                # auxiliary outputs from the intermediate decoder layers
                detr_outputs["aux_outputs"] = [
                    {"pred_logits": a, "pred_boxes": b, "hs_for_attr": c}
                    for a, b, c in zip(
                        outputs_class[:-1], outputs_coord[:-1], hs[:-1]
                    )
                ]

            criterion = self.det_losses[sample_list.dataset_name]
            targets = [byte_tensor_to_object(t) for t in sample_list.targets_enc]
            targets = [
                {k: v.to(hs.device) for k, v in t.items()} for t in targets
            ]
            sample_list.targets = targets
            loss_dict = criterion(detr_outputs, sample_list.targets)
            weight_dict = criterion.weight_dict
            loss_prefix = f"{sample_list.dataset_type}/{sample_list.dataset_name}/"
            losses = {
                loss_prefix + k: loss_dict[k]
                * weight_dict[k]
                * self.config.detection_loss_weight
                for k in loss_dict
                if k in weight_dict
            }
            detr_outputs["losses"] = losses

        if (
            self.config.heads["detection"][sample_list.dataset_name]["use_attr"]
            and self.config.predict_attributes
        ):
            # predict attributes from the top-scoring object class of each query
            hs_for_attr = detr_outputs["hs_for_attr"]
            top_obj_class = detr_outputs["pred_logits"][..., :-1].argmax(dim=-1)
            attr_head = self.det_losses[sample_list.dataset_name].attribute_head
            detr_outputs["attr_logits"] = attr_head(hs_for_attr, top_obj_class)

        return detr_outputs
Example #5
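Formatting generated captions for prediction: each token sequence is decoded by the caption processor and paired with its image id; <unk> tokens can optionally be stripped from the output.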
    def format_for_prediction(self, report):
        captions = report.captions.tolist()
        predictions = []
        remove_unk_from_caption_prediction = getattr(
            self.config, "remove_unk_from_caption_prediction", False)

        for idx, image_id in enumerate(report.image_id):
            image_id = byte_tensor_to_object(image_id)
            caption = self.caption_processor(captions[idx])["caption"]
            if remove_unk_from_caption_prediction:
                caption = caption.replace("<unk>", "")
                caption = caption.replace("  ", " ").strip()
            if isinstance(image_id, torch.Tensor):
                image_id = image_id.item()
            predictions.append({"image_id": image_id, "caption": caption})

        return predictions
Example #6
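A variant of the previous method that also exports the raw tokens and the per-token cross-attention weights alongside each decoded caption.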
    def format_for_prediction(self, report):
        captions = report.captions.tolist()
        cross_attentions = report.cross_attention.tolist()
        predictions = []

        for idx, image_id in enumerate(report.image_id):
            image_id = byte_tensor_to_object(image_id)
            cross_attention = cross_attentions[idx]
            caption = self.caption_processor.id2tokens(captions[idx]).split()
            raw_caption = self.caption_processor.id2rawtoken(captions[idx])
            if isinstance(image_id, torch.Tensor):
                image_id = image_id.item()
            predictions.append({
                "image_id": image_id,
                "caption": caption,
                "cross_attention": cross_attention,
                "raw_caption": raw_caption,
            })

        return predictions
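Example #7
A unit test for the byte-tensor round trip used throughout the examples above (distributed here refers to mmf.utils.distributed): any picklable object should survive conversion to a byte tensor and back.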
    def test_object_byte_tensor_conversion(self):
        # round trip: arbitrary picklable object -> byte tensor -> object
        test_obj = [1, "2", {3: 4}, [5]]
        test_obj_bytes = distributed.object_to_byte_tensor(test_obj)
        test_obj_dec = distributed.byte_tensor_to_object(test_obj_bytes)
        self.assertEqual(test_obj_dec, test_obj)
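
All of the snippets above rely on object_to_byte_tensor / byte_tensor_to_object to move arbitrary Python objects (token lists, string ids, target dicts) through tensor-only pipelines. Below is a minimal sketch of what such a pair can look like, assuming plain pickle serialization; this is an illustrative reimplementation rather than MMF's actual code, which additionally handles details (such as a maximum size for distributed broadcast) omitted here.

import pickle

import torch


def object_to_byte_tensor(obj):
    # Serialize any picklable object into a 1-D uint8 tensor so it can
    # travel through code paths that only accept tensors.
    data = pickle.dumps(obj)
    return torch.tensor(list(data), dtype=torch.uint8)


def byte_tensor_to_object(byte_tensor):
    # Inverse conversion: recover the raw bytes and unpickle them. This
    # also works for numpy arrays, since they expose tolist() too.
    data = bytes(byte_tensor.tolist())
    return pickle.loads(data)


obj = [1, "2", {3: 4}, [5]]
assert byte_tensor_to_object(object_to_byte_tensor(obj)) == obj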