def test_tie_mlm_head_weight_to_encoder(self):
        self._text_modality_config = MMFTransformerModalityConfig(
            type="text",
            key="text",
            embedding_dim=768,
            position_dim=128,
            segment_id=0,
            encoder=TextEncoderFactory.Config(
                type=TextEncoderTypes.transformer),
        )
        heads = [MLM.Config()]
        modalities_config = [
            self._image_modality_config, self._text_modality_config
        ]
        config = MMFTransformer.Config(
            heads=heads,
            modalities=modalities_config,
            num_labels=2,
            tie_weight_to_encoder="text",
        )
        mmft = build_model(config)

        # With tie_weight_to_encoder="text", the MLM head's decoder weight
        # should match the text encoder's word-embedding matrix.
        self.assertTrue(
            test_utils.compare_tensors(
                mmft.heads[0].cls.predictions.decoder.weight,
                mmft.encoders["text"].embeddings.word_embeddings.weight,
            )
        )
    def test_report_copy(self):
        original_report = self._build_report()
        report_copy = original_report.copy()
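        # copy() should deep-copy tensors: zeroing the copy must not
        # affect the original report.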

        report_copy["scores"].zero_()

        self.assertFalse(
            test_utils.compare_tensors(report_copy["scores"],
                                       original_report["scores"]))
    def test_multi_hot_answer_from_vocab_processor(self):
        config = self._get_config("../../../mmf/configs/datasets/clevr/defaults.yaml")
        clevr_config = config.dataset_config.clevr
        answer_processor_config = clevr_config.processors.answer_processor

        # Test the num_answers == 1 case
        # The path is built relative to this file; os.path.abspath() below
        # normalizes the trailing ".." segments.
        vocab_path = os.path.join(
            os.path.abspath(__file__), "..", "..", "data", "vocab.txt"
        )
        answer_processor_config.params.vocab_file = os.path.abspath(vocab_path)
        answer_processor = MultiHotAnswerFromVocabProcessor(
            answer_processor_config.params
        )
        processed = answer_processor({"answers": ["helmet"]})
        answers_indices = processed["answers_indices"]
        answers_scores = processed["answers_scores"]

        # "helmet" maps to index 5 in the test vocab; answers_indices is
        # padded by repetition to its fixed length of 10.
        self.assertTrue(
            compare_tensors(answers_indices, torch.tensor([5] * 10, dtype=torch.long))
        )
        # Scores form a multi-hot vector with one slot per vocab word (19).
        expected_answers_scores = torch.zeros(19, dtype=torch.float)
        expected_answers_scores[5] = 1.0
        self.assertTrue(compare_tensors(answers_scores, expected_answers_scores))

        # Test the multi-hot case when num_answers > 1
        answer_processor_config.params.vocab_file = os.path.abspath(vocab_path)
        answer_processor_config.params.num_answers = 3
        answer_processor = MultiHotAnswerFromVocabProcessor(
            answer_processor_config.params
        )
        processed = answer_processor({"answers": ["man", "with", "countryside"]})
        answers_indices = processed["answers_indices"]
        answers_scores = processed["answers_scores"]
        self.assertTrue(
            compare_tensors(
                answers_indices,
                torch.tensor([2, 3, 15, 2, 3, 15, 2, 3, 15, 2], dtype=torch.long),
            )
        )
        expected_answers_scores = torch.zeros(19, dtype=torch.float)
        expected_answers_scores[2] = 1.0
        expected_answers_scores[3] = 1.0
        expected_answers_scores[15] = 1.0
        self.assertTrue(compare_tensors(answers_scores, expected_answers_scores))

        # Test unknown words: out-of-vocab answers map to index 0 and
        # receive no score.
        processed = answer_processor({"answers": ["test", "answer", "man"]})
        answers_indices = processed["answers_indices"]
        answers_scores = processed["answers_scores"]
        self.assertTrue(
            compare_tensors(
                answers_indices,
                torch.tensor([0, 0, 2, 0, 0, 2, 0, 0, 2, 0], dtype=torch.long),
            )
        )
        expected_answers_scores = torch.zeros(19, dtype=torch.float)
        expected_answers_scores[2] = 1.0
        self.assertTrue(compare_tensors(answers_scores, expected_answers_scores))
    def test_call(self):
        batch_collator = BatchCollator("vqa2", "train")
        sample_list = test_utils.build_random_sample_list()
        sample_list = batch_collator(sample_list)

        # Test an already built sample list
        self.assertEqual(sample_list.dataset_name, "vqa2")
        self.assertEqual(sample_list.dataset_type, "train")

        sample = Sample()
        sample.a = torch.tensor([1, 2], dtype=torch.int)

        # Test list of samples
        sample_list = batch_collator([sample, sample])
        self.assertTrue(
            test_utils.compare_tensors(
                sample_list.a, torch.tensor([[1, 2], [1, 2]],
                                            dtype=torch.int)))

        # Test IterableDataset case
        sample_list = test_utils.build_random_sample_list()
        new_sample_list = batch_collator([sample_list])
        self.assertEqual(new_sample_list, sample_list)
    def test_preprocessing_with_resnet_encoder(self):
        self._image_modality_config = MMFTransformerModalityConfig(
            type="image",
            key="image",
            embedding_dim=2048,
            position_dim=1,
            segment_id=0,
            encoder=ImageEncoderFactory.Config(
                type=ImageEncoderTypes.resnet152,
                params=ResNet152ImageEncoder.Config(pretrained=False),
            ),
        )
        modalities_config = [
            self._image_modality_config, self._text_modality_config
        ]
        config = MMFTransformer.Config(modalities=modalities_config,
                                       num_labels=2)
        mmft = build_model(config)

        sample_list = SampleList()
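        # 2 RGB images (3x224x224) and 2 token sequences of length 128.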
        sample_list.image = torch.rand(2, 3, 224, 224)
        sample_list.text = torch.randint(0, 512, (2, 128))

        transformer_input = mmft.preprocess_sample(sample_list)

        input_ids = transformer_input["input_ids"]
        self.assertEqual(input_ids["image"].dim(), 3)
        self.assertEqual(list(input_ids["image"].size()), [2, 1, 2048])

        self.assertEqual(input_ids["text"].dim(), 2)
        self.assertEqual(list(input_ids["text"].size()), [2, 128])

        position_ids = transformer_input["position_ids"]
        test_utils.compare_tensors(position_ids["image"],
                                   torch.tensor([[0], [0]]))
        test_utils.compare_tensors(
            position_ids["text"],
            torch.arange(0, 128).unsqueeze(0).expand((2, 128)))

        masks = transformer_input["masks"]
        test_utils.compare_tensors(masks["image"], torch.tensor([[1], [1]]))
        test_utils.compare_tensors(masks["text"], torch.ones((2, 128)).long())

        segment_ids = transformer_input["segment_ids"]
        test_utils.compare_tensors(segment_ids["image"],
                                   torch.tensor([[0], [0]]))
        test_utils.compare_tensors(segment_ids["text"],
                                   torch.ones((2, 128)).long())
    def test_custom_feature_and_mask_preprocessing(self):
        extra_modality = MMFTransformerModalityConfig(
            type="my_random_feature",
            key="my_random_feature",
            embedding_dim=128,
            position_dim=4,
            segment_id=3,
            encoder=EncoderFactory.Config(type="identity"),
        )

        modalities_config = [
            self._image_modality_config,
            self._text_modality_config,
            extra_modality,
        ]
        config = MMFTransformer.Config(modalities=modalities_config,
                                       num_labels=2)
        mmft = build_model(config)

        sample_list = SampleList()
        sample_list.image = torch.rand(2, 256)
        sample_list.text = torch.randint(0, 512, (2, 128))
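        # Simulate padding: only the first 70 text tokens and the first 3
        # feature positions are valid.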
        sample_list.text_mask = torch.ones(2, 128)
        sample_list.text_mask[:, 70:] = 0
        sample_list.my_random_feature = torch.rand(2, 4, 128)
        sample_list.my_random_feature_mask = torch.ones(2, 4)
        sample_list.my_random_feature_mask[:, 3:] = 0

        transformer_input = mmft.preprocess_sample(sample_list)
        input_ids = transformer_input["input_ids"]
        self.assertEqual(input_ids["image"].dim(), 3)
        self.assertEqual(list(input_ids["image"].size()), [2, 1, 256])

        self.assertEqual(input_ids["text"].dim(), 2)
        self.assertEqual(list(input_ids["text"].size()), [2, 128])

        self.assertEqual(input_ids["my_random_feature"].dim(), 3)
        self.assertEqual(list(input_ids["my_random_feature"].size()),
                         [2, 4, 128])

        position_ids = transformer_input["position_ids"]
        test_utils.compare_tensors(position_ids["image"],
                                   torch.tensor([[0], [0]]))
        test_utils.compare_tensors(
            position_ids["text"],
            torch.arange(0, 128).unsqueeze(0).expand((2, 128)))
        test_utils.compare_tensors(
            position_ids["my_random_feature"],
            torch.arange(0, 4).unsqueeze(0).expand((2, 4)),
        )

        masks = transformer_input["masks"]
        test_utils.compare_tensors(masks["image"], torch.tensor([[1], [1]]))
        self.assertEqual(masks["text"].sum().item(), 140)
        self.assertEqual(masks["my_random_feature"].sum().item(), 6)

        segment_ids = transformer_input["segment_ids"]
        test_utils.compare_tensors(segment_ids["image"],
                                   torch.tensor([[0], [0]]))
        test_utils.compare_tensors(segment_ids["text"],
                                   torch.ones((2, 128)).long())
        test_utils.compare_tensors(
            segment_ids["my_random_feature"],
            torch.full((2, 4), dtype=torch.long, fill_value=3).long(),
        )
    def _compare_processed_for_multimodality(self,
                                             transformer_input,
                                             lm_labels_sum=0):
        input_ids = transformer_input["input_ids"]
        self.assertEqual(input_ids["image"].dim(), 3)
        self.assertEqual(list(input_ids["image"].size()), [2, 1, 256])

        self.assertEqual(input_ids["body"].dim(), 2)
        self.assertEqual(list(input_ids["body"].size()), [2, 128])

        self.assertEqual(input_ids["ocr"].dim(), 2)
        self.assertEqual(list(input_ids["ocr"].size()), [2, 128])

        position_ids = transformer_input["position_ids"]
        test_utils.compare_tensors(position_ids["image"],
                                   torch.tensor([[0], [0]]))
        test_utils.compare_tensors(
            position_ids["body"],
            torch.arange(0, 128).unsqueeze(0).expand((2, 128)))
        test_utils.compare_tensors(
            position_ids["ocr"],
            torch.arange(0, 128).unsqueeze(0).expand((2, 128)))

        masks = transformer_input["masks"]
        test_utils.compare_tensors(masks["image"], torch.tensor([[1], [1]]))
        test_utils.compare_tensors(masks["body"], torch.ones((2, 128)).long())
        test_utils.compare_tensors(masks["ocr"], torch.ones((2, 128)).long())

        segment_ids = transformer_input["segment_ids"]
        test_utils.compare_tensors(segment_ids["image"],
                                   torch.tensor([[0], [0]]))
        test_utils.compare_tensors(segment_ids["body"],
                                   torch.ones((2, 128)).long())
        test_utils.compare_tensors(
            segment_ids["ocr"],
            torch.full((2, 128), dtype=torch.long, fill_value=2).long(),
        )

        mlm_labels = transformer_input["mlm_labels"]
        self.assertEqual(list(mlm_labels["combined_labels"].size()), [2, 257])
        # -2 is for image negative labels
        self.assertEqual(mlm_labels["combined_labels"].sum().item(),
                         lm_labels_sum - 2)
    def test_one_dim_feature_preprocessing(self):
        modalities_config = [
            self._image_modality_config, self._text_modality_config
        ]
        config = MMFTransformer.Config(modalities=modalities_config,
                                       num_labels=2)
        mmft = build_model(config)

        sample_list = SampleList()
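        # The image is a single 1-D feature vector per sample; preprocessing
        # should expand it to a length-1 sequence of dim-256 features.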
        sample_list.image = torch.rand(2, 256)
        sample_list.text = torch.randint(0, 512, (2, 128))

        transformer_input = mmft.preprocess_sample(sample_list)
        input_ids = transformer_input["input_ids"]
        self.assertEqual(input_ids["image"].dim(), 3)
        self.assertEqual(list(input_ids["image"].size()), [2, 1, 256])

        self.assertEqual(input_ids["text"].dim(), 2)
        self.assertEqual(list(input_ids["text"].size()), [2, 128])

        position_ids = transformer_input["position_ids"]
        test_utils.compare_tensors(position_ids["image"],
                                   torch.tensor([[0], [0]]))
        test_utils.compare_tensors(
            position_ids["text"],
            torch.arange(0, 128).unsqueeze(0).expand((2, 128)))

        masks = transformer_input["masks"]
        masks = mmft._infer_masks(sample_list, input_ids)
        test_utils.compare_tensors(masks["image"], torch.tensor([[1], [1]]))
        test_utils.compare_tensors(masks["text"], torch.ones((2, 128)).long())

        segment_ids = transformer_input["segment_ids"]
        test_utils.compare_tensors(segment_ids["image"],
                                   torch.tensor([[0], [0]]))
        test_utils.compare_tensors(segment_ids["text"],
                                   torch.ones((2, 128)).long())

        mlm_labels = transformer_input["mlm_labels"]
        test_utils.compare_tensors(
            mlm_labels["combined_labels"],
            torch.full((2, 129), dtype=torch.long, fill_value=-1),
        )
    def test_preprocessing_with_mvit_encoder(self):
        encoder_config = OmegaConf.create({
            "name": "pytorchvideo",
            "model_name": "mvit_base_32x3",
            "random_init": True,
            "drop_last_n_layers": 0,
            "pooler_name": "cls",
            "spatial_size": 224,
            "temporal_size": 8,
            "head": None,
            "embed_dim_mul": [[1, 2.0], [3, 2.0], [14, 2.0]],
            "atten_head_mul": [[1, 2.0], [3, 2.0], [14, 2.0]],
            "pool_q_stride_size": [[1, 1, 2, 2], [3, 1, 2, 2], [14, 1, 2, 2]],
            "pool_kv_stride_adaptive": [1, 8, 8],
            "pool_kvq_kernel": [3, 3, 3],
        })
        self._image_modality_config = MMFTransformerModalityConfig(
            type="image",
            key="image",
            embedding_dim=768,
            position_dim=1,
            segment_id=0,
            encoder=encoder_config,
        )
        modalities_config = [
            self._image_modality_config, self._text_modality_config
        ]
        config = MMFTransformer.Config(modalities=modalities_config,
                                       num_labels=2)
        mmft = build_model(config)

        sample_list = SampleList()
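        # Video input: (batch, channels, frames, height, width), matching
        # the encoder's temporal_size=8 and spatial_size=224.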
        sample_list.image = torch.rand((2, 3, 8, 224, 224))
        sample_list.text = torch.randint(0, 512, (2, 128))

        transformer_input = mmft.preprocess_sample(sample_list)
        input_ids = transformer_input["input_ids"]
        self.assertEqual(input_ids["image"].dim(), 3)
        self.assertEqual(list(input_ids["image"].size()), [2, 1, 768])

        self.assertEqual(input_ids["text"].dim(), 2)
        self.assertEqual(list(input_ids["text"].size()), [2, 128])

        position_ids = transformer_input["position_ids"]
        test_utils.compare_tensors(position_ids["image"],
                                   torch.tensor([[0], [0]]))
        test_utils.compare_tensors(
            position_ids["text"],
            torch.arange(0, 128).unsqueeze(0).expand((2, 128)))

        masks = transformer_input["masks"]
        test_utils.compare_tensors(masks["image"], torch.tensor([[1], [1]]))
        test_utils.compare_tensors(masks["text"], torch.ones((2, 128)).long())

        segment_ids = transformer_input["segment_ids"]
        test_utils.compare_tensors(segment_ids["image"],
                                   torch.tensor([[0], [0]]))
        test_utils.compare_tensors(segment_ids["text"],
                                   torch.ones((2, 128)).long())