def test_tie_mlm_head_weight_to_encoder(self):
    self._text_modality_config = MMFTransformerModalityConfig(
        type="text",
        key="text",
        embedding_dim=768,
        position_dim=128,
        segment_id=0,
        encoder=TextEncoderFactory.Config(type=TextEncoderTypes.transformer),
    )
    heads = [MLM.Config()]
    modalities_config = [self._image_modality_config, self._text_modality_config]
    config = MMFTransformer.Config(
        heads=heads,
        modalities=modalities_config,
        num_labels=2,
        tie_weight_to_encoder="text",
    )
    mmft = build_model(config)

    # After tying, the MLM decoder weight must equal the text encoder's
    # word-embedding weight.
    self.assertTrue(
        test_utils.compare_tensors(
            mmft.heads[0].cls.predictions.decoder.weight,
            mmft.encoders["text"].embeddings.word_embeddings.weight,
        )
    )
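
# Hedged sketch (illustrative, not MMF's implementation): tying the MLM head
# to the encoder is expected to make the decoder and the word-embedding table
# share a single Parameter, which is why the tensors above compare equal.
def _weight_tying_sketch():
    from torch import nn

    embeddings = nn.Embedding(30522, 768)  # hypothetical vocab_size x hidden
    decoder = nn.Linear(768, 30522, bias=False)
    decoder.weight = embeddings.weight  # one shared Parameter, not a copy
    assert decoder.weight is embeddings.weight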
def test_report_copy(self):
    original_report = self._build_report()
    report_copy = original_report.copy()

    report_copy["scores"].zero_()

    self.assertFalse(
        test_utils.compare_tensors(report_copy["scores"], original_report["scores"])
    )
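
# Hedged sketch of the semantics asserted above: a copy that merely aliased
# tensor storage would let zero_() on the copy clobber the original, so
# Report.copy() must clone tensor fields. In plain PyTorch:
def _tensor_copy_sketch():
    import torch

    original = torch.ones(3)
    independent = original.clone()  # separate storage, not an alias
    independent.zero_()
    assert original.sum().item() == 3.0  # original untouched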
def test_multi_hot_answer_from_vocab_processor(self):
    config = self._get_config("../../../mmf/configs/datasets/clevr/defaults.yaml")
    clevr_config = config.dataset_config.clevr
    answer_processor_config = clevr_config.processors.answer_processor

    # Test the num_answers == 1 case
    vocab_path = os.path.join(
        os.path.abspath(__file__), "..", "..", "data", "vocab.txt"
    )
    answer_processor_config.params.vocab_file = os.path.abspath(vocab_path)
    answer_processor = MultiHotAnswerFromVocabProcessor(
        answer_processor_config.params
    )

    processed = answer_processor({"answers": ["helmet"]})
    answers_indices = processed["answers_indices"]
    answers_scores = processed["answers_scores"]
    self.assertTrue(
        compare_tensors(answers_indices, torch.tensor([5] * 10, dtype=torch.long))
    )
    expected_answers_scores = torch.zeros(19, dtype=torch.float)
    expected_answers_scores[5] = 1.0
    self.assertTrue(compare_tensors(answers_scores, expected_answers_scores))

    # Test the multi-hot case when num_answers is greater than 1
    answer_processor_config.params.vocab_file = os.path.abspath(vocab_path)
    answer_processor_config.params.num_answers = 3
    answer_processor = MultiHotAnswerFromVocabProcessor(
        answer_processor_config.params
    )
    processed = answer_processor({"answers": ["man", "with", "countryside"]})
    answers_indices = processed["answers_indices"]
    answers_scores = processed["answers_scores"]
    self.assertTrue(
        compare_tensors(
            answers_indices,
            torch.tensor([2, 3, 15, 2, 3, 15, 2, 3, 15, 2], dtype=torch.long),
        )
    )
    expected_answers_scores = torch.zeros(19, dtype=torch.float)
    expected_answers_scores[2] = 1.0
    expected_answers_scores[3] = 1.0
    expected_answers_scores[15] = 1.0
    self.assertTrue(compare_tensors(answers_scores, expected_answers_scores))

    # Test that out-of-vocab answers map to the UNK index (0)
    processed = answer_processor({"answers": ["test", "answer", "man"]})
    answers_indices = processed["answers_indices"]
    answers_scores = processed["answers_scores"]
    self.assertTrue(
        compare_tensors(
            answers_indices,
            torch.tensor([0, 0, 2, 0, 0, 2, 0, 0, 2, 0], dtype=torch.long),
        )
    )
    expected_answers_scores = torch.zeros(19, dtype=torch.float)
    expected_answers_scores[2] = 1.0
    self.assertTrue(compare_tensors(answers_scores, expected_answers_scores))
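
# Hedged sketch (not the processor's actual code): the expectations above are
# consistent with a multi-hot score vector that writes 1.0 at each answer's
# vocab index and then clears the UNK index, so out-of-vocab answers never
# count as positives. `vocab_size=19` and `unk_index=0` mirror the test data.
def _multi_hot_sketch(answer_indices, vocab_size=19, unk_index=0):
    import torch

    scores = torch.zeros(vocab_size, dtype=torch.float)
    scores[torch.tensor(answer_indices, dtype=torch.long)] = 1.0
    scores[unk_index] = 0.0
    return scores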
def test_call(self):
    batch_collator = BatchCollator("vqa2", "train")
    sample_list = test_utils.build_random_sample_list()
    sample_list = batch_collator(sample_list)

    # Test the already-built SampleList case
    self.assertEqual(sample_list.dataset_name, "vqa2")
    self.assertEqual(sample_list.dataset_type, "train")

    sample = Sample()
    sample.a = torch.tensor([1, 2], dtype=torch.int)

    # Test the list-of-samples case
    sample_list = batch_collator([sample, sample])
    self.assertTrue(
        test_utils.compare_tensors(
            sample_list.a, torch.tensor([[1, 2], [1, 2]], dtype=torch.int)
        )
    )

    # Test the IterableDataset case
    sample_list = test_utils.build_random_sample_list()
    new_sample_list = batch_collator([sample_list])
    self.assertEqual(new_sample_list, sample_list)
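
# Hedged sketch of the collation behavior exercised above: each field shared
# by all samples is stacked along a new leading batch dimension, so two
# samples with a == tensor([1, 2]) collate to a [2, 2] batch.
def _collate_field_sketch(samples, key):
    import torch

    return torch.stack([sample[key] for sample in samples], dim=0)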
def test_preprocessing_with_resnet_encoder(self):
    self._image_modality_config = MMFTransformerModalityConfig(
        type="image",
        key="image",
        embedding_dim=2048,
        position_dim=1,
        segment_id=0,
        encoder=ImageEncoderFactory.Config(
            type=ImageEncoderTypes.resnet152,
            params=ResNet152ImageEncoder.Config(pretrained=False),
        ),
    )
    modalities_config = [self._image_modality_config, self._text_modality_config]
    config = MMFTransformer.Config(modalities=modalities_config, num_labels=2)
    mmft = build_model(config)

    sample_list = SampleList()
    sample_list.image = torch.rand(2, 3, 224, 224)
    sample_list.text = torch.randint(0, 512, (2, 128))

    transformer_input = mmft.preprocess_sample(sample_list)
    input_ids = transformer_input["input_ids"]
    self.assertEqual(input_ids["image"].dim(), 3)
    self.assertEqual(list(input_ids["image"].size()), [2, 1, 2048])
    self.assertEqual(input_ids["text"].dim(), 2)
    self.assertEqual(list(input_ids["text"].size()), [2, 128])

    position_ids = transformer_input["position_ids"]
    self.assertTrue(
        test_utils.compare_tensors(position_ids["image"], torch.tensor([[0], [0]]))
    )
    self.assertTrue(
        test_utils.compare_tensors(
            position_ids["text"], torch.arange(0, 128).unsqueeze(0).expand((2, 128))
        )
    )

    masks = transformer_input["masks"]
    self.assertTrue(
        test_utils.compare_tensors(masks["image"], torch.tensor([[1], [1]]))
    )
    self.assertTrue(
        test_utils.compare_tensors(masks["text"], torch.ones((2, 128)).long())
    )

    segment_ids = transformer_input["segment_ids"]
    self.assertTrue(
        test_utils.compare_tensors(segment_ids["image"], torch.tensor([[0], [0]]))
    )
    self.assertTrue(
        test_utils.compare_tensors(segment_ids["text"], torch.ones((2, 128)).long())
    )
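
# Hedged sketch of the [2, 1, 2048] expectation above: a pooled 2048-d ResNet
# feature per image is given a singleton sequence dimension so it acts as a
# single "token" for the transformer.
def _pooled_to_sequence_sketch():
    import torch

    pooled = torch.rand(2, 2048)    # batch of pooled image features
    sequence = pooled.unsqueeze(1)  # [batch, 1, feature_dim]
    assert list(sequence.size()) == [2, 1, 2048]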
def test_custom_feature_and_mask_preprocessing(self):
    extra_modality = MMFTransformerModalityConfig(
        type="my_random_feature",
        key="my_random_feature",
        embedding_dim=128,
        position_dim=4,
        segment_id=3,
        encoder=EncoderFactory.Config(type="identity"),
    )
    modalities_config = [
        self._image_modality_config,
        self._text_modality_config,
        extra_modality,
    ]
    config = MMFTransformer.Config(modalities=modalities_config, num_labels=2)
    mmft = build_model(config)

    sample_list = SampleList()
    sample_list.image = torch.rand(2, 256)
    sample_list.text = torch.randint(0, 512, (2, 128))
    sample_list.text_mask = torch.ones(2, 128)
    sample_list.text_mask[:, 70:] = 0
    sample_list.my_random_feature = torch.rand(2, 4, 128)
    sample_list.my_random_feature_mask = torch.ones(2, 4)
    sample_list.my_random_feature_mask[:, 3:] = 0

    transformer_input = mmft.preprocess_sample(sample_list)
    input_ids = transformer_input["input_ids"]
    self.assertEqual(input_ids["image"].dim(), 3)
    self.assertEqual(list(input_ids["image"].size()), [2, 1, 256])
    self.assertEqual(input_ids["text"].dim(), 2)
    self.assertEqual(list(input_ids["text"].size()), [2, 128])
    self.assertEqual(input_ids["my_random_feature"].dim(), 3)
    self.assertEqual(list(input_ids["my_random_feature"].size()), [2, 4, 128])

    position_ids = transformer_input["position_ids"]
    self.assertTrue(
        test_utils.compare_tensors(position_ids["image"], torch.tensor([[0], [0]]))
    )
    self.assertTrue(
        test_utils.compare_tensors(
            position_ids["text"], torch.arange(0, 128).unsqueeze(0).expand((2, 128))
        )
    )
    self.assertTrue(
        test_utils.compare_tensors(
            position_ids["my_random_feature"],
            torch.arange(0, 4).unsqueeze(0).expand((2, 4)),
        )
    )

    masks = transformer_input["masks"]
    self.assertTrue(
        test_utils.compare_tensors(masks["image"], torch.tensor([[1], [1]]))
    )
    # text_mask keeps 70 positions per row over a batch of 2 -> 140 ones;
    # my_random_feature_mask keeps 3 positions per row -> 6 ones.
    self.assertEqual(masks["text"].sum().item(), 140)
    self.assertEqual(masks["my_random_feature"].sum().item(), 6)

    segment_ids = transformer_input["segment_ids"]
    self.assertTrue(
        test_utils.compare_tensors(segment_ids["image"], torch.tensor([[0], [0]]))
    )
    self.assertTrue(
        test_utils.compare_tensors(segment_ids["text"], torch.ones((2, 128)).long())
    )
    self.assertTrue(
        test_utils.compare_tensors(
            segment_ids["my_random_feature"],
            torch.full((2, 4), fill_value=3, dtype=torch.long),
        )
    )
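
# Hedged sketch of the mask arithmetic checked above: zeroing the tail of a
# [batch, length] binary mask keeps `kept` positions per row, so the sum is
# batch * kept (2 * 70 = 140 for text, 2 * 3 = 6 for the custom feature).
def _tail_mask_sum_sketch(batch=2, length=128, kept=70):
    import torch

    mask = torch.ones(batch, length)
    mask[:, kept:] = 0
    return int(mask.sum().item())  # == batch * kept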
def _compare_processed_for_multimodality(self, transformer_input, lm_labels_sum=0):
    input_ids = transformer_input["input_ids"]
    self.assertEqual(input_ids["image"].dim(), 3)
    self.assertEqual(list(input_ids["image"].size()), [2, 1, 256])
    self.assertEqual(input_ids["body"].dim(), 2)
    self.assertEqual(list(input_ids["body"].size()), [2, 128])
    self.assertEqual(input_ids["ocr"].dim(), 2)
    self.assertEqual(list(input_ids["ocr"].size()), [2, 128])

    position_ids = transformer_input["position_ids"]
    self.assertTrue(
        test_utils.compare_tensors(position_ids["image"], torch.tensor([[0], [0]]))
    )
    self.assertTrue(
        test_utils.compare_tensors(
            position_ids["body"], torch.arange(0, 128).unsqueeze(0).expand((2, 128))
        )
    )
    self.assertTrue(
        test_utils.compare_tensors(
            position_ids["ocr"], torch.arange(0, 128).unsqueeze(0).expand((2, 128))
        )
    )

    masks = transformer_input["masks"]
    self.assertTrue(
        test_utils.compare_tensors(masks["image"], torch.tensor([[1], [1]]))
    )
    self.assertTrue(
        test_utils.compare_tensors(masks["body"], torch.ones((2, 128)).long())
    )
    self.assertTrue(
        test_utils.compare_tensors(masks["ocr"], torch.ones((2, 128)).long())
    )

    segment_ids = transformer_input["segment_ids"]
    self.assertTrue(
        test_utils.compare_tensors(segment_ids["image"], torch.tensor([[0], [0]]))
    )
    self.assertTrue(
        test_utils.compare_tensors(segment_ids["body"], torch.ones((2, 128)).long())
    )
    self.assertTrue(
        test_utils.compare_tensors(
            segment_ids["ocr"],
            torch.full((2, 128), fill_value=2, dtype=torch.long),
        )
    )

    mlm_labels = transformer_input["mlm_labels"]
    # 257 = 1 image position + 128 body tokens + 128 ocr tokens per sample.
    self.assertEqual(list(mlm_labels["combined_labels"].size()), [2, 257])
    # Each sample's single image position carries the ignore label -1, so a
    # batch of 2 contributes -2 on top of the text labels' sum.
    self.assertEqual(mlm_labels["combined_labels"].sum().item(), lm_labels_sum - 2)
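
# Hedged sketch of the label convention assumed above: positions excluded from
# the MLM loss carry the label -1 (an ignore_index-style convention), so the
# single image position per sample shifts the batch-of-2 label sum by -2.
def _ignore_label_sum_sketch():
    import torch

    image_labels = torch.full((2, 1), -1, dtype=torch.long)  # one per sample
    assert image_labels.sum().item() == -2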
def test_one_dim_feature_preprocessing(self):
    modalities_config = [self._image_modality_config, self._text_modality_config]
    config = MMFTransformer.Config(modalities=modalities_config, num_labels=2)
    mmft = build_model(config)

    sample_list = SampleList()
    sample_list.image = torch.rand(2, 256)
    sample_list.text = torch.randint(0, 512, (2, 128))

    transformer_input = mmft.preprocess_sample(sample_list)
    input_ids = transformer_input["input_ids"]
    self.assertEqual(input_ids["image"].dim(), 3)
    self.assertEqual(list(input_ids["image"].size()), [2, 1, 256])
    self.assertEqual(input_ids["text"].dim(), 2)
    self.assertEqual(list(input_ids["text"].size()), [2, 128])

    position_ids = transformer_input["position_ids"]
    self.assertTrue(
        test_utils.compare_tensors(position_ids["image"], torch.tensor([[0], [0]]))
    )
    self.assertTrue(
        test_utils.compare_tensors(
            position_ids["text"], torch.arange(0, 128).unsqueeze(0).expand((2, 128))
        )
    )

    # No explicit masks were provided, so mask inference defaults to all-ones.
    masks = mmft._infer_masks(sample_list, input_ids)
    self.assertTrue(
        test_utils.compare_tensors(masks["image"], torch.tensor([[1], [1]]))
    )
    self.assertTrue(
        test_utils.compare_tensors(masks["text"], torch.ones((2, 128)).long())
    )

    segment_ids = transformer_input["segment_ids"]
    self.assertTrue(
        test_utils.compare_tensors(segment_ids["image"], torch.tensor([[0], [0]]))
    )
    self.assertTrue(
        test_utils.compare_tensors(segment_ids["text"], torch.ones((2, 128)).long())
    )

    mlm_labels = transformer_input["mlm_labels"]
    self.assertTrue(
        test_utils.compare_tensors(
            mlm_labels["combined_labels"],
            torch.full((2, 129), fill_value=-1, dtype=torch.long),
        )
    )
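
# Hedged sketch of the mask inference exercised above (not MMF's code): honor
# an explicit `<key>_mask` field when the sample provides one, otherwise
# default to an all-ones mask over the modality's sequence positions.
def _infer_mask_sketch(sample_list, key, batch_size, seq_len):
    import torch

    mask = sample_list.get(f"{key}_mask")
    if mask is not None:
        return mask.long()
    return torch.ones(batch_size, seq_len, dtype=torch.long)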
def test_preprocessing_with_mvit_encoder(self):
    encoder_config = OmegaConf.create(
        {
            "name": "pytorchvideo",
            "model_name": "mvit_base_32x3",
            "random_init": True,
            "drop_last_n_layers": 0,
            "pooler_name": "cls",
            "spatial_size": 224,
            "temporal_size": 8,
            "head": None,
            "embed_dim_mul": [[1, 2.0], [3, 2.0], [14, 2.0]],
            "atten_head_mul": [[1, 2.0], [3, 2.0], [14, 2.0]],
            "pool_q_stride_size": [[1, 1, 2, 2], [3, 1, 2, 2], [14, 1, 2, 2]],
            "pool_kv_stride_adaptive": [1, 8, 8],
            "pool_kvq_kernel": [3, 3, 3],
        }
    )
    self._image_modality_config = MMFTransformerModalityConfig(
        type="image",
        key="image",
        embedding_dim=768,
        position_dim=1,
        segment_id=0,
        encoder=encoder_config,
    )
    modalities_config = [self._image_modality_config, self._text_modality_config]
    config = MMFTransformer.Config(modalities=modalities_config, num_labels=2)
    mmft = build_model(config)

    sample_list = SampleList()
    sample_list.image = torch.rand((2, 3, 8, 224, 224))
    sample_list.text = torch.randint(0, 512, (2, 128))

    transformer_input = mmft.preprocess_sample(sample_list)
    input_ids = transformer_input["input_ids"]
    self.assertEqual(input_ids["image"].dim(), 3)
    self.assertEqual(list(input_ids["image"].size()), [2, 1, 768])
    self.assertEqual(input_ids["text"].dim(), 2)
    self.assertEqual(list(input_ids["text"].size()), [2, 128])

    position_ids = transformer_input["position_ids"]
    self.assertTrue(
        test_utils.compare_tensors(position_ids["image"], torch.tensor([[0], [0]]))
    )
    self.assertTrue(
        test_utils.compare_tensors(
            position_ids["text"], torch.arange(0, 128).unsqueeze(0).expand((2, 128))
        )
    )

    masks = transformer_input["masks"]
    self.assertTrue(
        test_utils.compare_tensors(masks["image"], torch.tensor([[1], [1]]))
    )
    self.assertTrue(
        test_utils.compare_tensors(masks["text"], torch.ones((2, 128)).long())
    )

    segment_ids = transformer_input["segment_ids"]
    self.assertTrue(
        test_utils.compare_tensors(segment_ids["image"], torch.tensor([[0], [0]]))
    )
    self.assertTrue(
        test_utils.compare_tensors(segment_ids["text"], torch.ones((2, 128)).long())
    )
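
# Hedged sketch tying the encoder config to the input above: the clip tensor
# is laid out as [batch, channels, time, height, width], so temporal_size=8
# and spatial_size=224 in the config must match dims 2 and 3-4 of the input.
def _clip_config_consistency_sketch():
    import torch

    clip = torch.rand(2, 3, 8, 224, 224)
    temporal_size, spatial_size = 8, 224
    assert clip.size(2) == temporal_size
    assert clip.size(3) == clip.size(4) == spatial_size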