def test_beam_search(self):
    """Compare decoded captions against fixed token lists per batch size.

    For batches of 1, 2, 8 and 16 randomly generated samples, the first
    caption returned by the model (with leading/trailing zeros trimmed)
    must equal the hard-coded sequence for that batch size.

    NOTE(review): the expected sequences presumably rely on an RNG seed
    fixed outside this method (e.g. in ``setUp``) -- confirm.
    """
    vocab = text_utils.VocabFromText(self.VOCAB_EXAMPLE_SENTENCES)
    decoder = TestDecoderModel(self.config.model_config.butd, vocab)
    decoder.build()
    decoder.eval()

    expected_tokens = {
        1: [1.0, 23.0, 1.0, 24.0, 29.0, 37.0, 40.0, 17.0, 29.0, 2.0],
        2: [1.0, 0.0, 8.0, 1.0, 28.0, 25.0, 2.0],
        8: [1.0, 34.0, 1.0, 13.0, 1.0, 2.0],
        16: [1.0, 25.0, 18.0, 2.0],
    }

    def make_sample():
        # One "coco"/"test" sample with random image features and
        # all-zero answers.
        item = Sample()
        item.dataset_name = "coco"
        item.dataset_type = "test"
        item.image_feature_0 = torch.randn(100, 2048)
        item.answers = torch.zeros((5, 10), dtype=torch.long)
        return item

    # Explicit ordering: the random features are drawn sequentially, so
    # the batch sizes must be visited in this order.
    for batch_size in (1, 2, 8, 16):
        batch = SampleList([make_sample() for _ in range(batch_size)])
        captions = decoder(batch)["captions"]
        first_caption = np.trim_zeros(captions[0].tolist())
        self.assertEqual(first_caption, expected_tokens[batch_size])
def test_caption_bleu4(self):
    """Check ``CaptionBleu4Metric`` on a full match and a partial match.

    Loads the COCO dataset defaults, points the caption processor at the
    test vocab file with a "random" vocab type, registers the processor
    as ``coco_caption_processor``, then evaluates the metric on synthetic
    score tensors.
    """
    this_file = os.path.abspath(__file__)

    # ``abspath`` collapses the ".." components, so joining onto the file
    # path itself (rather than its directory) is intentional here.
    defaults_path = os.path.join(
        this_file, "../../../mmf/configs/datasets/coco/defaults.yaml"
    )
    config = load_yaml(os.path.abspath(defaults_path))
    caption_processor_config = (
        config.dataset_config.coco.processors.caption_processor
    )

    vocab_file = os.path.join(this_file, "..", "..", "data", "vocab.txt")
    caption_processor_config.params.vocab.type = "random"
    caption_processor_config.params.vocab.vocab_file = os.path.abspath(
        vocab_file
    )

    processor = CaptionProcessor(caption_processor_config.params)
    registry.register("coco_caption_processor", processor)

    caption_bleu4 = metrics.CaptionBleu4Metric()
    expected = Sample()
    predicted = {}

    # Complete match: every position scores highest at index 4, and all
    # reference answers are filled with 4.
    expected.answers = torch.empty((5, 5, 10)).fill_(4)
    full_scores = torch.zeros((5, 10, 19))
    full_scores[:, :, 4] = 1.0
    predicted["scores"] = full_scores
    full_bleu = caption_bleu4.calculate(expected, predicted).item()
    self.assertEqual(full_bleu, 1.0)

    # Partial match: only the first five positions pick index 4; the
    # remaining positions pick index 18.
    expected.answers = torch.empty((5, 5, 10)).fill_(4)
    partial_scores = torch.zeros((5, 10, 19))
    partial_scores[:, 0:5, 4] = 1.0
    partial_scores[:, 5:, 18] = 1.0
    predicted["scores"] = partial_scores
    partial_bleu = caption_bleu4.calculate(expected, predicted).item()
    self.assertAlmostEqual(partial_bleu, 0.3928, 4)
def __getitem__(self, idx):
    """Build the ``Sample`` for annotation ``idx`` (raw-image variant).

    The sample carries the processed plot text, optionally the first
    image stored for this index, and the processed genre answers and
    their scores as ``answers`` / ``targets``.
    """
    sample_info = self.annotation_db[idx]

    # The plot may be stored as a list; only its first entry is used.
    plot = sample_info["plot"]
    if isinstance(plot, list):
        plot = plot[0]
    text_output = self.text_processor({"text": plot})

    current_sample = Sample()
    current_sample.text = text_output["text"]
    # When the processor also emits "input_ids", copy all of its outputs
    # onto the sample.
    if "input_ids" in text_output:
        current_sample.update(text_output)

    if self._use_images is True:
        image_entry = self.image_db[idx]
        current_sample.image = image_entry["images"][0]

    answer_output = self.answer_processor({"answers": sample_info["genres"]})
    current_sample.answers = answer_output["answers"]
    current_sample.targets = answer_output["answers_scores"]
    return current_sample
def test_nucleus_sampling(self):
    """Compare the caption decoded for one random sample to fixed tokens.

    The expected token list depends on the installed PyTorch version (see
    the comment at the version check below).
    """
    vocab = text_utils.VocabFromText(self.VOCAB_EXAMPLE_SENTENCES)
    decoder = TestDecoderModel(self.config.model_config.butd, vocab)
    decoder.build()
    decoder.eval()

    sample = Sample()
    sample.dataset_name = "coco"
    sample.dataset_type = "test"
    sample.image_feature_0 = torch.randn(100, 2048)
    sample.answers = torch.zeros((5, 10), dtype=torch.long)

    captions = decoder(SampleList([sample]))["captions"]

    # These are the expected tokens for sum_threshold = 0.5.
    # Because of a bug fix in https://github.com/pytorch/pytorch/pull/47386
    # torch.Tensor.multinomial generates a different random sequence on
    # newer PyTorch releases.
    # TODO: Remove this hack after OSS uses a later version of PyTorch.
    after_multinomial_fix = LegacyVersion(torch.__version__) > LegacyVersion(
        "1.7.1"
    )
    if not after_multinomial_fix:
        expected_tokens = [
            1.0,
            29.0,
            11.0,
            11.0,
            39.0,
            10.0,
            31.0,
            4.0,
            19.0,
            39.0,
            2.0,
        ]
    else:
        expected_tokens = [1.0, 23.0, 38.0, 30.0, 5.0, 11.0, 2.0]

    self.assertEqual(captions[0].tolist(), expected_tokens)
def __getitem__(self, idx):
    """Build the ``Sample`` for question ``idx``.

    Each call returns a ``Sample`` object; the dataloader's batch collator
    later combines these into a ``SampleList``, which is essentially an
    attribute-based batch.

    The sample carries:
      * ``text``: processed question tokens,
      * ``answers`` / ``targets``: processed answer and its scores,
      * ``image``: float32 tensor of the RGB image scaled by 1/255, with
        the last array axis moved to the front.
    """
    data = self.questions[idx]
    current_sample = Sample()

    question = data["question"]
    tokens = tokenize(question, keep=[";", ","], remove=["?", "."])
    processed = self.text_processor({"tokens": tokens})
    current_sample.text = processed["text"]

    processed = self.answer_processor({"answers": [data["answer"]]})
    current_sample.answers = processed["answers"]
    current_sample.targets = processed["answers_scores"]

    image_path = os.path.join(self.image_path, data["image_filename"])
    # Open the file in a context manager so its handle is released
    # deterministically; ``convert`` returns an independent in-memory
    # image that remains valid after the source file is closed.
    with Image.open(image_path) as source_image:
        rgb_image = source_image.convert("RGB")

    # Scale pixel values by 1/255 and store them as float32.
    image = np.true_divide(rgb_image, 255)
    image = image.astype(np.float32)
    # Move the last axis to the front (presumably HWC -> CHW for a PIL
    # RGB array) before wrapping the array as a tensor.
    current_sample.image = torch.from_numpy(image.transpose(2, 0, 1))

    return current_sample
def load_item(self, idx):
    """Build the ``Sample`` for annotation ``idx``.

    Only the first caption of the annotation is processed. The sample
    carries the processed caption as ``text``, its length as
    ``caption_len``, the ``image_id`` (as an int tensor when the raw id
    is an ``int``, otherwise unchanged), optional feature entries, and
    the caption stacked into ``answers``.
    """
    sample_info = self.annotation_db[idx]
    first_caption = sample_info["captions"][0]
    caption_text = self.text_processor({"text": first_caption})["text"]

    current_sample = Sample()
    current_sample.text = caption_text
    current_sample.caption_len = torch.tensor(
        len(caption_text), dtype=torch.int
    )

    image_id = sample_info["image_id"]
    current_sample.image_id = (
        torch.tensor(image_id, dtype=torch.int)
        if isinstance(image_id, int)
        else image_id
    )

    if self._use_features is True:
        current_sample.update(self.features_db[idx])

    # Stacking a one-element list adds a leading dimension of size 1.
    current_sample.answers = torch.stack([caption_text])
    return current_sample
def __getitem__(self, idx):
    """Build the ``Sample`` for annotation ``idx`` (features variant).

    The sample carries the processed plot text, optionally the feature
    entries stored for this index, and the processed genre answers and
    their scores as ``answers`` / ``targets``.
    """
    sample_info = self.annotation_db[idx]

    # The plot may be stored as a list; only its first entry is used.
    plot = sample_info["plot"]
    if isinstance(plot, list):
        plot = plot[0]
    text_output = self.text_processor({"text": plot})

    current_sample = Sample()
    current_sample.text = text_output["text"]
    # When the processor also emits "input_ids", copy all of its outputs
    # onto the sample.
    if "input_ids" in text_output:
        current_sample.update(text_output)

    if self._use_features is True:
        features = self.features_db[idx]
        # The bbox processor is optional; when the dataset has one, it
        # rewrites "image_info_0" before the features are merged in.
        if hasattr(self, "transformer_bbox_processor"):
            features["image_info_0"] = self.transformer_bbox_processor(
                features["image_info_0"]
            )
        current_sample.update(features)

    answer_output = self.answer_processor({"answers": sample_info["genres"]})
    current_sample.answers = answer_output["answers"]
    current_sample.targets = answer_output["answers_scores"]
    return current_sample