Example #1
    def add_sample_details(self, sample_info, sample):
        # 1. Load text (question words)
        # breaking change from VQA2Dataset:
        # load the entire question string, not tokenized questions, since we
        # switch to BERT tokenizer in M4C and do online tokenization
        question_str = (sample_info['question'] if 'question' in sample_info
                        else sample_info['question_str'])
        processed_question = self.text_processor({"question": question_str})
        sample.text = processed_question['token_inds']
        sample.text_len = processed_question['token_num']

        # 2. Load object
        # object bounding box information
        sample.obj_bbox_coordinates = self.copy_processor(
            {"blob": sample_info["obj_normalized_boxes"]})["blob"]

        # 3. Load OCR
        assert self.use_ocr and self.use_ocr_info, \
            'use_ocr and use_ocr_info must both be True for M4CTextVQADataset'
        # Preprocess OCR tokens
        ocr_tokens = [
            self.ocr_token_processor({"text": token})["text"]
            for token in sample_info["ocr_tokens"]
        ]
        # Get FastText embeddings for OCR tokens
        context = self.context_processor({"tokens": ocr_tokens})
        sample.context = context["text"]
        sample.context_tokens = context["tokens"]
        sample.context_tokens_enc = enc_obj2bytes(context["tokens"])
        sample.context_feature_0 = context["text"]
        sample.context_info_0 = Sample()
        sample.context_info_0.max_features = context["length"]
        # Get PHOC embeddings for OCR tokens
        context_phoc = self.phoc_processor({"tokens": ocr_tokens})
        sample.context_feature_1 = context_phoc["text"]
        sample.context_info_1 = Sample()
        sample.context_info_1.max_features = context_phoc["length"]
        # OCR order vectors
        # TODO remove order_vectors -- it is no longer needed in M4C
        order_vectors = np.eye(len(sample.context_tokens), dtype=np.float32)
        order_vectors = torch.from_numpy(order_vectors)
        order_vectors[context["length"]:] = 0
        sample.order_vectors = order_vectors
        # OCR bounding box information
        if 'ocr_normalized_boxes' in sample_info:
            # New imdb format: OCR bounding boxes are already pre-computed
            max_len = self.config.processors.answer_processor.params.max_length
            sample.ocr_bbox_coordinates = self.copy_processor(
                {"blob": sample_info['ocr_normalized_boxes']}
            )["blob"][:max_len]
        else:
            # Old imdb format: OCR bounding boxes are computed on-the-fly
            # from ocr_info
            sample.ocr_bbox_coordinates = self.bbox_processor(
                {"info": sample_info["ocr_info"]})["bbox"].coordinates
        # sample.iou_info = box_iou(sample.obj_bbox_coordinates, sample.ocr_bbox_coordinates)

        return sample
Example #2
    def load_item(self, idx):
        sample_info = self.imdb[idx]
        current_sample = Sample()

        if self._dataset_type != "test":
            text_processor_argument = {"tokens": sample_info["caption_tokens"]}
            processed_caption = self.text_processor(text_processor_argument)
            current_sample.text = processed_caption["text"]
            current_sample.caption_id = torch.tensor(sample_info["caption_id"],
                                                     dtype=torch.int)
            current_sample.caption_len = torch.tensor(
                len(sample_info["caption_tokens"]), dtype=torch.int)

        if isinstance(sample_info["image_id"], int):
            current_sample.image_id = torch.tensor(sample_info["image_id"],
                                                   dtype=torch.int)
        else:
            current_sample.image_id = sample_info["image_id"]

        if self._use_features is True:
            features = self.features_db[idx]
            current_sample.update(features)

        # Add reference captions to sample
        current_sample = self.add_reference_caption(sample_info,
                                                    current_sample)

        return current_sample
Example #3
    def load_item(self, idx):
        sample_info = self.imdb[idx]
        current_sample = Sample()

        text_processor_argument = {"tokens": sample_info["question_tokens"]}

        processed_question = self.text_processor(text_processor_argument)

        current_sample.text = processed_question["text"]
        current_sample.question_id = torch.tensor(sample_info["question_id"],
                                                  dtype=torch.int)

        if isinstance(sample_info["image_id"], int):
            current_sample.image_id = torch.tensor(sample_info["image_id"],
                                                   dtype=torch.int)
        else:
            current_sample.image_id = sample_info["image_id"]

        current_sample.text_len = torch.tensor(
            len(sample_info["question_tokens"]), dtype=torch.int)

        if self._use_features is True:
            features = self.features_db[idx]
            current_sample.update(features)

        # Add details for OCR like OCR bbox, vectors, tokens here
        current_sample = self.add_ocr_details(sample_info, current_sample)
        # Depending on whether we are using soft copy this can add
        # dynamic answer space
        current_sample = self.add_answer_info(sample_info, current_sample)

        return current_sample
Example #4
    def add_ocr_details(self, sample_info, sample):
        if self.use_ocr:
            # Preprocess OCR tokens
            ocr_tokens = [
                self.ocr_token_processor({"text": token})["text"]
                for token in sample_info["ocr_tokens"]
            ]
            # Get embeddings for tokens
            context = self.context_processor({"tokens": ocr_tokens})
            sample.context = context["text"]
            sample.context_tokens = context["tokens"]
            sample.context_feature_0 = context["text"]
            sample.context_info_0 = Sample()
            sample.context_info_0.max_features = context["length"]

            order_vectors = torch.eye(len(sample.context_tokens))
            order_vectors[context["length"] :] = 0
            sample.order_vectors = order_vectors

        if self.use_ocr_info and "ocr_info" in sample_info:
            sample.ocr_bbox = self.bbox_processor({"info": sample_info["ocr_info"]})[
                "bbox"
            ]

        return sample
Example #5
    def _load_objects(self, idx):
        image_info = self._get_image_info(idx)
        image_height = image_info["height"]
        image_width = image_info["width"]
        object_map = {}
        objects = []

        for obj in image_info["objects"]:
            obj["synsets"] = self.synset_processor({"tokens":
                                                    obj["synsets"]})["text"]
            obj["names"] = self.name_processor({"tokens":
                                                obj["names"]})["text"]
            obj["height"] = obj["h"] / image_height
            obj.pop("h")
            obj["width"] = obj["w"] / image_width
            obj.pop("w")
            obj["y"] /= image_height
            obj["x"] /= image_width
            obj["attributes"] = self.attribute_processor(
                {"tokens": obj["attributes"]})["text"]
            obj = Sample(obj)
            object_map[obj["object_id"]] = obj
            objects.append(obj)
        objects = SampleList(objects)

        return objects, object_map
Example #6
    def predict(self, img_paths, qud):
        """
        We enable batch prediction here
        :return:
        """
        with torch.no_grad():
            detectron_features = self.get_detectron_features(
                img_paths)  # a list of image features
            resnet_features = self.get_resnet_features(
                img_paths)  # [batch_size, 196, 2048]

            sample_list = []
            for i in range(len(detectron_features)):
                sample = Sample()
                processed_text = self.vqa_demo.text_processor({"text": qud})
                sample.text = processed_text["text"]
                sample.text_len = len(processed_text["tokens"])

                sample.image_feature_0 = detectron_features[i]
                sample.image_info_0 = Sample(
                    {"max_features": torch.tensor(100, dtype=torch.long)})
                sample.image_feature_1 = resnet_features[i]
                sample_list.append(sample)

            sample_list = SampleList(sample_list)
            sample_list = sample_list.to("cuda")

            scores = self.vqa_demo.pythia_model(sample_list)["scores"]
            scores = torch.nn.functional.softmax(scores, dim=1)
            actual, indices = scores.topk(5, dim=1)

            batch_probs = []
            batch_answers = []

            for i in range(scores.shape[0]):
                top_indices = indices[i]
                top_scores = actual[i]

                probs = []
                answers = []

                for idx, score in enumerate(top_scores):
                    probs.append(score.item())
                    answers.append(
                        self.vqa_demo.answer_processor.idx2word(
                            top_indices[idx].item()))
                batch_probs.append(probs)
                batch_answers.append(answers)

        # If memory becomes an issue, clear the caches here:
        # gc.collect()
        # torch.cuda.empty_cache()

        # Both returned lists have batch_size entries, e.g.
        # [[ans_1, ans_2], [ans_1, ans_2]]
        return batch_probs, batch_answers
Example #7
    def load_item(self, idx):
        sample_info = self.imdb[idx]
        sample_info = self.preprocess_sample_info(sample_info)
        current_sample = Sample()

        # breaking change from VQA2Dataset: load question_id
        current_sample.question_id = torch.tensor(sample_info["question_id"],
                                                  dtype=torch.int)

        if isinstance(sample_info["image_id"], int):
            current_sample.image_id = str(sample_info["image_id"])
        else:
            current_sample.image_id = sample_info["image_id"]

        if self._use_features is True:
            features = self.features_db[idx]
            current_sample.update(features)

        current_sample = self.add_sample_details(sample_info, current_sample)
        current_sample = self.add_answer_info(sample_info, current_sample)
        current_sample = self.add_anchor_graph(sample_info, current_sample)

        # only the 'max_features' key is needed
        # pop other keys to minimize data loading overhead
        for k in list(current_sample.image_info_0):
            if k != 'max_features':
                current_sample.image_info_0.pop(k)
        for k in list(current_sample.image_info_1):
            if k != 'max_features':
                current_sample.image_info_1.pop(k)

        return current_sample
Example #8
    def load_item(self, idx):
        sample_info = self.imdb[idx]
        current_sample = Sample()

        # breaking change from VQA2Dataset: load question_id
        current_sample.question_id = torch.tensor(sample_info["question_id"],
                                                  dtype=torch.int)

        if isinstance(sample_info["image_id"], int):
            current_sample.image_id = torch.tensor(sample_info["image_id"],
                                                   dtype=torch.int)
        else:
            current_sample.image_id = sample_info["image_id"]

        if self._use_features is True:
            features = self.features_db[idx]
            current_sample.update(features)

        current_sample = self.add_sample_details(sample_info, current_sample)
        current_sample = self.add_answer_info(sample_info, current_sample)
        current_sample['obj_ocr_edge_feat'] = torch.from_numpy(
            current_sample.image_info_2['obj_ocr_edge_feat']
        ).float()  # [100, 50, 5]
        current_sample['ocr_obj_edge_feat'] = torch.from_numpy(
            current_sample.image_info_2['ocr_obj_edge_feat']
        ).float()  # [50, 100, 5]
        # 'valid_answers' is absent for the test split; fall back to a placeholder
        try:
            current_sample['gt_answers'] = sample_info['valid_answers']
        except KeyError:
            current_sample['gt_answers'] = ['valid_answers']
        return current_sample
Example #9
    def test_forward(self):
        model_config = self.config.model_attributes.cnn_lstm

        cnn_lstm = CNNLSTM(model_config)
        cnn_lstm.build()
        cnn_lstm.init_losses_and_metrics()

        self.assertTrue(isinstance(cnn_lstm, torch.nn.Module))

        test_sample = Sample()
        test_sample.text = torch.randint(1, 79, (10, ), dtype=torch.long)
        test_sample.image = torch.randn(3, 320, 480)
        test_sample.targets = torch.randn(32)

        test_sample_list = SampleList([test_sample])
        test_sample_list.dataset_type = "train"
        test_sample_list.dataset_name = "clevr"
        output = cnn_lstm(test_sample_list)

        scores = output["scores"]
        loss = output["losses"]["train/clevr/logit_bce"]
        accuracy = output["metrics"]["train/clevr/accuracy"]

        np.testing.assert_almost_equal(loss.item(), 19.2635, decimal=4)
        np.testing.assert_almost_equal(accuracy.item(), 0)
        self.assertEqual(scores.size(), torch.Size((1, 32)))

        expected_scores = [
            -0.7598285675048828, -0.07029829174280167, -0.20382611453533173,
            -0.06990239024162292, 0.7965695858001709, 0.4730074405670166,
            -0.30569902062416077, 0.4244227707386017, 0.6511023044586182,
            0.2480515092611313, -0.5087617635726929, -0.7675772905349731,
            0.4361543357372284, 0.0018743239343166351, 0.6774630546569824,
            0.30618518590927124, -0.398895800113678, -0.13120117783546448,
            -0.4433199465274811, -0.25969570875167847, 0.6798790097236633,
            -0.34090861678123474, 0.0384102463722229, 0.2484571784734726,
            0.0456063412129879, -0.428459107875824, -0.026385333389043808,
            -0.1570669412612915, -0.2377825379371643, 0.3231588304042816,
            0.21098048985004425, -0.712349534034729
        ]

        np.testing.assert_almost_equal(scores[0].tolist(),
                                       expected_scores,
                                       decimal=5)
Example #10
    def add_ocr_details(self, sample_info, sample):
        assert self.use_ocr and self.use_ocr_info, \
            'use_ocr and use_ocr_info must both be True for Dataset'
        # Preprocess OCR tokens
        ocr_tokens = [
            self.ocr_token_processor({"text": token})["text"]
            for token in sample_info["ocr_tokens"]
        ]
        # Get FastText embeddings for tokens
        context = self.context_processor({"tokens": ocr_tokens})
        sample.context = context["text"]  # torch.Size([50, 300])
        sample.context_tokens = context["tokens"]
        sample.context_tokens_enc = enc_obj2bytes(context["tokens"])
        sample.context_feature_0 = context["text"]
        sample.context_info_0 = Sample()
        sample.context_info_0.max_features = context["length"]
        # Get PHOC embeddings for OCR tokens
        context_phoc = self.phoc_processor({"tokens": ocr_tokens})
        sample.context_phoc = context_phoc["text"]
        sample.context_info_phoc = Sample()
        sample.context_info_phoc.max_features = context_phoc["length"]

        # if 'ocr_normalized_boxes' in sample_info:
        #     max_len = self.config.processors.answer_processor.params.max_length
        #     sample.ocr_bbox = self.copy_processor(
        #         {"blob": sample_info['ocr_normalized_boxes']}
        #     )["blob"][:max_len]
        if "ocr_info" in sample_info:
            sample.ocr_bbox = self.bbox_processor({
                "info":
                sample_info["ocr_info"],
                "feats":
                context["text"],
                "img_id":
                sample.image_id,
                "obj_bbox":
                sample.obj_bbox
            })["bbox"]

        return sample
Example #11
    def predict(self, url, feat_name, get_features=False):
        with torch.no_grad():
            detectron_features = get_detectron_features([url],
                                                        self.detection_model,
                                                        False, feat_name,
                                                        self.cuda_device)
            # returns a single-element list
            detectron_features = detectron_features[0]

            sample = Sample()
            sample.dataset_name = "coco"
            sample.dataset_type = "test"
            sample.image_feature_0 = detectron_features
            sample.answers = torch.zeros((5, 10), dtype=torch.long)

            sample_list = SampleList([sample])
            sample_list = sample_list.to(self.cuda_device)

            tokens = self.caption_model(sample_list)["captions"]

        gc.collect()
        torch.cuda.empty_cache()

        if not get_features:
            return tokens
        else:
            return tokens, detectron_features
Example #12
    def test_nucleus_sampling(self):
        vocab = text_utils.VocabFromText(self.VOCAB_EXAMPLE_SENTENCES)

        model_config = self.config.model_attributes.butd
        model = TestDecoderModel(model_config, vocab)
        model.build()
        model.to("cuda")
        model.eval()

        sample = Sample()
        sample.dataset_name = "coco"
        sample.dataset_type = "test"
        sample.image_feature_0 = torch.randn(100, 2048)
        sample.answers = torch.zeros((5, 10), dtype=torch.long)
        sample_list = SampleList([sample])

        tokens = model(sample_list)["captions"]

        # these are expected tokens for sum_threshold = 0.5
        expected_tokens = [
            1.0000e+00, 2.9140e+03, 5.9210e+03, 2.2040e+03, 5.0550e+03,
            9.2240e+03, 4.5120e+03, 1.8200e+02, 3.6490e+03, 6.4090e+03,
            2.0000e+00
        ]

        self.assertEqual(tokens[0].tolist(), expected_tokens)
Example #13
def build_bbox_tensors(infos, max_length):
    num_bbox = min(max_length, len(infos))

    # After num_bbox, everything else should be zero
    coord_tensor = torch.zeros((max_length, 4), dtype=torch.float)
    width_tensor = torch.zeros(max_length, dtype=torch.float)
    height_tensor = torch.zeros(max_length, dtype=torch.float)
    bbox_types = ["xyxy"] * max_length

    infos = infos[:num_bbox]
    sample = Sample()

    for idx, info in enumerate(infos):
        bbox = info["bounding_box"]
        x = bbox["top_left_x"]
        y = bbox["top_left_y"]
        width = bbox["width"]
        height = bbox["height"]

        coord_tensor[idx][0] = x
        coord_tensor[idx][1] = y
        coord_tensor[idx][2] = x + width
        coord_tensor[idx][3] = y + height

        width_tensor[idx] = width
        height_tensor[idx] = height
    sample.coordinates = coord_tensor
    sample.width = width_tensor
    sample.height = height_tensor
    sample.bbox_types = bbox_types

    return sample
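A minimal usage sketch for the build_bbox_tensors helper above, assuming the function and its dependencies (torch, Sample) are importable. The info entries and their values are hypothetical; only the keys the loop actually reads ("bounding_box", "top_left_x", "top_left_y", "width", "height") are taken from the code.

# Hypothetical, normalized box entries purely for illustration.
infos = [
    {"bounding_box": {"top_left_x": 0.10, "top_left_y": 0.20,
                      "width": 0.30, "height": 0.15}},
    {"bounding_box": {"top_left_x": 0.55, "top_left_y": 0.60,
                      "width": 0.20, "height": 0.10}},
]

bbox_sample = build_bbox_tensors(infos, max_length=50)
print(bbox_sample.coordinates.shape)  # torch.Size([50, 4]); rows past len(infos) stay zero
print(bbox_sample.bbox_types[:2])     # ['xyxy', 'xyxy']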
Example #14
    def forward(self, images, image_scales, transitions=None):
        feature_list = self.encoder(images, image_scales)
        image_features = feature_list[0]
        assert len(feature_list) == 1, \
            'current model only supports batch size 1'

        sample = Sample()
        sample.dataset_name = "coco"
        sample.dataset_type = "test"
        sample.image_feature_0 = image_features
        # 'answers' seems to act as a placeholder here,
        # so its size should not matter
        sample.answers = torch.zeros((1, 10), dtype=torch.long)
        sample_list = SampleList([sample])
        sample_list = sample_list.to(device)
        if transitions is not None:
            sample_list.transitions = transitions

        output = self.decoder(sample_list)
        tokens = output['captions']
        caption = tokens.tolist()[0]
        caption = self.decoder.caption_processor(caption)['caption']

        return caption
Example #15
    def test_caption_bleu4(self):
        path = os.path.join(
            os.path.abspath(__file__),
            "../../../pythia/common/defaults/configs/datasets/captioning/coco.yml",
        )
        with open(os.path.abspath(path)) as f:
            config = yaml.load(f, Loader=yaml.FullLoader)

        config = ConfigNode(config)
        captioning_config = config.dataset_attributes.coco
        caption_processor_config = captioning_config.processors.caption_processor
        vocab_path = os.path.join(os.path.abspath(__file__), "..", "..",
                                  "data", "vocab.txt")
        caption_processor_config.params.vocab.vocab_file = os.path.abspath(
            vocab_path)
        caption_processor = CaptionProcessor(caption_processor_config.params)
        registry.register("coco_caption_processor", caption_processor)

        caption_bleu4 = metrics.CaptionBleu4Metric()
        expected = Sample()
        predicted = dict()

        # Test complete match
        expected.answers = torch.empty((5, 5, 10))
        expected.answers.fill_(4)
        predicted["scores"] = torch.zeros((5, 10, 19))
        predicted["scores"][:, :, 4] = 1.0

        self.assertEqual(
            caption_bleu4.calculate(expected, predicted).item(), 1.0)

        # Test partial match
        expected.answers = torch.empty((5, 5, 10))
        expected.answers.fill_(4)
        predicted["scores"] = torch.zeros((5, 10, 19))
        predicted["scores"][:, 0:5, 4] = 1.0

        self.assertAlmostEqual(
            caption_bleu4.calculate(expected, predicted).item(), 0.3928, 4)
Example #16
    def test_forward(self):
        model_config = self.config.model_attributes.cnn_lstm

        cnn_lstm = CNNLSTM(model_config)
        cnn_lstm.build()
        cnn_lstm.init_losses_and_metrics()

        self.assertTrue(isinstance(cnn_lstm, torch.nn.Module))

        test_sample = Sample()
        test_sample.text = torch.randint(1, 79, (10, ), dtype=torch.long)
        test_sample.image = torch.randn(3, 320, 480)
        test_sample.targets = torch.randn(32)

        test_sample_list = SampleList([test_sample])
        test_sample_list.dataset_type = "train"
        test_sample_list.dataset_name = "clevr"
        output = cnn_lstm(test_sample_list)

        scores = output["scores"]
        loss = output["losses"]["train/logit_bce"]
        accuracy = output["metrics"]["train/accuracy"]

        np.testing.assert_almost_equal(loss.item(), 23.4751, decimal=4)
        np.testing.assert_almost_equal(accuracy.item(), 0)
        self.assertEqual(scores.size(), torch.Size((1, 32)))

        expected_scores = [
            2.2298e-02, -2.4975e-01, -1.1960e-01, -5.0868e-01, -9.3013e-02,
            1.3202e-02, -1.7536e-01, -3.1180e-01, 1.5369e-01, 1.4900e-01,
            1.9006e-01, -1.9457e-01, 1.4924e-02, -1.1032e-01, 1.3777e-01,
            -3.6255e-01, -2.9327e-01, 5.6247e-04, -4.8732e-01, 4.0949e-01,
            -1.1069e-01, 2.9696e-01, 4.1903e-02, 6.7062e-02, 7.0094e-01,
            -1.9898e-01, -2.9502e-03, -3.9040e-01, 1.2218e-01, 3.7895e-02,
            2.4472e-02, 1.7213e-01
        ]
        np.testing.assert_almost_equal(scores[0].tolist(),
                                       expected_scores,
                                       decimal=5)
Example #17
	def get_item(self, idx):

		data = self.vqamb_data[idx]

		current_sample = Sample()

		# store question and image ids
		current_sample.img_id = data['id']
		current_sample.qa_id = data['qa_id']

		# process question
		question = data["question"]
		tokens = tokenize(question, remove=["?"], keep=["'s"])

		processed = self.text_processor({"tokens": tokens})
		current_sample.text = processed["text"]

		# process answers
		processed = self.answer_processor({"answers": [data['answer']]})
		current_sample.answers = processed["answers"]
		current_sample.targets = processed["answers_scores"][1:] # remove unknown index
		# Detectron features ----------------
		# TODO: read in detectron image instead if detectron is to be built
		detectron_path = self.detectron_folder + str(data['id'])
		if self.config.spatial:
			point = data['point']
			# current_sample.point = point
			detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
		detectron_path += '.pt'
		
		detectron_feat = torch.load(detectron_path, map_location=torch.device('cpu'))

		# Pad features to fixed length
		if self.config.pad_detectron:
			if detectron_feat.shape[0] > 100:
				detectron_feat = detectron_feat[:100]
			elif detectron_feat.shape[0] < 100:
				pad = torch.zeros(100 - detectron_feat.shape[0], detectron_feat.shape[1])
				detectron_feat = torch.cat([detectron_feat, pad], dim=0)

		current_sample.image_feature_0 = detectron_feat
		# ---------------------------------------------

		return current_sample
Example #18
    def predict(self, url, question):
        with torch.no_grad():
            detectron_features = self.get_detectron_features(url)
            resnet_features = self.get_resnet_features(url)

            sample = Sample()

            processed_text = self.text_processor({"text": question})
            sample.text = processed_text["text"]
            sample.text_len = len(processed_text["tokens"])

            sample.image_feature_0 = detectron_features
            sample.image_info_0 = Sample({
                "max_features": torch.tensor(100, dtype=torch.long)
            })

            sample.image_feature_1 = resnet_features

            sample_list = SampleList([sample])
            sample_list = sample_list.to("cuda")

            scores = self.pythia_model(sample_list)["scores"]
            scores = torch.nn.functional.softmax(scores, dim=1)
            actual, indices = scores.topk(5, dim=1)

            top_indices = indices[0]
            top_scores = actual[0]

            probs = []
            answers = []

            for idx, score in enumerate(top_scores):
                probs.append(score.item())
                answers.append(
                    self.answer_processor.idx2word(top_indices[idx].item())
                )

        gc.collect()
        torch.cuda.empty_cache()
        return probs, answers
Example #19
    def load_item(self, idx):
        sample_info = self.imdb[idx]
        sample_info = self.preprocess_sample_info(sample_info)
        current_sample = Sample()

        # breaking change from VQA2Dataset: load question_id
        current_sample.question_id = torch.tensor(
            sample_info["question_id"], dtype=torch.int
        )

        if isinstance(sample_info["image_id"], int):
            current_sample.image_id = str(sample_info["image_id"])
        else:
            current_sample.image_id = sample_info["image_id"]

        if self._use_features is True:
            features = self.features_db[idx]
            current_sample.update(features)

        current_sample = self.add_sample_details(sample_info, current_sample)
        current_sample = self.add_answer_info(sample_info, current_sample)

        # only the 'max_features' key is needed
        # pop other keys to minimize data loading overhead
        for k in list(current_sample.image_info_0):
            if k != 'max_features':
                current_sample.image_info_0.pop(k)
        for k in list(current_sample.image_info_1):
            if k != 'max_features':
                current_sample.image_info_1.pop(k)
        overlap_flag = torch.zeros(150, 150)
        obj_obj_relation = self.compute_similarity_by_cosine(current_sample.image_feature_0, current_sample.image_feature_0)
        ocr_ocr_relation = self.compute_similarity_by_cosine(current_sample.context_feature_0, current_sample.context_feature_0)
        obj_ocr_relation = self.overlap(current_sample.obj_bbox_coordinates, current_sample.ocr_bbox_coordinates)
        overlap_flag[:100, :100] = obj_obj_relation
        overlap_flag[100:, 100:] = ocr_ocr_relation
        overlap_flag[:100, 100:] = obj_ocr_relation
        overlap_flag[100:, :100] = obj_ocr_relation.transpose(1, 0)
        current_sample.overlap_flag = overlap_flag
        return current_sample
Example #20
    def _load_regions(self, idx, object_map, relationship_map):
        if self._return_scene_graph is None:
            return None, None

        image_info = self._get_image_info(idx)
        image_height = image_info["height"]
        image_width = image_info["width"]
        region_map = {}
        regions = []

        for region in image_info["regions"]:
            for synset in region["synsets"]:
                synset["entity_name"] = self.name_processor(
                    {"tokens": [synset["entity_name"]]})["text"]
                synset["synset_name"] = self.synset_processor(
                    {"tokens": [synset["synset_name"]]})["text"]

            region["height"] /= image_height
            region["width"] /= image_width
            region["y"] /= image_height
            region["x"] /= image_width

            relationships = []
            objects = []

            for relationship_idx in region["relationships"]:
                relationships.append(relationship_map[relationship_idx])

            for object_idx in region["objects"]:
                objects.append(object_map[object_idx])

            region["relationships"] = relationships
            region["objects"] = objects
            region["phrase"] = self.text_processor({"text":
                                                    region["phrase"]})["text"]

            region = Sample(region)
            region_map[region["region_id"]] = region
            regions.append(region)

        regions = SampleList(regions)
        return regions, region_map
Example #21
    def _load_relationships(self, idx, object_map):
        if self._return_relationships is None and self._return_scene_graph is None:
            return None, None

        image_info = self._get_image_info(idx)
        relationship_map = {}
        relationships = []

        for relationship in image_info["relationships"]:
            relationship["synsets"] = self.synset_processor(
                {"tokens": relationship["synsets"]})["text"]
            relationship["predicate"] = self.predicate_processor(
                {"tokens": relationship["predicate"]})["text"]
            relationship["object"] = object_map[relationship["object_id"]]
            relationship["subject"] = object_map[relationship["subject_id"]]

            relationship = Sample(relationship)
            relationship_map[relationship["relationship_id"]] = relationship
            relationships.append(relationship)

        relationships = SampleList(relationships)
        return relationships, relationship_map
Example #22
    def load_item(self, idx):
        sample_info = self.imdb[idx]
        current_sample = Sample()

        # breaking change from VQA2Dataset: load question_id
        current_sample.question_id = torch.tensor(sample_info["question_id"],
                                                  dtype=torch.int)

        if isinstance(sample_info["image_id"], int):
            current_sample.image_id = torch.tensor(sample_info["image_id"],
                                                   dtype=torch.int)
        else:
            current_sample.image_id = sample_info["image_id"]

        if self._use_features is True:
            features = self.features_db[idx]
            current_sample.update(features)

        current_sample = self.add_sample_details(sample_info, current_sample)
        current_sample = self.add_answer_info(sample_info, current_sample)

        return current_sample
Example #23
    def predict(self, url):
        with torch.no_grad():
            detectron_features = self.get_detectron_features(url)

            sample = Sample()
            sample.dataset_name = "coco"
            sample.dataset_type = "test"
            sample.image_feature_0 = detectron_features
            sample.answers = torch.zeros((5, 10), dtype=torch.long)

            sample_list = SampleList([sample])
            sample_list = sample_list.to("cuda")

            tokens = self.pythia_model(sample_list)["captions"]

        gc.collect()
        torch.cuda.empty_cache()

        return tokens
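Most of the predict() examples in this listing share one flow: build a Sample, batch it with SampleList, move the batch to the device, and call the model. Below is a condensed sketch of that pattern. The import path assumes the Pythia layout used throughout these examples, and the model, text_processor, and image_features arguments are placeholders, not a specific API.

import torch
from pythia.common.sample import Sample, SampleList

def run_vqa_inference(model, text_processor, question, image_features,
                      device="cuda"):
    # Build one Sample per input and batch it with SampleList.
    sample = Sample()
    processed_text = text_processor({"text": question})
    sample.text = processed_text["text"]
    sample.text_len = len(processed_text["tokens"])
    sample.image_feature_0 = image_features  # e.g. [100, 2048] region features

    sample_list = SampleList([sample])
    sample_list = sample_list.to(device)

    # Forward pass and answer distribution over the answer space.
    with torch.no_grad():
        scores = model(sample_list)["scores"]
    return torch.nn.functional.softmax(scores, dim=1)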
Example #24
    def get_item(self, idx):
        data = self.questions[idx]

        # Each call to get_item from the dataloader returns a Sample object, which is
        # collated by our special batch collator into a SampleList, which is basically
        # an attribute-based batch in layman's terms
        current_sample = Sample()

        question = data["question"]
        tokens = tokenize(question, keep=[";", ","], remove=["?", "."])
        processed = self.text_processor({"tokens": tokens})
        current_sample.text = processed["text"]

        processed = self.answer_processor({"answers": [data["answer"]]})
        current_sample.answers = processed["answers"]
        current_sample.targets = processed["answers_scores"]

        image_path = os.path.join(self.image_path, data["image_filename"])
        image = np.true_divide(Image.open(image_path).convert("RGB"), 255)
        image = image.astype(np.float32)
        current_sample.image = torch.from_numpy(image.transpose(2, 0, 1))

        return current_sample
Example #25
    def load_item(self, idx):
        sample = Sample()
        image_id = self.annotations[idx][0]
        image_folder = image_id.split('_')[0]
        caption = self.annotations[idx][1]
        tokens = tokenize(caption)
        tokens = ['<s>'] + tokens + ['</s>']
        # use text_processor to process the caption: it pads the sequence,
        # converts tokens to indices and adds the SOS/EOS tokens
        # (text_processor already contains a pre-processor to tokenize the caption)
        caption_p = self.text_processor({'tokens': tokens})
        sample.text = caption_p['text']
        sample.caption_len = torch.tensor(len(tokens), dtype=torch.int)
        # sample.target = caption_p['text']
        sample.answers = torch.stack([caption_p['text']])
        # generate image features
        image_path = os.path.join(self.image_dir, image_folder, image_id)
        image, image_scale = self._image_transform(image_path)
        with torch.no_grad():
            image_features = self.feature_extractor([image], [image_scale])
        image_features = image_features[0]
        sample.image_feature_0 = image_features.cpu()
        return sample
Example #26
def build_bbox_tensors(infos, max_length, feats, img_id, obj_bbox):

    # num of ocr bbox
    num_bbox = min(max_length, len(infos))
    # ocr bbox
    coord_tensor = torch.zeros((max_length, 4), dtype=torch.float)
    infos = infos[:num_bbox]
    sample = Sample()

    for idx, info in enumerate(infos):
        bbox = info["bounding_box"]
        if "top_left_x" in bbox:
            x = bbox["top_left_x"]  # key might be 'topLeftX'
            y = bbox["top_left_y"]  # key might be 'topLeftY'
        else:
            x = bbox["topLeftX"]
            y = bbox["topLeftY"]
        width = bbox["width"]
        height = bbox["height"]
        coord_tensor[idx][0] = x
        coord_tensor[idx][1] = y
        coord_tensor[idx][2] = x + width
        coord_tensor[idx][3] = y + height

    sample.coordinates = coord_tensor
    sample.ocr_mask = num_bbox

    image_path_org = './data/open_images/textvqa_gcy/'
    # image_path_org = './data/open_images/GT_OBJ_FRCN/'
    # image_path_org = './data/open_images/visual_genome/'

    oo_edge_path = image_path_org + 'edge_oo/'
    ot_edge_path = image_path_org + 'edge_ot/'
    tt_edge_path = image_path_org + 'edge_tt/'
    to_edge_path = image_path_org + 'edge_to/'

    set_name = search_file(image_path_org, img_id)
    knn_k = 5

    try:
        oo_node_matrix = torch.load(oo_edge_path + img_id + '_oo.pdh')
        sample.edge_oo = oo_node_matrix
        oo_feats = torch.load(oo_edge_path + img_id + '_oofeats.pdh')
        sample.edge_oofeats = oo_feats

        ot_node_matrix = torch.load(ot_edge_path + img_id + '_ot.pdh')
        sample.edge_ot = ot_node_matrix
        ot_feats = torch.load(ot_edge_path + img_id + '_otfeats.pdh')
        sample.edge_otfeats = ot_feats

        tt_node_matrix = torch.load(tt_edge_path + img_id + '_tt.pdh')
        sample.edge_tt = tt_node_matrix
        tt_feats = torch.load(tt_edge_path + img_id + '_ttfeats.pdh')
        sample.edge_ttfeats = tt_feats

        to_node_matrix = torch.load(to_edge_path + img_id + '_to.pdh')
        sample.edge_to = to_node_matrix
        to_feats = torch.load(to_edge_path + img_id + '_tofeats.pdh')
        sample.edge_tofeats = to_feats
    except OSError:
        # Cached edge files are missing; build them on the fly below.
        # TODO: generate obj-obj relation edge
        oo_node_matrix = finde_k_nearest_node(obj_bbox, knn_k)
        sample.edge_oo = oo_node_matrix
        oo_edge_file_name = oo_edge_path + img_id + "_oo.pdh"
        torch.save(oo_node_matrix, oo_edge_file_name)

        obj_obj_feat_variable = gen_oo_edge_feature(obj_bbox,
                                                    oo_node_matrix,
                                                    knn_k=knn_k)
        oo_edge_file_name = oo_edge_path + img_id + "_oofeats.pdh"
        torch.save(obj_obj_feat_variable, oo_edge_file_name)
        sample.edge_oofeats = obj_obj_feat_variable

        # TODO: generate object-text relation edge
        ot_node_matrix = dc_finde_k_nearest_node(obj_bbox, coord_tensor, knn_k)
        sample.edge_ot = ot_node_matrix
        ot_edge_file_name = ot_edge_path + img_id + "_ot.pdh"
        torch.save(ot_node_matrix, ot_edge_file_name)

        obj_text_feat_variable = gen_ot_edge_feature(obj_bbox,
                                                     coord_tensor,
                                                     ot_node_matrix,
                                                     knn_k=knn_k)
        ot_edge_file_name = ot_edge_path + img_id + "_otfeats.pdh"
        torch.save(obj_text_feat_variable, ot_edge_file_name)
        sample.edge_otfeats = obj_text_feat_variable

        # TODO: generate text-text relation edge
        tt_node_matrix = finde_k_nearest_node(coord_tensor, knn_k)
        sample.edge_tt = tt_node_matrix
        tt_edge_file_name = tt_edge_path + img_id + "_tt.pdh"
        torch.save(tt_node_matrix, tt_edge_file_name)

        text_text_edge_feature = gen_tt_edge_feature(coord_tensor,
                                                     tt_node_matrix,
                                                     knn_k=knn_k)
        tt_edge_file_name = tt_edge_path + img_id + "_ttfeats.pdh"
        torch.save(text_text_edge_feature, tt_edge_file_name)
        sample.edge_ttfeats = text_text_edge_feature

        # TODO: generate text-obj relation edge
        to_node_matrix = dc_finde_k_nearest_node(coord_tensor, obj_bbox, knn_k)
        sample.edge_to = to_node_matrix
        to_edge_file_name = to_edge_path + img_id + "_to.pdh"
        torch.save(to_node_matrix, to_edge_file_name)

        text_obj_feat_variable = gen_to_edge_feature(coord_tensor,
                                                     obj_bbox,
                                                     to_node_matrix,
                                                     knn_k=knn_k)
        to_edge_file_name = to_edge_path + img_id + "_tofeats.pdh"
        torch.save(text_obj_feat_variable, to_edge_file_name)
        sample.edge_tofeats = text_obj_feat_variable

    return sample
Example #27
    def load_item(self, idx):
        sample_info = self.imdb[idx]
        current_sample = Sample()
        current_sample.dataset_name = self.dataset

        if self.dataset == 'train_vqa':

            text_processor_argument = {
                "tokens": sample_info["question_tokens"]
            }
            processed_question = self.text_processor(text_processor_argument)
            current_sample.text_len = torch.tensor(
                len(sample_info["question_tokens"]), dtype=torch.int)
            current_sample.text = processed_question["text"]
            current_sample.question_text = sample_info["question_str"]
            current_sample.text_sq = current_sample.text
            current_sample.text_oq = current_sample.text
            current_sample.reasoning_question = sample_info["question_str"]
            current_sample.reasoning_answer = sample_info["answers"][0]
            current_sample.sub_question = sample_info["question_str"]
            current_sample.other_question = sample_info["question_str"]

        elif self.dataset == 'train_introspect' or self.dataset == 'test':

            text_processor_argument = {
                "text": sample_info["main_question_str"]
            }
            processed_question = self.text_processor(text_processor_argument)
            current_sample.text = processed_question["text"]
            if "sub_question_str" in sample_info:
                text_processor_argument_sq = {
                    "text": sample_info["sub_question_str"]
                }
                processed_question_sq = self.text_processor(
                    text_processor_argument_sq)
                current_sample.text_sq = processed_question_sq["text"]

            if "other_question_str" in sample_info:
                text_processor_argument_oq = {
                    "text": sample_info["other_question_str"]
                }
                processed_question_oq = self.text_processor(
                    text_processor_argument_oq)
                current_sample.text_oq = processed_question_oq["text"]

            current_sample.question_text = sample_info["main_question_str"]
            current_sample.reasoning_question = sample_info[
                "main_question_str"]
            current_sample.reasoning_answer = sample_info["main_answer_str"][0]
            current_sample.sub_question = sample_info["sub_question_str"]
            current_sample.other_question = sample_info["other_question_str"]
            current_sample.text_len = torch.tensor(
                len(sample_info["main_question_tokens"]), dtype=torch.int)

        else:

            text_processor_argument = {"text": sample_info["question_str"]}
            processed_question = self.text_processor(text_processor_argument)
            current_sample.text = processed_question["text"]
            if "sub_question_str" in sample_info:
                text_processor_argument_sq = {
                    "text": sample_info["sub_question_str"]
                }
                processed_question_sq = self.text_processor(
                    text_processor_argument_sq)
                current_sample.text_sq = processed_question_sq["text"]

            if "other_question_str" in sample_info:
                text_processor_argument_oq = {
                    "text": sample_info["other_question_str"]
                }
                processed_question_oq = self.text_processor(
                    text_processor_argument_oq)
                current_sample.text_oq = processed_question_oq["text"]
            else:
                current_sample.text_oq = current_sample.text_sq

            current_sample.question_text = sample_info["question_str"]
            current_sample.reasoning_question = sample_info["question_str"]
            current_sample.reasoning_answer = sample_info["answers"][0]
            current_sample.sub_question = sample_info["sub_question_str"]
            current_sample.other_question = sample_info["sub_question_str"]
            current_sample.text_len = torch.tensor(
                len(sample_info["question_tokens"]), dtype=torch.int)

        current_sample.question_id = torch.tensor(sample_info["question_id"],
                                                  dtype=torch.int)

        if isinstance(sample_info["image_id"], int):
            current_sample.image_id = torch.tensor(sample_info["image_id"],
                                                   dtype=torch.int)
        else:
            current_sample.image_id = sample_info["image_id"]

        if self._use_features is True:
            features = self.features_db[idx]
            current_sample.update(features)

        # Add details for OCR like OCR bbox, vectors, tokens here
        current_sample = self.add_ocr_details(sample_info, current_sample)
        # Depending on whether we are using soft copy this can add
        # dynamic answer space
        current_sample = self.add_answer_info(sample_info, current_sample)

        return current_sample
Example #28
	def get_item(self, idx):

		data = self.vqamb_data[idx]

		current_sample = Sample()

		# store question and image id
		current_sample.img_id = data['id']
		# current_sample.qa_id = data['qa_id']

		# store points
		current_sample.point = data['point'] # data['points']
		bbox = data['bbox']
		current_sample.gt_bbox = torch.Tensor([bbox['x'], bbox['y'], bbox['x'] + bbox['w'], bbox['y'] + bbox['h']])

		# process question
		question = data["pt_question"]
		tokens = tokenize(question, remove=["?"], keep=["'s"])

		processed = self.text_processor({"tokens": tokens})
		current_sample.text = processed["text"]

		# process answers
		processed = self.answer_processor({"answers": [data['ans']]})
		current_sample.answers = processed["answers"]
		current_sample.targets = processed["answers_scores"][1:] # remove unknown index

		# Detectron features ----------------
		# TODO: read in detectron image instead if detectron is to be built
		detectron_path = self.detectron_folder + str(data['id'])
		point = data['point'] # point = data['points'][0]
		if 'pt' in self.detectron_folder:
			detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
		detectron_path += '.pt'
		
		detectron_feat = torch.load(detectron_path, map_location=torch.device('cpu'))

		# Pad features to fixed length
		if self.config.pad_detectron:
			if detectron_feat.shape[0] > 100:
				detectron_feat = detectron_feat[:100]
			elif detectron_feat.shape[0] < 100:
				pad = torch.zeros(100 - detectron_feat.shape[0], detectron_feat.shape[1])
				detectron_feat = torch.cat([detectron_feat, pad], dim=0)

		current_sample.image_feature_0 = detectron_feat
		# ---------------------------------------------

		# read in bounding boxes (hardcoded for now)
		
		bbox_path = ''
		bbox_path  += str(data['id']) + ',' + str(point['x']) + ',' + str(point['y']) + '.pt'
		bboxes = torch.load(bbox_path, map_location=torch.device('cpu'))

		if bboxes.shape[0] > 100:
			bboxes = bboxes[:100]
		elif bboxes.shape[0] < 100:
			pad = torch.zeros(100 - bboxes.shape[0], bboxes.shape[1])
			bboxes = torch.cat([bboxes, pad], dim=0)

		current_sample.pt_bbox = bboxes

		# read in image bounding boxes
		bbox_path = ''
		bbox_path  += str(data['id']) + '.pt' # + ',' + str(point['x']) + ',' + str(point['y']) + '.pt'
		bboxes = torch.load(bbox_path, map_location=torch.device('cpu'))

		if bboxes.shape[0] > 100:
			bboxes = bboxes[:100]
		elif bboxes.shape[0] < 100:
			pad = torch.zeros(100 - bboxes.shape[0], bboxes.shape[1])
			bboxes = torch.cat([bboxes, pad], dim=0)

		current_sample.img_bbox = bboxes
		
		# Context features --------------------
		if self.config.use_context:
			context_path = self.context_folder + str(data['id'])
			context_path += ',' + str(point['x']) + ',' + str(point['y'])
			context_path += '.pt'

			context_feat = torch.load(context_path, map_location=torch.device('cpu'))
			context_feat = context_feat.squeeze()
			orig_dim = context_feat.shape[0]

			if self.config.pad_context:
				if context_feat.shape[0] > 100:
					context_feat = context_feat[:100]
				elif context_feat.shape[0] < 100:
					pad = torch.zeros(100 - context_feat.shape[0], context_feat.shape[1])
					context_feat = torch.cat([context_feat, pad], dim=0)

			current_sample.context_feature_0 = context_feat
		# ---------------------------------------------

		return current_sample
Example #29
    def evaluate_full(self, loader, use_tqdm=False):
        
        meter = Meter()

        # metrics = ['vqamb_map', 'vqamb_f1'] # hardcode metrics for now
        metrics = ['accuracy']
        # metrics = ['vqamb_f1pt']

        print(len(loader))
        
        with torch.no_grad():
            self.model.eval()
            tot_preds = []
            tot_targets = []
            tot_ids = []
            tot_att_pt = []
            tot_att_img = []
            tot_bbox_gt = []
            tot_bbox_pt = []
            tot_bbox_img = []
            tot_part = []
            # tot_qa_ids = []
            for batch in tqdm(loader, disable=not use_tqdm):
                report = self._forward_pass(batch)
                tot_preds.append(report.scores)
                tot_targets.append(report.targets)
                # tot_ids.extend(report.qa_id)
                # tot_att_pt.append(report.att)
                # tot_att_img.append(report.att_img)
                # tot_bbox_gt.append(report.gt_bbox)
                # tot_bbox_img.append(report.img_bbox)
                # tot_bbox_pt.append(report.pt_bbox)
                # tot_part.append(report.part)
                # tot_bbox_gt.append(report.gt_bbox)
                # tot_ptpath.append(report.ptpath)
                # tot_bbox_pt.append(report.bboxes)
                # tot_bbox_gt.append(report.gt_bbox)
                # tot_qa_ids.extend(report.qa_id)
                
            tot_preds = torch.cat(tot_preds, dim=0)
            tot_targets = torch.cat(tot_targets, dim=0)
            # tot_att_pt = torch.cat(tot_att_pt, dim=0)
            # tot_att_img = torch.cat(tot_att_img, dim=0)
            # tot_att_pt = torch.cat(tot_att_pt, dim=0)
            # tot_bbox_pt = torch.cat(tot_bbox_pt, dim=0)
            # tot_bbox_gt = torch.cat(tot_bbox_gt, dim=0)
            # tot_bbox_img = torch.cat(tot_bbox_img, dim=0)
            # Find bounding box with max attention
            
            # max_att_pt = tot_att_pt.argmax(dim=1)
            # max_bbox_pt = tot_bbox_pt[torch.arange(tot_bbox_pt.size(0)), max_att_pt]
            '''
            torch.save(tot_att_pt, 'tot_pt_att_objpartdev.pt')
            torch.save(tot_bbox_pt, 'tot_ptbboxes_objpartdev.pt')
            tot_part = sum(tot_part, [])
            torch.save(torch.Tensor(tot_part), 'tot_part_objpartdev.pt')
            '''
            # torch.save(tot_att_pt, 'tot_att_pt_localqafinal.pt')
            # torch.save(tot_att_img, 'tot_att_img_pythiaptfinal.pt')
            # torch.save(tot_bbox_pt, 'tot_bbox_pt_localqafinal.pt')
            # torch.save(tot_bbox_img, 'tot_bbox_img_pythia_ptfinal.pt')
            # torch.save(tot_bbox_gt, 'tot_bboxgt_localqafinal.pt')
            # torch.save(tot_preds, 'tot_preds_localqafinal.pt')
            # torch.save(tot_targets, 'tot_targets_localqafinal.pt')
            
            # torch.save(max_bbox_pt, 'max_pt_bbox_pythiaptfinal.pt')
            # torch.save(tot_bbox_gt, 'gt_bbox_pythiaptfinal.pt')
            
            # torch.save(tot_preds, 'tot_preds_localqa.pt')
            # torch.save(tot_targets, 'tot_targets_localqa.pt')
            # torch.save(tot_ptpath, 'tot_ptpath_vqambnew.pt')
            # torch.save(tot_att, 'tot_att_vqambnew.pt')
            # tot_qa_ids = torch.Tensor(tot_qa_ids)
            # torch.save(tot_qa_ids, 'tot_qa_ids.pt')

            model_output = {"scores": tot_preds}
            sample = Sample({"targets": tot_targets}) # "qa_index": tot_qa_index}) # "dataset_type": report.dataset_type, "dataset_name": report.dataset_name})
            sample_list = SampleList([sample])
            sample_list.add_field('dataset_type', report.dataset_type)
            sample_list.add_field('dataset_name', report.dataset_name)

            metric_fn = Metrics(metrics)
            full_met = metric_fn(sample_list, model_output)
            self.writer.write(full_met)

            if report.dataset_type == 'test':
                return
            
            meter.update(full_met)
            stop = self.early_stopping(self.current_iteration, meter)

            should_break = False
            if stop is True:
                self.writer.write("Early stopping activated")
                should_break = True
            
            self.model.train()

        return should_break
Example #30
    def getAnswers(self, image, question, meta=None):

        first = time.time()
        meta = meta or str(image)
        image = Image.open(image).convert('RGB') if isinstance(image, str) else \
                image.convert('RGB')

        print(f'Tiki : Getting Answers : {meta}, {question}')

        with torch.no_grad():

            detectron_features = self.get_detectron_features(image)
            resnet152_features = self.get_resnet152_features(image)

            start = time.time()
            sample = Sample()

            processed_text = self.text_processor({'text': question})
            sample.text = processed_text['text']
            sample.text_len = len(processed_text['tokens'])

            sample.image_feature_0 = detectron_features
            sample.image_info_0 = Sample(
                {'max_features': torch.tensor(100, dtype=torch.long)})

            sample.image_feature_1 = resnet152_features

            sample_list = SampleList([sample])

            sample_list = sample_list.to(self.device.type)

            scores = self.pythiaVQA_model(sample_list)['scores']
            scores = torch.nn.functional.softmax(scores, dim=1)
            actual, indices = scores.topk(5, dim=1)

            top_indices = indices[0]
            top_scores = actual[0]

            answers = []

            for rank, score in enumerate(top_scores):
                answers.append({
                    'rank': rank,
                    'answer': self.answer_processor.idx2word(
                        top_indices[rank].item()),
                    'probability': score.item(),
                })

            answer = answers[0]['answer']

            end = time.time()

        print(
            f'Tiki : Getting Answers : PythiaVQA - Finished in {end-start:7.3f} Seconds'
        )

        processing['PythiaVQA'] = end - start

        gc.collect()

        torch.cuda.empty_cache()

        last = time.time()

        processing['InferTime'] = last - first

        return question, answer, answers