Code example #1
    def load_item(self, idx):
        sample_info = self.imdb[idx]
        current_sample = Sample()

        text_processor_argument = {"tokens": sample_info["question_tokens"]}

        processed_question = self.text_processor(text_processor_argument)

        current_sample.text = processed_question["text"]
        current_sample.question_id = torch.tensor(sample_info["question_id"],
                                                  dtype=torch.int)

        if isinstance(sample_info["image_id"], int):
            current_sample.image_id = torch.tensor(sample_info["image_id"],
                                                   dtype=torch.int)
        else:
            current_sample.image_id = sample_info["image_id"]

        current_sample.text_len = torch.tensor(len(
            sample_info["question_tokens"]),
                                               dtype=torch.int)

        if self._use_features is True:
            features = self.features_db[idx]
            current_sample.update(features)

        # Add details for OCR like OCR bbox, vectors, tokens here
        current_sample = self.add_ocr_details(sample_info, current_sample)
        # Depending on whether we are using soft copy this can add
        # dynamic answer space
        current_sample = self.add_answer_info(sample_info, current_sample)

        return current_sample
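These load_item / get_item examples all build on the same Sample container. As a minimal sketch (assuming Pythia's pythia.common.sample module; newer MMF releases expose it as mmf.common.sample), Sample is a dict-like object whose keys double as attributes:

import torch
from pythia.common.sample import Sample

# Sample behaves like a dict with attribute access, so the two
# assignment styles below are interchangeable.
sample = Sample()
sample.text = torch.tensor([101, 2054, 2003], dtype=torch.long)
sample["text_len"] = torch.tensor(3, dtype=torch.int)

assert sample["text"].equal(sample.text)
assert sample.text_len.item() == 3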
Code example #2
    def load_item(self, idx):
        sample_info = self.imdb[idx]
        current_sample = Sample()

        if self._dataset_type != "test":
            text_processor_argument = {"tokens": sample_info["caption_tokens"]}
            processed_caption = self.text_processor(text_processor_argument)
            current_sample.text = processed_caption["text"]
            current_sample.caption_id = torch.tensor(sample_info["caption_id"],
                                                     dtype=torch.int)
            current_sample.caption_len = torch.tensor(len(
                sample_info["caption_tokens"]),
                                                      dtype=torch.int)

        if isinstance(sample_info["image_id"], int):
            current_sample.image_id = torch.tensor(sample_info["image_id"],
                                                   dtype=torch.int)
        else:
            current_sample.image_id = sample_info["image_id"]

        if self._use_features is True:
            features = self.features_db[idx]
            current_sample.update(features)

        # Add reference captions to sample
        current_sample = self.add_reference_caption(sample_info,
                                                    current_sample)

        return current_sample
Code example #3
    def predict(self, img_paths, qud):
        """
        Batch prediction: answer one question `qud` against a list of images.

        :return: (batch_probs, batch_answers), one top-5 list per image
        """
        with torch.no_grad():
            detectron_features = self.get_detectron_features(
                img_paths)  # a list of image features
            resnet_features = self.get_resnet_features(
                img_paths)  # [batch_size, 196, 2048]

            sample_list = []
            for i in range(len(detectron_features)):
                sample = Sample()
                processed_text = self.vqa_demo.text_processor({"text": qud})
                sample.text = processed_text["text"]
                sample.text_len = len(processed_text["tokens"])

                sample.image_feature_0 = detectron_features[i]
                sample.image_info_0 = Sample(
                    {"max_features": torch.tensor(100, dtype=torch.long)})
                sample.image_feature_1 = resnet_features[i]
                sample_list.append(sample)

            sample_list = SampleList(sample_list)
            sample_list = sample_list.to("cuda")

            scores = self.vqa_demo.pythia_model(sample_list)["scores"]
            scores = torch.nn.functional.softmax(scores, dim=1)
            actual, indices = scores.topk(5, dim=1)

            batch_probs = []
            batch_answers = []

            for i in range(scores.shape[0]):
                top_indices = indices[i]
                top_scores = actual[i]

                probs = []
                answers = []

                for idx, score in enumerate(top_scores):
                    probs.append(score.item())
                    answers.append(
                        self.vqa_demo.answer_processor.idx2word(
                            top_indices[idx].item()))
                batch_probs.append(probs)
                batch_answers.append(answers)

        # if memory becomes an issue, clear the caches here
        # gc.collect()
        # torch.cuda.empty_cache()

        # each returned list has one entry per image in the batch:
        # [[ans_1, ans_2, ...], [ans_1, ans_2, ...]]
        return batch_probs, batch_answers
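A hypothetical usage sketch for the batch predict above; demo stands in for an instance of the enclosing class, and the paths are placeholders:

# hypothetical wrapper instance and image paths, for illustration only
img_paths = ["images/0001.jpg", "images/0002.jpg"]
question = "what color is the car?"

batch_probs, batch_answers = demo.predict(img_paths, question)
for path, probs, answers in zip(img_paths, batch_probs, batch_answers):
    # each entry pairs the top-5 answer strings with their probabilities
    print(path, list(zip(answers, probs)))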
Code example #4
	def get_item(self, idx):

		data = self.vqamb_data[idx]

		current_sample = Sample()

		# store question and image id
		current_sample.img_id = data['id']
		current_sample.qa_id = data['qa_id']

		# process question
		question = data["question"]
		tokens = tokenize(question, remove=["?"], keep=["'s"])

		processed = self.text_processor({"tokens": tokens})
		current_sample.text = processed["text"]

		# process answers
		processed = self.answer_processor({"answers": [data['answer']]})
		current_sample.answers = processed["answers"]
		current_sample.targets = processed["answers_scores"][1:] # remove unknown index
		# Detectron features ----------------
		# TODO: read in detectron image instead if detectron is to be built
		detectron_path = self.detectron_folder + str(data['id'])
		if self.config.spatial:
			point = data['point']
			# current_sample.point = point
			detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
		detectron_path += '.pt'
		
		detectron_feat = torch.load(detectron_path, map_location=torch.device('cpu'))

		# Pad features to fixed length
		if self.config.pad_detectron:
			if detectron_feat.shape[0] > 100:
				detectron_feat = detectron_feat[:100]
			elif detectron_feat.shape[0] < 100:
				pad = torch.zeros(100 - detectron_feat.shape[0], detectron_feat.shape[1])
				detectron_feat = torch.cat([detectron_feat, pad], dim=0)

		current_sample.image_feature_0 = detectron_feat
		# ---------------------------------------------

		return current_sample
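The truncate-or-zero-pad step above recurs in several examples below; a small helper capturing the pattern could look like this (pad_features is a sketch, not part of the original code):

import torch

def pad_features(feat: torch.Tensor, max_feat: int = 100) -> torch.Tensor:
    """Truncate or zero-pad a [num_boxes, dim] feature matrix to max_feat rows."""
    if feat.shape[0] > max_feat:
        return feat[:max_feat]
    if feat.shape[0] < max_feat:
        pad = torch.zeros(max_feat - feat.shape[0], feat.shape[1])
        return torch.cat([feat, pad], dim=0)
    return feat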
Code example #5
    def load_item(self, idx):
        sample_info = self.imdb[idx]
        current_sample = Sample()

        current_sample.question_id = torch.tensor(sample_info["question_id"],
                                                  dtype=torch.int)

        if isinstance(sample_info["image_id"], int):
            current_sample.image_id = torch.tensor(sample_info["image_id"],
                                                   dtype=torch.int)
        else:
            current_sample.image_id = sample_info["image_id"]

        if self._use_features is True:
            features = self.features_db[idx]
            current_sample.update(features)

        # 1. Load text (question words)
        # breaking change from VQA2Dataset:
        # load the entire question string, not tokenized questions, since we
        # switch to BERT tokenizer in M4C and do online tokenization
        question_str = (sample_info['question'] if 'question' in sample_info
                        else sample_info['question_str'])
        processed_question = self.text_processor({"question": question_str})
        current_sample.text = processed_question['token_inds']
        current_sample.text_len = processed_question['token_num']

        # 2. Load object
        # object bounding box information
        current_sample.obj_bbox = self.copy_processor(
            {"blob": sample_info["obj_normalized_boxes"]})["bbox"]

        current_sample.obj_mask = min(
            self.copy_processor({"blob":
                                 sample_info["obj_normalized_boxes"]})["mask"],
            36)

        # Add details for OCR like OCR bbox, vectors, tokens here
        current_sample = self.add_ocr_details(sample_info, current_sample)
        # Depending on whether we are using soft copy this can add
        # dynamic answer space
        current_sample = self.add_answer_info(sample_info, current_sample)

        return current_sample
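The comment above documents the M4C breaking change: keep the raw question string and tokenize online with a BERT tokenizer. A rough sketch of what such a processor might do, assuming the HuggingFace transformers tokenizer (the real processor also pads token_inds to a fixed length, mirrored below):

import torch
from transformers import BertTokenizer  # assumption: HuggingFace tokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def process_question(question_str, max_len=20):
    # encode() adds [CLS]/[SEP]; truncate, then zero-pad to a fixed length
    ids = tokenizer.encode(question_str, add_special_tokens=True)[:max_len]
    token_num = len(ids)
    ids = ids + [0] * (max_len - token_num)
    return {"token_inds": torch.tensor(ids, dtype=torch.long),
            "token_num": torch.tensor(token_num, dtype=torch.long)}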
Code example #6
File: test_cnn_lstm.py Project: wzk1015/CNMT
    def test_forward(self):
        model_config = self.config.model_attributes.cnn_lstm

        cnn_lstm = CNNLSTM(model_config)
        cnn_lstm.build()
        cnn_lstm.init_losses_and_metrics()

        self.assertTrue(isinstance(cnn_lstm, torch.nn.Module))

        test_sample = Sample()
        test_sample.text = torch.randint(1, 79, (10, ), dtype=torch.long)
        test_sample.image = torch.randn(3, 320, 480)
        test_sample.targets = torch.randn(32)

        test_sample_list = SampleList([test_sample])
        test_sample_list.dataset_type = "train"
        test_sample_list.dataset_name = "clevr"
        output = cnn_lstm(test_sample_list)

        scores = output["scores"]
        loss = output["losses"]["train/clevr/logit_bce"]
        accuracy = output["metrics"]["train/clevr/accuracy"]

        np.testing.assert_almost_equal(loss.item(), 19.2635, decimal=4)
        np.testing.assert_almost_equal(accuracy.item(), 0)
        self.assertEqual(scores.size(), torch.Size((1, 32)))

        expected_scores = [
            -0.7598285675048828, -0.07029829174280167, -0.20382611453533173,
            -0.06990239024162292, 0.7965695858001709, 0.4730074405670166,
            -0.30569902062416077, 0.4244227707386017, 0.6511023044586182,
            0.2480515092611313, -0.5087617635726929, -0.7675772905349731,
            0.4361543357372284, 0.0018743239343166351, 0.6774630546569824,
            0.30618518590927124, -0.398895800113678, -0.13120117783546448,
            -0.4433199465274811, -0.25969570875167847, 0.6798790097236633,
            -0.34090861678123474, 0.0384102463722229, 0.2484571784734726,
            0.0456063412129879, -0.428459107875824, -0.026385333389043808,
            -0.1570669412612915, -0.2377825379371643, 0.3231588304042816,
            0.21098048985004425, -0.712349534034729
        ]

        np.testing.assert_almost_equal(scores[0].tolist(),
                                       expected_scores,
                                       decimal=5)
Code example #7
    def predict(self, url, question):
        with torch.no_grad():
            detectron_features = self.get_detectron_features(url)
            resnet_features = self.get_resnet_features(url)

            sample = Sample()

            processed_text = self.text_processor({"text": question})
            sample.text = processed_text["text"]
            sample.text_len = len(processed_text["tokens"])

            sample.image_feature_0 = detectron_features
            sample.image_info_0 = Sample({
                "max_features": torch.tensor(100, dtype=torch.long)
            })

            sample.image_feature_1 = resnet_features

            sample_list = SampleList([sample])
            sample_list = sample_list.to("cuda")

            scores = self.pythia_model(sample_list)["scores"]
            scores = torch.nn.functional.softmax(scores, dim=1)
            actual, indices = scores.topk(5, dim=1)

            top_indices = indices[0]
            top_scores = actual[0]

            probs = []
            answers = []

            for idx, score in enumerate(top_scores):
                probs.append(score.item())
                answers.append(
                    self.answer_processor.idx2word(top_indices[idx].item())
                )

        gc.collect()
        torch.cuda.empty_cache()
        return probs, answers
Code example #8
    def test_forward(self):
        model_config = self.config.model_attributes.cnn_lstm

        cnn_lstm = CNNLSTM(model_config)
        cnn_lstm.build()
        cnn_lstm.init_losses_and_metrics()

        self.assertTrue(isinstance(cnn_lstm, torch.nn.Module))

        test_sample = Sample()
        test_sample.text = torch.randint(1, 79, (10, ), dtype=torch.long)
        test_sample.image = torch.randn(3, 320, 480)
        test_sample.targets = torch.randn(32)

        test_sample_list = SampleList([test_sample])
        test_sample_list.dataset_type = "train"
        test_sample_list.dataset_name = "clevr"
        output = cnn_lstm(test_sample_list)

        scores = output["scores"]
        loss = output["losses"]["train/logit_bce"]
        accuracy = output["metrics"]["train/accuracy"]

        np.testing.assert_almost_equal(loss.item(), 23.4751, decimal=4)
        np.testing.assert_almost_equal(accuracy.item(), 0)
        self.assertEqual(scores.size(), torch.Size((1, 32)))

        expected_scores = [
            2.2298e-02, -2.4975e-01, -1.1960e-01, -5.0868e-01, -9.3013e-02,
            1.3202e-02, -1.7536e-01, -3.1180e-01, 1.5369e-01, 1.4900e-01,
            1.9006e-01, -1.9457e-01, 1.4924e-02, -1.1032e-01, 1.3777e-01,
            -3.6255e-01, -2.9327e-01, 5.6247e-04, -4.8732e-01, 4.0949e-01,
            -1.1069e-01, 2.9696e-01, 4.1903e-02, 6.7062e-02, 7.0094e-01,
            -1.9898e-01, -2.9502e-03, -3.9040e-01, 1.2218e-01, 3.7895e-02,
            2.4472e-02, 1.7213e-01
        ]
        np.testing.assert_almost_equal(scores[0].tolist(),
                                       expected_scores,
                                       decimal=5)
Code example #9
    def get_item(self, idx):
        data = self.questions[idx]

        # Each call to get_item from the dataloader returns a Sample object,
        # which our special batch collator collates into a SampleList:
        # basically an attribute-based batch, in layman's terms
        current_sample = Sample()

        question = data["question"]
        tokens = tokenize(question, keep=[";", ","], remove=["?", "."])
        processed = self.text_processor({"tokens": tokens})
        current_sample.text = processed["text"]

        processed = self.answer_processor({"answers": [data["answer"]]})
        current_sample.answers = processed["answers"]
        current_sample.targets = processed["answers_scores"]

        image_path = os.path.join(self.image_path, data["image_filename"])
        image = np.true_divide(Image.open(image_path).convert("RGB"), 255)
        image = image.astype(np.float32)
        current_sample.image = torch.from_numpy(image.transpose(2, 0, 1))

        return current_sample
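As the comment in get_item notes, individual Sample objects are collated into a SampleList. A minimal sketch of that collation, assuming Pythia's SampleList (which stacks same-shaped tensor fields along a new batch dimension):

import torch
from pythia.common.sample import Sample, SampleList

samples = []
for token_ids in ([4, 8, 15], [16, 23, 42]):
    s = Sample()
    s.text = torch.tensor(token_ids, dtype=torch.long)
    samples.append(s)

batch = SampleList(samples)
# same-shaped tensor fields are stacked: batch.text is [batch_size, seq_len]
print(batch.text.shape)  # torch.Size([2, 3])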
Code example #10
    def load_item(self, idx):
        sample = Sample()
        image_id = self.annotations[idx][0]
        image_folder = image_id.split('_')[0]
        caption = self.annotations[idx][1]
        tokens = tokenize(caption)
        tokens = ['<s>'] + tokens + ['</s>']
        # use text_processor to process caption:
        # pad the sequence, convert tokens to indices and add SOS/EOS tokens
        # text_processor already contains a pre-processor to tokenize the caption
        caption_p = self.text_processor({'tokens': tokens})
        sample.text = caption_p['text']
        sample.caption_len = torch.tensor(len(tokens), dtype=torch.int)
        # sample.target = caption_p['text']
        sample.answers = torch.stack([caption_p['text']])
        # generate image features
        image_path = os.path.join(self.image_dir, image_folder, image_id)
        image, image_scale = self._image_transform(image_path)
        with torch.no_grad():
            image_features = self.feature_extractor([image], [image_scale])
        image_features = image_features[0]
        sample.image_feature_0 = image_features.cpu()
        return sample
Code example #11
File: tiki.py Project: psnonis/TikiAI
    def getAnswers(self, image, question, meta=None):

        first = time.time()
        meta = meta or str(image)
        image = Image.open(image).convert('RGB') if isinstance(image, str) else \
                image.convert('RGB')

        print(f'Tiki : Getting Answers : {meta}, {question}')

        with torch.no_grad():

            detectron_features = self.get_detectron_features(image)
            resnet152_features = self.get_resnet152_features(image)

            start = time.time()
            sample = Sample()

            processed_text = self.text_processor({'text': question})
            sample.text = processed_text['text']
            sample.text_len = len(processed_text['tokens'])

            sample.image_feature_0 = detectron_features
            sample.image_info_0 = Sample(
                {'max_features': torch.tensor(100, dtype=torch.long)})

            sample.image_feature_1 = resnet152_features

            sample_list = SampleList([sample])

            sample_list = sample_list.to(self.device.type)

            scores = self.pythiaVQA_model(sample_list)['scores']
            scores = torch.nn.functional.softmax(scores, dim=1)
            actual, indices = scores.topk(5, dim=1)

            top_indices = indices[0]
            top_scores = actual[0]

            answers = []

            for rank, score in enumerate(top_scores):
                answers.append({
                    'rank': rank,
                    'answer': self.answer_processor.idx2word(
                        top_indices[rank].item()),
                    'probability': score.item()
                })

            answer = answers[0]['answer']

            end = time.time()

        print(
            f'Tiki : Getting Answers : PythiaVQA - Finished in {end-start:7.3f} Seconds'
        )

        processing['PythiaVQA'] = end - start

        gc.collect()

        torch.cuda.empty_cache()

        last = time.time()

        processing['InferTime'] = last - first

        return question, answer, answers
Code example #12
	def get_item(self, idx):

		data = self.vqamb_data[idx]

		current_sample = Sample()

		# store question and image id
		current_sample.img_id = data['id']
		# current_sample.qa_id = data['qa_id']

		# store points
		current_sample.point = data['point'] # data['points']
		bbox = data['bbox']
		current_sample.gt_bbox = torch.Tensor([bbox['x'], bbox['y'], bbox['x'] + bbox['w'], bbox['y'] + bbox['h']])

		# process question
		question = data["pt_question"]
		tokens = tokenize(question, remove=["?"], keep=["'s"])

		processed = self.text_processor({"tokens": tokens})
		current_sample.text = processed["text"]

		# process answers
		processed = self.answer_processor({"answers": [data['ans']]})
		current_sample.answers = processed["answers"]
		current_sample.targets = processed["answers_scores"][1:] # remove unknown index

		# Detectron features ----------------
		# TODO: read in detectron image instead if detectron is to be built
		detectron_path = self.detectron_folder + str(data['id'])
		point = data['point'] # point = data['points'][0]
		if 'pt' in self.detectron_folder:
			detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
		detectron_path += '.pt'
		
		detectron_feat = torch.load(detectron_path, map_location=torch.device('cpu'))

		# Pad features to fixed length
		if self.config.pad_detectron:
			if detectron_feat.shape[0] > 100:
				detectron_feat = detectron_feat[:100]
			elif detectron_feat.shape[0] < 100:
				pad = torch.zeros(100 - detectron_feat.shape[0], detectron_feat.shape[1])
				detectron_feat = torch.cat([detectron_feat, pad], dim=0)

		current_sample.image_feature_0 = detectron_feat
		# ---------------------------------------------

		# read in bounding boxes (hardcoded for now)
		
		bbox_path = ''
		bbox_path  += str(data['id']) + ',' + str(point['x']) + ',' + str(point['y']) + '.pt'
		bboxes = torch.load(bbox_path, map_location=torch.device('cpu'))

		if bboxes.shape[0] > 100:
			bboxes = bboxes[:100]
		elif bboxes.shape[0] < 100:
			pad = torch.zeros(100 - bboxes.shape[0], bboxes.shape[1])
			bboxes = torch.cat([bboxes, pad], dim=0)

		current_sample.pt_bbox = bboxes

		# read in image bounding boxes
		bbox_path = ''
		bbox_path  += str(data['id']) + '.pt' # + ',' + str(point['x']) + ',' + str(point['y']) + '.pt'
		bboxes = torch.load(bbox_path, map_location=torch.device('cpu'))

		if bboxes.shape[0] > 100:
			bboxes = bboxes[:100]
		elif bboxes.shape[0] < 100:
			pad = torch.zeros(100 - bboxes.shape[0], bboxes.shape[1])
			bboxes = torch.cat([bboxes, pad], dim=0)

		current_sample.img_bbox = bboxes
		
		# Context features --------------------
		if self.config.use_context:
			context_path = self.context_folder + str(data['id'])
			context_path += ',' + str(point['x']) + ',' + str(point['y'])
			context_path += '.pt'

			context_feat = torch.load(context_path, map_location=torch.device('cpu'))
			context_feat = context_feat.squeeze()
			orig_dim = context_feat.shape[0]

			if self.config.pad_context:
				if context_feat.shape[0] > 100:
					context_feat = context_feat[:100]
				elif context_feat.shape[0] < 100:
					pad = torch.zeros(100 - context_feat.shape[0], context_feat.shape[1])
					context_feat = torch.cat([context_feat, pad], dim=0)

			current_sample.context_feature_0 = context_feat
		# ---------------------------------------------

		return current_sample
Code example #13
File: dataset.py Project: sameerdharur/sorting-vqa
    def load_item(self, idx):
        sample_info = self.imdb[idx]
        current_sample = Sample()
        current_sample.dataset_name = self.dataset

        if self.dataset == 'train_vqa':

            text_processor_argument = {
                "tokens": sample_info["question_tokens"]
            }
            processed_question = self.text_processor(text_processor_argument)
            current_sample.text_len = torch.tensor(len(
                sample_info["question_tokens"]),
                                                   dtype=torch.int)
            current_sample.text = processed_question["text"]
            current_sample.question_text = sample_info["question_str"]
            current_sample.text_sq = current_sample.text
            current_sample.text_oq = current_sample.text
            current_sample.reasoning_question = sample_info["question_str"]
            current_sample.reasoning_answer = sample_info["answers"][0]
            current_sample.sub_question = sample_info["question_str"]
            current_sample.other_question = sample_info["question_str"]

        elif self.dataset == 'train_introspect' or self.dataset == 'test':

            text_processor_argument = {
                "text": sample_info["main_question_str"]
            }
            processed_question = self.text_processor(text_processor_argument)
            current_sample.text = processed_question["text"]
            if "sub_question_str" in sample_info:
                text_processor_argument_sq = {
                    "text": sample_info["sub_question_str"]
                }
                processed_question_sq = self.text_processor(
                    text_processor_argument_sq)
                current_sample.text_sq = processed_question_sq["text"]

            if "other_question_str" in sample_info:
                text_processor_argument_oq = {
                    "text": sample_info["other_question_str"]
                }
                processed_question_oq = self.text_processor(
                    text_processor_argument_oq)
                current_sample.text_oq = processed_question_oq["text"]

            current_sample.question_text = sample_info["main_question_str"]
            current_sample.reasoning_question = sample_info[
                "main_question_str"]
            current_sample.reasoning_answer = sample_info["main_answer_str"][0]
            current_sample.sub_question = sample_info["sub_question_str"]
            current_sample.other_question = sample_info["other_question_str"]
            current_sample.text_len = torch.tensor(len(
                sample_info["main_question_tokens"]),
                                                   dtype=torch.int)

        else:

            text_processor_argument = {"text": sample_info["question_str"]}
            processed_question = self.text_processor(text_processor_argument)
            current_sample.text = processed_question["text"]
            if "sub_question_str" in sample_info:
                text_processor_argument_sq = {
                    "text": sample_info["sub_question_str"]
                }
                processed_question_sq = self.text_processor(
                    text_processor_argument_sq)
                current_sample.text_sq = processed_question_sq["text"]

            if "other_question_str" in sample_info:
                text_processor_argument_oq = {
                    "text": sample_info["other_question_str"]
                }
                processed_question_oq = self.text_processor(
                    text_processor_argument_oq)
                current_sample.text_oq = processed_question_oq["text"]
            else:
                current_sample.text_oq = current_sample.text_sq

            current_sample.question_text = sample_info["question_str"]
            current_sample.reasoning_question = sample_info["question_str"]
            current_sample.reasoning_answer = sample_info["answers"][0]
            current_sample.sub_question = sample_info["sub_question_str"]
            current_sample.other_question = sample_info["sub_question_str"]
            current_sample.text_len = torch.tensor(len(
                sample_info["question_tokens"]),
                                                   dtype=torch.int)

        current_sample.question_id = torch.tensor(sample_info["question_id"],
                                                  dtype=torch.int)

        if isinstance(sample_info["image_id"], int):
            current_sample.image_id = torch.tensor(sample_info["image_id"],
                                                   dtype=torch.int)
        else:
            current_sample.image_id = sample_info["image_id"]

        if self._use_features is True:
            features = self.features_db[idx]
            current_sample.update(features)

        # Add details for OCR like OCR bbox, vectors, tokens here
        current_sample = self.add_ocr_details(sample_info, current_sample)
        # Depending on whether we are using soft copy this can add
        # dynamic answer space
        current_sample = self.add_answer_info(sample_info, current_sample)

        return current_sample
Code example #14
    def get_item(self, idx):

        data = self.vqamb_data[idx]

        current_sample = Sample()

        # store question and image id
        current_sample.img_id = data['id']
        current_sample.qa_id = data['qa_index']

        # store points
        current_sample.points = data['points']

        obj = data['all_objs'][0]
        xmin, ymin, xmax, ymax = (obj['x'], obj['y'],
                                  obj['x'] + obj['w'], obj['y'] + obj['h'])
        current_sample.gt_bbox = torch.Tensor([xmin, ymin, xmax, ymax])

        # process question
        question = data["question"]
        tokens = tokenize(question, remove=["?"])

        processed = self.text_processor({"tokens": tokens})
        current_sample.text = processed["text"]

        # process answers
        processed = self.answer_processor({"answers": data['all_ans']})
        current_sample.answers = processed["answers"]
        current_sample.targets = processed["answers_scores"][
            1:]  # remove unknown index

        # Detectron features ----------------
        # TODO: read in detectron image instead if detectron is to be built
        detectron_path = self.detectron_folder + str(data['id'])
        bbox_path = self.bbox_folder + str(data['id'])
        if 'pt' in self.detectron_folder:
            point = data['points'][0]
            detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
            bbox_path += ',' + str(point['x']) + ',' + str(point['y'])

        detectron_path += '.pt'
        bbox_path += '.pt'

        detectron_feat = torch.load(
            detectron_path, map_location=torch.device('cpu')).squeeze()
        # bbox_feat = torch.load(bbox_path, map_location=torch.device('cpu')).squeeze()
        # if detectron_feat.shape[0] == 2048:
        #     detectron_feat = detectron_feat.unsqueeze(0)
        #     bbox_feat = bbox_feat.unsqueeze(0)

        # if self.config.grid:
        #     detectron_feat = detectron_feat.view(detectron_feat.shape[0], -1).T
        # x_down = max(int(round(pt['x']/600)), 18)
        # y_down = int(round(pt['y']/800), 25)

        # preprocessing for grid features only
        # detectron_feat = detectron_feat.view(detectron_feat.shape[0], -1).T

        # Pad features to fixed length
        if self.config.grid:
            MAX_FEAT = 608
        else:
            MAX_FEAT = 100

        if self.config.pad_detectron:
            if detectron_feat.shape[0] > MAX_FEAT:
                detectron_feat = detectron_feat[:MAX_FEAT]
                # bbox_feat = bbox_feat[:MAX_FEAT]
            elif detectron_feat.shape[0] < MAX_FEAT:
                pad = torch.zeros(MAX_FEAT - detectron_feat.shape[0],
                                  detectron_feat.shape[1])
                detectron_feat = torch.cat([detectron_feat, pad], dim=0)
                # bbox_feat loading is commented out above, so its padding is
                # disabled too (as written it would raise a NameError):
                # pad = torch.zeros(MAX_FEAT - bbox_feat.shape[0],
                #                   bbox_feat.shape[1])
                # bbox_feat = torch.cat([bbox_feat, pad], dim=0)
        # else:
        #     if detectron_feat.dim() > 1:
        #         detectron_feat = torch.zeros(2048)
        # current_sample.bbox = bbox_feat
        current_sample.image_feature_0 = detectron_feat
        # ---------------------------------------------

        return current_sample
Code example #15
    def get_item(self, idx):

        data = self.objpart_data[idx]

        current_sample = Sample()

        # store question and image id
        current_sample.img_id = data['id']
        # current_sample.qa_id = data['qa_id']

        if data['ans'] == 'part':
            current_sample.part = 1

        else:
            current_sample.part = 0

        # store points
        current_sample.point = data['point']

        # process question
        question = data["question"]
        tokens = tokenize(question, remove=["?"])

        processed = self.text_processor({"tokens": tokens})
        current_sample.text = processed["text"]

        # process answers
        processed = self.answer_processor({"answers": [data['ans']]})
        current_sample.answers = processed["answers"]
        current_sample.targets = processed["answers_scores"][
            1:]  # remove unknown index

        # Detectron features ----------------
        # TODO: read in detectron image instead if detectron is to be built
        detectron_path = self.detectron_folder + str(data['id'])
        if 'pt' in self.detectron_folder:  # hacky way of assessing point supervision
            point = data['point']
            detectron_path += ',' + str(point['x']) + ',' + str(point['y'])

        detectron_path += '.pt'

        detectron_feat = torch.load(
            detectron_path, map_location=torch.device('cpu')).squeeze()

        # hardcode bounding box and read it

        # x_down = max(int(round(pt['x']/600)), 18)
        # y_down = int(round(pt['y']/800), 25)

        # preprocessing for grid features only
        # detectron_feat = detectron_feat.view(detectron_feat.shape[0], -1).T

        # Pad features to fixed length
        MAX_FEAT = 100

        if self.config.pad_detectron:
            if detectron_feat.shape[0] > MAX_FEAT:
                detectron_feat = detectron_feat[:MAX_FEAT]
            elif detectron_feat.shape[0] < MAX_FEAT:
                pad = torch.zeros(MAX_FEAT - detectron_feat.shape[0],
                                  detectron_feat.shape[1])
                detectron_feat = torch.cat([detectron_feat, pad], dim=0)
        # else:
        #     if detectron_feat.dim() > 1:
        #         detectron_feat = torch.zeros(2048)
        current_sample.image_feature_0 = detectron_feat
        # ---------------------------------------------

        return current_sample
Code example #16
    def load_item(self, idx):
        sample_info = self.imdb[idx]
        current_sample = Sample()

        if "question_tokens" in sample_info:
            text_processor_argument = {
                "tokens": sample_info["question_tokens"]
            }
        else:
            # text_processor_argument = {"text": sample_info["question"]}
            text_processor_argument = {
                "text": sample_info["main_question_str"]
            }

        # fall back to the main question when the sub/other question strings
        # are absent, so the processor calls below never see an undefined name
        text_processor_argument_sq = text_processor_argument
        text_processor_argument_oq = text_processor_argument
        if "sub_question_str" in sample_info:
            text_processor_argument_sq = {
                "text": sample_info["sub_question_str"]
            }
        if "other_question_str" in sample_info:
            text_processor_argument_oq = {
                "text": sample_info["other_question_str"]
            }

        processed_question = self.text_processor(text_processor_argument)
        processed_question_sq = self.text_processor(text_processor_argument_sq)
        processed_question_oq = self.text_processor(text_processor_argument_oq)

        current_sample.text = processed_question["text"]
        current_sample.text_sq = processed_question_sq["text"]
        current_sample.text_oq = processed_question_oq["text"]
        current_sample.question_text = sample_info["main_question_str"]
        current_sample.reasoning_question = sample_info["main_question_str"]
        current_sample.reasoning_answer = sample_info["main_answer_str"][0]
        #current_sample.image_url = sample_info["img_path"]
        current_sample.image_url = sample_info["image_path"]

        current_sample.sub_question = sample_info["sub_question_str"]
        current_sample.other_question = sample_info["other_question_str"]

        current_sample.question_id = torch.tensor(sample_info["question_id"],
                                                  dtype=torch.int)

        if isinstance(sample_info["image_id"], int):
            current_sample.image_id = torch.tensor(sample_info["image_id"],
                                                   dtype=torch.int)
        else:
            current_sample.image_id = sample_info["image_id"]

        current_sample.text_len = torch.tensor(
            #len(sample_info["question_tokens"]), dtype=torch.int
            len(sample_info["main_question_tokens"]),
            dtype=torch.int)

        if self._use_features is True:
            features = self.features_db[idx]
            current_sample.update(features)

        # Add details for OCR like OCR bbox, vectors, tokens here
        current_sample = self.add_ocr_details(sample_info, current_sample)
        # Depending on whether we are using soft copy this can add
        # dynamic answer space
        current_sample = self.add_answer_info(sample_info, current_sample)

        return current_sample