def load_item(self, idx):
    sample_info = self.imdb[idx]
    current_sample = Sample()

    text_processor_argument = {"tokens": sample_info["question_tokens"]}
    processed_question = self.text_processor(text_processor_argument)
    current_sample.text = processed_question["text"]
    current_sample.question_id = torch.tensor(
        sample_info["question_id"], dtype=torch.int
    )

    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = torch.tensor(
            sample_info["image_id"], dtype=torch.int
        )
    else:
        current_sample.image_id = sample_info["image_id"]

    current_sample.text_len = torch.tensor(
        len(sample_info["question_tokens"]), dtype=torch.int
    )

    if self._use_features is True:
        features = self.features_db[idx]
        current_sample.update(features)

    # Add details for OCR like OCR bbox, vectors, tokens here
    current_sample = self.add_ocr_details(sample_info, current_sample)
    # Depending on whether we are using soft copy this can add
    # dynamic answer space
    current_sample = self.add_answer_info(sample_info, current_sample)

    return current_sample
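# A minimal collation sketch (an addition, not part of the original source):
# every load_item/get_item in these examples returns one Sample, and the batch
# collator stacks same-named tensor fields into a SampleList. Assuming the
# Pythia package layout (pythia.common.sample):
import torch
from pythia.common.sample import Sample, SampleList

s1, s2 = Sample(), Sample()
s1.text = torch.tensor([1, 2, 3, 0], dtype=torch.long)
s2.text = torch.tensor([4, 5, 6, 0], dtype=torch.long)
s1.question_id = torch.tensor(10, dtype=torch.int)
s2.question_id = torch.tensor(11, dtype=torch.int)

batch = SampleList([s1, s2])
print(batch.text.shape)    # torch.Size([2, 4])
print(batch.question_id)   # tensor([10, 11], dtype=torch.int32)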
def load_item(self, idx):
    sample_info = self.imdb[idx]
    current_sample = Sample()

    if self._dataset_type != "test":
        text_processor_argument = {"tokens": sample_info["caption_tokens"]}
        processed_caption = self.text_processor(text_processor_argument)
        current_sample.text = processed_caption["text"]
        current_sample.caption_id = torch.tensor(
            sample_info["caption_id"], dtype=torch.int
        )
        current_sample.caption_len = torch.tensor(
            len(sample_info["caption_tokens"]), dtype=torch.int
        )

    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = torch.tensor(
            sample_info["image_id"], dtype=torch.int
        )
    else:
        current_sample.image_id = sample_info["image_id"]

    if self._use_features is True:
        features = self.features_db[idx]
        current_sample.update(features)

    # Add reference captions to sample
    current_sample = self.add_reference_caption(sample_info, current_sample)

    return current_sample
def predict(self, img_paths, qud):
    """
    Batch prediction: answer the single question `qud` for every image
    in `img_paths`.
    """
    with torch.no_grad():
        detectron_features = self.get_detectron_features(img_paths)  # list of per-image features
        resnet_features = self.get_resnet_features(img_paths)  # [batch_size, 196, 2048]

        sample_list = []
        for i in range(len(detectron_features)):
            sample = Sample()
            processed_text = self.vqa_demo.text_processor({"text": qud})
            sample.text = processed_text["text"]
            sample.text_len = len(processed_text["tokens"])

            sample.image_feature_0 = detectron_features[i]
            sample.image_info_0 = Sample(
                {"max_features": torch.tensor(100, dtype=torch.long)}
            )
            sample.image_feature_1 = resnet_features[i]
            sample_list.append(sample)

        sample_list = SampleList(sample_list)
        sample_list = sample_list.to("cuda")

        scores = self.vqa_demo.pythia_model(sample_list)["scores"]
        scores = torch.nn.functional.softmax(scores, dim=1)
        actual, indices = scores.topk(5, dim=1)

        batch_probs = []
        batch_answers = []
        for i in range(scores.shape[0]):
            top_indices = indices[i]
            top_scores = actual[i]
            probs = []
            answers = []
            for idx, score in enumerate(top_scores):
                probs.append(score.item())
                answers.append(
                    self.vqa_demo.answer_processor.idx2word(top_indices[idx].item())
                )
            batch_probs.append(probs)
            batch_answers.append(answers)

    # if memory becomes an issue, clear it here:
    # gc.collect()
    # torch.cuda.empty_cache()

    # both return values have batch_size entries,
    # e.g. [[ans_1, ans_2, ...], [ans_1, ans_2, ...]]
    return batch_probs, batch_answers
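# A self-contained sketch of the feature-batching pattern used above (an
# addition; the feature shapes are illustrative, and the CUDA guard is added
# so the sketch also runs on CPU):
import torch
from pythia.common.sample import Sample, SampleList

fake_detectron = [torch.randn(100, 2048) for _ in range(2)]  # one per image
fake_resnet = torch.randn(2, 196, 2048)

samples = []
for i in range(len(fake_detectron)):
    s = Sample()
    s.image_feature_0 = fake_detectron[i]
    s.image_info_0 = Sample({"max_features": torch.tensor(100, dtype=torch.long)})
    s.image_feature_1 = fake_resnet[i]
    samples.append(s)

batch = SampleList(samples)
if torch.cuda.is_available():
    batch = batch.to("cuda")
print(batch.image_feature_0.shape)  # torch.Size([2, 100, 2048])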
def get_item(self, idx):
    data = self.vqamb_data[idx]
    current_sample = Sample()

    # store question and image id
    current_sample.img_id = data['id']
    current_sample.qa_id = data['qa_id']

    # process question
    question = data["question"]
    tokens = tokenize(question, remove=["?"], keep=["'s"])
    processed = self.text_processor({"tokens": tokens})
    current_sample.text = processed["text"]

    # process answers
    processed = self.answer_processor({"answers": [data['answer']]})
    current_sample.answers = processed["answers"]
    current_sample.targets = processed["answers_scores"][1:]  # remove unknown index

    # Detectron features ----------------
    # TODO: read in detectron image instead if detectron is to be built
    detectron_path = self.detectron_folder + str(data['id'])
    if self.config.spatial:
        point = data['point']
        # current_sample.point = point
        detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
    detectron_path += '.pt'

    detectron_feat = torch.load(detectron_path, map_location=torch.device('cpu'))

    # Pad features to fixed length
    if self.config.pad_detectron:
        if detectron_feat.shape[0] > 100:
            detectron_feat = detectron_feat[:100]
        elif detectron_feat.shape[0] < 100:
            pad = torch.zeros(100 - detectron_feat.shape[0],
                              detectron_feat.shape[1])
            detectron_feat = torch.cat([detectron_feat, pad], dim=0)

    current_sample.image_feature_0 = detectron_feat
    # ---------------------------------------------

    return current_sample
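# The pad-to-fixed-length logic above recurs in several of the loaders below,
# so it could be factored into a helper. A minimal sketch; pad_or_truncate is
# a hypothetical name, not part of the original code:
import torch

def pad_or_truncate(feat, max_rows=100):
    """Clip feat to max_rows rows, or zero-pad it up to max_rows rows."""
    if feat.shape[0] > max_rows:
        return feat[:max_rows]
    if feat.shape[0] < max_rows:
        pad = torch.zeros(max_rows - feat.shape[0], feat.shape[1])
        return torch.cat([feat, pad], dim=0)
    return feat

print(pad_or_truncate(torch.randn(37, 2048)).shape)   # torch.Size([100, 2048])
print(pad_or_truncate(torch.randn(150, 2048)).shape)  # torch.Size([100, 2048])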
def load_item(self, idx):
    sample_info = self.imdb[idx]
    current_sample = Sample()

    current_sample.question_id = torch.tensor(
        sample_info["question_id"], dtype=torch.int
    )

    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = torch.tensor(
            sample_info["image_id"], dtype=torch.int
        )
    else:
        current_sample.image_id = sample_info["image_id"]

    if self._use_features is True:
        features = self.features_db[idx]
        current_sample.update(features)

    # 1. Load text (question words)
    # breaking change from VQA2Dataset:
    # load the entire question string, not tokenized questions, since we
    # switch to BERT tokenizer in M4C and do online tokenization
    question_str = (
        sample_info['question']
        if 'question' in sample_info
        else sample_info['question_str']
    )
    processed_question = self.text_processor({"question": question_str})
    current_sample.text = processed_question['token_inds']
    current_sample.text_len = processed_question['token_num']

    # 2. Load object
    # object bounding box information
    current_sample.obj_bbox = self.copy_processor(
        {"blob": sample_info["obj_normalized_boxes"]}
    )["bbox"]
    current_sample.obj_mask = min(
        self.copy_processor(
            {"blob": sample_info["obj_normalized_boxes"]}
        )["mask"],
        36,
    )

    # Add details for OCR like OCR bbox, vectors, tokens here
    current_sample = self.add_ocr_details(sample_info, current_sample)
    # Depending on whether we are using soft copy this can add
    # dynamic answer space
    current_sample = self.add_answer_info(sample_info, current_sample)

    return current_sample
def test_forward(self):
    model_config = self.config.model_attributes.cnn_lstm
    cnn_lstm = CNNLSTM(model_config)
    cnn_lstm.build()
    cnn_lstm.init_losses_and_metrics()
    self.assertTrue(isinstance(cnn_lstm, torch.nn.Module))

    test_sample = Sample()
    test_sample.text = torch.randint(1, 79, (10,), dtype=torch.long)
    test_sample.image = torch.randn(3, 320, 480)
    test_sample.targets = torch.randn(32)

    test_sample_list = SampleList([test_sample])
    test_sample_list.dataset_type = "train"
    test_sample_list.dataset_name = "clevr"
    output = cnn_lstm(test_sample_list)

    scores = output["scores"]
    loss = output["losses"]["train/clevr/logit_bce"]
    accuracy = output["metrics"]["train/clevr/accuracy"]

    np.testing.assert_almost_equal(loss.item(), 19.2635, decimal=4)
    np.testing.assert_almost_equal(accuracy.item(), 0)
    self.assertEqual(scores.size(), torch.Size((1, 32)))

    expected_scores = [
        -0.7598285675048828, -0.07029829174280167, -0.20382611453533173,
        -0.06990239024162292, 0.7965695858001709, 0.4730074405670166,
        -0.30569902062416077, 0.4244227707386017, 0.6511023044586182,
        0.2480515092611313, -0.5087617635726929, -0.7675772905349731,
        0.4361543357372284, 0.0018743239343166351, 0.6774630546569824,
        0.30618518590927124, -0.398895800113678, -0.13120117783546448,
        -0.4433199465274811, -0.25969570875167847, 0.6798790097236633,
        -0.34090861678123474, 0.0384102463722229, 0.2484571784734726,
        0.0456063412129879, -0.428459107875824, -0.026385333389043808,
        -0.1570669412612915, -0.2377825379371643, 0.3231588304042816,
        0.21098048985004425, -0.712349534034729
    ]
    np.testing.assert_almost_equal(scores[0].tolist(), expected_scores,
                                   decimal=5)
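# Note (an observation, not from the source): the loss and metric keys above
# look like f"{dataset_type}/{dataset_name}/{name}" ("train/clevr/logit_bce"),
# which is why the test sets dataset_type and dataset_name on the SampleList
# before the forward pass. The similar test further below reads the shorter
# "train/logit_bce" form, presumably from a version that keyed losses without
# the dataset name.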
def predict(self, url, question):
    with torch.no_grad():
        detectron_features = self.get_detectron_features(url)
        resnet_features = self.get_resnet_features(url)

        sample = Sample()

        processed_text = self.text_processor({"text": question})
        sample.text = processed_text["text"]
        sample.text_len = len(processed_text["tokens"])

        sample.image_feature_0 = detectron_features
        sample.image_info_0 = Sample({
            "max_features": torch.tensor(100, dtype=torch.long)
        })
        sample.image_feature_1 = resnet_features

        sample_list = SampleList([sample])
        sample_list = sample_list.to("cuda")

        scores = self.pythia_model(sample_list)["scores"]
        scores = torch.nn.functional.softmax(scores, dim=1)
        actual, indices = scores.topk(5, dim=1)

        top_indices = indices[0]
        top_scores = actual[0]

        probs = []
        answers = []
        for idx, score in enumerate(top_scores):
            probs.append(score.item())
            answers.append(
                self.answer_processor.idx2word(top_indices[idx].item())
            )

    gc.collect()
    torch.cuda.empty_cache()

    return probs, answers
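# The softmax/topk decoding used above, isolated with a toy answer vocabulary
# (an addition: idx2word here is a stand-in for the real
# answer_processor.idx2word):
import torch

vocab = ["yes", "no", "red", "blue", "two", "cat"]
idx2word = lambda i: vocab[i]

logits = torch.randn(1, len(vocab))        # pretend model scores
scores = torch.nn.functional.softmax(logits, dim=1)
actual, indices = scores.topk(5, dim=1)

probs, answers = [], []
for idx, score in enumerate(actual[0]):
    probs.append(score.item())
    answers.append(idx2word(indices[0][idx].item()))
print(list(zip(answers, probs)))           # top-5 (answer, prob) pairs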
def test_forward(self):
    model_config = self.config.model_attributes.cnn_lstm
    cnn_lstm = CNNLSTM(model_config)
    cnn_lstm.build()
    cnn_lstm.init_losses_and_metrics()
    self.assertTrue(isinstance(cnn_lstm, torch.nn.Module))

    test_sample = Sample()
    test_sample.text = torch.randint(1, 79, (10,), dtype=torch.long)
    test_sample.image = torch.randn(3, 320, 480)
    test_sample.targets = torch.randn(32)

    test_sample_list = SampleList([test_sample])
    test_sample_list.dataset_type = "train"
    test_sample_list.dataset_name = "clevr"
    output = cnn_lstm(test_sample_list)

    scores = output["scores"]
    loss = output["losses"]["train/logit_bce"]
    accuracy = output["metrics"]["train/accuracy"]

    np.testing.assert_almost_equal(loss.item(), 23.4751, decimal=4)
    np.testing.assert_almost_equal(accuracy.item(), 0)
    self.assertEqual(scores.size(), torch.Size((1, 32)))

    expected_scores = [
        2.2298e-02, -2.4975e-01, -1.1960e-01, -5.0868e-01, -9.3013e-02,
        1.3202e-02, -1.7536e-01, -3.1180e-01, 1.5369e-01, 1.4900e-01,
        1.9006e-01, -1.9457e-01, 1.4924e-02, -1.1032e-01, 1.3777e-01,
        -3.6255e-01, -2.9327e-01, 5.6247e-04, -4.8732e-01, 4.0949e-01,
        -1.1069e-01, 2.9696e-01, 4.1903e-02, 6.7062e-02, 7.0094e-01,
        -1.9898e-01, -2.9502e-03, -3.9040e-01, 1.2218e-01, 3.7895e-02,
        2.4472e-02, 1.7213e-01
    ]
    np.testing.assert_almost_equal(scores[0].tolist(), expected_scores,
                                   decimal=5)
def get_item(self, idx):
    data = self.questions[idx]

    # Each call to get_item from the dataloader returns a Sample object, which
    # our special batch collator collates into a SampleList -- basically an
    # attribute-based batch, in layman's terms
    current_sample = Sample()
    question = data["question"]
    tokens = tokenize(question, keep=[";", ","], remove=["?", "."])
    processed = self.text_processor({"tokens": tokens})
    current_sample.text = processed["text"]

    processed = self.answer_processor({"answers": [data["answer"]]})
    current_sample.answers = processed["answers"]
    current_sample.targets = processed["answers_scores"]

    image_path = os.path.join(self.image_path, data["image_filename"])
    image = np.true_divide(Image.open(image_path).convert("RGB"), 255)
    image = image.astype(np.float32)
    current_sample.image = torch.from_numpy(image.transpose(2, 0, 1))

    return current_sample
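# The image preprocessing above (uint8 HWC -> float32 CHW in [0, 1]),
# demonstrated on a synthetic array so this sketch runs without the dataset
# (an addition, not part of the original source):
import numpy as np
import torch

rgb = np.random.randint(0, 256, size=(480, 320, 3), dtype=np.uint8)  # H, W, C
image = np.true_divide(rgb, 255).astype(np.float32)
tensor = torch.from_numpy(image.transpose(2, 0, 1))                  # C, H, W
print(tensor.shape, tensor.dtype, tensor.max() <= 1.0)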
def load_item(self, idx):
    sample = Sample()

    image_id = self.annotations[idx][0]
    image_folder = image_id.split('_')[0]
    caption = self.annotations[idx][1]

    tokens = tokenize(caption)
    tokens = ['<s>'] + tokens + ['</s>']

    # use text_processor to process the caption: it pads the sequence and
    # converts tokens to indices (SOS/EOS were added above; text_processor
    # also bundles a pre-processor to tokenize captions)
    caption_p = self.text_processor({'tokens': tokens})
    sample.text = caption_p['text']
    sample.caption_len = torch.tensor(len(tokens), dtype=torch.int)

    # sample.target = caption_p['text']
    sample.answers = torch.stack([caption_p['text']])

    # generate image features
    image_path = os.path.join(self.image_dir, image_folder, image_id)
    image, image_scale = self._image_transform(image_path)
    with torch.no_grad():
        image_features = self.feature_extractor([image], [image_scale])
    image_features = image_features[0]
    sample.image_feature_0 = image_features.cpu()

    return sample
def getAnswers(self, image, question, meta=None):
    first = time.time()
    meta = meta or str(image)

    image = Image.open(image).convert('RGB') if isinstance(image, str) \
        else image.convert('RGB')

    print(f'Tiki : Getting Answers : {meta}, {question}')

    with torch.no_grad():
        detectron_features = self.get_detectron_features(image)
        resnet152_features = self.get_resnet152_features(image)

        start = time.time()

        sample = Sample()

        processed_text = self.text_processor({'text': question})
        sample.text = processed_text['text']
        sample.text_len = len(processed_text['tokens'])

        sample.image_feature_0 = detectron_features
        sample.image_info_0 = Sample(
            {'max_features': torch.tensor(100, dtype=torch.long)}
        )
        sample.image_feature_1 = resnet152_features

        sample_list = SampleList([sample])
        sample_list = sample_list.to(self.device.type)

        scores = self.pythiaVQA_model(sample_list)['scores']
        scores = torch.nn.functional.softmax(scores, dim=1)
        actual, indices = scores.topk(5, dim=1)

        top_indices = indices[0]
        top_scores = actual[0]

        answers = []
        for rank, score in enumerate(top_scores):
            answers.append({
                'rank': rank,
                'answer': self.answer_processor.idx2word(top_indices[rank].item()),
                'probability': score.item()
            })

        answer = answers[0]['answer']

        end = time.time()
        print(f'Tiki : Getting Answers : PythiaVQA - Finished in {end - start:7.3f} Seconds')
        processing['PythiaVQA'] = end - start

    gc.collect()
    torch.cuda.empty_cache()

    last = time.time()
    processing['InferTime'] = last - first

    return question, answer, answers
def get_item(self, idx):
    data = self.vqamb_data[idx]
    current_sample = Sample()

    # store question and image id
    current_sample.img_id = data['id']
    # current_sample.qa_id = data['qa_id']

    # store points
    current_sample.point = data['point']  # data['points']
    bbox = data['bbox']
    current_sample.gt_bbox = torch.Tensor([
        bbox['x'],
        bbox['y'],
        bbox['x'] + bbox['w'],
        bbox['y'] + bbox['h']
    ])

    # process question
    question = data["pt_question"]
    tokens = tokenize(question, remove=["?"], keep=["'s"])
    processed = self.text_processor({"tokens": tokens})
    current_sample.text = processed["text"]

    # process answers
    processed = self.answer_processor({"answers": [data['ans']]})
    current_sample.answers = processed["answers"]
    current_sample.targets = processed["answers_scores"][1:]  # remove unknown index

    # Detectron features ----------------
    # TODO: read in detectron image instead if detectron is to be built
    detectron_path = self.detectron_folder + str(data['id'])
    point = data['point']  # point = data['points'][0]
    if 'pt' in self.detectron_folder:
        detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
    detectron_path += '.pt'

    detectron_feat = torch.load(detectron_path, map_location=torch.device('cpu'))

    # Pad features to fixed length
    if self.config.pad_detectron:
        if detectron_feat.shape[0] > 100:
            detectron_feat = detectron_feat[:100]
        elif detectron_feat.shape[0] < 100:
            pad = torch.zeros(100 - detectron_feat.shape[0],
                              detectron_feat.shape[1])
            detectron_feat = torch.cat([detectron_feat, pad], dim=0)

    current_sample.image_feature_0 = detectron_feat
    # ---------------------------------------------

    # read in point bounding boxes (path prefix hardcoded for now)
    bbox_path = ''
    bbox_path += str(data['id']) + ',' + str(point['x']) + ',' + str(point['y']) + '.pt'
    bboxes = torch.load(bbox_path, map_location=torch.device('cpu'))

    if bboxes.shape[0] > 100:
        bboxes = bboxes[:100]
    elif bboxes.shape[0] < 100:
        pad = torch.zeros(100 - bboxes.shape[0], bboxes.shape[1])
        bboxes = torch.cat([bboxes, pad], dim=0)

    current_sample.pt_bbox = bboxes

    # read in image bounding boxes (path prefix hardcoded for now)
    bbox_path = ''
    bbox_path += str(data['id']) + '.pt'  # + ',' + str(point['x']) + ',' + str(point['y']) + '.pt'
    bboxes = torch.load(bbox_path, map_location=torch.device('cpu'))

    if bboxes.shape[0] > 100:
        bboxes = bboxes[:100]
    elif bboxes.shape[0] < 100:
        pad = torch.zeros(100 - bboxes.shape[0], bboxes.shape[1])
        bboxes = torch.cat([bboxes, pad], dim=0)

    current_sample.img_bbox = bboxes

    # Context features --------------------
    if self.config.use_context:
        context_path = self.context_folder + str(data['id'])
        context_path += ',' + str(point['x']) + ',' + str(point['y'])
        context_path += '.pt'

        context_feat = torch.load(context_path, map_location=torch.device('cpu'))
        context_feat = context_feat.squeeze()
        orig_dim = context_feat.shape[0]

        if self.config.pad_context:
            if context_feat.shape[0] > 100:
                context_feat = context_feat[:100]
            elif context_feat.shape[0] < 100:
                pad = torch.zeros(100 - context_feat.shape[0],
                                  context_feat.shape[1])
                context_feat = torch.cat([context_feat, pad], dim=0)

        current_sample.context_feature_0 = context_feat
    # ---------------------------------------------

    return current_sample
def load_item(self, idx):
    sample_info = self.imdb[idx]
    current_sample = Sample()
    current_sample.dataset_name = self.dataset

    if self.dataset == 'train_vqa':
        text_processor_argument = {"tokens": sample_info["question_tokens"]}
        processed_question = self.text_processor(text_processor_argument)
        current_sample.text_len = torch.tensor(
            len(sample_info["question_tokens"]), dtype=torch.int
        )
        current_sample.text = processed_question["text"]
        current_sample.question_text = sample_info["question_str"]
        current_sample.text_sq = current_sample.text
        current_sample.text_oq = current_sample.text
        current_sample.reasoning_question = sample_info["question_str"]
        current_sample.reasoning_answer = sample_info["answers"][0]
        current_sample.sub_question = sample_info["question_str"]
        current_sample.other_question = sample_info["question_str"]

    elif self.dataset == 'train_introspect' or self.dataset == 'test':
        text_processor_argument = {"text": sample_info["main_question_str"]}
        processed_question = self.text_processor(text_processor_argument)
        current_sample.text = processed_question["text"]

        if "sub_question_str" in sample_info:
            text_processor_argument_sq = {"text": sample_info["sub_question_str"]}
            processed_question_sq = self.text_processor(text_processor_argument_sq)
            current_sample.text_sq = processed_question_sq["text"]

        if "other_question_str" in sample_info:
            text_processor_argument_oq = {"text": sample_info["other_question_str"]}
            processed_question_oq = self.text_processor(text_processor_argument_oq)
            current_sample.text_oq = processed_question_oq["text"]

        current_sample.question_text = sample_info["main_question_str"]
        current_sample.reasoning_question = sample_info["main_question_str"]
        current_sample.reasoning_answer = sample_info["main_answer_str"][0]
        current_sample.sub_question = sample_info["sub_question_str"]
        current_sample.other_question = sample_info["other_question_str"]
        current_sample.text_len = torch.tensor(
            len(sample_info["main_question_tokens"]), dtype=torch.int
        )

    else:
        text_processor_argument = {"text": sample_info["question_str"]}
        processed_question = self.text_processor(text_processor_argument)
        current_sample.text = processed_question["text"]

        if "sub_question_str" in sample_info:
            text_processor_argument_sq = {"text": sample_info["sub_question_str"]}
            processed_question_sq = self.text_processor(text_processor_argument_sq)
            current_sample.text_sq = processed_question_sq["text"]

        if "other_question_str" in sample_info:
            text_processor_argument_oq = {"text": sample_info["other_question_str"]}
            processed_question_oq = self.text_processor(text_processor_argument_oq)
            current_sample.text_oq = processed_question_oq["text"]
        else:
            current_sample.text_oq = current_sample.text_sq

        current_sample.question_text = sample_info["question_str"]
        current_sample.reasoning_question = sample_info["question_str"]
        current_sample.reasoning_answer = sample_info["answers"][0]
        current_sample.sub_question = sample_info["sub_question_str"]
        current_sample.other_question = sample_info["sub_question_str"]
        current_sample.text_len = torch.tensor(
            len(sample_info["question_tokens"]), dtype=torch.int
        )

    current_sample.question_id = torch.tensor(
        sample_info["question_id"], dtype=torch.int
    )

    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = torch.tensor(
            sample_info["image_id"], dtype=torch.int
        )
    else:
        current_sample.image_id = sample_info["image_id"]

    if self._use_features is True:
        features = self.features_db[idx]
        current_sample.update(features)

    # Add details for OCR like OCR bbox, vectors, tokens here
    current_sample = self.add_ocr_details(sample_info, current_sample)
    # Depending on whether we are using soft copy this can add
    # dynamic answer space
    current_sample = self.add_answer_info(sample_info, current_sample)

    return current_sample
def get_item(self, idx):
    data = self.vqamb_data[idx]
    current_sample = Sample()

    # store question and image id
    current_sample.img_id = data['id']
    current_sample.qa_id = data['qa_index']

    # store points
    current_sample.points = data['points']

    obj = data['all_objs'][0]
    xmin, ymin = obj['x'], obj['y']
    xmax, ymax = obj['x'] + obj['w'], obj['y'] + obj['h']
    current_sample.gt_bbox = torch.Tensor([xmin, ymin, xmax, ymax])

    # process question
    question = data["question"]
    tokens = tokenize(question, remove=["?"])
    processed = self.text_processor({"tokens": tokens})
    current_sample.text = processed["text"]

    # process answers
    processed = self.answer_processor({"answers": data['all_ans']})
    current_sample.answers = processed["answers"]
    current_sample.targets = processed["answers_scores"][1:]  # remove unknown index

    # Detectron features ----------------
    # TODO: read in detectron image instead if detectron is to be built
    detectron_path = self.detectron_folder + str(data['id'])
    bbox_path = self.bbox_folder + str(data['id'])
    if 'pt' in self.detectron_folder:
        point = data['points'][0]
        detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
        bbox_path += ',' + str(point['x']) + ',' + str(point['y'])
    detectron_path += '.pt'
    bbox_path += '.pt'

    detectron_feat = torch.load(
        detectron_path, map_location=torch.device('cpu')
    ).squeeze()
    # bbox_feat = torch.load(bbox_path, map_location=torch.device('cpu')).squeeze()

    # pre-processing for grid features only: flatten the spatial map into rows
    # detectron_feat = detectron_feat.view(detectron_feat.shape[0], -1).T

    # Pad features to fixed length
    MAX_FEAT = 608 if self.config.grid else 100

    if self.config.pad_detectron:
        if detectron_feat.shape[0] > MAX_FEAT:
            detectron_feat = detectron_feat[:MAX_FEAT]
            # bbox_feat = bbox_feat[:MAX_FEAT]
        elif detectron_feat.shape[0] < MAX_FEAT:
            pad = torch.zeros(MAX_FEAT - detectron_feat.shape[0],
                              detectron_feat.shape[1])
            detectron_feat = torch.cat([detectron_feat, pad], dim=0)
            # NOTE: the original also padded bbox_feat here, which only works
            # when the commented-out torch.load above is re-enabled:
            # pad = torch.zeros(MAX_FEAT - bbox_feat.shape[0], bbox_feat.shape[1])
            # bbox_feat = torch.cat([bbox_feat, pad], dim=0)

    # current_sample.bbox = bbox_feat
    current_sample.image_feature_0 = detectron_feat
    # ---------------------------------------------

    return current_sample
def get_item(self, idx):
    data = self.objpart_data[idx]
    current_sample = Sample()

    # store question and image id
    current_sample.img_id = data['id']
    # current_sample.qa_id = data['qa_id']

    current_sample.part = 1 if data['ans'] == 'part' else 0

    # store points
    current_sample.point = data['point']

    # process question
    question = data["question"]
    tokens = tokenize(question, remove=["?"])
    processed = self.text_processor({"tokens": tokens})
    current_sample.text = processed["text"]

    # process answers
    processed = self.answer_processor({"answers": [data['ans']]})
    current_sample.answers = processed["answers"]
    current_sample.targets = processed["answers_scores"][1:]  # remove unknown index

    # Detectron features ----------------
    # TODO: read in detectron image instead if detectron is to be built
    detectron_path = self.detectron_folder + str(data['id'])
    if 'pt' in self.detectron_folder:  # hacky way of assessing point supervision
        point = data['point']
        detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
    detectron_path += '.pt'

    detectron_feat = torch.load(
        detectron_path, map_location=torch.device('cpu')
    ).squeeze()

    # pre-processing for grid features only: flatten the spatial map into rows
    # detectron_feat = detectron_feat.view(detectron_feat.shape[0], -1).T

    # Pad features to fixed length
    MAX_FEAT = 100
    if self.config.pad_detectron:
        if detectron_feat.shape[0] > MAX_FEAT:
            detectron_feat = detectron_feat[:MAX_FEAT]
        elif detectron_feat.shape[0] < MAX_FEAT:
            pad = torch.zeros(MAX_FEAT - detectron_feat.shape[0],
                              detectron_feat.shape[1])
            detectron_feat = torch.cat([detectron_feat, pad], dim=0)

    current_sample.image_feature_0 = detectron_feat
    # ---------------------------------------------

    return current_sample
def load_item(self, idx):
    sample_info = self.imdb[idx]
    current_sample = Sample()

    if "question_tokens" in sample_info:
        text_processor_argument = {"tokens": sample_info["question_tokens"]}
    else:
        # text_processor_argument = {"text": sample_info["question"]}
        text_processor_argument = {"text": sample_info["main_question_str"]}
    processed_question = self.text_processor(text_processor_argument)
    current_sample.text = processed_question["text"]

    # NOTE: process sub/other questions only when present; the original ran
    # the processors unconditionally, which raises a NameError when the
    # corresponding *_str key is missing
    if "sub_question_str" in sample_info:
        text_processor_argument_sq = {"text": sample_info["sub_question_str"]}
        processed_question_sq = self.text_processor(text_processor_argument_sq)
        current_sample.text_sq = processed_question_sq["text"]

    if "other_question_str" in sample_info:
        text_processor_argument_oq = {"text": sample_info["other_question_str"]}
        processed_question_oq = self.text_processor(text_processor_argument_oq)
        current_sample.text_oq = processed_question_oq["text"]

    current_sample.question_text = sample_info["main_question_str"]
    current_sample.reasoning_question = sample_info["main_question_str"]
    current_sample.reasoning_answer = sample_info["main_answer_str"][0]
    # current_sample.image_url = sample_info["img_path"]
    current_sample.image_url = sample_info["image_path"]
    current_sample.sub_question = sample_info["sub_question_str"]
    current_sample.other_question = sample_info["other_question_str"]

    current_sample.question_id = torch.tensor(
        sample_info["question_id"], dtype=torch.int
    )

    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = torch.tensor(
            sample_info["image_id"], dtype=torch.int
        )
    else:
        current_sample.image_id = sample_info["image_id"]

    current_sample.text_len = torch.tensor(
        # len(sample_info["question_tokens"]), dtype=torch.int
        len(sample_info["main_question_tokens"]), dtype=torch.int
    )

    if self._use_features is True:
        features = self.features_db[idx]
        current_sample.update(features)

    # Add details for OCR like OCR bbox, vectors, tokens here
    current_sample = self.add_ocr_details(sample_info, current_sample)
    # Depending on whether we are using soft copy this can add
    # dynamic answer space
    current_sample = self.add_answer_info(sample_info, current_sample)

    return current_sample