# Excerpted methods: they assume `torch`, `Sample`, and `SampleList`
# (Pythia/MMF) plus `time`, `gc`, and PIL's `Image` are imported at
# module level.


def load_item(self, idx):
    sample_info = self.imdb[idx]
    current_sample = Sample()

    text_processor_argument = {"tokens": sample_info["question_tokens"]}
    processed_question = self.text_processor(text_processor_argument)
    current_sample.text = processed_question["text"]
    current_sample.question_id = torch.tensor(
        sample_info["question_id"], dtype=torch.int)

    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = torch.tensor(
            sample_info["image_id"], dtype=torch.int)
    else:
        current_sample.image_id = sample_info["image_id"]

    current_sample.text_len = torch.tensor(
        len(sample_info["question_tokens"]), dtype=torch.int)

    if self._use_features is True:
        features = self.features_db[idx]
        current_sample.update(features)

    # Add details for OCR like OCR bbox, vectors, tokens here
    current_sample = self.add_ocr_details(sample_info, current_sample)
    # Depending on whether we are using soft copy this can add
    # dynamic answer space
    current_sample = self.add_answer_info(sample_info, current_sample)

    return current_sample
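
# A minimal sketch (not from the source) of how items returned by load_item
# are typically batched. It assumes Pythia/MMF-style Sample and SampleList
# semantics; the dataset instance `vqa_dataset` is hypothetical.
from torch.utils.data import DataLoader


def collate_samples(batch):
    # SampleList stacks equally-shaped tensor fields (text, text_len, ...)
    # across the batch and leaves non-tensor fields as lists.
    return SampleList(batch)


loader = DataLoader(vqa_dataset, batch_size=8, collate_fn=collate_samples)
for sample_list in loader:
    print(sample_list.text.shape, sample_list.text_len)
    break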
def predict(self, img_paths, qud):
    """Run batch prediction: one question against a list of images.

    :return: (batch_probs, batch_answers), one top-5 list per image
    """
    with torch.no_grad():
        detectron_features = self.get_detectron_features(
            img_paths)  # a list of image features
        resnet_features = self.get_resnet_features(
            img_paths)  # [batch_size, 196, 2048]

        sample_list = []
        for i in range(len(detectron_features)):
            sample = Sample()
            # the question is the same for every image in the batch
            processed_text = self.vqa_demo.text_processor({"text": qud})
            sample.text = processed_text["text"]
            sample.text_len = len(processed_text["tokens"])

            sample.image_feature_0 = detectron_features[i]
            sample.image_info_0 = Sample(
                {"max_features": torch.tensor(100, dtype=torch.long)})
            sample.image_feature_1 = resnet_features[i]

            sample_list.append(sample)

        sample_list = SampleList(sample_list)
        sample_list = sample_list.to("cuda")

        scores = self.vqa_demo.pythia_model(sample_list)["scores"]
        scores = torch.nn.functional.softmax(scores, dim=1)
        actual, indices = scores.topk(5, dim=1)

        batch_probs = []
        batch_answers = []

        for i in range(scores.shape[0]):
            top_indices = indices[i]
            top_scores = actual[i]

            probs = []
            answers = []
            for idx, score in enumerate(top_scores):
                probs.append(score.item())
                answers.append(
                    self.vqa_demo.answer_processor.idx2word(
                        top_indices[idx].item()))

            batch_probs.append(probs)
            batch_answers.append(answers)

    # if memory becomes an issue, clear the cache here
    # gc.collect()
    # torch.cuda.empty_cache()

    # each returned list has length batch_size:
    # [[ans_1, ans_2, ...], [ans_1, ans_2, ...], ...]
    return batch_probs, batch_answers
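
# Hypothetical usage of the batch predict above; the demo object, image
# paths, and question are placeholders, not taken from the source.
demo = PythiaDemo()  # assumed wrapper exposing vqa_demo and feature extractors
probs, answers = demo.predict(
    ["cat.jpg", "street.jpg"], "What is in the picture?")
for p, a in zip(probs[0], answers[0]):
    print(f"{a}: {p:.3f}")  # top-5 answers for the first image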
def load_item(self, idx):
    sample_info = self.imdb[idx]
    current_sample = Sample()

    current_sample.question_id = torch.tensor(
        sample_info["question_id"], dtype=torch.int)

    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = torch.tensor(
            sample_info["image_id"], dtype=torch.int)
    else:
        current_sample.image_id = sample_info["image_id"]

    if self._use_features is True:
        features = self.features_db[idx]
        current_sample.update(features)

    # 1. Load text (question words)
    # breaking change from VQA2Dataset: load the entire question string,
    # not pre-tokenized questions, since we switch to the BERT tokenizer
    # in M4C and do online tokenization
    question_str = (sample_info["question"]
                    if "question" in sample_info
                    else sample_info["question_str"])
    processed_question = self.text_processor({"question": question_str})
    current_sample.text = processed_question["token_inds"]
    current_sample.text_len = processed_question["token_num"]

    # 2. Load object bounding box information
    current_sample.obj_bbox = self.copy_processor(
        {"blob": sample_info["obj_normalized_boxes"]})["bbox"]
    # cap the number of valid object regions at 36
    current_sample.obj_mask = min(
        self.copy_processor(
            {"blob": sample_info["obj_normalized_boxes"]})["mask"],
        36)

    # Add details for OCR like OCR bbox, vectors, tokens here
    current_sample = self.add_ocr_details(sample_info, current_sample)
    # Depending on whether we are using soft copy this can add
    # dynamic answer space
    current_sample = self.add_answer_info(sample_info, current_sample)

    return current_sample
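
# A rough illustration (assumed, not from the source) of the contract the
# M4C-style text_processor is expected to satisfy: fixed-length token ids
# plus the true token count. HuggingFace transformers is used as a stand-in
# for the actual BERT tokenizer wiring.
import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def process_question(question_str, max_len=20):
    ids = tokenizer.encode(question_str, add_special_tokens=True)[:max_len]
    token_num = torch.tensor(len(ids), dtype=torch.long)
    token_inds = torch.zeros(max_len, dtype=torch.long)  # zero-padded
    token_inds[:len(ids)] = torch.tensor(ids, dtype=torch.long)
    return {"token_inds": token_inds, "token_num": token_num}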
def predict(self, url, question):
    with torch.no_grad():
        detectron_features = self.get_detectron_features(url)
        resnet_features = self.get_resnet_features(url)

        sample = Sample()

        processed_text = self.text_processor({"text": question})
        sample.text = processed_text["text"]
        sample.text_len = len(processed_text["tokens"])

        sample.image_feature_0 = detectron_features
        sample.image_info_0 = Sample({
            "max_features": torch.tensor(100, dtype=torch.long)
        })
        sample.image_feature_1 = resnet_features

        sample_list = SampleList([sample])
        sample_list = sample_list.to("cuda")

        scores = self.pythia_model(sample_list)["scores"]
        scores = torch.nn.functional.softmax(scores, dim=1)
        actual, indices = scores.topk(5, dim=1)

        top_indices = indices[0]
        top_scores = actual[0]

        probs = []
        answers = []
        for idx, score in enumerate(top_scores):
            probs.append(score.item())
            answers.append(
                self.answer_processor.idx2word(top_indices[idx].item()))

    gc.collect()
    torch.cuda.empty_cache()

    return probs, answers
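
# The softmax + top-k decoding step above, factored out as a standalone
# helper for clarity. This is a sketch; `idx2word` stands in for the
# answer_processor's vocabulary lookup and is an assumed callable.
import torch


def decode_topk(logits, idx2word, k=5):
    probs = torch.nn.functional.softmax(logits, dim=1)
    top_probs, top_inds = probs.topk(k, dim=1)
    return ([p.item() for p in top_probs[0]],
            [idx2word(i.item()) for i in top_inds[0]])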
def getAnswers(self, image, question, meta=None):
    first = time.time()
    meta = meta or str(image)
    image = (Image.open(image).convert('RGB')
             if isinstance(image, str)
             else image.convert('RGB'))

    print(f'Tiki : Getting Answers : {meta}, {question}')

    with torch.no_grad():
        detectron_features = self.get_detectron_features(image)
        resnet152_features = self.get_resnet152_features(image)

        start = time.time()

        sample = Sample()

        processed_text = self.text_processor({'text': question})
        sample.text = processed_text['text']
        sample.text_len = len(processed_text['tokens'])

        sample.image_feature_0 = detectron_features
        sample.image_info_0 = Sample(
            {'max_features': torch.tensor(100, dtype=torch.long)})
        sample.image_feature_1 = resnet152_features

        sample_list = SampleList([sample])
        sample_list = sample_list.to(self.device.type)

        scores = self.pythiaVQA_model(sample_list)['scores']
        scores = torch.nn.functional.softmax(scores, dim=1)
        actual, indices = scores.topk(5, dim=1)

        top_indices = indices[0]
        top_scores = actual[0]

        answers = []
        for rank, score in enumerate(top_scores):
            answers.append({
                'rank': rank,
                'answer': self.answer_processor.idx2word(
                    top_indices[rank].item()),
                'probability': score.item()
            })

        answer = answers[0]['answer']

        end = time.time()

    print(f'Tiki : Getting Answers : PythiaVQA - Finished in '
          f'{end - start:7.3f} Seconds')

    # `processing` is a module-level dict collecting per-stage timings
    processing['PythiaVQA'] = end - start

    gc.collect()
    torch.cuda.empty_cache()

    last = time.time()
    processing['InferTime'] = last - first

    return question, answer, answers
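
# Hypothetical call into getAnswers; the engine object and image path are
# placeholders, not from the source.
engine = TikiEngine()  # assumed class exposing getAnswers
question, best, ranked = engine.getAnswers(
    'kitchen.jpg', 'What is on the table?')
for entry in ranked:
    print(entry['rank'], entry['answer'], f"{entry['probability']:.3f}")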
def load_item(self, idx):
    sample_info = self.imdb[idx]
    current_sample = Sample()
    current_sample.dataset_name = self.dataset

    if self.dataset == 'train_vqa':
        text_processor_argument = {"tokens": sample_info["question_tokens"]}
        processed_question = self.text_processor(text_processor_argument)
        current_sample.text_len = torch.tensor(
            len(sample_info["question_tokens"]), dtype=torch.int)
        current_sample.text = processed_question["text"]
        current_sample.question_text = sample_info["question_str"]
        # plain VQA items have no sub/other questions; reuse the main one
        current_sample.text_sq = current_sample.text
        current_sample.text_oq = current_sample.text
        current_sample.reasoning_question = sample_info["question_str"]
        current_sample.reasoning_answer = sample_info["answers"][0]
        current_sample.sub_question = sample_info["question_str"]
        current_sample.other_question = sample_info["question_str"]

    elif self.dataset == 'train_introspect' or self.dataset == 'test':
        text_processor_argument = {"text": sample_info["main_question_str"]}
        processed_question = self.text_processor(text_processor_argument)
        current_sample.text = processed_question["text"]
        if "sub_question_str" in sample_info:
            text_processor_argument_sq = {
                "text": sample_info["sub_question_str"]
            }
            processed_question_sq = self.text_processor(
                text_processor_argument_sq)
            current_sample.text_sq = processed_question_sq["text"]
        if "other_question_str" in sample_info:
            text_processor_argument_oq = {
                "text": sample_info["other_question_str"]
            }
            processed_question_oq = self.text_processor(
                text_processor_argument_oq)
            current_sample.text_oq = processed_question_oq["text"]
        current_sample.question_text = sample_info["main_question_str"]
        current_sample.reasoning_question = sample_info["main_question_str"]
        current_sample.reasoning_answer = sample_info["main_answer_str"][0]
        current_sample.sub_question = sample_info["sub_question_str"]
        current_sample.other_question = sample_info["other_question_str"]
        current_sample.text_len = torch.tensor(
            len(sample_info["main_question_tokens"]), dtype=torch.int)

    else:
        text_processor_argument = {"text": sample_info["question_str"]}
        processed_question = self.text_processor(text_processor_argument)
        current_sample.text = processed_question["text"]
        if "sub_question_str" in sample_info:
            text_processor_argument_sq = {
                "text": sample_info["sub_question_str"]
            }
            processed_question_sq = self.text_processor(
                text_processor_argument_sq)
            current_sample.text_sq = processed_question_sq["text"]
        if "other_question_str" in sample_info:
            text_processor_argument_oq = {
                "text": sample_info["other_question_str"]
            }
            processed_question_oq = self.text_processor(
                text_processor_argument_oq)
            current_sample.text_oq = processed_question_oq["text"]
        else:
            # fall back to the sub-question when no other-question exists
            current_sample.text_oq = current_sample.text_sq
        current_sample.question_text = sample_info["question_str"]
        current_sample.reasoning_question = sample_info["question_str"]
        current_sample.reasoning_answer = sample_info["answers"][0]
        current_sample.sub_question = sample_info["sub_question_str"]
        current_sample.other_question = sample_info["sub_question_str"]
        current_sample.text_len = torch.tensor(
            len(sample_info["question_tokens"]), dtype=torch.int)

    current_sample.question_id = torch.tensor(
        sample_info["question_id"], dtype=torch.int)

    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = torch.tensor(
            sample_info["image_id"], dtype=torch.int)
    else:
        current_sample.image_id = sample_info["image_id"]

    if self._use_features is True:
        features = self.features_db[idx]
        current_sample.update(features)

    # Add details for OCR like OCR bbox, vectors, tokens here
    current_sample = self.add_ocr_details(sample_info, current_sample)
    # Depending on whether we are using soft copy this can add
    # dynamic answer space
    current_sample = self.add_answer_info(sample_info, current_sample)

    return current_sample
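
# An illustrative (assumed) imdb entry for the 'train_introspect' branch
# above; field names mirror the keys the loader reads, values are made up.
sample_info_example = {
    "question_id": 101,
    "image_id": 42,
    "main_question_str": "Is the man wearing a helmet?",
    "main_question_tokens": ["is", "the", "man", "wearing", "a", "helmet"],
    "main_answer_str": ["yes"],
    "sub_question_str": "Is there a man?",
    "other_question_str": "Is it sunny?",
}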
def load_item(self, idx):
    sample_info = self.imdb[idx]
    current_sample = Sample()

    if "question_tokens" in sample_info:
        text_processor_argument = {"tokens": sample_info["question_tokens"]}
    else:
        text_processor_argument = {"text": sample_info["main_question_str"]}
    processed_question = self.text_processor(text_processor_argument)
    current_sample.text = processed_question["text"]

    # sub- and other-questions are optional; only process them when present
    # (the original ran the processors unconditionally, which raised a
    # NameError whenever either key was missing)
    if "sub_question_str" in sample_info:
        processed_question_sq = self.text_processor(
            {"text": sample_info["sub_question_str"]})
        current_sample.text_sq = processed_question_sq["text"]
    if "other_question_str" in sample_info:
        processed_question_oq = self.text_processor(
            {"text": sample_info["other_question_str"]})
        current_sample.text_oq = processed_question_oq["text"]

    current_sample.question_text = sample_info["main_question_str"]
    current_sample.reasoning_question = sample_info["main_question_str"]
    current_sample.reasoning_answer = sample_info["main_answer_str"][0]
    current_sample.image_url = sample_info["image_path"]
    current_sample.sub_question = sample_info["sub_question_str"]
    current_sample.other_question = sample_info["other_question_str"]

    current_sample.question_id = torch.tensor(
        sample_info["question_id"], dtype=torch.int)

    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = torch.tensor(
            sample_info["image_id"], dtype=torch.int)
    else:
        current_sample.image_id = sample_info["image_id"]

    current_sample.text_len = torch.tensor(
        len(sample_info["main_question_tokens"]), dtype=torch.int)

    if self._use_features is True:
        features = self.features_db[idx]
        current_sample.update(features)

    # Add details for OCR like OCR bbox, vectors, tokens here
    current_sample = self.add_ocr_details(sample_info, current_sample)
    # Depending on whether we are using soft copy this can add
    # dynamic answer space
    current_sample = self.add_answer_info(sample_info, current_sample)

    return current_sample
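
# The id/feature bookkeeping repeated across the load_item variants above,
# factored into a helper as a sketch. Names are illustrative only.
import torch


def attach_ids_and_features(current_sample, sample_info, features_db,
                            idx, use_features):
    current_sample.question_id = torch.tensor(
        sample_info["question_id"], dtype=torch.int)
    # integer image ids become tensors; string ids pass through unchanged
    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = torch.tensor(
            sample_info["image_id"], dtype=torch.int)
    else:
        current_sample.image_id = sample_info["image_id"]
    if use_features:
        current_sample.update(features_db[idx])
    return current_sample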