# Shared imports for the snippets in this file (the originals omit them).
# Sample and SampleList are assumed to live in pythia.common.sample, as in
# the Pythia codebase these snippets come from.
import gc
import time

import torch
from PIL import Image
from pythia.common.sample import Sample, SampleList


def prepare_batch(self, batch):
    """
    Can possibly be overridden in your child class.

    Prepare batch for passing to model. Whatever is returned from here
    will be directly passed to the model's forward function.

    Parameters
    ----------
    batch: dict
        Dictionary containing information about the next sample in
        batched form.

    Returns
    -------
    batch: SampleList
        The batch converted to a SampleList (if it was not one already)
        and moved to the proper device.
    """
    # Should be a SampleList
    if not isinstance(batch, SampleList):
        # Try converting to SampleList
        batch = SampleList(batch)
    batch = batch.to(self._device)
    return batch
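
# A minimal, self-contained sketch of the conversion prepare_batch performs,
# assuming Pythia's Sample/SampleList as imported above (the field names
# here are illustrative only, not part of the original snippet):

def _sketch_prepare_batch():
    raw = [
        Sample({"text": torch.zeros(14, dtype=torch.long)}),
        Sample({"text": torch.ones(14, dtype=torch.long)}),
    ]
    batch = SampleList(raw)  # collate the list of Samples into one batch
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return batch.to(device)  # every tensor field moves to the device
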
def predict(self, url, feat_name, get_features=False):
    with torch.no_grad():
        detectron_features = get_detectron_features(
            [url], self.detection_model, False, feat_name, self.cuda_device
        )
        # returns a single-element list
        detectron_features = detectron_features[0]

        sample = Sample()
        sample.dataset_name = "coco"
        sample.dataset_type = "test"
        sample.image_feature_0 = detectron_features
        # Answers act only as a placeholder here; the contents do not
        # matter, only the shape the model expects.
        sample.answers = torch.zeros((5, 10), dtype=torch.long)

        sample_list = SampleList([sample])
        sample_list = sample_list.to(self.cuda_device)

        tokens = self.caption_model(sample_list)["captions"]

    gc.collect()
    torch.cuda.empty_cache()

    if not get_features:
        return tokens
    else:
        return tokens, detectron_features
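
# Hypothetical usage of the captioning predict() above; the wrapper
# instance, URL, and feature name below are placeholders, not part of
# the original snippet:
#
#   demo = ...  # instance of the class owning predict()
#   tokens, feats = demo.predict(
#       "http://images.cocodataset.org/val2017/000000000139.jpg",
#       feat_name="fc6",
#       get_features=True,
#   )
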
def forward(self, images, image_scales, transitions=None):
    feature_list = self.encoder(images, image_scales)
    image_features = feature_list[0]
    assert len(feature_list) == 1, 'current model only supports batch size 1'

    sample = Sample()
    sample.dataset_name = "coco"
    sample.dataset_type = "test"
    sample.image_feature_0 = image_features
    # Answers seem to act as a placeholder here, so their size does not
    # matter to the decoder.
    sample.answers = torch.zeros((1, 10), dtype=torch.long)

    sample_list = SampleList([sample])
    # `device` is assumed to be defined at module scope.
    sample_list = sample_list.to(device)
    if transitions is not None:
        sample_list.transitions = transitions

    output = self.decoder(sample_list)
    tokens = output['captions']
    caption = tokens.tolist()[0]
    caption = self.decoder.caption_processor(caption)['caption']
    return caption
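
# Hypothetical call of forward() above; note that the assert limits it
# to a batch of exactly one image (names below are placeholders):
#
#   caption = captioner(images, image_scales)  # default decoding
#   constrained = captioner(images, image_scales, transitions=my_transitions)
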
def predict(self, img_paths, qud):
    """
    Batch prediction: answers one question over a list of images.

    :param img_paths: list of image paths to run the question against
    :param qud: the question (text) asked of every image in the batch
    :return: (batch_probs, batch_answers), each a list of per-image
        top-5 lists
    """
    with torch.no_grad():
        # a list of image features
        detectron_features = self.get_detectron_features(img_paths)
        # [batch_size, 196, 2048]
        resnet_features = self.get_resnet_features(img_paths)

        sample_list = []
        for i in range(len(detectron_features)):
            sample = Sample()

            processed_text = self.vqa_demo.text_processor({"text": qud})
            sample.text = processed_text["text"]
            sample.text_len = len(processed_text["tokens"])

            sample.image_feature_0 = detectron_features[i]
            sample.image_info_0 = Sample(
                {"max_features": torch.tensor(100, dtype=torch.long)}
            )
            sample.image_feature_1 = resnet_features[i]
            sample_list.append(sample)

        sample_list = SampleList(sample_list)
        sample_list = sample_list.to("cuda")

        scores = self.vqa_demo.pythia_model(sample_list)["scores"]
        scores = torch.nn.functional.softmax(scores, dim=1)
        actual, indices = scores.topk(5, dim=1)

        batch_probs = []
        batch_answers = []
        for i in range(scores.shape[0]):
            top_indices = indices[i]
            top_scores = actual[i]
            probs = []
            answers = []
            for idx, score in enumerate(top_scores):
                probs.append(score.item())
                answers.append(
                    self.vqa_demo.answer_processor.idx2word(
                        top_indices[idx].item()
                    )
                )
            batch_probs.append(probs)
            batch_answers.append(answers)

    # if memory becomes an issue, clear the caches here:
    # gc.collect()
    # torch.cuda.empty_cache()

    # each returned list has one entry per image in the batch:
    # [[ans_1, ans_2, ...], [ans_1, ans_2, ...]]
    return batch_probs, batch_answers
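
# Hypothetical usage of the batched predict() above (instance and paths
# are placeholders):
#
#   probs, answers = model.predict(
#       ["img_0001.jpg", "img_0002.jpg"],  # one feature set per image
#       "what color is the car?",          # the same question for each image
#   )
#   # probs   -> [[p_1, ..., p_5], [p_1, ..., p_5]]   per-image top-5 scores
#   # answers -> [[a_1, ..., a_5], [a_1, ..., a_5]]   per-image top-5 strings
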
def predict(self, url):
    with torch.no_grad():
        detectron_features = self.get_detectron_features(url)

        sample = Sample()
        sample.dataset_name = "coco"
        sample.dataset_type = "test"
        sample.image_feature_0 = detectron_features
        sample.answers = torch.zeros((5, 10), dtype=torch.long)

        sample_list = SampleList([sample])
        sample_list = sample_list.to("cuda")

        tokens = self.pythia_model(sample_list)["captions"]

    gc.collect()
    torch.cuda.empty_cache()
    return tokens
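
# The tokens returned above are still indices; following the decoding
# pattern in the forward() snippet earlier, a caption string could be
# recovered roughly like this (a caption_processor is assumed to be
# available, as in that snippet):
#
#   tokens = demo.predict(url)
#   caption = demo.caption_processor(tokens.tolist()[0])["caption"]
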
def prepare_batch(self, batch):
    """
    Can possibly be overridden in your child class.

    Prepare batch for passing to model. Whatever is returned from here
    will be directly passed to the model's forward function. Currently
    moves the batch to the proper device.

    Args:
        batch (SampleList): sample list containing the currently loaded batch

    Returns:
        sample_list (SampleList): a sample list representing the current
            batch loaded
    """
    # Should be a SampleList
    if not isinstance(batch, SampleList):
        # Try converting to SampleList
        batch = SampleList(batch)
    batch = batch.to(self._device)
    return batch
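
# In a trainer, prepare_batch is typically called on every batch right
# before the forward pass; a sketch, assuming a Pythia-style trainer
# with self.model and a dataloader (names are illustrative):
#
#   for batch in dataloader:
#       prepared = self.prepare_batch(batch)
#       output = self.model(prepared)
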
def predict(self, url, question):
    with torch.no_grad():
        detectron_features = self.get_detectron_features(url)
        resnet_features = self.get_resnet_features(url)

        sample = Sample()

        processed_text = self.text_processor({"text": question})
        sample.text = processed_text["text"]
        sample.text_len = len(processed_text["tokens"])

        sample.image_feature_0 = detectron_features
        sample.image_info_0 = Sample(
            {"max_features": torch.tensor(100, dtype=torch.long)}
        )
        sample.image_feature_1 = resnet_features

        sample_list = SampleList([sample])
        sample_list = sample_list.to("cuda")

        scores = self.pythia_model(sample_list)["scores"]
        scores = torch.nn.functional.softmax(scores, dim=1)
        actual, indices = scores.topk(5, dim=1)

        top_indices = indices[0]
        top_scores = actual[0]
        probs = []
        answers = []
        for idx, score in enumerate(top_scores):
            probs.append(score.item())
            answers.append(
                self.answer_processor.idx2word(top_indices[idx].item())
            )

    gc.collect()
    torch.cuda.empty_cache()
    return probs, answers
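
# Hypothetical usage of the single-image VQA predict() above (instance
# and inputs are placeholders):
#
#   probs, answers = demo.predict(image_url, "how many dogs are there?")
#   for p, a in zip(probs, answers):
#       print(f"{a}: {p:.3f}")
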
def getAnswers(self, image, question, meta=None):
    first = time.time()
    meta = meta or str(image)

    # Accept either a path or an already-open PIL image.
    image = (
        Image.open(image).convert('RGB')
        if isinstance(image, str)
        else image.convert('RGB')
    )

    print(f'Tiki : Getting Answers : {meta}, {question}')

    with torch.no_grad():
        detectron_features = self.get_detectron_features(image)
        resnet152_features = self.get_resnet152_features(image)

        start = time.time()

        sample = Sample()

        processed_text = self.text_processor({'text': question})
        sample.text = processed_text['text']
        sample.text_len = len(processed_text['tokens'])

        sample.image_feature_0 = detectron_features
        sample.image_info_0 = Sample(
            {'max_features': torch.tensor(100, dtype=torch.long)}
        )
        sample.image_feature_1 = resnet152_features

        sample_list = SampleList([sample])
        sample_list = sample_list.to(self.device.type)

        scores = self.pythiaVQA_model(sample_list)['scores']
        scores = torch.nn.functional.softmax(scores, dim=1)
        actual, indices = scores.topk(5, dim=1)

        top_indices = indices[0]
        top_scores = actual[0]
        answers = []
        for rank, score in enumerate(top_scores):
            answers.append({
                'rank': rank,
                'answer': self.answer_processor.idx2word(
                    top_indices[rank].item()
                ),
                'probability': score.item(),
            })
        answer = answers[0]['answer']

        end = time.time()
        print(
            f'Tiki : Getting Answers : PythiaVQA - Finished in {end - start:7.3f} Seconds'
        )
        # `processing` is assumed to be a module-level timing dict.
        processing['PythiaVQA'] = end - start

    gc.collect()
    torch.cuda.empty_cache()

    last = time.time()
    processing['InferTime'] = last - first

    return question, answer, answers
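
# Hypothetical usage of getAnswers() above; it accepts a path or a PIL
# image and returns the question, the top answer, and the ranked top-5
# (instance and path are placeholders):
#
#   question, best, top5 = tiki.getAnswers('kitchen.jpg', 'what is on the table?')
#   print(best, top5[0]['probability'])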