def predict(self, url, feat_name, get_features=False):
    with torch.no_grad():
        detectron_features = get_detectron_features(
            [url], self.detection_model, False, feat_name, self.cuda_device
        )
        # get_detectron_features returns a single-element list
        detectron_features = detectron_features[0]

        sample = Sample()
        sample.dataset_name = "coco"
        sample.dataset_type = "test"
        sample.image_feature_0 = detectron_features
        sample.answers = torch.zeros((5, 10), dtype=torch.long)

        sample_list = SampleList([sample])
        sample_list = sample_list.to(self.cuda_device)

        tokens = self.caption_model(sample_list)["captions"]

    gc.collect()
    torch.cuda.empty_cache()

    if get_features:
        return tokens, detectron_features
    return tokens
def forward(self, images, image_scales, transitions=None):
    feature_list = self.encoder(images, image_scales)
    image_features = feature_list[0]
    assert len(feature_list) == 1, "current model only supports batch size 1"

    sample = Sample()
    sample.dataset_name = "coco"
    sample.dataset_type = "test"
    sample.image_feature_0 = image_features
    # "answers" only acts as a placeholder here, so its size does not matter
    sample.answers = torch.zeros((1, 10), dtype=torch.long)

    sample_list = SampleList([sample])
    sample_list = sample_list.to(device)
    if transitions is not None:
        sample_list.transitions = transitions

    output = self.decoder(sample_list)
    tokens = output["captions"]
    caption = tokens.tolist()[0]
    caption = self.decoder.caption_processor(caption)["caption"]
    return caption
def test_nucleus_sampling(self):
    vocab = text_utils.VocabFromText(self.VOCAB_EXAMPLE_SENTENCES)
    model_config = self.config.model_attributes.butd
    model = TestDecoderModel(model_config, vocab)
    model.build()
    model.to("cuda")
    model.eval()

    sample = Sample()
    sample.dataset_name = "coco"
    sample.dataset_type = "test"
    sample.image_feature_0 = torch.randn(100, 2048)
    sample.answers = torch.zeros((5, 10), dtype=torch.long)

    sample_list = SampleList([sample])
    tokens = model(sample_list)["captions"]

    # these are the expected tokens for sum_threshold = 0.5
    expected_tokens = [
        1.0000e+00, 2.9140e+03, 5.9210e+03, 2.2040e+03, 5.0550e+03,
        9.2240e+03, 4.5120e+03, 1.8200e+02, 3.6490e+03, 6.4090e+03,
        2.0000e+00,
    ]

    self.assertEqual(tokens[0].tolist(), expected_tokens)
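# A minimal sketch of the nucleus (top-p) sampling step that the test above
# exercises, assuming per-step logits over the vocabulary; `sum_threshold`
# mirrors the cumulative-probability cutoff named in the test. This is an
# illustration of the technique, not the decoder's actual implementation.
import torch
import torch.nn.functional as F

def nucleus_sample(logits, sum_threshold=0.5):
    probs = F.softmax(logits, dim=-1)
    sorted_probs, sorted_idx = torch.sort(probs, descending=True)
    # keep the smallest prefix of tokens whose mass reaches the threshold
    exclusive_cumsum = torch.cumsum(sorted_probs, dim=-1) - sorted_probs
    keep = exclusive_cumsum < sum_threshold
    sorted_probs = sorted_probs * keep
    sorted_probs = sorted_probs / sorted_probs.sum(dim=-1, keepdim=True)
    # sample one token id from the renormalized nucleus
    choice = torch.multinomial(sorted_probs, num_samples=1)
    return sorted_idx.gather(-1, choice)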
def predict(self, url):
    with torch.no_grad():
        detectron_features = self.get_detectron_features(url)

        sample = Sample()
        sample.dataset_name = "coco"
        sample.dataset_type = "test"
        sample.image_feature_0 = detectron_features
        sample.answers = torch.zeros((5, 10), dtype=torch.long)

        sample_list = SampleList([sample])
        sample_list = sample_list.to("cuda")

        tokens = self.pythia_model(sample_list)["captions"]

    gc.collect()
    torch.cuda.empty_cache()

    return tokens
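# The snippets above share one pattern: wrap precomputed image features in a
# Sample, batch it with SampleList, and feed that to the model. A minimal,
# self-contained sketch of that pattern, assuming the Pythia package layout
# (pythia.common.sample); the feature tensor here is a random stand-in for
# detectron output.
import torch
from pythia.common.sample import Sample, SampleList

def build_caption_batch(image_features, device="cuda"):
    sample = Sample()
    sample.dataset_name = "coco"  # selects the dataset-specific processors
    sample.dataset_type = "test"  # inference-time split
    sample.image_feature_0 = image_features  # e.g. (100, 2048) region features
    # placeholder answers; per the comment in forward above, their size is unused
    sample.answers = torch.zeros((5, 10), dtype=torch.long)
    return SampleList([sample]).to(device)

# hypothetical usage with random features standing in for detectron output
batch = build_caption_batch(torch.randn(100, 2048))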
def load_item(self, idx):
    sample_info = self.imdb[idx]
    current_sample = Sample()
    current_sample.dataset_name = self.dataset

    if self.dataset == 'train_vqa':
        text_processor_argument = {"tokens": sample_info["question_tokens"]}
        processed_question = self.text_processor(text_processor_argument)
        current_sample.text_len = torch.tensor(
            len(sample_info["question_tokens"]), dtype=torch.int)
        current_sample.text = processed_question["text"]
        current_sample.question_text = sample_info["question_str"]
        current_sample.text_sq = current_sample.text
        current_sample.text_oq = current_sample.text
        current_sample.reasoning_question = sample_info["question_str"]
        current_sample.reasoning_answer = sample_info["answers"][0]
        current_sample.sub_question = sample_info["question_str"]
        current_sample.other_question = sample_info["question_str"]

    elif self.dataset == 'train_introspect' or self.dataset == 'test':
        text_processor_argument = {"text": sample_info["main_question_str"]}
        processed_question = self.text_processor(text_processor_argument)
        current_sample.text = processed_question["text"]
        if "sub_question_str" in sample_info:
            text_processor_argument_sq = {
                "text": sample_info["sub_question_str"]
            }
            processed_question_sq = self.text_processor(
                text_processor_argument_sq)
            current_sample.text_sq = processed_question_sq["text"]
        if "other_question_str" in sample_info:
            text_processor_argument_oq = {
                "text": sample_info["other_question_str"]
            }
            processed_question_oq = self.text_processor(
                text_processor_argument_oq)
            current_sample.text_oq = processed_question_oq["text"]
        current_sample.question_text = sample_info["main_question_str"]
        current_sample.reasoning_question = sample_info["main_question_str"]
        current_sample.reasoning_answer = sample_info["main_answer_str"][0]
        current_sample.sub_question = sample_info["sub_question_str"]
        current_sample.other_question = sample_info["other_question_str"]
        current_sample.text_len = torch.tensor(
            len(sample_info["main_question_tokens"]), dtype=torch.int)

    else:
        text_processor_argument = {"text": sample_info["question_str"]}
        processed_question = self.text_processor(text_processor_argument)
        current_sample.text = processed_question["text"]
        if "sub_question_str" in sample_info:
            text_processor_argument_sq = {
                "text": sample_info["sub_question_str"]
            }
            processed_question_sq = self.text_processor(
                text_processor_argument_sq)
            current_sample.text_sq = processed_question_sq["text"]
        if "other_question_str" in sample_info:
            text_processor_argument_oq = {
                "text": sample_info["other_question_str"]
            }
            processed_question_oq = self.text_processor(
                text_processor_argument_oq)
            current_sample.text_oq = processed_question_oq["text"]
        else:
            current_sample.text_oq = current_sample.text_sq
        current_sample.question_text = sample_info["question_str"]
        current_sample.reasoning_question = sample_info["question_str"]
        current_sample.reasoning_answer = sample_info["answers"][0]
        current_sample.sub_question = sample_info["sub_question_str"]
        current_sample.other_question = sample_info["sub_question_str"]
        current_sample.text_len = torch.tensor(
            len(sample_info["question_tokens"]), dtype=torch.int)

    current_sample.question_id = torch.tensor(
        sample_info["question_id"], dtype=torch.int)

    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = torch.tensor(
            sample_info["image_id"], dtype=torch.int)
    else:
        current_sample.image_id = sample_info["image_id"]

    if self._use_features is True:
        features = self.features_db[idx]
        current_sample.update(features)

    # Add details for OCR like OCR bbox, vectors, tokens here
    current_sample = self.add_ocr_details(sample_info, current_sample)
    # Depending on whether we are using soft copy this can add
    # dynamic answer space
    current_sample = self.add_answer_info(sample_info, current_sample)

    return current_sample
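# A small illustration of why load_item can attach arbitrary fields: Sample
# behaves like an attribute-style dict, and SampleList stacks matching tensor
# fields across samples into a leading batch dimension (pythia.common.sample
# assumed; the field names and sizes here are illustrative).
import torch
from pythia.common.sample import Sample, SampleList

s1, s2 = Sample(), Sample()
for s, qid in ((s1, 7), (s2, 8)):
    s.question_id = torch.tensor([qid], dtype=torch.int)
    s.text = torch.zeros(10, dtype=torch.long)

batch = SampleList([s1, s2])
print(batch.text.shape)  # expected: torch.Size([2, 10])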