def _get_sample_list(self):
    """Build a synthetic batch of text and image inputs for model tests.

    The batch holds 8 samples with 25 text positions, 100 image features
    of size 2048, 7-dimensional image position features and 3129 target
    classes. Text ids and the input mask are all ones; the attention and
    image masks are all zeros; image features, position features and
    targets are random.

    Returns:
        SampleList: the populated sample list after ``to`` has been
        called with the device reported by ``get_current_device()``.
    """
    bs = 8
    num_feats = 100
    max_sentence_len = 25
    img_dim = 2048
    cls_dim = 3129

    input_ids = torch.ones((bs, max_sentence_len), dtype=torch.long)
    input_mask = torch.ones((bs, max_sentence_len), dtype=torch.long)
    image_feat = torch.rand((bs, num_feats, img_dim))
    # One row of 0..max_sentence_len-1, broadcast over the batch.
    position_ids = (
        torch.arange(
            0, max_sentence_len, dtype=torch.long, device=image_feat.device
        )
        .unsqueeze(0)
        .expand(bs, -1)
    )
    img_pos_feat = torch.rand((bs, num_feats, 7))
    attention_mask = torch.zeros(
        (bs, max_sentence_len + num_feats), dtype=torch.long
    )
    image_mask = torch.zeros((bs, num_feats), dtype=torch.long)
    targets = torch.rand((bs, cls_dim))

    sample_list = SampleList()
    sample_list.add_field("input_ids", input_ids)
    sample_list.add_field("input_mask", input_mask)
    sample_list.add_field("image_feat", image_feat)
    sample_list.add_field("img_pos_feat", img_pos_feat)
    sample_list.add_field("attention_mask", attention_mask)
    sample_list.add_field("image_mask", image_mask)
    sample_list.add_field("targets", targets)
    sample_list.add_field("dataset_name", "test")
    sample_list.add_field("dataset_type", "test")
    sample_list.add_field("position_ids", position_ids)

    # Use the object returned by ``to`` instead of relying on in-place
    # mutation, so the returned batch is the one that was moved.
    sample_list = sample_list.to(get_current_device())
    return sample_list
def __call__(self, image_tensor, text_input=None):
    """Score an image tensor, optionally with caller-supplied text.

    Allows the model to receive both multi-inputs and single
    image-inputs (original note by Bojia Mao).

    Args:
        image_tensor: Value stored on the sample as ``sample.image``.
        text_input: Optional value used as ``sample.text``. When it is
            ``None``, the ``"text"`` entry produced by the text processor
            from ``self.text`` is used instead.

    Returns:
        Softmax of ``output["scores"]`` taken along dimension 1.
    """
    text = self.processor_dict["text_processor"]({"text": self.text})

    sample = Sample()
    # Identity check: ``== None`` would go through the argument's own
    # ``__eq__`` (e.g. a tensor comparison) instead of testing absence.
    if text_input is None:
        sample.text = text["text"]
    else:
        self.__text = text_input
        sample.text = text_input
    if "input_ids" in text:
        sample.update(text)
    sample.image = image_tensor

    sample_list = SampleList([sample])
    sample_list = sample_list.to(
        torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    )

    output = self.model(sample_list)
    scores = nn.functional.softmax(output["scores"], dim=1)
    return scores
def test_forward(self):
    """Run one forward pass of CNNLSTM and check its loss and score shape.

    The model is built from ``self.config.model_config.cnn_lstm``, fed a
    single random sample tagged as dataset ``clevr`` / type ``train``,
    and the ``train/clevr/logit_bce`` loss is compared to a fixed value.
    """
    model_config = self.config.model_config.cnn_lstm

    cnn_lstm = CNNLSTM(model_config)
    cnn_lstm.build()
    cnn_lstm.init_losses()

    # assertIsInstance reports the actual type on failure.
    self.assertIsInstance(cnn_lstm, torch.nn.Module)

    # NOTE(review): the expected loss below presumably relies on a seed
    # set outside this method, so the order of the random draws is kept.
    test_sample = Sample()
    test_sample.text = torch.randint(1, 79, (10,), dtype=torch.long)
    test_sample.image = torch.randn(3, 320, 480)
    test_sample.targets = torch.randn(32)

    test_sample_list = SampleList([test_sample])
    test_sample_list.dataset_type = "train"
    test_sample_list.dataset_name = "clevr"

    test_sample_list = test_sample_list.to(get_current_device())
    cnn_lstm = cnn_lstm.to(get_current_device())

    output = cnn_lstm(test_sample_list)
    scores = output["scores"]
    loss = output["losses"]["train/clevr/logit_bce"]

    np.testing.assert_almost_equal(loss.item(), 19.2635, decimal=4)
    self.assertEqual(scores.size(), torch.Size((1, 32)))
def test_pretrained_model(self):
    """Check that the pretraining model reports a zero masked-LM loss.

    A single random sample with 128 token positions and 100 image
    features of size 2048 is evaluated under ``torch.no_grad()``. Every
    entry of ``lm_label_ids`` is -1 (presumably the ignored label value
    -- TODO confirm), and the resulting
    ``random/test/masked_lm_loss`` is asserted to equal 0.
    """
    sample_list = SampleList()
    sample_list.add_field(
        "input_ids",
        torch.randint(low=0, high=BERT_VOCAB_SIZE, size=(1, 128)).long(),
    )
    sample_list.add_field("input_mask", torch.ones((1, 128)).long())
    sample_list.add_field("segment_ids", torch.zeros(1, 128).long())
    sample_list.add_field("image_feature_0", torch.rand((1, 100, 2048)).float())
    sample_list.add_field(
        "lm_label_ids", torch.zeros((1, 128), dtype=torch.long).fill_(-1)
    )

    self.pretrain_model.eval()
    self.pretrain_model = self.pretrain_model.to(get_current_device())
    sample_list = sample_list.to(get_current_device())

    sample_list.dataset_name = "random"
    sample_list.dataset_type = "test"
    with torch.no_grad():
        model_output = self.pretrain_model(sample_list)

    # assertIn shows the missing key and the container on failure.
    self.assertIn("losses", model_output)
    self.assertIn("random/test/masked_lm_loss", model_output["losses"])
    self.assertTrue(model_output["losses"]["random/test/masked_lm_loss"] == 0)
def predict(self, url, text):
    """Run ``self.visual_bert`` on one image/text pair.

    Args:
        url: Passed to ``self.get_detectron_features`` to obtain the
            image features.
        text: Raw text; it is run through ``self.text_processor`` (for
            the token count) and through the module-level ``tokenizer``
            (for ids, attention mask and token type ids).

    Returns:
        Whatever ``self.visual_bert`` returns for the one-element batch.
    """
    with torch.no_grad():
        image_features = self.get_detectron_features(url)

        text_info = self.text_processor({"text": text})
        encoding = tokenizer(text, return_tensors='pt')

        sample = Sample()
        sample.text_len = len(text_info["tokens"])
        sample.input_ids = encoding.input_ids
        sample.input_mask = encoding.attention_mask
        sample.segment_ids = encoding.token_type_ids
        sample.image_feature_0 = image_features
        sample.image_info_0 = Sample(
            {"max_features": torch.tensor(100, dtype=torch.long)}
        )

        batch = SampleList([sample]).to("cuda")
        output = self.visual_bert(batch)

    # Release Python garbage and cached CUDA memory before returning.
    gc.collect()
    torch.cuda.empty_cache()
    return output
def _get_sample_list(self):
    """Create a synthetic batch with image features and image info.

    The batch has 8 samples, 25 text positions, 100 image features of
    size 2048 and 3129 target classes. ``image_info_0`` is a plain dict
    with ``max_features``, ``bbox``, ``image_height`` and ``image_width``.

    Returns:
        SampleList: the batch after being moved with
        ``to(get_current_device())``.
    """
    batch_size, feature_count = 8, 100
    sentence_len, feature_dim, class_count = 25, 2048, 3129

    text_shape = (batch_size, sentence_len)
    token_ids = torch.ones(text_shape, dtype=torch.long)
    token_mask = torch.ones(text_shape, dtype=torch.long)

    # Random draws are made in a fixed order: features, boxes, height,
    # width, then targets.
    features = torch.rand((batch_size, feature_count, feature_dim))
    info = {
        "max_features": torch.ones((batch_size, feature_count)) * feature_count,
        "bbox": torch.randint(50, 200, (batch_size, feature_count, 4)).float(),
        "image_height": torch.randint(100, 300, (batch_size,)),
        "image_width": torch.randint(100, 300, (batch_size,)),
    }
    labels = torch.rand((batch_size, class_count))
    correctness = torch.ones((batch_size,), dtype=torch.long)

    fields = {
        "input_ids": token_ids,
        "image_feature_0": features,
        "input_mask": token_mask,
        "image_info_0": info,
        "targets": labels,
        "is_correct": correctness,
    }
    batch = SampleList()
    for field_name, field_value in fields.items():
        batch.add_field(field_name, field_value)

    return batch.to(get_current_device())
def classify(self, image: ImageType, text: str, image_tensor=None, zero_image=False, zero_text=False):
    """Classifies a given image and text in it into Hateful/Non-Hateful.

    Image can be a url or a local path or you can directly pass a
    PIL.Image.Image object. Text needs to be a sentence containing all
    text in the image.

    Args:
        image (ImageType): Image to be classified. Ignored when
            ``image_tensor`` is given.
        text (str): Text in the image.
        image_tensor: Optional value used directly as ``sample.image``,
            bypassing image loading and the image processor.
        zero_image: Forwarded to the model call as ``zero_image``.
        zero_text: Forwarded to the model call as ``zero_text``.

    Returns:
        The softmax scores when ``image_tensor`` is given; otherwise a
        dict such as ``{"label": 0, "confidence": 0.56}``.
    """
    sample = Sample()

    # Identity check: ``!= None`` on a tensor goes through tensor
    # comparison rather than testing whether the argument was supplied.
    if image_tensor is not None:
        sample.image = image_tensor
    else:
        if isinstance(image, str):
            if image.startswith("http"):
                # The context manager closes the temporary file even when
                # the download or the image load raises.
                with tempfile.NamedTemporaryFile() as temp_file:
                    download(
                        image,
                        *os.path.split(temp_file.name),
                        disable_tqdm=True,
                    )
                    image = tv_helpers.default_loader(temp_file.name)
            else:
                image = tv_helpers.default_loader(image)
        image = self.processor_dict["image_processor"](image)
        sample.image = image

    text = self.processor_dict["text_processor"]({"text": text})
    sample.text = text["text"]
    if "input_ids" in text:
        sample.update(text)

    sample_list = SampleList([sample])
    # Run on whichever device the model parameters live on.
    device = next(self.model.parameters()).device
    sample_list = sample_list.to(device)

    output = self.model(sample_list, zero_image=zero_image, zero_text=zero_text)
    scores = nn.functional.softmax(output["scores"], dim=1)

    if image_tensor is not None:
        return scores

    confidence, label = torch.max(scores, dim=1)
    return {"label": label.item(), "confidence": confidence.item()}
def classify(self, image: ImageType, text: str):
    """Classifies a given image and text in it into Hateful/Non-Hateful.

    Image can be a url or a local path or you can directly pass a
    PIL.Image.Image object. Text needs to be a sentence containing all
    text in the image.

        >>> from mmf.models.mmbt import MMBT
        >>> model = MMBT.from_pretrained("mmbt.hateful_memes.images")
        >>> model.classify("some_url", "some_text")
        {"label": 0, "confidence": 0.56}

    Args:
        image (ImageType): Image to be classified
        text (str): Text in the image

    Returns:
        dict: ``{"label": <index of the highest score>,
        "confidence": <that score after softmax>}``.
    """
    if isinstance(image, str):
        if image.startswith("http"):
            # The context manager closes the temporary file even when the
            # download or the image load raises.
            with tempfile.NamedTemporaryFile() as temp_file:
                download(
                    image, *os.path.split(temp_file.name), disable_tqdm=True
                )
                image = tv_helpers.default_loader(temp_file.name)
        else:
            image = tv_helpers.default_loader(image)

    text = self.processor_dict["text_processor"]({"text": text})
    image = self.processor_dict["image_processor"](image)

    sample = Sample()
    sample.text = text["text"]
    if "input_ids" in text:
        sample.update(text)
    sample.image = image

    sample_list = SampleList([sample])
    # Run on whichever device the model parameters live on.
    device = next(self.model.parameters()).device
    sample_list = sample_list.to(device)

    output = self.model(sample_list)
    scores = nn.functional.softmax(output["scores"], dim=1)
    confidence, label = torch.max(scores, dim=1)

    return {"label": label.item(), "confidence": confidence.item()}
def compare_torchscript_transformer_models(model, vocab_size):
    """Check that a model and its TorchScript version give equal scores.

    A single random sample (128 token ids below ``vocab_size``, a
    ``(1, 100, 2048)`` image feature and a ``(3, 300, 300)`` image) is
    run through ``model`` and through ``torch.jit.script(model)``.

    Args:
        model: Module to script and compare.
        vocab_size: Exclusive upper bound for the random token ids.

    Returns:
        bool: ``torch.equal`` of the two ``"scores"`` outputs.
    """
    seq_len = 128

    sample = Sample()
    sample.input_ids = torch.randint(
        low=0, high=vocab_size, size=(seq_len,)
    ).long()
    sample.input_mask = torch.ones(seq_len).long()
    sample.segment_ids = torch.zeros(seq_len).long()
    sample.image_feature_0 = torch.rand((1, 100, 2048)).float()
    sample.image = torch.rand((3, 300, 300)).float()

    device = get_current_device()
    model = model.to(device)
    # The device is queried again, exactly once per move.
    batch = SampleList([sample]).to(get_current_device())

    with torch.no_grad():
        eager_output = model(batch)

    scripted = torch.jit.script(model)
    with torch.no_grad():
        scripted_output = scripted(batch)

    return torch.equal(eager_output["scores"], scripted_output["scores"])
def predict(self, Q, F, topk):
    """Return the ``topk`` most probable answers for one question.

    Args:
        Q: Question text, passed to ``self.text_processor``.
        F: NumPy array of image features, converted with
            ``torch.from_numpy``.
        topk: Number of answers to return.

    Returns:
        tuple: ``(probs, answers)`` where ``probs`` is a list of softmax
        probabilities and ``answers`` the matching words from
        ``self.answer_processor.idx2word``, both in ``topk`` order.
    """
    with torch.no_grad():
        detectron_features = torch.from_numpy(F)

        processed_text = self.text_processor({"text": Q})
        sample = Sample(processed_text)
        sample.text_len = len(processed_text["tokens"])
        sample.image_feature_0 = detectron_features
        sample.image_info_0 = Sample(
            {"max_features": torch.tensor(100, dtype=torch.long)}
        )

        sample_list = SampleList([sample])
        sample_list = sample_list.to("cuda")

        scores = self.bert_model(sample_list)["scores"]
        scores = torch.nn.functional.softmax(scores, dim=1)
        actual, indices = scores.topk(topk, dim=1)

        # The batch holds exactly one sample, so take row 0 directly.
        # The previous ``[:topk]`` slice cut the batch axis rather than
        # the top-k axis and raised IndexError for ``topk == 0``.
        top_scores = actual[0]
        top_indices = indices[0]

        probs = [score.item() for score in top_scores]
        answers = [
            self.answer_processor.idx2word(index.item())
            for index in top_indices
        ]

    # Release Python garbage and cached CUDA memory before returning.
    gc.collect()
    torch.cuda.empty_cache()
    return probs, answers
def prepare_batch(self, batch):
    """Prepare a loaded batch for the model's forward function.

    Child classes may override this. The value returned here is what
    gets passed to the model. The current implementation wraps the
    batch in a ``SampleList`` when it is not one already and moves it
    to ``self._device``.

    Args:
        batch (SampleList): sample list containing the currently
            loaded batch

    Returns:
        sample_list (SampleList): the batch, on ``self._device``
    """
    # Anything that is not already a SampleList is converted first.
    sample_list = batch if isinstance(batch, SampleList) else SampleList(batch)
    return sample_list.to(self._device)
def forward(self, image_path: str, text: dict, image_format: str = "path"):
    """Answer a question about one image.

    Args:
        image_path: Local file path or URL of the image, depending on
            ``image_format``.
        text: Input handed to ``self.processor["text_processor"]``.
        image_format: ``"path"`` to open ``image_path`` from disk or
            ``"url"`` to fetch it with ``requests``.

    Returns:
        The word returned by the answer processor's ``idx2word`` for the
        first processed answer.

    Raises:
        ValueError: If ``image_format`` is neither ``"path"`` nor
            ``"url"``.
    """
    text_output = self.processor["text_processor"](text)

    if image_format == "path":
        img = np.array(Image.open(image_path))
    elif image_format == "url":
        img = np.array(Image.open(requests.get(image_path, stream=True).raw))
    else:
        # Without this branch ``img`` stays unbound and the failure
        # surfaces as an UnboundLocalError a few lines further down.
        raise ValueError(
            f"Unsupported image_format {image_format!r}; "
            "expected 'path' or 'url'."
        )
    img = torch.as_tensor(img)

    if self.model_items["config"].image_feature_encodings.type == "frcnn":
        max_detect = self.model_items[
            "config"
        ].image_feature_encodings.params.max_detections
        image_preprocessed, sizes, scales_yx = self.processor["image_processor"](
            img
        )
        image_output = self.feature_extractor(
            image_preprocessed,
            sizes=sizes,
            scales_yx=scales_yx,
            padding=None,
            max_detections=max_detect,
            return_tensors="pt",
        )
        # Only the first element of the extractor output is used.
        image_output = image_output[0]
    else:
        image_preprocessed = self.processor["image_processor"](img)
        image_output = self.feature_extractor(image_preprocessed)

    sample = Sample(text_output)
    sample.image_feature_0 = image_output
    sample_list = SampleList([sample])
    sample_list = sample_list.to(get_current_device())
    self.model = self.model.to(get_current_device())

    output = self.model(sample_list)

    # The report needs an ``id`` field; the first input id is used for it.
    sample_list.id = [sample_list.input_ids[0][0]]
    report = Report(sample_list, output)
    answers = self.processor["output_processor"](report)
    answer = self.processor["answer_processor"].idx2word(answers[0]["answer"])

    return answer
def predict(self, url, question):
    """Return the five most probable answers for a question on an image.

    Args:
        url: Passed to ``self.get_detectron_features`` and
            ``self.get_resnet_features`` to obtain both feature sets.
        question: Question text, passed to ``self.text_processor``.

    Returns:
        tuple: ``(probs, answers)`` -- softmax probabilities and the
        matching words from ``self.answer_processor.idx2word``.
    """
    with torch.no_grad():
        region_features = self.get_detectron_features(url)
        grid_features = self.get_resnet_features(url)

        text_info = self.text_processor({"text": question})

        sample = Sample()
        sample.text = text_info["text"]
        sample.text_len = len(text_info["tokens"])
        sample.image_feature_0 = region_features
        sample.image_info_0 = Sample(
            {"max_features": torch.tensor(100, dtype=torch.long)}
        )
        sample.image_feature_1 = grid_features

        batch = SampleList([sample]).to("cuda")

        logits = self.pythia_model(batch)["scores"]
        distribution = torch.nn.functional.softmax(logits, dim=1)
        values, positions = distribution.topk(5, dim=1)

        # Single-sample batch: work on row 0 only.
        best_values, best_positions = values[0], positions[0]
        probs = [value.item() for value in best_values]
        answers = [
            self.answer_processor.idx2word(position.item())
            for position in best_positions
        ]

    # Release Python garbage and cached CUDA memory before returning.
    gc.collect()
    torch.cuda.empty_cache()
    return probs, answers
def _get_sample_list(self):
    """Create a synthetic batch from ``mock_vinvl_input_tensors``.

    A throwaway object is filled by ``mock_vinvl_input_tensors`` (batch
    size 8, 70 features) and its tensors are copied into a sample list,
    together with an all-ones input mask and a random ``image_info_0``
    dict. The ``*_corrupt`` and ``*_masked`` fields reuse the same
    tensors as their plain counterparts.

    Returns:
        SampleList: the batch after ``to(get_current_device())``, with
        ``dataset_name`` and ``dataset_type`` both set to ``"test"``.
    """
    batch_size, feature_count = 8, 70

    class MockObj:
        pass

    mock_input = MockObj()
    mock_vinvl_input_tensors(mock_input, bs=batch_size, num_feats=feature_count)

    ones_mask = torch.ones_like(mock_input.input_ids)
    info = {
        "max_features": torch.ones((batch_size, feature_count)) * feature_count,
        "bbox": torch.randint(50, 200, (batch_size, feature_count, 4)).float(),
        "image_height": torch.randint(100, 300, (batch_size,)),
        "image_width": torch.randint(100, 300, (batch_size,)),
    }

    # (field name, value) pairs in the order they are registered.
    fields = (
        ("input_ids", mock_input.input_ids),
        ("input_ids_corrupt", mock_input.input_ids),
        ("input_ids_masked", mock_input.input_ids),
        ("image_feature_0", mock_input.img_feats),
        ("image_info_0", info),
        ("input_mask", ones_mask),
        ("input_mask_corrupt", ones_mask),
        ("segment_ids", mock_input.token_type_ids),
        ("segment_ids_corrupt", mock_input.token_type_ids),
        ("labels", mock_input.labels),
        ("contrastive_labels", mock_input.contrastive_labels),
        ("lm_label_ids", mock_input.lm_label_ids),
    )
    batch = SampleList()
    for field_name, field_value in fields:
        batch.add_field(field_name, field_value)

    batch = batch.to(get_current_device())
    batch.dataset_name = "test"
    batch.dataset_type = "test"
    return batch
def test_pretrained_model(self):
    """Check that the pretrained model reports a ``logit_bce`` loss.

    A single random sample with 128 token positions, a
    ``(1, 3, 224, 224)`` image and ``(1, 3129)`` targets is evaluated
    under ``torch.no_grad()``; the output must contain
    ``losses["test/test/logit_bce"]``.
    """
    sample_list = SampleList()
    sample_list.add_field(
        "input_ids",
        torch.randint(low=0, high=BERT_VOCAB_SIZE, size=(1, 128)).long(),
    )
    sample_list.add_field("input_mask", torch.ones((1, 128)).long())
    sample_list.add_field("segment_ids", torch.zeros(1, 128).long())
    sample_list.add_field("image", torch.rand((1, 3, 224, 224)).float())
    sample_list.add_field("targets", torch.rand((1, 3129)).float())

    self.pretrain_model.eval()
    self.pretrain_model = self.pretrain_model.to(get_current_device())
    sample_list = sample_list.to(get_current_device())

    sample_list.dataset_name = "test"
    sample_list.dataset_type = "test"
    with torch.no_grad():
        model_output = self.pretrain_model(sample_list)

    # assertIn shows the missing key and the container on failure.
    self.assertIn("losses", model_output)
    self.assertIn("test/test/logit_bce", model_output["losses"])
def classify(self, image: ImageType, text: str, image_tensor=None, zero_image=False, zero_text=False):
    """Classifies a given image and text in it into Hateful/Non-Hateful.

    Image can be a url or a local path or you can directly pass a
    PIL.Image.Image object. Text needs to be a sentence containing all
    text in the image.

    Args:
        image (ImageType): Image to be classified. Ignored when
            ``image_tensor`` is given.
        text (str): Text in the image.
        image_tensor: Optional image tensor. When given, a leading
            dimension is added and features come from
            ``torchRay_feat_extract`` instead of
            ``self.feature_extractor``.
        zero_image: Accepted for interface compatibility; not used in
            this method body.
        zero_text: Accepted for interface compatibility; not used in
            this method body.

    Returns:
        The softmax scores when ``image_tensor`` is given; otherwise a
        dict such as ``{"label": 0, "confidence": 0.56}``.
    """
    if image_tensor is not None:
        # The original read a misspelled, never-assigned local
        # (``image_tenosr``) here and always raised UnboundLocalError.
        # NOTE(review): the batched tensor is now what reaches the
        # extractor -- confirm that is the shape it expects.
        image_tensor = torch.unsqueeze(image_tensor, 0)
        im_feature_0, im_info_0 = torchRay_feat_extract(image_tensor)
    else:
        if isinstance(image, str):
            if image.startswith("http"):
                # The context manager closes the temporary file even when
                # the download or the image load raises.
                with tempfile.NamedTemporaryFile() as temp_file:
                    download(
                        image,
                        *os.path.split(temp_file.name),
                        disable_tqdm=True,
                    )
                    image = tv_helpers.default_loader(temp_file.name)
            else:
                image = tv_helpers.default_loader(image)
        _, _, im_feature_0, im_info_0 = self.feature_extractor.extract_features(
            image_dir=image, save_single=False
        )

    text = self.processor_dict["text_processor"]({"text": text})
    sample = Sample()
    sample.text = text["text"]
    if "input_ids" in text:
        sample.update(text)

    # Re-format the image info into its own sample.
    sample_im_info = Sample()

    if self.model_name == "vilbert":
        # vilbert gets boxes scaled by the image size plus a fifth
        # column holding the box area relative to the image area.
        bbox = np.array(im_info_0["bbox"])
        image_w = im_info_0["image_width"]
        image_h = im_info_0["image_height"]
        new_bbox = np.zeros((bbox.shape[0], 5), dtype=bbox.dtype)

        new_bbox[:, 0] = bbox[:, 0] / image_w
        new_bbox[:, 1] = bbox[:, 1] / image_h
        new_bbox[:, 2] = (bbox[:, 2]) / image_w
        new_bbox[:, 3] = (bbox[:, 3]) / image_h
        new_bbox[:, 4] = (
            (bbox[:, 2] - bbox[:, 0])
            * (bbox[:, 3] - bbox[:, 1])
            / (image_w * image_h)
        )

        sample_im_info.bbox = torch.from_numpy(new_bbox)
    else:
        sample_im_info.bbox = torch.from_numpy(np.array(im_info_0["bbox"]))

    sample_im_info.num_boxes = torch.from_numpy(np.array(im_info_0["num_boxes"]))
    sample_im_info.objects = torch.from_numpy(np.array(im_info_0["objects"]))
    sample_im_info.image_width = torch.from_numpy(
        np.array(im_info_0["image_width"])
    )
    sample_im_info.image_height = torch.from_numpy(
        np.array(im_info_0["image_height"])
    )
    sample_im_info.cls_prob = torch.from_numpy(np.array(im_info_0["cls_prob"]))
    sample_list_info = SampleList([sample_im_info])

    sample.image_feature_0 = im_feature_0
    sample.dataset_name = "hateful_memes"

    sample_list = SampleList([sample])
    sample_list.image_info_0 = sample_list_info

    # Run on whichever device the model parameters live on.
    device = next(self.model.parameters()).device
    sample_list = sample_list.to(device)

    output = self.model(sample_list)
    scores = nn.functional.softmax(output["scores"], dim=1)

    if image_tensor is not None:
        return scores

    confidence, label = torch.max(scores, dim=1)
    return {"label": label.item(), "confidence": confidence.item()}