Example 1
    def _get_sample_list(self):
        bs = 8
        num_feats = 100
        max_sentence_len = 25
        img_dim = 2048
        cls_dim = 3129
        input_ids = torch.ones((bs, max_sentence_len), dtype=torch.long)
        input_mask = torch.ones((bs, max_sentence_len), dtype=torch.long)
        image_feat = torch.rand((bs, num_feats, img_dim))
        position_ids = (torch.arange(
            0, max_sentence_len, dtype=torch.long,
            device=image_feat.device).unsqueeze(0).expand(bs, -1))
        img_pos_feat = torch.rand((bs, num_feats, 7))
        attention_mask = torch.zeros((bs, max_sentence_len + num_feats),
                                     dtype=torch.long)
        image_mask = torch.zeros((bs, num_feats), dtype=torch.long)
        targets = torch.rand((bs, cls_dim))

        sample_list = SampleList()
        sample_list.add_field("input_ids", input_ids)
        sample_list.add_field("input_mask", input_mask)
        sample_list.add_field("image_feat", image_feat)
        sample_list.add_field("img_pos_feat", img_pos_feat)
        sample_list.add_field("attention_mask", attention_mask)
        sample_list.add_field("image_mask", image_mask)
        sample_list.add_field("targets", targets)
        sample_list.add_field("dataset_name", "test")
        sample_list.add_field("dataset_type", "test")
        sample_list.add_field("position_ids", position_ids)
        sample_list.to(get_current_device())

        return sample_list
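
For context, a minimal sketch of how a sample list like this is consumed in the tests that follow; `model` is a placeholder for any MMF model under test, not part of the original snippet.

# Hypothetical usage; `model` is assumed to accept a SampleList in forward,
# as in the tests below.
sample_list = self._get_sample_list()
with torch.no_grad():
    output = model(sample_list)
assert "scores" in output or "losses" in output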
Example 2
    def __call__(self, image_tensor, text_input=None):
        '''
        Allow the model to receive both multimodal (image + text) inputs and
        image-only inputs. // Bojia Mao
        '''
        text = self.processor_dict["text_processor"]({"text": self.text})

        sample = Sample()

        if text_input is None:
            sample.text = text["text"]
        else:
            self.__text = text_input
            sample.text = text_input

        if "input_ids" in text:
            sample.update(text)

        sample.image = image_tensor
        sample_list = SampleList([sample])
        sample_list = sample_list.to(
            torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))

        output = self.model(sample_list)
        scores = nn.functional.softmax(output["scores"], dim=1)

        return scores
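
A minimal usage sketch for the `__call__` above; `clf`, the tensor shape, and the caption text are illustrative assumptions.

# Hypothetical usage; `clf` is assumed to be an instance with processor_dict,
# model, and a default self.text already configured.
image_tensor = torch.rand(3, 224, 224)  # preprocessed image; shape is an assumption
scores = clf(image_tensor)                          # uses the default self.text
scores = clf(image_tensor, text_input="a caption")  # overrides the text input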
Example 3
    def test_forward(self):
        model_config = self.config.model_config.cnn_lstm

        cnn_lstm = CNNLSTM(model_config)
        cnn_lstm.build()
        cnn_lstm.init_losses()

        self.assertTrue(isinstance(cnn_lstm, torch.nn.Module))

        test_sample = Sample()
        test_sample.text = torch.randint(1, 79, (10, ), dtype=torch.long)
        test_sample.image = torch.randn(3, 320, 480)
        test_sample.targets = torch.randn(32)

        test_sample_list = SampleList([test_sample])
        test_sample_list.dataset_type = "train"
        test_sample_list.dataset_name = "clevr"

        test_sample_list = test_sample_list.to(get_current_device())
        cnn_lstm = cnn_lstm.to(get_current_device())

        output = cnn_lstm(test_sample_list)

        scores = output["scores"]
        loss = output["losses"]["train/clevr/logit_bce"]

        np.testing.assert_almost_equal(loss.item(), 19.2635, decimal=4)
        self.assertEqual(scores.size(), torch.Size((1, 32)))
Example 4
    def test_pretrained_model(self):
        sample_list = SampleList()

        sample_list.add_field(
            "input_ids",
            torch.randint(low=0, high=BERT_VOCAB_SIZE, size=(1, 128)).long(),
        )
        sample_list.add_field("input_mask", torch.ones((1, 128)).long())
        sample_list.add_field("segment_ids", torch.zeros(1, 128).long())
        sample_list.add_field("image_feature_0", torch.rand((1, 100, 2048)).float())
        sample_list.add_field(
            "lm_label_ids", torch.zeros((1, 128), dtype=torch.long).fill_(-1)
        )

        self.pretrain_model.eval()
        self.pretrain_model = self.pretrain_model.to(get_current_device())
        sample_list = sample_list.to(get_current_device())

        sample_list.dataset_name = "random"
        sample_list.dataset_type = "test"
        with torch.no_grad():
            model_output = self.pretrain_model(sample_list)

        self.assertTrue("losses" in model_output)
        self.assertTrue("random/test/masked_lm_loss" in model_output["losses"])
        self.assertTrue(model_output["losses"]["random/test/masked_lm_loss"] == 0)
Example 5
    def predict(self, url, text):
        with torch.no_grad():
            detectron_features = self.get_detectron_features(url)

            sample = Sample()

            processed_text = self.text_processor({"text": text})
            sample.text_len = len(processed_text["tokens"])

            encoded_input = tokenizer(text, return_tensors='pt')
            sample.input_ids = encoded_input.input_ids
            sample.input_mask = encoded_input.attention_mask
            sample.segment_ids = encoded_input.token_type_ids

            sample.image_feature_0 = detectron_features
            sample.image_info_0 = Sample(
                {"max_features": torch.tensor(100, dtype=torch.long)})

            sample_list = SampleList([sample])
            sample_list = sample_list.to("cuda")

            output = self.visual_bert(sample_list)

        gc.collect()
        torch.cuda.empty_cache()

        return output
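
A minimal call sketch, assuming `demo` is an instance with the VisualBERT model and feature extractor loaded; the URL and question are placeholders, and the "scores" key is assumed to follow the classification examples elsewhere in this file.

# Hypothetical usage of predict above.
output = demo.predict("http://example.com/image.jpg", "what is on the table?")
scores = torch.nn.functional.softmax(output["scores"], dim=1)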
Example 6
    def _get_sample_list(self):
        bs = 8
        num_feats = 100
        max_sentence_len = 25
        img_dim = 2048
        vqa_cls_dim = 3129
        input_ids = torch.ones((bs, max_sentence_len), dtype=torch.long)
        input_mask = torch.ones((bs, max_sentence_len), dtype=torch.long)
        img_feat = torch.rand((bs, num_feats, img_dim))

        max_features = torch.ones((bs, num_feats)) * num_feats
        bbox = torch.randint(50, 200, (bs, num_feats, 4)).float()
        image_height = torch.randint(100, 300, (bs, ))
        image_width = torch.randint(100, 300, (bs, ))
        image_info = {
            "max_features": max_features,
            "bbox": bbox,
            "image_height": image_height,
            "image_width": image_width,
        }
        targets = torch.rand((bs, vqa_cls_dim))
        is_correct = torch.ones((bs, ), dtype=torch.long)

        sample_list = SampleList()
        sample_list.add_field("input_ids", input_ids)
        sample_list.add_field("image_feature_0", img_feat)
        sample_list.add_field("input_mask", input_mask)
        sample_list.add_field("image_info_0", image_info)
        sample_list.add_field("targets", targets)
        sample_list.add_field("is_correct", is_correct)
        sample_list = sample_list.to(get_current_device())
        return sample_list
Example 7
    def classify(self, image: ImageType, text: str, image_tensor=None, zero_image=False, zero_text=False):
        """Classifies a given image and text in it into Hateful/Non-Hateful.
        Image can be a url or a local path or you can directly pass a PIL.Image.Image
        object. Text needs to be a sentence containing all text in the image.

        Args:
            image (ImageType): Image to be classified
            text (str): Text in the image
            image_tensor: optional preprocessed image tensor; when provided, it
                is used directly and the raw scores tensor is returned
            zero_image: zero out the image features when classifying
            zero_text: zero out the text features when classifying

        Returns:
            {"label": 0, "confidence": 0.56}
        """
        sample = Sample()

        if image_tensor is not None:
            sample.image = image_tensor
        else:
            if isinstance(image, str):
                if image.startswith("http"):
                    temp_file = tempfile.NamedTemporaryFile()
                    download(image, *os.path.split(temp_file.name), disable_tqdm=True)
                    image = tv_helpers.default_loader(temp_file.name)
                    temp_file.close()
                else:
                    image = tv_helpers.default_loader(image)

            image = self.processor_dict["image_processor"](image)
            sample.image = image

        text = self.processor_dict["text_processor"]({"text": text})

        sample.text = text["text"]
        if "input_ids" in text:
            sample.update(text)

        sample_list = SampleList([sample])
        device = next(self.model.parameters()).device
        sample_list = sample_list.to(device)
        output = self.model(sample_list, zero_image=zero_image, zero_text=zero_text)
        scores = nn.functional.softmax(output["scores"], dim=1)

        if image_tensor is not None:
            return scores

        confidence, label = torch.max(scores, dim=1)

        return {"label": label.item(), "confidence": confidence.item()}
Example 8
    def classify(self, image: ImageType, text: str):
        """Classifies a given image and text in it into Hateful/Non-Hateful.
        Image can be a url or a local path or you can directly pass a PIL.Image.Image
        object. Text needs to be a sentence containing all text in the image.

            >>> from mmf.models.mmbt import MMBT
            >>> model = MMBT.from_pretrained("mmbt.hateful_memes.images")
            >>> model.classify("some_url", "some_text")
            {"label": 0, "confidence": 0.56}

        Args:
            image (ImageType): Image to be classified
            text (str): Text in the image

        Returns:
            dict: {"label": label, "confidence": confidence}, where label is
                1 (hateful) or 0 (non-hateful)
        """
        if isinstance(image, str):
            if image.startswith("http"):
                temp_file = tempfile.NamedTemporaryFile()
                download(image,
                         *os.path.split(temp_file.name),
                         disable_tqdm=True)
                image = tv_helpers.default_loader(temp_file.name)
                temp_file.close()
            else:
                image = tv_helpers.default_loader(image)

        text = self.processor_dict["text_processor"]({"text": text})
        image = self.processor_dict["image_processor"](image)

        sample = Sample()
        sample.text = text["text"]
        if "input_ids" in text:
            sample.update(text)

        sample.image = image
        sample_list = SampleList([sample])
        device = next(self.model.parameters()).device
        sample_list = sample_list.to(device)

        output = self.model(sample_list)
        scores = nn.functional.softmax(output["scores"], dim=1)
        confidence, label = torch.max(scores, dim=1)

        return {"label": label.item(), "confidence": confidence.item()}
Example 9
def compare_torchscript_transformer_models(model, vocab_size):
    test_sample = Sample()
    test_sample.input_ids = torch.randint(low=0, high=vocab_size, size=(128,)).long()
    test_sample.input_mask = torch.ones(128).long()
    test_sample.segment_ids = torch.zeros(128).long()
    test_sample.image_feature_0 = torch.rand((1, 100, 2048)).float()
    test_sample.image = torch.rand((3, 300, 300)).float()
    test_sample_list = SampleList([test_sample])

    model = model.to(get_current_device())
    test_sample_list = test_sample_list.to(get_current_device())

    with torch.no_grad():
        model_output = model(test_sample_list)

    script_model = torch.jit.script(model)
    with torch.no_grad():
        script_output = script_model(test_sample_list)

    return torch.equal(model_output["scores"], script_output["scores"])
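
A minimal sketch of how this helper might be invoked from a unittest; `model` is an assumption, and BERT_VOCAB_SIZE matches the constant used elsewhere in these examples.

# Hypothetical usage inside a unittest.TestCase method; `model` is assumed
# to be a transformer model built with vocab size BERT_VOCAB_SIZE.
self.assertTrue(
    compare_torchscript_transformer_models(model, vocab_size=BERT_VOCAB_SIZE)
)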
Example 10
    def predict(self, Q, F, topk):
        with torch.no_grad():
            detectron_features = torch.from_numpy(F)
            
            processed_text = self.text_processor({"text": Q})
            sample = Sample(processed_text)
            sample.text_len = len(processed_text["tokens"])

            sample.image_feature_0 = detectron_features
            sample.image_info_0 = Sample({
              "max_features": torch.tensor(100, dtype=torch.long)
            })

            sample_list = SampleList([sample])
            sample_list = sample_list.to("cuda")

            scores = self.bert_model(sample_list)["scores"]
            scores = torch.nn.functional.softmax(scores, dim=1)
            actual, indices = scores.topk(topk, dim=1)

            top_indices = indices[0]
            top_scores = actual[0]

            probs = []
            answers = []

            for idx, score in enumerate(top_scores):
                probs.append(score.item())
                answers.append(
                    self.answer_processor.idx2word(top_indices[idx].item())
                )

        gc.collect()
        torch.cuda.empty_cache()

        return probs, answers
Example 11
    def prepare_batch(self, batch):
        """
        Can be possibly overridden in your child class

        Prepare batch for passing to model. Whatever returned from here will
        be directly passed to model's forward function. Currently moves the batch to
        proper device.

        Args:
            batch (SampleList): sample list containing the currently loaded batch

        Returns:
            sample_list (SampleList): Returns a sample representing current
                batch loaded
        """
        # Should be a SampleList
        if not isinstance(batch, SampleList):
            # Try converting to SampleList
            batch = SampleList(batch)
        batch = batch.to(self._device)
        return batch
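
A minimal override sketch; `MyDataset`, the parent name, and the extra field are illustrative placeholders for whichever class defines prepare_batch above.

# Hypothetical subclass; BaseDataset stands in for the class that defines
# prepare_batch above.
class MyDataset(BaseDataset):
    def prepare_batch(self, batch):
        batch = super().prepare_batch(batch)  # SampleList conversion + device move
        batch.add_field("dataset_name", "my_dataset")  # illustrative extra field
        return batch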
Example 12
    def forward(self, image_path: str, text: dict, image_format: str = "path"):
        text_output = self.processor["text_processor"](text)
        if image_format == "path":
            img = np.array(Image.open(image_path))
        elif image_format == "url":
            img = np.array(
                Image.open(requests.get(image_path, stream=True).raw))
        img = torch.as_tensor(img)

        if self.model_items["config"].image_feature_encodings.type == "frcnn":
            max_detect = self.model_items[
                "config"].image_feature_encodings.params.max_detections
            image_preprocessed, sizes, scales_yx = self.processor[
                "image_processor"](img)
            image_output = self.feature_extractor(
                image_preprocessed,
                sizes=sizes,
                scales_yx=scales_yx,
                padding=None,
                max_detections=max_detect,
                return_tensors="pt",
            )
            image_output = image_output[0]
        else:
            image_preprocessed = self.processor["image_processor"](img)
            image_output = self.feature_extractor(image_preprocessed)

        sample = Sample(text_output)
        sample.image_feature_0 = image_output
        sample_list = SampleList([sample])
        sample_list = sample_list.to(get_current_device())
        self.model = self.model.to(get_current_device())
        output = self.model(sample_list)
        sample_list.id = [sample_list.input_ids[0][0]]
        report = Report(sample_list, output)
        answers = self.processor["output_processor"](report)
        answer = self.processor["answer_processor"].idx2word(
            answers[0]["answer"])

        return answer
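
A minimal call sketch for the `forward` above; `module`, the path, and the question are placeholders. Note that the text is passed as a dict, as the text processor expects.

# Hypothetical usage.
answer = module.forward("path/to/image.jpg", {"text": "what color is the car?"})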
Example 13
    def predict(self, url, question):
        with torch.no_grad():
            detectron_features = self.get_detectron_features(url)
            resnet_features = self.get_resnet_features(url)

            sample = Sample()

            processed_text = self.text_processor({"text": question})
            sample.text = processed_text["text"]
            sample.text_len = len(processed_text["tokens"])

            sample.image_feature_0 = detectron_features
            sample.image_info_0 = Sample(
                {"max_features": torch.tensor(100, dtype=torch.long)})

            sample.image_feature_1 = resnet_features

            sample_list = SampleList([sample])
            sample_list = sample_list.to("cuda")

            scores = self.pythia_model(sample_list)["scores"]
            scores = torch.nn.functional.softmax(scores, dim=1)
            actual, indices = scores.topk(5, dim=1)

            top_indices = indices[0]
            top_scores = actual[0]

            probs = []
            answers = []

            for idx, score in enumerate(top_scores):
                probs.append(score.item())
                answers.append(
                    self.answer_processor.idx2word(top_indices[idx].item()))

        gc.collect()
        torch.cuda.empty_cache()

        return probs, answers
Example 14
    def _get_sample_list(self):
        bs = 8
        num_feats = 70

        class MockObj:
            pass

        mock_input = MockObj()
        mock_vinvl_input_tensors(mock_input, bs=bs, num_feats=num_feats)

        input_mask = torch.ones_like(mock_input.input_ids)
        max_features = torch.ones((bs, num_feats)) * num_feats
        bbox = torch.randint(50, 200, (bs, num_feats, 4)).float()
        image_height = torch.randint(100, 300, (bs,))
        image_width = torch.randint(100, 300, (bs,))
        image_info = {
            "max_features": max_features,
            "bbox": bbox,
            "image_height": image_height,
            "image_width": image_width,
        }

        sample_list = SampleList()
        sample_list.add_field("input_ids", mock_input.input_ids)
        sample_list.add_field("input_ids_corrupt", mock_input.input_ids)
        sample_list.add_field("input_ids_masked", mock_input.input_ids)
        sample_list.add_field("image_feature_0", mock_input.img_feats)
        sample_list.add_field("image_info_0", image_info)
        sample_list.add_field("input_mask", input_mask)
        sample_list.add_field("input_mask_corrupt", input_mask)
        sample_list.add_field("segment_ids", mock_input.token_type_ids)
        sample_list.add_field("segment_ids_corrupt", mock_input.token_type_ids)
        sample_list.add_field("labels", mock_input.labels)
        sample_list.add_field("contrastive_labels", mock_input.contrastive_labels)
        sample_list.add_field("lm_label_ids", mock_input.lm_label_ids)
        sample_list = sample_list.to(get_current_device())
        sample_list.dataset_name = "test"
        sample_list.dataset_type = "test"
        return sample_list
Example 15
    def test_pretrained_model(self):
        sample_list = SampleList()

        sample_list.add_field(
            "input_ids",
            torch.randint(low=0, high=BERT_VOCAB_SIZE, size=(1, 128)).long(),
        )
        sample_list.add_field("input_mask", torch.ones((1, 128)).long())
        sample_list.add_field("segment_ids", torch.zeros(1, 128).long())
        sample_list.add_field("image", torch.rand((1, 3, 224, 224)).float())
        sample_list.add_field("targets", torch.rand((1, 3129)).float())

        self.pretrain_model.eval()
        self.pretrain_model = self.pretrain_model.to(get_current_device())
        sample_list = sample_list.to(get_current_device())

        sample_list.dataset_name = "test"
        sample_list.dataset_type = "test"
        with torch.no_grad():
            model_output = self.pretrain_model(sample_list)

        self.assertTrue("losses" in model_output)
        self.assertTrue("test/test/logit_bce" in model_output["losses"])
Example 16
    def classify(self,
                 image: ImageType,
                 text: str,
                 image_tensor=None,
                 zero_image=False,
                 zero_text=False):
        """Classifies a given image and text in it into Hateful/Non-Hateful.
        Image can be a url or a local path or you can directly pass a PIL.Image.Image
        object. Text needs to be a sentence containing all text in the image.

        Args:
            image (ImageType): Image to be classified
            text (str): Text in the image
            image_tensor: optional preprocessed image tensor; when provided,
                the raw scores tensor is returned
            zero_image: zero out the image features when classifying
            zero_text: zero out the text features when classifying

        Returns:
            {"label": 0, "confidence": 0.56}
        """

        if image_tensor is not None:
            image_tensor = torch.unsqueeze(image_tensor, 0)
            im_feature_0, im_info_0 = torchRay_feat_extract(image_tensor)
        else:
            if isinstance(image, str):
                if image.startswith("http"):
                    temp_file = tempfile.NamedTemporaryFile()
                    download(image,
                             *os.path.split(temp_file.name),
                             disable_tqdm=True)
                    image = tv_helpers.default_loader(temp_file.name)
                    temp_file.close()
                else:
                    image = tv_helpers.default_loader(image)
            _, _, im_feature_0, im_info_0 = self.feature_extractor.extract_features(
                image_dir=image, save_single=False)

        text = self.processor_dict["text_processor"]({"text": text})
        sample = Sample()
        sample.text = text["text"]
        if "input_ids" in text:
            sample.update(text)

        # re-format the sample list
        sample_im_info = Sample()

        # process the bounding boxes for vilbert
        if self.model_name == "vilbert":
            bbox = np.array(im_info_0["bbox"])
            image_w = im_info_0["image_width"]
            image_h = im_info_0["image_height"]
            new_bbox = np.zeros((bbox.shape[0], 5), dtype=bbox.dtype)

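            # Normalize box corners by image width/height so coordinates lie
            # in [0, 1]; the fifth column stores the box area relative to the
            # image area.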
            new_bbox[:, 0] = bbox[:, 0] / image_w
            new_bbox[:, 1] = bbox[:, 1] / image_h
            new_bbox[:, 2] = (bbox[:, 2]) / image_w
            new_bbox[:, 3] = (bbox[:, 3]) / image_h
            new_bbox[:, 4] = ((bbox[:, 2] - bbox[:, 0]) *
                              (bbox[:, 3] - bbox[:, 1]) / (image_w * image_h))

            sample_im_info.bbox = torch.from_numpy(new_bbox)
        else:
            sample_im_info.bbox = torch.from_numpy(np.array(im_info_0["bbox"]))

        sample_im_info.num_boxes = torch.from_numpy(
            np.array(im_info_0["num_boxes"]))
        sample_im_info.objects = torch.from_numpy(
            np.array(im_info_0["objects"]))
        sample_im_info.image_width = torch.from_numpy(
            np.array(im_info_0["image_width"]))
        sample_im_info.image_height = torch.from_numpy(
            np.array(im_info_0["image_height"]))
        sample_im_info.cls_prob = torch.from_numpy(
            np.array(im_info_0["cls_prob"]))
        sample_list_info = SampleList([sample_im_info])

        sample.image_feature_0 = im_feature_0
        sample.dataset_name = "hateful_memes"

        sample_list = SampleList([sample])
        sample_list.image_info_0 = sample_list_info
        device = next(self.model.parameters()).device
        sample_list = sample_list.to(device)

        output = self.model(sample_list)
        scores = nn.functional.softmax(output["scores"], dim=1)

        if image_tensor is not None:
            return scores
        confidence, label = torch.max(scores, dim=1)

        return {"label": label.item(), "confidence": confidence.item()}