Example #1
    def _init_processors(self):
        with open(os.path.join(BASE_VQA_DIR_PATH, "model_data/pythia.yaml")) as f:
            config = yaml.load(f, Loader=yaml.FullLoader)  # explicit Loader avoids PyYAML's load() warning

        config = ConfigNode(config)
        # Remove warning
        config.training_parameters.evalai_inference = True
        registry.register("config", config)

        self.config = config

        vqa_config = config.task_attributes.vqa.dataset_attributes.vqa2
        text_processor_config = vqa_config.processors.text_processor
        answer_processor_config = vqa_config.processors.answer_processor

        text_processor_config.params.vocab.vocab_file = os.path.join(
            BASE_VQA_DIR_PATH, "model_data/vocabulary_100k.txt"
        )
        answer_processor_config.params.vocab_file = os.path.join(
            BASE_VQA_DIR_PATH, "model_data/answers_vqa.txt"
        )
        # Add preprocessor, as it will be needed when we get questions from the user
        self.text_processor = VocabProcessor(text_processor_config.params)
        self.answer_processor = VQAAnswerProcessor(answer_processor_config.params)

        registry.register("vqa2_text_processor", self.text_processor)
        registry.register("vqa2_answer_processor", self.answer_processor)
        registry.register(
            "vqa2_num_final_outputs", self.answer_processor.get_vocab_size()
        )
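
For reference, this snippet's imports are elided; based on Example #5 below they would look roughly like the following sketch (the `registry` import path is our assumption from the Pythia repository layout, not shown in any example here):

import os
import yaml

from pythia.utils.configuration import ConfigNode
from pythia.tasks.processors import VocabProcessor, VQAAnswerProcessor
from pythia.common.registry import registry  # assumed module path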
Example #2
    def build_processors(self):

        print('Tiki : Initializing : Building - Text Processors')

        with open('/final/data/pythia.yaml') as f:
            config = yaml.load(f, Loader=yaml.FullLoader)

        config = ConfigNode(config)
        config.training_parameters.evalai_inference = True  # Remove warning
        registry.register('config', config)

        self.config = config
        vqa_config = config.task_attributes.vqa.dataset_attributes.vqa2
        text_processor_config = vqa_config.processors.text_processor
        answer_processor_config = vqa_config.processors.answer_processor

        text_processor_config.params.vocab.vocab_file = '/final/data/vocabulary_100k.txt'
        answer_processor_config.params.vocab_file = '/final/data/answers_vqa.txt'

        self.text_processor = VocabProcessor(text_processor_config.params)
        self.answer_processor = VQAAnswerProcessor(
            answer_processor_config.params)

        registry.register('vqa2_text_processor', self.text_processor)
        registry.register('vqa2_answer_processor', self.answer_processor)
        registry.register('vqa2_num_final_outputs',
                          self.answer_processor.get_vocab_size())
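
Once build_processors() has run, the text processor turns a raw question string into model-ready tensors. A minimal sketch (the question is a placeholder; getAnswers in Example #4 shows the real call site):

# Assumes `tiki` is a built Tiki instance (see Example #4 for construction).
processed = tiki.text_processor({'text': 'what color is the cat?'})
print(processed['text'])         # padded token-index tensor fed to the model
print(len(processed['tokens']))  # token count, later used as sample.text_len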
Example #3
class PythiaDemo:
    TARGET_IMAGE_SIZE = [448, 448]
    CHANNEL_MEAN = [0.485, 0.456, 0.406]
    CHANNEL_STD = [0.229, 0.224, 0.225]

    def __init__(self):
        self._init_processors()
        self.pythia_model = self._build_pythia_model()
        self.detection_model = self._build_detection_model()
        self.resnet_model = self._build_resnet_model()

    def _init_processors(self):
        with open(os.path.join(BASE_VQA_DIR_PATH, "model_data/pythia.yaml")) as f:
            config = yaml.load(f, Loader=yaml.FullLoader)  # explicit Loader avoids PyYAML's load() warning

        config = ConfigNode(config)
        # Remove warning
        config.training_parameters.evalai_inference = True
        registry.register("config", config)

        self.config = config

        vqa_config = config.task_attributes.vqa.dataset_attributes.vqa2
        text_processor_config = vqa_config.processors.text_processor
        answer_processor_config = vqa_config.processors.answer_processor

        text_processor_config.params.vocab.vocab_file = os.path.join(
            BASE_VQA_DIR_PATH, "model_data/vocabulary_100k.txt"
        )
        answer_processor_config.params.vocab_file = os.path.join(
            BASE_VQA_DIR_PATH, "model_data/answers_vqa.txt"
        )
        # Add preprocessor, as it will be needed when we get questions from the user
        self.text_processor = VocabProcessor(text_processor_config.params)
        self.answer_processor = VQAAnswerProcessor(answer_processor_config.params)

        registry.register("vqa2_text_processor", self.text_processor)
        registry.register("vqa2_answer_processor", self.answer_processor)
        registry.register(
            "vqa2_num_final_outputs", self.answer_processor.get_vocab_size()
        )

    def _build_pythia_model(self):
        state_dict = torch.load(
            os.path.join(BASE_VQA_DIR_PATH, "model_data/pythia.pth")
        )
        model_config = self.config.model_attributes.pythia
        model_config.model_data_dir = os.path.join(BASE_VQA_DIR_PATH, "model_data/")
        model = Pythia(model_config)
        model.build()
        model.init_losses_and_metrics()

        if list(state_dict.keys())[0].startswith("module") and not hasattr(
            model, "module"
        ):
            state_dict = self._multi_gpu_state_to_single(state_dict)

        model.load_state_dict(state_dict)
        model.to("cuda")
        model.eval()

        return model

    def _build_resnet_model(self):
        self.data_transforms = transforms.Compose(
            [
                transforms.Resize(self.TARGET_IMAGE_SIZE),
                transforms.ToTensor(),
                transforms.Normalize(self.CHANNEL_MEAN, self.CHANNEL_STD),
            ]
        )
        resnet152 = models.resnet152(pretrained=True)
        resnet152.eval()
        modules = list(resnet152.children())[:-2]
        self.resnet152_model = torch.nn.Sequential(*modules)
        self.resnet152_model.to("cuda")

    def _multi_gpu_state_to_single(self, state_dict):
        new_sd = {}
        for k, v in state_dict.items():
            if not k.startswith("module."):
                raise TypeError("Not a multiple GPU state of dict")
            k1 = k[7:]
            new_sd[k1] = v
        return new_sd

    def predict(self, url, question):
        with torch.no_grad():
            detectron_features = self.get_detectron_features(url)
            resnet_features = self.get_resnet_features(url)

            sample = Sample()

            processed_text = self.text_processor({"text": question})
            sample.text = processed_text["text"]
            sample.text_len = len(processed_text["tokens"])

            sample.image_feature_0 = detectron_features
            sample.image_info_0 = Sample(
                {"max_features": torch.tensor(100, dtype=torch.long)}
            )

            sample.image_feature_1 = resnet_features

            sample_list = SampleList([sample])
            sample_list = sample_list.to("cuda")

            scores = self.pythia_model(sample_list)["scores"]
            scores = torch.nn.functional.softmax(scores, dim=1)
            actual, indices = scores.topk(5, dim=1)

            top_indices = indices[0]
            top_scores = actual[0]

            probs = []
            answers = []

            for idx, score in enumerate(top_scores):
                probs.append(score.item())
                answers.append(self.answer_processor.idx2word(top_indices[idx].item()))

        gc.collect()
        torch.cuda.empty_cache()

        return probs, answers

    def _build_detection_model(self):

        cfg.merge_from_file(
            os.path.join(BASE_VQA_DIR_PATH, "model_data/detectron_model.yaml")
        )
        cfg.freeze()

        model = build_detection_model(cfg)
        checkpoint = torch.load(
            os.path.join(BASE_VQA_DIR_PATH, "model_data/detectron_model.pth"),
            map_location=torch.device("cpu"),
        )

        load_state_dict(model, checkpoint.pop("model"))

        model.to("cuda")
        model.eval()
        return model

    def get_actual_image(self, image_path):
        if image_path.startswith("http"):
            path = requests.get(image_path, stream=True).raw
        else:
            path = image_path

        return path

    def _image_transform(self, image_path):
        path = self.get_actual_image(image_path)

        img = Image.open(path).convert("RGB")
        im = np.array(img).astype(np.float32)
        im = im[:, :, ::-1]
        im -= np.array([102.9801, 115.9465, 122.7717])
        im_shape = im.shape
        im_size_min = np.min(im_shape[0:2])
        im_size_max = np.max(im_shape[0:2])
        im_scale = float(800) / float(im_size_min)
        # Prevent the biggest axis from being more than max_size
        if np.round(im_scale * im_size_max) > 1333:
            im_scale = float(1333) / float(im_size_max)
        im = cv2.resize(
            im, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR
        )
        img = torch.from_numpy(im).permute(2, 0, 1)
        return img, im_scale

    def _process_feature_extraction(
        self, output, im_scales, feat_name="fc6", conf_thresh=0.2
    ):
        batch_size = len(output[0]["proposals"])
        n_boxes_per_image = [len(_) for _ in output[0]["proposals"]]
        score_list = output[0]["scores"].split(n_boxes_per_image)
        score_list = [torch.nn.functional.softmax(x, -1) for x in score_list]
        feats = output[0][feat_name].split(n_boxes_per_image)
        cur_device = score_list[0].device

        feat_list = []

        for i in range(batch_size):
            dets = output[0]["proposals"][i].bbox / im_scales[i]
            scores = score_list[i]

            max_conf = torch.zeros((scores.shape[0])).to(cur_device)

            for cls_ind in range(1, scores.shape[1]):
                cls_scores = scores[:, cls_ind]
                keep = nms(dets, cls_scores, 0.5)
                max_conf[keep] = torch.where(
                    cls_scores[keep] > max_conf[keep], cls_scores[keep], max_conf[keep]
                )

            keep_boxes = torch.argsort(max_conf, descending=True)[:100]
            feat_list.append(feats[i][keep_boxes])
        return feat_list

    def masked_unk_softmax(self, x, dim, mask_idx):
        x1 = F.softmax(x, dim=dim)
        x1[:, mask_idx] = 0
        x1_sum = torch.sum(x1, dim=1, keepdim=True)
        y = x1 / x1_sum
        return y

    def get_resnet_features(self, image_path):
        path = self.get_actual_image(image_path)
        img = Image.open(path).convert("RGB")
        img_transform = self.data_transforms(img)

        if img_transform.shape[0] == 1:
            img_transform = img_transform.expand(3, -1, -1)
        img_transform = img_transform.unsqueeze(0).to("cuda")

        features = self.resnet152_model(img_transform).permute(0, 2, 3, 1)
        features = features.view(196, 2048)
        return features

    def get_detectron_features(self, image_path):
        im, im_scale = self._image_transform(image_path)
        img_tensor, im_scales = [im], [im_scale]
        current_img_list = to_image_list(img_tensor, size_divisible=32)
        current_img_list = current_img_list.to("cuda")
        with torch.no_grad():
            output = self.detection_model(current_img_list)
        feat_list = self._process_feature_extraction(output, im_scales, "fc6", 0.2)
        return feat_list[0]
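
A minimal sketch of how this class is driven (the image URL and question are placeholders; a CUDA device is required because the code above hard-codes .to("cuda")):

demo = PythiaDemo()
probs, answers = demo.predict(
    "http://example.com/cat.jpg",  # placeholder image URL
    "what animal is this?",
)
for prob, answer in zip(probs, answers):
    print(f"{answer}: {prob:.3f}")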
Example #4
class Tiki:

    TARGET_IMAGE_SIZE = [448, 448]
    CHANNEL_MEAN = [0.485, 0.456, 0.406]
    CHANNEL_STD = [0.229, 0.224, 0.225]

    def __init__(self, device_type='cpu'):

        print(f'Tiki : Initializing : Device Type is {device_type}')

        start = time.time()

        self.device = torch.device(device_type)

        self.build_processors()

        self.resnet152_model = self.build_resnet_model()   # https://github.com/pytorch/vision
        self.detectron_model = self.build_detect_model()   # https://github.com/facebookresearch/Detectron
        self.pythiaVQA_model = self.build_pythia_model()   # https://github.com/facebookresearch/Pythia

        end = time.time()
        print(f'Tiki : Initializing : Finished in {end-start:7.3f} Seconds\n')

        processing['BuildTime'] = end - start

    def build_processors(self):

        print('Tiki : Initializing : Building - Text Processors')

        with open('/final/data/pythia.yaml') as f:
            config = yaml.load(f, Loader=yaml.FullLoader)

        config = ConfigNode(config)
        config.training_parameters.evalai_inference = True  # Remove warning
        registry.register('config', config)

        self.config = config
        vqa_config = config.task_attributes.vqa.dataset_attributes.vqa2
        text_processor_config = vqa_config.processors.text_processor
        answer_processor_config = vqa_config.processors.answer_processor

        text_processor_config.params.vocab.vocab_file = '/final/data/vocabulary_100k.txt'
        answer_processor_config.params.vocab_file = '/final/data/answers_vqa.txt'

        self.text_processor = VocabProcessor(text_processor_config.params)
        self.answer_processor = VQAAnswerProcessor(
            answer_processor_config.params)

        registry.register('vqa2_text_processor', self.text_processor)
        registry.register('vqa2_answer_processor', self.answer_processor)
        registry.register('vqa2_num_final_outputs',
                          self.answer_processor.get_vocab_size())

    def build_pythia_model(self):
        def multi_gpu_state_to_single(state_dict):

            new_sd = {}
            for k, v in state_dict.items():
                if not k.startswith('module.'):
                    raise TypeError('Not a multi-GPU state dict')
                k1 = k[7:]
                new_sd[k1] = v

            return new_sd

        print('Tiki : Initializing : Building - PythiaVQA')

        state_dict = torch.load('/final/data/pythia.pth',
                                map_location=self.device)

        # for param_tensor in state_dict:
        #     print(param_tensor, '\t', state_dict[param_tensor].size())

        model_config = self.config.model_attributes.pythia
        model_config.model_data_dir = '/final/data/'

        model = Pythia(model_config)
        model.build()
        model.init_losses_and_metrics()

        if list(state_dict.keys())[0].startswith('module') and not hasattr(
                model, 'module'):
            state_dict = multi_gpu_state_to_single(state_dict)

        model.load_state_dict(state_dict)

        model.to(self.device.type)
        model.eval()

        return model

    def build_resnet_model(self):

        print('Tiki : Initializing : Building - ResNet152')

        self.data_transforms = transforms.Compose([
            transforms.Resize(self.TARGET_IMAGE_SIZE),
            transforms.ToTensor(),
            transforms.Normalize(self.CHANNEL_MEAN, self.CHANNEL_STD),
        ])

        resnet152 = models.resnet152(pretrained=True)
        modules = list(resnet152.children())[:-2]
        model = torch.nn.Sequential(*modules)

        model.to(self.device.type)
        model.eval()

        return model

    def build_detect_model(self):

        print('Tiki : Initializing : Building - Detectron')

        cfg.merge_from_file('/final/data/detectron_model.yaml')
        cfg.freeze()

        model = build_detection_model(cfg)
        chkpt = torch.load('/final/data/detectron_model.pth',
                           map_location=self.device)

        load_state_dict(model, chkpt.pop('model'))

        model.to(self.device.type)
        model.eval()

        return model

    # PREDICTION ------------------------------------------------------------------------------------

    def feature_extract(self,
                        output,
                        image_scales,
                        feature_name='fc6',
                        conf_thresh=0.2):

        batch_size = len(output[0]['proposals'])
        n_boxes_per_image = [len(_) for _ in output[0]['proposals']]
        score_list = output[0]['scores'].split(n_boxes_per_image)
        score_list = [torch.nn.functional.softmax(x, -1) for x in score_list]
        features = output[0][feature_name].split(n_boxes_per_image)
        cur_device = score_list[0].device

        feature_list = []

        for i in range(batch_size):

            dets = output[0]['proposals'][i].bbox / image_scales[i]
            scores = score_list[i]
            max_conf = torch.zeros((scores.shape[0])).to(cur_device)

            for cls_ind in range(1, scores.shape[1]):
                cls_scores = scores[:, cls_ind]
                keep = nms(dets, cls_scores, 0.5)
                max_conf[keep] = torch.where(cls_scores[keep] > max_conf[keep],
                                             cls_scores[keep], max_conf[keep])

            keep_boxes = torch.argsort(max_conf, descending=True)[:100]
            feature_list.append(features[i][keep_boxes])

        return feature_list

    def image_transform(self, image):

        array = np.array(image).astype(np.float32)
        array = array[:, :, ::-1]
        array -= np.array([102.9801, 115.9465, 122.7717])

        shape = array.shape
        size_min = np.min(shape[0:2])
        size_max = np.max(shape[0:2])
        scale = float(800) / float(size_min)

        # prevent the biggest axis from being more than max_size
        if np.round(scale * size_max) > 1333:
            scale = float(1333) / float(size_max)

        array = cv2.resize(array,
                           None,
                           None,
                           fx=scale,
                           fy=scale,
                           interpolation=cv2.INTER_LINEAR)

        trans = torch.from_numpy(array).permute(2, 0, 1)

        return trans, scale

    def get_detectron_features(self, image):

        start = time.time()
        image, scale = self.image_transform(image)
        images = to_image_list([image], size_divisible=32)
        images = images.to(self.device.type)

        with torch.no_grad():
            output = self.detectron_model(images)

        features = self.feature_extract(output, [scale], 'fc6', 0.2)
        end = time.time()

        print(
            f'Tiki : Getting Features : Detectron - Finished in {end-start:7.3f} Seconds'
        )

        processing['Detectron'] = end - start

        return features[0]

    def get_resnet152_features(self, image):
        def transform_image(image):

            image = self.data_transforms(image)

            if image.shape[0] == 1:
                image = image.expand(3, -1, -1)  # replicate the single grayscale channel to RGB

            image = image.unsqueeze(0)
            image = image.to(self.device.type)

            return image

        start = time.time()
        image = transform_image(image)
        features = self.resnet152_model(image).permute(0, 2, 3, 1)
        features = features.view(196, 2048)
        end = time.time()

        print(
            f'Tiki : Getting Features : ResNet152 - Finished in {end-start:7.3f} Seconds'
        )

        processing['ResNet152'] = end - start

        return features

    def getAnswers(self, image, question, meta=None):

        first = time.time()
        meta = meta or str(image)
        if isinstance(image, str):
            image = Image.open(image)
        image = image.convert('RGB')

        print(f'Tiki : Getting Answers : {meta}, {question}')

        with torch.no_grad():

            detectron_features = self.get_detectron_features(image)
            resnet152_features = self.get_resnet152_features(image)

            start = time.time()
            sample = Sample()

            processed_text = self.text_processor({'text': question})
            sample.text = processed_text['text']
            sample.text_len = len(processed_text['tokens'])

            sample.image_feature_0 = detectron_features
            sample.image_info_0 = Sample(
                {'max_features': torch.tensor(100, dtype=torch.long)})

            sample.image_feature_1 = resnet152_features

            sample_list = SampleList([sample])

            sample_list = sample_list.to(self.device.type)

            scores = self.pythiaVQA_model(sample_list)['scores']
            scores = torch.nn.functional.softmax(scores, dim=1)
            actual, indices = scores.topk(5, dim=1)

            top_indices = indices[0]
            top_scores = actual[0]

            answers = []

            for rank, score in enumerate(top_scores):
                answers.append({
                    'rank': rank,
                    'answer': self.answer_processor.idx2word(top_indices[rank].item()),
                    'probability': score.item(),
                })

            answer = answers[0]['answer']

            end = time.time()

        print(
            f'Tiki : Getting Answers : PythiaVQA - Finished in {end-start:7.3f} Seconds'
        )

        processing['PythiaVQA'] = end - start

        gc.collect()

        torch.cuda.empty_cache()

        last = time.time()

        processing['InferTime'] = last - first

        return question, answer, answers
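
A minimal usage sketch (the image path and question are placeholders; `processing` stands in for the module-level dict the class logs its timings into):

processing = {}  # timing dict referenced throughout the class above

tiki = Tiki(device_type='cpu')
question, answer, answers = tiki.getAnswers('cat.jpg', 'what animal is this?')
print(answer)          # top-ranked answer string
for entry in answers:  # five {'rank', 'answer', 'probability'} dicts
    print(entry)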
Example #5
#!/usr/bin/env python3

import yaml

from pythia.utils.configuration import ConfigNode
from pythia.tasks.processors import VocabProcessor, VQAAnswerProcessor

from torchvision import models

if __name__ == '__main__':

    resnet152 = models.resnet152(pretrained=True)

    with open('/final/data/pythia.yaml') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    config = ConfigNode(config)
    vqa_config = config.task_attributes.vqa.dataset_attributes.vqa2
    text_processor_config = vqa_config.processors.text_processor
    answer_processor_config = vqa_config.processors.answer_processor

    text_processor_config.params.vocab.vocab_file = '/final/data/vocabulary_100k.txt'
    answer_processor_config.params.vocab_file = '/final/data/answers_vqa.txt'

    text_processor = VocabProcessor(text_processor_config.params)
    answer_processor = VQAAnswerProcessor(answer_processor_config.params)
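
This last snippet reads like a warm-up script: it pre-downloads the ResNet-152 weights and builds both processors once, so the cached files exist before anything is served. A hypothetical sanity check one might append inside the __main__ block (the printed size is simply whatever answers_vqa.txt contains):

    # appended to the end of the __main__ block above
    print('answer vocab size:', answer_processor.get_vocab_size())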