Example #1
class CLS3(Dataset):
    def __init__(self,
                 root_path=None,
                 image_set='train',
                 transform=None,
                 test_mode=False,
                 zip_mode=False,
                 cache_mode=False,
                 cache_db=True,
                 tokenizer=None,
                 pretrained_model_name=None,
                 add_image_as_a_box=False,
                 mask_size=(14, 14),
                 aspect_grouping=False,
                 **kwargs):
        """
        Image-Text Classification Dataset

        :param root_path: root path to the annotation files and cache directory
        :param image_set: dataset split to load: 'train', 'val', 'train+val' or 'test'
        :param transform: transform
        :param test_mode: test mode means no labels available
        :param zip_mode: reading images and metadata in zip archive
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param cache_db: cache the database loaded from the annotation files
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param pretrained_model_name: tokenizer/model name, default 'bert-base-uncased'
        :param add_image_as_a_box: add whole image as a box
        :param mask_size: size of instance mask of each object
        :param aspect_grouping: whether to group images via their aspect ratio
        :param kwargs:
        """
        super(CLS3, self).__init__()
        cache_dir = False
        assert not cache_mode, 'currently not support cache mode!'

        categories = [
            '__background__', 'person', 'bicycle', 'car', 'motorcycle',
            'airplane', 'bus', 'train', 'truck', 'boat', 'trafficlight',
            'firehydrant', 'stopsign', 'parkingmeter', 'bench', 'bird', 'cat',
            'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
            'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase',
            'frisbee', 'skis', 'snowboard', 'sportsball', 'kite',
            'baseballbat', 'baseballglove', 'skateboard', 'surfboard',
            'tennisracket', 'bottle', 'wineglass', 'cup', 'fork', 'knife',
            'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
            'broccoli', 'carrot', 'hotdog', 'pizza', 'donut', 'cake', 'chair',
            'couch', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tv',
            'laptop', 'mouse', 'remote', 'keyboard', 'cellphone', 'microwave',
            'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
            'scissors', 'teddybear', 'hairdrier', 'toothbrush'
        ]
        self.category_to_idx = {c: i for i, c in enumerate(categories)}
        self.data_split = image_set  # HACK: reuse old parameter

        self.periodStrip = re.compile(r"(?!<=\d)(\.)(?!\d)")
        self.commaStrip = re.compile(r"(\d)(\,)(\d)")
        self.punct = [
            ';', r"/", '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_',
            '-', '>', '<', '@', '`', ',', '?', '!'
        ]

        self.test_mode = test_mode

        self.root_path = root_path

        self.box_bank = {}

        self.transform = transform
        self.zip_mode = zip_mode

        self.aspect_grouping = aspect_grouping
        self.add_image_as_a_box = add_image_as_a_box

        self.cache_dir = os.path.join(root_path, 'cache')
        # return_offsets_mapping
        model_name = 'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name
        self.fast_tokenizer = AutoTokenizer.from_pretrained(
            'bert-base-uncased',
            cache_dir=self.cache_dir,
            use_fast=True,
            return_offsets_mapping=True)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            model_name,
            cache_dir=self.cache_dir)
        self.max_txt_token = 128

        if zip_mode:
            self.zipreader = ZipReader()

        self.anno_aug = 'anno_aug' in kwargs
        self.database = self.load_annotations()
        self.use_img_box = True
        self.random_drop_tags = False
        # if self.aspect_grouping:
        #     self.group_ids = self.group_aspect(self.database)

    @property
    def data_names(self):
        if self.use_img_box:
            if self.test_mode:
                return [
                    'image',
                    'boxes',
                    'im_info',
                    'text',
                    'img_boxes',
                    'text_tags',
                    'id',
                ]
            else:
                return [
                    'image', 'boxes', 'im_info', 'text', 'img_boxes',
                    'text_tags', 'label', 'id'
                ]
        else:
            if self.test_mode:
                return [
                    'image',
                    'boxes',
                    'im_info',
                    'text',
                    'id',
                ]
            else:
                return ['image', 'boxes', 'im_info', 'text', 'label', 'id']

    @property
    def weights_by_class(self):
        labels = []
        num_per_class = collections.defaultdict(lambda: 0)
        for data in self.database:
            labels.append(data['label'])
            num_per_class[data['label']] += 1

        weight_per_class = {
            k: 1 / len(num_per_class) / v
            for k, v in num_per_class.items()
        }
        sampling_weight = [weight_per_class[label] for label in labels]
        return sampling_weight

    def clip_box_and_score(self, box_and_score):
        new_list = []
        for box_sc in box_and_score:
            clipped = {k: min(max(v, 0), 1) for k, v in box_sc.items()}
            new_list.append(clipped)
        return new_list

    def __getitem__(self, index):
        idb = self.database[index]

        # image, boxes, im_info
        image = self._load_image(os.path.join(self.root_path, idb['img']))
        w0, h0 = image.size

        if len(idb['boxes_and_score']) == 0:
            boxes = torch.as_tensor([[0.0, 0.0, w0 - 1, h0 - 1, 0]])
        else:
            boxes = torch.as_tensor([[
                box_sc['xmin'] * w0,
                box_sc['ymin'] * h0,
                box_sc['xmax'] * w0,
                box_sc['ymax'] * h0,
                box_sc['class_id'],
            ] for box_sc in idb['boxes_and_score']])
            if self.add_image_as_a_box:
                boxes = torch.cat(
                    (torch.as_tensor([[0.0, 0.0, w0 - 1, h0 - 1, 0]]), boxes),
                    dim=0)

        race_tags = [box_sc['race'] for box_sc in idb['boxes_and_score']]
        gender_tags = [box_sc['gender'] for box_sc in idb['boxes_and_score']]

        im_info = torch.tensor([w0, h0, 1.0, 1.0])
        # clamp boxes
        w = im_info[0].item()
        h = im_info[1].item()
        boxes[:, [0, 2]] = boxes[:, [0, 2]].clamp(min=0, max=w - 1)
        boxes[:, [1, 3]] = boxes[:, [1, 3]].clamp(min=0, max=h - 1)

        flipped = False
        if self.transform is not None:
            image, boxes, _, im_info, flipped = self.transform(
                image, boxes, None, im_info, flipped)

        # question
        if 'token_id' not in idb:
            main_txt = idb['text']
            img_tags = [' '.join(des) for des in idb['partition_description']]
            img_tags_str = ''
            img_tags_part = []

            if not self.random_drop_tags or random.random() > 0.5:
                for p, img_tag in enumerate(img_tags):
                    if img_tag:
                        append_str = img_tag + (' [SEP] ' if
                                                p != len(img_tags) - 1 else '')
                        img_tags_str += append_str
                        img_tags_part += [p] * len(append_str)

            person_tags_str = ''
            person_tags_part = []
            for j, (race, gend) in enumerate(zip(race_tags, gender_tags)):
                if race is not None:
                    is_last = not any(
                        [rt is not None for rt in race_tags[j + 1:]])
                    append_str = f"{race.replace('_', ' ')} {gend}"
                    append_str += "" if is_last else " [SEP] "
                    person_tags_str += append_str
                    person_tags_part += [
                        len(idb['image_partition']) * int(self.use_img_box) +
                        int(self.add_image_as_a_box) + j
                    ] * len(append_str)

            text_with_tag = f"{main_txt} [SEP] {img_tags_str} [SEP] {person_tags_str}"
            # print(f"[{index}] {text_with_tag}")
            result = self.fast_tokenizer(text_with_tag,
                                         return_offsets_mapping=True,
                                         add_special_tokens=False)
            token_id = result['input_ids']
            token_offset = result['offset_mapping']

            if self.use_img_box:
                text_partition = idb['text_char_partition_id']
                # additional partition id 0 for each " [SEP] " separator
                text_partition += [0] * len(" [SEP] ") + img_tags_part
                text_partition += [0] * len(" [SEP] ") + person_tags_part
                assert len(text_partition) == len(text_with_tag), \
                    f"{len(text_partition)} != {len(text_with_tag)}"

                token_tags = []
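                # Each token inherits the partition id covering the majority of its
                # characters, using the fast tokenizer's offset_mapping; e.g. a token
                # whose characters carry partition ids [2, 2, 0] is tagged 2.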
                for a, b in filter(lambda x: x[1] - x[0] > 0, token_offset):
                    char_tags = text_partition[a:b]
                    # print(a, b, char_tags)
                    cnt = collections.Counter(char_tags)
                    token_tags.append(cnt.most_common(1)[0][0])

                idb['text_tags'] = token_tags
                idb['image_partition'] = np.asarray(
                    idb['image_partition'], dtype=np.float32)[
                        ..., :4]  # HACK: remove det score from mmdet
            else:
                idb['text_tags'] = [0] * len(token_id)

            # token_id = self.tokenizer.convert_tokens_to_ids(text_tokens)
            if token_id[-1] == self.fast_tokenizer.sep_token_id:
                token_id = token_id[:-1]
                idb['text_tags'] = idb['text_tags'][:-1]

            if len(token_id) > self.max_txt_token:
                token_id = token_id[:self.max_txt_token]
                idb['text_tags'] = idb['text_tags'][:self.max_txt_token]

            idb['token_id'] = token_id
            assert len(idb['token_id']) == len(idb['text_tags'])
        else:
            token_id = idb['token_id']

        if self.use_img_box:
            if self.test_mode:
                return (
                    image,
                    boxes,
                    im_info,
                    token_id,
                    idb['image_partition'],
                    idb['text_tags'],
                    idb['id'],
                )
            else:
                # print([(self.answer_vocab[i], p.item()) for i, p in enumerate(label) if p.item() != 0])
                label = torch.Tensor([float(idb['label'])])
                return (
                    image,
                    boxes,
                    im_info,
                    token_id,
                    idb['image_partition'],
                    idb['text_tags'],
                    label,
                    idb['id'],
                )
        else:
            if self.test_mode:
                return image, boxes, im_info, token_id, idb['id']
            else:
                # print([(self.answer_vocab[i], p.item()) for i, p in enumerate(label) if p.item() != 0])
                label = torch.Tensor([float(idb['label'])])
                return image, boxes, im_info, token_id, label, idb['id']

    @staticmethod
    def b64_decode(string):
        return base64.decodebytes(string.encode())

    def load_annotations(self):
        tic = time.time()
        img_name_to_annos = collections.defaultdict(list)

        test_json = os.path.join(self.root_path, 'test_unseen.entity.jsonl')
        dev_json = os.path.join(self.root_path, 'dev_seen.entity.jsonl')
        dev_train_json = os.path.join(self.root_path, 'dev_all.entity.jsonl')
        train_json = (os.path.join(self.root_path, 'train.entity.aug.jsonl')
                      if self.anno_aug else os.path.join(
                          self.root_path, 'train.entity.jsonl'))
        box_annos_json = os.path.join(self.root_path, 'box_annos.race.json')

        test_sample = []
        dev_sample = []
        train_sample = []
        dev_train_sample = []

        with open(train_json, mode='r') as f:
            for line in f.readlines():
                train_sample.append(json.loads(line))

        with open(dev_train_json, mode='r') as f:
            for line in f.readlines():
                dev_train_sample.append(json.loads(line))

        with open(test_json, mode='r') as f:
            for line in f.readlines():
                test_sample.append(json.loads(line))

        with open(dev_json, mode='r') as f:
            for line in f.readlines():
                dev_sample.append(json.loads(line))

        with open(box_annos_json, mode='r') as f:
            box_annos = json.load(f)

        sample_sets = []
        if self.data_split == 'train':
            sample_sets.append(train_sample)
        elif self.data_split == 'val':
            sample_sets.append(dev_sample)
        elif self.data_split == 'train+val':
            sample_sets.append(train_sample)
            sample_sets.append(dev_train_sample)
        elif self.data_split == 'test':
            sample_sets.append(test_sample)
        else:
            raise RuntimeError(f"Unknown dataset split: {self.data_split}")

        for sample_set in sample_sets:
            for sample in sample_set:
                img_name = os.path.basename(sample['img'])
                img_name_to_annos[img_name].append(sample)

        for box_anno in box_annos:
            img_name = box_anno['img_name']
            if img_name in img_name_to_annos:
                for sample in img_name_to_annos[img_name]:
                    sample.update(box_anno)

        print('Done (t={:.2f}s)'.format(time.time() - tic))

        flatten = []
        for annos in img_name_to_annos.values():
            flatten += annos
        return flatten

    @staticmethod
    def group_aspect(database):
        print('grouping aspect...')
        t = time.time()

        # get shape of all images
        widths = torch.as_tensor([idb['width'] for idb in database])
        heights = torch.as_tensor([idb['height'] for idb in database])

        # group
        group_ids = torch.zeros(len(database))
        horz = widths >= heights
        vert = ~horz  # widths < heights
        group_ids[horz] = 0
        group_ids[vert] = 1

        print('Done (t={:.2f}s)'.format(time.time() - t))

        return group_ids

    def load_precomputed_boxes(self, box_file):
        if box_file in self.box_bank:
            return self.box_bank[box_file]
        else:
            in_data = {}
            with open(box_file, "r") as tsv_in_file:
                reader = csv.DictReader(tsv_in_file,
                                        delimiter='\t',
                                        fieldnames=FIELDNAMES)
                for item in reader:
                    item['image_id'] = int(item['image_id'])
                    item['image_h'] = int(item['image_h'])
                    item['image_w'] = int(item['image_w'])
                    item['num_boxes'] = int(item['num_boxes'])
                    for field in ([
                            'boxes', 'features'
                    ] if self.with_precomputed_visual_feat else ['boxes']):
                        item[field] = np.frombuffer(
                            base64.decodebytes(item[field].encode()),
                            dtype=np.float32).reshape((item['num_boxes'], -1))
                    in_data[item['image_id']] = item
            self.box_bank[box_file] = in_data
            return in_data

    def __len__(self):
        return len(self.database)

    def _load_image(self, path):
        if '.zip@' in path:
            return self.zipreader.imread(path).convert('RGB')
        else:
            return Image.open(path).convert('RGB')

    def _load_json(self, path):
        if '.zip@' in path:
            f = self.zipreader.read(path)
            return json.loads(f.decode())
        else:
            with open(path, 'r') as f:
                return json.load(f)
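
# Illustrative, self-contained sketch of the inverse-frequency weighting returned
# by CLS3.weights_by_class above, and how such weights would typically feed
# torch's WeightedRandomSampler so every class contributes the same total
# sampling mass. The toy labels below are made up purely for demonstration.
import collections

from torch.utils.data import WeightedRandomSampler

toy_labels = [0, 0, 0, 1]                                # imbalanced toy labels
counts = collections.Counter(toy_labels)                 # {0: 3, 1: 1}
weight_per_class = {k: 1 / len(counts) / v for k, v in counts.items()}
weights = [weight_per_class[y] for y in toy_labels]      # [1/6, 1/6, 1/6, 1/2]
sampler = WeightedRandomSampler(weights, num_samples=len(toy_labels), replacement=True)
# Classes 0 and 1 now each carry total weight 0.5, so the rare class is drawn
# roughly as often as the frequent one.
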
Example #2
class VQA_CP(Dataset):
    def __init__(self, image_set, root_path, data_path, answer_vocab_file, use_imdb=True,
                 with_precomputed_visual_feat=False, boxes="36",
                 transform=None, test_mode=False,
                 zip_mode=False, cache_mode=False, cache_db=True, ignore_db_cache=True,
                 tokenizer=None, pretrained_model_name=None,
                 add_image_as_a_box=False, mask_size=(14, 14),
                 aspect_grouping=False, toy_dataset=False, toy_samples=128, **kwargs):
        """
        Visual Question Answering Dataset

        :param image_set: image folder name
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to the VQA-CP v2 dataset
        :param answer_vocab_file: answer vocabulary file (same format as for VQA v2)
        :param boxes: which precomputed boxes to use, '36' or '10-100ada'
        :param transform: transform
        :param test_mode: test mode means no labels available
        :param zip_mode: reading images and metadata in zip archive
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param ignore_db_cache: ignore previous cached database, reload it from annotation file
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add whole image as a box
        :param mask_size: size of instance mask of each object
        :param aspect_grouping: whether to group images via their aspect
        :param kwargs:
        """
        super(VQA_CP, self).__init__()

        assert not cache_mode, 'currently not support cache mode!'

        categories = ['__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
                      'boat',
                      'trafficlight', 'firehydrant', 'stopsign', 'parkingmeter', 'bench', 'bird', 'cat', 'dog', 'horse',
                      'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
                      'suitcase', 'frisbee', 'skis', 'snowboard', 'sportsball', 'kite', 'baseballbat', 'baseballglove',
                      'skateboard', 'surfboard', 'tennisracket', 'bottle', 'wineglass', 'cup', 'fork', 'knife', 'spoon',
                      'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hotdog', 'pizza', 'donut',
                      'cake', 'chair', 'couch', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tv', 'laptop', 'mouse',
                      'remote', 'keyboard', 'cellphone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book',
                      'clock', 'vase', 'scissors', 'teddybear', 'hairdrier', 'toothbrush']
        vqa_question = {
            "train": "vqa/vqacp_v2_train_questions.json",
            "val": "vqa/vqacp_v2_test_questions.json",
        }
        vqa_annot = {
            "train": "vqa/vqacp_v2_train_annotations.json",
            "val": "vqa/vqacp_v2_test_annotations.json",
        }
        
        if boxes == "36":
            precomputed_boxes = {
                'train': ("vgbua_res101_precomputed", "{}_resnet101_faster_rcnn_genome_36"),
                'val': ("vgbua_res101_precomputed", "{}_resnet101_faster_rcnn_genome_36"),
            }
        elif boxes == "10-100ada":
            precomputed_boxes = {
                'train': ("vgbua_res101_precomputed", "{}_resnet101_faster_rcnn_genome"),
                'val': ("vgbua_res101_precomputed", "{}_resnet101_faster_rcnn_genome"),
            }
        else:
            raise ValueError("Not support boxes: {}!".format(boxes))

        self.coco_dataset = {
            "train2014": os.path.join(data_path, "annotations", "instances_train2014.json"),
            "val2014": os.path.join(data_path, "annotations", "instances_val2014.json"),
            "test-dev2015": os.path.join(data_path, "annotations", "image_info_test-dev2015.json"),
            "test2015": os.path.join(data_path, "annotations", "image_info_test2015.json"),
        }

        self.periodStrip = re.compile(r"(?!<=\d)(\.)(?!\d)")
        self.commaStrip = re.compile(r"(\d)(\,)(\d)")
        self.punct = [';', r"/", '[', ']', '"', '{', '}',
                      '(', ')', '=', '+', '\\', '_', '-',
                      '>', '<', '@', '`', ',', '?', '!']

        self.boxes = boxes
        self.test_mode = test_mode
        self.with_precomputed_visual_feat = with_precomputed_visual_feat
        self.category_to_idx = {c: i for i, c in enumerate(categories)}
        self.data_path = data_path
        self.root_path = root_path

        # load the answer vocab file: same as vqav2 dataset
        with open(answer_vocab_file, 'r', encoding='utf8') as f:
            self.answer_vocab = [w.lower().strip() for w in f.readlines()]
            self.answer_vocab = list(filter(lambda x: x != '', self.answer_vocab))
            self.answer_vocab = [self.processPunctuation(w) for w in self.answer_vocab]

        # config.DATA.TRAIN_IMAGE_SET and config.DATA.VAL_IMAGE_SET are used a
        # little differently here: they indicate the mode, 'train' or 'val'
        self.image_sets = [iset.strip() for iset in image_set.split('+')]
        self.ann_files = [os.path.join(data_path, vqa_annot[iset]) for iset in self.image_sets] \
            if not self.test_mode else [None for iset in self.image_sets]
        self.q_files = [os.path.join(data_path, vqa_question[iset]) for iset in self.image_sets]

        self.precomputed_box_files = [
            os.path.join(data_path, precomputed_boxes[iset][0], precomputed_boxes[iset][1]) for iset in self.image_sets]

        self.box_bank = {}
        self.coco_datasets = [os.path.join(data_path, '{}', 'COCO_{}_{{:012d}}.jpg') for iset in self.image_sets]

        self.transform = transform
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        self.mask_size = mask_size

        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        if zip_mode:
            self.zipreader = ZipReader()

        self.database = self.load_annotations()
        if self.aspect_grouping:
            self.group_ids = self.group_aspect(self.database)

        # toy dataset
        if toy_dataset:
            print(f"Using the toy dataset!! Total samples = {toy_samples}")
            self.database = self.database[:toy_samples]

    @property
    def data_names(self):
        if self.test_mode:
            return ['image', 'boxes', 'im_info', 'question']
        else:
            return ['image', 'boxes', 'im_info', 'question', 'label']

    def __getitem__(self, index):
        idb = self.database[index]

        # image, boxes, im_info
        boxes_data = self._load_json(idb['box_fn'])
        if self.with_precomputed_visual_feat:
            image = None
            w0, h0 = idb['width'], idb['height']

            boxes_features = torch.as_tensor(
                np.frombuffer(self.b64_decode(boxes_data['features']), dtype=np.float32).reshape((boxes_data['num_boxes'], -1))
            )
        else:
            image = self._load_image(idb['image_fn'])
            w0, h0 = image.size
        boxes = torch.as_tensor(
            np.frombuffer(self.b64_decode(boxes_data['boxes']), dtype=np.float32).reshape(
                (boxes_data['num_boxes'], -1))
        )

        if self.add_image_as_a_box:
            image_box = torch.as_tensor([[0.0, 0.0, w0 - 1, h0 - 1]])
            boxes = torch.cat((image_box, boxes), dim=0)
            if self.with_precomputed_visual_feat:
                if 'image_box_feature' in boxes_data:
                    image_box_feature = torch.as_tensor(
                        np.frombuffer(
                            self.b64_decode(boxes_data['image_box_feature']), dtype=np.float32
                        ).reshape((1, -1))
                    )
                else:
                    image_box_feature = boxes_features.mean(0, keepdim=True)
                boxes_features = torch.cat((image_box_feature, boxes_features), dim=0)
        im_info = torch.tensor([w0, h0, 1.0, 1.0])
        flipped = False
        if self.transform is not None:
            image, boxes, _, im_info, flipped = self.transform(image, boxes, None, im_info, flipped)

        # clamp boxes
        w = im_info[0].item()
        h = im_info[1].item()
        boxes[:, [0, 2]] = boxes[:, [0, 2]].clamp(min=0, max=w - 1)
        boxes[:, [1, 3]] = boxes[:, [1, 3]].clamp(min=0, max=h - 1)

        # flip: 'left' -> 'right', 'right' -> 'left'
        q_tokens = self.tokenizer.tokenize(idb['question'])
        if flipped:
            q_tokens = self.flip_tokens(q_tokens, verbose=False)
        if not self.test_mode:
            answers = idb['answers']
            if flipped:
                answers_tokens = [a.split(' ') for a in answers]
                answers_tokens = [self.flip_tokens(a_toks, verbose=False) for a_toks in answers_tokens]
                answers = [' '.join(a_toks) for a_toks in answers_tokens]
            label = self.get_soft_target(answers)

        # question
        q_retokens = q_tokens
        q_ids = self.tokenizer.convert_tokens_to_ids(q_retokens)

        # concat box feature to box
        if self.with_precomputed_visual_feat:
            boxes = torch.cat((boxes, boxes_features), dim=-1)

        if self.test_mode:
            return image, boxes, im_info, q_ids
        else:
            # print([(self.answer_vocab[i], p.item()) for i, p in enumerate(label) if p.item() != 0])
            return image, boxes, im_info, q_ids, label

    @staticmethod
    def flip_tokens(tokens, verbose=True):
        changed = False
        tokens_new = [tok for tok in tokens]
        for i, tok in enumerate(tokens):
            if tok == 'left':
                tokens_new[i] = 'right'
                changed = True
            elif tok == 'right':
                tokens_new[i] = 'left'
                changed = True
        if verbose and changed:
            logging.info('[Tokens Flip] {} -> {}'.format(tokens, tokens_new))
        return tokens_new

    @staticmethod
    def b64_decode(string):
        return base64.decodebytes(string.encode())

    def answer_to_ind(self, answer):
        if answer in self.answer_vocab:
            return self.answer_vocab.index(answer)
        else:
            return self.answer_vocab.index('<unk>')

    def get_soft_target(self, answers):
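        # Soft VQA-style scoring: each candidate answer receives the leave-one-out
        # accuracy min(1, #matching human answers / 3), averaged over all
        # annotations. With the usual ten annotators, an answer given by 2 of them
        # scores 0.6, by 3 scores 0.9, and by 4 or more scores 1.0.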

        soft_target = torch.zeros(len(self.answer_vocab), dtype=torch.float)
        answer_indices = [self.answer_to_ind(answer) for answer in answers]
        gt_answers = list(enumerate(answer_indices))
        unique_answers = set(answer_indices)

        for answer in unique_answers:
            accs = []
            for gt_answer in gt_answers:
                other_answers = [item for item in gt_answers if item != gt_answer]

                matching_answers = [item for item in other_answers if item[1] == answer]
                acc = min(1, float(len(matching_answers)) / 3)
                accs.append(acc)
            avg_acc = sum(accs) / len(accs)

            if answer != self.answer_vocab.index('<unk>'):
                soft_target[answer] = avg_acc

        return soft_target

    def processPunctuation(self, inText):

        if inText == '<unk>':
            return inText

        outText = inText
        for p in self.punct:
            if (p + ' ' in inText or ' ' + p in inText) or (re.search(self.commaStrip, inText) is not None):
                outText = outText.replace(p, '')
            else:
                outText = outText.replace(p, ' ')
        outText = self.periodStrip.sub("", outText)
        return outText

    def load_annotations(self):
        tic = time.time()
        database = []
        db_cache_name = 'vqa_cp2_boxes{}_{}'.format(self.boxes, '+'.join(self.image_sets))
        if self.with_precomputed_visual_feat:
            db_cache_name += 'visualprecomp'
        if self.zip_mode:
            db_cache_name = db_cache_name + '_zipmode'
        if self.test_mode:
            db_cache_name = db_cache_name + '_testmode'
        db_cache_root = os.path.join(self.root_path, 'cache')
        db_cache_path = os.path.join(db_cache_root, '{}.pkl'.format(db_cache_name))

        if os.path.exists(db_cache_path):
            if not self.ignore_db_cache:
                # reading cached database
                print('cached database found in {}.'.format(db_cache_path))
                with open(db_cache_path, 'rb') as f:
                    print('loading cached database from {}...'.format(db_cache_path))
                    tic = time.time()
                    database = cPickle.load(f)
                    print('Done (t={:.2f}s)'.format(time.time() - tic))
                    return database
            else:
                print('cached database ignored.')

        # ignore or not find cached database, reload it from annotation file
        print('loading database of split {}...'.format('+'.join(self.image_sets)))
        tic = time.time()

        for ann_file, q_file, coco_path, box_file \
                in zip(self.ann_files, self.q_files, self.coco_datasets, self.precomputed_box_files):
            qs = self._load_json(q_file)
            anns = self._load_json(ann_file) if not self.test_mode else ([None] * len(qs))

            # we need to create 3 coco objects
            coco_train2014 = COCO(self.coco_dataset['train2014'])
            coco_val2014 = COCO(self.coco_dataset['val2014'])
            coco_test2015 = COCO(self.coco_dataset['test2015'])
            for ann, q in zip(anns, qs):
                if q['coco_split'] == 'train2014':
                    coco_obj = coco_train2014
                    box_dir = 'trainval2014'
                elif q['coco_split'] == 'val2014':
                    coco_obj = coco_val2014
                    box_dir = 'trainval2014'
                elif q['coco_split'] == 'test2015':
                    coco_obj = coco_test2015
                    box_dir = 'test2015'
                else:
                    raise ValueError("COCO split in question : {} not supported".format(q['coco_split']))

                idb = {'image_id': q['image_id'],
                       'image_fn': coco_path.format(q['coco_split'], q['coco_split'], q['image_id']),
                       'width': coco_obj.imgs[q['image_id']]['width'],
                       'height': coco_obj.imgs[q['image_id']]['height'],
                       'box_fn': os.path.join(box_file.format(box_dir), '{}.json'.format(q['image_id'])),
                       'question_id': q['question_id'],
                       'question': q['question'],
                       'answers': [a['answer'] for a in ann['answers']] if not self.test_mode else None,
                       'multiple_choice_answer': ann['multiple_choice_answer'] if not self.test_mode else None,
                       "question_type": ann['question_type'] if not self.test_mode else None,
                       "answer_type": ann['answer_type'] if not self.test_mode else None,
                       }
                database.append(idb)

        print('Done (t={:.2f}s)'.format(time.time() - tic))

        # cache database via cPickle
        if self.cache_db:
            print('caching database to {}...'.format(db_cache_path))
            tic = time.time()
            if not os.path.exists(db_cache_root):
                makedirsExist(db_cache_root)
            with open(db_cache_path, 'wb') as f:
                cPickle.dump(database, f)
            print('Done (t={:.2f}s)'.format(time.time() - tic))

        return database

    @staticmethod
    def group_aspect(database):
        print('grouping aspect...')
        t = time.time()

        # get shape of all images
        widths = torch.as_tensor([idb['width'] for idb in database])
        heights = torch.as_tensor([idb['height'] for idb in database])

        # group
        group_ids = torch.zeros(len(database))
        horz = widths >= heights
        vert = ~horz  # widths < heights
        group_ids[horz] = 0
        group_ids[vert] = 1

        print('Done (t={:.2f}s)'.format(time.time() - t))

        return group_ids

    def load_precomputed_boxes(self, box_file):
        if box_file in self.box_bank:
            return self.box_bank[box_file]
        else:
            in_data = {}
            with open(box_file, "r") as tsv_in_file:
                reader = csv.DictReader(tsv_in_file, delimiter='\t', fieldnames=FIELDNAMES)
                for item in reader:
                    item['image_id'] = int(item['image_id'])
                    item['image_h'] = int(item['image_h'])
                    item['image_w'] = int(item['image_w'])
                    item['num_boxes'] = int(item['num_boxes'])
                    for field in (['boxes', 'features'] if self.with_precomputed_visual_feat else ['boxes']):
                        item[field] = np.frombuffer(base64.decodebytes(item[field].encode()),
                                                    dtype=np.float32).reshape((item['num_boxes'], -1))
                    in_data[item['image_id']] = item
            self.box_bank[box_file] = in_data
            return in_data

    def __len__(self):
        return len(self.database)

    def _load_image(self, path):
        if '.zip@' in path:
            return self.zipreader.imread(path).convert('RGB')
        else:
            return Image.open(path).convert('RGB')

    def _load_json(self, path):
        if '.zip@' in path:
            f = self.zipreader.read(path)
            return json.loads(f.decode())
        else:
            with open(path, 'r') as f:
                return json.load(f)
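
# Illustrative sketch of the base64 decoding used throughout these datasets
# (b64_decode + np.frombuffer): precomputed boxes/features are stored as
# base64-encoded float32 buffers and reshaped to (num_boxes, -1). The round
# trip below uses made-up numbers purely for demonstration.
import base64

import numpy as np

demo_boxes = np.asarray([[0.0, 0.0, 10.0, 10.0],
                         [5.0, 5.0, 20.0, 30.0]], dtype=np.float32)
encoded = base64.b64encode(demo_boxes.tobytes()).decode()   # what the json would store
decoded = np.frombuffer(base64.decodebytes(encoded.encode()),
                        dtype=np.float32).reshape((2, -1))
assert np.allclose(demo_boxes, decoded)
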
Example #3
class Multi30kDataset2018(Dataset):
    def __init__(self, ann_file, image_set, root_path, data_path, seq_len=64,
                 with_precomputed_visual_feat=False, mask_raw_pixels=True,
                 with_rel_task=True, with_mlm_task=False, with_mvrc_task=False,
                 transform=None, test_mode=False,
                 zip_mode=False, cache_mode=False, cache_db=False, ignore_db_cache=True,
                 tokenizer=None, pretrained_model_name=None,
                 add_image_as_a_box=False,
                 aspect_grouping=False, languages_used='first', MLT_vocab='bert-base-german-cased-vocab.txt', **kwargs):
        """
        Multi30k 2018 Dataset

        :param ann_file: annotation jsonl file
        :param image_set: dataset split, e.g., 'train', 'val' or 'test2015'
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to the Multi30k dataset
        :param transform: transform
        :param test_mode: test mode means no labels available
        :param zip_mode: reading images and metadata in zip archive
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param ignore_db_cache: ignore previous cached database, reload it from annotation file
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add whole image as a box
        :param aspect_grouping: whether to group images via their aspect
        :param kwargs:
        """
        super(Multi30kDataset2018, self).__init__()

        assert not cache_mode, 'currently not support cache mode!'
        # TODO: need to remove this to allow testing
        # assert not test_mode

        annot = {'train': 'train_MLT_frcnn.json',
                 'val': 'val_MLT_frcnn.json',
                 'test2015': 'test_MLT_2018_renamed_frcnn.json'}

        self.seq_len = seq_len
        self.with_rel_task = with_rel_task
        self.with_mlm_task = with_mlm_task
        self.with_mvrc_task = with_mvrc_task
        self.data_path = data_path
        self.root_path = root_path
        self.ann_file = os.path.join(data_path, annot[image_set])
        self.with_precomputed_visual_feat = with_precomputed_visual_feat
        self.mask_raw_pixels = mask_raw_pixels
        self.image_set = image_set
        self.transform = transform
        self.test_mode = test_mode
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        #FM edit: added option for how many captions
        self.languages_used = languages_used
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        self.zipreader = ZipReader()

        # FM: Customise for multi30k dataset
        self.database = list(jsonlines.open(self.ann_file))
        if not self.zip_mode:
            for i, idb in enumerate(self.database):
                self.database[i]['frcnn'] = idb['frcnn'].replace('.zip@', '')\
                    .replace('.0', '').replace('.1', '').replace('.2', '').replace('.3', '')
                self.database[i]['image'] = idb['image'].replace('.zip@', '')


        if self.aspect_grouping:
            assert False, "aspect grouping is currently not supported!"
            self.group_ids = self.group_aspect(self.database)

        print('mask_raw_pixels: ', self.mask_raw_pixels)

        #FM: initialise vocabulary for output
        self.MLT_vocab_path = os.path.join(root_path, 'model/pretrained_model', MLT_vocab)
        self.MLT_vocab = []
        with open(self.MLT_vocab_path) as fp:
            for cnt, line in enumerate(fp):
                self.MLT_vocab.append(line.strip())


    @property
    def data_names(self):
        return ['image', 'boxes', 'im_info', 'text',
                'relationship_label', 'mlm_labels', 'mvrc_ops', 'mvrc_labels']

    def __getitem__(self, index):
        idb = self.database[index]

        # image data
        # IN ALL CASES: boxes and cls scores are available for each image
        frcnn_data = self._load_json(os.path.join(self.data_path, idb['frcnn']))
        boxes = np.frombuffer(self.b64_decode(frcnn_data['boxes']),
                              dtype=np.float32).reshape((frcnn_data['num_boxes'], -1))
        boxes_cls_scores = np.frombuffer(self.b64_decode(frcnn_data['classes']),
                                         dtype=np.float32).reshape((frcnn_data['num_boxes'], -1))
        boxes_max_conf = boxes_cls_scores.max(axis=1)
        inds = np.argsort(boxes_max_conf)[::-1]
        boxes = boxes[inds]
        boxes_cls_scores = boxes_cls_scores[inds]
        boxes = torch.as_tensor(boxes)

        # load precomputed features or the whole image depending on setup
        if self.with_precomputed_visual_feat:
            image = None
            w0, h0 = frcnn_data['image_w'], frcnn_data['image_h']
            boxes_features = np.frombuffer(self.b64_decode(frcnn_data['features']),
                                           dtype=np.float32).reshape((frcnn_data['num_boxes'], -1))
            boxes_features = boxes_features[inds]
            boxes_features = torch.as_tensor(boxes_features)
        else:
            try:
                image = self._load_image(os.path.join(self.data_path, idb['image']))
                w0, h0 = image.size
            except Exception:
                print("Failed to load image {}, using a zero image instead!".format(idb['image']))
                image = None
                w0, h0 = frcnn_data['image_w'], frcnn_data['image_h']

        # append whole image to tensor of boxes (used for all linguistic tokens)
        if self.add_image_as_a_box:
            image_box = torch.as_tensor([[0.0, 0.0, w0 - 1.0, h0 - 1.0]])
            boxes = torch.cat((image_box, boxes), dim=0)
            if self.with_precomputed_visual_feat:
                image_box_feat = boxes_features.mean(dim=0, keepdim=True)
                boxes_features = torch.cat((image_box_feat, boxes_features), dim=0)

        # transform
        im_info = torch.tensor([w0, h0, 1.0, 1.0, index])
        if self.transform is not None:
            image, boxes, _, im_info = self.transform(image, boxes, None, im_info)

        if image is None and (not self.with_precomputed_visual_feat):
            w = int(im_info[0].item())
            h = int(im_info[1].item())
            image = im_info.new_zeros((3, h, w), dtype=torch.float)

        # clamp boxes
        w = im_info[0].item()
        h = im_info[1].item()
        boxes[:, [0, 2]] = boxes[:, [0, 2]].clamp(min=0, max=w-1)
        boxes[:, [1, 3]] = boxes[:, [1, 3]].clamp(min=0, max=h-1)

        # FM edit: remove - Task #1: Caption-Image Relationship Prediction
        word_en = idb['word_en']
        word_de = idb['word_de']
        caption_en = idb['caption_en']
        caption_de = idb['caption_de']
        
        # FM edit: add captions - tokenise words
        caption_tokens_en = self.tokenizer.tokenize(caption_en)
        # caption_tokens_de = self.tokenizer.tokenize(caption_de)
        word_tokens_en = self.tokenizer.tokenize(word_en)
        # word_tokens_de = self.tokenizer.tokenize(word_de)
        mlm_labels_en = [-1] * len(caption_tokens_en)
        mlm_labels_word_en = [-1] * len(word_tokens_en)
        # mlm_labels_word_de = [-1] * len(caption_tokens_de)
        # mlm_labels_de = [-1] * len(caption_tokens_de)
        
        text_tokens = ['[CLS]'] + word_tokens_en + ['[SEP]'] + caption_tokens_en + ['[SEP]']
        mlm_labels = [-1] + mlm_labels_word_en + [-1] + mlm_labels_en + [-1]

        # relationship label - not used
        relationship_label = 1

        # Construct boxes
        mvrc_ops = [0] * boxes.shape[0]
        mvrc_labels = [np.zeros_like(boxes_cls_scores[0])] * boxes.shape[0]

        # store labels for masked regions
        mvrc_labels = np.stack(mvrc_labels, axis=0)

        # convert tokens to ids 
        text = self.tokenizer.convert_tokens_to_ids(text_tokens)

        if self.with_precomputed_visual_feat:
            boxes = torch.cat((boxes, boxes_features), dim=1)

        # truncate seq to max len
        if len(text) + len(boxes) > self.seq_len:
            text_len_keep = len(text)
            box_len_keep = len(boxes)
            while (text_len_keep + box_len_keep) > self.seq_len and (text_len_keep > 0) and (box_len_keep > 0):
                if box_len_keep > text_len_keep:
                    box_len_keep -= 1
                else:
                    text_len_keep -= 1
            if text_len_keep < 2:
                text_len_keep = 2
            if box_len_keep < 1:
                box_len_keep = 1
            boxes = boxes[:box_len_keep]
            text = text[:(text_len_keep - 1)] + [text[-1]]
            mlm_labels = mlm_labels[:(text_len_keep - 1)] + [mlm_labels[-1]]
            mvrc_ops = mvrc_ops[:box_len_keep]
            mvrc_labels = mvrc_labels[:box_len_keep]


        return image, boxes, im_info, text, relationship_label, mlm_labels, mvrc_ops, mvrc_labels

    @staticmethod
    def b64_decode(string):
        return base64.decodebytes(string.encode())

    @staticmethod
    def group_aspect(database):
        print('grouping aspect...')
        t = time.time()

        # get shape of all images
        widths = torch.as_tensor([idb['width'] for idb in database])
        heights = torch.as_tensor([idb['height'] for idb in database])

        # group
        group_ids = torch.zeros(len(database))
        horz = widths >= heights
        vert = ~horz  # widths < heights
        group_ids[horz] = 0
        group_ids[vert] = 1

        print('Done (t={:.2f}s)'.format(time.time() - t))

        return group_ids

    def __len__(self):
        return len(self.database)

    def _load_image(self, path):
        if '.zip@' in path:
            return self.zipreader.imread(path).convert('RGB')
        else:
            return Image.open(path).convert('RGB')

    def _load_json(self, path):
        if '.zip@' in path:
            f = self.zipreader.read(path)
            return json.loads(f.decode())
        else:
            with open(path, 'r') as f:
                return json.load(f)
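
# Illustrative sketch of the shared-budget truncation in
# Multi30kDataset2018.__getitem__ above: text token ids and boxes compete for
# seq_len slots, the longer list is trimmed first, and the closing [SEP] id is
# always kept. The helper function and example values below are hypothetical.
def truncate_text_and_boxes(text, boxes, seq_len):
    text_len_keep, box_len_keep = len(text), len(boxes)
    while (text_len_keep + box_len_keep) > seq_len and text_len_keep > 0 and box_len_keep > 0:
        if box_len_keep > text_len_keep:
            box_len_keep -= 1
        else:
            text_len_keep -= 1
    text_len_keep = max(text_len_keep, 2)   # never drop below [CLS] + [SEP]
    box_len_keep = max(box_len_keep, 1)     # always keep at least one box
    return text[:text_len_keep - 1] + [text[-1]], boxes[:box_len_keep]

# e.g. 60 text ids + 20 boxes with seq_len=64 -> 44 text ids (the last one being
# the original closing [SEP]) + 20 boxes
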
Example #4
class RefCOCO(Dataset):
    def __init__(self,
                 image_set,
                 root_path,
                 data_path,
                 boxes='gt',
                 proposal_source='official',
                 transform=None,
                 test_mode=False,
                 zip_mode=False,
                 cache_mode=False,
                 cache_db=False,
                 ignore_db_cache=True,
                 tokenizer=None,
                 pretrained_model_name=None,
                 add_image_as_a_box=False,
                 mask_size=(14, 14),
                 aspect_grouping=False,
                 **kwargs):
        """
        RefCOCO+ Dataset

        :param image_set: image folder name
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to dataset
        :param boxes: boxes to use, 'gt' or 'proposal'
        :param transform: transform
        :param test_mode: test mode means no labels available
        :param zip_mode: reading images and metadata in zip archive
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param ignore_db_cache: ignore previous cached database, reload it from annotation file
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add whole image as a box
        :param mask_size: size of instance mask of each object
        :param aspect_grouping: whether to group images via their aspect
        :param kwargs:
        """
        super(RefCOCO, self).__init__()

        assert not cache_mode, 'currently not support cache mode!'

        categories = [
            '__background__', 'person', 'bicycle', 'car', 'motorcycle',
            'airplane', 'bus', 'train', 'truck', 'boat', 'trafficlight',
            'firehydrant', 'stopsign', 'parkingmeter', 'bench', 'bird', 'cat',
            'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
            'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase',
            'frisbee', 'skis', 'snowboard', 'sportsball', 'kite',
            'baseballbat', 'baseballglove', 'skateboard', 'surfboard',
            'tennisracket', 'bottle', 'wineglass', 'cup', 'fork', 'knife',
            'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
            'broccoli', 'carrot', 'hotdog', 'pizza', 'donut', 'cake', 'chair',
            'couch', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tv',
            'laptop', 'mouse', 'remote', 'keyboard', 'cellphone', 'microwave',
            'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
            'scissors', 'teddybear', 'hairdrier', 'toothbrush'
        ]

        coco_annot_files = {
            "train2014": "annotations/instances_train2014.json",
            "val2014": "annotations/instances_val2014.json",
            "test2015": "annotations/image_info_test2015.json",
        }
        proposal_dets = 'refcoco+/proposal/res101_coco_minus_refer_notime_dets.json'
        proposal_masks = 'refcoco+/proposal/res101_coco_minus_refer_notime_masks.json'
        self.vg_proposal = ("vgbua_res101_precomputed",
                            "trainval2014_resnet101_faster_rcnn_genome")
        self.proposal_source = proposal_source
        self.boxes = boxes
        self.test_mode = test_mode
        self.category_to_idx = {c: i for i, c in enumerate(categories)}
        self.data_path = data_path
        self.root_path = root_path
        self.transform = transform
        self.image_sets = [iset.strip() for iset in image_set.split('+')]
        self.coco = COCO(annotation_file=os.path.join(
            data_path, coco_annot_files['train2014']))
        self.refer = REFER(data_path, dataset='refcoco+', splitBy='unc')
        self.refer_ids = []
        for iset in self.image_sets:
            self.refer_ids.extend(self.refer.getRefIds(split=iset))
        self.refs = self.refer.loadRefs(ref_ids=self.refer_ids)
        if 'proposal' in boxes:
            with open(os.path.join(data_path, proposal_dets), 'r') as f:
                proposal_list = json.load(f)
            self.proposals = {}
            for proposal in proposal_list:
                image_id = proposal['image_id']
                if image_id in self.proposals:
                    self.proposals[image_id].append(proposal['box'])
                else:
                    self.proposals[image_id] = [proposal['box']]
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        self.mask_size = mask_size
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        if zip_mode:
            self.zipreader = ZipReader()

        self.database = self.load_annotations()
        if self.aspect_grouping:
            self.group_ids = self.group_aspect(self.database)

    @property
    def data_names(self):
        if self.test_mode:
            return ['image', 'boxes', 'im_info', 'expression']
        else:
            return ['image', 'boxes', 'im_info', 'expression', 'label']

    def __getitem__(self, index):
        idb = self.database[index]

        # image related
        img_id = idb['image_id']
        image = self._load_image(idb['image_fn'])
        im_info = torch.as_tensor([idb['width'], idb['height'], 1.0, 1.0])
        if not self.test_mode:
            gt_box = torch.as_tensor(idb['gt_box'])
        flipped = False
        if self.boxes == 'gt':
            ann_ids = self.coco.getAnnIds(imgIds=img_id)
            anns = self.coco.loadAnns(ann_ids)
            boxes = []
            for ann in anns:
                x_, y_, w_, h_ = ann['bbox']
                boxes.append([x_, y_, x_ + w_, y_ + h_])
            boxes = torch.as_tensor(boxes)
        elif self.boxes == 'proposal':
            if self.proposal_source == 'official':
                boxes = torch.as_tensor(self.proposals[img_id])
                boxes[:, [2, 3]] += boxes[:, [0, 1]]
            elif self.proposal_source == 'vg':
                box_file = os.path.join(
                    self.data_path, self.vg_proposal[0],
                    '{0}.zip@/{0}'.format(self.vg_proposal[1]))
                boxes_fn = os.path.join(box_file,
                                        '{}.json'.format(idb['image_id']))
                boxes_data = self._load_json(boxes_fn)
                boxes = torch.as_tensor(
                    np.frombuffer(self.b64_decode(boxes_data['boxes']),
                                  dtype=np.float32).reshape(
                                      (boxes_data['num_boxes'], -1)))
            else:
                raise NotImplementedError
        elif self.boxes == 'proposal+gt' or self.boxes == 'gt+proposal':
            if self.proposal_source == 'official':
                boxes = torch.as_tensor(self.proposals[img_id])
                boxes[:, [2, 3]] += boxes[:, [0, 1]]
            elif self.proposal_source == 'vg':
                box_file = os.path.join(
                    self.data_path, self.vg_proposal[0],
                    '{0}.zip@/{0}'.format(self.vg_proposal[1]))
                boxes_fn = os.path.join(box_file,
                                        '{}.json'.format(idb['image_id']))
                boxes_data = self._load_json(boxes_fn)
                boxes = torch.as_tensor(
                    np.frombuffer(self.b64_decode(boxes_data['boxes']),
                                  dtype=np.float32).reshape(
                                      (boxes_data['num_boxes'], -1)))
            ann_ids = self.coco.getAnnIds(imgIds=img_id)
            anns = self.coco.loadAnns(ann_ids)
            gt_boxes = []
            for ann in anns:
                x_, y_, w_, h_ = ann['bbox']
                gt_boxes.append([x_, y_, x_ + w_, y_ + h_])
            gt_boxes = torch.as_tensor(gt_boxes)
            boxes = torch.cat((boxes, gt_boxes), 0)
        else:
            raise NotImplementedError

        if self.add_image_as_a_box:
            w0, h0 = im_info[0], im_info[1]
            image_box = torch.as_tensor([[0.0, 0.0, w0 - 1, h0 - 1]])
            boxes = torch.cat((image_box, boxes), dim=0)

        if self.transform is not None:
            if not self.test_mode:
                boxes = torch.cat((gt_box[None], boxes), 0)
            image, boxes, _, im_info, flipped = self.transform(
                image, boxes, None, im_info, flipped)
            if not self.test_mode:
                gt_box = boxes[0]
                boxes = boxes[1:]

        # clamp boxes
        w = im_info[0].item()
        h = im_info[1].item()
        boxes[:, [0, 2]] = boxes[:, [0, 2]].clamp(min=0, max=w - 1)
        boxes[:, [1, 3]] = boxes[:, [1, 3]].clamp(min=0, max=h - 1)
        if not self.test_mode:
            gt_box[[0, 2]] = gt_box[[0, 2]].clamp(min=0, max=w - 1)
            gt_box[[1, 3]] = gt_box[[1, 3]].clamp(min=0, max=h - 1)

        # assign label to each box by its IoU with gt_box
        if not self.test_mode:
            boxes_ious = bbox_iou_py_vectorized(boxes, gt_box[None]).view(-1)
            label = (boxes_ious > 0.5).float()
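            # e.g. a proposal overlapping the referred object with IoU 0.7 becomes
            # a positive (1.0); one with IoU 0.3 becomes a negative (0.0)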

        # expression
        exp_tokens = idb['tokens']
        exp_retokens = self.tokenizer.tokenize(' '.join(exp_tokens))
        if flipped:
            exp_retokens = self.flip_tokens(exp_retokens, verbose=True)
        exp_ids = self.tokenizer.convert_tokens_to_ids(exp_retokens)

        if self.test_mode:
            return image, boxes, im_info, exp_ids
        else:
            return image, boxes, im_info, exp_ids, label

    @staticmethod
    def flip_tokens(tokens, verbose=True):
        changed = False
        tokens_new = [tok for tok in tokens]
        for i, tok in enumerate(tokens):
            if tok == 'left':
                tokens_new[i] = 'right'
                changed = True
            elif tok == 'right':
                tokens_new[i] = 'left'
                changed = True
        if verbose and changed:
            logging.info('[Tokens Flip] {} -> {}'.format(tokens, tokens_new))
        return tokens_new

    @staticmethod
    def b64_decode(string):
        return base64.decodebytes(string.encode())

    def load_annotations(self):
        tic = time.time()
        database = []
        db_cache_name = 'refcoco+_boxes_{}_{}'.format(
            self.boxes, '+'.join(self.image_sets))
        if self.zip_mode:
            db_cache_name = db_cache_name + '_zipmode'
        if self.test_mode:
            db_cache_name = db_cache_name + '_testmode'
        db_cache_root = os.path.join(self.root_path, 'cache')
        db_cache_path = os.path.join(db_cache_root,
                                     '{}.pkl'.format(db_cache_name))

        if os.path.exists(db_cache_path):
            if not self.ignore_db_cache:
                # reading cached database
                print('cached database found in {}.'.format(db_cache_path))
                with open(db_cache_path, 'rb') as f:
                    print('loading cached database from {}...'.format(
                        db_cache_path))
                    tic = time.time()
                    database = cPickle.load(f)
                    print('Done (t={:.2f}s)'.format(time.time() - tic))
                    return database
            else:
                print('cached database ignored.')

        # cache ignored or not found; reload the database from the annotation file
        print('loading database of split {}...'.format('+'.join(
            self.image_sets)))
        tic = time.time()

        for ref_id, ref in zip(self.refer_ids, self.refs):
            iset = 'train2014'
            if not self.test_mode:
                gt_x, gt_y, gt_w, gt_h = self.refer.getRefBox(ref_id=ref_id)
            if self.zip_mode:
                image_fn = os.path.join(
                    self.data_path, iset + '.zip@/' + iset,
                    'COCO_{}_{:012d}.jpg'.format(iset, ref['image_id']))
            else:
                image_fn = os.path.join(
                    self.data_path, iset,
                    'COCO_{}_{:012d}.jpg'.format(iset, ref['image_id']))
            for sent in ref['sentences']:
                idb = {
                    'sent_id': sent['sent_id'],
                    'ann_id': ref['ann_id'],
                    'ref_id': ref['ref_id'],
                    'image_id': ref['image_id'],
                    'image_fn': image_fn,
                    'width': self.coco.imgs[ref['image_id']]['width'],
                    'height': self.coco.imgs[ref['image_id']]['height'],
                    'raw': sent['raw'],
                    'sent': sent['sent'],
                    'tokens': sent['tokens'],
                    'category_id': ref['category_id'],
                    'gt_box': [gt_x, gt_y, gt_x + gt_w, gt_y + gt_h] if not self.test_mode else None
                }
                database.append(idb)

        print('Done (t={:.2f}s)'.format(time.time() - tic))

        # cache database via cPickle
        if self.cache_db:
            print('caching database to {}...'.format(db_cache_path))
            tic = time.time()
            if not os.path.exists(db_cache_root):
                makedirsExist(db_cache_root)
            with open(db_cache_path, 'wb') as f:
                cPickle.dump(database, f)
            print('Done (t={:.2f}s)'.format(time.time() - tic))

        return database

    @staticmethod
    def group_aspect(database):
        print('grouping aspect...')
        t = time.time()

        # get shape of all images
        widths = torch.as_tensor([idb['width'] for idb in database])
        heights = torch.as_tensor([idb['height'] for idb in database])

        # group
        group_ids = torch.zeros(len(database))
        horz = widths >= heights
        vert = ~horz
        group_ids[horz] = 0
        group_ids[vert] = 1

        print('Done (t={:.2f}s)'.format(time.time() - t))

        return group_ids

    def __len__(self):
        return len(self.database)

    def _load_image(self, path):
        if '.zip@' in path:
            return self.zipreader.imread(path).convert('RGB')
        else:
            return Image.open(path).convert('RGB')

    def _load_json(self, path):
        if '.zip@' in path:
            f = self.zipreader.read(path)
            return json.loads(f.decode())
        else:
            with open(path, 'r') as f:
                return json.load(f)
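
The label assignment in __getitem__ above relies on bbox_iou_py_vectorized, which is defined elsewhere in the repository. As an illustration only (not the repository's implementation), a minimal sketch of an equivalent pairwise IoU for (x1, y1, x2, y2) boxes could look like this; the "+ 1" terms assume the same inclusive-pixel convention as the "w0 - 1" image box used above:

import torch

def bbox_iou_xyxy(boxes_a, boxes_b):
    """IoU between every box in boxes_a (N, 4) and every box in boxes_b (M, 4), xyxy format."""
    tl = torch.max(boxes_a[:, None, :2], boxes_b[None, :, :2])  # intersection top-left, (N, M, 2)
    br = torch.min(boxes_a[:, None, 2:], boxes_b[None, :, 2:])  # intersection bottom-right, (N, M, 2)
    wh = (br - tl + 1).clamp(min=0)                             # intersection width/height, inclusive pixels
    inter = wh[..., 0] * wh[..., 1]                             # intersection area, (N, M)
    area_a = ((boxes_a[:, 2] - boxes_a[:, 0] + 1) *
              (boxes_a[:, 3] - boxes_a[:, 1] + 1))[:, None]
    area_b = ((boxes_b[:, 2] - boxes_b[:, 0] + 1) *
              (boxes_b[:, 3] - boxes_b[:, 1] + 1))[None, :]
    return inter / (area_a + area_b - inter)
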
Example #5
class ParallelTextDataset(Dataset):
    def __init__(self,
                 ann_file,
                 image_set,
                 root_path,
                 data_path,
                 seq_len=64,
                 with_precomputed_visual_feat=False,
                 mask_raw_pixels=True,
                 with_rel_task=True,
                 with_mlm_task=True,
                 with_mvrc_task=True,
                 transform=None,
                 test_mode=False,
                 zip_mode=False,
                 cache_mode=False,
                 cache_db=False,
                 ignore_db_cache=True,
                 tokenizer=None,
                 pretrained_model_name=None,
                 add_image_as_a_box=False,
                 aspect_grouping=False,
                 **kwargs):
        """
        Parallel Text Dataset (Multi30k-style caption pairs)

        :param ann_file: annotation jsonl file (unused; resolved from data_path and image_set)
        :param image_set: data split, one of 'train', 'val' or 'test'
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to the dataset
        :param transform: transform
        :param test_mode: test mode means no labels available
        :param zip_mode: reading images and metadata in zip archive
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param ignore_db_cache: ignore previous cached database, reload it from annotation file
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add whole image as a box
        :param aspect_grouping: whether to group images via their aspect
        :param kwargs:
        """
        super(ParallelTextDataset, self).__init__()

        assert not cache_mode, 'currently not support cache mode!'
        assert not test_mode

        annot = {
            'train': 'train.json',
            'val': 'test.json',
            'test': 'test.json'
        }

        self.seq_len = seq_len
        self.with_rel_task = with_rel_task
        self.with_mlm_task = with_mlm_task
        self.with_mvrc_task = with_mvrc_task
        self.data_path = data_path
        self.root_path = root_path
        self.ann_file = os.path.join(data_path, annot[image_set])
        self.with_precomputed_visual_feat = with_precomputed_visual_feat
        self.mask_raw_pixels = mask_raw_pixels
        self.image_set = image_set
        self.transform = transform
        self.test_mode = test_mode
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        self.zipreader = ZipReader()

        # FM: Customise for multi30k dataset
        self.database = list(jsonlines.open(self.ann_file))

        if self.aspect_grouping:
            assert False, "not support aspect grouping currently!"
            self.group_ids = self.group_aspect(self.database)

        print('mask_raw_pixels: ', self.mask_raw_pixels)

    @property
    def data_names(self):
        return ['text', 'relationship_label', 'mlm_labels']

    def __getitem__(self, index):
        idb = self.database[index]

        # Task #1: Caption-Image Relationship Prediction
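        # with probability 0.5 keep this item's own caption pair (label 1); otherwise
        # sample captions from a different item as a mismatched pair (label 0)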
        _p = random.random()
        if _p < 0.5 or (not self.with_rel_task):
            relationship_label = 1
            caption_en = idb['caption_en']
            caption_de = idb['caption_de']
        else:
            relationship_label = 0
            rand_index = random.randrange(0, len(self.database))
            while rand_index == index:
                rand_index = random.randrange(0, len(self.database))
            caption_en = self.database[rand_index]['caption_en']
            caption_de = self.database[rand_index]['caption_de']

        # Task #2: Masked Language Modeling - Adapted for two languages

        if self.with_mlm_task:
            # FM: removing joining of caption - split into two languages
            caption_tokens_en = self.tokenizer.basic_tokenizer.tokenize(
                caption_en)
            caption_tokens_en, mlm_labels_en = self.random_word_wwm(
                caption_tokens_en)
            caption_tokens_de = self.tokenizer.basic_tokenizer.tokenize(
                caption_de)
            caption_tokens_de, mlm_labels_de = self.random_word_wwm(
                caption_tokens_de)
        else:
            caption_tokens_en = self.tokenizer.tokenize(caption_en)
            caption_tokens_de = self.tokenizer.tokenize(caption_de)
            mlm_labels_en = [-1] * len(caption_tokens_en)
            mlm_labels_de = [-1] * len(caption_tokens_de)

        text_tokens = (['[CLS]'] + caption_tokens_en + ['[SEP]']
                       + caption_tokens_de + ['[SEP]'])
        mlm_labels = [-1] + mlm_labels_en + [-1] + mlm_labels_de + [-1]

        # convert tokens to ids
        text = self.tokenizer.convert_tokens_to_ids(text_tokens)

        # truncate to seq_len, always keeping the final [SEP] token
        if len(text) > self.seq_len:
            text_len_keep = max(self.seq_len, 2)
            text = text[:(text_len_keep - 1)] + [text[-1]]

        return text, relationship_label, mlm_labels

    # def random_word(self, tokens):
    #     output_label = []
    #
    #     for i, token in enumerate(tokens):
    #         prob = random.random()
    #         # mask token with 15% probability
    #         if prob < 0.15:
    #             prob /= 0.15
    #
    #             # 80% randomly change token to mask token
    #             if prob < 0.8:
    #                 tokens[i] = "[MASK]"
    #
    #             # 10% randomly change token to random token
    #             elif prob < 0.9:
    #                 tokens[i] = random.choice(list(self.tokenizer.vocab.items()))[0]
    #
    #             # -> rest 10% randomly keep current token
    #
    #             # append current token to output (we will predict these later)
    #             try:
    #                 output_label.append(self.tokenizer.vocab[token])
    #             except KeyError:
    #                 # For unknown words (should not occur with BPE vocab)
    #                 output_label.append(self.tokenizer.vocab["[UNK]"])
    #                 logging.warning("Cannot find token '{}' in vocab. Using [UNK] insetad".format(token))
    #         else:
    #             # no masking token (will be ignored by loss function later)
    #             output_label.append(-1)
    #
    #     # if no word masked, random choose a word to mask
    #     if self.force_mask:
    #         if all([l_ == -1 for l_ in output_label]):
    #             choosed = random.randrange(0, len(output_label))
    #             output_label[choosed] = self.tokenizer.vocab[tokens[choosed]]
    #
    #     return tokens, output_label

    def random_word_wwm(self, tokens):
        output_tokens = []
        output_label = []

        for i, token in enumerate(tokens):
            sub_tokens = self.tokenizer.wordpiece_tokenizer.tokenize(token)
            prob = random.random()
            # mask token with 15% probability
            if prob < 0.15:
                prob /= 0.15

                # 80% randomly change token to mask token
                if prob < 0.8:
                    for sub_token in sub_tokens:
                        output_tokens.append("[MASK]")
                # 10% randomly change token to random token
                elif prob < 0.9:
                    for sub_token in sub_tokens:
                        output_tokens.append(
                            random.choice(list(self.tokenizer.vocab.keys())))
                # -> rest 10%: randomly keep current token
                else:
                    for sub_token in sub_tokens:
                        output_tokens.append(sub_token)

                # append the label of each sub_token to output (we will predict these later)
                for sub_token in sub_tokens:
                    try:
                        output_label.append(self.tokenizer.vocab[sub_token])
                    except KeyError:
                        # For unknown words (should not occur with BPE vocab)
                        output_label.append(self.tokenizer.vocab["[UNK]"])
                        logging.warning(
                            "Cannot find sub_token '{}' in vocab. Using [UNK] insetad"
                            .format(sub_token))
            else:
                for sub_token in sub_tokens:
                    # no masking token (will be ignored by loss function later)
                    output_tokens.append(sub_token)
                    output_label.append(-1)

        ## if no word masked, random choose a word to mask
        # if all([l_ == -1 for l_ in output_label]):
        #    choosed = random.randrange(0, len(output_label))
        #    output_label[choosed] = self.tokenizer.vocab[tokens[choosed]]

        return output_tokens, output_label

    def random_mask_region(self, regions_cls_scores):
        num_regions, num_classes = regions_cls_scores.shape
        output_op = []
        output_label = []
        for k, cls_scores in enumerate(regions_cls_scores):
            prob = random.random()
            # mask region with 15% probability
            if prob < 0.15:
                prob /= 0.15

                if prob < 0.9:
                    # 90% randomly replace appearance feature by "MASK"
                    output_op.append(1)
                else:
                    # -> rest 10% randomly keep current appearance feature
                    output_op.append(0)

                # append class of region to output (we will predict these later)
                output_label.append(cls_scores)
            else:
                # no masking region (will be ignored by loss function later)
                output_op.append(0)
                output_label.append(np.zeros_like(cls_scores))

        # # if no region masked, random choose a region to mask
        # if all([op == 0 for op in output_op]):
        #     choosed = random.randrange(0, len(output_op))
        #     output_op[choosed] = 1
        #     output_label[choosed] = regions_cls_scores[choosed]

        return output_op, output_label

    @staticmethod
    def b64_decode(string):
        return base64.decodebytes(string.encode())

    @staticmethod
    def group_aspect(database):
        print('grouping aspect...')
        t = time.time()

        # get shape of all images
        widths = torch.as_tensor([idb['width'] for idb in database])
        heights = torch.as_tensor([idb['height'] for idb in database])

        # group
        group_ids = torch.zeros(len(database))
        horz = widths >= heights
        vert = ~horz
        group_ids[horz] = 0
        group_ids[vert] = 1

        print('Done (t={:.2f}s)'.format(time.time() - t))

        return group_ids

    def __len__(self):
        return len(self.database)

    def _load_image(self, path):
        if '.zip@' in path:
            return self.zipreader.imread(path).convert('RGB')
        else:
            return Image.open(path).convert('RGB')

    def _load_json(self, path):
        if '.zip@' in path:
            f = self.zipreader.read(path)
            return json.loads(f.decode())
        else:
            with open(path, 'r') as f:
                return json.load(f)
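
A minimal usage sketch (the paths below are hypothetical; data_path is assumed to contain the Multi30k-style train.json/test.json files with 'caption_en'/'caption_de' fields read by the class above):

dataset = ParallelTextDataset(ann_file=None,          # unused; resolved from data_path and image_set
                              image_set='train',
                              root_path='./data',
                              data_path='./data/multi30k')
text_ids, relationship_label, mlm_labels = dataset[0]
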
Example #6
class COCOCaptionsDataset(Dataset):
    def __init__(self,
                 ann_file,
                 image_set,
                 root_path,
                 data_path,
                 seq_len=64,
                 with_precomputed_visual_feat=False,
                 mask_raw_pixels=True,
                 with_rel_task=True,
                 with_mlm_task=True,
                 with_mvrc_task=True,
                 transform=None,
                 test_mode=False,
                 zip_mode=False,
                 cache_mode=False,
                 cache_db=False,
                 ignore_db_cache=True,
                 tokenizer=None,
                 pretrained_model_name=None,
                 add_image_as_a_box=False,
                 aspect_grouping=False,
                 **kwargs):
        """
        COCO Captions Dataset

        :param ann_file: annotation json file (unused; resolved from data_path and image_set)
        :param image_set: data split, 'train' or 'val'
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to the COCO 2017 dataset
        :param transform: transform
        :param test_mode: test mode means no labels available
        :param zip_mode: reading images and metadata in zip archive
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param ignore_db_cache: ignore previous cached database, reload it from annotation file
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add whole image as a box
        :param aspect_grouping: whether to group images via their aspect
        :param kwargs:
        """
        super(COCOCaptionsDataset, self).__init__()

        assert not cache_mode, 'currently not support cache mode!'
        assert not test_mode

        annot = {
            'train': 'annotations/captions_train2017.json',
            'val': 'annotations/captions_val2017.json'
        }
        annot_inst = {
            'train': 'annotations/instances_train2017.json',
            'val': 'annotations/instances_val2017.json'
        }
        if zip_mode:
            self.root = os.path.join(data_path,
                                     '{0}2017.zip@/{0}2017'.format(image_set))
        else:
            self.root = os.path.join(data_path, '{}2017'.format(image_set))

        self.seq_len = seq_len
        self.with_rel_task = with_rel_task
        self.with_mlm_task = with_mlm_task
        self.with_mvrc_task = with_mvrc_task
        self.data_path = data_path
        self.root_path = root_path
        self.ann_file = os.path.join(data_path, annot[image_set])
        self.ann_file_inst = os.path.join(data_path, annot_inst[image_set])
        self.with_precomputed_visual_feat = with_precomputed_visual_feat
        self.mask_raw_pixels = mask_raw_pixels
        self.image_set = image_set
        self.transform = transform
        self.test_mode = test_mode
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        if self.zip_mode:
            self.zipreader = ZipReader()

        self.coco = COCO(self.ann_file)
        self.coco_inst = COCO(self.ann_file_inst)
        self.ids = list(sorted(self.coco.imgs.keys()))
        # filter images without detection annotations
        self.ids = [
            img_id for img_id in self.ids
            if len(self.coco_inst.getAnnIds(imgIds=img_id, iscrowd=None)) > 0
        ]

        self.json_category_id_to_contiguous_id = {
            v: i + 1
            for i, v in enumerate(self.coco_inst.getCatIds())
        }
        self.contiguous_category_id_to_json_id = {
            v: k
            for k, v in self.json_category_id_to_contiguous_id.items()
        }
        self.id_to_img_map = {k: v for k, v in enumerate(self.ids)}

        if self.aspect_grouping:
            assert False, "not support aspect grouping currently!"
            # self.group_ids = self.group_aspect(self.database)

        print('mask_raw_pixels: ', self.mask_raw_pixels)

    @property
    def data_names(self):
        return [
            'image', 'boxes', 'im_info', 'text', 'relationship_label',
            'mlm_labels', 'mvrc_ops', 'mvrc_labels'
        ]

    def __getitem__(self, index):
        img_id = self.ids[index]

        # image data
        # frcnn_data = self._load_json(os.path.join(self.data_path, idb['frcnn']))
        # boxes = np.frombuffer(self.b64_decode(frcnn_data['boxes']),
        #                       dtype=np.float32).reshape((frcnn_data['num_boxes'], -1))
        # boxes_cls_scores = np.frombuffer(self.b64_decode(frcnn_data['classes']),
        #                                  dtype=np.float32).reshape((frcnn_data['num_boxes'], -1))
        # boxes_max_conf = boxes_cls_scores.max(axis=1)
        # inds = np.argsort(boxes_max_conf)[::-1]
        # boxes = boxes[inds]
        # boxes_cls_scores = boxes_cls_scores[inds]
        # boxes = torch.as_tensor(boxes)
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        anns = self.coco.loadAnns(ann_ids)
        ann_ids_inst = self.coco_inst.getAnnIds(imgIds=img_id)
        anns_inst = self.coco_inst.loadAnns(ann_ids_inst)
        idb = anns[0]
        boxes = [ann_['bbox'] for ann_ in anns_inst]
        boxes = torch.as_tensor(boxes).reshape(-1, 4)
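        # COCO annotations store boxes as (x, y, w, h); convert them to inclusive
        # (x1, y1, x2, y2) pixel coordinates, hence the "- 1" (TO_REMOVE) below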
        TO_REMOVE = 1
        xmin, ymin, w, h = boxes.split(1, dim=-1)
        xmax = xmin + (w - TO_REMOVE).clamp(min=0)
        ymax = ymin + (h - TO_REMOVE).clamp(min=0)
        boxes = torch.cat((xmin, ymin, xmax, ymax), dim=-1)
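        # build one-hot class scores per box (81 = 80 COCO categories + background)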
        boxes_cls_scores = boxes.new_zeros((boxes.shape[0], 81))
        classes = [ann["category_id"] for ann in anns_inst]
        classes = [self.json_category_id_to_contiguous_id[c] for c in classes]
        for i, class_ in enumerate(classes):
            boxes_cls_scores[i, class_] = 1.0

        if self.with_precomputed_visual_feat:
            assert False
            # image = None
            # w0, h0 = frcnn_data['image_w'], frcnn_data['image_h']
            # boxes_features = np.frombuffer(self.b64_decode(frcnn_data['features']),
            #                                dtype=np.float32).reshape((frcnn_data['num_boxes'], -1))
            # boxes_features = boxes_features[inds]
            # boxes_features = torch.as_tensor(boxes_features)
        else:
            path = self.coco_inst.loadImgs(img_id)[0]['file_name']
            image = self._load_image(os.path.join(self.root, path))
            w0, h0 = image.size

        if self.add_image_as_a_box:
            image_box = torch.as_tensor([[0.0, 0.0, w0 - 1.0, h0 - 1.0]])
            boxes = torch.cat((image_box, boxes), dim=0)
            if self.with_precomputed_visual_feat:
                assert False
                # image_box_feat = boxes_features.mean(dim=0, keepdim=True)
                # boxes_features = torch.cat((image_box_feat, boxes_features), dim=0)

        # transform
        im_info = torch.tensor([w0, h0, 1.0, 1.0, index])
        if self.transform is not None:
            image, boxes, _, im_info = self.transform(image, boxes, None,
                                                      im_info)

        if image is None and (not self.with_precomputed_visual_feat):
            assert False
            # w = int(im_info[0].item())
            # h = int(im_info[1].item())
            # image = im_info.new_zeros((3, h, w), dtype=torch.float)

        # clamp boxes
        w = im_info[0].item()
        h = im_info[1].item()
        boxes[:, [0, 2]] = boxes[:, [0, 2]].clamp(min=0, max=w - 1)
        boxes[:, [1, 3]] = boxes[:, [1, 3]].clamp(min=0, max=h - 1)

        # Task #1: Caption-Image Relationship Prediction
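        # only the positive branch is reachable here: the mismatched-caption branch
        # is disabled with assert False (self.database is not defined for this class)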
        _p = random.random()
        if _p < 0.5 or (not self.with_rel_task):
            relationship_label = 1
            caption = idb['caption']
        else:
            assert False
            relationship_label = 0
            rand_index = random.randrange(0, len(self.database))
            while rand_index == index:
                rand_index = random.randrange(0, len(self.database))
            caption = self.database[rand_index]['caption']

        assert isinstance(caption, str)

        # Task #2: Masked Language Modeling
        if self.with_mlm_task:
            caption_tokens = self.tokenizer.basic_tokenizer.tokenize(caption)
            caption_tokens, mlm_labels = self.random_word_wwm(caption_tokens)
        else:
            caption_tokens = self.tokenizer.tokenize(caption)
            mlm_labels = [-1] * len(caption_tokens)
        text_tokens = ['[CLS]'] + caption_tokens + ['[SEP]']
        mlm_labels = [-1] + mlm_labels + [-1]

        # Task #3: Masked Visual Region Classification
        if self.with_mvrc_task:
            if self.add_image_as_a_box:
                mvrc_ops, mvrc_labels = self.random_mask_region(
                    boxes_cls_scores)
                mvrc_ops = [0] + mvrc_ops
                mvrc_labels = [np.zeros_like(boxes_cls_scores[0])] + mvrc_labels
                num_real_boxes = boxes.shape[0] - 1
                num_masked_boxes = 0
                if self.with_precomputed_visual_feat:
                    assert False
                    # boxes_features[0] *= num_real_boxes
                    # for mvrc_op, box_feat in zip(mvrc_ops, boxes_features):
                    #     if mvrc_op == 1:
                    #         num_masked_boxes += 1
                    #         boxes_features[0] -= box_feat
                    # boxes_features[0] /= (num_real_boxes - num_masked_boxes + 1e-5)
            else:
                mvrc_ops, mvrc_labels = self.random_mask_region(
                    boxes_cls_scores)
            assert len(mvrc_ops) == boxes.shape[0], \
                "Error: mvrc_ops have length {}, expected {}!".format(len(mvrc_ops), boxes.shape[0])
            assert len(mvrc_labels) == boxes.shape[0], \
                "Error: mvrc_labels have length {}, expected {}!".format(len(mvrc_labels), boxes.shape[0])
        else:
            mvrc_ops = [0] * boxes.shape[0]
            mvrc_labels = [np.zeros_like(boxes_cls_scores[0])] * boxes.shape[0]

        # zero out pixels of masked RoI
        if (not self.with_precomputed_visual_feat) and self.mask_raw_pixels:
            for mvrc_op, box in zip(mvrc_ops, boxes):
                if mvrc_op == 1:
                    x1, y1, x2, y2 = box
                    image[:, int(y1):(int(y2) + 1), int(x1):(int(x2) + 1)] = 0

        mvrc_labels = np.stack(mvrc_labels, axis=0)

        text = self.tokenizer.convert_tokens_to_ids(text_tokens)

        if self.with_precomputed_visual_feat:
            assert False
            # boxes = torch.cat((boxes, boxes_features), dim=1)

        # truncate seq to max len
        if len(text) + len(boxes) > self.seq_len:
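            # drop entries from whichever of text/boxes is currently longer until
            # their combined length fits within seq_len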
            text_len_keep = len(text)
            box_len_keep = len(boxes)
            while (text_len_keep + box_len_keep) > self.seq_len:
                if box_len_keep > text_len_keep:
                    box_len_keep -= 1
                else:
                    text_len_keep -= 1
            boxes = boxes[:box_len_keep]
            text = text[:text_len_keep]
            mlm_labels = mlm_labels[:text_len_keep]
            mvrc_ops = mvrc_ops[:box_len_keep]
            mvrc_labels = mvrc_labels[:box_len_keep]

        return image, boxes, im_info, text, relationship_label, mlm_labels, mvrc_ops, mvrc_labels

    # def random_word(self, tokens):
    #     output_label = []
    #
    #     for i, token in enumerate(tokens):
    #         prob = random.random()
    #         # mask token with 15% probability
    #         if prob < 0.15:
    #             prob /= 0.15
    #
    #             # 80% randomly change token to mask token
    #             if prob < 0.8:
    #                 tokens[i] = "[MASK]"
    #
    #             # 10% randomly change token to random token
    #             elif prob < 0.9:
    #                 tokens[i] = random.choice(list(self.tokenizer.vocab.items()))[0]
    #
    #             # -> rest 10% randomly keep current token
    #
    #             # append current token to output (we will predict these later)
    #             try:
    #                 output_label.append(self.tokenizer.vocab[token])
    #             except KeyError:
    #                 # For unknown words (should not occur with BPE vocab)
    #                 output_label.append(self.tokenizer.vocab["[UNK]"])
    #                 logging.warning("Cannot find token '{}' in vocab. Using [UNK] insetad".format(token))
    #         else:
    #             # no masking token (will be ignored by loss function later)
    #             output_label.append(-1)
    #
    #     # if no word masked, random choose a word to mask
    #     if self.force_mask:
    #         if all([l_ == -1 for l_ in output_label]):
    #             choosed = random.randrange(0, len(output_label))
    #             output_label[choosed] = self.tokenizer.vocab[tokens[choosed]]
    #
    #     return tokens, output_label

    def random_word_wwm(self, tokens):
        output_tokens = []
        output_label = []

        for i, token in enumerate(tokens):
            sub_tokens = self.tokenizer.wordpiece_tokenizer.tokenize(token)
            prob = random.random()
            # mask token with 15% probability
            if prob < 0.15:
                prob /= 0.15

                # 80% randomly change token to mask token
                if prob < 0.8:
                    for sub_token in sub_tokens:
                        output_tokens.append("[MASK]")
                # 10% randomly change token to random token
                elif prob < 0.9:
                    for sub_token in sub_tokens:
                        output_tokens.append(
                            random.choice(list(self.tokenizer.vocab.keys())))
                # -> rest 10%: randomly keep current token
                else:
                    for sub_token in sub_tokens:
                        output_tokens.append(sub_token)

                # append the label of each sub_token to output (we will predict these later)
                for sub_token in sub_tokens:
                    try:
                        output_label.append(self.tokenizer.vocab[sub_token])
                    except KeyError:
                        # For unknown words (should not occur with BPE vocab)
                        output_label.append(self.tokenizer.vocab["[UNK]"])
                        logging.warning(
                            "Cannot find sub_token '{}' in vocab. Using [UNK] insetad"
                            .format(sub_token))
            else:
                for sub_token in sub_tokens:
                    # no masking token (will be ignored by loss function later)
                    output_tokens.append(sub_token)
                    output_label.append(-1)

        ## if no word masked, random choose a word to mask
        # if all([l_ == -1 for l_ in output_label]):
        #    choosed = random.randrange(0, len(output_label))
        #    output_label[choosed] = self.tokenizer.vocab[tokens[choosed]]

        return output_tokens, output_label

    def random_mask_region(self, regions_cls_scores):
        num_regions, num_classes = regions_cls_scores.shape
        output_op = []
        output_label = []
        for k, cls_scores in enumerate(regions_cls_scores):
            prob = random.random()
            # mask region with 15% probability
            if prob < 0.15:
                prob /= 0.15

                if prob < 0.9:
                    # 90% randomly replace appearance feature by "MASK"
                    output_op.append(1)
                else:
                    # -> rest 10% randomly keep current appearance feature
                    output_op.append(0)

                # append class of region to output (we will predict these later)
                output_label.append(cls_scores)
            else:
                # no masking region (will be ignored by loss function later)
                output_op.append(0)
                output_label.append(np.zeros_like(cls_scores))

        # # if no region masked, random choose a region to mask
        # if all([op == 0 for op in output_op]):
        #     choosed = random.randrange(0, len(output_op))
        #     output_op[choosed] = 1
        #     output_label[choosed] = regions_cls_scores[choosed]

        return output_op, output_label

    @staticmethod
    def b64_decode(string):
        return base64.decodebytes(string.encode())

    @staticmethod
    def group_aspect(database):
        print('grouping aspect...')
        t = time.time()

        # get shape of all images
        widths = torch.as_tensor([idb['width'] for idb in database])
        heights = torch.as_tensor([idb['height'] for idb in database])

        # group
        group_ids = torch.zeros(len(database))
        horz = widths >= heights
        vert = ~horz
        group_ids[horz] = 0
        group_ids[vert] = 1

        print('Done (t={:.2f}s)'.format(time.time() - t))

        return group_ids

    def __len__(self):
        return len(self.ids)

    def _load_image(self, path):
        if '.zip@' in path:
            return self.zipreader.imread(path).convert('RGB')
        else:
            return Image.open(path).convert('RGB')

    def _load_json(self, path):
        if '.zip@' in path:
            f = self.zipreader.read(path)
            return json.loads(f.decode())
        else:
            with open(path, 'r') as f:
                return json.load(f)
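
A minimal usage sketch (the data_path below is hypothetical and is assumed to hold the standard COCO 2017 layout: train2017/ images plus annotations/captions_*.json and annotations/instances_*.json; in practice a transform that converts the PIL image to a tensor would also be passed):

dataset = COCOCaptionsDataset(ann_file=None,          # unused; resolved from data_path and image_set
                              image_set='train',
                              root_path='./data',
                              data_path='./data/coco')
(image, boxes, im_info, text, relationship_label,
 mlm_labels, mvrc_ops, mvrc_labels) = dataset[0]
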
Example #7
class VRep(Dataset):
    def __init__(self, image_set, root_path, data_path, boxes='gt', proposal_source='official',
                 transform=None, test_mode=False,
                 zip_mode=False, cache_mode=False, cache_db=False, ignore_db_cache=True,
                 tokenizer=None, pretrained_model_name=None,
                 add_image_as_a_box=False, mask_size=(14, 14),
                 aspect_grouping=False, **kwargs):
        """
        VREP Dataset

        :param image_set: image folder name
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to dataset
        :param boxes: boxes to use, 'gt' or 'proposal'
        :param transform: transform
        :param test_mode: test mode means no labels available
        :param zip_mode: reading images and metadata in zip archive
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param ignore_db_cache: ignore previous cached database, reload it from annotation file
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add whole image as a box
        :param mask_size: size of instance mask of each object
        :param aspect_grouping: whether to group images via their aspect
        :param kwargs:
        """
        super(VRep, self).__init__()

        assert not cache_mode, 'currently not support cache mode!'
        self.data_json = 'obj_det_res.json'  # alternative: 'image_seg_test.json'
        self.ref_json = 'ref_annotations.json'
        self.boxes = boxes
        self.refer = Refer()
        self.test_mode = test_mode
        self.data_path = data_path
        self.root_path = root_path
        self.transform = transform
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        self.mask_size = mask_size
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        if zip_mode:
            self.zipreader = ZipReader()

        self.database = self.load_annotations()
        if self.aspect_grouping:
            self.group_ids = self.group_aspect(self.database)

    @property
    def data_names(self):
        if self.test_mode:
            return ['image', 'boxes', 'im_info', 'expression']
        else:
            return ['image', 'boxes', 'im_info', 'expression', 'label']

    def __getitem__(self, index):
        idb = self.database[index]

        # image related
        img_id = idb['image_id']
        image = self._load_image(idb['image_fn'])
        im_info = torch.as_tensor([idb['width'], idb['height'], 1.0, 1.0])
        if not self.test_mode:
            gt_box = torch.as_tensor(idb['gt_box'])
        flipped = False
        bb = self._load_json(os.path.join(self.data_path, 'bb.json'))
        if self.boxes == 'gt':
            boxes = torch.as_tensor(bb[img_id])

        if self.add_image_as_a_box:
            w0, h0 = im_info[0], im_info[1]
            image_box = torch.as_tensor([[0.0, 0.0, w0 - 1, h0 - 1]])
            boxes = torch.cat((image_box, boxes), dim=0)

        if self.transform is not None:
            if not self.test_mode:
                boxes = torch.cat((gt_box[None], boxes), 0)
            image, boxes, _, im_info, flipped = self.transform(image, boxes, None, im_info, flipped)
            if not self.test_mode:
                gt_box = boxes[0]
                boxes = boxes[1:]

        # clamp boxes
        w = im_info[0].item()
        h = im_info[1].item()
        boxes[:, [0, 2]] = boxes[:, [0, 2]].clamp(min=0, max=w - 1)
        boxes[:, [1, 3]] = boxes[:, [1, 3]].clamp(min=0, max=h - 1)
        if not self.test_mode:
            gt_box[[0, 2]] = gt_box[[0, 2]].clamp(min=0, max=w - 1)
            gt_box[[1, 3]] = gt_box[[1, 3]].clamp(min=0, max=h - 1)

        # assign label to each box by its IoU with gt_box
        if not self.test_mode:
            boxes_ious = bbox_iou_py_vectorized(boxes, gt_box[None]).view(-1)
            label = (boxes_ious > 0.5).float()

        # expression
        exp_tokens = idb['tokens']
        exp_retokens = self.tokenizer.tokenize(' '.join(exp_tokens))
        if flipped:
            exp_retokens = self.flip_tokens(exp_retokens, verbose=True)
        exp_ids = self.tokenizer.convert_tokens_to_ids(exp_retokens)

        if self.test_mode:
            return image, boxes, im_info, exp_ids
        else:
            return image, boxes, im_info, exp_ids, label

    @staticmethod
    def flip_tokens(tokens, verbose=True):
        changed = False
        tokens_new = [tok for tok in tokens]
        for i, tok in enumerate(tokens):
            if tok == 'left':
                tokens_new[i] = 'right'
                changed = True
            elif tok == 'right':
                tokens_new[i] = 'left'
                changed = True
        if verbose and changed:
            logging.info('[Tokens Flip] {} -> {}'.format(tokens, tokens_new))
        return tokens_new

    @staticmethod
    def b64_decode(string):
        return base64.decodebytes(string.encode())

    def load_annotations(self):
        tic = time.time()
        database = []
        db_cache_name = 'vrep_boxes'  # optionally suffixed: '_{}_{}'.format(self.boxes, '+'.join(self.image_sets))
        if self.zip_mode:
            db_cache_name = db_cache_name + '_zipmode'
        if self.test_mode:
            db_cache_name = db_cache_name + '_testmode'
        db_cache_root = os.path.join(self.root_path, 'cache')
        db_cache_path = os.path.join(db_cache_root, '{}.pkl'.format(db_cache_name))
        dataset = self._load_json(os.path.join(self.data_path, self.data_json))
        ref = self._load_json(os.path.join(self.data_path, self.ref_json))
        if os.path.exists(db_cache_path):
            if not self.ignore_db_cache:
                # reading cached database
                print('cached database found in {}.'.format(db_cache_path))
                with open(db_cache_path, 'rb') as f:
                    print('loading cached database from {}...'.format(db_cache_path))
                    tic = time.time()
                    database = cPickle.load(f)
                    print('Done (t={:.2f}s)'.format(time.time() - tic))
                    return database
            else:
                print('cached database ignored.')

        # cache ignored or not found; reload the database from the annotation files
        #print('loading database of split {}...'.format('+'.join(self.image_sets)))
        tic = time.time()

        refer_id = 0

        for data_point in dataset['images']:
            iset = 'full_images'
            image_name = data_point['file_name'].split('/')[3]
            for anno in data_point['annotations']:
                if anno['id'] == data_point['ground_truth']:
                    gt_x, gt_y, gt_w, gt_h = anno['bbox']
            if self.zip_mode:
                image_fn = os.path.join(self.data_path, iset + '.zip@/' + iset, image_name)
            else:
                image_fn = os.path.join(self.data_path, iset, image_name)
            for sent in ref[image_name]:
                idb = {
                    #'sent_id': sent['sent_id'],
                    #'ann_id': ref['ann_id'],
                    'ref_id': refer_id,
                    'image_id': image_name,
                    'image_fn': image_fn,
                    'width': 1024,
                    'height': 576,
                    'raw': sent,
                    'sent': sent,
                    'tokens': self.tokenizer.tokenize(sent),
                    #'category_id': ref['category_id'],
                    'gt_box': [gt_x, gt_y, gt_x + gt_w, gt_y + gt_h] if not self.test_mode else None
                }
                self.refer.ref_id_to_box[refer_id] = [image_name, [gt_x, gt_y, gt_w, gt_h], sent]
                database.append(idb)
                refer_id += 1

        with open('./final_refer_testset', 'w') as f:
            json.dump(self.refer.ref_id_to_box, f)

        print('Done (t={:.2f}s)'.format(time.time() - tic))

        # cache database via cPickle
        if self.cache_db:
            print('caching database to {}...'.format(db_cache_path))
            tic = time.time()
            if not os.path.exists(db_cache_root):
                makedirsExist(db_cache_root)
            with open(db_cache_path, 'wb') as f:
                cPickle.dump(database, f)
            print('Done (t={:.2f}s)'.format(time.time() - tic))

        return database

    @staticmethod
    def group_aspect(database):
        print('grouping aspect...')
        t = time.time()

        # get shape of all images
        widths = torch.as_tensor([idb['width'] for idb in database])
        heights = torch.as_tensor([idb['height'] for idb in database])

        # group
        group_ids = torch.zeros(len(database))
        horz = widths >= heights
        vert = ~horz
        group_ids[horz] = 0
        group_ids[vert] = 1

        print('Done (t={:.2f}s)'.format(time.time() - t))

        return group_ids

    def __len__(self):
        return len(self.database)

    def _load_image(self, path):
        if '.zip@' in path:
            return self.zipreader.imread(path).convert('RGB')
        else:
            return Image.open(path).convert('RGB')

    def _load_json(self, path):
        if '.zip@' in path:
            f = self.zipreader.read(path)
            return json.loads(f.decode())
        else:
            with open(path, 'r') as f:
                return json.load(f)
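
A minimal usage sketch (the data_path below is hypothetical; it is assumed to contain obj_det_res.json, ref_annotations.json, bb.json and a full_images/ folder, as read by load_annotations and __getitem__ above):

dataset = VRep(image_set='test',
               root_path='./data',
               data_path='./data/vrep',
               boxes='gt',
               test_mode=True)
image, boxes, im_info, expression = dataset[0]
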
Example #8
class VCRDataset(Dataset):
    def __init__(self, ann_file, image_set, root_path, data_path, transform=None, task='Q2A', test_mode=False,
                 zip_mode=False, cache_mode=False, cache_db=False, ignore_db_cache=True,
                 basic_tokenizer=None, tokenizer=None, pretrained_model_name=None,
                 only_use_relevant_dets=False, add_image_as_a_box=False, mask_size=(14, 14),
                 aspect_grouping=False, basic_align=False, qa2r_noq=False, qa2r_aug=False,
                 seq_len=64,
                 **kwargs):
        """
        Visual Commonsense Reasoning Dataset

        :param ann_file: annotation jsonl file
        :param image_set: image folder name, e.g., 'vcr1images'
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to vcr dataset
        :param transform: transform
        :param task: 'Q2A' means question to answer, 'QA2R' means question and answer to rationale,
                     'Q2AR' means question to answer and rationale
        :param test_mode: test mode means no labels available
        :param zip_mode: reading images and metadata in zip archive
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param ignore_db_cache: ignore previous cached database, reload it from annotation file
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param only_use_relevant_dets: filter out detections not used in query and response
        :param add_image_as_a_box: add whole image as a box
        :param mask_size: size of instance mask of each object
        :param aspect_grouping: whether to group images via their aspect
        :param basic_align: align to tokens retokenized by basic_tokenizer
        :param qa2r_noq: in QA->R, the query contains only the correct answer, without question
        :param qa2r_aug: in QA->R, whether to augment choices to include those with wrong answer in query
        :param kwargs:
        """
        super(VCRDataset, self).__init__()

        assert not cache_mode, 'currently not support cache mode!'
        assert task in ['Q2A', 'QA2R', 'Q2AR'], 'unsupported task: {}'.format(task)
        assert not qa2r_aug, "Not implemented!"

        self.qa2r_noq = qa2r_noq
        self.qa2r_aug = qa2r_aug

        self.seq_len = seq_len

        categories = ['__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
                      'trafficlight', 'firehydrant', 'stopsign', 'parkingmeter', 'bench', 'bird', 'cat', 'dog', 'horse',
                      'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
                      'suitcase', 'frisbee', 'skis', 'snowboard', 'sportsball', 'kite', 'baseballbat', 'baseballglove',
                      'skateboard', 'surfboard', 'tennisracket', 'bottle', 'wineglass', 'cup', 'fork', 'knife', 'spoon',
                      'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hotdog', 'pizza', 'donut',
                      'cake', 'chair', 'couch', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tv', 'laptop', 'mouse',
                      'remote', 'keyboard', 'cellphone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book',
                      'clock', 'vase', 'scissors', 'teddybear', 'hairdrier', 'toothbrush']
        self.category_to_idx = {c: i for i, c in enumerate(categories)}
        self.data_path = data_path
        self.root_path = root_path
        self.ann_file = os.path.join(data_path, ann_file)
        self.image_set = image_set
        self.transform = transform
        self.task = task
        self.test_mode = test_mode
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        self.basic_align = basic_align
        print('Dataset Basic Align: {}'.format(self.basic_align))
        self.cache_dir = os.path.join(root_path, 'cache')
        self.only_use_relevant_dets = only_use_relevant_dets
        self.add_image_as_a_box = add_image_as_a_box
        self.mask_size = mask_size
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.basic_tokenizer = basic_tokenizer if basic_tokenizer is not None \
            else BasicTokenizer(do_lower_case=True)
        if tokenizer is None:
            if pretrained_model_name is None:
                pretrained_model_name = 'bert-base-uncased'
            if 'roberta' in pretrained_model_name:
                tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name, cache_dir=self.cache_dir)
            else:
                tokenizer = BertTokenizer.from_pretrained(pretrained_model_name, cache_dir=self.cache_dir)
        self.tokenizer = tokenizer

        if zip_mode:
            self.zipreader = ZipReader()

        self.database = self.load_annotations(self.ann_file)
        if self.aspect_grouping:
            assert False, "Not support aspect grouping now!"
            self.group_ids = self.group_aspect(self.database)

        self.person_name_id = 0

    def load_annotations(self, ann_file):
        tic = time.time()
        database = []
        db_cache_name = 'vcr_nometa_{}_{}_{}'.format(self.task, self.image_set, os.path.basename(ann_file)[:-len('.jsonl')])
        if self.only_use_relevant_dets:
            db_cache_name = db_cache_name + '_only_relevant_dets'
        if self.zip_mode:
            db_cache_name = db_cache_name + '_zipped'
        db_cache_root = os.path.join(self.root_path, 'cache')
        db_cache_path = os.path.join(db_cache_root, '{}.pkl'.format(db_cache_name))

        if os.path.exists(db_cache_path):
            if not self.ignore_db_cache:
                # reading cached database
                print('cached database found in {}.'.format(db_cache_path))
                with open(db_cache_path, 'rb') as f:
                    print('loading cached database from {}...'.format(db_cache_path))
                    tic = time.time()
                    database = cPickle.load(f)
                    print('Done (t={:.2f}s)'.format(time.time() - tic))
                    return database
            else:
                print('cached database ignored.')

        # cache ignored or not found; reload the database from the annotation file
        print('loading database from {}...'.format(ann_file))
        tic = time.time()

        with jsonlines.open(ann_file) as reader:
            for ann in reader:
                if self.zip_mode:
                    img_fn = os.path.join(self.data_path, self.image_set + '.zip@/' + self.image_set, ann['img_fn'])
                    metadata_fn = os.path.join(self.data_path, self.image_set + '.zip@/' + self.image_set, ann['metadata_fn'])
                else:
                    img_fn = os.path.join(self.data_path, self.image_set, ann['img_fn'])
                    metadata_fn = os.path.join(self.data_path, self.image_set, ann['metadata_fn'])

                db_i = {
                    'annot_id': ann['annot_id'],
                    'objects': ann['objects'],
                    'img_fn': img_fn,
                    'metadata_fn': metadata_fn,
                    'question': ann['question'],
                    'answer_choices': ann['answer_choices'],
                    'answer_label': ann['answer_label'] if not self.test_mode else None,
                    'rationale_choices': ann['rationale_choices'],
                    'rationale_label': ann['rationale_label'] if not self.test_mode else None,
                }
                database.append(db_i)
        print('Done (t={:.2f}s)'.format(time.time() - tic))

        # cache database via cPickle
        if self.cache_db:
            print('caching database to {}...'.format(db_cache_path))
            tic = time.time()
            if not os.path.exists(db_cache_root):
                makedirsExist(db_cache_root)
            with open(db_cache_path, 'wb') as f:
                cPickle.dump(database, f)
            print('Done (t={:.2f}s)'.format(time.time() - tic))

        return database

    @staticmethod
    def group_aspect(database):
        print('grouping aspect...')
        t = time.time()

        # get shape of all images
        widths = torch.as_tensor([idb['width'] for idb in database])
        heights = torch.as_tensor([idb['height'] for idb in database])

        # group
        group_ids = torch.zeros(len(database))
        horz = widths >= heights
        vert = ~horz
        group_ids[horz] = 0
        group_ids[vert] = 1

        print('Done (t={:.2f}s)'.format(time.time() - t))

        return group_ids

    def retokenize_and_convert_to_ids_with_tag(self, tokens, objects_replace_name, non_obj_tag=-1):
        parsed_tokens = []
        tags = []
        align_ids = []
        raw = []
        align_id = 0
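        # parsed_tokens: wordpiece tokens; tags: for each wordpiece, the index of the
        # object it refers to (offset by non_obj_tag + 1), or non_obj_tag for plain
        # words; align_ids: index of the original token each wordpiece came from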
        for mixed_token in tokens:
            if isinstance(mixed_token, list):
                obj_names = [objects_replace_name[o] for o in mixed_token]
                retokenized_tokens = self.tokenizer.tokenize(obj_names[0])
                raw.append(obj_names[0])
                tags.extend([mixed_token[0] + non_obj_tag + 1 for _ in retokenized_tokens])
                align_ids.extend([align_id for _ in retokenized_tokens])
                align_id += 1
                for token, o in zip(obj_names[1:], mixed_token[1:]):
                    retokenized_tokens.append('and')
                    tags.append(non_obj_tag)
                    align_ids.append(align_id)
                    align_id += 1
                    re_tokens = self.tokenizer.tokenize(token)
                    retokenized_tokens.extend(re_tokens)
                    tags.extend([o + non_obj_tag + 1 for _ in re_tokens])
                    align_ids.extend([align_id for _ in re_tokens])
                    align_id += 1
                    raw.extend(['and', token])
                parsed_tokens.extend(retokenized_tokens)
            else:
                if self.basic_align:
                    # basic align
                    basic_tokens = self.basic_tokenizer.tokenize(mixed_token)
                    raw.extend(basic_tokens)
                    for t in basic_tokens:
                        retokenized_tokens = self.tokenizer.tokenize(t)
                        parsed_tokens.extend(retokenized_tokens)
                        align_ids.extend([align_id for _ in retokenized_tokens])
                        tags.extend([non_obj_tag for _ in retokenized_tokens])
                        align_id += 1
                else:
                    # fully align to original tokens
                    raw.append(mixed_token)
                    retokenized_tokens = self.tokenizer.tokenize(mixed_token)
                    parsed_tokens.extend(retokenized_tokens)
                    align_ids.extend([align_id for _ in retokenized_tokens])
                    tags.extend([non_obj_tag for _ in retokenized_tokens])
                    align_id += 1
        ids = self.tokenizer.convert_tokens_to_ids(parsed_tokens)
        ids_with_tag = list(zip(ids, tags, align_ids))
        
        return ids_with_tag, raw

    @staticmethod
    def keep_only_relevant_dets(question, answer_choices, rationale_choices):
        dets_to_use = []
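        # collect every object index referenced in the question/answers/rationales,
        # then remap those in-text references to positions in the reduced list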
        for i, tok in enumerate(question):
            if isinstance(tok, list):
                for j, o in enumerate(tok):
                    if o not in dets_to_use:
                        dets_to_use.append(o)
                    question[i][j] = dets_to_use.index(o)
        if answer_choices is not None:
            for n, answer in enumerate(answer_choices):
                for i, tok in enumerate(answer):
                    if isinstance(tok, list):
                        for j, o in enumerate(tok):
                            if o not in dets_to_use:
                                dets_to_use.append(o)
                            answer_choices[n][i][j] = dets_to_use.index(o)
        if rationale_choices is not None:
            for n, rationale in enumerate(rationale_choices):
                for i, tok in enumerate(rationale):
                    if isinstance(tok, list):
                        for j, o in enumerate(tok):
                            if o not in dets_to_use:
                                dets_to_use.append(o)
                            rationale_choices[n][i][j] = dets_to_use.index(o)

        return dets_to_use, question, answer_choices, rationale_choices

    def __getitem__(self, index):
        # self.person_name_id = 0
        idb = deepcopy(self.database[index])

        metadata = self._load_json(idb['metadata_fn'])
        idb['boxes'] = metadata['boxes']
        idb['segms'] = metadata['segms']
        # idb['width'] = metadata['width']
        # idb['height'] = metadata['height']
        if self.only_use_relevant_dets:
            dets_to_use, idb['question'], idb['answer_choices'], idb['rationale_choices'] = \
                self.keep_only_relevant_dets(idb['question'],
                                             idb['answer_choices'],
                                             idb['rationale_choices'] if not self.task == 'Q2A' else None)
            idb['objects'] = [idb['objects'][i] for i in dets_to_use]
            idb['boxes'] = [idb['boxes'][i] for i in dets_to_use]
            idb['segms'] = [idb['segms'][i] for i in dets_to_use]
        objects_replace_name = []
        for o in idb['objects']:
            if o == 'person':
                objects_replace_name.append(GENDER_NEUTRAL_NAMES[self.person_name_id])
                self.person_name_id = (self.person_name_id + 1) % len(GENDER_NEUTRAL_NAMES)
            else:
                objects_replace_name.append(o)

        non_obj_tag = 0 if self.add_image_as_a_box else -1
        idb['question'] = self.retokenize_and_convert_to_ids_with_tag(idb['question'],
                                                                      objects_replace_name=objects_replace_name,
                                                                      non_obj_tag=non_obj_tag)

        idb['answer_choices'] = [self.retokenize_and_convert_to_ids_with_tag(answer,
                                                                             objects_replace_name=objects_replace_name,
                                                                             non_obj_tag=non_obj_tag)
                                 for answer in idb['answer_choices']]

        idb['rationale_choices'] = [self.retokenize_and_convert_to_ids_with_tag(rationale,
                                                                                objects_replace_name=objects_replace_name,
                                                                                non_obj_tag=non_obj_tag)
                                    for rationale in idb['rationale_choices']] if not self.task == 'Q2A' else None

        # truncate text to seq_len
        if self.task == 'Q2A':
            q = idb['question'][0]
            for a, a_raw in idb['answer_choices']:
                while len(q) + len(a) > self.seq_len:
                    if len(a) > len(q):
                        a.pop()
                    else:
                        q.pop()
        elif self.task == 'QA2R':
            if not self.test_mode:
                q = idb['question'][0]
                a = idb['answer_choices'][idb['answer_label']][0]
                for r, r_raw in idb['rationale_choices']:
                    while len(q) + len(a) + len(r) > self.seq_len:
                        if len(r) > (len(q) + len(a)):
                            r.pop()
                        elif len(q) > 1:
                            q.pop()
                        else:
                            a.pop()
        else:
            raise NotImplementedError('unsupported task: {}'.format(self.task))

        image = self._load_image(idb['img_fn'])
        w0, h0 = image.size
        objects = idb['objects']

        # extract bounding boxes and instance masks in metadata
        boxes = torch.zeros((len(objects), 6))
        masks = torch.zeros((len(objects), *self.mask_size))
        if len(objects) > 0:
            boxes[:, :5] = torch.tensor(idb['boxes'])
            boxes[:, 5] = torch.tensor([self.category_to_idx[obj] for obj in objects])
            for i in range(len(objects)):
                seg_polys = [torch.as_tensor(seg) for seg in idb['segms'][i]]
                masks[i] = generate_instance_mask(seg_polys, idb['boxes'][i], mask_size=self.mask_size,
                                                  dtype=torch.float32, copy=False)
        if self.add_image_as_a_box:
            image_box = torch.as_tensor([[0, 0, w0 - 1, h0 - 1, 1.0, 0]])
            image_mask = torch.ones((1, *self.mask_size))
            boxes = torch.cat((image_box, boxes), dim=0)
            masks = torch.cat((image_mask, masks), dim=0)

        question, question_raw = idb['question']
        question_align_matrix = get_align_matrix([w[2] for w in question])
        answer_choices, answer_choices_raw = zip(*idb['answer_choices'])
        answer_choices = list(answer_choices)
        answer_align_matrix = [get_align_matrix([w[2] for w in a]) for a in answer_choices]
        answer_label = torch.as_tensor(idb['answer_label']) if not self.test_mode else None
        if not self.task == 'Q2A':
            rationale_choices = [r[0] for r in idb['rationale_choices']]
            rationale_align_matrix = [get_align_matrix([w[2] for w in r]) for r in rationale_choices]
            rationale_label = torch.as_tensor(idb['rationale_label']) if not self.test_mode else None

        # transform
        im_info = torch.tensor([w0, h0, 1.0, 1.0, index])
        if self.transform is not None:
            image, boxes, masks, im_info = self.transform(image, boxes, masks, im_info)

        # clamp boxes
        w = im_info[0].item()
        h = im_info[1].item()
        boxes[:, [0, 2]] = boxes[:, [0, 2]].clamp(min=0, max=w - 1)
        boxes[:, [1, 3]] = boxes[:, [1, 3]].clamp(min=0, max=h - 1)

        if self.task == 'Q2AR':
            if not self.test_mode:
                outputs = (image, boxes, masks,
                           question, question_align_matrix,
                           answer_choices, answer_align_matrix, answer_label,
                           rationale_choices, rationale_align_matrix, rationale_label,
                           im_info)
            else:
                outputs = (image, boxes, masks,
                           question, question_align_matrix,
                           answer_choices, answer_align_matrix,
                           rationale_choices, rationale_align_matrix,
                           im_info)
        elif self.task == 'Q2A':
            if not self.test_mode:
                outputs = (image, boxes, masks,
                           question, question_align_matrix,
                           answer_choices, answer_align_matrix, answer_label,
                           im_info)
            else:
                outputs = (image, boxes, masks,
                           question, question_align_matrix,
                           answer_choices, answer_align_matrix,
                           im_info)
        elif self.task == 'QA2R':
            if not self.test_mode:
                outputs = (image, boxes, masks,
                           ([] if self.qa2r_noq else question) + answer_choices[answer_label],
                           answer_align_matrix[answer_label] if self.qa2r_noq else block_digonal_matrix(question_align_matrix, answer_align_matrix[answer_label]),
                           rationale_choices, rationale_align_matrix, rationale_label,
                           im_info)
            else:
                outputs = (image, boxes, masks,
                           [([] if self.qa2r_noq else question) + a for a in answer_choices],
                           [m if self.qa2r_noq else block_digonal_matrix(question_align_matrix, m)
                            for m in answer_align_matrix],
                           rationale_choices, rationale_align_matrix,
                           im_info)

        return outputs

    def __len__(self):
        return len(self.database)

    def _load_image(self, path):
        if '.zip@' in path:
            return self.zipreader.imread(path)
        else:
            return Image.open(path)

    def _load_json(self, path):
        if '.zip@' in path:
            f = self.zipreader.read(path)
            return json.loads(f.decode())
        else:
            with open(path, 'r') as f:
                return json.load(f)

    @property
    def data_names(self):
        if not self.test_mode:
            if self.task == 'Q2A':
                data_names = ['image', 'boxes', 'masks',
                              'question', 'question_align_matrix',
                              'answer_choices', 'answer_align_matrix', 'answer_label',
                              'im_info']
            elif self.task == 'QA2R':
                data_names = ['image', 'boxes', 'masks',
                              'question', 'question_align_matrix',
                              'rationale_choices', 'rationale_align_matrix', 'rationale_label',
                              'im_info']
            else:
                data_names = ['image', 'boxes', 'masks',
                              'question', 'question_align_matrix',
                              'answer_choices', 'answer_align_matrix', 'answer_label',
                              'rationale_choices', 'rationale_align_matrix', 'rationale_label',
                              'im_info']
        else:
            if self.task == 'Q2A':
                data_names = ['image', 'boxes', 'masks',
                              'question', 'question_align_matrix',
                              'answer_choices', 'answer_align_matrix',
                              'im_info']
            elif self.task == 'QA2R':
                data_names = ['image', 'boxes', 'masks',
                              'question', 'question_align_matrix',
                              'rationale_choices', 'rationale_align_matrix',
                              'im_info']
            else:
                data_names = ['image', 'boxes', 'masks',
                              'question', 'question_align_matrix',
                              'answer_choices', 'answer_align_matrix',
                              'rationale_choices', 'rationale_align_matrix',
                              'im_info']

        return data_names
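Usage note (a minimal sketch, not part of the original example): the tuple returned by __getitem__ is ordered exactly as data_names, so a fetched sample can be viewed as a dict for inspection. `dataset` below is a hypothetical, already-constructed instance of the VCR-style dataset class above with task='Q2A' and test_mode=False.

sample = dataset[0]                                # tuple ordered as dataset.data_names
named = dict(zip(dataset.data_names, sample))
print(named['question'][:3])                       # (wordpiece_id, box_tag, align_id) triples
print(named['boxes'].shape, named['masks'].shape)  # (num_boxes, 6) and (num_boxes, *mask_size)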
Example #9
class Distance_Translation_Multi30kDataset(Dataset):
    def __init__(self,
                 ann_file,
                 image_set,
                 root_path,
                 data_path,
                 seq_len=64,
                 with_precomputed_visual_feat=False,
                 mask_raw_pixels=True,
                 with_rel_task=True,
                 with_mlm_task=False,
                 with_mvrc_task=False,
                 transform=None,
                 test_mode=False,
                 zip_mode=False,
                 cache_mode=False,
                 cache_db=False,
                 ignore_db_cache=True,
                 tokenizer=None,
                 pretrained_model_name=None,
                 add_image_as_a_box=False,
                 aspect_grouping=False,
                 languages_used='first',
                 **kwargs):
        """
        Multi30k Translation Dataset

        :param ann_file: annotation jsonl file
        :param image_set: dataset split, one of 'train', 'val' or 'test2015'
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to the Multi30k dataset
        :param transform: transform
        :param test_mode: test mode means no labels available
        :param zip_mode: reading images and metadata in zip archive
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param ignore_db_cache: ignore previous cached database, reload it from annotation file
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add whole image as a box
        :param aspect_grouping: whether to group images via their aspect
        :param kwargs:
        """
        super(Distance_Translation_Multi30kDataset, self).__init__()

        assert not cache_mode, 'currently not support cache mode!'
        # TODO: need to remove this to allows testing
        # assert not test_mode

        annot = {
            'train': 'train.json',
            'val': 'val.json',
            'test2015': 'test.json'
        }

        self.seq_len = seq_len
        self.with_rel_task = with_rel_task
        self.with_mlm_task = with_mlm_task
        self.with_mvrc_task = with_mvrc_task
        self.data_path = data_path
        self.root_path = root_path
        self.ann_file = os.path.join(data_path, annot[image_set])
        self.with_precomputed_visual_feat = with_precomputed_visual_feat
        self.mask_raw_pixels = mask_raw_pixels
        self.image_set = image_set
        self.transform = transform
        self.test_mode = test_mode
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        #FM edit: added option for how many captions
        self.languages_used = languages_used
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        self.zipreader = ZipReader()

        # FM: Customise for multi30k dataset - only used for inference
        self.database = list(jsonlines.open(self.ann_file))
        # if not self.test_mode:
        #     self.database = list(jsonlines.open(self.ann_file))
        # # FM edit: create dataset for test mode
        # else:
        #     self.simple_database = list(jsonlines.open(self.ann_file))
        #     # create database cross-coupling each caption_en with all captions_de
        #     self.database = []
        #     db_index = 0
        #     for x, idb_x in enumerate(self.simple_database):
        #         for y, idb_y in enumerate(self.simple_database):
        #             self.database.append({})
        #             self.database[db_index]['label'] = 1.0 if x==y else 0.0
        #             self.database[db_index]['caption_en'] = self.simple_database[x]['caption_en']
        #             self.database[db_index]['caption_de'] = self.simple_database[y]['caption_de']
        #             self.database[db_index]['caption_en_index'] = x
        #             self.database[db_index]['caption_de_index'] = y
        #             db_index += 1

        if self.aspect_grouping:
            assert False, "not support aspect grouping currently!"
            self.group_ids = self.group_aspect(self.database)

        print('mask_raw_pixels: ', self.mask_raw_pixels)

    @property
    def data_names(self):
        return ['text', 'relationship_label', 'mlm_labels']

    def __getitem__(self, index):
        idb = self.database[index]

        # # indeces for inference
        # caption_en_index = idb['caption_en_index'] if self.test_mode else 0
        # caption_de_index = idb['caption_de_index'] if self.test_mode else 0

        # Task #1: Caption-Image Relationship Prediction
        _p = random.random()
        if not self.test_mode:
            if _p < 0.5:
                relationship_label = 1.0
                caption_en = idb['caption_en']
                caption_de = idb['caption_de']
            else:
                relationship_label = 0.0
                rand_index = random.randrange(0, len(self.database))
                while rand_index == index:
                    rand_index = random.randrange(0, len(self.database))
                # caption_en and image match, german caption is random
                caption_en = idb['caption_en']
                caption_de = self.database[rand_index]['caption_de']
        # for inference
        else:
            relationship_label = 1.0
            caption_en = idb['caption_en']
            caption_de = idb['caption_de']

        # FM edit: add captions
        caption_tokens_en = self.tokenizer.tokenize(caption_en)
        caption_tokens_de = self.tokenizer.tokenize(caption_de)
        mlm_labels_en = [-1] * len(caption_tokens_en)
        mlm_labels_de = [-1] * len(caption_tokens_de)

        # FM edit: captions of both languages exist in all cases
        if self.languages_used == 'first':
            text_tokens = ['[CLS]'] + caption_tokens_en + ['[SEP]']
            mlm_labels = [-1] + mlm_labels_en + [-1]
        elif self.languages_used == 'second':
            text_tokens = ['[CLS]'] + caption_tokens_de + ['[SEP]']
            mlm_labels = [-1] + mlm_labels_de + [-1]
        else:
            text_tokens = ['[CLS]'] + caption_tokens_en + [
                '[SEP]'
            ] + caption_tokens_de + ['[SEP]']
            mlm_labels = [-1] + mlm_labels_en + [-1] + mlm_labels_de + [-1]

        # convert tokens to ids
        text = self.tokenizer.convert_tokens_to_ids(text_tokens)

        # truncate to seq_len, always keeping the trailing [SEP] token
        if len(text) > self.seq_len:
            text_len_keep = max(self.seq_len, 2)
            text = text[:(text_len_keep - 1)] + [text[-1]]

        return text, relationship_label, mlm_labels

    @staticmethod
    def b64_decode(string):
        return base64.decodebytes(string.encode())

    @staticmethod
    def group_aspect(database):
        print('grouping aspect...')
        t = time.time()

        # get shape of all images
        widths = torch.as_tensor([idb['width'] for idb in database])
        heights = torch.as_tensor([idb['height'] for idb in database])

        # group
        group_ids = torch.zeros(len(database))
        horz = widths >= heights
        vert = ~horz  # logical not; avoids subtracting from a bool tensor
        group_ids[horz] = 0
        group_ids[vert] = 1

        print('Done (t={:.2f}s)'.format(time.time() - t))

        return group_ids

    def __len__(self):
        return len(self.database)

    def _load_image(self, path):
        if '.zip@' in path:
            return self.zipreader.imread(path).convert('RGB')
        else:
            return Image.open(path).convert('RGB')

    def _load_json(self, path):
        if '.zip@' in path:
            f = self.zipreader.read(path)
            return json.loads(f.decode())
        else:
            with open(path, 'r') as f:
                return json.load(f)
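Usage note (a minimal sketch, not part of the original example): the paths below are placeholders, and data_path is assumed to contain train.json / val.json / test.json jsonlines files with 'caption_en' and 'caption_de' fields, as the class expects; the BERT tokenizer is downloaded on first use.

ds = Distance_Translation_Multi30kDataset(
    ann_file=None,              # unused: the file is derived from data_path and image_set
    image_set='val',
    root_path='./cache_root',   # placeholder
    data_path='./multi30k',     # placeholder
    seq_len=64,
    languages_used='both',      # any value other than 'first'/'second' keeps both captions
    test_mode=True)
text_ids, relationship_label, mlm_labels = ds[0]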
Example #10
class Foil(Dataset):
    def __init__(self,
                 root_path,
                 data_path,
                 boxes='gt',
                 proposal_source='official',
                 transform=None,
                 test_mode=False,
                 zip_mode=False,
                 cache_mode=False,
                 cache_db=False,
                 ignore_db_cache=True,
                 tokenizer=None,
                 pretrained_model_name=None,
                 add_image_as_a_box=False,
                 mask_size=(14, 14),
                 aspect_grouping=False,
                 **kwargs):
        """
        Foil Dataset

        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to dataset
        :param boxes: boxes to use, 'gt' or 'proposal'
        :param transform: transform
        :param test_mode: test mode means no labels available
        :param zip_mode: reading images and metadata in zip archive
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param ignore_db_cache: ignore previous cached database, reload it from annotation file
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add whole image as a box
        :param mask_size: size of instance mask of each object
        :param aspect_grouping: whether to group images via their aspect
        :param kwargs:
        """
        super(Foil, self).__init__()

        assert not cache_mode, 'currently not support cache mode!'

        coco_annot_files = {
            "train2014": "annotations/instances_train2014.json",
            "val2014": "annotations/instances_val2014.json",
            "test2015": "annotations/image_info_test2015.json",
        }

        foil_annot_files = {
            "train": "foil/foilv1.0_train_2017.json",
            "test": "foil/foilv1.0_test_2017.json"
        }

        foil_vocab_file = "foil/vocab.txt"

        self.vg_proposal = ("vgbua_res101_precomputed",
                            "trainval2014_resnet101_faster_rcnn_genome")

        self.test_mode = test_mode
        self.data_path = data_path
        self.root_path = root_path
        self.transform = transform

        with open(os.path.join(data_path, foil_vocab_file), 'r') as vocab_file:
            vocab_lines = [v.strip() for v in vocab_file.readlines()]
        self.itos = vocab_lines
        self.stoi = {w: i for i, w in enumerate(vocab_lines)}

        if self.test_mode:
            self.image_set = "val2014"
            coco_annot_file = coco_annot_files["val2014"]
        else:
            self.image_set = "train2014"
            coco_annot_file = coco_annot_files["train2014"]

        self.coco = COCO(
            annotation_file=os.path.join(data_path, coco_annot_file))
        self.foil = FOIL(data_path, 'train' if not test_mode else 'test')
        self.foil_ids = list(self.foil.Foils.keys())
        self.foils = self.foil.loadFoils(foil_ids=self.foil_ids)
        if 'proposal' in boxes:
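            # NOTE: `proposal_dets` (the filename of the proposal detections json) is
            # assumed to be defined elsewhere; it is not set anywhere in this snippet.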
            with open(os.path.join(data_path, proposal_dets), 'r') as f:
                proposal_list = json.load(f)
            self.proposals = {}
            for proposal in proposal_list:
                image_id = proposal['image_id']
                if image_id in self.proposals:
                    self.proposals[image_id].append(proposal['box'])
                else:
                    self.proposals[image_id] = [proposal['box']]
        self.boxes = boxes
        self.proposal_source = proposal_source
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        self.mask_size = mask_size
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        if zip_mode:
            self.zipreader = ZipReader()

        self.database = self.load_annotations()
        if self.aspect_grouping:
            self.group_ids = self.group_aspect(self.database)

    @property
    def data_names(self):
        return [
            'image', 'boxes', 'im_info', 'expression', 'label', 'pos',
            'target', 'mask'
        ]

    def __getitem__(self, index):
        idb = self.database[index]

        # image related
        img_id = idb['image_id']
        image = self._load_image(idb['image_fn'])
        im_info = torch.as_tensor([idb['width'], idb['height'], 1.0, 1.0])
        #if not self.test_mode:
        #    gt_box = torch.as_tensor(idb['gt_box'])
        flipped = False
        if self.boxes == 'gt':
            ann_ids = self.coco.getAnnIds(imgIds=img_id)
            anns = self.coco.loadAnns(ann_ids)
            boxes = []
            for ann in anns:
                x_, y_, w_, h_ = ann['bbox']
                boxes.append([x_, y_, x_ + w_, y_ + h_])
            boxes = torch.as_tensor(boxes)
        elif self.boxes == 'proposal':
            if self.proposal_source == 'official':
                boxes = torch.as_tensor(self.proposals[img_id])
                boxes[:, [2, 3]] += boxes[:, [0, 1]]
            elif self.proposal_source == 'vg':
                box_file = os.path.join(
                    self.data_path, self.vg_proposal[0],
                    '{0}.zip@/{0}'.format(self.vg_proposal[1]))
                boxes_fn = os.path.join(box_file,
                                        '{}.json'.format(idb['image_id']))
                boxes_data = self._load_json(boxes_fn)
                boxes = torch.as_tensor(
                    np.frombuffer(self.b64_decode(boxes_data['boxes']),
                                  dtype=np.float32).reshape(
                                      (boxes_data['num_boxes'], -1)))
            else:
                raise NotImplementedError('unknown proposal source: {}'.format(self.proposal_source))
        elif self.boxes == 'proposal+gt' or self.boxes == 'gt+proposal':
            if self.proposal_source == 'official':
                boxes = torch.as_tensor(self.proposals[img_id])
                boxes[:, [2, 3]] += boxes[:, [0, 1]]
            elif self.proposal_source == 'vg':
                box_file = os.path.join(
                    self.data_path, self.vg_proposal[0],
                    '{0}.zip@/{0}'.format(self.vg_proposal[1]))
                boxes_fn = os.path.join(box_file,
                                        '{}.json'.format(idb['image_id']))
                boxes_data = self._load_json(boxes_fn)
                boxes = torch.as_tensor(
                    np.frombuffer(self.b64_decode(boxes_data['boxes']),
                                  dtype=np.float32).reshape(
                                      (boxes_data['num_boxes'], -1)))
            ann_ids = self.coco.getAnnIds(imgIds=img_id)
            anns = self.coco.loadAnns(ann_ids)
            gt_boxes = []
            for ann in anns:
                x_, y_, w_, h_ = ann['bbox']
                gt_boxes.append([x_, y_, x_ + w_, y_ + h_])
            gt_boxes = torch.as_tensor(gt_boxes)
            boxes = torch.cat((boxes, gt_boxes), 0)
        else:
            raise NotImplementedError('unknown boxes setting: {}'.format(self.boxes))

        if self.add_image_as_a_box:
            w0, h0 = im_info[0], im_info[1]
            image_box = torch.as_tensor([[0.0, 0.0, w0 - 1, h0 - 1]])
            boxes = torch.cat((image_box, boxes), dim=0)

        if self.transform is not None:
            image, boxes, _, im_info, flipped = self.transform(
                image, boxes, None, im_info, flipped)

        # clamp boxes
        w = im_info[0].item()
        h = im_info[1].item()
        boxes[:, [0, 2]] = boxes[:, [0, 2]].clamp(min=0, max=w - 1)
        boxes[:, [1, 3]] = boxes[:, [1, 3]].clamp(min=0, max=h - 1)

        # assign label expression with the foil annotation
        label = idb['label']
        foil_pos = idb['pos']

        # expression
        exp = idb['caption_tokens']
        exp_ids = self.tokenizer.convert_tokens_to_ids(exp)

        target = self.stoi[idb['target_word']]
        mask = idb['mask']
        # identical outputs in train and test mode
        return image, boxes, im_info, exp_ids, label, foil_pos, target, mask

    @staticmethod
    def b64_decode(string):
        return base64.decodebytes(string.encode())

    def load_annotations(self):
        tic = time.time()
        database = []
        db_cache_name = 'foil_{}'.format(self.image_set)
        if self.zip_mode:
            db_cache_name = db_cache_name + '_zipmode'
        if self.test_mode:
            db_cache_name = db_cache_name + '_testmode'
        db_cache_root = os.path.join(self.root_path, 'cache')
        db_cache_path = os.path.join(db_cache_root,
                                     '{}.pkl'.format(db_cache_name))

        if os.path.exists(db_cache_path):
            if not self.ignore_db_cache:
                # reading cached database
                print('cached database found in {}.'.format(db_cache_path))
                with open(db_cache_path, 'rb') as f:
                    print('loading cached database from {}...'.format(
                        db_cache_path))
                    tic = time.time()
                    database = cPickle.load(f)
                    print('Done (t={:.2f}s)'.format(time.time() - tic))
                    return database
            else:
                print('cached database ignored.')

        # ignore or not find cached database, reload it from annotation file
        print('loading database of split {}...'.format(self.image_set))
        tic = time.time()

        for foil_id, foil in zip(self.foil_ids, self.foils):
            iset = 'train2014'
            if self.zip_mode:
                image_fn = os.path.join(
                    self.data_path, iset + '.zip@/' + iset,
                    'COCO_{}_{:012d}.jpg'.format(iset, foil['image_id']))
            else:
                image_fn = os.path.join(
                    self.root_path, self.data_path, iset,
                    'COCO_{}_{:012d}.jpg'.format(iset, foil['image_id']))

            expression_tokens = self.tokenizer.basic_tokenizer.tokenize(
                foil['caption'])
            expression_wps = []
            for token in expression_tokens:
                expression_wps.extend(
                    self.tokenizer.wordpiece_tokenizer.tokenize(token))

            word_offsets = [0]

            for i, wp in enumerate(expression_wps):
                if wp[0] == '#':
                    #still inside single word
                    continue
                else:
                    #this is the beginning of a new word
                    word_offsets.append(i)

            word_offsets.append(len(expression_wps))

            target_word = foil['target_word']
            foil_word = foil['foil_word']
            target_wps = None
            target_pos = -1
            if foil['foil']:
                foil_wps = self.tokenizer.wordpiece_tokenizer.tokenize(
                    foil_word)
                twps_len = len(foil_wps)
                for i in range(len(expression_wps) - twps_len + 1):  # include a match ending at the last wordpiece
                    if expression_wps[i:i + twps_len] == foil_wps:
                        target_pos = i
                        break
            else:
                twps_len = 1
            idb = {
                'ann_id': foil['id'],
                'foil_id': foil['foil_id'],
                'image_id': foil['image_id'],
                'image_fn': image_fn,
                'width': self.coco.imgs[foil['image_id']]['width'],
                'height': self.coco.imgs[foil['image_id']]['height'],
                'caption': foil['caption'].strip(),
                'caption_tokens': expression_wps,
                'target_word': foil['target_word'],
                'target': self.stoi.get(foil['target_word'], 0),
                'foil_word': foil['foil_word'],
                'label': foil['foil'],
                'pos': target_pos,
                'mask': twps_len
            }
            database.append(idb)

        print('Done (t={:.2f}s)'.format(time.time() - tic))

        # cache database via cPickle
        if self.cache_db:
            print('caching database to {}...'.format(db_cache_path))
            tic = time.time()
            if not os.path.exists(db_cache_root):
                makedirsExist(db_cache_root)
            with open(db_cache_path, 'wb') as f:
                cPickle.dump(database, f)
            print('Done (t={:.2f}s)'.format(time.time() - tic))

        return database

    @staticmethod
    def group_aspect(database):
        print('grouping aspect...')
        t = time.time()

        # get shape of all images
        widths = torch.as_tensor([idb['width'] for idb in database])
        heights = torch.as_tensor([idb['height'] for idb in database])

        # group
        group_ids = torch.zeros(len(database))
        horz = widths >= heights
        vert = ~horz  # logical not; avoids subtracting from a bool tensor
        group_ids[horz] = 0
        group_ids[vert] = 1

        print('Done (t={:.2f}s)'.format(time.time() - t))

        return group_ids

    def __len__(self):
        return len(self.database)

    def _load_image(self, path):
        if '.zip@' in path:
            return self.zipreader.imread(path).convert('RGB')
        else:
            return Image.open(path).convert('RGB')

    def _load_json(self, path):
        if '.zip@' in path:
            f = self.zipreader.read(path)
            return json.loads(f.decode())
        else:
            with open(path, 'r') as f:
                return json.load(f)
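Usage note (a minimal sketch, not part of the original example): the paths are placeholders, and data_path is assumed to contain the COCO annotations and FOIL files at the relative locations listed in coco_annot_files, foil_annot_files and foil_vocab_file above.

foil_ds = Foil(root_path='./cache_root',   # placeholder
               data_path='./coco_foil',    # placeholder
               boxes='gt',
               test_mode=False)
image, boxes, im_info, exp_ids, label, pos, target, mask = foil_ds[0]
print(len(foil_ds), boxes.shape, label, pos)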