Example #1
    def __getitem__(self, index):

        if self.complete_shuffle:
            # Decompose the flat index into (text-variant, item index): take the
            # variant first, then reduce index to the item index.
            if self.pretraining_include_qa_and_qar:
                which = index % 8
                index = index // 8
            else:
                which = index % 4
                index = index // 4
        else:
            which = None

        item = deepcopy(self.items[index])

        ###################################################################
        # Load questions and answers

        answer_choices = item['{}_choices'.format(self.mode)]

        # With complete_shuffle and both QA and QA->R included, variants 0-3 are the
        # four answer choices and variants 4-7 the four rationale (QA->R) choices.
        only_use_answer = bool(self.complete_shuffle and which < 4)
        only_use_qar = bool(self.complete_shuffle and which >= 4)

        dets2use, old_det_to_new_ind = self._get_dets_to_use(
            item, only_use_answer=only_use_answer, only_use_qar=only_use_qar)

        # The only_use_qar flag is ambiguous...

        instance_dict = {}
        if self.split != 'test':
            instance_dict['label'] = LabelField(item['{}_label'.format(
                self.mode)],
                                                skip_indexing=True)
        instance_dict['metadata'] = MetadataField({
            'annot_id':
            item['annot_id'],
            'ind':
            index,
            'movie':
            item['movie'],
            'img_fn':
            item['img_fn'],
            'question_number':
            item['question_number']
        })

        ###################################################################
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        image = load_image(os.path.join(self.vcr_image_dir, item['img_fn']))
        #image = self.imagedatas(item['img_fn'])

        image, window, img_scale, padding = resize_image(
            image, random_pad=self.is_train)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape

        ###################################################################
        # Load boxes.
        with open(os.path.join(self.vcr_image_dir, item['metadata_fn']),
                  'r') as f:
            metadata = json.load(f)

        # [nobj, 14, 14]
        segms = np.stack([
            make_mask(mask_size=14,
                      box=metadata['boxes'][i],
                      polygons_list=metadata['segms'][i]) for i in dets2use
        ])

        # Chop off the final dimension, that's the confidence
        boxes = np.array(metadata['boxes'])[dets2use, :-1]
        # Possibly rescale them if necessary
        boxes *= img_scale
        boxes[:, :2] += np.array(padding[:2])[None]
        boxes[:, 2:] += np.array(padding[:2])[None]
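        # e.g. with img_scale 0.5 and a (left, top) pad of (3, 7): [10, 20, 110, 220] -> [8, 17, 58, 117]
        # (assuming padding[:2] holds the (left, top) offset, as the indexing above suggests)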
        obj_labels = [
            self.coco_obj_to_ind[item['objects'][i]]
            for i in dets2use.tolist()
        ]
        if self.add_image_as_a_box:
            boxes = np.row_stack((window, boxes))
            segms = np.concatenate((np.ones(
                (1, 14, 14), dtype=np.float32), segms), 0)
            obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

        examples = data_iter_item(
            item,
            tokenizer=self.tokenizer,
            max_seq_length=self.max_seq_length,
            endingonly=False,
            include_qar=self.pretraining_include_qa_and_qar,
            only_qar=self.only_qar)
        self.getitem_bert_part(examples, item, instance_dict, which)

        if self.use_alignment:  # Alignment between objects and text
            ######################
            examples_alignment_pack = []
            for i in range(len(examples)):
                if self.pretraining_include_qa_and_qar:
                    if i < 4:
                        raw_text_a = item["question"]
                        raw_text_b = item['answer_choices'][i]
                    else:
                        raw_text_a = item["question"] + item['answer_choices'][
                            item['answer_label']]
                        raw_text_b = item['rationale_choices'][i - 4]
                elif self.only_qar:
                    raw_text_a = item["question"] + item['answer_choices'][item[
                        'answer_label']]  # This is the correct alignment right now.
                    raw_text_b = item['rationale_choices'][i]
                else:
                    raw_text_a = item["question"]
                    raw_text_b = item['answer_choices'][i]

                true_text_a = examples[i][0].text_a
                true_text_b = examples[i][0].text_b
                text_alignment_a = examples[i][1]
                text_alignment_b = examples[i][2]

                examples_alignment_pack.append(
                    (raw_text_a, raw_text_b, true_text_a, true_text_b,
                     text_alignment_a, text_alignment_b))

            image_box_position = []

            if which is not None:
                (raw_text_a, raw_text_b, true_text_a, true_text_b,
                 text_alignment_a, text_alignment_b) = examples_alignment_pack[which]
                box_record = defaultdict(list)
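                # The offsets below presumably account for special tokens: 1 for the leading
                # [CLS], and 1 + len(text_alignment_a) + 1 for [CLS] + text_a + [SEP] before text_b.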
                self.get_alignment_original(raw_text_a,
                                            text_alignment_a,
                                            old_det_to_new_ind,
                                            box_record,
                                            offset=1)
                self.get_alignment_original(raw_text_b,
                                            text_alignment_b,
                                            old_det_to_new_ind,
                                            box_record,
                                            offset=1 + len(text_alignment_a) +
                                            1)
                image_text_alignment = ListField([
                    IntArrayField(np.array(box_record[i]), padding_value=-1)
                    for i in range(len(boxes))
                ])
            else:
                for raw_text_a, raw_text_b, true_text_a, true_text_b, text_alignment_a, text_alignment_b in examples_alignment_pack:

                    box_record = defaultdict(list)
                    self.get_alignment_original(raw_text_a,
                                                text_alignment_a,
                                                old_det_to_new_ind,
                                                box_record,
                                                offset=1)
                    self.get_alignment_original(raw_text_b,
                                                text_alignment_b,
                                                old_det_to_new_ind,
                                                box_record,
                                                offset=1 +
                                                len(text_alignment_a) + 1)

                    image_box_position.append(
                        ListField([
                            IntArrayField(np.array(box_record[i]),
                                          padding_value=-1)
                            for i in range(len(boxes))
                        ]))

                image_text_alignment = ListField(image_box_position)
            ######################

            instance_dict["image_text_alignment"] = image_text_alignment

        instance_dict['segms'] = ArrayField(segms, padding_value=0)
        instance_dict['objects'] = ListField(
            [LabelField(x, skip_indexing=True) for x in obj_labels])

        if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
            import ipdb
            ipdb.set_trace()
        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))
        instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

        instance = Instance(instance_dict)
        instance.index_fields(self.vocab)
        return image, instance
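
A note on the complete_shuffle indexing above: every item is expanded into 8 text variants (4 answer choices plus 4 QA->R rationales) or 4 variants, so the dataset's __len__ has to grow by the same factor. A minimal sketch of such a companion method, not taken from the original class, might look like this:

    def __len__(self):
        # Hypothetical companion to the complete_shuffle indexing in __getitem__ above:
        # each item contributes one sample per text variant.
        if self.complete_shuffle:
            factor = 8 if self.pretraining_include_qa_and_qar else 4
            return len(self.items) * factor
        return len(self.items)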
Example #2
    def __getitem__(self, index):
        event_inference_example = torch.tensor(self.examples[index])
        labels = torch.tensor(self.labels[index])
        record = self.records[index]
        if not self.include_image:
            return event_inference_example, labels

        #######
        # Compute Image Features. Adapted from https://github.com/rowanz/r2c/blob/master/dataloaders/vcg.py
        #######

        ###################################################################
        # Load boxes and their features.
        with open(os.path.join(VCR_IMAGES_DIR, record['metadata_fn']), 'r') as f:
            metadata = json.load(f)
        dets2use, old_det_to_new_ind, subjects = self.get_dets_to_use(record)
        # [nobj, 14, 14]
        segms = np.stack([make_mask(mask_size=14, box=metadata['boxes'][i],
                                    polygons_list=metadata['segms'][i])
                          for i in dets2use])

        # Load precomputed per-object features for this image.
        img_fn = record['img_fn']
        img_id = img_fn[img_fn.rfind('/') + 1:img_fn.rfind('.')]
        with open(os.path.join(VCR_FEATURES_DIR, img_id) + '.pkl', 'rb') as p:
            features_dict = pickle.load(p)
        features = features_dict['object_features'][dets2use]

        # Chop off the final dimension, that's the confidence
        boxes = np.array(metadata['boxes'])[dets2use, :-1]

        # create id labels to help ground person in the image
        objects = metadata['names']
        obj_labels = [self.coco_obj_to_ind[objects[i]] for i in
                      dets2use.tolist()]
        person_ids = [0] * len(obj_labels)
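        # Detections classified as person (class index 1 in coco_obj_to_ind, presumably) get a
        # '<|det%d|>' token id below; everything else keeps 0.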
        for i in range(len(person_ids)):
            if obj_labels[i] == 1:
                p_id = int(dets2use[i]) + 1  # +1 because '<|det0|>' is reserved for the whole image
                person_ids[i] = self.tokenizer.convert_tokens_to_ids(['<|det%d|>' % p_id])[0]
        subject_ids = [int(dets2use[i] in subjects) for i in range(len(obj_labels))]

        # add the image in the first visual sequence
        if self.add_image_as_a_box:
            w = metadata['width']
            h = metadata['height']
            features = np.row_stack((features_dict['image_features'], features))
            boxes = np.row_stack((np.array([0, 0, w, h]), boxes))
            segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)
            obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels
            person_ids = [self.tokenizer.convert_tokens_to_ids(['<|det0|>'])[0]] + person_ids
            subject_ids = [0] + subject_ids

        if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
            import ipdb
            ipdb.set_trace()
        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        if not np.all((boxes[:, 2] <= w)):
            boxes[:,2] = np.clip(boxes[:,2],None,w)
        if not np.all((boxes[:, 3] <= h)):
            boxes[:, 3] = np.clip(boxes[:, 3], None, h)

        # Pad / truncate everything visual to num_max_boxes so samples batch into fixed-size tensors.
        padded_features, padded_boxes, padded_obj_labels, padded_segments, box_masks = \
            _to_boxes_and_masks(features, boxes, obj_labels, segms, self.num_max_boxes)
        person_ids = _pad_ids(person_ids, self.num_max_boxes)
        subject_ids = _pad_ids(subject_ids, self.num_max_boxes)

        features = torch.Tensor(padded_features)
        boxes = torch.Tensor(padded_boxes)
        boxes_mask = torch.LongTensor(box_masks)
        objects = torch.LongTensor(padded_obj_labels)
        segments = torch.Tensor(padded_segments)
        person_ids = torch.LongTensor(person_ids)
        subject_ids = torch.LongTensor(subject_ids)

        return event_inference_example, labels, features, boxes, boxes_mask, objects, segments, person_ids, subject_ids
Example #3
    def __getitem_detector__(self, index):
        item = self.items[index]
        sample = {}
        if self.expanded and index >= self.train_size:
            image_file_name = "COCO_val2014_{:0>12d}.jpg".format(
                item['image_id'])
        else:
            image_file_name = "COCO_{}2014_{:0>12d}.jpg".format(
                self.split_name, item['image_id'])
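        # e.g. image_id 391895 -> "COCO_val2014_000000391895.jpg" (ids are zero-padded to 12 digits)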

        image_info = self.masks[image_file_name]
        if "train" in image_file_name:
            image_file_path = os.path.join(self.data_root, "train2014",
                                           image_file_name)
        elif "val" in image_file_name:
            image_file_path = os.path.join(self.data_root, "val2014",
                                           image_file_name)

        ###################################################################
        # Most of things adapted from VCR
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        image = load_image(image_file_path)
        image, window, img_scale, padding = resize_image(
            image, random_pad=self.is_train)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape
        ###################################################################
        metadata = self.masks[image_file_name]  # Get the metadata
        # Load boxes.
        # We will use all detections
        dets2use = np.arange(len(metadata['boxes']))
        # [nobj, 14, 14]
        segms = np.stack([
            make_mask(mask_size=14,
                      box=metadata['boxes'][i],
                      polygons_list=metadata['segms'][i]) for i in dets2use
        ])

        # Chop off the final dimension, that's the confidence
        boxes = np.array(metadata['boxes'])[dets2use, :-1]
        # Possibly rescale them if necessary
        boxes *= img_scale
        boxes[:, :2] += np.array(padding[:2])[None]
        boxes[:, 2:] += np.array(padding[:2])[None]

        try:
            # Metadata names apparently look like "12 (person)"; reduce them to the bare class name.
            metadata['names'] = [
                i.split(" ")[1][1:-1] for i in metadata["names"]
            ]
        except Exception:
            # Names are already plain class strings; keep them unchanged.
            pass
        obj_labels = [
            self.coco_obj_to_ind[metadata['names'][i]]
            for i in dets2use.tolist()
        ]

        boxes = np.row_stack((window, boxes))
        segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms),
                               0)
        obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

        sample['segms'] = ArrayField(segms, padding_value=0)
        sample['objects'] = ListField(
            [LabelField(x, skip_indexing=True) for x in obj_labels])

        if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
            import ipdb
            ipdb.set_trace()
        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))
        sample['boxes'] = ArrayField(boxes, padding_value=-1)

        caption_a = item["caption"]
        imageID = item["image_id"]

        sample["label"] = sample[
            'objects']  # Unused field; it only lets downstream code infer the batch size.

        if self.expanded and index >= self.train_size:
            coco = self.coco_val
        else:
            coco = self.coco

        rest_anns = coco.loadAnns(
            [i for i in coco.getAnnIds(imgIds=imageID) if i != item['id']])

        if self.args.get("two_sentence", True):
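            # 50/50 negative sampling: caption_b comes either from a random other item ("not next
            # sentence") or from another annotation of the same image ("is next sentence").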
            if random.random() > 0.5:
                item_b = self.items[random.randint(0, len(self.items) - 1)]
                flag = False
            else:
                item_b = rest_anns[random.randint(0, len(rest_anns) - 1)]
                flag = True  # is next sentence

            caption_b = item_b["caption"]
            subword_tokens_a = self.tokenizer.tokenize(caption_a)
            subword_tokens_b = self.tokenizer.tokenize(caption_b)
            bert_example = InputExample(unique_id=index,
                                        text_a=subword_tokens_a,
                                        text_b=subword_tokens_b,
                                        is_correct=flag,
                                        max_seq_length=self.max_seq_length)
        elif not self.args.get("no_next_sentence", False):
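            # Single-caption next-sentence setup: with probability false_caption_ratio pair the image
            # with a caption from a different image (negative), otherwise keep its own caption (positive).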
            if random.random() < self.args.false_caption_ratio:
                item_b = self.items[random.randint(0, len(self.items) - 1)]
                while item_b["image_id"] == imageID:
                    item_b = self.items[random.randint(0, len(self.items) - 1)]
                flag = False
            else:
                item_b = item
                flag = True  # is next sentence

            caption_b = item_b["caption"]
            subword_tokens_b = self.tokenizer.tokenize(caption_b)
            bert_example = InputExample(unique_id=index,
                                        text_a=subword_tokens_b,
                                        text_b=None,
                                        is_correct=flag,
                                        max_seq_length=self.max_seq_length)
        else:
            subword_tokens_a = self.tokenizer.tokenize(caption_a)
            bert_example = InputExample(unique_id=index,
                                        text_a=subword_tokens_a,
                                        text_b=None,
                                        is_correct=None,
                                        max_seq_length=self.max_seq_length)

        bert_feature = InputFeatures.convert_one_example_to_features_pretraining(
            example=bert_example,
            tokenizer=self.tokenizer,
            probability=self.masked_lm_prob)
        bert_feature.insert_field_into_dict(sample)

        return image, Instance(sample)
Example #4
    def __getitem__(self, index):
        # if self.split == 'test':
        #     raise ValueError("blind test mode not supported quite yet")
        item = deepcopy(self.items[index])

        ###################################################################
        # Load questions and answers
        if self.mode == 'rationale':
            conditioned_label = item[
                'answer_label'] if self.split != 'test' else self.conditioned_answer_choice
            item['question'] += item['answer_choices'][conditioned_label]

        answer_choices = item['{}_choices'.format(self.mode)]
        dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

        ###################################################################
        # Load in BERT. We'll get contextual representations of the context and the answer choices
        # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
        with h5py.File(self.h5fn, 'r') as h5:
            grp_items = {
                k: np.array(v, dtype=np.float16)
                for k, v in h5[str(index)].items()
            }

        # Essentially we need to condition on the right answer choice here, if we're doing QA->R. We will always
        # condition on the `conditioned_answer_choice`.
        condition_key = self.conditioned_answer_choice if self.split == "test" and self.mode == "rationale" else ""
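        # Resulting h5 keys look like 'ctx_answer0'..'ctx_answer3' for Q->A, or e.g.
        # 'ctx_rationale20'..'ctx_rationale23' when conditioning on answer choice 2 at test time.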

        instance_dict = {}
        if 'endingonly' not in self.embs_to_load:
            questions_tokenized, question_tags = zip(*[
                _fix_tokenization(
                    item['question'],
                    grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                    old_det_to_new_ind,
                    item['objects'],
                    token_indexers=self.token_indexers,
                    pad_ind=0 if self.add_image_as_a_box else -1)
                for i in range(4)
            ])
            instance_dict['question'] = ListField(questions_tokenized)
            instance_dict['question_tags'] = ListField(question_tags)

        answers_tokenized, answer_tags = zip(*[
            _fix_tokenization(
                answer,
                grp_items[f'answer_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                token_indexers=self.token_indexers,
                pad_ind=0 if self.add_image_as_a_box else -1)
            for i, answer in enumerate(answer_choices)
        ])

        instance_dict['answers'] = ListField(answers_tokenized)

        instance_dict['answer_tags'] = ListField(answer_tags)
        if self.split != 'test':
            instance_dict['label'] = LabelField(item['{}_label'.format(
                self.mode)],
                                                skip_indexing=True)
        instance_dict['metadata'] = MetadataField({
            'annot_id':
            item['annot_id'],
            'ind':
            index,
            'movie':
            item['movie'],
            'img_fn':
            item['img_fn'],
            'question_number':
            item['question_number'],
            'img_id':
            item['img_id']
        })

        ###################################################################
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
        image, window, img_scale, padding = resize_image(
            image, random_pad=self.is_train)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape

        ###################################################################
        # Load boxes.
        with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
            metadata = json.load(f)

        # [nobj, 14, 14]
        segms = np.stack([
            make_mask(mask_size=14,
                      box=metadata['boxes'][i],
                      polygons_list=metadata['segms'][i]) for i in dets2use
        ])

        # Chop off the final dimension, that's the confidence
        boxes = np.array(metadata['boxes'])[dets2use, :-1]

        # Possibly rescale them if necessary
        boxes *= img_scale
        boxes[:, :2] += np.array(padding[:2])[None]
        boxes[:, 2:] += np.array(padding[:2])[None]
        obj_labels = [
            self.coco_obj_to_ind[item['objects'][i]]
            for i in dets2use.tolist()
        ]
        if self.add_image_as_a_box:
            boxes = np.row_stack((window, boxes))
            segms = np.concatenate((np.ones(
                (1, 14, 14), dtype=np.float32), segms), 0)
            obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

        instance_dict['segms'] = ArrayField(segms, padding_value=0)
        instance_dict['objects'] = ListField(
            [LabelField(x, skip_indexing=True) for x in obj_labels])

        if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
            import ipdb
            ipdb.set_trace()
        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))

        instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

        instance = Instance(instance_dict)
        instance.index_fields(self.vocab)
        return image, instance
Example #5
    def __getitem__(self, index):
        item = deepcopy(self.items[index])

        ###################################################################
        # Load questions and answers
        if self.mode == 'rationale':
            item['question'] += item['answer_choices'][item['answer_label']]
        elif self.mode == 'joint':
            item['joint_choices'] = [a + r for a in item['answer_choices'] \
                                            for r in item['rationale_choices']]
            if self.split != 'test':
                item['joint_label'] = item['answer_label'] * 4 + item[
                    'rationale_label']
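                # e.g. answer_label 2 and rationale_label 3 -> joint_label 2 * 4 + 3 = 11,
                # matching the answer-major ordering of joint_choices above.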
        answer_choices = item['{}_choices'.format(self.mode)]
        dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

        ###################################################################
        # Load in BERT. We'll get contextual representations of the context and the answer choices
        with h5py.File(self.h5fn, 'r') as h5:
            grp_items = {
                k: np.array(v, dtype=np.float16)
                for k, v in h5[str(index)].items()
            }

        omcs_items = None
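        # Optional second set of precomputed embeddings (OMCS); when present they are concatenated
        # feature-wise (np.hstack) with the BERT embeddings further below.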
        if self.h5fn_omcs is not None:
            with h5py.File(self.h5fn_omcs, 'r') as h5_omcs:
                omcs_items = {
                    k: np.array(v, dtype=np.float16)
                    for k, v in h5_omcs[str(index)].items()
                }

        if self.all_answers_for_rationale:
            # Keys in h5 file are in format [ctx|answer]_rationale[i][j].
            # Pick i based on the answer_label set.
            assert self.mode == 'rationale'
            answer_label = item['answer_label']
            key = f'{self.mode}{answer_label}'
        else:
            # Keys are in format [ctx|answer]_mode[j]
            key = f'{self.mode}'

        instance_dict = {}
        if 'endingonly' not in self.embs_to_load:
            if omcs_items is None:
                ctx_embs = [
                    grp_items[f'ctx_{key}{j}']
                    for j in range(len(answer_choices))
                ]
            else:
                ctx_embs = [
                    np.hstack([
                        grp_items[f'ctx_{key}{j}'], omcs_items[f'ctx_{key}{j}']
                    ]) for j in range(len(answer_choices))
                ]
            questions_tokenized, question_tags = zip(*[
                _fix_tokenization(item['question'],
                                  ctx_embs[j],
                                  old_det_to_new_ind,
                                  item['objects'],
                                  token_indexers=self.token_indexers,
                                  pad_ind=0 if self.add_image_as_a_box else -1)
                for j in range(len(answer_choices))
            ])
            instance_dict['question'] = ListField(questions_tokenized)
            instance_dict['question_tags'] = ListField(question_tags)

        if omcs_items is None:
            answer_embs = [
                grp_items[f'answer_{key}{j}']
                for j in range(len(answer_choices))
            ]
        else:
            answer_embs = [
                np.hstack([
                    grp_items[f'answer_{key}{j}'],
                    omcs_items[f'answer_{key}{j}']
                ]) for j in range(len(answer_choices))
            ]
        answers_tokenized, answer_tags = zip(*[
            _fix_tokenization(answer,
                              answer_embs[j],
                              old_det_to_new_ind,
                              item['objects'],
                              token_indexers=self.token_indexers,
                              pad_ind=0 if self.add_image_as_a_box else -1)
            for j, answer in enumerate(answer_choices)
        ])

        instance_dict['answers'] = ListField(answers_tokenized)
        instance_dict['answer_tags'] = ListField(answer_tags)
        if self.split != 'test':
            instance_dict['label'] = LabelField(item['{}_label'.format(
                self.mode)],
                                                skip_indexing=True)
        instance_dict['metadata'] = MetadataField({
            'annot_id':
            item['annot_id'],
            'ind':
            index,
            'movie':
            item['movie'],
            'img_fn':
            item['img_fn'],
            'question_number':
            item['question_number']
        })

        ###################################################################
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
        image, window, img_scale, padding = resize_image(
            image, random_pad=self.is_train)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape

        ###################################################################
        # Load boxes.
        with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
            metadata = json.load(f)

        # [nobj, 14, 14]
        segms = np.stack([
            make_mask(mask_size=14,
                      box=metadata['boxes'][i],
                      polygons_list=metadata['segms'][i]) for i in dets2use
        ])

        # Chop off the final dimension, that's the confidence
        boxes = np.array(metadata['boxes'])[dets2use, :-1]
        # Possibly rescale them if necessary
        boxes *= img_scale
        boxes[:, :2] += np.array(padding[:2])[None]
        boxes[:, 2:] += np.array(padding[:2])[None]
        obj_labels = [
            self.coco_obj_to_ind[item['objects'][i]]
            for i in dets2use.tolist()
        ]
        if self.add_image_as_a_box:
            boxes = np.row_stack((window, boxes))
            segms = np.concatenate((np.ones(
                (1, 14, 14), dtype=np.float32), segms), 0)
            obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

        instance_dict['segms'] = ArrayField(segms, padding_value=0)
        instance_dict['objects'] = ListField(
            [LabelField(x, skip_indexing=True) for x in obj_labels])

        if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
            import ipdb
            ipdb.set_trace()
        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))
        instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

        instance = Instance(instance_dict)
        # instance.index_fields(self.vocab)
        return image, instance
Example #6
    def __getitem__(self, index):
        # if self.split == 'test':
        #     raise ValueError("blind test mode not supported quite yet")
        item = deepcopy(self.items[index])

        ###################################################################
        # Load questions and answers
        if self.mode == 'rationale':
            conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
            item['question'] += item['answer_choices'][conditioned_label]

        answer_choices = item['{}_choices'.format(self.mode)]
        dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

        ###################################################################
        # Load in BERT. We'll get contextual representations of the context and the answer choices
        # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
        with h5py.File(self.h5fn, 'r') as h5:
            grp_items = {k: np.array(v, dtype=np.float16) for k, v in h5[str(index)].items()}

        # Essentially we need to condition on the right answer choice here, if we're doing QA->R. We will always
        # condition on the `conditioned_answer_choice`.
        condition_key = self.conditioned_answer_choice if self.split == "test" and self.mode == "rationale" else ""

        instance_dict = {}
        if 'endingonly' not in self.embs_to_load:
            questions_tokenized, question_tags = zip(*[_fix_tokenization(
                item['question'],
                grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                token_indexers=self.token_indexers,
                pad_ind=0 if self.add_image_as_a_box else -1
            ) for i in range(4)])
            instance_dict['question'] = ListField(questions_tokenized)
            instance_dict['question_tags'] = ListField(question_tags)

        answers_tokenized, answer_tags = zip(*[_fix_tokenization(
            answer,
            grp_items[f'answer_{self.mode}{condition_key}{i}'],
            old_det_to_new_ind,
            item['objects'],
            token_indexers=self.token_indexers,
            pad_ind=0 if self.add_image_as_a_box else -1
        ) for i, answer in enumerate(answer_choices)])

        instance_dict['answers'] = ListField(answers_tokenized)
        instance_dict['answer_tags'] = ListField(answer_tags)
        if self.split != 'test':
            instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
        instance_dict['metadata'] = MetadataField({'annot_id': item['annot_id'], 'ind': index, 'movie': item['movie'],
                                                   'img_fn': item['img_fn'],
                                                   'question_number': item['question_number']})

        ###################################################################
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        image_id = self.path2id[item['img_fn']]
        image_id_gt = self.path2id_gt[item['img_fn']]
        features, num_boxes, boxes, _ = self._image_features_reader[image_id]
        boxes = boxes[:num_boxes]
        features = features[:num_boxes]
        gt_features, gt_num_boxes, gt_boxes, _ = self._gt_image_features_reader[image_id_gt]

        # Row 0 of each feature set is presumably the whole-image feature; merge the two sets'
        # global features with a box-count-weighted average.
        features[0] = (features[0] * num_boxes + gt_features[0] * gt_num_boxes) / (num_boxes + gt_num_boxes)

        # merge two boxes, and assign the labels.
        gt_boxes = gt_boxes[1:gt_num_boxes]
        gt_features = gt_features[1:gt_num_boxes]
        gt_num_boxes = gt_num_boxes - 1

        gt_box_preserve = min(self._max_region_num - 1, gt_num_boxes)
        gt_boxes = gt_boxes[:gt_box_preserve]
        gt_features = gt_features[:gt_box_preserve]
        gt_num_boxes = gt_box_preserve

        num_box_preserve = min(self._max_region_num - int(gt_num_boxes), int(num_boxes))
        boxes = boxes[:num_box_preserve]
        features = features[:num_box_preserve]

        # concatenate the boxes
        mix_boxes = np.concatenate((boxes, gt_boxes), axis=0)
        mix_features = np.concatenate((features, gt_features), axis=0)
        mix_num_boxes = num_box_preserve + int(gt_num_boxes)

        image_mask = [1] * (mix_num_boxes)
        while len(image_mask) < self._max_region_num:
            image_mask.append(0)
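        # Pad boxes and features up to _max_region_num; image_mask marks which rows are real.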
        #
        mix_boxes_pad = np.zeros((self._max_region_num, 5))
        mix_features_pad = np.zeros((self._max_region_num, 3072))
        #
        mix_boxes_pad[:mix_num_boxes] = mix_boxes[:mix_num_boxes]
        mix_features_pad[:mix_num_boxes] = mix_features[:mix_num_boxes]

        # appending the target feature.
        features = torch.tensor(mix_features_pad).float()
        image_mask = torch.tensor(image_mask).long()
        # spatials = torch.tensor(mix_boxes).float()
        spatials = mix_boxes_pad


        # Load boxes.
        with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
            metadata = json.load(f)

        # [nobj, 14, 14]
        segms = np.stack([make_mask(mask_size=14, box=metadata['boxes'][i], polygons_list=metadata['segms'][i])
                          for i in dets2use])

        # # Chop off the final dimension, that's the confidence
        # boxes = np.array(metadata['boxes'])[dets2use, :-1]
        # # Possibly rescale them if necessary
        # boxes *= img_scale
        # boxes[:, :2] += np.array(padding[:2])[None]
        # boxes[:, 2:] += np.array(padding[:2])[None]
        obj_labels = [self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()]
        # if self.add_image_as_a_box:
        #     boxes = np.row_stack((window, boxes))
        #     segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)
        #     obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

        instance_dict['segms'] = ArrayField(segms, padding_value=0)
        instance_dict['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])

        # if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
        #     import ipdb
        #     ipdb.set_trace()
        # assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        # assert np.all((boxes[:, 2] <= w))
        # assert np.all((boxes[:, 3] <= h))
        instance_dict['boxes'] = ArrayField(spatials, padding_value=-1)
        instance_dict['box_mask'] = ArrayField(image_mask, padding_value=-1)

        instance = Instance(instance_dict)
        instance.index_fields(self.vocab)
        return features, instance