Example #1
0
    def __getitem__(self, index):

        if self.complete_shuffle:
            # Each item expands into several sub-examples; take the remainder first,
            # then recover the underlying item index.
            if self.pretraining_include_qa_and_qar:
                which = index % 8
                index = index // 8
            else:
                which = index % 4
                index = index // 4
        else:
            which = None

        item = deepcopy(self.items[index])

        ###################################################################
        # Load questions and answers

        answer_choices = item['{}_choices'.format(self.mode)]

        only_use_answer = self.complete_shuffle and which < 4
        only_use_qar = self.complete_shuffle and which >= 4

        dets2use, old_det_to_new_ind = self._get_dets_to_use(
            item, only_use_answer=only_use_answer, only_use_qar=only_use_qar)

        # The only_use_qar flag is ambiguous...

        instance_dict = {}
        if self.split != 'test':
            instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
        instance_dict['metadata'] = MetadataField({
            'annot_id': item['annot_id'],
            'ind': index,
            'movie': item['movie'],
            'img_fn': item['img_fn'],
            'question_number': item['question_number']
        })

        ###################################################################
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        image = load_image(os.path.join(self.vcr_image_dir, item['img_fn']))
        #image = self.imagedatas(item['img_fn'])

        image, window, img_scale, padding = resize_image(
            image, random_pad=self.is_train)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape

        ###################################################################
        # Load boxes.
        with open(os.path.join(self.vcr_image_dir, item['metadata_fn']),
                  'r') as f:
            metadata = json.load(f)

        # [nobj, 14, 14]
        segms = np.stack([
            make_mask(mask_size=14,
                      box=metadata['boxes'][i],
                      polygons_list=metadata['segms'][i]) for i in dets2use
        ])

        # Chop off the final dimension, that's the confidence
        boxes = np.array(metadata['boxes'])[dets2use, :-1]
        # Possibly rescale them if necessary
        boxes *= img_scale
        boxes[:, :2] += np.array(padding[:2])[None]
        boxes[:, 2:] += np.array(padding[:2])[None]
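        # Boxes are [x1, y1, x2, y2]; multiplying by img_scale and adding the padding
        # offsets to both corners maps them into the resized, padded image's frame.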
        obj_labels = [
            self.coco_obj_to_ind[item['objects'][i]]
            for i in dets2use.tolist()
        ]
        if self.add_image_as_a_box:
            boxes = np.row_stack((window, boxes))
            segms = np.concatenate((np.ones(
                (1, 14, 14), dtype=np.float32), segms), 0)
            obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

        examples = data_iter_item(
            item,
            tokenizer=self.tokenizer,
            max_seq_length=self.max_seq_length,
            endingonly=False,
            include_qar=self.pretraining_include_qa_and_qar,
            only_qar=self.only_qar)
        self.getitem_bert_part(examples, item, instance_dict, which)

        if self.use_alignment:  # Alignment between objects and text
            ######################
            examples_alignment_pack = []
            for i in range(len(examples)):
                if self.pretraining_include_qa_and_qar:
                    if i < 4:
                        raw_text_a = item["question"]
                        raw_text_b = item['answer_choices'][i]
                    else:
                        raw_text_a = item["question"] + item['answer_choices'][
                            item['answer_label']]
                        raw_text_b = item['rationale_choices'][i - 4]
                elif self.only_qar:
                    raw_text_a = item["question"] + item['answer_choices'][item[
                        'answer_label']]  # This is the correct alignment right now.
                    raw_text_b = item['rationale_choices'][i]
                else:
                    raw_text_a = item["question"]
                    raw_text_b = item['answer_choices'][i]

                true_text_a = examples[i][0].text_a
                true_text_b = examples[i][0].text_b
                text_alignment_a = examples[i][1]
                text_alignment_b = examples[i][2]

                examples_alignment_pack.append(
                    (raw_text_a, raw_text_b, true_text_a, true_text_b,
                     text_alignment_a, text_alignment_b))

            image_box_position = []

            if which is not None:
                raw_text_a, raw_text_b, true_text_a, true_text_b, text_alignment_a, text_alignment_b = examples_alignment_pack[which]
                box_record = defaultdict(list)
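                # The offsets presumably account for BERT-style special tokens: text_a
                # starts after [CLS] (offset=1) and text_b after [CLS] + text_a + [SEP]
                # (offset = 1 + len(text_alignment_a) + 1).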
                self.get_alignment_original(raw_text_a,
                                            text_alignment_a,
                                            old_det_to_new_ind,
                                            box_record,
                                            offset=1)
                self.get_alignment_original(raw_text_b,
                                            text_alignment_b,
                                            old_det_to_new_ind,
                                            box_record,
                                            offset=1 + len(text_alignment_a) +
                                            1)
                image_text_alignment = ListField([
                    IntArrayField(np.array(box_record[i]), padding_value=-1)
                    for i in range(len(boxes))
                ])
            else:
                for raw_text_a, raw_text_b, true_text_a, true_text_b, text_alignment_a, text_alignment_b in examples_alignment_pack:

                    box_record = defaultdict(list)
                    self.get_alignment_original(raw_text_a,
                                                text_alignment_a,
                                                old_det_to_new_ind,
                                                box_record,
                                                offset=1)
                    self.get_alignment_original(raw_text_b,
                                                text_alignment_b,
                                                old_det_to_new_ind,
                                                box_record,
                                                offset=1 +
                                                len(text_alignment_a) + 1)

                    image_box_position.append(
                        ListField([
                            IntArrayField(np.array(box_record[i]),
                                          padding_value=-1)
                            for i in range(len(boxes))
                        ]))

                image_text_alignment = ListField(image_box_position)
            ######################

            instance_dict["image_text_alignment"] = image_text_alignment

        instance_dict['segms'] = ArrayField(segms, padding_value=0)
        instance_dict['objects'] = ListField(
            [LabelField(x, skip_indexing=True) for x in obj_labels])

        if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
            import ipdb
            ipdb.set_trace()
        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))
        instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

        instance = Instance(instance_dict)
        instance.index_fields(self.vocab)
        return image, instance
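The complete_shuffle indexing above is easy to get backwards, so here is a minimal, self-contained sketch of the intended decomposition (the 8-way expansion and the sample values are assumptions for illustration, not taken from the dataset):

# Sketch: map a flat dataset index to (item index, sub-example index) when each
# item expands into 8 sub-examples (4 answer choices, then 4 rationale choices).
def decompose(flat_index, include_qa_and_qar=True):
    n_sub = 8 if include_qa_and_qar else 4
    which = flat_index % n_sub        # which sub-example within the item
    item_index = flat_index // n_sub  # which underlying VCR item
    return item_index, which

assert decompose(0) == (0, 0)   # first answer of the first item
assert decompose(7) == (0, 7)   # last rationale of the first item
assert decompose(8) == (1, 0)   # first answer of the second item
assert decompose(11, include_qa_and_qar=False) == (2, 3)

Taking the remainder before the integer division is what makes `which` pick the sub-example rather than an artifact of the quotient.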
Example #2
0
    def __getitem__(self, index):
        # if self.split == 'test':
        #     raise ValueError("blind test mode not supported quite yet")
        item = deepcopy(self.items[index])

        image_id = int(item['img_id'].split('-')[-1])
        anno_id = str(item['annot_id'].split('-')[-1])
        '''        with h5py.File(self.tag_feature_path, 'r') as h5:
            tag_features = np.array(h5[str(image_id)]['features'], dtype=np.float32)
            tag_boxes = np.array(h5[str(image_id)]['boxes'], dtype=np.float32)
            tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'], dtype=np.int)
        '''

        ###################################################################
        # Load questions and answers
        if self.mode == 'rationale':
            conditioned_label = item[
                'answer_label'] if self.split != 'test' else self.conditioned_answer_choice
            item['question'] += item['answer_choices'][conditioned_label]

        answer_choices = item['{}_choices'.format(self.mode)]
        dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

        ###################################################################
        # Load in BERT. We'll get contextual representations of the context and the answer choices
        # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
        with h5py.File(self.h5fn, 'r') as h5:
            grp_items = {
                k: np.array(v, dtype=np.float16)
                for k, v in h5[str(index)].items()
            }

        # Essentially we need to condition on the right answer choice here, if we're doing QA->R. We will always
        # condition on the `conditioned_answer_choice.`
        condition_key = self.conditioned_answer_choice if self.split == "test" and self.mode == "rationale" else ""

        instance_dict = {}
        if 'endingonly' not in self.embs_to_load:
            questions_tokenized, question_tags = zip(*[
                _fix_tokenization(
                    item['question'],
                    grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                    old_det_to_new_ind,
                    item['objects'],
                    token_indexers=self.token_indexers,
                    pad_ind=0 if self.add_image_as_a_box else -1)
                for i in range(4)
            ])
            instance_dict['question'] = ListField(questions_tokenized)
            instance_dict['question_tags'] = ListField(question_tags)

        answers_tokenized, answer_tags = zip(*[
            _fix_tokenization(
                answer,
                grp_items[f'answer_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                token_indexers=self.token_indexers,
                pad_ind=0 if self.add_image_as_a_box else -1)
            for i, answer in enumerate(answer_choices)
        ])

        instance_dict['answers'] = ListField(answers_tokenized)
        instance_dict['answer_tags'] = ListField(answer_tags)
        if self.split != 'test':
            instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
        instance_dict['metadata'] = MetadataField({
            'annot_id': item['annot_id'],
            'ind': index,
            'movie': item['movie'],
            'img_fn': item['img_fn'],
            'question_number': item['question_number']
        })
        ########## using kg

        ##node

        node_tokenized, node_tags = zip(*[
            _fix_word(i,
                      index,
                      item['annot_id'],
                      self.h5fn_graph,
                      self.h5fn_word,
                      pad_ind=0) for i in range(4)
        ])
        instance_dict['node'] = ListField(node_tokenized)

        ##visual concept
        visual_concept_tokenized, visual_concept_tags = zip(*[
            _fix_visual_concept(item['visual_concept'],
                                item['visual_concept_num'],
                                self.h5fn_word,
                                pad_ind=0) for i in range(4)
        ])
        instance_dict['visual_concept'] = ListField(visual_concept_tokenized)

        ##adj
        adj_result, adj_len = zip(*[
            _fix_adj(i, index, item['annot_id'], self.h5fn_graph, pad_ind=0)
            for i in range(4)
        ])
        instance_dict['adjacent'] = ListField(adj_result)

        ###################################################################
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        # image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
        # image, window, img_scale, padding = resize_image(image, random_pad=self.is_train)
        # image = to_tensor_and_normalize(image)
        # c, h, w = image.shape

        ###################################################################
        # Load boxes.
        with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
            metadata = json.load(f)

        # [nobj, 14, 14]
        # segms = np.stack([make_mask(mask_size=14, box=metadata['boxes'][i], polygons_list=metadata['segms'][i])
        # for i in dets2use])
        boxes = np.array(metadata['boxes'])[dets2use, :-1]

        #print('tag_features is box ',index, "___",len(boxes))
        with h5py.File(self.tag_feature_path, 'r') as h5:
            num_boxes = np.array(h5[str(anno_id)]['boxes'],
                                 dtype=np.float32).shape[0]
            tag_features = np.zeros([4, num_boxes, 1024])
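            # Presumably one (num_boxes, 1024) feature matrix per answer choice,
            # stored under 'features0'..'features3' for this annotation id.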
            for m in range(4):
                tag_features[m, :, :] = np.array(h5[str(anno_id)]['features' +
                                                                  str(m)],
                                                 dtype=np.float32)

            #tag_features = np.stack(tag_features,np.array(h5[str(anno_id)]['features3'], dtype=np.float32))
            #tag_boxes = np.array(h5[str(image_id)]['boxes'], dtype=np.float32)
            #tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'], dtype=np.int)

        # Chop off the final dimension, that's the confidence

        # Possibly rescale them if necessary
        # boxes *= img_scale
        # boxes[:, :2] += np.array(padding[:2])[None]
        # boxes[:, 2:] += np.array(padding[:2])[None]
        # obj_labels = [self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()]
        if self.add_image_as_a_box:
            boxes = np.row_stack(([1, 1, 700, 700], boxes))
            # segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)
            # obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

        # instance_dict['segms'] = ArrayField(segms, padding_value=0)
        # instance_dict['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])

        # if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
        # import ipdb
        # ipdb.set_trace()
        # assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        # assert np.all((boxes[:, 2] <= w))
        # assert np.all((boxes[:, 3] <= h))
        instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

        instance = Instance(instance_dict)
        instance.index_fields(self.vocab)

        # dean addition
        if self.add_image_as_a_box:
            dets2use = dets2use + 1
            dets2use = np.insert(dets2use, 0, 0)
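            # Shift every detection index by one and prepend index 0 so the
            # whole-image (background) row is kept alongside the used detections.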
            # temp = [0]
            # for det_idx in (dets2use+1):
            # temp.append(det_idx)
            # dets2use = np.array(temp)

        final_tag_features = np.zeros([4, len(dets2use), 1024])
        #print(final_tag_features.shape)
        for k in range(final_tag_features.shape[0]):
            convert_ = tag_features[k]
            #print('convert_ : ', convert_.shape, '___det2 : ', len(dets2use))
            convert_2 = convert_[dets2use]
            #print('___convert22 : ',convert_2.shape)

            #print(dets2use)
            #convert_ = convert_[dets2use]

            final_tag_features[k] = convert_2
        #print('fffffinal!! ',final_tag_features.shape)
        assert (final_tag_features[0].shape[0] == boxes.shape[0])
        instance_dict['det_features'] = ArrayField(final_tag_features,
                                                   padding_value=0)
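        # Note: det_features is added to instance_dict after Instance() was built above;
        # this relies on AllenNLP's Instance keeping a reference to the fields dict, and
        # works only because ArrayField needs no vocabulary indexing.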
        return None, instance
Example #3
0
    def __getitem__(self, index):
        # if self.split == 'test':
        #     raise ValueError("blind test mode not supported quite yet")
        item = deepcopy(self.items[index])

        ###################################################################
        # Load questions and answers
        if self.mode == 'rationale':
            conditioned_label = item[
                'answer_label'] if self.split != 'test' else self.conditioned_answer_choice
            item['question'] += item['answer_choices'][conditioned_label]

        answer_choices = item['{}_choices'.format(self.mode)]
        dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

        ###################################################################
        # Load in BERT. We'll get contextual representations of the context and the answer choices
        # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
        with h5py.File(self.h5fn, 'r') as h5:
            grp_items = {
                k: np.array(v, dtype=np.float16)
                for k, v in h5[str(index)].items()
            }

        # Essentially we need to condition on the right answer choice here, if we're doing QA->R. We will always
        # condition on the `conditioned_answer_choice.`
        condition_key = self.conditioned_answer_choice if self.split == "test" and self.mode == "rationale" else ""

        instance_dict = {}
        if 'endingonly' not in self.embs_to_load:
            questions_tokenized, question_tags = zip(*[
                _fix_tokenization(
                    item['question'],
                    grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                    old_det_to_new_ind,
                    item['objects'],
                    token_indexers=self.token_indexers,
                    pad_ind=0 if self.add_image_as_a_box else -1)
                for i in range(4)
            ])
            instance_dict['question'] = ListField(questions_tokenized)
            instance_dict['question_tags'] = ListField(question_tags)

        answers_tokenized, answer_tags = zip(*[
            _fix_tokenization(
                answer,
                grp_items[f'answer_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                token_indexers=self.token_indexers,
                pad_ind=0 if self.add_image_as_a_box else -1)
            for i, answer in enumerate(answer_choices)
        ])

        instance_dict['answers'] = ListField(answers_tokenized)

        instance_dict['answer_tags'] = ListField(answer_tags)
        if self.split != 'test':
            instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
        instance_dict['metadata'] = MetadataField({
            'annot_id': item['annot_id'],
            'ind': index,
            'movie': item['movie'],
            'img_fn': item['img_fn'],
            'question_number': item['question_number'],
            'img_id': item['img_id']
        })

        ###################################################################
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
        image, window, img_scale, padding = resize_image(
            image, random_pad=self.is_train)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape

        ###################################################################
        # Load boxes.
        with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
            metadata = json.load(f)

        # [nobj, 14, 14]
        segms = np.stack([
            make_mask(mask_size=14,
                      box=metadata['boxes'][i],
                      polygons_list=metadata['segms'][i]) for i in dets2use
        ])

        # Chop off the final dimension, that's the confidence
        boxes = np.array(metadata['boxes'])[dets2use, :-1]

        # Possibly rescale them if necessary
        boxes *= img_scale
        boxes[:, :2] += np.array(padding[:2])[None]
        boxes[:, 2:] += np.array(padding[:2])[None]
        obj_labels = [
            self.coco_obj_to_ind[item['objects'][i]]
            for i in dets2use.tolist()
        ]
        if self.add_image_as_a_box:
            boxes = np.row_stack((window, boxes))
            segms = np.concatenate((np.ones(
                (1, 14, 14), dtype=np.float32), segms), 0)
            obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

        instance_dict['segms'] = ArrayField(segms, padding_value=0)
        instance_dict['objects'] = ListField(
            [LabelField(x, skip_indexing=True) for x in obj_labels])

        if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
            import ipdb
            ipdb.set_trace()
        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))

        instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

        instance = Instance(instance_dict)
        instance.index_fields(self.vocab)
        return image, instance
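Example #3 (like #1) rescales the detection boxes into the resized, padded image before the sanity asserts. A tiny numpy sketch with made-up scale and padding values, mirroring the lines above:

import numpy as np

# Hypothetical values: two [x1, y1, x2, y2] boxes, a 2x upscale, and
# (left, top, right, bottom) padding of the resized image.
boxes = np.array([[10., 20., 50., 60.],
                  [ 0.,  0., 30., 40.]])
img_scale = 2.0
padding = (5, 7, 0, 0)

boxes = boxes * img_scale
boxes[:, :2] += np.array(padding[:2])[None]   # shift x1, y1
boxes[:, 2:] += np.array(padding[:2])[None]   # shift x2, y2

# Same sanity checks as in the loader: corners ordered, nothing negative.
assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))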
Example #4
0
    def _index_instance(self, instance: Instance) -> Instance:
        self.reader.apply_token_indexers(instance)
        assert self._vocab is not None
        instance.index_fields(self._vocab)
        return instance
Example #5
0
    def __getitem__(self, index):
        # if self.split == 'test':
        #     raise ValueError("blind test mode not supported quite yet")
        item = deepcopy(self.items[index])
        image_id = int(item['img_id'].split('-')[-1])
    

        with h5py.File(self.tag_feature_path, 'r') as h5:
            tag_features = np.array(h5[str(image_id)]['features'], dtype=np.float32)
            tag_boxes = np.array(h5[str(image_id)]['boxes'], dtype=np.float32)
            tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'], dtype=np.int64)  # np.int was removed from NumPy

        with h5py.File(self.non_tag_feature_path, 'r') as h5:
            non_tag_boxes = np.array(h5[str(image_id)]['boxes'], dtype=np.float32)
            non_tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'], dtype=np.int64)
            non_tag_features = np.array(h5[str(image_id)]['features'], dtype=np.float32)
        ###################################################################
        # Load questions and answers

        non_tag_question_annotid2detidx = self.non_tag_question_annotid2detidx[item['annot_id']]
        non_tag_answer_annotid2detidx = self.non_tag_answer_annotid2detidx[item['annot_id']]
        non_tag_rationale_annotid2detidx = self.non_tag_rationale_annotid2detidx[item['annot_id']]
        
        if self.mode == 'answer':
            question_annotid2detidx =  non_tag_question_annotid2detidx
            answer_annotid2detidx = non_tag_answer_annotid2detidx
        else:
            conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
            q_len = len(item['question'])
            question_annotid2detidx = {}
            for k,v in non_tag_question_annotid2detidx.items():
                question_annotid2detidx[k] = v
            for k,v in non_tag_answer_annotid2detidx[conditioned_label].items():
                question_annotid2detidx[k+q_len] = v
            answer_annotid2detidx = non_tag_rationale_annotid2detidx

        if self.mode == 'rationale':
            conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
            item['question'] += item['answer_choices'][conditioned_label]

        answer_choices = item['{}_choices'.format(self.mode)]
        dets2use, old_det_to_new_ind = self._get_dets_to_use(item)
        non_tag_dets2use, non_tag_old_det_to_new_ind = self._get_non_tag_det_to_use(question_annotid2detidx, answer_annotid2detidx, len(non_tag_boxes))

        if self.add_image_as_a_box:
            assert (len(dets2use) == np.max(old_det_to_new_ind))

        if self.add_image_as_a_box:
            non_tag_old_det_to_new_ind += 1

        # Shift the non_tag detection indices, effectively appending the non_tag detections after the tag detections
        non_tag_old_det_to_new_ind[np.where(non_tag_old_det_to_new_ind)[0]] += len(dets2use)
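        # Only the nonzero entries (detections that are actually referenced) are shifted
        # past the tag detections; zero entries presumably mark unused slots.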

        old_det_to_new_ind = old_det_to_new_ind.tolist()
        non_tag_old_det_to_new_ind = non_tag_old_det_to_new_ind.tolist()
        ###################################################################
        # Load in BERT. We'll get contextual representations of the context and the answer choices
        # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
        with h5py.File(self.h5fn, 'r') as h5:
            grp_items = {k: np.array(v, dtype=np.float16) for k, v in h5[str(index)].items()}

        # Essentially we need to condition on the right answer choice here, if we're doing QA->R. We will always
        # condition on the `conditioned_answer_choice.`
        condition_key = self.conditioned_answer_choice if self.split == "test" and self.mode == "rationale" else ""

        instance_dict = {}
        if 'endingonly' not in self.embs_to_load:
            questions_tokenized, question_tags = zip(*[_my_fix_tokenization(
                item['question'],
                grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                non_tag_old_det_to_new_ind,
                question_annotid2detidx,
                token_indexers=self.token_indexers,
                pad_ind=0 if self.add_image_as_a_box else -1,
            ) for i in range(4)])
            instance_dict['question'] = ListField(questions_tokenized)
            instance_dict['question_tags'] = ListField(question_tags)

        answers_tokenized, answer_tags = zip(*[_my_fix_tokenization(
            answer,
            grp_items[f'answer_{self.mode}{condition_key}{i}'],
            old_det_to_new_ind,
            item['objects'],
            non_tag_old_det_to_new_ind,
            answer_annotid2detidx[i],
            token_indexers=self.token_indexers,
            pad_ind=0 if self.add_image_as_a_box else -1,
        ) for i, answer in enumerate(answer_choices)])

        instance_dict['answers'] = ListField(answers_tokenized)
        instance_dict['answer_tags'] = ListField(answer_tags)
        if self.split != 'test':
            instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
        instance_dict['metadata'] = MetadataField({'annot_id': item['annot_id'], 'ind': index, 'movie': item['movie'],
                                                   'img_fn': item['img_fn'],
                                                   'question_number': item['question_number'],
                                                   'img_id':item['img_id']})

        ##node
        node_tokenized, node_tags = zip(*[_fix_word(
            i,
            index,
            item['annot_id'],
            self.h5fn_graph,
            self.h5fn_word,
            pad_ind=0
        ) for i in range(4)])
        instance_dict['node'] = ListField(node_tokenized)

        ##visual concept
        visual_concept_tokenized, visual_concept_tags = zip(*[_fix_visual_concept(
            item['visual_concept'],
            item['visual_concept_num'],
            self.h5fn_word,
            pad_ind=0
        ) for i in range(4)])
        instance_dict['visual_concept'] = ListField(visual_concept_tokenized)

        ##adj
        adj_result, adj_len = zip(*[_fix_adj(
            i,
            index,
            item['annot_id'],
            self.h5fn_graph,
            pad_ind=0
        ) for i in range(4)])
        instance_dict['adjacent'] = ListField(adj_result)

        ###################################################################
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        #image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
        #image, window, img_scale, padding = resize_image(image, random_pad=self.is_train)
        #image = to_tensor_and_normalize(image)
        #c, h, w = image.shape
        ###################################################################
        # Load boxes.
        with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
            metadata = json.load(f)

        # Chop off the final dimension, that's the confidence
        tag_boxes = np.array(metadata['boxes'])[dets2use, :-1]
        if self.add_image_as_a_box:
            tag_boxes = np.row_stack(([1,1,700,700], tag_boxes)) # here we just use dummy box for background
        non_tag_boxes = non_tag_boxes[non_tag_dets2use]
        boxes = np.concatenate((tag_boxes, non_tag_boxes))

        if self.add_image_as_a_box:
            dets2use = dets2use + 1
            dets2use = np.insert(dets2use, 0, 0)

        tag_det_features = tag_features[dets2use]
        non_tag_det_features = non_tag_features[non_tag_dets2use]
        det_features = np.concatenate((tag_det_features, non_tag_det_features))

        instance_dict['det_features'] = ArrayField(det_features, padding_value=0)
        assert (det_features.shape[0] == boxes.shape[0])

        instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

        instance = Instance(instance_dict)
        instance.index_fields(self.vocab)
        return None, instance
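The loader above appends non-tag detections after the tag detections by shifting the non-tag index map. A toy numpy sketch of that remapping (the index values and feature shapes are invented):

import numpy as np

# Hypothetical inputs: 3 tag detections are kept; the non-tag map sends old
# detection 0 to slot 1 and old detection 2 to slot 2 (0 marks an unused slot).
dets2use = np.array([2, 5, 7])
non_tag_old_det_to_new_ind = np.array([1, 0, 2])

# Shift every used non-tag slot past the tag detections, as done above.
non_tag_old_det_to_new_ind[np.where(non_tag_old_det_to_new_ind)[0]] += len(dets2use)
assert non_tag_old_det_to_new_ind.tolist() == [4, 0, 5]

# The matching feature blocks are then concatenated in that same order.
tag_det_features = np.random.rand(len(dets2use), 1024)
non_tag_det_features = np.random.rand(2, 1024)
det_features = np.concatenate((tag_det_features, non_tag_det_features))
assert det_features.shape == (5, 1024)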
Example #6
0
    def __getitem__(self, index):
        item = json.loads(self.items[index])
        instance_dict = {}
        dets2use, old_det_to_new_ind = self._get_dets_to_use(item)
        vcr_tokenizer = VCRTokenizer(old_det_to_new_ind, item['objects'], self.add_image_as_a_box)

        ###################################### Q->A data processing below ##################################################

        with h5py.File(self.h5fn_answer, 'r') as h5:
            # Each value is an (n, 768) array; the answer file exposes keys like
            # ['answer_answer0'..'answer_answer3', 'ctx_answer0'..'ctx_answer3'],
            # and the rationale file ['answer_rationale0'..'answer_rationale3',
            # 'ctx_rationale0'..'ctx_rationale3'].
            grp_items_answer = {k: np.array(v, dtype=np.float16) for k, v in h5[str(index)].items()}

        if 'endingonly' not in self.embs_to_load:
            questions_answer_tokenized, question_answer_tags = zip(*[vcr_tokenizer(
                item['question'],
                grp_items_answer[f'ctx_answer{i}']
            ) for i in range(4)])
            instance_dict['question_answer'] = ListField(list(questions_answer_tokenized))
            instance_dict['question_answer_tags'] = ListField(list(question_answer_tags))

        answers_tokenized, answer_tags = zip(*[vcr_tokenizer(
            answer,
            grp_items_answer[f'answer_answer{i}']
        ) for i, answer in enumerate(item['answer_choices'])])

        instance_dict['answers'] = ListField(list(answers_tokenized))
        instance_dict['answer_tags'] = ListField(list(answer_tags))


        ###################################### QA->R data processing below ################################################
        with h5py.File(self.h5fn_rationale, 'r') as h5_rationale:
            grp_items_rationale = {k: np.array(v, dtype=np.float16) for k, v in h5_rationale[str(index)].items()}

        condition_key = self.conditioned_answer_choice if self.split == "test" else ""
        conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
        question_rationale = item['question'] + item['answer_choices'][conditioned_label]

        if 'endingonly' not in self.embs_to_load:
            questions_rationale_tokenized, question_rationale_tags = zip(*[vcr_tokenizer(
                question_rationale,
                grp_items_rationale[f'ctx_rationale{condition_key}{i}']
            ) for i in range(4)])
            instance_dict['question_rationale'] = ListField(list(questions_rationale_tokenized))
            instance_dict['question_rationale_tags'] = ListField(list(question_rationale_tags))

        rationale_tokenized, rationale_tags = zip(*[vcr_tokenizer(
            rationale,
            grp_items_rationale[f'answer_rationale{condition_key}{i}']
        ) for i, rationale in enumerate(item['rationale_choices'])])

        instance_dict['rationales'] = ListField(list(rationale_tokenized))
        instance_dict['rationale_tags'] = ListField(list(rationale_tags))

        #################################### Metadata handling ##################################################
        if self.split != 'test':
            instance_dict['answer_label'] = LabelField(item['answer_label'], skip_indexing=True)
            instance_dict['rationale_label'] = LabelField(item['rationale_label'], skip_indexing=True)
        # instance_dict['metadata'] = MetadataField({'annot_id': item['annot_id'], 'ind': index, 'movie': item['movie'],
        #                                            'img_fn': item['img_fn'],
        #                                            'question_number': item['question_number']})

        ########################################## Image feature processing ########################################################
        with h5py.File(self.h5fn_image, 'r') as h5_features:
            # pytorch 1.1
            img_id = item['img_id'].split('-')[-1]
            group_image = {k: np.array(v) for k, v in h5_features[img_id].items()}
            image_feature = group_image['features'][[0]+(dets2use+1).tolist()]
            tag_boxes = group_image['boxes']
        zeros = np.zeros((1,2048), dtype=np.float32)
        if self.add_image_as_a_box:
            image_feature = np.concatenate((zeros, image_feature), axis=0)
        else:
            image_feature = np.concatenate((zeros, image_feature[1:]), axis=0)
        instance_dict['image_features'] = ArrayField(image_feature, padding_value=0)

        ###################################################################
        # Load boxes.
        with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
            metadata = json.load(f)

        # Chop off the final dimension, that's the confidence
        boxes = np.array(metadata['boxes'])[dets2use, :-1]
        obj_labels = [self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()]
        if self.add_image_as_a_box:
            boxes = np.row_stack((boxes[0], boxes))
            obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels
        # The first object is 0
        boxes = np.row_stack((boxes[0], boxes))
        obj_labels = [81] + obj_labels

        instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)
        instance_dict['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])
        assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))

        instance = Instance(instance_dict)
        instance.index_fields(self.vocab)
        return instance
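Example #6 reads precomputed 2048-d region features from HDF5, where row 0 is treated as the whole-image feature and the detection rows follow it. A short sketch of that gather with placeholder shapes (the array contents are made up):

import numpy as np

# Hypothetical precomputed features: 1 whole-image row + 10 detection rows, 2048-d.
stored_features = np.random.rand(11, 2048).astype(np.float32)
dets2use = np.array([0, 3, 4])   # detections we keep (0-based, excluding the image row)

# Select the whole-image row plus the chosen detections (shifted past row 0).
image_feature = stored_features[[0] + (dets2use + 1).tolist()]

# Prepend a zero row, as the loader above does when the image is added as a box.
zeros = np.zeros((1, 2048), dtype=np.float32)
image_feature = np.concatenate((zeros, image_feature), axis=0)
assert image_feature.shape == (1 + 1 + len(dets2use), 2048)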
Example #7
0
    def __getitem__(self, index):
        # if self.split == 'test':
        #     raise ValueError("blind test mode not supported quite yet")
        item = deepcopy(self.items[index])

        ###################################################################
        # Load questions and answers
        if self.mode == 'rationale':
            conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
            item['question'] += item['answer_choices'][conditioned_label]

        answer_choices = item['{}_choices'.format(self.mode)]
        dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

        ###################################################################
        # Load in BERT. We'll get contextual representations of the context and the answer choices
        # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
        with h5py.File(self.h5fn, 'r') as h5:
            grp_items = {k: np.array(v, dtype=np.float16) for k, v in h5[str(index)].items()}

        # Essentially we need to condition on the right answer choice here, if we're doing QA->R. We will always
        # condition on the `conditioned_answer_choice.`
        condition_key = self.conditioned_answer_choice if self.split == "test" and self.mode == "rationale" else ""

        instance_dict = {}
        if 'endingonly' not in self.embs_to_load:
            questions_tokenized, question_tags = zip(*[_fix_tokenization(
                item['question'],
                grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                token_indexers=self.token_indexers,
                pad_ind=0 if self.add_image_as_a_box else -1
            ) for i in range(4)])
            instance_dict['question'] = ListField(questions_tokenized)
            instance_dict['question_tags'] = ListField(question_tags)

        answers_tokenized, answer_tags = zip(*[_fix_tokenization(
            answer,
            grp_items[f'answer_{self.mode}{condition_key}{i}'],
            old_det_to_new_ind,
            item['objects'],
            token_indexers=self.token_indexers,
            pad_ind=0 if self.add_image_as_a_box else -1
        ) for i, answer in enumerate(answer_choices)])

        instance_dict['answers'] = ListField(answers_tokenized)
        instance_dict['answer_tags'] = ListField(answer_tags)
        if self.split != 'test':
            instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
        instance_dict['metadata'] = MetadataField({'annot_id': item['annot_id'], 'ind': index, 'movie': item['movie'],
                                                   'img_fn': item['img_fn'],
                                                   'question_number': item['question_number']})

        ###################################################################
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        image_id = self.path2id[item['img_fn']]
        image_id_gt = self.path2id_gt[item['img_fn']]
        features, num_boxes, boxes, _ = self._image_features_reader[image_id]
        boxes = boxes[:num_boxes]
        features = features[:num_boxes]
        gt_features, gt_num_boxes, gt_boxes, _ = self._gt_image_features_reader[image_id_gt]

        features[0] = (features[0] * num_boxes + gt_features[0] * gt_num_boxes) / (num_boxes + gt_num_boxes)
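        # Row 0 of each feature set is presumably the whole-image feature; blend the two
        # global features, weighted by how many boxes each extractor produced.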

        # merge two boxes, and assign the labels.
        gt_boxes = gt_boxes[1:gt_num_boxes]
        gt_features = gt_features[1:gt_num_boxes]
        gt_num_boxes = gt_num_boxes - 1

        gt_box_preserve = min(self._max_region_num - 1, gt_num_boxes)
        gt_boxes = gt_boxes[:gt_box_preserve]
        gt_features = gt_features[:gt_box_preserve]
        gt_num_boxes = gt_box_preserve

        num_box_preserve = min(self._max_region_num - int(gt_num_boxes), int(num_boxes))
        boxes = boxes[:num_box_preserve]
        features = features[:num_box_preserve]

        # concatenate the boxes
        mix_boxes = np.concatenate((boxes, gt_boxes), axis=0)
        mix_features = np.concatenate((features, gt_features), axis=0)
        mix_num_boxes = num_box_preserve + int(gt_num_boxes)

        image_mask = [1] * (mix_num_boxes)
        while len(image_mask) < self._max_region_num:
            image_mask.append(0)
        #
        mix_boxes_pad = np.zeros((self._max_region_num, 5))
        mix_features_pad = np.zeros((self._max_region_num, 3072))
        #
        mix_boxes_pad[:mix_num_boxes] = mix_boxes[:mix_num_boxes]
        mix_features_pad[:mix_num_boxes] = mix_features[:mix_num_boxes]

        # appending the target feature.
        features = torch.tensor(mix_features_pad).float()
        image_mask = torch.tensor(image_mask).long()
        # spatials = torch.tensor(mix_boxes).float()
        spatials = mix_boxes_pad


        # Load boxes.
        with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
            metadata = json.load(f)

        # [nobj, 14, 14]
        segms = np.stack([make_mask(mask_size=14, box=metadata['boxes'][i], polygons_list=metadata['segms'][i])
                          for i in dets2use])

        # # Chop off the final dimension, that's the confidence
        # boxes = np.array(metadata['boxes'])[dets2use, :-1]
        # # Possibly rescale them if necessary
        # boxes *= img_scale
        # boxes[:, :2] += np.array(padding[:2])[None]
        # boxes[:, 2:] += np.array(padding[:2])[None]
        obj_labels = [self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()]
        # if self.add_image_as_a_box:
        #     boxes = np.row_stack((window, boxes))
        #     segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)
        #     obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

        instance_dict['segms'] = ArrayField(segms, padding_value=0)
        instance_dict['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])

        # if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
        #     import ipdb
        #     ipdb.set_trace()
        # assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        # assert np.all((boxes[:, 2] <= w))
        # assert np.all((boxes[:, 3] <= h))
        instance_dict['boxes'] = ArrayField(spatials, padding_value=-1)
        instance_dict['box_mask'] = ArrayField(image_mask, padding_value=-1)

        instance = Instance(instance_dict)
        instance.index_fields(self.vocab)
        return features, instance
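Example #7 pads the merged detector and ground-truth regions to a fixed budget and records which rows are real. A compact sketch of that padding scheme with placeholder sizes (the region count and feature width are illustrative):

import numpy as np

max_region_num = 6    # placeholder for self._max_region_num
feature_dim = 3072    # matches the loader above

# Hypothetical merged regions: 2 detector boxes + 2 ground-truth boxes.
mix_boxes = np.random.rand(4, 5)
mix_features = np.random.rand(4, feature_dim)
mix_num_boxes = mix_boxes.shape[0]

# 1 marks a real region, 0 marks padding.
image_mask = [1] * mix_num_boxes
while len(image_mask) < max_region_num:
    image_mask.append(0)

mix_boxes_pad = np.zeros((max_region_num, 5))
mix_features_pad = np.zeros((max_region_num, feature_dim))
mix_boxes_pad[:mix_num_boxes] = mix_boxes
mix_features_pad[:mix_num_boxes] = mix_features

assert image_mask == [1, 1, 1, 1, 0, 0]
assert mix_features_pad.shape == (max_region_num, feature_dim)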