def __getitem__(self, index):
    if self.complete_shuffle:
        # Each item expands into several sub-examples, so split the flat index.
        # NB: take the remainder *before* the floor division; computing `which`
        # from the already-divided index would pick the wrong sub-example.
        if self.pretraining_include_qa_and_qar:
            which = index % 8
            index = index // 8
        else:
            which = index % 4
            index = index // 4
    else:
        which = None

    item = deepcopy(self.items[index])

    ###################################################################
    # Load questions and answers
    answer_choices = item['{}_choices'.format(self.mode)]

    # `which` < 4 selects the Q->A sub-examples; `which` >= 4 selects the QA->R ones.
    only_use_answer = bool(self.complete_shuffle and which < 4)
    only_use_qar = bool(self.complete_shuffle and which >= 4)

    dets2use, old_det_to_new_ind = self._get_dets_to_use(
        item, only_use_answer=only_use_answer, only_use_qar=only_use_qar)
    # The only_use_qar flag is ambiguous...

    instance_dict = {}
    if self.split != 'test':
        instance_dict['label'] = LabelField(
            item['{}_label'.format(self.mode)], skip_indexing=True)
    instance_dict['metadata'] = MetadataField({
        'annot_id': item['annot_id'],
        'ind': index,
        'movie': item['movie'],
        'img_fn': item['img_fn'],
        'question_number': item['question_number']
    })

    ###################################################################
    # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
    image = load_image(os.path.join(self.vcr_image_dir, item['img_fn']))
    # image = self.imagedatas(item['img_fn'])
    image, window, img_scale, padding = resize_image(image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    c, h, w = image.shape

    ###################################################################
    # Load boxes.
    with open(os.path.join(self.vcr_image_dir, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)

    # [nobj, 14, 14]
    segms = np.stack([
        make_mask(mask_size=14,
                  box=metadata['boxes'][i],
                  polygons_list=metadata['segms'][i]) for i in dets2use
    ])

    # Chop off the final dimension, that's the confidence
    boxes = np.array(metadata['boxes'])[dets2use, :-1]
    # Possibly rescale them if necessary
    boxes *= img_scale
    boxes[:, :2] += np.array(padding[:2])[None]
    boxes[:, 2:] += np.array(padding[:2])[None]

    obj_labels = [self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()]
    if self.add_image_as_a_box:
        boxes = np.row_stack((window, boxes))
        segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)
        obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

    examples = data_iter_item(
        item,
        tokenizer=self.tokenizer,
        max_seq_length=self.max_seq_length,
        endingonly=False,
        include_qar=self.pretraining_include_qa_and_qar,
        only_qar=self.only_qar)
    self.getitem_bert_part(examples, item, instance_dict, which)

    if self.use_alignment:
        # Alignment between objects and text
        ######################
        examples_alignment_pack = []
        for i in range(len(examples)):
            if self.pretraining_include_qa_and_qar:
                if i < 4:
                    raw_text_a = item['question']
                    raw_text_b = item['answer_choices'][i]
                else:
                    raw_text_a = item['question'] + item['answer_choices'][item['answer_label']]
                    raw_text_b = item['rationale_choices'][i - 4]
            elif self.only_qar:
                raw_text_a = item['question'] + item['answer_choices'][item['answer_label']]
                # This is the correct alignment right now.
                raw_text_b = item['rationale_choices'][i]
            else:
                raw_text_a = item['question']
                raw_text_b = item['answer_choices'][i]

            true_text_a = examples[i][0].text_a
            true_text_b = examples[i][0].text_b
            text_alignment_a = examples[i][1]
            text_alignment_b = examples[i][2]
            examples_alignment_pack.append(
                (raw_text_a, raw_text_b, true_text_a, true_text_b,
                 text_alignment_a, text_alignment_b))

        image_box_position = []
        if which is not None:
            (raw_text_a, raw_text_b, true_text_a, true_text_b,
             text_alignment_a, text_alignment_b) = examples_alignment_pack[which]
            box_record = defaultdict(list)
            self.get_alignment_original(raw_text_a, text_alignment_a,
                                        old_det_to_new_ind, box_record, offset=1)
            self.get_alignment_original(raw_text_b, text_alignment_b,
                                        old_det_to_new_ind, box_record,
                                        offset=1 + len(text_alignment_a) + 1)
            image_text_alignment = ListField([
                IntArrayField(np.array(box_record[i]), padding_value=-1)
                for i in range(len(boxes))
            ])
        else:
            for (raw_text_a, raw_text_b, true_text_a, true_text_b,
                 text_alignment_a, text_alignment_b) in examples_alignment_pack:
                box_record = defaultdict(list)
                self.get_alignment_original(raw_text_a, text_alignment_a,
                                            old_det_to_new_ind, box_record, offset=1)
                self.get_alignment_original(raw_text_b, text_alignment_b,
                                            old_det_to_new_ind, box_record,
                                            offset=1 + len(text_alignment_a) + 1)
                image_box_position.append(
                    ListField([
                        IntArrayField(np.array(box_record[i]), padding_value=-1)
                        for i in range(len(boxes))
                    ]))
            image_text_alignment = ListField(image_box_position)
        ######################
        instance_dict['image_text_alignment'] = image_text_alignment

    instance_dict['segms'] = ArrayField(segms, padding_value=0)
    instance_dict['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])

    if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
        import ipdb
        ipdb.set_trace()
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all(boxes[:, 2] <= w)
    assert np.all(boxes[:, 3] <= h)

    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    return image, instance
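# --- Illustrative sketch (standalone, not part of the loader above) ---
# Shows the flat-index decomposition used when `complete_shuffle` is on: each
# dataset item expands into 8 sub-examples (4 Q->A pairs followed by 4 QA->R
# pairs), so a flat sampler index must be split into (item index, sub-example).
# The remainder has to be taken before the floor division, which is why the
# loader computes `which` first. The helper name is hypothetical.
def split_flat_index(flat_index: int, examples_per_item: int = 8):
    """Map a flat sampler index to (item_index, which_sub_example)."""
    which = flat_index % examples_per_item   # 0..3 -> Q->A, 4..7 -> QA->R
    item_index = flat_index // examples_per_item
    return item_index, which

assert split_flat_index(0) == (0, 0)
assert split_flat_index(11) == (1, 3)   # item 1, fourth Q->A sub-example
assert split_flat_index(12) == (1, 4)   # item 1, first QA->R sub-example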
def __getitem__(self, index):
    # if self.split == 'test':
    #     raise ValueError("blind test mode not supported quite yet")
    item = deepcopy(self.items[index])
    image_id = int(item['img_id'].split('-')[-1])
    anno_id = str(item['annot_id'].split('-')[-1])
    '''
    with h5py.File(self.tag_feature_path, 'r') as h5:
        tag_features = np.array(h5[str(image_id)]['features'], dtype=np.float32)
        tag_boxes = np.array(h5[str(image_id)]['boxes'], dtype=np.float32)
        tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'], dtype=np.int)
    '''

    ###################################################################
    # Load questions and answers
    if self.mode == 'rationale':
        conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
        item['question'] += item['answer_choices'][conditioned_label]

    answer_choices = item['{}_choices'.format(self.mode)]
    dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

    ###################################################################
    # Load in BERT. We'll get contextual representations of the context and the answer choices
    # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
    with h5py.File(self.h5fn, 'r') as h5:
        grp_items = {
            k: np.array(v, dtype=np.float16)
            for k, v in h5[str(index)].items()
        }

    # Essentially we need to condition on the right answer choice here, if we're doing QA->R.
    # We will always condition on the `conditioned_answer_choice`.
    condition_key = self.conditioned_answer_choice if self.split == "test" and self.mode == "rationale" else ""

    instance_dict = {}
    if 'endingonly' not in self.embs_to_load:
        questions_tokenized, question_tags = zip(*[
            _fix_tokenization(
                item['question'],
                grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                token_indexers=self.token_indexers,
                pad_ind=0 if self.add_image_as_a_box else -1)
            for i in range(4)
        ])
        instance_dict['question'] = ListField(questions_tokenized)
        instance_dict['question_tags'] = ListField(question_tags)

    answers_tokenized, answer_tags = zip(*[
        _fix_tokenization(
            answer,
            grp_items[f'answer_{self.mode}{condition_key}{i}'],
            old_det_to_new_ind,
            item['objects'],
            token_indexers=self.token_indexers,
            pad_ind=0 if self.add_image_as_a_box else -1)
        for i, answer in enumerate(answer_choices)
    ])
    instance_dict['answers'] = ListField(answers_tokenized)
    instance_dict['answer_tags'] = ListField(answer_tags)

    if self.split != 'test':
        instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
    instance_dict['metadata'] = MetadataField({
        'annot_id': item['annot_id'],
        'ind': index,
        'movie': item['movie'],
        'img_fn': item['img_fn'],
        'question_number': item['question_number']
    })

    ########## using kg
    ## node
    node_tokenized, node_tags = zip(*[
        _fix_word(i, index, item['annot_id'], self.h5fn_graph, self.h5fn_word, pad_ind=0)
        for i in range(4)
    ])
    instance_dict['node'] = ListField(node_tokenized)

    ## visual concept
    visual_concept_tokenized, visual_concept_tags = zip(*[
        _fix_visual_concept(item['visual_concept'], item['visual_concept_num'], self.h5fn_word, pad_ind=0)
        for i in range(4)
    ])
    instance_dict['visual_concept'] = ListField(visual_concept_tokenized)

    ## adj
    adj_result, adj_len = zip(*[
        _fix_adj(i, index, item['annot_id'], self.h5fn_graph, pad_ind=0)
        for i in range(4)
    ])
    instance_dict['adjacent'] = ListField(adj_result)

    ###################################################################
    # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
    # image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
    # image, window, img_scale, padding = resize_image(image, random_pad=self.is_train)
    # image = to_tensor_and_normalize(image)
    # c, h, w = image.shape

    ###################################################################
    # Load boxes.
    with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)

    # [nobj, 14, 14]
    # segms = np.stack([make_mask(mask_size=14, box=metadata['boxes'][i], polygons_list=metadata['segms'][i])
    #                   for i in dets2use])

    boxes = np.array(metadata['boxes'])[dets2use, :-1]
    # print('tag_features is box ', index, "___", len(boxes))
    with h5py.File(self.tag_feature_path, 'r') as h5:
        num_boxes = np.array(h5[str(anno_id)]['boxes'], dtype=np.float32).shape[0]
        tag_features = np.zeros([4, num_boxes, 1024])
        for m in range(4):
            tag_features[m, :, :] = np.array(h5[str(anno_id)]['features' + str(m)], dtype=np.float32)
        # tag_features = np.stack(tag_features, np.array(h5[str(anno_id)]['features3'], dtype=np.float32))
        # tag_boxes = np.array(h5[str(image_id)]['boxes'], dtype=np.float32)
        # tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'], dtype=np.int)

    # Chop off the final dimension, that's the confidence
    # Possibly rescale them if necessary
    # boxes *= img_scale
    # boxes[:, :2] += np.array(padding[:2])[None]
    # boxes[:, 2:] += np.array(padding[:2])[None]
    # obj_labels = [self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()]
    if self.add_image_as_a_box:
        boxes = np.row_stack(([1, 1, 700, 700], boxes))
        # segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)
        # obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

    # instance_dict['segms'] = ArrayField(segms, padding_value=0)
    # instance_dict['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])
    # if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
    #     import ipdb
    #     ipdb.set_trace()
    # assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    # assert np.all((boxes[:, 2] <= w))
    # assert np.all((boxes[:, 3] <= h))

    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

    # dean addition
    if self.add_image_as_a_box:
        dets2use = dets2use + 1
        dets2use = np.insert(dets2use, 0, 0)
        # temp = [0]
        # for det_idx in (dets2use + 1):
        #     temp.append(det_idx)
        # dets2use = np.array(temp)

    final_tag_features = np.zeros([4, len(dets2use), 1024])
    for k in range(final_tag_features.shape[0]):
        # Gather the (possibly shifted) detections' features for answer k.
        final_tag_features[k] = tag_features[k][dets2use]

    assert final_tag_features[0].shape[0] == boxes.shape[0]
    # Register det_features before constructing the Instance so all fields are set up front.
    instance_dict['det_features'] = ArrayField(final_tag_features, padding_value=0)

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    return None, instance
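# --- Illustrative sketch (standalone, toy shapes) ---
# Shows the index shift in the "dean addition" above: when the whole image is
# prepended as box 0, every original detection index moves up by one, and index 0
# is inserted so the image-level feature row is gathered too. Shapes are made up.
import numpy as np

tag_features = np.random.rand(4, 6, 1024).astype(np.float32)  # 4 answers x 6 boxes (hypothetical)
dets2use = np.array([0, 2, 4])                                # detections referenced by the text

shifted = np.insert(dets2use + 1, 0, 0)                       # -> [0, 1, 3, 5]
final = tag_features[:, shifted, :]                           # (4, len(dets2use) + 1, 1024)
assert final.shape == (4, len(dets2use) + 1, 1024)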
def __getitem__(self, index):
    # if self.split == 'test':
    #     raise ValueError("blind test mode not supported quite yet")
    item = deepcopy(self.items[index])

    ###################################################################
    # Load questions and answers
    if self.mode == 'rationale':
        conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
        item['question'] += item['answer_choices'][conditioned_label]

    answer_choices = item['{}_choices'.format(self.mode)]
    dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

    ###################################################################
    # Load in BERT. We'll get contextual representations of the context and the answer choices
    # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
    with h5py.File(self.h5fn, 'r') as h5:
        grp_items = {
            k: np.array(v, dtype=np.float16)
            for k, v in h5[str(index)].items()
        }

    # Essentially we need to condition on the right answer choice here, if we're doing QA->R. We will always
    # condition on the `conditioned_answer_choice`.
    condition_key = self.conditioned_answer_choice if self.split == "test" and self.mode == "rationale" else ""

    instance_dict = {}
    if 'endingonly' not in self.embs_to_load:
        questions_tokenized, question_tags = zip(*[
            _fix_tokenization(
                item['question'],
                grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                token_indexers=self.token_indexers,
                pad_ind=0 if self.add_image_as_a_box else -1)
            for i in range(4)
        ])
        instance_dict['question'] = ListField(questions_tokenized)
        instance_dict['question_tags'] = ListField(question_tags)

    answers_tokenized, answer_tags = zip(*[
        _fix_tokenization(
            answer,
            grp_items[f'answer_{self.mode}{condition_key}{i}'],
            old_det_to_new_ind,
            item['objects'],
            token_indexers=self.token_indexers,
            pad_ind=0 if self.add_image_as_a_box else -1)
        for i, answer in enumerate(answer_choices)
    ])
    instance_dict['answers'] = ListField(answers_tokenized)
    instance_dict['answer_tags'] = ListField(answer_tags)

    if self.split != 'test':
        instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
    instance_dict['metadata'] = MetadataField({
        'annot_id': item['annot_id'],
        'ind': index,
        'movie': item['movie'],
        'img_fn': item['img_fn'],
        'question_number': item['question_number'],
        'img_id': item['img_id']
    })

    ###################################################################
    # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
    image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
    image, window, img_scale, padding = resize_image(image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    c, h, w = image.shape

    ###################################################################
    # Load boxes.
    with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)

    # [nobj, 14, 14]
    segms = np.stack([
        make_mask(mask_size=14,
                  box=metadata['boxes'][i],
                  polygons_list=metadata['segms'][i]) for i in dets2use
    ])

    # Chop off the final dimension, that's the confidence
    boxes = np.array(metadata['boxes'])[dets2use, :-1]
    # Possibly rescale them if necessary
    boxes *= img_scale
    boxes[:, :2] += np.array(padding[:2])[None]
    boxes[:, 2:] += np.array(padding[:2])[None]

    obj_labels = [self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()]
    if self.add_image_as_a_box:
        boxes = np.row_stack((window, boxes))
        segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)
        obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

    instance_dict['segms'] = ArrayField(segms, padding_value=0)
    instance_dict['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])

    if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
        import ipdb
        ipdb.set_trace()
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all(boxes[:, 2] <= w)
    assert np.all(boxes[:, 3] <= h)

    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    return image, instance
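# --- Illustrative sketch (standalone, made-up values) ---
# Shows the box rescaling performed above: `resize_image` scales the image by
# `img_scale` and pads it, so (x1, y1, x2, y2) boxes are scaled and then offset
# by the left/top padding. The (left, top, right, bottom) padding layout is an
# assumption for this sketch.
import numpy as np

boxes = np.array([[10., 20., 110., 220.]])    # one box in original pixel coordinates
img_scale = 0.5
padding = (4, 6, 0, 0)                        # assumed (left, top, right, bottom)

boxes = boxes * img_scale
boxes[:, :2] += np.array(padding[:2])[None]   # shift x1, y1 by (left, top)
boxes[:, 2:] += np.array(padding[:2])[None]   # shift x2, y2 by the same offset
assert np.allclose(boxes, [[9., 16., 59., 116.]])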
def _index_instance(self, instance: Instance) -> Instance:
    self.reader.apply_token_indexers(instance)
    assert self._vocab is not None
    instance.index_fields(self._vocab)
    return instance
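# --- Illustrative sketch (standalone, assuming a plain AllenNLP 2.x setup) ---
# Context for `_index_instance`: AllenNLP fields that hold strings must be
# converted to integer ids against a Vocabulary before tensorization, and
# `index_fields` does that conversion in place. The LabelField example below
# needs no token indexers; TextFields would additionally require the reader's
# `apply_token_indexers` hook, as in the method above.
from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import LabelField

vocab = Vocabulary()
vocab.add_token_to_namespace("positive", namespace="labels")

instance = Instance({"label": LabelField("positive")})
instance.index_fields(vocab)        # the field now holds the integer id
tensors = instance.as_tensor_dict() # e.g. {"label": tensor(0)}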
def __getitem__(self, index):
    # if self.split == 'test':
    #     raise ValueError("blind test mode not supported quite yet")
    item = deepcopy(self.items[index])
    image_id = int(item['img_id'].split('-')[-1])

    with h5py.File(self.tag_feature_path, 'r') as h5:
        tag_features = np.array(h5[str(image_id)]['features'], dtype=np.float32)
        tag_boxes = np.array(h5[str(image_id)]['boxes'], dtype=np.float32)
        tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'], dtype=np.int64)

    with h5py.File(self.non_tag_feature_path, 'r') as h5:
        non_tag_boxes = np.array(h5[str(image_id)]['boxes'], dtype=np.float32)
        non_tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'], dtype=np.int64)
        non_tag_features = np.array(h5[str(image_id)]['features'], dtype=np.float32)

    ###################################################################
    # Load questions and answers
    non_tag_question_annotid2detidx = self.non_tag_question_annotid2detidx[item['annot_id']]
    non_tag_answer_annotid2detidx = self.non_tag_answer_annotid2detidx[item['annot_id']]
    non_tag_rationale_annotid2detidx = self.non_tag_rationale_annotid2detidx[item['annot_id']]

    if self.mode == 'answer':
        question_annotid2detidx = non_tag_question_annotid2detidx
        answer_annotid2detidx = non_tag_answer_annotid2detidx
    else:
        conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
        q_len = len(item['question'])
        question_annotid2detidx = {}
        for k, v in non_tag_question_annotid2detidx.items():
            question_annotid2detidx[k] = v
        for k, v in non_tag_answer_annotid2detidx[conditioned_label].items():
            question_annotid2detidx[k + q_len] = v
        answer_annotid2detidx = non_tag_rationale_annotid2detidx

    if self.mode == 'rationale':
        conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
        item['question'] += item['answer_choices'][conditioned_label]

    answer_choices = item['{}_choices'.format(self.mode)]
    dets2use, old_det_to_new_ind = self._get_dets_to_use(item)
    non_tag_dets2use, non_tag_old_det_to_new_ind = self._get_non_tag_det_to_use(
        question_annotid2detidx, answer_annotid2detidx, len(non_tag_boxes))

    if self.add_image_as_a_box:
        assert len(dets2use) == np.max(old_det_to_new_ind)
        non_tag_old_det_to_new_ind += 1
        # Shift the non-tag detection idx, effectively appending the non-tag
        # detections after the tag detections.
        non_tag_old_det_to_new_ind[np.where(non_tag_old_det_to_new_ind)[0]] += len(dets2use)

    old_det_to_new_ind = old_det_to_new_ind.tolist()
    non_tag_old_det_to_new_ind = non_tag_old_det_to_new_ind.tolist()

    ###################################################################
    # Load in BERT. We'll get contextual representations of the context and the answer choices
    # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
    with h5py.File(self.h5fn, 'r') as h5:
        grp_items = {k: np.array(v, dtype=np.float16) for k, v in h5[str(index)].items()}

    # Essentially we need to condition on the right answer choice here, if we're doing QA->R. We will always
    # condition on the `conditioned_answer_choice`.
    condition_key = self.conditioned_answer_choice if self.split == "test" and self.mode == "rationale" else ""

    instance_dict = {}
    if 'endingonly' not in self.embs_to_load:
        questions_tokenized, question_tags = zip(*[
            _my_fix_tokenization(
                item['question'],
                grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                non_tag_old_det_to_new_ind,
                question_annotid2detidx,
                token_indexers=self.token_indexers,
                pad_ind=0 if self.add_image_as_a_box else -1,
            ) for i in range(4)
        ])
        instance_dict['question'] = ListField(questions_tokenized)
        instance_dict['question_tags'] = ListField(question_tags)

    answers_tokenized, answer_tags = zip(*[
        _my_fix_tokenization(
            answer,
            grp_items[f'answer_{self.mode}{condition_key}{i}'],
            old_det_to_new_ind,
            item['objects'],
            non_tag_old_det_to_new_ind,
            answer_annotid2detidx[i],
            token_indexers=self.token_indexers,
            pad_ind=0 if self.add_image_as_a_box else -1,
        ) for i, answer in enumerate(answer_choices)
    ])
    instance_dict['answers'] = ListField(answers_tokenized)
    instance_dict['answer_tags'] = ListField(answer_tags)

    if self.split != 'test':
        instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
    instance_dict['metadata'] = MetadataField({
        'annot_id': item['annot_id'],
        'ind': index,
        'movie': item['movie'],
        'img_fn': item['img_fn'],
        'question_number': item['question_number'],
        'img_id': item['img_id']
    })

    ## node
    node_tokenized, node_tags = zip(*[
        _fix_word(i, index, item['annot_id'], self.h5fn_graph, self.h5fn_word, pad_ind=0)
        for i in range(4)
    ])
    instance_dict['node'] = ListField(node_tokenized)

    ## visual concept
    visual_concept_tokenized, visual_concept_tags = zip(*[
        _fix_visual_concept(item['visual_concept'], item['visual_concept_num'], self.h5fn_word, pad_ind=0)
        for i in range(4)
    ])
    instance_dict['visual_concept'] = ListField(visual_concept_tokenized)

    ## adj
    adj_result, adj_len = zip(*[
        _fix_adj(i, index, item['annot_id'], self.h5fn_graph, pad_ind=0)
        for i in range(4)
    ])
    instance_dict['adjacent'] = ListField(adj_result)

    ###################################################################
    # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
    # image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
    # image, window, img_scale, padding = resize_image(image, random_pad=self.is_train)
    # image = to_tensor_and_normalize(image)
    # c, h, w = image.shape

    ###################################################################
    # Load boxes.
    with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)

    # Chop off the final dimension, that's the confidence
    tag_boxes = np.array(metadata['boxes'])[dets2use, :-1]
    if self.add_image_as_a_box:
        # Here we just use a dummy box for the background.
        tag_boxes = np.row_stack(([1, 1, 700, 700], tag_boxes))
    non_tag_boxes = non_tag_boxes[non_tag_dets2use]
    boxes = np.concatenate((tag_boxes, non_tag_boxes))

    if self.add_image_as_a_box:
        dets2use = dets2use + 1
        dets2use = np.insert(dets2use, 0, 0)

    tag_det_features = tag_features[dets2use]
    non_tag_det_features = non_tag_features[non_tag_dets2use]
    det_features = np.concatenate((tag_det_features, non_tag_det_features))
    instance_dict['det_features'] = ArrayField(det_features, padding_value=0)
    assert det_features.shape[0] == boxes.shape[0]
    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    return None, instance
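# --- Illustrative sketch (standalone, toy values) ---
# Shows the index bookkeeping above under the assumed map convention that used
# detections hold 0-based new indices and unused ones hold -1. Non-tag
# detections are appended after the image box (slot 0) and the tag detections,
# so their map is shifted by one and then by the number of tag detections.
import numpy as np

non_tag_old_det_to_new_ind = np.array([0, -1, 1])  # dets 0 and 2 used, det 1 unused
num_tag_dets = 2                                   # stands in for len(dets2use)

non_tag_old_det_to_new_ind += 1                    # unused entries become 0
used = np.where(non_tag_old_det_to_new_ind)[0]     # nonzero entries are in use
non_tag_old_det_to_new_ind[used] += num_tag_dets
# Slots: image box = 0, tag dets = 1..2, non-tag dets = 3..4.
assert non_tag_old_det_to_new_ind.tolist() == [3, 0, 4]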
def __getitem__(self, index):
    item = json.loads(self.items[index])
    instance_dict = {}
    dets2use, old_det_to_new_ind = self._get_dets_to_use(item)
    vcr_tokenizer = VCRTokenizer(old_det_to_new_ind, item['objects'], self.add_image_as_a_box)

    ###################################### Q->A data processing ##################################################
    with h5py.File(self.h5fn_answer, 'r') as h5:
        # Each value is (n, 768). The answer file holds keys
        # ['answer_answer0'..'answer_answer3', 'ctx_answer0'..'ctx_answer3'];
        # the rationale file holds ['answer_rationale0'..3, 'ctx_rationale0'..3].
        grp_items_answer = {k: np.array(v, dtype=np.float16) for k, v in h5[str(index)].items()}

    if 'endingonly' not in self.embs_to_load:
        questions_answer_tokenized, question_answer_tags = zip(*[
            vcr_tokenizer(item['question'], grp_items_answer[f'ctx_answer{i}'])
            for i in range(4)
        ])
        instance_dict['question_answer'] = ListField(list(questions_answer_tokenized))
        instance_dict['question_answer_tags'] = ListField(list(question_answer_tags))

    answers_tokenized, answer_tags = zip(*[
        vcr_tokenizer(answer, grp_items_answer[f'answer_answer{i}'])
        for i, answer in enumerate(item['answer_choices'])
    ])
    instance_dict['answers'] = ListField(list(answers_tokenized))
    instance_dict['answer_tags'] = ListField(list(answer_tags))

    ###################################### QA->R data processing ################################################
    with h5py.File(self.h5fn_rationale, 'r') as h5_rationale:
        grp_items_rationale = {k: np.array(v, dtype=np.float16) for k, v in h5_rationale[str(index)].items()}

    condition_key = self.conditioned_answer_choice if self.split == "test" else ""
    conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
    question_rationale = item['question'] + item['answer_choices'][conditioned_label]

    if 'endingonly' not in self.embs_to_load:
        questions_rationale_tokenized, question_rationale_tags = zip(*[
            vcr_tokenizer(question_rationale, grp_items_rationale[f'ctx_rationale{condition_key}{i}'])
            for i in range(4)
        ])
        instance_dict['question_rationale'] = ListField(list(questions_rationale_tokenized))
        instance_dict['question_rationale_tags'] = ListField(list(question_rationale_tags))

    rationale_tokenized, rationale_tags = zip(*[
        vcr_tokenizer(rationale, grp_items_rationale[f'answer_rationale{condition_key}{i}'])
        for i, rationale in enumerate(item['rationale_choices'])
    ])
    instance_dict['rationales'] = ListField(list(rationale_tokenized))
    instance_dict['rationale_tags'] = ListField(list(rationale_tags))

    #################################### Metadata processing ####################################################
    if self.split != 'test':
        instance_dict['answer_label'] = LabelField(item['answer_label'], skip_indexing=True)
        instance_dict['rationale_label'] = LabelField(item['rationale_label'], skip_indexing=True)
    # instance_dict['metadata'] = MetadataField({'annot_id': item['annot_id'], 'ind': index, 'movie': item['movie'],
    #                                            'img_fn': item['img_fn'],
    #                                            'question_number': item['question_number']})

    ################################## Image feature processing #################################################
    with h5py.File(self.h5fn_image, 'r') as h5_features:  # pytorch 1.1
        img_id = item['img_id'].split('-')[-1]
        group_image = {k: np.array(v) for k, v in h5_features[img_id].items()}
    image_feature = group_image['features'][[0] + (dets2use + 1).tolist()]
    tag_boxes = group_image['boxes']

    zeros = np.zeros((1, 2048), dtype=np.float32)
    if self.add_image_as_a_box:
        image_feature = np.concatenate((zeros, image_feature), axis=0)
    else:
        image_feature = np.concatenate((zeros, image_feature[1:]), axis=0)
    instance_dict['image_features'] = ArrayField(image_feature, padding_value=0)

    ###################################################################
    # Load boxes.
    with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)

    # Chop off the final dimension, that's the confidence
    boxes = np.array(metadata['boxes'])[dets2use, :-1]
    obj_labels = [self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()]
    if self.add_image_as_a_box:
        boxes = np.row_stack((boxes[0], boxes))
        obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels
    # The first object is 0: prepend a dummy row matching the all-zero feature row.
    boxes = np.row_stack((boxes[0], boxes))
    obj_labels = [81] + obj_labels

    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)
    instance_dict['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])
    assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    return instance
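# --- Illustrative sketch (standalone; helper name and toy inputs are hypothetical) ---
# Shows the QA->R conditioning above: at train/val time the rationale question is
# the original question with the ground-truth answer appended; at test time the
# label is unknown, so a caller-chosen `conditioned_answer_choice` is appended
# instead (and also selects which cached BERT embedding key to read).
def build_rationale_question(question, answer_choices, split,
                             answer_label=None, conditioned_answer_choice=0):
    label = answer_label if split != 'test' else conditioned_answer_choice
    return question + answer_choices[label]

q = ['Why', 'is', [0], 'smiling', '?']       # VCR sentences mix tokens and detection tags
answers = [['He', 'won'], ['He', 'lost']]
assert build_rationale_question(q, answers, 'train', answer_label=0) == \
    ['Why', 'is', [0], 'smiling', '?', 'He', 'won']
assert build_rationale_question(q, answers, 'test', conditioned_answer_choice=1) == \
    ['Why', 'is', [0], 'smiling', '?', 'He', 'lost']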
def __getitem__(self, index):
    # if self.split == 'test':
    #     raise ValueError("blind test mode not supported quite yet")
    item = deepcopy(self.items[index])

    ###################################################################
    # Load questions and answers
    if self.mode == 'rationale':
        conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
        item['question'] += item['answer_choices'][conditioned_label]

    answer_choices = item['{}_choices'.format(self.mode)]
    dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

    ###################################################################
    # Load in BERT. We'll get contextual representations of the context and the answer choices
    # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
    with h5py.File(self.h5fn, 'r') as h5:
        grp_items = {k: np.array(v, dtype=np.float16) for k, v in h5[str(index)].items()}

    # Essentially we need to condition on the right answer choice here, if we're doing QA->R. We will always
    # condition on the `conditioned_answer_choice`.
    condition_key = self.conditioned_answer_choice if self.split == "test" and self.mode == "rationale" else ""

    instance_dict = {}
    if 'endingonly' not in self.embs_to_load:
        questions_tokenized, question_tags = zip(*[
            _fix_tokenization(
                item['question'],
                grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                token_indexers=self.token_indexers,
                pad_ind=0 if self.add_image_as_a_box else -1
            ) for i in range(4)
        ])
        instance_dict['question'] = ListField(questions_tokenized)
        instance_dict['question_tags'] = ListField(question_tags)

    answers_tokenized, answer_tags = zip(*[
        _fix_tokenization(
            answer,
            grp_items[f'answer_{self.mode}{condition_key}{i}'],
            old_det_to_new_ind,
            item['objects'],
            token_indexers=self.token_indexers,
            pad_ind=0 if self.add_image_as_a_box else -1
        ) for i, answer in enumerate(answer_choices)
    ])
    instance_dict['answers'] = ListField(answers_tokenized)
    instance_dict['answer_tags'] = ListField(answer_tags)

    if self.split != 'test':
        instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
    instance_dict['metadata'] = MetadataField({
        'annot_id': item['annot_id'],
        'ind': index,
        'movie': item['movie'],
        'img_fn': item['img_fn'],
        'question_number': item['question_number']
    })

    ###################################################################
    # Load precomputed image features now (no raw-image rescaling here).
    image_id = self.path2id[item['img_fn']]
    image_id_gt = self.path2id_gt[item['img_fn']]

    features, num_boxes, boxes, _ = self._image_features_reader[image_id]
    boxes = boxes[:num_boxes]
    features = features[:num_boxes]

    gt_features, gt_num_boxes, gt_boxes, _ = self._gt_image_features_reader[image_id_gt]
    # Average the global (index 0) features of the two sets, weighted by box counts.
    features[0] = (features[0] * num_boxes + gt_features[0] * gt_num_boxes) / (num_boxes + gt_num_boxes)

    # merge two boxes, and assign the labels.
    gt_boxes = gt_boxes[1:gt_num_boxes]
    gt_features = gt_features[1:gt_num_boxes]
    gt_num_boxes = gt_num_boxes - 1

    gt_box_preserve = min(self._max_region_num - 1, gt_num_boxes)
    gt_boxes = gt_boxes[:gt_box_preserve]
    gt_features = gt_features[:gt_box_preserve]
    gt_num_boxes = gt_box_preserve

    num_box_preserve = min(self._max_region_num - int(gt_num_boxes), int(num_boxes))
    boxes = boxes[:num_box_preserve]
    features = features[:num_box_preserve]

    # concatenate the boxes
    mix_boxes = np.concatenate((boxes, gt_boxes), axis=0)
    mix_features = np.concatenate((features, gt_features), axis=0)
    mix_num_boxes = num_box_preserve + int(gt_num_boxes)

    image_mask = [1] * mix_num_boxes
    while len(image_mask) < self._max_region_num:
        image_mask.append(0)

    # Pad boxes and features to the fixed region budget. `mix_boxes_pad` must be
    # built here, since `spatials` below refers to it.
    mix_boxes_pad = np.zeros((self._max_region_num, 5))
    mix_features_pad = np.zeros((self._max_region_num, 3072))
    mix_boxes_pad[:mix_num_boxes] = mix_boxes[:mix_num_boxes]
    mix_features_pad[:mix_num_boxes] = mix_features[:mix_num_boxes]

    # appending the target feature.
    features = torch.tensor(mix_features_pad).float()
    image_mask = torch.tensor(image_mask).long()
    # spatials = torch.tensor(mix_boxes).float()
    spatials = mix_boxes_pad

    # Load boxes.
    with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)

    # [nobj, 14, 14]
    segms = np.stack([
        make_mask(mask_size=14,
                  box=metadata['boxes'][i],
                  polygons_list=metadata['segms'][i]) for i in dets2use
    ])

    # # Chop off the final dimension, that's the confidence
    # boxes = np.array(metadata['boxes'])[dets2use, :-1]
    # # Possibly rescale them if necessary
    # boxes *= img_scale
    # boxes[:, :2] += np.array(padding[:2])[None]
    # boxes[:, 2:] += np.array(padding[:2])[None]

    obj_labels = [self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()]
    # if self.add_image_as_a_box:
    #     boxes = np.row_stack((window, boxes))
    #     segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)
    #     obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

    instance_dict['segms'] = ArrayField(segms, padding_value=0)
    instance_dict['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])

    # if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
    #     import ipdb
    #     ipdb.set_trace()
    # assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    # assert np.all((boxes[:, 2] <= w))
    # assert np.all((boxes[:, 3] <= h))

    instance_dict['boxes'] = ArrayField(spatials, padding_value=-1)
    instance_dict['box_mask'] = ArrayField(image_mask, padding_value=-1)

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    return features, instance
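# --- Illustrative sketch (standalone, made-up region counts) ---
# Shows the region padding above: detector regions and ground-truth regions are
# concatenated, then right-padded with zeros up to a fixed budget, with a 0/1
# mask marking the real rows. The feature width 3072 matches the loader.
import numpy as np

max_region_num = 5
mix_features = np.random.rand(3, 3072).astype(np.float32)   # 3 real regions
mix_num_boxes = mix_features.shape[0]

mix_features_pad = np.zeros((max_region_num, 3072), dtype=np.float32)
mix_features_pad[:mix_num_boxes] = mix_features
image_mask = [1] * mix_num_boxes + [0] * (max_region_num - mix_num_boxes)
assert image_mask == [1, 1, 1, 0, 0]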