def build_imdb(image_set):
    print('building imdb %s' % image_set)
    with open(question_file % image_set) as f:
        questions = json.load(f)['questions']
    abs_image_dir = os.path.abspath(image_dir % image_set)
    abs_feature_dir = os.path.abspath(feature_dir % image_set)
    imdb = [None] * len(questions)
    for n_q, q in enumerate(questions):
        if (n_q + 1) % 10000 == 0:
            print('processing %d / %d' % (n_q + 1, len(questions)))
        # strip the file extension to get the image name
        image_name = q['image_filename'].split('.')[0]
        image_path = os.path.join(abs_image_dir, q['image_filename'])
        feature_path = os.path.join(abs_feature_dir, image_name + '.npy')
        question_str = q['question']
        question_tokens = text_processing.tokenize(question_str)
        # bounding box and ground-truth layout are only present in some splits
        bbox = q['bbox'] if 'bbox' in q else None
        gt_layout_tokens = q['gt_layout'] if 'gt_layout' in q else None
        iminfo = dict(image_name=image_name,
                      image_path=image_path,
                      feature_path=feature_path,
                      question_str=question_str,
                      question_tokens=question_tokens,
                      bbox=bbox,
                      gt_layout_tokens=gt_layout_tokens)
        imdb[n_q] = iminfo
    return imdb
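# A minimal driver sketch (an assumption, not part of the original script):
# it presumes the splits are named 'train', 'val' and 'test', that the '%s'
# templates `question_file`, `image_dir` and `feature_dir` are defined at
# module level, and that ./imdb is an acceptable (hypothetical) output path.
if __name__ == '__main__':
    os.makedirs('./imdb', exist_ok=True)
    for split in ['train', 'val', 'test']:
        # each imdb is a list of per-question dicts; save as an object array
        np.save('./imdb/imdb_%s.npy' % split,
                np.array(build_imdb(split), dtype=object))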
def build_imdb(image_set):
    print('building imdb %s' % image_set)
    if image_set in ['train2014', 'val2014']:
        # answers and ground-truth layouts are only available for train/val
        load_answer = True
        load_gt_layout = True
        with open(annotation_file % image_set) as f:
            annotations = json.load(f)["annotations"]
            qid2ann_dict = {ann['question_id']: ann for ann in annotations}
        # the layout file stores a dict inside a 0-d object array; on
        # NumPy >= 1.16.5 loading it requires allow_pickle=True
        qid2layout_dict = np.load(gt_layout_file % image_set,
                                  allow_pickle=True)[()]
    else:
        load_answer = False
        load_gt_layout = False
    with open(question_file % image_set) as f:
        questions = json.load(f)['questions']
    # 'test-dev' shares its images with the 'test' COCO split
    coco_set_name = image_set.replace('-dev', '')
    abs_image_dir = os.path.abspath(image_dir % coco_set_name)
    abs_feature_dir = os.path.abspath(feature_dir % coco_set_name)
    image_name_template = 'COCO_' + coco_set_name + '_%012d'
    imdb = [None] * len(questions)
    unk_ans_count = 0
    for n_q, q in enumerate(questions):
        if (n_q + 1) % 10000 == 0:
            print('processing %d / %d' % (n_q + 1, len(questions)))
        image_id = q['image_id']
        question_id = q['question_id']
        image_name = image_name_template % image_id
        image_path = os.path.join(abs_image_dir, image_name + '.jpg')
        feature_path = os.path.join(abs_feature_dir, image_name + '.npy')
        question_str = q['question']
        question_tokens = text_processing.tokenize(question_str)
        iminfo = dict(image_name=image_name,
                      image_path=image_path,
                      image_id=image_id,
                      question_id=question_id,
                      feature_path=feature_path,
                      question_str=question_str,
                      question_tokens=question_tokens)
        # load answers
        if load_answer:
            ann = qid2ann_dict[question_id]
            all_answers, valid_answers, soft_score_inds, soft_score_target = \
                extract_answers(ann['answers'])
            if len(valid_answers) == 0:
                valid_answers = ['<unk>']
                unk_ans_count += 1
            iminfo['all_answers'] = all_answers
            iminfo['valid_answers'] = valid_answers
            iminfo['soft_score_inds'] = soft_score_inds
            iminfo['soft_score_target'] = soft_score_target
        if load_gt_layout:
            gt_layout_tokens = qid2layout_dict[question_id]
            iminfo['gt_layout_tokens'] = gt_layout_tokens
        imdb[n_q] = iminfo
    print('%d out of %d answers are <unk>' % (unk_ans_count, len(questions)))
    return imdb
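# `extract_answers` is called above but not shown here. A minimal sketch,
# assuming the standard VQA soft-score convention (an answer given by k of
# the 10 annotators scores min(k / 3, 1)) and two module-level assumptions:
# `valid_answer_set` (the answer vocabulary as a set) and `answer_dict`
# (the same word2idx mapping used at training time).
from collections import Counter

def extract_answers(q_answers):
    all_answers = [ans['answer'] for ans in q_answers]
    valid_answers = [a for a in all_answers if a in valid_answer_set]
    # soft scores: one (vocab index, target) pair per distinct valid answer
    soft_score_inds = []
    soft_score_target = []
    for a, count in Counter(valid_answers).items():
        soft_score_inds.append(answer_dict.word2idx(a))
        soft_score_target.append(min(1., count / 3.))
    return all_answers, valid_answers, soft_score_inds, soft_score_target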
def load_one_batch(self, sample_ids):
    actual_batch_size = len(sample_ids)
    input_seq_batch = np.zeros((self.T_encoder, actual_batch_size), np.int32)
    seq_length_batch = np.zeros(actual_batch_size, np.int32)
    if self.load_spatial_feature:
        spatial_feat_batch = np.zeros(
            (actual_batch_size, self.spatial_D, self.spatial_H,
             self.spatial_W), np.float32)
    if self.load_objects_feature or self.load_scene_graph_feature:
        objects_feat_batch = np.zeros(
            (actual_batch_size, self.objects_M, self.objects_D), np.float32)
        objects_bbox_batch = np.zeros(
            (actual_batch_size, self.objects_M, 4), np.float32)
        # use the built-in bool; np.bool was removed in NumPy 1.24
        objects_valid_batch = np.zeros(
            (actual_batch_size, self.objects_M), bool)
    qid_list = [None] * actual_batch_size
    qstr_list = [None] * actual_batch_size
    imageid_list = [None] * actual_batch_size
    if self.load_answer:
        answer_label_batch = np.zeros(actual_batch_size, np.int32)
    for n in range(len(sample_ids)):
        iminfo = self.imdb[sample_ids[n]]
        question_str = iminfo['question']
        question_tokens = text_processing.tokenize(question_str)
        if len(question_tokens) > self.T_encoder:
            print('data reader: truncating question:\n\t' + question_str)
            question_tokens = question_tokens[:self.T_encoder]
        question_inds = [
            self.vocab_dict.word2idx(w) for w in question_tokens]
        seq_length = len(question_inds)
        input_seq_batch[:seq_length, n] = question_inds
        seq_length_batch[n] = seq_length
        if self.load_spatial_feature:
            feature = self.spatial_loader.load_feature(iminfo['imageId'])
            spatial_feat_batch[n:n + 1] = feature
        if self.load_objects_feature:
            feature, normalized_bbox, valid = \
                self.objects_loader.load_feature_normalized_bbox(
                    iminfo['imageId'])
            objects_feat_batch[n:n + 1] = feature
            objects_bbox_batch[n:n + 1] = normalized_bbox
            objects_valid_batch[n:n + 1] = valid
        if self.load_scene_graph_feature:
            feature, normalized_bbox, valid = \
                self.scene_graph_loader.load_feature_normalized_bbox(
                    iminfo['imageId'])
            objects_feat_batch[n:n + 1] = feature
            objects_bbox_batch[n:n + 1] = normalized_bbox
            objects_valid_batch[n:n + 1] = valid
        qid_list[n] = iminfo['questionId']
        qstr_list[n] = question_str
        imageid_list[n] = iminfo['imageId']
        if self.load_answer:
            answer_idx = self.answer_dict.word2idx(iminfo['answer'])
            answer_label_batch[n] = answer_idx
    batch = dict(input_seq_batch=input_seq_batch,
                 seq_length_batch=seq_length_batch,
                 qid_list=qid_list,
                 qstr_list=qstr_list,
                 imageid_list=imageid_list)
    # 'image_feat_batch': N x H x W x C tf.float32 image features
    #     When using objects, H = 1 and W = 100 (self.objects_M)
    # 'image_valid_batch': N x H x W tf.float32, indicating whether
    #     each feature location is real (1) or padding (0)
    #     When using objects, image_valid_batch is 1 on objects & 0 otherwise
    if self.load_spatial_feature:
        # NCHW -> NHWC
        spatial_feat_batch = spatial_feat_batch.transpose((0, 2, 3, 1))
        batch['spatial_feat_batch'] = spatial_feat_batch
        # add positional embedding to the image features
        pos_enc_tile = np.tile(
            self.pos_enc, (len(spatial_feat_batch), 1, 1, 1))
        image_feat_batch = np.concatenate(
            (spatial_feat_batch, pos_enc_tile), axis=-1)
        image_valid_batch = np.ones(image_feat_batch.shape[:3], np.float32)
        batch['image_feat_batch'] = image_feat_batch
        batch['image_valid_batch'] = image_valid_batch
    if self.load_objects_feature or self.load_scene_graph_feature:
        batch['objects_feat_batch'] = objects_feat_batch
        batch['objects_bbox_batch'] = objects_bbox_batch
        # add bounding boxes to the object features;
        # tile bbox to roughly match the l2 norm of R-CNN features
        objects_bbox_tile = np.tile(
            objects_bbox_batch, (1, 1, self.bbox_tile_num))
        image_feat_batch = np.concatenate(
            (objects_feat_batch, objects_bbox_tile), axis=-1)
        image_feat_batch = image_feat_batch[:, np.newaxis, :, :]
        image_valid_batch = objects_valid_batch[:, np.newaxis, :] * 1.
        batch['image_feat_batch'] = image_feat_batch
        batch['image_valid_batch'] = image_valid_batch
    if self.load_answer:
        batch['answer_label_batch'] = answer_label_batch
    return batch
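# Hypothetical usage sketch (not from the original reader): batches are
# requested with a list of imdb indices, so a shuffled epoch can be driven
# as below, assuming `reader` is an instance of the class above and
# `batch_size` is chosen by the caller.
indices = np.random.permutation(len(reader.imdb))
for start in range(0, len(indices), batch_size):
    batch = reader.load_one_batch(indices[start:start + batch_size])
    # batch['input_seq_batch'] is (T_encoder, N) int32;
    # batch['image_feat_batch'] is (N, H, W, C) float32;
    # the final batch may be smaller than batch_size.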