Example #1
import json
import os

import text_processing  # project-local tokenization utilities

# question_file, image_dir and feature_dir are assumed to be module-level
# format strings with a single '%s' placeholder for the image set name.


def build_imdb(image_set):
    print('building imdb %s' % image_set)
    with open(question_file % image_set) as f:
        questions = json.load(f)['questions']
    abs_image_dir = os.path.abspath(image_dir % image_set)
    abs_feature_dir = os.path.abspath(feature_dir % image_set)
    imdb = [None] * len(questions)
    for n_q, q in enumerate(questions):
        if (n_q + 1) % 10000 == 0:
            print('processing %d / %d' % (n_q + 1, len(questions)))
        image_name = q['image_filename'].split('.')[0]
        image_path = os.path.join(abs_image_dir, q['image_filename'])
        feature_path = os.path.join(abs_feature_dir, image_name + '.npy')
        question_str = q['question']
        question_tokens = text_processing.tokenize(question_str)
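        # bbox and gt_layout are optional annotations; they default to None
        # when the question does not provide them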
        bbox = q['bbox'] if 'bbox' in q else None
        gt_layout_tokens = q['gt_layout'] if 'gt_layout' in q else None

        iminfo = dict(image_name=image_name,
                      image_path=image_path,
                      feature_path=feature_path,
                      question_str=question_str,
                      question_tokens=question_tokens,
                      bbox=bbox,
                      gt_layout_tokens=gt_layout_tokens)
        imdb[n_q] = iminfo
    return imdb
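
A minimal usage sketch for the function above (the split names and the output path are assumptions, not part of the original code):

if __name__ == '__main__':
    import numpy as np

    out_dir = './imdb_clevr'  # hypothetical output directory
    os.makedirs(out_dir, exist_ok=True)
    for split in ('train', 'val', 'test'):  # assumed CLEVR split names
        imdb = build_imdb(split)
        np.save(os.path.join(out_dir, 'imdb_%s.npy' % split), np.array(imdb))
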
Example #2
import json
import os

import numpy as np

import text_processing  # project-local tokenization utilities

# annotation_file, question_file, gt_layout_file, image_dir and feature_dir are
# assumed to be module-level format strings; extract_answers() is presumably a
# helper that maps the annotated answers to vocabulary indices and soft scores.


def build_imdb(image_set):
    print('building imdb %s' % image_set)
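    # ground-truth answers and expert layouts are only available for the COCO
    # train/val splits; the test splits are built without them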
    if image_set in ['train2014', 'val2014']:
        load_answer = True
        load_gt_layout = True
        with open(annotation_file % image_set) as f:
            annotations = json.load(f)["annotations"]
            qid2ann_dict = {ann['question_id']: ann for ann in annotations}
        # 0-d object array wrapping a {question_id: layout tokens} dict
        qid2layout_dict = np.load(gt_layout_file % image_set,
                                  allow_pickle=True)[()]
    else:
        load_answer = False
        load_gt_layout = False
    with open(question_file % image_set) as f:
        questions = json.load(f)['questions']
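    # 'test-dev2015' shares its images and features with 'test2015', so the
    # '-dev' suffix is dropped when building image and feature paths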
    coco_set_name = image_set.replace('-dev', '')
    abs_image_dir = os.path.abspath(image_dir % coco_set_name)
    abs_feature_dir = os.path.abspath(feature_dir % coco_set_name)
    image_name_template = 'COCO_' + coco_set_name + '_%012d'
    imdb = [None]*len(questions)

    unk_ans_count = 0
    for n_q, q in enumerate(questions):
        if (n_q+1) % 10000 == 0:
            print('processing %d / %d' % (n_q+1, len(questions)))
        image_id = q['image_id']
        question_id = q['question_id']
        image_name = image_name_template % image_id
        image_path = os.path.join(abs_image_dir, image_name + '.jpg')
        feature_path = os.path.join(abs_feature_dir, image_name + '.npy')
        question_str = q['question']
        question_tokens = text_processing.tokenize(question_str)

        iminfo = dict(image_name=image_name,
                      image_path=image_path,
                      image_id=image_id,
                      question_id=question_id,
                      feature_path=feature_path,
                      question_str=question_str,
                      question_tokens=question_tokens)

        # load answers
        if load_answer:
            ann = qid2ann_dict[question_id]
            all_answers, valid_answers, soft_score_inds, soft_score_target = \
                extract_answers(ann['answers'])
            if len(valid_answers) == 0:
                valid_answers = ['<unk>']
                unk_ans_count += 1
            iminfo['all_answers'] = all_answers
            iminfo['valid_answers'] = valid_answers
            iminfo['soft_score_inds'] = soft_score_inds
            iminfo['soft_score_target'] = soft_score_target

        if load_gt_layout:
            gt_layout_tokens = qid2layout_dict[question_id]
            iminfo['gt_layout_tokens'] = gt_layout_tokens

        imdb[n_q] = iminfo
    print('%d out of %d answers are <unk>' % (unk_ans_count, len(questions)))
    return imdb

Example #3

    # Method of a batch-loading data-reader class; T_encoder, vocab_dict, the
    # feature loaders and the various load_* flags are assumed to be set up in
    # the class's __init__.
    def load_one_batch(self, sample_ids):
        actual_batch_size = len(sample_ids)
        input_seq_batch = np.zeros((self.T_encoder, actual_batch_size),
                                   np.int32)
        seq_length_batch = np.zeros(actual_batch_size, np.int32)
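        # spatial (grid) features are collected in NCHW order here and
        # transposed to NHWC just before the batch is returned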
        if self.load_spatial_feature:
            spatial_feat_batch = np.zeros((actual_batch_size, self.spatial_D,
                                           self.spatial_H, self.spatial_W),
                                          np.float32)
        if self.load_objects_feature or self.load_scene_graph_feature:
            objects_feat_batch = np.zeros(
                (actual_batch_size, self.objects_M, self.objects_D),
                np.float32)
            objects_bbox_batch = np.zeros(
                (actual_batch_size, self.objects_M, 4), np.float32)
            # np.bool was removed in NumPy 1.24; plain bool keeps the behavior
            objects_valid_batch = np.zeros((actual_batch_size, self.objects_M),
                                           bool)

        qid_list = [None] * actual_batch_size
        qstr_list = [None] * actual_batch_size
        imageid_list = [None] * actual_batch_size
        if self.load_answer:
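            # one answer index per sample; the answer is treated as a
            # classification target over the answer vocabulary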
            answer_label_batch = np.zeros(actual_batch_size, np.int32)
        for n in range(len(sample_ids)):
            iminfo = self.imdb[sample_ids[n]]
            question_str = iminfo['question']
            question_tokens = text_processing.tokenize(question_str)
            if len(question_tokens) > self.T_encoder:
                print('data reader: truncating question:\n\t' + question_str)
                question_tokens = question_tokens[:self.T_encoder]
            question_inds = [
                self.vocab_dict.word2idx(w) for w in question_tokens
            ]
            seq_length = len(question_inds)
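            # shorter questions stay zero-padded on the right; the true length
            # is recorded in seq_length_batch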
            input_seq_batch[:seq_length, n] = question_inds
            seq_length_batch[n] = seq_length

            if self.load_spatial_feature:
                feature = self.spatial_loader.load_feature(iminfo['imageId'])
                spatial_feat_batch[n:n + 1] = feature
            if self.load_objects_feature:
                feature, normalized_bbox, valid = \
                    self.objects_loader.load_feature_normalized_bbox(
                        iminfo['imageId'])
                objects_feat_batch[n:n + 1] = feature
                objects_bbox_batch[n:n + 1] = normalized_bbox
                objects_valid_batch[n:n + 1] = valid
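            # scene-graph features are written into the same object buffers, so
            # the two loaders are presumably not enabled at the same time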
            if self.load_scene_graph_feature:
                feature, normalized_bbox, valid = \
                    self.scene_graph_loader.load_feature_normalized_bbox(
                        iminfo['imageId'])
                objects_feat_batch[n:n + 1] = feature
                objects_bbox_batch[n:n + 1] = normalized_bbox
                objects_valid_batch[n:n + 1] = valid

            qid_list[n] = iminfo['questionId']
            qstr_list[n] = question_str
            imageid_list[n] = iminfo['imageId']
            if self.load_answer:
                answer_idx = self.answer_dict.word2idx(iminfo['answer'])
                answer_label_batch[n] = answer_idx
        batch = dict(input_seq_batch=input_seq_batch,
                     seq_length_batch=seq_length_batch,
                     qid_list=qid_list,
                     qstr_list=qstr_list,
                     imageid_list=imageid_list)

        # 'image_feat_batch': N x H x W x C tf.float32 image features
        #   When using objects, then H = 1 & W = 100
        # 'image_valid_batch': N x H x W tf.float32, indicating whether
        #   each feature location is real (1) or padding (0)
        #   When using objects, image_valid_batch is 1 on objects & 0 otherwise
        if self.load_spatial_feature:
            # NCHW -> NHWC
            spatial_feat_batch = spatial_feat_batch.transpose((0, 2, 3, 1))
            batch['spatial_feat_batch'] = spatial_feat_batch
            # add positional embedding to the image features
            pos_enc_tile = np.tile(self.pos_enc,
                                   (len(spatial_feat_batch), 1, 1, 1))
            image_feat_batch = np.concatenate(
                (spatial_feat_batch, pos_enc_tile), axis=-1)
            image_valid_batch = np.ones(image_feat_batch.shape[:3], np.float32)
            batch['image_feat_batch'] = image_feat_batch
            batch['image_valid_batch'] = image_valid_batch
        if self.load_objects_feature or self.load_scene_graph_feature:
            batch['objects_feat_batch'] = objects_feat_batch
            batch['objects_bbox_batch'] = objects_bbox_batch
            # add bounding boxes to the object features
            # tile bbox to roughly match the l2 norm of R-CNN features
            objects_bbox_tile = np.tile(objects_bbox_batch,
                                        (1, 1, self.bbox_tile_num))
            image_feat_batch = np.concatenate(
                (objects_feat_batch, objects_bbox_tile), axis=-1)
            image_feat_batch = image_feat_batch[:, np.newaxis, :, :]
            image_valid_batch = objects_valid_batch[:, np.newaxis, :] * 1.
            batch['image_feat_batch'] = image_feat_batch
            batch['image_valid_batch'] = image_valid_batch
        if self.load_answer:
            batch['answer_label_batch'] = answer_label_batch
        return batch