Example #1
    def load_feat_sent(self, entry):
        image_id = entry["image_id"]
        # Load the precomputed region features and bounding boxes for this image.
        features, num_boxes, boxes = utils.image_features_reader(self.feat_folder, image_id)

        # Clip to at most max_region_num regions and pad the 2048-d features
        # and 5-d boxes up to that fixed size.
        mix_num_boxes = min(int(num_boxes), self.max_region_num)
        mix_boxes_pad = np.zeros((self.max_region_num, 5))
        mix_features_pad = np.zeros((self.max_region_num, 2048))

        # 1 for real regions, 0 for padding.
        image_mask = [1] * int(mix_num_boxes)
        while len(image_mask) < self.max_region_num:
            image_mask.append(0)

        mix_boxes_pad[:mix_num_boxes] = boxes[:mix_num_boxes]
        mix_features_pad[:mix_num_boxes] = features[:mix_num_boxes]

        features = torch.tensor(mix_features_pad).float()
        image_mask = torch.tensor(image_mask).long()
        spatials = torch.tensor(mix_boxes_pad).float()

        # Convert every caption for this image (token ids, input mask,
        # segment ids) to tensors and stack them along a new first dimension.
        caption_arr = []
        input_mask_arr = []
        segment_ids_arr = []
        for i in range(len(entry["token"])):
            caption_arr.append(torch.from_numpy(np.array(entry["token"][i])))
            input_mask_arr.append(torch.from_numpy(np.array(entry["input_mask"][i])))
            segment_ids_arr.append(torch.from_numpy(np.array(entry["segment_ids"][i])))

        caption = torch.stack(caption_arr, dim=0)
        input_mask = torch.stack(input_mask_arr, dim=0)
        segment_ids = torch.stack(segment_ids_arr, dim=0)

        return features, image_mask, spatials, caption, input_mask, segment_ids
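
All four examples rely on utils.image_features_reader(feat_folder, image_id) returning a (features, num_boxes, boxes) triple whose shapes the padding code implies: features is [num_boxes, 2048] and boxes is [num_boxes, 5]. A minimal sketch of such a reader, assuming one .npz file per image id (the file layout and key names are assumptions, not taken from these examples):

import os
import numpy as np

def image_features_reader(feat_folder, image_id):
    # Hypothetical loader: one .npz per image holding "features"
    # ([num_boxes, 2048]) and "boxes" ([num_boxes, 5]).
    path = os.path.join(feat_folder, "{}.npz".format(image_id))
    data = np.load(path)
    features = data["features"].astype(np.float32)
    boxes = data["boxes"].astype(np.float32)
    return features, features.shape[0], boxes
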
Example #2
    def __getitem__(self, index):
        image_id = self.image_id[index]

        feat_folder = self.feat_folder
        features, num_boxes, boxes = utils.image_features_reader(
            feat_folder, image_id)

        mix_num_boxes = min(int(num_boxes), self.max_region_num)
        mix_boxes_pad = np.zeros((self.max_region_num, 5))
        mix_features_pad = np.zeros((self.max_region_num, 2048))

        image_mask = [1] * int(mix_num_boxes)
        while len(image_mask) < self.max_region_num:
            image_mask.append(0)

        mix_boxes_pad[:mix_num_boxes] = boxes[:mix_num_boxes]
        mix_features_pad[:mix_num_boxes] = features[:mix_num_boxes]

        features = torch.tensor(mix_features_pad).float()
        image_mask = torch.tensor(image_mask).long()
        spatials = torch.tensor(mix_boxes_pad).float()

        # Decode-time inputs: every position seeded with CLS_ID, targets set
        # to -1, segment ids set to 1, and a lower-triangular (causal) mask.
        input_seq = np.array([cfg.MODEL.CLS_ID] * self.max_seq_length)
        target_seq = np.array([-1] * self.max_seq_length)
        segment_ids = np.array([1] * self.max_seq_length)
        input_mask = torch.tril(
            torch.ones((self.max_seq_length, self.max_seq_length),
                       dtype=torch.long))

        return (
            features,
            spatials,
            image_mask,
            input_seq,
            target_seq,
            input_mask,
            segment_ids,
            image_id,
        )
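
For reference, the torch.tril call used in Examples #2 and #3 builds a lower-triangular mask so that position i may only attend to positions 0..i. A small sketch with an illustrative max_seq_length of 4 (the real value comes from the dataset configuration):

import torch

max_seq_length = 4  # illustrative only
input_mask = torch.tril(torch.ones((max_seq_length, max_seq_length), dtype=torch.long))
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [1, 1, 1, 1]])
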
Example #3
    def __getitem__(self, index):
        entry = self.entries[index]
        image_id = entry["image_id"]

        feat_folder = self.feat_folder
        features, num_boxes, boxes = utils.image_features_reader(
            feat_folder, image_id)

        mix_num_boxes = min(int(num_boxes), self.max_region_num)
        mix_boxes_pad = np.zeros((self.max_region_num, 5))
        mix_features_pad = np.zeros((self.max_region_num, 2048))

        image_mask = [1] * int(mix_num_boxes)
        while len(image_mask) < self.max_region_num:
            image_mask.append(0)

        mix_boxes_pad[:mix_num_boxes] = boxes[:mix_num_boxes]
        mix_features_pad[:mix_num_boxes] = features[:mix_num_boxes]

        features = torch.tensor(mix_features_pad).float()
        image_mask = torch.tensor(image_mask).long()
        spatials = torch.tensor(mix_boxes_pad).float()

        # One row per sampled caption, each padded to max_seq_length.
        input_seq = np.zeros((self.seq_per_img, self.max_seq_length),
                             dtype='int')
        target_seq = np.zeros((self.seq_per_img, self.max_seq_length),
                              dtype='int')
        segment_ids = np.zeros((self.seq_per_img, self.max_seq_length),
                               dtype='int')

        if self.split == 'train':
            # Sample exactly seq_per_img captions: if the image has enough,
            # draw them without replacement; otherwise keep all of them and
            # fill the remaining rows by re-sampling from the same pool.
            sents_num = len(entry['input_seq'])
            if sents_num >= self.seq_per_img:
                sid = 0
                ixs = random.sample(range(sents_num), self.seq_per_img)
            else:
                sid = sents_num
                ixs = random.sample(range(sents_num),
                                    self.seq_per_img - sents_num)
                input_seq[0:sents_num, :] = entry['input_seq']
                target_seq[0:sents_num, :] = entry['target_seq']
                segment_ids[0:sents_num, :] = entry['segment_ids']

            for i, ix in enumerate(ixs):
                input_seq[sid + i] = entry['input_seq'][ix]
                target_seq[sid + i] = entry['target_seq'][ix]
                segment_ids[sid + i] = entry['segment_ids'][ix]

            # Causal (lower-triangular) mask, replicated once per caption.
            input_mask = torch.tril(
                torch.ones((self.max_seq_length, self.max_seq_length),
                           dtype=torch.long))
            input_mask = input_mask.unsqueeze(0).expand(
                [self.seq_per_img, self.max_seq_length, self.max_seq_length])

        else:
            # Evaluation: a single decode sequence seeded with CLS_ID,
            # targets set to -1, and a causal mask, as in Example #2.
            input_seq = np.array([cfg.MODEL.CLS_ID] * self.max_seq_length)
            target_seq = np.array([-1] * self.max_seq_length)
            segment_ids = np.array([1] * self.max_seq_length)
            input_mask = torch.tril(
                torch.ones((self.max_seq_length, self.max_seq_length),
                           dtype=torch.long))

        return (
            features,
            spatials,
            image_mask,
            input_seq,
            target_seq,
            input_mask,
            segment_ids,
            image_id,
        )
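
In the train branch above, an image with fewer than self.seq_per_img captions keeps all of them and fills the remaining rows by re-sampling, without replacement, from the same pool. This implicitly assumes seq_per_img - sents_num <= sents_num; otherwise random.sample raises ValueError. A tiny illustration with assumed counts:

import random

seq_per_img = 5
sents_num = 3                                   # assumed: 3 reference captions
sid = sents_num                                 # rows 0..2 hold all captions
ixs = random.sample(range(sents_num), seq_per_img - sents_num)
# rows 3 and 4 are filled with the two re-sampled captions
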
Example #4
    def __getitem__(self, index):
        entry = self.entries[index]
        anno_id = entry["anno_id"]
        # metadata_fn minus its 5-character extension (presumably ".json")
        # serves as the image key for both feature readers.
        img_query = entry["metadata_fn"][:-5]
        features, num_boxes, boxes = utils.image_features_reader(
            self.feat_folder, img_query)
        gt_features, gt_num_boxes, gt_boxes = utils.image_features_reader(
            self.gt_feat_folder, img_query)

        # Merge the two whole-image features (row 0 of each set) as an
        # average weighted by box count.
        features[0] = (features[0] * num_boxes + gt_features[0] *
                       gt_num_boxes) / (num_boxes + gt_num_boxes)

        # Drop the whole-image entry from the ground-truth set so only real
        # ground-truth regions remain.
        gt_boxes = gt_boxes[1:gt_num_boxes]
        gt_features = gt_features[1:gt_num_boxes]
        gt_num_boxes = gt_num_boxes - 1

        gt_box_preserve = min(self.max_region_num - 1, gt_num_boxes)
        gt_boxes = gt_boxes[:gt_box_preserve]
        gt_features = gt_features[:gt_box_preserve]
        gt_num_boxes = gt_box_preserve

        num_box_preserve = min(self.max_region_num - int(gt_num_boxes),
                               int(num_boxes))
        boxes = boxes[:num_box_preserve]
        features = features[:num_box_preserve]

        # concatenate the boxes
        mix_boxes = np.concatenate((boxes, gt_boxes), axis=0)
        mix_features = np.concatenate((features, gt_features), axis=0)
        mix_num_boxes = num_box_preserve + int(gt_num_boxes)

        image_mask = [1] * mix_num_boxes
        while len(image_mask) < self.max_region_num:
            image_mask.append(0)

        mix_boxes_pad = np.zeros((self.max_region_num, 5))
        mix_features_pad = np.zeros((self.max_region_num, 2048))

        mix_boxes_pad[:mix_num_boxes] = mix_boxes[:mix_num_boxes]
        mix_features_pad[:mix_num_boxes] = mix_features[:mix_num_boxes]

        # Convert the padded features, mask, and boxes to tensors.
        features = torch.tensor(mix_features_pad).float()
        image_mask = torch.tensor(image_mask).long()
        spatials = torch.tensor(mix_boxes_pad).float()

        input_ids = torch.from_numpy(np.array(entry["input_ids"]))
        input_mask = torch.from_numpy(np.array(entry["input_mask"]))
        segment_ids = torch.from_numpy(np.array(entry["segment_ids"]))
        target = 0  # constant placeholder target

        return (
            features,
            spatials,
            image_mask,
            input_ids,
            target,
            input_mask,
            segment_ids,
            anno_id,
        )
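
The box-merging logic in Example #4 keeps as many detected boxes as still fit once the ground-truth boxes are reserved, then concatenates the two sets. A shape-only sketch under assumed counts (max_region_num = 101 is chosen purely for illustration):

import numpy as np

max_region_num = 101                 # assumed for illustration
num_boxes, gt_num_boxes = 100, 8     # detected boxes vs. ground-truth boxes (after dropping row 0)

num_box_preserve = min(max_region_num - gt_num_boxes, num_boxes)    # 93
mix_num_boxes = num_box_preserve + gt_num_boxes                     # 101 regions in total

boxes = np.zeros((num_boxes, 5))
gt_boxes = np.zeros((gt_num_boxes, 5))
mix_boxes = np.concatenate((boxes[:num_box_preserve], gt_boxes), axis=0)
assert mix_boxes.shape == (mix_num_boxes, 5)
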