def load_feat_sent(self, entry):
    """Load fixed-size padded region features and all caption rows for one entry.

    Args:
        entry: dict containing "image_id" plus parallel per-caption lists
            "token", "input_mask" and "segment_ids".
            NOTE(review): rows within each list are assumed equal-length —
            the original torch.stack already required that; confirm upstream.

    Returns:
        Tuple of (features, image_mask, spatials, caption, input_mask,
        segment_ids); features/spatials are float tensors padded to
        self.max_region_num rows, image_mask is 1 for real boxes and 0
        for padding, the last three are long tensors of shape
        (num_captions, seq_len).
    """
    image_id = entry["image_id"]
    features, num_boxes, boxes = utils.image_features_reader(
        self.feat_folder, image_id)

    # Truncate to at most max_region_num boxes, then zero-pad up to it.
    mix_num_boxes = min(int(num_boxes), self.max_region_num)
    mix_boxes_pad = np.zeros((self.max_region_num, 5))
    mix_features_pad = np.zeros((self.max_region_num, 2048))
    mix_boxes_pad[:mix_num_boxes] = boxes[:mix_num_boxes]
    mix_features_pad[:mix_num_boxes] = features[:mix_num_boxes]

    # Mask built arithmetically instead of the original while/append loop.
    image_mask = [1] * mix_num_boxes + [0] * (self.max_region_num - mix_num_boxes)

    features = torch.tensor(mix_features_pad).float()
    image_mask = torch.tensor(image_mask).long()
    spatials = torch.tensor(mix_boxes_pad).float()

    # All caption rows share one length, so convert the nested lists in a
    # single numpy pass instead of per-row tensor creation + torch.stack.
    caption = torch.from_numpy(np.array(entry["token"]))
    input_mask = torch.from_numpy(np.array(entry["input_mask"]))
    segment_ids = torch.from_numpy(np.array(entry["segment_ids"]))

    return features, image_mask, spatials, caption, input_mask, segment_ids
def __getitem__(self, index):
    """Return one inference-time sample: padded image features plus an
    empty decoding seed (all-[CLS] input, ignored targets, causal mask)."""
    image_id = self.image_id[index]
    features, num_boxes, boxes = utils.image_features_reader(
        self.feat_folder, image_id)

    # Keep at most max_region_num boxes; zero-pad the rest.
    n_keep = min(int(num_boxes), self.max_region_num)
    padded_boxes = np.zeros((self.max_region_num, 5))
    padded_feats = np.zeros((self.max_region_num, 2048))
    padded_boxes[:n_keep] = boxes[:n_keep]
    padded_feats[:n_keep] = features[:n_keep]

    # 1 marks a real box, 0 marks padding.
    region_mask = [1] * n_keep
    while len(region_mask) < self.max_region_num:
        region_mask.append(0)

    features = torch.tensor(padded_feats).float()
    image_mask = torch.tensor(region_mask).long()
    spatials = torch.tensor(padded_boxes).float()

    # Decoding seed: every slot starts as [CLS]; -1 targets are ignored
    # by the loss; segment id 1 throughout.
    input_seq = np.array([cfg.MODEL.CLS_ID] * self.max_seq_length)
    target_seq = np.array([-1] * self.max_seq_length)
    segment_ids = np.array([1] * self.max_seq_length)

    # Lower-triangular mask for auto-regressive attention.
    input_mask = torch.tril(
        torch.ones((self.max_seq_length, self.max_seq_length),
                   dtype=torch.long))

    return (
        features,
        spatials,
        image_mask,
        input_seq,
        target_seq,
        input_mask,
        segment_ids,
        image_id,
    )
def __getitem__(self, index):
    """Return one sample: padded image features plus caption sequences.

    Train split: exactly self.seq_per_img caption rows are produced.
    When the entry has at least seq_per_img captions, that many distinct
    ones are sampled; otherwise all captions are kept and the remainder
    is filled by resampling indices.
    Other splits: a single [CLS]-seeded row for auto-regressive decoding.
    """
    entry = self.entries[index]
    image_id = entry["image_id"]
    features, num_boxes, boxes = utils.image_features_reader(
        self.feat_folder, image_id)

    # Truncate to max_region_num boxes and zero-pad the remainder.
    mix_num_boxes = min(int(num_boxes), self.max_region_num)
    mix_boxes_pad = np.zeros((self.max_region_num, 5))
    mix_features_pad = np.zeros((self.max_region_num, 2048))
    image_mask = [1] * mix_num_boxes
    while len(image_mask) < self.max_region_num:
        image_mask.append(0)
    mix_boxes_pad[:mix_num_boxes] = boxes[:mix_num_boxes]
    mix_features_pad[:mix_num_boxes] = features[:mix_num_boxes]

    features = torch.tensor(mix_features_pad).float()
    image_mask = torch.tensor(image_mask).long()
    spatials = torch.tensor(mix_boxes_pad).float()

    input_seq = np.zeros((self.seq_per_img, self.max_seq_length), dtype='int')
    target_seq = np.zeros((self.seq_per_img, self.max_seq_length), dtype='int')
    segment_ids = np.zeros((self.seq_per_img, self.max_seq_length), dtype='int')

    if self.split == 'train':
        sents_num = len(entry['input_seq'])
        if sents_num >= self.seq_per_img:
            # Enough captions: take seq_per_img distinct ones.
            sid = 0
            ixs = random.sample(range(sents_num), self.seq_per_img)
        else:
            # Too few captions: keep them all, then top up by resampling.
            sid = sents_num
            need = self.seq_per_img - sents_num
            if need <= sents_num:
                ixs = random.sample(range(sents_num), need)
            else:
                # BUGFIX: random.sample raises ValueError when asked for
                # more items than the population (need > sents_num, i.e.
                # seq_per_img > 2 * sents_num); sample with replacement.
                ixs = random.choices(range(sents_num), k=need)
            input_seq[0:sents_num, :] = entry['input_seq']
            target_seq[0:sents_num, :] = entry['target_seq']
            segment_ids[0:sents_num, :] = entry['segment_ids']

        for i, ix in enumerate(ixs):
            input_seq[sid + i] = entry['input_seq'][ix]
            target_seq[sid + i] = entry['target_seq'][ix]
            segment_ids[sid + i] = entry['segment_ids'][ix]

        # Causal mask, broadcast across the seq_per_img caption rows.
        input_mask = torch.tril(
            torch.ones((self.max_seq_length, self.max_seq_length),
                       dtype=torch.long))
        input_mask = input_mask.unsqueeze(0).expand(
            [self.seq_per_img, self.max_seq_length, self.max_seq_length])
    else:
        # Decoding seed: all-[CLS] input, ignored targets, causal mask.
        input_seq = np.array([cfg.MODEL.CLS_ID] * self.max_seq_length)
        target_seq = np.array([-1] * self.max_seq_length)
        segment_ids = np.array([1] * self.max_seq_length)
        input_mask = torch.tril(
            torch.ones((self.max_seq_length, self.max_seq_length),
                       dtype=torch.long))

    return (
        features,
        spatials,
        image_mask,
        input_seq,
        target_seq,
        input_mask,
        segment_ids,
        image_id,
    )
def __getitem__(self, index):
    """Return one sample whose detector features are merged with
    ground-truth region features before padding to max_region_num."""
    entry = self.entries[index]
    anno_id = entry["anno_id"]
    # Presumably strips a 5-char extension (e.g. ".json") — verify upstream.
    img_query = entry["metadata_fn"][:-5]

    features, num_boxes, boxes = utils.image_features_reader(
        self.feat_folder, img_query)
    gt_features, gt_num_boxes, gt_boxes = utils.image_features_reader(
        self.gt_feat_folder, img_query)

    # Fuse the two row-0 features (presumably whole-image vectors) with a
    # box-count weighted average. Must happen before gt rows are trimmed.
    features[0] = (features[0] * num_boxes + gt_features[0] * gt_num_boxes) \
        / (num_boxes + gt_num_boxes)

    # Drop the gt row 0, then cap the remaining gt boxes at
    # max_region_num - 1 (folds the original two-step gt_box_preserve).
    gt_boxes = gt_boxes[1:gt_num_boxes]
    gt_features = gt_features[1:gt_num_boxes]
    gt_num_boxes = min(self.max_region_num - 1, gt_num_boxes - 1)
    gt_boxes = gt_boxes[:gt_num_boxes]
    gt_features = gt_features[:gt_num_boxes]

    # Detector boxes fill whatever room the gt boxes left.
    num_keep = min(self.max_region_num - int(gt_num_boxes), int(num_boxes))
    boxes = boxes[:num_keep]
    features = features[:num_keep]

    merged_boxes = np.concatenate((boxes, gt_boxes), axis=0)
    merged_feats = np.concatenate((features, gt_features), axis=0)
    total_boxes = num_keep + int(gt_num_boxes)

    # 1 per real (merged) box, 0 per padding slot.
    image_mask = [1] * total_boxes + [0] * (self.max_region_num - total_boxes)

    boxes_pad = np.zeros((self.max_region_num, 5))
    feats_pad = np.zeros((self.max_region_num, 2048))
    boxes_pad[:total_boxes] = merged_boxes[:total_boxes]
    feats_pad[:total_boxes] = merged_feats[:total_boxes]

    features = torch.tensor(feats_pad).float()
    image_mask = torch.tensor(image_mask).long()
    spatials = torch.tensor(boxes_pad).float()

    input_ids = torch.from_numpy(np.array(entry["input_ids"]))
    input_mask = torch.from_numpy(np.array(entry["input_mask"]))
    segment_ids = torch.from_numpy(np.array(entry["segment_ids"]))
    target = 0

    return (
        features,
        spatials,
        image_mask,
        input_ids,
        target,
        input_mask,
        segment_ids,
        anno_id,
    )