def __getitem__(self, index):
    ix = self.split_ix[index]

    # load the image metadata.
    image_id = self.info['images'][ix]['id']
    file_path = self.info['images'][ix]['file_path']

    # load the proposals. Copy defensively, matching the sibling loaders,
    # in case utils.resize_bbox below modifies its input in place.
    num_proposal = int(self.num_proposals[ix])
    num_nms = int(self.num_nms[ix])
    proposals = copy.deepcopy(self.label_proposals[ix])
    proposals = proposals[:num_nms, :]

    coco_split = file_path.split('/')[0]

    # get the ground truth bounding boxes.
    if coco_split == 'train2014':
        coco = self.coco_train
    else:
        coco = self.coco_val

    bbox_ann_ids = coco.getAnnIds(imgIds=image_id)
    bbox_ann = [{'label': self.ctol[i['category_id']], 'bbox': i['bbox']}
                for i in coco.loadAnns(bbox_ann_ids)]

    gt_bboxs = np.zeros((len(bbox_ann), 5))
    for i, bbox in enumerate(bbox_ann):
        gt_bboxs[i, :4] = bbox['bbox']
        gt_bboxs[i, 4] = bbox['label']

    # convert from x, y, w, h to x_min, y_min, x_max, y_max.
    gt_bboxs[:, 2] = gt_bboxs[:, 2] + gt_bboxs[:, 0]
    gt_bboxs[:, 3] = gt_bboxs[:, 3] + gt_bboxs[:, 1]

    # load the image.
    img = Image.open(os.path.join(self.opt.image_path, file_path)).convert('RGB')
    width, height = img.size

    # resize the image.
    img = self.Resize(img)

    # resize the gt_bboxs and proposals accordingly.
    if self.split == 'train':
        proposals = utils.resize_bbox(proposals, width, height,
                                      self.opt.image_size, self.opt.image_size)
        gt_bboxs = utils.resize_bbox(gt_bboxs, width, height,
                                     self.opt.image_size, self.opt.image_size)
    else:
        proposals = utils.resize_bbox(proposals, width, height,
                                      self.opt.image_crop_size, self.opt.image_crop_size)
        gt_bboxs = utils.resize_bbox(gt_bboxs, width, height,
                                     self.opt.image_crop_size, self.opt.image_crop_size)

    # crop the image and the bounding boxes.
    img, proposals, gt_bboxs = self.RandomCropWithBbox(img, proposals, gt_bboxs)

    # drop gt boxes whose area collapsed during cropping.
    gt_x = gt_bboxs[:, 2] - gt_bboxs[:, 0] + 1
    gt_y = gt_bboxs[:, 3] - gt_bboxs[:, 1] + 1
    gt_area_nonzero = (gt_x != 1) & (gt_y != 1)
    gt_bboxs = gt_bboxs[gt_area_nonzero]

    captions = self.caption_file[ix]

    # given the gt boxes and the captions, determine which words belong to a detection.
    det_indicator = self.get_det_word(gt_bboxs, captions)

    # fetch the captions.
    ncap = len(captions)  # number of captions available for this image
    assert ncap > 0, "an image does not have any label. this can be handled but right now isn't"

    # convert each caption into a sequence label.
    cap_seq = np.zeros([ncap, self.seq_length, 5])
    for i, caption in enumerate(captions):
        j = 0
        k = 0
        while j < len(caption) and j < self.seq_length:
            is_det = False
            # an object label has at most two words, so try the 2-gram first.
            for n in range(2, 0, -1):
                if det_indicator[n][i][j][0] != 0:
                    cap_seq[i, k, 0] = det_indicator[n][i][j][0] + self.vocab_size
                    cap_seq[i, k, 1] = det_indicator[n][i][j][1]
                    cap_seq[i, k, 2] = det_indicator[n][i][j][2]
                    cap_seq[i, k, 3] = self.wtoi[caption[j]]
                    cap_seq[i, k, 4] = self.wtoi[caption[j]]
                    is_det = True
                    j += n  # skip the n-gram.
                    break
            if not is_det:
                cap_seq[i, k, 0] = self.wtoi[caption[j]]
                cap_seq[i, k, 4] = cap_seq[i, k, 0]
                j += 1
            k += 1

    # get the mask of the ground truth bounding boxes. The data shape is
    # num_caption x num_box x num_seq.
    box_mask = np.ones((len(captions), gt_bboxs.shape[0], self.seq_length))
    for i in range(len(captions)):
        for j in range(self.seq_length):
            if cap_seq[i, j, 0] > self.vocab_size:
                # zero out the boxes whose class matches the detection token.
                box_mask[i, :, j] = (gt_bboxs[:, 4] != (cap_seq[i, j, 0] - self.vocab_size))
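    # Note (added): each cap_seq entry stores five fields per time step.
    # Column 0 is the token id: a plain word id, or a detection class id
    # offset by vocab_size. Columns 1-2 carry the two auxiliary attributes
    # returned by get_det_word for that detection (not shown here), column 3
    # the word id underlying a detection token, and column 4 the plain word
    # id used as the target sequence.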
    # get the batch version of the seq and box_mask.
    if ncap < self.seq_per_img:
        seq_batch = np.zeros([self.seq_per_img, self.seq_length, 4])
        mask_batch = np.zeros([self.seq_per_img, gt_bboxs.shape[0], self.seq_length])
        # we need to subsample (with replacement).
        for q in range(self.seq_per_img):
            ixl = random.randint(0, ncap - 1)  # stdlib randint is inclusive on both ends
            seq_batch[q, :] = cap_seq[ixl, :, :4]
            mask_batch[q, :] = box_mask[ixl]
    else:
        ixl = random.randint(0, ncap - self.seq_per_img)
        seq_batch = cap_seq[ixl:ixl + self.seq_per_img, :, :4]
        mask_batch = box_mask[ixl:ixl + self.seq_per_img]

    # prepend an empty slot for the start token.
    input_seq = np.zeros([self.seq_per_img, self.seq_length + 1, 4])
    input_seq[:, 1:] = seq_batch

    gt_seq = np.zeros([10, self.seq_length])  # assumes at most 10 gt captions per image
    gt_seq[:ncap, :] = cap_seq[:, :, 4]

    # pad the proposals and gt_bboxs to fixed sizes so they can be batched.
    pad_proposals = np.zeros((self.max_proposal, 6))
    pad_gt_bboxs = np.zeros((self.max_gt_box, 5))
    pad_box_mask = np.ones((self.seq_per_img, self.max_gt_box, self.seq_length + 1))

    if not self.opt.det_oracle:
        num_pps = min(proposals.shape[0], self.max_proposal)
        num_box = min(gt_bboxs.shape[0], self.max_gt_box)
        pad_proposals[:num_pps] = proposals[:num_pps]
        pad_gt_bboxs[:num_box] = gt_bboxs[:num_box]
        pad_box_mask[:, :num_box, 1:] = mask_batch[:, :num_box, :]
    else:
        # oracle setting: use the gt boxes themselves as proposals, with confidence 1.
        num_pps = min(gt_bboxs.shape[0], self.max_proposal)
        pad_proposals[:num_pps] = np.concatenate(
            (gt_bboxs[:num_pps], np.ones([num_pps, 1])), axis=1)
        num_box = min(gt_bboxs.shape[0], self.max_gt_box)
        pad_gt_bboxs[:num_box] = gt_bboxs[:num_box]
        pad_box_mask[:, :num_box, 1:] = mask_batch[:, :num_box, :]

    input_seq = torch.from_numpy(input_seq).long()
    gt_seq = torch.from_numpy(gt_seq).long()
    pad_proposals = torch.from_numpy(pad_proposals).float()
    pad_box_mask = torch.from_numpy(pad_box_mask).byte()
    pad_gt_bboxs = torch.from_numpy(pad_gt_bboxs).float()
    num = torch.FloatTensor([ncap, num_pps, num_box])

    if self.opt.cnn_backend == 'vgg16':
        img = np.array(img, dtype='float32')
        img = img[:, :, ::-1].copy()  # RGB --> BGR
        img -= self.vgg_pixel_mean
        img = torch.from_numpy(img)
        img = img.permute(2, 0, 1).contiguous()
    else:
        img = self.ToTensor(img)
        img = self.res_Normalize(img)

    return img, input_seq, gt_seq, num, pad_proposals, pad_gt_bboxs, pad_box_mask, image_id
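
# Toy example (added, self-contained): the fixed-size padding pattern used
# above. Padding every image to max_proposal rows lets variable numbers of
# detections be stacked into one batch tensor; the true count travels in `num`
# so the model can ignore the all-zero rows.
import numpy as np

max_proposal = 4
proposals = np.array([[10., 20., 50., 80., 3., 0.9],
                      [15., 25., 40., 60., 7., 0.8]])  # 2 real detections
pad_proposals = np.zeros((max_proposal, 6))
pad_proposals[:proposals.shape[0]] = proposals  # rows 2..3 stay all-zero padding
num_pps = proposals.shape[0]  # carried alongside the batch so padding is ignored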
def __getitem__(self, index):
    ix = self.split_ix[index]

    # load image here.
    image_id = self.info['images'][ix]['id']
    file_path = self.info['images'][ix]['file_path']

    # load the proposals from the hdf5 store; deepcopy so that later
    # in-place operations do not mutate the cached item.
    proposal_item = copy.deepcopy(self.dataloader_hdf[ix])
    num_proposal = int(proposal_item['dets_num'])
    num_nms = int(proposal_item['nms_num'])
    proposals = proposal_item['dets_labels']
    proposals = proposals.squeeze()[:num_nms, :]

    coco_split = file_path.split('/')[0]

    # get the ground truth bounding box.
    if coco_split == 'train2014':
        coco = self.coco_train
    else:
        coco = self.coco_val

    bbox_ann_ids = coco.getAnnIds(imgIds=image_id)
    bbox_ann = [{'label': self.ctol[i['category_id']], 'bbox': i['bbox']}
                for i in coco.loadAnns(bbox_ann_ids)]

    gt_bboxs = np.zeros((len(bbox_ann), 5))
    for i, bbox in enumerate(bbox_ann):
        gt_bboxs[i, :4] = bbox['bbox']
        gt_bboxs[i, 4] = bbox['label']

    # convert from x,y,w,h to x_min, y_min, x_max, y_max
    gt_bboxs[:, 2] = gt_bboxs[:, 2] + gt_bboxs[:, 0]
    gt_bboxs[:, 3] = gt_bboxs[:, 3] + gt_bboxs[:, 1]

    # load the image.
    img = Image.open(os.path.join(self.opt.image_path, file_path)).convert('RGB')
    width, height = img.size

    # resize the image.
    img = self.Resize(img)

    if self.split == 'train':
        # resize the gt_bboxs and proposals.
        proposals = utils.resize_bbox(proposals, width, height,
                                      self.opt.image_size, self.opt.image_size)
        gt_bboxs = utils.resize_bbox(gt_bboxs, width, height,
                                     self.opt.image_size, self.opt.image_size)
    else:
        proposals = utils.resize_bbox(proposals, width, height,
                                      self.opt.image_crop_size, self.opt.image_crop_size)
        gt_bboxs = utils.resize_bbox(gt_bboxs, width, height,
                                     self.opt.image_crop_size, self.opt.image_crop_size)

    # crop the image and the bounding box.
    img, proposals, gt_bboxs = self.RandomCropWithBbox(img, proposals, gt_bboxs)

    gt_x = gt_bboxs[:, 2] - gt_bboxs[:, 0] + 1
    gt_y = gt_bboxs[:, 3] - gt_bboxs[:, 1] + 1
    gt_area_nonzero = (gt_x != 1) & (gt_y != 1)
    gt_bboxs = gt_bboxs[gt_area_nonzero]

    captions = self.caption_file[ix]

    # given the bbox_ann and the captions, determine which words belong to a detection.
    det_indicator = self.get_det_word(gt_bboxs, captions)

    # fetch the captions
    ncap = len(captions)  # number of captions available for this image
    assert ncap > 0, "an image does not have any label. this can be handled but right now isn't"

    # convert caption into sequence label.
    cap_seq = np.zeros([ncap, self.seq_length, 5])
    for i, caption in enumerate(captions):
        j = 0
        k = 0
        while j < len(caption) and j < self.seq_length:
            is_det = False
            for n in range(2, 0, -1):  # an object label has at most two words
                if det_indicator[n][i][j][0] != 0:
                    cap_seq[i, k, 0] = det_indicator[n][i][j][0] + self.vocab_size
                    cap_seq[i, k, 1] = det_indicator[n][i][j][1]
                    cap_seq[i, k, 2] = det_indicator[n][i][j][2]
                    cap_seq[i, k, 3] = self.wtoi[caption[j]]
                    cap_seq[i, k, 4] = self.wtoi[caption[j]]
                    is_det = True
                    j += n  # skip the ngram.
                    break
            if not is_det:
                cap_seq[i, k, 0] = self.wtoi[caption[j]]
                cap_seq[i, k, 4] = cap_seq[i, k, 0]
                j += 1
            k += 1

    # get the mask of the ground truth bounding box. The data shape is
    # num_caption x num_box x num_seq
    box_mask = np.ones((len(captions), gt_bboxs.shape[0], self.seq_length))
    for i in range(len(captions)):
        for j in range(self.seq_length):
            if cap_seq[i, j, 0] > self.vocab_size:
                box_mask[i, :, j] = (gt_bboxs[:, 4] != (cap_seq[i, j, 0] - self.vocab_size))
    # get the batch version of the seq and box_mask.
    if ncap < self.seq_per_img:
        seq_batch = np.zeros([self.seq_per_img, self.seq_length, 4])
        mask_batch = np.zeros([self.seq_per_img, gt_bboxs.shape[0], self.seq_length])
        # we need to subsample (with replacement)
        for q in range(self.seq_per_img):
            ixl = random.randint(0, ncap - 1)  # stdlib randint is inclusive on both ends
            seq_batch[q, :] = cap_seq[ixl, :, :4]
            mask_batch[q, :] = box_mask[ixl]
    else:
        ixl = random.randint(0, ncap - self.seq_per_img)
        seq_batch = cap_seq[ixl:ixl + self.seq_per_img, :, :4]
        mask_batch = box_mask[ixl:ixl + self.seq_per_img]

    input_seq = np.zeros([self.seq_per_img, self.seq_length + 1, 4])
    input_seq[:, 1:] = seq_batch

    gt_seq = np.zeros([10, self.seq_length])  # assumes at most 10 gt captions per image
    gt_seq[:ncap, :] = cap_seq[:, :, 4]

    # pad the proposals and gt_bboxs to fixed sizes so they can be batched.
    pad_proposals = np.zeros((self.max_proposal, 6))
    pad_gt_bboxs = np.zeros((self.max_gt_box, 5))
    pad_box_mask = np.ones((self.seq_per_img, self.max_gt_box, self.seq_length + 1))

    if not self.opt.det_oracle:
        num_pps = min(proposals.shape[0], self.max_proposal)
        num_box = min(gt_bboxs.shape[0], self.max_gt_box)
        pad_proposals[:num_pps] = proposals[:num_pps]
        pad_gt_bboxs[:num_box] = gt_bboxs[:num_box]
        pad_box_mask[:, :num_box, 1:] = mask_batch[:, :num_box, :]
    else:
        # oracle setting: use the gt boxes themselves as proposals, with confidence 1.
        num_pps = min(gt_bboxs.shape[0], self.max_proposal)
        pad_proposals[:num_pps] = np.concatenate(
            (gt_bboxs[:num_pps], np.ones([num_pps, 1])), axis=1)
        num_box = min(gt_bboxs.shape[0], self.max_gt_box)
        pad_gt_bboxs[:num_box] = gt_bboxs[:num_box]
        pad_box_mask[:, :num_box, 1:] = mask_batch[:, :num_box, :]

    input_seq = torch.from_numpy(input_seq).long()
    gt_seq = torch.from_numpy(gt_seq).long()
    pad_proposals = torch.from_numpy(pad_proposals).float()
    pad_box_mask = torch.from_numpy(pad_box_mask).byte()
    pad_gt_bboxs = torch.from_numpy(pad_gt_bboxs).float()
    num = torch.FloatTensor([ncap, num_pps, num_box])

    if self.opt.cnn_backend == 'vgg16':
        img = np.array(img, dtype='float32')
        img = img[:, :, ::-1].copy()  # RGB --> BGR
        img -= self.vgg_pixel_mean
        img = torch.from_numpy(img)
        img = img.permute(2, 0, 1).contiguous()
    else:
        img = np.array(img, dtype='float32')
        img = img[:, :, ::-1].copy()  # RGB --> BGR
        img /= 255  # convert range to [0, 1]
        img = self.ToTensor(img)
        img = self.res_Normalize(img)

    return img, input_seq, gt_seq, num, pad_proposals, pad_gt_bboxs, pad_box_mask, image_id
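
# Sketch (added): utils.resize_bbox is defined elsewhere in the repo and is
# not shown here. The helper below illustrates the scaling it presumably
# performs, mapping box coordinates from the original (width, height) onto
# the resized image; treat it as an assumption, not the repo's code.
import numpy as np

def resize_bbox_sketch(bbox, width, height, rwidth, rheight):
    """Scale the [x_min, y_min, x_max, y_max] columns of an (N, >=4) array."""
    bbox = bbox.copy()  # unlike an in-place version, keep the input intact
    if bbox.shape[0] > 0:
        bbox[:, [0, 2]] *= rwidth / float(width)    # x coordinates
        bbox[:, [1, 3]] *= rheight / float(height)  # y coordinates
    return bbox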
def __getitem__(self, index):
    ix = self.split_ix[index]

    # load image here.
    image_id = self.info['images'][ix]['id']
    file_path = self.info['images'][ix]['file_path']

    # load the proposals; deepcopy so later in-place operations do not
    # mutate the cached array.
    num_nms = int(self.num_nms[ix])
    proposals = copy.deepcopy(self.label_proposals[ix])
    proposals = proposals[:num_nms, :]

    # load the pre-extracted region features.
    region_feature = np.load(os.path.join(self.feature_root, str(image_id) + '.npy'))

    # filter out low-confidence proposals and, optionally, background detections.
    prop_thresh_mask = (proposals[:, 5] > self.prop_thresh)
    if self.exclude_bgd_det:
        non_background_idx = np.nonzero(proposals[:, 4] * prop_thresh_mask)[0]
    else:
        non_background_idx = np.nonzero(prop_thresh_mask)[0]
    proposals = proposals[non_background_idx, :]  # exclude __background__ detections
    region_feature = region_feature[non_background_idx, :]
    num_nms = non_background_idx.shape[0]
    region_feature = region_feature[:num_nms, :]

    captions = copy.deepcopy(self.caption_file[ix])

    # flatten the per-caption box annotations into a single list.
    bbox_ann = []
    bbox_idx = 0
    for cap_idx, sent in enumerate(captions):
        sent['bbox_idx'] = []
        for i, box in enumerate(sent['bbox']):
            # we don't care about boxes outside the length limit; after all,
            # our goal is referring, not detection.
            if sent['idx'][i] < self.seq_length:
                sent['bbox_idx'].append(bbox_idx)
                bbox_ann.append({'bbox': box,
                                 'label': self.dtoi[sent['clss'][i]],
                                 'bbox_idx': bbox_idx,
                                 'idx': sent['idx'][i],
                                 'cap_idx': cap_idx})
                bbox_idx += 1

    gt_bboxs = np.zeros((len(bbox_ann), 8))
    for i, bbox in enumerate(bbox_ann):
        gt_bboxs[i, :4] = bbox['bbox']
        gt_bboxs[i, 4] = bbox['label']
        gt_bboxs[i, 5] = bbox['bbox_idx']
        gt_bboxs[i, 6] = bbox['idx']
        gt_bboxs[i, 7] = bbox['cap_idx']

    # load the image.
    img = Image.open(os.path.join(self.opt.image_path, file_path)).convert('RGB')
    width, height = img.size

    # resize the image.
    img = self.Resize(img)

    # resize the gt_bboxs and proposals. ATTENTION: unlike NBT, we do not
    # dynamically crop regions from the feature map during training.
    if self.split == 'train':
        proposals = utils.resize_bbox(proposals, width, height,
                                      self.opt.image_size, self.opt.image_size)
        gt_bboxs = utils.resize_bbox(gt_bboxs, width, height,
                                     self.opt.image_size, self.opt.image_size)
    else:
        proposals = utils.resize_bbox(proposals, width, height,
                                      self.opt.image_crop_size, self.opt.image_crop_size)
        gt_bboxs = utils.resize_bbox(gt_bboxs, width, height,
                                     self.opt.image_crop_size, self.opt.image_crop_size)

    # random crop img and adjust the boxes.
    img, proposals, gt_bboxs = self.RandomCropWithBbox(img, proposals, gt_bboxs)

    # drop gt boxes whose area collapsed during cropping.
    gt_x = gt_bboxs[:, 2] - gt_bboxs[:, 0] + 1
    gt_y = gt_bboxs[:, 3] - gt_bboxs[:, 1] + 1
    gt_area_nonzero = (gt_x != 1) & (gt_y != 1)
    gt_bboxs = gt_bboxs[gt_area_nonzero]

    # given the gt boxes and the captions, determine which words belong to a detection.
    det_indicator = self.get_det_word(gt_bboxs, captions)

    # fetch the captions.
    ncap = len(captions)  # number of captions available for this image
    assert ncap > 0, "an image does not have any label. this can be handled but right now isn't"
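    # Note (added): unlike the loaders above, which match detection words to
    # boxes by class, grounding here is annotated explicitly per word:
    # gt_bboxs columns 5-7 hold the box index, the word position within the
    # caption, and the caption index, so box_mask below is zeroed at exactly
    # the annotated (caption, box, position) triples.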
    # convert caption into sequence label.
    cap_seq = np.zeros([ncap, self.seq_length, 5])
    for i, caption in enumerate(captions):
        j = 0
        while j < len(caption['caption']) and j < self.seq_length:
            if det_indicator[i][j][0] != 0:
                cap_seq[i, j, 0] = det_indicator[i][j][0] + self.vocab_size
                cap_seq[i, j, 1] = det_indicator[i][j][1]
                cap_seq[i, j, 2] = det_indicator[i][j][2]
                cap_seq[i, j, 3] = self.wtoi[caption['caption'][j]]
                cap_seq[i, j, 4] = self.wtoi[caption['caption'][j]]
            else:
                cap_seq[i, j, 0] = self.wtoi[caption['caption'][j]]
                cap_seq[i, j, 4] = self.wtoi[caption['caption'][j]]
            j += 1

    # get the mask of the ground truth bounding box. The data shape is
    # num_caption x num_box x num_seq; zero out the annotated positions.
    box_mask = np.ones((len(captions), gt_bboxs.shape[0], self.seq_length))
    for i in range(gt_bboxs.shape[0]):
        box_mask[int(gt_bboxs[i][7]), i, int(gt_bboxs[i][6])] = 0

    gt_bboxs = gt_bboxs[:, :5]

    # get the batch version of the seq and box_mask.
    if ncap < self.seq_per_img:
        seq_batch = np.zeros([self.seq_per_img, self.seq_length, 4])
        mask_batch = np.zeros([self.seq_per_img, gt_bboxs.shape[0], self.seq_length])
        # we need to subsample (with replacement)
        for q in range(self.seq_per_img):
            ixl = random.randint(0, ncap - 1)  # stdlib randint is inclusive on both ends
            seq_batch[q, :] = cap_seq[ixl, :, :4]
            mask_batch[q, :] = box_mask[ixl]
    else:
        ixl = random.randint(0, ncap - self.seq_per_img)
        seq_batch = cap_seq[ixl:ixl + self.seq_per_img, :, :4]
        mask_batch = box_mask[ixl:ixl + self.seq_per_img]

    input_seq = np.zeros([self.seq_per_img, self.seq_length + 1, 4])
    input_seq[:, 1:] = seq_batch

    gt_seq = np.zeros([10, self.seq_length])  # assumes at most 10 gt captions per image
    gt_seq[:ncap, :] = cap_seq[:, :, 4]

    img_show = np.array(img)  # kept for attention visualization (returned when vis_attn)

    # pad the proposals, gt_bboxs and region features to fixed sizes.
    pad_proposals = np.zeros((self.max_proposal, 6))
    pad_gt_bboxs = np.zeros((self.max_gt_box, 5))
    pad_box_mask = np.ones((self.seq_per_img, self.max_gt_box, self.seq_length + 1))
    pad_region_feature = np.zeros((self.max_proposal, self.att_feat_size))

    if self.aug_gt_det:
        # shift detector class ids past the gt class vocabulary.
        proposals[:, 4] += self.glove_clss.shape[0]
        if self.split == 'train':
            # augment the proposals with the gt bounding boxes; confidence score is 1.
            gt_bboxs_tmp = np.concatenate((gt_bboxs, np.ones((gt_bboxs.shape[0], 1))), axis=1)
            proposals = np.concatenate((gt_bboxs_tmp, proposals), axis=0)

    num_pps = min(proposals.shape[0], self.max_proposal)
    num_box = min(gt_bboxs.shape[0], self.max_gt_box)
    pad_proposals[:num_pps] = proposals[:num_pps]
    pad_gt_bboxs[:num_box] = gt_bboxs[:num_box]
    pad_box_mask[:, :num_box, 1:] = mask_batch[:, :num_box, :]
    pad_region_feature[:num_pps] = region_feature[:num_pps]

    input_seq = torch.from_numpy(input_seq).long()
    gt_seq = torch.from_numpy(gt_seq).long()
    pad_proposals = torch.from_numpy(pad_proposals).float()
    pad_box_mask = torch.from_numpy(pad_box_mask).byte()
    pad_gt_bboxs = torch.from_numpy(pad_gt_bboxs).float()
    pad_region_feature = torch.from_numpy(pad_region_feature).float()
    num = torch.FloatTensor([ncap, num_pps, num_box, width, height])

    if self.opt.cnn_backend == 'vgg16':
        img = np.array(img, dtype='float32')
        img = img[:, :, ::-1].copy()  # RGB --> BGR
        img -= self.vgg_pixel_mean
        img = torch.from_numpy(img)
        img = img.permute(2, 0, 1).contiguous()
    else:
        img = self.ToTensor(img)
        img = self.res_Normalize(img)

    if self.vis_attn:
        return (img, input_seq, gt_seq, num, pad_proposals, pad_gt_bboxs,
                pad_box_mask, image_id, img_show, pad_region_feature)
    else:
        return (img, input_seq, gt_seq, num, pad_proposals, pad_gt_bboxs,
                pad_box_mask, image_id, pad_region_feature)
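
# Worked example (added, self-contained): the box_mask convention shared by
# the loaders above. The mask is 1 by default, and entry (caption, box, step)
# is set to 0 where that caption refers to that gt box at that time step. The
# padded version is shifted one column right to line up with input_seq, whose
# position 0 is reserved for the start token.
import numpy as np

seq_per_img, num_box, seq_length = 1, 2, 5
box_mask = np.ones((seq_per_img, num_box, seq_length))
box_mask[0, 1, 3] = 0  # caption 0 refers to gt box 1 at word position 3

pad_box_mask = np.ones((seq_per_img, num_box, seq_length + 1))
pad_box_mask[:, :, 1:] = box_mask  # column 0 corresponds to the start slot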