def __getitem__(self, index):

        ix = self.split_ix[index]

        # load image here.
        image_id = self.info['images'][ix]['id']
        file_path = self.info['images'][ix]['file_path']

        # load the proposal file
        # proposal_file = self.proposal_file[image_id]
        num_proposal = int(self.num_proposals[ix])
        num_nms = int(self.num_nms[ix])
        proposals = self.label_proposals[ix]
        proposals = proposals[:num_nms, :]
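        # each proposal row is assumed to hold [x1, y1, x2, y2, class_label, score]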

        coco_split = file_path.split('/')[0]
        # get the ground truth bounding box.
        if coco_split == 'train2014':
            coco = self.coco_train
        else:
            coco = self.coco_val

        bbox_ann_ids = coco.getAnnIds(imgIds=image_id)
        bbox_ann = [{
            'label': self.ctol[i['category_id']],
            'bbox': i['bbox']
        } for i in coco.loadAnns(bbox_ann_ids)]

        gt_bboxs = np.zeros((len(bbox_ann), 5))
        for i, bbox in enumerate(bbox_ann):
            gt_bboxs[i, :4] = bbox['bbox']
            gt_bboxs[i, 4] = bbox['label']

        # convert from x,y,w,h to x_min, y_min, x_max, y_max
        gt_bboxs[:, 2] = gt_bboxs[:, 2] + gt_bboxs[:, 0]
        gt_bboxs[:, 3] = gt_bboxs[:, 3] + gt_bboxs[:, 1]

        # load the image.
        img = Image.open(os.path.join(self.opt.image_path,
                                      file_path)).convert('RGB')

        width, height = img.size
        # resize the image.
        img = self.Resize(img)

        if self.split == 'train':
            # resize the gt_bboxs and proposals.
            proposals = utils.resize_bbox(proposals, width, height,
                                          self.opt.image_size,
                                          self.opt.image_size)
            gt_bboxs = utils.resize_bbox(gt_bboxs, width, height,
                                         self.opt.image_size,
                                         self.opt.image_size)
        else:
            proposals = utils.resize_bbox(proposals, width, height,
                                          self.opt.image_crop_size,
                                          self.opt.image_crop_size)
            gt_bboxs = utils.resize_bbox(gt_bboxs, width, height,
                                         self.opt.image_crop_size,
                                         self.opt.image_crop_size)

        # crop the image and the bounding box.
        img, proposals, gt_bboxs = self.RandomCropWithBbox(
            img, proposals, gt_bboxs)

        gt_x = (gt_bboxs[:, 2] - gt_bboxs[:, 0] + 1)
        gt_y = (gt_bboxs[:, 3] - gt_bboxs[:, 1] + 1)
        gt_area_nonzero = (((gt_x != 1) & (gt_y != 1)))
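        # True for boxes that did not collapse to a single pixel during the random crop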

        gt_bboxs = gt_bboxs[gt_area_nonzero]
        captions = self.caption_file[ix]

        # given the bbox_ann and caption, this function determines which words belong to a detection.
        det_indicator = self.get_det_word(gt_bboxs, captions)

        # fetch the captions
        ncap = len(captions)  # number of captions available for this image
        assert ncap > 0, 'an image does not have any label. this can be handled but right now isn\'t'

        # convert caption into sequence label.
        cap_seq = np.zeros([ncap, self.seq_length, 5])
        for i, caption in enumerate(captions):
            j = 0
            k = 0
            while j < len(caption) and j < self.seq_length:
                is_det = False
                # an object label has at most two words, so try the 2-gram first
                for n in range(2, 0, -1):
                    if det_indicator[n][i][j][0] != 0:
                        cap_seq[i, k, 0] = det_indicator[n][i][j][0] + self.vocab_size
                        cap_seq[i, k, 1] = det_indicator[n][i][j][1]
                        cap_seq[i, k, 2] = det_indicator[n][i][j][2]
                        cap_seq[i, k, 3] = self.wtoi[caption[j]]
                        cap_seq[i, k, 4] = self.wtoi[caption[j]]

                        is_det = True
                        j += n  # skip the n-gram.
                        break
                if not is_det:
                    cap_seq[i, k, 0] = self.wtoi[caption[j]]
                    cap_seq[i, k, 4] = cap_seq[i, k, 0]
                    j += 1
                k += 1

        # get the mask of the ground truth bounding box. The data shape is
        # num_caption x num_box x num_seq
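        # an entry is 0 where the gt box class matches the visual word at that step, 1 otherwise.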
        box_mask = np.ones((len(captions), gt_bboxs.shape[0], self.seq_length))
        for i in range(len(captions)):
            for j in range(self.seq_length):
                if cap_seq[i, j, 0] > self.vocab_size:
                    box_mask[i, :, j] = (gt_bboxs[:, 4] != (cap_seq[i, j, 0] - self.vocab_size))

        # get the batch version of the seq and box_mask.
        if ncap < self.seq_per_img:
            seq_batch = np.zeros([self.seq_per_img, self.seq_length, 4])
            mask_batch = np.zeros(
                [self.seq_per_img, gt_bboxs.shape[0], self.seq_length])
            # we need to subsample (with replacement)
            for q in range(self.seq_per_img):
                ixl = random.randint(0, ncap - 1)  # randint is inclusive on both ends
                seq_batch[q, :] = cap_seq[ixl, :, :4]
                mask_batch[q, :] = box_mask[ixl]
        else:
            ixl = random.randint(0, ncap - self.seq_per_img)
            seq_batch = cap_seq[ixl:ixl + self.seq_per_img, :, :4]
            mask_batch = box_mask[ixl:ixl + self.seq_per_img]

        input_seq = np.zeros([self.seq_per_img, self.seq_length + 1, 4])
        input_seq[:, 1:] = seq_batch

        gt_seq = np.zeros([10, self.seq_length])
        gt_seq[:ncap, :] = cap_seq[:, :, 4]

        # if self.split == 'train':
        # augment the proposal with the gt bounding box.
        # this is just to make sure there exist proposals which labels to 1.
        # gt_bboxs_tmp = np.concatenate((gt_bboxs, np.ones((gt_bboxs.shape[0],1))), axis=1)
        # proposals = np.concatenate((gt_bboxs_tmp, proposals), axis=0)
        # flag = False
        # for cap in captions:
        #     if 'bus' in cap:
        #         flag = True
        # if flag:
        #     img_show = np.array(img)
        #     img_show2 = copy.deepcopy(img_show)
        #     import cv2
        #     for i in range(gt_bboxs.shape[0]):
        #         class_name = self.itoc[int(gt_bboxs[i, 4])]
        #         bbox = tuple(int(np.round(x)) for x in gt_bboxs[i, :4])
        #         cv2.rectangle(img_show, bbox[0:2], bbox[2:4], (0, 204, 0), 2)
        #         cv2.putText(img_show, '%s: %.3f' % (class_name, 1), (bbox[0], bbox[1] + 15), cv2.FONT_HERSHEY_PLAIN,
        #                     1.0, (0, 0, 255), thickness=1)
        #     cv2.imwrite('gt_boxes.jpg', img_show)

        #     for i in range(proposals.shape[0]):
        #         bbox = tuple(int(np.round(x)) for x in proposals[i, :4])
        #         score =  proposals[i, 5]
        #         class_name = self.itoc[int(proposals[i, 4])]
        #         cv2.rectangle(img_show2, bbox[0:2], bbox[2:4], (0, 204, 0), 2)

        #         cv2.putText(img_show2, '%s: %.3f' % (class_name, score), (bbox[0], bbox[1] + 15), cv2.FONT_HERSHEY_PLAIN,
        #                     1.0, (0, 0, 255), thickness=1)
        #     cv2.imwrite('proposals.jpg', img_show2)

        #     pdb.set_trace()
        # padding the proposals and gt_bboxs
        pad_proposals = np.zeros((self.max_proposal, 6))
        pad_gt_bboxs = np.zeros((self.max_gt_box, 5))
        pad_box_mask = np.ones(
            (self.seq_per_img, self.max_gt_box, self.seq_length + 1))

        if not self.opt.det_oracle:
            num_pps = min(proposals.shape[0], self.max_proposal)
            num_box = min(gt_bboxs.shape[0], self.max_gt_box)

            pad_proposals[:num_pps] = proposals[:num_pps]
            pad_gt_bboxs[:num_box] = gt_bboxs[:num_box]
            pad_box_mask[:, :num_box, 1:] = mask_batch[:, :num_box, :]
        else:
            num_pps = min(gt_bboxs.shape[0], self.max_proposal)
            pad_proposals[:num_pps] = np.concatenate(
                (gt_bboxs[:num_pps], np.ones([num_pps, 1])), axis=1)
            num_box = min(gt_bboxs.shape[0], self.max_gt_box)
            pad_gt_bboxs[:num_box] = gt_bboxs[:num_box]
            pad_box_mask[:, :num_box, 1:] = mask_batch[:, :num_box, :]

        input_seq = torch.from_numpy(input_seq).long()
        gt_seq = torch.from_numpy(gt_seq).long()
        pad_proposals = torch.from_numpy(pad_proposals).float()
        pad_box_mask = torch.from_numpy(pad_box_mask).byte()
        pad_gt_bboxs = torch.from_numpy(pad_gt_bboxs).float()
        num = torch.FloatTensor([ncap, num_pps, num_box])

        if self.opt.cnn_backend == 'vgg16':
            img = np.array(img, dtype='float32')
            img = img[:, :, ::-1].copy()  # RGB --> BGR
            img -= self.vgg_pixel_mean
            img = torch.from_numpy(img)
            img = img.permute(2, 0, 1).contiguous()
        else:
            img = self.ToTensor(img)
            img = self.res_Normalize(img)

        return img, input_seq, gt_seq, num, pad_proposals, pad_gt_bboxs, pad_box_mask, image_id
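
A small standalone sketch of the two image-preprocessing paths used at the end of this example. The constants below are the commonly used Caffe-VGG BGR pixel means and ImageNet normalization statistics; the actual self.vgg_pixel_mean and self.res_Normalize values are set in the dataset constructor, which is not part of this listing.

import numpy as np
import torch
from PIL import Image
from torchvision import transforms

img = Image.new('RGB', (224, 224), color=(128, 64, 32))  # dummy image

# VGG16 path: BGR channel order, mean subtraction, values stay in [0, 255]
vgg_pixel_mean = np.array([103.939, 116.779, 123.68], dtype='float32')  # assumed BGR means
x = np.array(img, dtype='float32')[:, :, ::-1].copy() - vgg_pixel_mean
x = torch.from_numpy(x).permute(2, 0, 1).contiguous()

# ResNet path: PIL image scaled to [0, 1] by ToTensor, then ImageNet-normalized
res_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
y = res_normalize(transforms.ToTensor()(img))
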
    def __getitem__(self, index):
        
        ix = self.split_ix[index]

        # load image here.
        image_id = self.info['images'][ix]['id']
        file_path = self.info['images'][ix]['file_path']

        proposal_item = copy.deepcopy(self.dataloader_hdf[ix])
        num_proposal = int(proposal_item['dets_num'])
        num_nms = int(proposal_item['nms_num'])
        proposals = proposal_item['dets_labels']
        proposals = proposals.squeeze()[:num_nms, :]

        coco_split = file_path.split('/')[0]
        # get the ground truth bounding box.
        if coco_split == 'train2014':
            coco = self.coco_train
        else:
            coco = self.coco_val

        bbox_ann_ids = coco.getAnnIds(imgIds=image_id)
        bbox_ann = [{'label': self.ctol[i['category_id']], 'bbox': i['bbox']} for i in coco.loadAnns(bbox_ann_ids)]

        gt_bboxs = np.zeros((len(bbox_ann), 5))
        for i, bbox in enumerate(bbox_ann):
            gt_bboxs[i, :4] = bbox['bbox']
            gt_bboxs[i, 4] = bbox['label']

        # convert from x,y,w,h to x_min, y_min, x_max, y_max
        gt_bboxs[:,2] = gt_bboxs[:,2] + gt_bboxs[:,0]
        gt_bboxs[:,3] = gt_bboxs[:,3] + gt_bboxs[:,1]

        # load the image.
        img = Image.open(os.path.join(self.opt.image_path, file_path)).convert('RGB')

        width, height = img.size
        # resize the image.
        img = self.Resize(img)

        if self.split == 'train':
            # resize the gt_bboxs and proposals.
            proposals = utils.resize_bbox(proposals, width, height, self.opt.image_size, self.opt.image_size)
            gt_bboxs = utils.resize_bbox(gt_bboxs, width, height, self.opt.image_size, self.opt.image_size)
        else:
            proposals = utils.resize_bbox(proposals, width, height, self.opt.image_crop_size, self.opt.image_crop_size)
            gt_bboxs = utils.resize_bbox(gt_bboxs, width, height, self.opt.image_crop_size, self.opt.image_crop_size)

        # crop the image and the bounding box.
        img, proposals, gt_bboxs = self.RandomCropWithBbox(img, proposals, gt_bboxs)

        gt_x = (gt_bboxs[:,2]-gt_bboxs[:,0]+1)
        gt_y = (gt_bboxs[:,3]-gt_bboxs[:,1]+1)
        gt_area_nonzero = (((gt_x != 1) & (gt_y != 1)))

        gt_bboxs = gt_bboxs[gt_area_nonzero]
        captions = self.caption_file[ix]

        # given the bbox_ann and caption, this function determines which words belong to a detection.
        det_indicator = self.get_det_word(gt_bboxs, captions)

        # fetch the captions
        ncap = len(captions) # number of captions available for this image
        assert ncap > 0, 'an image does not have any label. this can be handled but right now isn\'t'

        # convert caption into sequence label.
        cap_seq = np.zeros([ncap, self.seq_length, 5])
        for i, caption in enumerate(captions):
            j = 0
            k = 0
            while j < len(caption) and j < self.seq_length:
                is_det = False
                for n in range(2, 0, -1):
                    if det_indicator[n][i][j][0] != 0:
                        cap_seq[i,k,0] = det_indicator[n][i][j][0] + self.vocab_size
                        cap_seq[i,k,1] = det_indicator[n][i][j][1]
                        cap_seq[i,k,2] = det_indicator[n][i][j][2]
                        cap_seq[i,k,3] = self.wtoi[caption[j]]
                        cap_seq[i,k,4] = self.wtoi[caption[j]]

                        is_det = True
                        j += n # skip the ngram.
                        break
                if not is_det:
                    cap_seq[i,k,0] = self.wtoi[caption[j]]
                    cap_seq[i,k,4] = cap_seq[i,k,0]
                    j += 1
                k += 1

        # get the mask of the ground truth bounding box. The data shape is
        # num_caption x num_box x num_seq
        box_mask = np.ones((len(captions), gt_bboxs.shape[0], self.seq_length))
        for i in range(len(captions)):
            for j in range(self.seq_length):
                if cap_seq[i,j,0] > self.vocab_size:
                    box_mask[i,:,j] = ((gt_bboxs[:,4] == (cap_seq[i,j,0]-self.vocab_size)) == 0)

        # get the batch version of the seq and box_mask.
        if ncap < self.seq_per_img:
            seq_batch = np.zeros([self.seq_per_img, self.seq_length, 4])
            mask_batch = np.zeros([self.seq_per_img, gt_bboxs.shape[0], self.seq_length])
            # we need to subsample (with replacement)
            for q in range(self.seq_per_img):
                ixl = random.randint(0, ncap - 1)  # randint is inclusive on both ends
                seq_batch[q,:] = cap_seq[ixl,:,:4]
                mask_batch[q,:] = box_mask[ixl]
        else:
            ixl = random.randint(0, ncap - self.seq_per_img)
            seq_batch = cap_seq[ixl:ixl+self.seq_per_img,:,:4]
            mask_batch = box_mask[ixl:ixl+self.seq_per_img]

        input_seq = np.zeros([self.seq_per_img, self.seq_length+1, 4])
        input_seq[:,1:] = seq_batch

        gt_seq = np.zeros([10, self.seq_length])
        gt_seq[:ncap,:] = cap_seq[:,:,4]
        pad_proposals = np.zeros((self.max_proposal, 6))
        pad_gt_bboxs = np.zeros((self.max_gt_box, 5))
        pad_box_mask = np.ones((self.seq_per_img, self.max_gt_box, self.seq_length+1))

        if not self.opt.det_oracle:
            num_pps = min(proposals.shape[0], self.max_proposal)
            num_box = min(gt_bboxs.shape[0], self.max_gt_box)
            pad_proposals[:num_pps] = proposals[:num_pps]
            pad_gt_bboxs[:num_box] = gt_bboxs[:num_box]
            pad_box_mask[:,:num_box,1:] = mask_batch[:,:num_box,:]
        else:
            num_pps = min(gt_bboxs.shape[0], self.max_proposal)
            pad_proposals[:num_pps] = np.concatenate((gt_bboxs[:num_pps], np.ones([num_pps,1])),axis=1)
            num_box = min(gt_bboxs.shape[0], self.max_gt_box)
            pad_gt_bboxs[:num_box] = gt_bboxs[:num_box]
            pad_box_mask[:,:num_box,1:] = mask_batch[:,:num_box,:]

        input_seq = torch.from_numpy(input_seq).long()
        gt_seq = torch.from_numpy(gt_seq).long()
        pad_proposals = torch.from_numpy(pad_proposals).float()
        pad_box_mask = torch.from_numpy(pad_box_mask).byte()
        pad_gt_bboxs = torch.from_numpy(pad_gt_bboxs).float()
        num = torch.FloatTensor([ncap, num_pps, num_box])

        if self.opt.cnn_backend == 'vgg16':
            img = np.array(img, dtype='float32')
            img = img[:,:,::-1].copy() # RGB --> BGR
            img -= self.vgg_pixel_mean
            img = torch.from_numpy(img)
            img = img.permute(2, 0, 1).contiguous()
        else:
            img = np.array(img, dtype='float32')
            img = img[:,:,::-1].copy() # RGB --> BGR
            img /= 255 # Convert range to [0,1]
            img = self.ToTensor(img)
            img = self.res_Normalize(img)

        return img, input_seq, gt_seq, num, pad_proposals, pad_gt_bboxs, pad_box_mask, image_id
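
A minimal consumption sketch for the sample tuple returned by the two examples above. The names are hypothetical: CaptionDataset stands for the dataset class that owns this __getitem__ and opt for its option namespace, neither of which is defined in this listing. Because every sample is padded to fixed sizes, the default collate_fn can stack a batch without a custom collate function.

from torch.utils.data import DataLoader

dataset = CaptionDataset(opt, split='train')  # hypothetical constructor
loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)

for img, input_seq, gt_seq, num, proposals, gt_bboxs, box_mask, image_id in loader:
    # img:        (B, 3, H, W) float
    # input_seq:  (B, seq_per_img, seq_length + 1, 4) long
    # gt_seq:     (B, 10, seq_length) long
    # num:        (B, 3) float, holding [ncap, num_proposals, num_gt_boxes]
    # proposals:  (B, max_proposal, 6); gt_bboxs: (B, max_gt_box, 5)
    # box_mask:   (B, seq_per_img, max_gt_box, seq_length + 1) byte
    break
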
Example #3
    def __getitem__(self, index):

        ix = self.split_ix[index]

        # load image here.
        image_id = self.info['images'][ix]['id']
        file_path = self.info['images'][ix]['file_path']

        # load the proposal file
        num_nms = int(self.num_nms[ix])
        proposals = copy.deepcopy(self.label_proposals[ix])
        proposals = proposals[:num_nms, :]

        region_feature = np.load(
            os.path.join(self.feature_root,
                         str(image_id) + '.npy'))

        # filter out low-confidence proposals or backgrounds
        prop_thresh_mask = (proposals[:, 5] > self.prop_thresh)
        if self.exclude_bgd_det:
            non_background_idx = np.nonzero(proposals[:, 4] *
                                            prop_thresh_mask)[0]
        else:
            non_background_idx = np.nonzero(prop_thresh_mask)[0]
        proposals = proposals[
            non_background_idx, :]  # exclude __background__ detections
        region_feature = region_feature[non_background_idx, :]
        num_nms = non_background_idx.shape[0]

        region_feature = region_feature[:num_nms, :]
        captions = copy.deepcopy(self.caption_file[ix])

        bbox_ann = []
        bbox_idx = 0
        for cap_idx, sent in enumerate(captions):
            sent['bbox_idx'] = []
            for i, box in enumerate(sent['bbox']):
                # we don't care about boxes outside the length limit;
                # after all, our goal is referring, not detection.
                if sent['idx'][i] < self.seq_length:
                    sent['bbox_idx'].append(bbox_idx)
                    bbox_ann.append({'bbox': box, 'label': self.dtoi[sent['clss'][i]],
                                     'bbox_idx': bbox_idx, 'idx': sent['idx'][i],
                                     'cap_idx': cap_idx})
                    bbox_idx += 1

        gt_bboxs = np.zeros((len(bbox_ann), 8))
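        # columns: [x1, y1, x2, y2, label, bbox_idx, word position in caption, caption idx]
        # (boxes appear to be stored as x_min, y_min, x_max, y_max already; no x,y,w,h conversion follows)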
        for i, bbox in enumerate(bbox_ann):
            gt_bboxs[i, :4] = bbox['bbox']
            gt_bboxs[i, 4] = bbox['label']
            gt_bboxs[i, 5] = bbox['bbox_idx']
            gt_bboxs[i, 6] = bbox['idx']
            gt_bboxs[i, 7] = bbox['cap_idx']

        # load the image.
        img = Image.open(os.path.join(self.opt.image_path,
                                      file_path)).convert('RGB')
        width, height = img.size

        # resize the image.
        img = self.Resize(img)

        # resize the gt_bboxs and proposals.
        # ATTENTION, unlike NBT, we do not dynamically crop regions from feature map during training
        if self.split == 'train':
            proposals = utils.resize_bbox(proposals, width, height,
                                          self.opt.image_size,
                                          self.opt.image_size)
            gt_bboxs = utils.resize_bbox(gt_bboxs, width, height,
                                         self.opt.image_size,
                                         self.opt.image_size)
        else:
            proposals = utils.resize_bbox(proposals, width, height,
                                          self.opt.image_crop_size,
                                          self.opt.image_crop_size)
            gt_bboxs = utils.resize_bbox(gt_bboxs, width, height,
                                         self.opt.image_crop_size,
                                         self.opt.image_crop_size)

        img, proposals, gt_bboxs = self.RandomCropWithBbox(
            img, proposals, gt_bboxs)  # random crop img

        gt_x = (gt_bboxs[:, 2] - gt_bboxs[:, 0] + 1)
        gt_y = (gt_bboxs[:, 3] - gt_bboxs[:, 1] + 1)
        gt_area_nonzero = (((gt_x != 1) & (gt_y != 1)))

        gt_bboxs = gt_bboxs[gt_area_nonzero]

        # given the bbox_ann and caption, this function determines which words belong to a detection.
        det_indicator = self.get_det_word(gt_bboxs, captions)
        # fetch the captions
        ncap = len(captions)  # number of captions available for this image
        assert ncap > 0, 'an image does not have any label. this can be handled but right now isn\'t'

        # convert caption into sequence label.
        cap_seq = np.zeros([ncap, self.seq_length, 5])
        for i, caption in enumerate(captions):
            j = 0
            while j < len(caption['caption']) and j < self.seq_length:
                is_det = False
                if det_indicator[i][j][0] != 0:
                    cap_seq[i, j, 0] = det_indicator[i][j][0] + self.vocab_size
                    cap_seq[i, j, 1] = det_indicator[i][j][1]
                    cap_seq[i, j, 2] = det_indicator[i][j][2]
                    cap_seq[i, j, 3] = self.wtoi[caption['caption'][j]]
                    cap_seq[i, j, 4] = self.wtoi[caption['caption'][j]]
                else:
                    cap_seq[i, j, 0] = self.wtoi[caption['caption'][j]]
                    cap_seq[i, j, 4] = self.wtoi[caption['caption'][j]]
                j += 1

        # get the mask of the ground truth bounding box. The data shape is
        # num_caption x num_box x num_seq
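        # an entry is set to 0 at the (caption, box, word position) triple where that box is grounded.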
        box_mask = np.ones((len(captions), gt_bboxs.shape[0], self.seq_length))
        for i in range(gt_bboxs.shape[0]):
            box_mask[int(gt_bboxs[i][7]), i, int(gt_bboxs[i][6])] = 0

        gt_bboxs = gt_bboxs[:, :5]

        # get the batch version of the seq and box_mask.
        if ncap < self.seq_per_img:
            seq_batch = np.zeros([self.seq_per_img, self.seq_length, 4])
            mask_batch = np.zeros(
                [self.seq_per_img, gt_bboxs.shape[0], self.seq_length])
            # we need to subsample (with replacement)
            for q in range(self.seq_per_img):
                ixl = random.randint(0, ncap - 1)  # randint is inclusive on both ends
                seq_batch[q, :] = cap_seq[ixl, :, :4]
                mask_batch[q, :] = box_mask[ixl]
        else:
            ixl = random.randint(0, ncap - self.seq_per_img)
            seq_batch = cap_seq[ixl:ixl + self.seq_per_img, :, :4]
            mask_batch = box_mask[ixl:ixl + self.seq_per_img]

        input_seq = np.zeros([self.seq_per_img, self.seq_length + 1, 4])
        input_seq[:, 1:] = seq_batch

        gt_seq = np.zeros([10, self.seq_length])
        gt_seq[:ncap, :] = cap_seq[:, :, 4]

        img_show = np.array(img)

        # padding the proposals and gt_bboxs
        pad_proposals = np.zeros((self.max_proposal, 6))
        pad_gt_bboxs = np.zeros((self.max_gt_box, 5))
        pad_box_mask = np.ones(
            (self.seq_per_img, self.max_gt_box, self.seq_length + 1))
        pad_region_feature = np.zeros((self.max_proposal, self.att_feat_size))

        if self.aug_gt_det:
            proposals[:, 4] += self.glove_clss.shape[0]
            if self.split == 'train':
                # augment the proposals with the gt bounding boxes; confidence score is 1
                gt_bboxs_tmp = np.concatenate(
                    (gt_bboxs, np.ones((gt_bboxs.shape[0], 1))), axis=1)
                proposals = np.concatenate((gt_bboxs_tmp, proposals), axis=0)

        num_pps = min(proposals.shape[0], self.max_proposal)
        num_box = min(gt_bboxs.shape[0], self.max_gt_box)
        pad_proposals[:num_pps] = proposals[:num_pps]
        pad_gt_bboxs[:num_box] = gt_bboxs[:num_box]
        pad_box_mask[:, :num_box, 1:] = mask_batch[:, :num_box, :]
        pad_region_feature[:num_pps] = region_feature[:num_pps]

        input_seq = torch.from_numpy(input_seq).long()
        gt_seq = torch.from_numpy(gt_seq).long()
        pad_proposals = torch.from_numpy(pad_proposals).float()
        pad_box_mask = torch.from_numpy(pad_box_mask).byte()
        pad_gt_bboxs = torch.from_numpy(pad_gt_bboxs).float()
        pad_region_feature = torch.from_numpy(pad_region_feature).float()
        num = torch.FloatTensor([ncap, num_pps, num_box, width, height])

        if self.opt.cnn_backend == 'vgg16':
            img = np.array(img, dtype='float32')
            img = img[:, :, ::-1].copy()  # RGB --> BGR
            img -= self.vgg_pixel_mean
            img = torch.from_numpy(img)
            img = img.permute(2, 0, 1).contiguous()
        else:
            img = self.ToTensor(img)
            img = self.res_Normalize(img)

        if self.vis_attn:
            return img, input_seq, gt_seq, num, pad_proposals, pad_gt_bboxs, pad_box_mask, image_id, img_show, pad_region_feature
        else:
            return img, input_seq, gt_seq, num, pad_proposals, pad_gt_bboxs, pad_box_mask, image_id, pad_region_feature
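
A small self-contained sketch of the confidence/background filtering applied to the proposals near the top of the example above. The values are toy values; the real prop_thresh and exclude_bgd_det come from the dataset options, and the [x1, y1, x2, y2, class, score] column layout is assumed.

import numpy as np

# toy proposals: [x1, y1, x2, y2, class_label, score]; class 0 marks __background__ here
proposals = np.array([[10, 10, 50, 50, 3, 0.90],
                      [20, 20, 60, 60, 0, 0.95],
                      [30, 30, 70, 70, 7, 0.10]], dtype='float32')
region_feature = np.random.rand(3, 2048).astype('float32')  # toy features, one row per proposal

prop_thresh = 0.2
exclude_bgd_det = True

prop_thresh_mask = proposals[:, 5] > prop_thresh
if exclude_bgd_det:
    # a class label of 0 (__background__) zeroes out the product, so it is filtered as well
    keep = np.nonzero(proposals[:, 4] * prop_thresh_mask)[0]
else:
    keep = np.nonzero(prop_thresh_mask)[0]

proposals = proposals[keep, :]           # only the class-3 proposal survives in this toy case
region_feature = region_feature[keep, :]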