def __getitem__(self, index):
    '''
    Structure of the returned item:
        rpn_targets: RPN targets (filled in only for the training set)
        path: path of the image/annotation
        image_info: [image_height, image_width, scale_factor,
                     original_height, original_width]
        visual: the image, converted from a numpy array to a PIL Image
        objects: ground-truth object boxes and classes
        relations: ground-truth relationships (predicates) between objects
        regions: ground-truth region boxes and phrases (or None)
    '''
    # Sample a random scale to use for each image in this batch
    item = {'rpn_targets': {}}
    target_scale = self.opts[self.cfg_key]['SCALES'][npr.randint(
        0, high=len(self.opts[self.cfg_key]['SCALES']))]
    img = cv2.imread(osp.join(self._data_path, self.annotations[index]['path']))
    img_original_shape = img.shape
    item['path'] = self.annotations[index]['path']
    # Resize the image so that its longer side stays below MAX_SIZE
    img, im_scale = self._image_resize(img, target_scale,
                                       self.opts[self.cfg_key]['MAX_SIZE'])
    # Record [image_height, image_width, scale_factor, original_height, original_width]
    item['image_info'] = np.array([img.shape[0], img.shape[1], im_scale,
                                   img_original_shape[0], img_original_shape[1]],
                                  dtype=np.float)
    # Convert the array to a PIL Image
    item['visual'] = Image.fromarray(img)
    # Apply the optional transform (e.g. ToTensor / normalization)
    if self.transform is not None:
        item['visual'] = self.transform(item['visual'])
    # if self._batch_size > 1:
    #     # padding the image to MAX_SIZE, so all images can be stacked
    #     pad_h = self.opts[self.cfg_key]['MAX_SIZE'] - item['visual'].size(1)
    #     pad_w = self.opts[self.cfg_key]['MAX_SIZE'] - item['visual'].size(2)
    #     item['visual'] = F.pad(item['visual'], (0, pad_w, 0, pad_h)).data

    _annotation = self.annotations[index]
    # Read the object ground truth (box and class) from _annotation['objects']
    gt_boxes_object = np.zeros((len(_annotation['objects']), 5))
    # Box coordinates, rescaled to the resized image
    gt_boxes_object[:, 0:4] = np.array(
        [obj['box'] for obj in _annotation['objects']], dtype=np.float) * im_scale
    # Class label of each box
    gt_boxes_object[:, 4] = np.array(
        [obj['class'] for obj in _annotation['objects']])
    item['objects'] = gt_boxes_object
    if self._image_set == 'train':
        # Calculate the RPN target for objects
        item['rpn_targets']['object'] = anchor_target_layer(
            item['visual'], gt_boxes_object, item['image_info'],
            self._feat_stride, self._rpn_opts['object'],
            mappings=self._rpn_opts['mappings'])

    # Build the ground-truth relationship (predicate) matrix
    gt_relationships = np.zeros(
        [len(_annotation['objects']), len(_annotation['objects'])],
        dtype=np.long)
    for rel in _annotation['relationships']:
        gt_relationships[rel['sub_id'], rel['obj_id']] = rel['predicate']
    item['relations'] = gt_relationships

    if self.use_region:
        # 4 columns for the box, max_size columns for the padded phrase tokens
        gt_boxes_region = np.zeros((len(_annotation['regions']), self.max_size + 4))
        gt_boxes_region[:, 0:4] = np.array(
            [reg['box'] for reg in _annotation['regions']], dtype=np.float) * im_scale
        # Pad each phrase with the <end> token up to max_size
        gt_boxes_region[:, 4:] = np.array(
            [np.pad(reg['phrase'], (0, self.max_size - len(reg['phrase'])),
                    'constant', constant_values=self.voc_sign['end'])
             for reg in _annotation['regions']])
        item['regions'] = gt_boxes_region
        if self._image_set == 'train' and 'region' in self._rpn_opts.keys():
            # Calculate the RPN target for regions
            item['rpn_targets']['region'] = anchor_target_layer(
                item['visual'], gt_boxes_region, item['image_info'],
                self._feat_stride, self._rpn_opts['region'],
                mappings=self._rpn_opts['mappings'])
    else:
        item['regions'] = None
    return item
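# A minimal sketch of the annotation schema that __getitem__ assumes above.
# The keys ('path', 'objects', 'relationships', 'regions') and their fields
# are read directly from the code; the concrete values are hypothetical.
_example_annotation = {
    'path': 'VG_100K/2.jpg',                      # image path relative to self._data_path
    'objects': [
        {'box': [10, 20, 110, 220], 'class': 5},  # [x1, y1, x2, y2] plus class index
        {'box': [50, 60, 150, 260], 'class': 9},
    ],
    'relationships': [
        # sub_id/obj_id index into 'objects'; predicate is a class index
        {'sub_id': 0, 'obj_id': 1, 'predicate': 3},
    ],
    'regions': [
        # 'phrase' is a token-id list, padded to self.max_size with voc_sign['end']
        {'box': [5, 5, 200, 200], 'phrase': [2, 17, 43, 8]},
    ],
}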
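# A hedged usage sketch (not from the original file): each item mixes numpy
# arrays of varying shapes with a PIL/tensor image, so PyTorch's default
# collation would fail; with batch_size=1 a pass-through collate_fn suffices.
# `dataset` stands for an instance of this class and is assumed to exist.
from torch.utils.data import DataLoader

def _identity_collate(batch):
    # Unwrap the single-item batch and return the raw `item` dict as-is
    return batch[0]

# loader = DataLoader(dataset, batch_size=1, shuffle=True,
#                     collate_fn=_identity_collate)
# for item in loader:
#     image, gt_objects = item['visual'], item['objects']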