def _get_image_blob(roidb, scale_inds):
    """Load every roidb image, preprocess it and pack the results into a blob.

    Args:
        roidb: sequence of entries; each entry is read for its ``'image'``
            path and its ``'flipped'`` flag.
        scale_inds: per-image indices into ``cfg.TRAIN.SCALES``.

    Returns:
        A ``(blob, im_scales)`` pair: the output of ``im_list_to_blob`` for
        the processed images, and the list of scale factors returned by
        ``prep_im_for_blob`` (one per image, in roidb order).
    """
    scaled_images = []
    scale_factors = []

    for position, entry in enumerate(roidb):
        # Previously loaded with cv2.imread(entry['image']).
        pixels = imread(entry['image'])

        # A 2-D array has no channel axis: replicate it into three channels.
        if pixels.ndim == 2:
            plane = pixels[:, :, np.newaxis]
            pixels = np.concatenate((plane, plane, plane), axis=2)

        # Reverse the channel axis (rgb -> bgr), since the original pipeline
        # loaded images with cv2.
        pixels = pixels[:, :, ::-1]

        # Flipped entries are mirrored along axis 1.
        if entry['flipped']:
            pixels = pixels[:, ::-1, :]

        wanted_size = cfg.TRAIN.SCALES[scale_inds[position]]
        pixels, factor = prep_im_for_blob(pixels, cfg.PIXEL_MEANS,
                                          wanted_size, cfg.TRAIN.MAX_SIZE)
        scale_factors.append(factor)
        scaled_images.append(pixels)

    # Create a blob to hold the input images.
    return im_list_to_blob(scaled_images), scale_factors
def roi_data(image, target_size=240):
    """Convert one image array into the tensors fed to the Faster R-CNN detector.

    The image goes through ``prep_im_for_blob`` with a fixed pixel mean,
    ``max_size=0`` and ``normalize=None``; the result becomes a float tensor
    whose axes are permuted with ``(2, 0, 1)``.

    Args:
        image: image array accepted by ``prep_im_for_blob``. The rgb -> bgr
            channel reversal is disabled here, so the array is forwarded in
            whatever channel order the caller supplies.
        target_size: forwarded unchanged to ``prep_im_for_blob``.

    Returns:
        ``(image, im_info, gt_boxes, num_boxes)``: the permuted image tensor,
        a tensor holding the first two dimensions of the processed image
        followed by the scale factor, a placeholder tensor of five ones, and
        a ``LongTensor`` containing ``0``.
    """
    # Pixel mean values (BGR order) as a (1, 1, 3) array.  The same mean is
    # used for all networks even though it is not exactly what they were
    # trained with.
    bgr_means = np.array([[[102.9801, 115.9465, 122.7717]]])

    prepared, scale = prep_im_for_blob(im=image,
                                       pixel_means=bgr_means,
                                       target_size=target_size,
                                       max_size=0,
                                       normalize=None)
    dim0, dim1 = prepared.shape[0], prepared.shape[1]
    info_array = np.array([dim0, dim1, scale], dtype=np.float32)

    # numpy -> tensor
    image_tensor = torch.Tensor(prepared.astype(np.float32)).permute(2, 0, 1)
    info_tensor = torch.Tensor(info_array)
    placeholder_boxes = torch.ones(5, dtype=torch.float32)
    box_count = torch.LongTensor([0])

    outputs = (image_tensor, info_tensor, placeholder_boxes, box_count)
    for tensor in outputs:
        tensor.requires_grad = False
    return outputs
def _get_image_blob(roidb, scale_inds):
    """Build an input blob from the roidb images at the requested scales.

    Args:
        roidb: sequence of entries; each entry supplies an ``'image'`` path
            and a ``'flipped'`` flag.
        scale_inds: per-image indices into ``cfg.TRAIN.SCALES``.

    Returns:
        ``(blob, im_scales)``: the blob produced by ``im_list_to_blob`` and
        the list of resize factors reported by ``prep_im_for_blob``.
    """
    resized_images = []
    resize_factors = []

    for idx, record in enumerate(roidb):
        img = imread(record['image'])

        # Images without a channel axis are expanded to three equal channels.
        if len(img.shape) == 2:
            channel = img[:, :, np.newaxis]
            img = np.concatenate((channel, channel, channel), axis=2)

        # Flip the channel order (rgb -> bgr), since the original code
        # loaded images with cv2.
        img = img[:, :, ::-1]

        if record['flipped']:
            # Flip the image horizontally.
            img = img[:, ::-1, :]

        size_goal = cfg.TRAIN.SCALES[scale_inds[idx]]
        # NOTE(review): the scale factor is presumably
        # target_size / float(shorter image side), i.e. the ratio between
        # the original short side and the training size - confirm against
        # prep_im_for_blob.
        img, factor = prep_im_for_blob(img, cfg.PIXEL_MEANS, size_goal,
                                       cfg.TRAIN.MAX_SIZE)
        resize_factors.append(factor)
        resized_images.append(img)

    # Create a blob to hold the input images.
    packed = im_list_to_blob(resized_images)

    # The blob is presumably laid out as [num_images, ..., channels] -
    # TODO confirm against im_list_to_blob. The factors are the per-image
    # resize ratios.
    return packed, resize_factors
def get_evaluate_batch(self, im_path, index):
    """Build the blob dictionary for evaluating a single image file.

    Args:
        im_path: path of the image to load with ``imread``.
        index: identifier stored unchanged under ``'img_id'``.

    Returns:
        dict with the keys ``'data'`` (blob from ``im_list_to_blob``),
        ``'gt_boxes'`` (empty ``(0, 5)`` float32 array), ``'im_info'``
        (a ``1 x 3`` float32 array with the first two dimensions of the
        processed image and the scale factor) and ``'img_id'``.
    """
    pixels = imread(im_path)

    # Give channel-less images three identical channels.
    if pixels.ndim == 2:
        plane = pixels[:, :, np.newaxis]
        pixels = np.concatenate((plane, plane, plane), axis=2)

    # Flip the channel order (rgb -> bgr), since the original code loaded
    # images with cv2.
    pixels = pixels[:, :, ::-1]

    # Only the first configured training scale is used here.
    first_scale = cfg.TRAIN.SCALES[0]
    prepared, scale = prep_im_for_blob(pixels, cfg.PIXEL_MEANS, first_scale,
                                       cfg.TRAIN.MAX_SIZE)

    return {
        'data': im_list_to_blob([prepared]),
        # gt boxes: (x1, y1, x2, y2, cls) - none are available here.
        'gt_boxes': np.empty((0, 5), dtype=np.float32),
        'im_info': np.array([[prepared.shape[0], prepared.shape[1], scale]],
                            dtype=np.float32),
        'img_id': index,
    }
def prepare_im_func(prefix, random_idx, frame_idx, flipped):
    """Load one video frame from disk and preprocess it.

    Args:
        prefix: directory holding the frames; the expected file is
            ``image_<frame_idx zero-padded to 5 digits>.jpg``.
        random_idx: forwarded to ``prep_im_for_blob`` as its last argument.
        frame_idx: number of the frame to load.
        flipped: when truthy, the processed frame is reversed along axis 1.

    Returns:
        The frame returned by ``prep_im_for_blob``, optionally flipped.
    """
    file_name = 'image_' + str(frame_idx).zfill(5) + '.jpg'
    picture = cv2.imread(os.path.join(prefix, file_name))

    # Boundary frame: when the expected file could not be read, fall back to
    # the last entry of the sorted directory listing.
    if picture is None:
        listing = sorted(os.listdir(prefix))
        picture = cv2.imread(os.path.join(prefix, listing[-1]))

    resize_to = tuple(cfg.TRAIN.FRAME_SIZE[::-1])
    picture = prep_im_for_blob(picture, cfg.PIXEL_MEANS, resize_to,
                               cfg.TRAIN.CROP_SIZE, random_idx)

    if flipped:
        picture = picture[:, ::-1, :]

    # Optional visual inspection; blocks until a key is pressed.
    if DEBUG:
        cv2.imshow('frame', picture / 255.0)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return picture
def _get_video_blob(roidb, scale_inds, phase='train'):
    """Build an input blob from the video frames referenced by the roidb.

    For each roidb entry a zero-initialised clip buffer of shape
    ``(cfg.TRAIN.LENGTH[scale_inds[0]], CROP_SIZE, CROP_SIZE, 3)`` is filled
    with the preprocessed frames of the entry's first segment
    (``item['frames'][0]``). When the segment yields fewer frames than the
    buffer holds, the last loaded frame is repeated up to the end of the
    buffer.

    Args:
        roidb: iterable of dict entries. Each entry is read for ``'frames'``,
            ``'fg_name'``, ``'bg_name'`` and ``'flipped'``. The first element
            of ``'frames'`` is indexed as: ``[0]`` truthy selects
            ``'fg_name'`` as the frame directory (otherwise ``'bg_name'``),
            ``[1]`` / ``[2]`` are the start / stop of the frame range, and
            ``[3]`` is the step when ``cfg.INPUT == 'video'``.
        scale_inds: only ``scale_inds[0]`` is used (a single scale is
            implemented).
        phase: ``'train'`` draws a random crop offset and, with
            ``cfg.TEMP_SPARSE_SAMPLING``, random per-segment frame offsets;
            any other value uses the centred crop offset and ``step // 2``.

    Returns:
        The result of ``video_list_to_blob`` applied to the list of clip
        buffers (one per roidb entry).
    """
    processed_videos = []
    crop_size = cfg.TRAIN.CROP_SIZE

    for item in roidb:
        # Just one scale is implemented: the clip length always comes from
        # scale_inds[0].
        video_length = cfg.TRAIN.LENGTH[scale_inds[0]]
        video = np.zeros((video_length, crop_size, crop_size, 3))

        # Largest crop offsets along the two frame dimensions; which entry is
        # width and which is height depends on prep_im_for_blob - TODO confirm.
        slack_first = cfg.TRAIN.FRAME_SIZE[1] - crop_size
        slack_second = cfg.TRAIN.FRAME_SIZE[0] - crop_size
        if phase == 'train':
            # The two draws stay in this order so seeded runs are unchanged.
            random_idx = [np.random.randint(slack_first),
                          np.random.randint(slack_second)]
            # TODO: data augmentation (e.g. choosing among the fixed
            # multi-scale crop offsets from
            # GroupMultiScaleCrop.fill_fix_offset).
        else:
            random_idx = [int(slack_first / 2), int(slack_second / 2)]

        if DEBUG:
            # The previous message also formatted `offsets`, which is only
            # assigned by the disabled augmentation code and therefore raised
            # NameError whenever DEBUG was enabled.
            print("random_idx: {}".format(random_idx))

        video_info = item['frames'][0]
        step = video_info[3] if cfg.INPUT == 'video' else 1
        # Directory that holds the frame images of this entry.
        prefix = item['fg_name'] if video_info[0] else item['bg_name']

        # Start index of every sampled segment inside the clip.
        segment_starts = range(video_info[1], video_info[2], step)
        num_segments = len(segment_starts)

        if cfg.TEMP_SPARSE_SAMPLING:
            if phase == 'train':
                segment_offsets = npr.randint(step, size=num_segments)
            else:
                segment_offsets = np.zeros(num_segments) + step // 2
        else:
            segment_offsets = np.zeros(num_segments)

        for slot, start in enumerate(segment_starts):
            # Frame files are numbered from 1, hence the + 1.
            frame_idx = int(segment_offsets[slot] + start + 1)
            frame_name = 'image_' + str(frame_idx).zfill(5) + '.jpg'
            frame = cv2.imread(os.path.join(prefix, frame_name))

            # Boundary frame: when the expected file could not be read, use
            # the last entry of the sorted directory listing instead.
            if frame is None:
                frames = sorted(os.listdir(prefix))
                frame = cv2.imread(os.path.join(prefix, frames[-1]))

            frame = prep_im_for_blob(frame, cfg.PIXEL_MEANS,
                                     tuple(cfg.TRAIN.FRAME_SIZE[::-1]),
                                     crop_size, random_idx)

            if item['flipped']:
                frame = frame[:, ::-1, :]

            if DEBUG:
                cv2.imshow('frame', frame / 255.0)
                cv2.waitKey(0)
                cv2.destroyAllWindows()

            video[slot] = frame

        # Pad a short clip by repeating its last loaded frame. With no frames
        # loaded the buffer simply stays all zeros.
        if num_segments > 0:
            video[num_segments:video_length] = video[num_segments - 1]

        # One clip per roidb entry. With a single entry per call this matches
        # appending once after the loop.
        processed_videos.append(video)

    # Create a blob to hold the input clips (dimension transposition is done
    # by video_list_to_blob).
    blob = video_list_to_blob(processed_videos)

    return blob