import math

import cv2
import numpy as np
import torch
import torchvision.transforms as transforms

# Project-specific names used below (cfg, rootnet_cfg, model, generate_patch_image,
# process_bbox) are assumed to be imported from the surrounding repository.


def prepare_input(imgpath, bbox):
    cvimg = cv2.imread(imgpath, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
    if not isinstance(cvimg, np.ndarray):
        raise IOError("Fail to read %s" % imgpath)
    img_height, img_width, img_channels = cvimg.shape

    # no augmentation at inference time: unit scale, no rotation/flip/occlusion, identity color scale
    scale, rot, do_flip, color_scale, do_occlusion = 1.0, 0.0, False, [1.0, 1.0, 1.0], False

    # crop the bbox patch from the image; generate_patch_image adds black padding and
    # resizes to the fixed network input size (cfg.input_shape), even if this introduces stretching
    img_patch, trans = generate_patch_image(cvimg, bbox, do_flip, scale, rot, do_occlusion)
    for i in range(img_channels):
        img_patch[:, :, i] = np.clip(img_patch[:, :, i] * color_scale[i], 0, 255)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=cfg.pixel_mean, std=cfg.pixel_std)])
    img_patch = transform(img_patch)
    input_img = img_patch.unsqueeze(0)  # add batch dimension
    return input_img
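# Hedged usage sketch (not part of the original script): running the patch produced by
# prepare_input() through a PoseNet-style network. `posenet_model`, the image path and the
# bbox are hypothetical placeholders, not names defined in this repository.
def predict_pose_for_bbox(posenet_model, imgpath, bbox):
    input_img = prepare_input(imgpath, bbox)       # (1, 3, H, W) normalized patch
    with torch.no_grad():
        pose_3d = posenet_model(input_img.cuda())  # x,y: heatmap pixels, z: root-relative depth
    return pose_3d[0].cpu().numpy()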
def get_input(image, person_boxes):
    person_images = np.zeros((len(person_boxes), 3, rootnet_cfg.input_shape[0], rootnet_cfg.input_shape[1]))
    k_values = np.zeros((len(person_boxes), 1))
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=rootnet_cfg.pixel_mean, std=rootnet_cfg.pixel_std)])

    for i, box in enumerate(person_boxes):
        patch_image, _ = generate_patch_image(image, box, False, 0)
        person_images[i] = transform(patch_image)

        # RootNet k-value: sqrt(real bbox area * focal_x * focal_y / image bbox area).
        # In dataset.py: k_value = np.array([math.sqrt(cfg.bbox_real[0]*cfg.bbox_real[1]*f[0]*f[1]/(area))]).astype(np.float32)
        # Here the focal lengths are approximated by half the image width/height.
        k_values[i] = np.array(
            [math.sqrt(rootnet_cfg.bbox_real[0] * rootnet_cfg.bbox_real[1] *
                       (image.shape[1] / 2) * (image.shape[0] / 2) / (box[3] * box[2]))]).astype(np.float32)

    person_images = torch.Tensor(person_images)
    k_values = torch.Tensor(k_values)
    return person_images, k_values
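# Hedged usage sketch (assumption, not original code): consuming the tensors returned by
# get_input() with a RootNet-style model. `rootnet_model` is a hypothetical placeholder for a
# loaded network that takes (image batch, k-value batch) and returns (N, 3) root coordinates.
def estimate_root_depths(rootnet_model, image, person_boxes):
    person_images, k_values = get_input(image, person_boxes)
    with torch.no_grad():
        roots = rootnet_model(person_images.float().cuda(), k_values.float().cuda())
    return roots[:, 2].cpu().numpy()  # absolute root depth (mm), one value per person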
def rootnet(img, bboxlist):
    # prepare input image
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=cfg.pixel_mean, std=cfg.pixel_std)])
    original_img = img
    original_img_height, original_img_width = original_img.shape[:2]

    bbox_list = bboxlist
    person_num = len(bbox_list)
    root_list = []

    # normalized camera intrinsics
    focal = [1500, 1500]  # x-axis, y-axis

    for n in range(person_num):
        bbox = process_bbox(np.array(bbox_list[n]), original_img_width, original_img_height)
        img, img2bb_trans = generate_patch_image(original_img, bbox, False, 0.0)
        img = transform(img).cuda()[None, :, :, :]
        k_value = np.array(
            [math.sqrt(cfg.bbox_real[0] * cfg.bbox_real[1] * focal[0] * focal[1] / (bbox[2] * bbox[3]))]).astype(np.float32)
        k_value = torch.FloatTensor([k_value]).cuda()[None, :]

        # forward
        with torch.no_grad():
            root_3d = model(img, k_value)  # x,y: pixel, z: absolute root depth (mm)
        img = img[0].cpu().numpy()
        root_3d = root_3d[0].cpu().numpy()

        # visualize the estimated root in 2D image space (x,y: pixel)
        vis_img = img.copy()
        vis_img = vis_img * np.array(cfg.pixel_std).reshape(3, 1, 1) + np.array(cfg.pixel_mean).reshape(3, 1, 1)
        vis_img = vis_img.astype(np.uint8)
        vis_img = vis_img[::-1, :, :]
        vis_img = np.transpose(vis_img, (1, 2, 0)).copy()
        vis_root = np.zeros((2))
        vis_root[0] = root_3d[0] / cfg.output_shape[1] * cfg.input_shape[1]
        vis_root[1] = root_3d[1] / cfg.output_shape[0] * cfg.input_shape[0]
        cv2.circle(vis_img, (int(vis_root[0]), int(vis_root[1])), radius=5, color=(0, 255, 0),
                   thickness=-1, lineType=cv2.LINE_AA)

        root_list.append(root_3d[2])
    return root_list
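# Hedged bridging sketch (assumption, not original code): the PoseNet block below expects a
# `root_depth_list` with one absolute root depth (mm) per bounding box. Using the helper above
# it could be obtained as, for example:
#
#     root_depth_list = rootnet(original_img, bbox_list)
#
# where original_img and bbox_list are the same image and detections fed to the PoseNet loop.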
assert len(bbox_list) == len(root_depth_list)
person_num = len(bbox_list)

# normalized camera intrinsics
focal = [1500, 1500]  # x-axis, y-axis
princpt = [original_img_width / 2, original_img_height / 2]  # x-axis, y-axis
print('focal length: (' + str(focal[0]) + ', ' + str(focal[1]) + ')')
print('principal points: (' + str(princpt[0]) + ', ' + str(princpt[1]) + ')')

# for each cropped and resized human image, forward it through PoseNet
output_pose_2d_list = []
output_pose_3d_list = []
for n in range(person_num):
    bbox = process_bbox(np.array(bbox_list[n]), original_img_width, original_img_height)
    img, img2bb_trans = generate_patch_image(original_img, bbox, False, 1.0, 0.0, False)
    print(img.shape, person_num)
    img = transform(img).cuda()[None, :, :, :]

    # forward
    with torch.no_grad():
        pose_3d = model(img)  # x,y: pixel, z: root-relative depth (mm)

    # inverse affine transform (restore the crop and resize)
    pose_3d = pose_3d[0].cpu().numpy()
    pose_3d[:, 0] = pose_3d[:, 0] / cfg.output_shape[1] * cfg.input_shape[1]
    pose_3d[:, 1] = pose_3d[:, 1] / cfg.output_shape[0] * cfg.input_shape[0]
    pose_3d_xy1 = np.concatenate((pose_3d[:, :2], np.ones_like(pose_3d[:, :1])), 1)
    img2bb_trans_001 = np.concatenate((img2bb_trans, np.array([0, 0, 1]).reshape(1, 3)))
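    # Hedged continuation sketch (assumption, not original code): the excerpt stops just before
    # the inverse affine transform is applied. A typical completion, following 3DMPPE demo
    # conventions, maps the joints back to original-image pixels and back-projects them to
    # camera space with the absolute root depth; cfg.depth_dim and cfg.bbox_3d_shape are
    # assumed config fields, not values confirmed by this excerpt.
    pose_3d[:, :2] = np.dot(np.linalg.inv(img2bb_trans_001),
                            pose_3d_xy1.transpose(1, 0)).transpose(1, 0)[:, :2]
    output_pose_2d_list.append(pose_3d[:, :2].copy())

    # denormalize the discretized depth and add the absolute root depth estimated by RootNet
    pose_3d[:, 2] = (pose_3d[:, 2] / cfg.depth_dim * 2 - 1) * (cfg.bbox_3d_shape[0] / 2) \
                    + root_depth_list[n]

    # pinhole back-projection: X = (u - cx) / fx * Z, Y = (v - cy) / fy * Z
    x_cam = (pose_3d[:, 0] - princpt[0]) / focal[0] * pose_3d[:, 2]
    y_cam = (pose_3d[:, 1] - princpt[1]) / focal[1] * pose_3d[:, 2]
    output_pose_3d_list.append(np.stack((x_cam, y_cam, pose_3d[:, 2]), 1))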