def test_interhand3d_demo():
    # InterHand2.6M demo
    pose_model = init_pose_model(
        'configs/hand/3d_kpt_sview_rgb_img/internet/interhand3d/'
        'res50_interhand3d_all_256x256.py',
        None,
        device='cpu')

    image_name = 'tests/data/interhand2.6m/image2017.jpg'
    det_result = {
        'image_name': image_name,
        'bbox': [50, 50, 50, 50],  # bbox format is 'xywh'
        'camera_param': None,
        'keypoints_3d_gt': None
    }
    det_results = [det_result]
    dataset = pose_model.cfg.data['test']['type']
    dataset_info = DatasetInfo(pose_model.cfg.data['test']['dataset_info'])

    pose_results = inference_interhand_3d_model(
        pose_model, image_name, det_results, dataset=dataset)

    for res in pose_results:
        res['title'] = 'title'

    vis_3d_pose_result(
        pose_model,
        result=pose_results,
        img=det_results[0]['image_name'],
        dataset_info=dataset_info,
    )

    # test special cases
    # Empty det results
    _ = inference_interhand_3d_model(
        pose_model, image_name, [], dataset=dataset)

    if torch.cuda.is_available():
        _ = inference_interhand_3d_model(
            pose_model.cuda(), image_name, det_results, dataset=dataset)

    with pytest.raises(NotImplementedError):
        _ = inference_interhand_3d_model(
            pose_model, image_name, det_results, dataset='test')
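
# Note on the 'xywh' bbox above: it is (left, top, width, height). Detector
# outputs elsewhere in these demos use the 'xyxy' corner format instead, and
# mmpose keeps a private converter (`_xywh2xyxy` in mmpose.apis.inference).
# A simplified illustrative sketch of that conversion, not the library code:
import numpy as np


def xywh2xyxy_sketch(bbox_xywh):
    """bbox_xywh: (N, 4) float array; returns (N, 4) in (x1, y1, x2, y2)."""
    bbox_xyxy = bbox_xywh.copy()
    bbox_xyxy[:, 2] = bbox_xyxy[:, 0] + bbox_xyxy[:, 2]  # x2 = x + w
    bbox_xyxy[:, 3] = bbox_xyxy[:, 1] + bbox_xyxy[:, 3]  # y2 = y + h
    return bbox_xyxy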
def test_pose_lifter_demo():
    # H36M demo
    pose_model = init_pose_model(
        'configs/body/3d_kpt_sview_rgb_img/pose_lift/'
        'h36m/simplebaseline3d_h36m.py',
        None,
        device='cpu')

    pose_det_result = {
        'keypoints': np.zeros((17, 3)),
        'bbox': [50, 50, 50, 50],
        'track_id': 0,
        'image_name': 'tests/data/h36m/S1_Directions_1.54138969_000001.jpg',
    }
    pose_results_2d = [[pose_det_result]]

    dataset = pose_model.cfg.data['test']['type']

    _ = inference_pose_lifter_model(
        pose_model, pose_results_2d, dataset, with_track_id=False)

    pose_lift_results = inference_pose_lifter_model(
        pose_model, pose_results_2d, dataset, with_track_id=True)

    for res in pose_lift_results:
        res['title'] = 'title'
    vis_3d_pose_result(
        pose_model,
        pose_lift_results,
        img=pose_lift_results[0]['image_name'],
        dataset=dataset)

    # test special cases
    # Empty 2D results
    _ = inference_pose_lifter_model(
        pose_model, [[]], dataset, with_track_id=False)

    if torch.cuda.is_available():
        _ = inference_pose_lifter_model(
            pose_model.cuda(), pose_results_2d, dataset, with_track_id=False)

    with pytest.raises(NotImplementedError):
        _ = inference_pose_lifter_model(
            pose_model, pose_results_2d, dataset='test')
def test_pose_lifter_demo():
    # H36M demo
    pose_model = init_pose_model(
        'configs/body/3d_kpt_sview_rgb_img/pose_lift/'
        'h36m/simplebaseline3d_h36m.py',
        None,
        device='cpu')

    pose_det_result = {
        'keypoints': np.zeros((17, 3)),
        'bbox': [50, 50, 50, 50],
        'track_id': 0,
        'image_name': 'tests/data/h36m/S1_Directions_1.54138969_000001.jpg',
    }
    pose_results_2d = [[pose_det_result]]

    dataset_info = DatasetInfo(pose_model.cfg.data['test']['dataset_info'])

    pose_results_2d = extract_pose_sequence(
        pose_results_2d, frame_idx=0, causal=False, seq_len=1, step=1)

    _ = inference_pose_lifter_model(
        pose_model,
        pose_results_2d,
        dataset_info=dataset_info,
        with_track_id=False)

    pose_lift_results = inference_pose_lifter_model(
        pose_model,
        pose_results_2d,
        dataset_info=dataset_info,
        with_track_id=True)

    for res in pose_lift_results:
        res['title'] = 'title'
    vis_3d_pose_result(
        pose_model,
        pose_lift_results,
        img=pose_results_2d[0][0]['image_name'],
        dataset_info=dataset_info)

    # test special cases
    # Empty 2D results
    _ = inference_pose_lifter_model(
        pose_model, [[]], dataset_info=dataset_info, with_track_id=False)

    if torch.cuda.is_available():
        _ = inference_pose_lifter_model(
            pose_model.cuda(),
            pose_results_2d,
            dataset_info=dataset_info,
            with_track_id=False)

    # test videopose3d
    pose_model = init_pose_model(
        'configs/body/3d_kpt_sview_rgb_vid/video_pose_lift/h36m/'
        'videopose3d_h36m_243frames_fullconv_supervised_cpn_ft.py',
        None,
        device='cpu')

    pose_det_result_0 = {
        'keypoints': np.ones((17, 3)),
        'bbox': [50, 50, 100, 100],
        'track_id': 0,
        'image_name': 'tests/data/h36m/S1_Directions_1.54138969_000001.jpg',
    }
    pose_det_result_1 = {
        'keypoints': np.ones((17, 3)),
        'bbox': [50, 50, 100, 100],
        'track_id': 1,
        'image_name': 'tests/data/h36m/S5_SittingDown.54138969_002061.jpg',
    }
    pose_det_result_2 = {
        'keypoints': np.ones((17, 3)),
        'bbox': [50, 50, 100, 100],
        'track_id': 2,
        'image_name': 'tests/data/h36m/S7_Greeting.55011271_000396.jpg',
    }

    pose_results_2d = [[pose_det_result_0], [pose_det_result_1],
                       [pose_det_result_2]]

    dataset_info = DatasetInfo(pose_model.cfg.data['test']['dataset_info'])

    seq_len = pose_model.cfg.test_data_cfg.seq_len
    pose_results_2d_seq = extract_pose_sequence(
        pose_results_2d, 1, causal=False, seq_len=seq_len, step=1)

    pose_lift_results = inference_pose_lifter_model(
        pose_model,
        pose_results_2d_seq,
        dataset_info=dataset_info,
        with_track_id=True,
        image_size=[1000, 1000],
        norm_pose_2d=True)

    for res in pose_lift_results:
        res['title'] = 'title'
    vis_3d_pose_result(
        pose_model,
        pose_lift_results,
        img=pose_results_2d[0][0]['image_name'],
        dataset_info=dataset_info,
    )
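
# The tests above lean on `extract_pose_sequence` to turn per-frame 2D results
# into a fixed-length window around a target frame. A minimal illustrative
# sketch of that padding logic follows; it is an assumption-based
# reimplementation for clarity, not mmpose's actual code.
def _extract_pose_sequence_sketch(pose_results, frame_idx, causal, seq_len,
                                  step=1):
    """Gather a window of per-frame results around ``frame_idx``.

    For a causal model the window covers the ``seq_len`` frames ending at
    ``frame_idx``; otherwise it is centered on ``frame_idx``. Out-of-range
    indices are clamped, which replicates the edge frames as padding.
    """
    if causal:
        frames_left = seq_len - 1
        frames_right = 0
    else:
        frames_left = (seq_len - 1) // 2
        frames_right = frames_left
    num_frames = len(pose_results)
    pose_sequence = []
    for idx in range(frame_idx - frames_left * step,
                     frame_idx + frames_right * step + 1, step):
        idx = min(max(idx, 0), num_frames - 1)  # clamp to valid range
        pose_sequence.append(pose_results[idx])
    return pose_sequence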
def main():
    parser = ArgumentParser()
    parser.add_argument(
        'pose_lifter_config',
        help='Config file for the 2nd stage pose lifter model')
    parser.add_argument(
        'pose_lifter_checkpoint',
        help='Checkpoint file for the 2nd stage pose lifter model')
    parser.add_argument(
        '--pose-detector-config',
        type=str,
        default=None,
        help='Config file for the 1st stage 2D pose detector')
    parser.add_argument(
        '--pose-detector-checkpoint',
        type=str,
        default=None,
        help='Checkpoint file for the 1st stage 2D pose detector')
    parser.add_argument('--img-root', type=str, default='', help='Image root')
    parser.add_argument(
        '--json-file',
        type=str,
        default=None,
        help='Json file containing image and bbox information. Optionally, '
        'the Json file can also contain 2D pose information. See '
        '"only-second-stage"')
    parser.add_argument(
        '--camera-param-file',
        type=str,
        default=None,
        help='Camera parameter file for converting 3D pose predictions from '
        'the camera space to the world space. If None, no conversion will be '
        'applied.')
    parser.add_argument(
        '--only-second-stage',
        action='store_true',
        help='If true, load 2D pose detection result from the Json file and '
        'skip the 1st stage. The pose detection model will be ignored.')
    parser.add_argument(
        '--rebase-keypoint-height',
        action='store_true',
        help='Rebase the predicted 3D pose so its lowest keypoint has a '
        'height of 0 (landing on the ground). This is useful for '
        'visualization when the model does not predict the global position '
        'of the 3D pose.')
    parser.add_argument(
        '--show-ground-truth',
        action='store_true',
        help='If True, show ground truth if it is available. The ground truth '
        'should be contained in the annotations in the Json file with the key '
        '"keypoints_3d" for each instance.')
    parser.add_argument(
        '--show',
        action='store_true',
        default=False,
        help='whether to show img')
    parser.add_argument(
        '--out-img-root',
        type=str,
        default=None,
        help='Root of the output visualization images. '
        'Default not saving the visualization images.')
    parser.add_argument(
        '--device', default='cuda:0', help='Device for inference')
    parser.add_argument('--kpt-thr', type=float, default=0.3)
    parser.add_argument(
        '--radius',
        type=int,
        default=4,
        help='Keypoint radius for visualization')
    parser.add_argument(
        '--thickness',
        type=int,
        default=1,
        help='Link thickness for visualization')

    args = parser.parse_args()
    assert args.show or (args.out_img_root != '')

    coco = COCO(args.json_file)

    # First stage: 2D pose detection
    pose_det_results_list = []
    if args.only_second_stage:
        from mmpose.apis.inference import _xywh2xyxy

        print('Stage 1: load 2D pose results from Json file.')
        for image_id, image in coco.imgs.items():
            image_name = osp.join(args.img_root, image['file_name'])
            ann_ids = coco.getAnnIds(image_id)
            pose_det_results = []
            for ann_id in ann_ids:
                ann = coco.anns[ann_id]
                keypoints = np.array(ann['keypoints']).reshape(-1, 3)
                keypoints[..., 2] = keypoints[..., 2] >= 1
                keypoints_3d = np.array(ann['keypoints_3d']).reshape(-1, 4)
                keypoints_3d[..., 3] = keypoints_3d[..., 3] >= 1
                bbox = np.array(ann['bbox']).reshape(1, -1)

                pose_det_result = {
                    'image_name': image_name,
                    'bbox': _xywh2xyxy(bbox),
                    'keypoints': keypoints,
                    'keypoints_3d': keypoints_3d
                }
                pose_det_results.append(pose_det_result)
            pose_det_results_list.append(pose_det_results)
    else:
        print('Stage 1: 2D pose detection.')

        pose_det_model = init_pose_model(
            args.pose_detector_config,
            args.pose_detector_checkpoint,
            device=args.device.lower())

        assert pose_det_model.cfg.model.type == 'TopDown', 'Only "TopDown" ' \
            'model is supported for the 1st stage (2D pose detection)'

        dataset = pose_det_model.cfg.data['test']['type']

        img_keys = list(coco.imgs.keys())

        for i in mmcv.track_iter_progress(range(len(img_keys))):
            # get bounding box annotations
            image_id = img_keys[i]
            image = coco.loadImgs(image_id)[0]
            image_name = osp.join(args.img_root, image['file_name'])
            ann_ids = coco.getAnnIds(image_id)

            # make person results for single image
            person_results = []
            for ann_id in ann_ids:
                person = {}
                ann = coco.anns[ann_id]
                person['bbox'] = ann['bbox']
                person_results.append(person)

            pose_det_results, _ = inference_top_down_pose_model(
                pose_det_model,
                image_name,
                person_results,
                bbox_thr=None,
                format='xywh',
                dataset=dataset,
                return_heatmap=False,
                outputs=None)

            for res in pose_det_results:
                res['image_name'] = image_name
            pose_det_results_list.append(pose_det_results)

    # Second stage: Pose lifting
    print('Stage 2: 2D-to-3D pose lifting.')

    pose_lift_model = init_pose_model(
        args.pose_lifter_config,
        args.pose_lifter_checkpoint,
        device=args.device.lower())

    assert pose_lift_model.cfg.model.type == 'PoseLifter', 'Only ' \
        '"PoseLifter" model is supported for the 2nd stage ' \
        '(2D-to-3D lifting)'
    dataset = pose_lift_model.cfg.data['test']['type']

    camera_params = None
    if args.camera_param_file is not None:
        camera_params = mmcv.load(args.camera_param_file)

    for i, pose_det_results in enumerate(
            mmcv.track_iter_progress(pose_det_results_list)):
        # 2D-to-3D pose lifting
        # Note that the pose_det_results are regarded as a single-frame pose
        # sequence
        pose_lift_results = inference_pose_lifter_model(
            pose_lift_model,
            pose_results_2d=[pose_det_results],
            dataset=dataset,
            with_track_id=False)

        image_name = pose_det_results[0]['image_name']

        # Pose processing
        pose_lift_results_vis = []
        for idx, res in enumerate(pose_lift_results):
            keypoints_3d = res['keypoints_3d']
            # project to world space
            if camera_params is not None:
                keypoints_3d = _keypoint_camera_to_world(
                    keypoints_3d,
                    camera_params=camera_params,
                    image_name=image_name,
                    dataset=dataset)
            # rebase height (z-axis)
            if args.rebase_keypoint_height:
                keypoints_3d[..., 2] -= np.min(
                    keypoints_3d[..., 2], axis=-1, keepdims=True)
            res['keypoints_3d'] = keypoints_3d
            # Add title
            det_res = pose_det_results[idx]
            instance_id = det_res.get('track_id', idx)
            res['title'] = f'Prediction ({instance_id})'
            pose_lift_results_vis.append(res)
            # Add ground truth
            if args.show_ground_truth:
                if 'keypoints_3d' not in det_res:
                    print('Failed to show ground truth. Please make sure that'
                          ' the instance annotations from the Json file'
                          ' contain "keypoints_3d".')
                else:
                    gt = res.copy()
                    gt['keypoints_3d'] = det_res['keypoints_3d']
                    gt['title'] = f'Ground truth ({instance_id})'
                    pose_lift_results_vis.append(gt)

        # Visualization
        if args.out_img_root is None:
            out_file = None
        else:
            os.makedirs(args.out_img_root, exist_ok=True)
            out_file = osp.join(args.out_img_root, f'vis_{i}.jpg')

        vis_3d_pose_result(
            pose_lift_model,
            result=pose_lift_results_vis,
            img=pose_lift_results[0]['image_name'],
            out_file=out_file)
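
# The --rebase-keypoint-height branch above shifts each pose down so its
# lowest joint sits at z = 0. A tiny self-contained numpy illustration of
# that one-liner (dummy data, not part of the demo):
import numpy as np

kpts_z_demo = np.array([[0.0, 0.0, 1.2], [0.1, 0.2, 0.4], [0.2, 0.1, 0.9]])
kpts_z_demo[..., 2] -= np.min(kpts_z_demo[..., 2], axis=-1, keepdims=True)
assert np.isclose(kpts_z_demo[..., 2].min(), 0.0)  # lowest joint now at z=0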
def main():
    parser = ArgumentParser()
    parser.add_argument('det_config', help='Config file for detection')
    parser.add_argument('det_checkpoint', help='Checkpoint file for detection')
    parser.add_argument(
        'pose_detector_config',
        type=str,
        default=None,
        help='Config file for the 1st stage 2D pose detector')
    parser.add_argument(
        'pose_detector_checkpoint',
        type=str,
        default=None,
        help='Checkpoint file for the 1st stage 2D pose detector')
    parser.add_argument(
        'pose_lifter_config',
        help='Config file for the 2nd stage pose lifter model')
    parser.add_argument(
        'pose_lifter_checkpoint',
        help='Checkpoint file for the 2nd stage pose lifter model')
    parser.add_argument(
        '--video-path', type=str, default='', help='Video path')
    parser.add_argument(
        '--rebase-keypoint-height',
        action='store_true',
        help='Rebase the predicted 3D pose so its lowest keypoint has a '
        'height of 0 (landing on the ground). This is useful for '
        'visualization when the model does not predict the global position '
        'of the 3D pose.')
    parser.add_argument(
        '--norm-pose-2d',
        action='store_true',
        help='Scale the bbox (along with the 2D pose) to the average bbox '
        'scale of the dataset, and move the bbox (along with the 2D pose) to '
        'the average bbox center of the dataset. This is useful when bbox '
        'is small, especially in multi-person scenarios.')
    parser.add_argument(
        '--num-instances',
        type=int,
        default=-1,
        help='The number of 3D poses to be visualized in every frame. If '
        'less than 0, it will be set to the number of pose results in the '
        'first frame.')
    parser.add_argument(
        '--show',
        action='store_true',
        default=False,
        help='whether to show visualizations.')
    parser.add_argument(
        '--out-video-root',
        type=str,
        default=None,
        help='Root of the output video file. '
        'Default not saving the visualization video.')
    parser.add_argument(
        '--device', default='cuda:0', help='Device for inference')
    parser.add_argument(
        '--det-cat-id',
        type=int,
        default=1,
        help='Category id for bounding box detection model')
    parser.add_argument(
        '--bbox-thr',
        type=float,
        default=0.9,
        help='Bounding box score threshold')
    parser.add_argument('--kpt-thr', type=float, default=0.3)
    parser.add_argument(
        '--use-oks-tracking', action='store_true', help='Using OKS tracking')
    parser.add_argument(
        '--tracking-thr', type=float, default=0.3, help='Tracking threshold')
    parser.add_argument(
        '--euro',
        action='store_true',
        help='Using One_Euro_Filter for smoothing')
    parser.add_argument(
        '--radius',
        type=int,
        default=8,
        help='Keypoint radius for visualization')
    parser.add_argument(
        '--thickness',
        type=int,
        default=2,
        help='Link thickness for visualization')

    assert has_mmdet, 'Please install mmdet to run the demo.'

    args = parser.parse_args()
    assert args.show or (args.out_video_root != '')
    assert args.det_config is not None
    assert args.det_checkpoint is not None

    video = mmcv.VideoReader(args.video_path)
    assert video.opened, f'Failed to load video file {args.video_path}'

    # First stage: 2D pose detection
    print('Stage 1: 2D pose detection.')

    person_det_model = init_detector(
        args.det_config, args.det_checkpoint, device=args.device.lower())

    pose_det_model = init_pose_model(
        args.pose_detector_config,
        args.pose_detector_checkpoint,
        device=args.device.lower())

    assert pose_det_model.cfg.model.type == 'TopDown', 'Only "TopDown" ' \
        'model is supported for the 1st stage (2D pose detection)'
    pose_det_dataset = pose_det_model.cfg.data['test']['type']

    pose_det_results_list = []
    next_id = 0
    pose_det_results = []
    for frame in video:
        pose_det_results_last = pose_det_results

        # test a single image, the resulting box is (x1, y1, x2, y2)
        mmdet_results = inference_detector(person_det_model, frame)

        # keep the person class bounding boxes.
        person_det_results = process_mmdet_results(mmdet_results,
                                                   args.det_cat_id)

        # make person results for single image
        pose_det_results, _ = inference_top_down_pose_model(
            pose_det_model,
            frame,
            person_det_results,
            bbox_thr=args.bbox_thr,
            format='xyxy',
            dataset=pose_det_dataset,
            return_heatmap=False,
            outputs=None)

        # get track id for each person instance
        pose_det_results, next_id = get_track_id(
            pose_det_results,
            pose_det_results_last,
            next_id,
            use_oks=args.use_oks_tracking,
            tracking_thr=args.tracking_thr,
            use_one_euro=args.euro,
            fps=video.fps)

        pose_det_results_list.append(copy.deepcopy(pose_det_results))

    # Second stage: Pose lifting
    print('Stage 2: 2D-to-3D pose lifting.')

    pose_lift_model = init_pose_model(
        args.pose_lifter_config,
        args.pose_lifter_checkpoint,
        device=args.device.lower())

    assert pose_lift_model.cfg.model.type == 'PoseLifter', \
        'Only "PoseLifter" model is supported for the 2nd stage ' \
        '(2D-to-3D lifting)'
    pose_lift_dataset = pose_lift_model.cfg.data['test']['type']

    if args.out_video_root in ('', None):
        save_out_video = False
    else:
        os.makedirs(args.out_video_root, exist_ok=True)
        save_out_video = True

    if save_out_video:
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        fps = video.fps
        writer = None

    # convert keypoint definition
    for pose_det_results in pose_det_results_list:
        for res in pose_det_results:
            keypoints = res['keypoints']
            res['keypoints'] = convert_keypoint_definition(
                keypoints, pose_det_dataset, pose_lift_dataset)

    # load temporal padding config from model.data_cfg
    if hasattr(pose_lift_model.cfg, 'test_data_cfg'):
        data_cfg = pose_lift_model.cfg.test_data_cfg
    else:
        data_cfg = pose_lift_model.cfg.data_cfg

    num_instances = args.num_instances
    for i, pose_det_results in enumerate(
            mmcv.track_iter_progress(pose_det_results_list)):
        # extract and pad input pose2d sequence
        pose_results_2d = extract_pose_sequence(
            pose_det_results_list,
            frame_idx=i,
            causal=data_cfg.causal,
            seq_len=data_cfg.seq_len,
            step=data_cfg.seq_frame_interval)

        # 2D-to-3D pose lifting
        pose_lift_results = inference_pose_lifter_model(
            pose_lift_model,
            pose_results_2d=pose_results_2d,
            dataset=pose_lift_dataset,
            with_track_id=True,
            image_size=video.resolution,
            norm_pose_2d=args.norm_pose_2d)

        # Pose processing
        pose_lift_results_vis = []
        for idx, res in enumerate(pose_lift_results):
            keypoints_3d = res['keypoints_3d']
            # exchange y,z-axis, and then reverse the direction of x,z-axis
            keypoints_3d = keypoints_3d[..., [0, 2, 1]]
            keypoints_3d[..., 0] = -keypoints_3d[..., 0]
            keypoints_3d[..., 2] = -keypoints_3d[..., 2]
            # rebase height (z-axis)
            if args.rebase_keypoint_height:
                keypoints_3d[..., 2] -= np.min(
                    keypoints_3d[..., 2], axis=-1, keepdims=True)
            res['keypoints_3d'] = keypoints_3d
            # add title
            det_res = pose_det_results[idx]
            instance_id = det_res['track_id']
            res['title'] = f'Prediction ({instance_id})'
            # only visualize the target frame
            res['keypoints'] = det_res['keypoints']
            res['bbox'] = det_res['bbox']
            res['track_id'] = instance_id
            pose_lift_results_vis.append(res)

        # Visualization
        if num_instances < 0:
            num_instances = len(pose_lift_results_vis)
        img_vis = vis_3d_pose_result(
            pose_lift_model,
            result=pose_lift_results_vis,
            img=video[i],
            out_file=None,
            radius=args.radius,
            thickness=args.thickness,
            num_instances=num_instances)

        if save_out_video:
            if writer is None:
                writer = cv2.VideoWriter(
                    osp.join(args.out_video_root,
                             f'vis_{osp.basename(args.video_path)}'), fourcc,
                    fps, (img_vis.shape[1], img_vis.shape[0]))
            writer.write(img_vis)

    if save_out_video:
        writer.release()
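
# The axis shuffle above (swap y/z, then negate x and z) re-expresses
# camera-frame coordinates (x right, y down, z forward) in a z-up frame that
# the 3D visualizer renders upright. A dummy-data check of that transform,
# assuming those camera-frame conventions:
import numpy as np

cam_demo = np.array([[0.5, -1.0, 3.0]])  # one joint in camera space (x, y, z)
vis_demo = cam_demo[:, [0, 2, 1]]        # exchange y- and z-axis -> (x, z, y)
vis_demo[..., 0] = -vis_demo[..., 0]     # reverse x
vis_demo[..., 2] = -vis_demo[..., 2]     # reverse the new z (old y): down -> up
# a joint above the camera center (negative y in camera space) ends up with
# positive height in the visualization frame:
assert vis_demo[0, 2] > 0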
def main():
    parser = ArgumentParser()
    parser.add_argument('pose_config', help='Config file for pose network')
    parser.add_argument('pose_checkpoint', help='Checkpoint file')
    parser.add_argument('--img-root', type=str, default='', help='Image root')
    parser.add_argument(
        '--json-file',
        type=str,
        default='',
        help='Json file containing image info.')
    parser.add_argument(
        '--camera-param-file',
        type=str,
        default=None,
        help='Camera parameter file for converting 3D pose predictions from '
        'the pixel space to the camera space. If None, keypoints in pixel '
        'space will be visualized.')
    parser.add_argument(
        '--gt-joints-file',
        type=str,
        default=None,
        help='Optional argument. Ground truth 3D keypoint parameter file. '
        'If None, gt keypoints will not be shown and keypoints in pixel '
        'space will be visualized.')
    parser.add_argument(
        '--rebase-keypoint-height',
        action='store_true',
        help='Rebase the predicted 3D pose so its lowest keypoint has a '
        'height of 0 (landing on the ground). This is useful for '
        'visualization when the model does not predict the global position '
        'of the 3D pose.')
    parser.add_argument(
        '--show-ground-truth',
        action='store_true',
        help='If True, show ground truth keypoint if it is available.')
    parser.add_argument(
        '--show',
        action='store_true',
        default=False,
        help='whether to show img')
    parser.add_argument(
        '--out-img-root',
        type=str,
        default=None,
        help='Root of the output visualization images. '
        'Default not saving the visualization images.')
    parser.add_argument(
        '--device', default='cuda:0', help='Device for inference')
    parser.add_argument(
        '--kpt-thr', type=float, default=0.3, help='Keypoint score threshold')
    parser.add_argument(
        '--radius',
        type=int,
        default=4,
        help='Keypoint radius for visualization')
    parser.add_argument(
        '--thickness',
        type=int,
        default=1,
        help='Link thickness for visualization')

    args = parser.parse_args()
    assert args.show or (args.out_img_root != '')

    coco = COCO(args.json_file)

    # build the pose model from a config file and a checkpoint file
    pose_model = init_pose_model(
        args.pose_config, args.pose_checkpoint, device=args.device.lower())
    dataset = pose_model.cfg.data['test']['type']

    # load camera parameters
    camera_params = None
    if args.camera_param_file is not None:
        camera_params = mmcv.load(args.camera_param_file)
    # load ground truth joints parameters
    gt_joint_params = None
    if args.gt_joints_file is not None:
        gt_joint_params = mmcv.load(args.gt_joints_file)

    # load hand bounding boxes
    det_results_list = []
    for image_id, image in coco.imgs.items():
        image_name = osp.join(args.img_root, image['file_name'])
        ann_ids = coco.getAnnIds(image_id)
        det_results = []

        capture_key = str(image['capture'])
        camera_key = image['camera']
        frame_idx = image['frame_idx']

        for ann_id in ann_ids:
            ann = coco.anns[ann_id]
            if camera_params is not None:
                camera_param = {
                    key: camera_params[capture_key][key][camera_key]
                    for key in camera_params[capture_key].keys()
                }
                camera_param = _transform_interhand_camera_param(camera_param)
            else:
                camera_param = None
            if gt_joint_params is not None:
                joint_param = gt_joint_params[capture_key][str(frame_idx)]
                gt_joint = np.concatenate([
                    np.array(joint_param['world_coord']),
                    np.array(joint_param['joint_valid'])
                ],
                                          axis=-1)
            else:
                gt_joint = None

            det_result = {
                'image_name': image_name,
                'bbox': ann['bbox'],  # bbox format is 'xywh'
                'camera_param': camera_param,
                'keypoints_3d_gt': gt_joint
            }
            det_results.append(det_result)
        det_results_list.append(det_results)

    for i, det_results in enumerate(
            mmcv.track_iter_progress(det_results_list)):

        image_name = det_results[0]['image_name']

        pose_results = inference_interhand_3d_model(
            pose_model, image_name, det_results, dataset=dataset)

        # Post processing
        pose_results_vis = []
        for idx, res in enumerate(pose_results):
            keypoints_3d = res['keypoints_3d']
            # normalize kpt score
            if keypoints_3d[:, 3].max() > 1:
                keypoints_3d[:, 3] /= 255
            # get 2D keypoints in pixel space
            res['keypoints'] = keypoints_3d[:, [0, 1, 3]]

            # For model-predicted keypoints, channel 0 and 1 are coordinates
            # in pixel space, and channel 2 is the depth (in mm) relative
            # to root joints.
            # If both camera parameter and absolute depth of root joints are
            # provided, we can transform keypoint to camera space for better
            # visualization.
            camera_param = res['camera_param']
            keypoints_3d_gt = res['keypoints_3d_gt']
            if camera_param is not None and keypoints_3d_gt is not None:
                # build camera model
                camera = SimpleCamera(camera_param)
                # transform gt joints from world space to camera space
                keypoints_3d_gt[:, :3] = camera.world_to_camera(
                    keypoints_3d_gt[:, :3])
                # transform relative depth to absolute depth
                keypoints_3d[:21, 2] += keypoints_3d_gt[20, 2]
                keypoints_3d[21:, 2] += keypoints_3d_gt[41, 2]
                # transform keypoints from pixel space to camera space
                keypoints_3d[:, :3] = camera.pixel_to_camera(
                    keypoints_3d[:, :3])

            # rotate the keypoint to make z-axis correspondent to height
            # for better visualization
            vis_R = np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
            keypoints_3d[:, :3] = keypoints_3d[:, :3] @ vis_R
            if keypoints_3d_gt is not None:
                keypoints_3d_gt[:, :3] = keypoints_3d_gt[:, :3] @ vis_R

            # rebase height (z-axis)
            if args.rebase_keypoint_height:
                valid = keypoints_3d[..., 3] > 0
                keypoints_3d[..., 2] -= np.min(
                    keypoints_3d[valid, 2], axis=-1, keepdims=True)

            res['keypoints_3d'] = keypoints_3d
            res['keypoints_3d_gt'] = keypoints_3d_gt

            # Add title
            instance_id = res.get('track_id', idx)
            res['title'] = f'Prediction ({instance_id})'
            pose_results_vis.append(res)
            # Add ground truth
            if args.show_ground_truth:
                if keypoints_3d_gt is None:
                    print('Failed to show ground truth. Please make sure that'
                          ' gt-joints-file is provided.')
                else:
                    gt = res.copy()
                    if args.rebase_keypoint_height:
                        valid = keypoints_3d_gt[..., 3] > 0
                        keypoints_3d_gt[..., 2] -= np.min(
                            keypoints_3d_gt[valid, 2], axis=-1, keepdims=True)
                    gt['keypoints_3d'] = keypoints_3d_gt
                    gt['title'] = f'Ground truth ({instance_id})'
                    pose_results_vis.append(gt)

        # Visualization
        if args.out_img_root is None:
            out_file = None
        else:
            os.makedirs(args.out_img_root, exist_ok=True)
            out_file = osp.join(args.out_img_root, f'vis_{i}.jpg')

        vis_3d_pose_result(
            pose_model,
            result=pose_results_vis,
            img=det_results[0]['image_name'],
            out_file=out_file,
            dataset=dataset,
            show=args.show,
            kpt_score_thr=args.kpt_thr,
            radius=args.radius,
            thickness=args.thickness,
        )
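
# `SimpleCamera.pixel_to_camera` above back-projects (u, v, d) pixels with
# absolute depth into metric camera coordinates. The underlying pinhole math
# is simple; a hedged standalone sketch (illustrative, assuming focal lengths
# fx, fy and principal point cx, cy; not mmpose's implementation):
import numpy as np


def pixel_to_camera_sketch(kpts_pix, fx, fy, cx, cy):
    """kpts_pix: (N, 3) array of (u, v, depth); returns (N, 3) camera coords."""
    u, v, d = kpts_pix[:, 0], kpts_pix[:, 1], kpts_pix[:, 2]
    x = (u - cx) / fx * d  # undo the perspective projection u = fx * x/z + cx
    y = (v - cy) / fy * d
    return np.stack([x, y, d], axis=-1)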
def test_inference_without_dataset_info():
    # Top down
    pose_model = init_pose_model(
        'configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/'
        'coco/res50_coco_256x192.py',
        None,
        device='cpu')

    if 'dataset_info' in pose_model.cfg:
        _ = pose_model.cfg.pop('dataset_info')

    image_name = 'tests/data/coco/000000000785.jpg'
    person_result = []
    person_result.append({'bbox': [50, 50, 50, 100]})

    with pytest.warns(DeprecationWarning):
        pose_results, _ = inference_top_down_pose_model(
            pose_model, image_name, person_result, format='xywh')

    with pytest.warns(DeprecationWarning):
        vis_pose_result(pose_model, image_name, pose_results)

    with pytest.raises(NotImplementedError):
        with pytest.warns(DeprecationWarning):
            pose_results, _ = inference_top_down_pose_model(
                pose_model,
                image_name,
                person_result,
                format='xywh',
                dataset='test')

    # Bottom up
    pose_model = init_pose_model(
        'configs/body/2d_kpt_sview_rgb_img/associative_embedding/'
        'coco/res50_coco_512x512.py',
        None,
        device='cpu')

    if 'dataset_info' in pose_model.cfg:
        _ = pose_model.cfg.pop('dataset_info')

    image_name = 'tests/data/coco/000000000785.jpg'

    with pytest.warns(DeprecationWarning):
        pose_results, _ = inference_bottom_up_pose_model(
            pose_model, image_name)
    with pytest.warns(DeprecationWarning):
        vis_pose_result(pose_model, image_name, pose_results)

    # Top down tracking
    pose_model = init_pose_model(
        'configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/'
        'coco/res50_coco_256x192.py',
        None,
        device='cpu')

    if 'dataset_info' in pose_model.cfg:
        _ = pose_model.cfg.pop('dataset_info')

    image_name = 'tests/data/coco/000000000785.jpg'
    person_result = [{'bbox': [50, 50, 50, 100]}]

    with pytest.warns(DeprecationWarning):
        pose_results, _ = inference_top_down_pose_model(
            pose_model, image_name, person_result, format='xywh')

    pose_results, _ = get_track_id(pose_results, [], next_id=0)

    with pytest.warns(DeprecationWarning):
        vis_pose_tracking_result(pose_model, image_name, pose_results)

    with pytest.raises(NotImplementedError):
        with pytest.warns(DeprecationWarning):
            vis_pose_tracking_result(
                pose_model, image_name, pose_results, dataset='test')

    # Bottom up tracking
    pose_model = init_pose_model(
        'configs/body/2d_kpt_sview_rgb_img/associative_embedding/'
        'coco/res50_coco_512x512.py',
        None,
        device='cpu')

    if 'dataset_info' in pose_model.cfg:
        _ = pose_model.cfg.pop('dataset_info')

    image_name = 'tests/data/coco/000000000785.jpg'
    with pytest.warns(DeprecationWarning):
        pose_results, _ = inference_bottom_up_pose_model(
            pose_model, image_name)

    pose_results, next_id = get_track_id(pose_results, [], next_id=0)

    with pytest.warns(DeprecationWarning):
        vis_pose_tracking_result(
            pose_model,
            image_name,
            pose_results,
            dataset='BottomUpCocoDataset')

    # Pose lifting
    pose_model = init_pose_model(
        'configs/body/3d_kpt_sview_rgb_img/pose_lift/'
        'h36m/simplebaseline3d_h36m.py',
        None,
        device='cpu')

    pose_det_result = {
        'keypoints': np.zeros((17, 3)),
        'bbox': [50, 50, 50, 50],
        'track_id': 0,
        'image_name': 'tests/data/h36m/S1_Directions_1.54138969_000001.jpg',
    }

    if 'dataset_info' in pose_model.cfg:
        _ = pose_model.cfg.pop('dataset_info')

    pose_results_2d = [[pose_det_result]]

    dataset = pose_model.cfg.data['test']['type']

    pose_results_2d = extract_pose_sequence(
        pose_results_2d, frame_idx=0, causal=False, seq_len=1, step=1)

    with pytest.warns(DeprecationWarning):
        _ = inference_pose_lifter_model(
            pose_model, pose_results_2d, dataset, with_track_id=False)

    with pytest.warns(DeprecationWarning):
        pose_lift_results = inference_pose_lifter_model(
            pose_model, pose_results_2d, dataset, with_track_id=True)

    for res in pose_lift_results:
        res['title'] = 'title'

    with pytest.warns(DeprecationWarning):
        vis_3d_pose_result(
            pose_model,
            pose_lift_results,
            img=pose_results_2d[0][0]['image_name'],
            dataset=dataset)

    with pytest.raises(NotImplementedError):
        with pytest.warns(DeprecationWarning):
            _ = inference_pose_lifter_model(
                pose_model, pose_results_2d, dataset='test')
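
# `get_track_id` above assigns each new detection the id of its best match in
# the previous frame, or a fresh id when nothing matches well enough. A
# minimal greedy IoU-based sketch of that idea (illustrative only, not
# mmpose's implementation, which also supports OKS matching and One-Euro
# smoothing):
def _assign_track_ids_sketch(results, results_last, next_id, iou_thr=0.3):

    def iou(a, b):  # boxes in (x1, y1, x2, y2)
        ix = max(0, min(a[2], b[2]) - max(a[0], b[0]))
        iy = max(0, min(a[3], b[3]) - max(a[1], b[1]))
        inter = ix * iy
        area_a = (a[2] - a[0]) * (a[3] - a[1])
        area_b = (b[2] - b[0]) * (b[3] - b[1])
        return inter / (area_a + area_b - inter + 1e-6)

    unmatched = list(results_last)
    for res in results:
        best, best_iou = None, iou_thr
        for last in unmatched:
            overlap = iou(res['bbox'], last['bbox'])
            if overlap > best_iou:
                best, best_iou = last, overlap
        if best is not None:
            res['track_id'] = best['track_id']
            unmatched.remove(best)
        else:  # no sufficiently-overlapping predecessor: start a new track
            res['track_id'] = next_id
            next_id += 1
    return results, next_id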
def main():
    parser = ArgumentParser()
    parser.add_argument('det_config', help='Config file for detection')
    parser.add_argument('det_checkpoint', help='Checkpoint file for detection')
    parser.add_argument(
        'pose_detector_config',
        type=str,
        default=None,
        help='Config file for the 1st stage 2D pose detector')
    parser.add_argument(
        'pose_detector_checkpoint',
        type=str,
        default=None,
        help='Checkpoint file for the 1st stage 2D pose detector')
    parser.add_argument(
        'pose_lifter_config',
        help='Config file for the 2nd stage pose lifter model')
    parser.add_argument(
        'pose_lifter_checkpoint',
        help='Checkpoint file for the 2nd stage pose lifter model')
    parser.add_argument(
        '--video-path', type=str, default='', help='Video path')
    parser.add_argument(
        '--rebase-keypoint-height',
        action='store_true',
        help='Rebase the predicted 3D pose so its lowest keypoint has a '
        'height of 0 (landing on the ground). This is useful for '
        'visualization when the model does not predict the global position '
        'of the 3D pose.')
    parser.add_argument(
        '--norm-pose-2d',
        action='store_true',
        help='Scale the bbox (along with the 2D pose) to the average bbox '
        'scale of the dataset, and move the bbox (along with the 2D pose) to '
        'the average bbox center of the dataset. This is useful when bbox '
        'is small, especially in multi-person scenarios.')
    parser.add_argument(
        '--num-instances',
        type=int,
        default=-1,
        help='The number of 3D poses to be visualized in every frame. If '
        'less than 0, it will be set to the number of pose results in the '
        'first frame.')
    parser.add_argument(
        '--show',
        action='store_true',
        default=False,
        help='whether to show visualizations.')
    parser.add_argument(
        '--out-video-root',
        type=str,
        default='vis_results',
        help='Root of the output video file. Default: "vis_results".')
    parser.add_argument(
        '--device', default='cuda:0', help='Device for inference')
    parser.add_argument(
        '--det-cat-id',
        type=int,
        default=1,
        help='Category id for bounding box detection model')
    parser.add_argument(
        '--bbox-thr',
        type=float,
        default=0.9,
        help='Bounding box score threshold')
    parser.add_argument('--kpt-thr', type=float, default=0.3)
    parser.add_argument(
        '--use-oks-tracking', action='store_true', help='Using OKS tracking')
    parser.add_argument(
        '--tracking-thr', type=float, default=0.3, help='Tracking threshold')
    parser.add_argument(
        '--radius',
        type=int,
        default=8,
        help='Keypoint radius for visualization')
    parser.add_argument(
        '--thickness',
        type=int,
        default=2,
        help='Link thickness for visualization')
    parser.add_argument(
        '--smooth',
        action='store_true',
        help='Apply a temporal filter to smooth the 2D pose estimation '
        'results. See also --smooth-filter-cfg.')
    parser.add_argument(
        '--smooth-filter-cfg',
        type=str,
        default='configs/_base_/filters/one_euro.py',
        help='Config file of the filter to smooth the pose estimation '
        'results. See also --smooth.')
    parser.add_argument(
        '--use-multi-frames',
        action='store_true',
        default=False,
        help='whether to use multi frames for inference in the 2D pose '
        'detection stage. Default: False.')
    parser.add_argument(
        '--online',
        action='store_true',
        default=False,
        help='inference mode. If set to True, can not use future frame '
        'information when using multi frames for inference in the 2D pose '
        'detection stage. Default: False.')

    assert has_mmdet, 'Please install mmdet to run the demo.'

    args = parser.parse_args()
    assert args.show or (args.out_video_root != '')
    assert args.det_config is not None
    assert args.det_checkpoint is not None

    video = mmcv.VideoReader(args.video_path)
    assert video.opened, f'Failed to load video file {args.video_path}'

    # First stage: 2D pose detection
    print('Stage 1: 2D pose detection.')

    print('Initializing model...')
    person_det_model = init_detector(
        args.det_config, args.det_checkpoint, device=args.device.lower())

    pose_det_model = init_pose_model(
        args.pose_detector_config,
        args.pose_detector_checkpoint,
        device=args.device.lower())

    assert isinstance(pose_det_model, TopDown), 'Only "TopDown" ' \
        'model is supported for the 1st stage (2D pose detection)'

    # frame index offsets for inference, used in multi-frame inference setting
    if args.use_multi_frames:
        assert 'frame_indices_test' in pose_det_model.cfg.data.test.data_cfg
        indices = pose_det_model.cfg.data.test.data_cfg['frame_indices_test']

    pose_det_dataset = pose_det_model.cfg.data['test']['type']
    # get datasetinfo
    dataset_info = pose_det_model.cfg.data['test'].get('dataset_info', None)
    if dataset_info is None:
        warnings.warn(
            'Please set `dataset_info` in the config. '
            'Check https://github.com/open-mmlab/mmpose/pull/663 for details.',
            DeprecationWarning)
    else:
        dataset_info = DatasetInfo(dataset_info)

    pose_det_results_list = []
    next_id = 0
    pose_det_results = []

    # whether to return heatmap, optional
    return_heatmap = False

    # return the output of some desired layers,
    # e.g. use ('backbone', ) to return backbone feature
    output_layer_names = None

    print('Running 2D pose detection inference...')
    for frame_id, cur_frame in enumerate(mmcv.track_iter_progress(video)):
        pose_det_results_last = pose_det_results

        # test a single image, the resulting box is (x1, y1, x2, y2)
        mmdet_results = inference_detector(person_det_model, cur_frame)

        # keep the person class bounding boxes.
        person_det_results = process_mmdet_results(mmdet_results,
                                                   args.det_cat_id)

        if args.use_multi_frames:
            frames = collect_multi_frames(video, frame_id, indices,
                                          args.online)

        # make person results for current image
        pose_det_results, _ = inference_top_down_pose_model(
            pose_det_model,
            frames if args.use_multi_frames else cur_frame,
            person_det_results,
            bbox_thr=args.bbox_thr,
            format='xyxy',
            dataset=pose_det_dataset,
            dataset_info=dataset_info,
            return_heatmap=return_heatmap,
            outputs=output_layer_names)

        # get track id for each person instance
        pose_det_results, next_id = get_track_id(
            pose_det_results,
            pose_det_results_last,
            next_id,
            use_oks=args.use_oks_tracking,
            tracking_thr=args.tracking_thr)

        pose_det_results_list.append(copy.deepcopy(pose_det_results))

    # Second stage: Pose lifting
    print('Stage 2: 2D-to-3D pose lifting.')

    print('Initializing model...')
    pose_lift_model = init_pose_model(
        args.pose_lifter_config,
        args.pose_lifter_checkpoint,
        device=args.device.lower())

    assert isinstance(pose_lift_model, PoseLifter), \
        'Only "PoseLifter" model is supported for the 2nd stage ' \
        '(2D-to-3D lifting)'
    pose_lift_dataset = pose_lift_model.cfg.data['test']['type']

    if args.out_video_root == '':
        save_out_video = False
    else:
        os.makedirs(args.out_video_root, exist_ok=True)
        save_out_video = True

    if save_out_video:
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        fps = video.fps
        writer = None

    # convert keypoint definition
    for pose_det_results in pose_det_results_list:
        for res in pose_det_results:
            keypoints = res['keypoints']
            res['keypoints'] = convert_keypoint_definition(
                keypoints, pose_det_dataset, pose_lift_dataset)

    # load temporal padding config from model.data_cfg
    if hasattr(pose_lift_model.cfg, 'test_data_cfg'):
        data_cfg = pose_lift_model.cfg.test_data_cfg
    else:
        data_cfg = pose_lift_model.cfg.data_cfg

    # build pose smoother for temporal refinement
    if args.smooth:
        smoother = Smoother(
            filter_cfg=args.smooth_filter_cfg,
            keypoint_key='keypoints',
            keypoint_dim=2)
    else:
        smoother = None

    num_instances = args.num_instances
    pose_lift_dataset_info = pose_lift_model.cfg.data['test'].get(
        'dataset_info', None)
    if pose_lift_dataset_info is None:
        warnings.warn(
            'Please set `dataset_info` in the config. '
            'Check https://github.com/open-mmlab/mmpose/pull/663 for details.',
            DeprecationWarning)
    else:
        pose_lift_dataset_info = DatasetInfo(pose_lift_dataset_info)

    print('Running 2D-to-3D pose lifting inference...')
    for i, pose_det_results in enumerate(
            mmcv.track_iter_progress(pose_det_results_list)):
        # extract and pad input pose2d sequence
        pose_results_2d = extract_pose_sequence(
            pose_det_results_list,
            frame_idx=i,
            causal=data_cfg.causal,
            seq_len=data_cfg.seq_len,
            step=data_cfg.seq_frame_interval)

        # smooth 2d results
        if smoother:
            pose_results_2d = smoother.smooth(pose_results_2d)

        # 2D-to-3D pose lifting
        pose_lift_results = inference_pose_lifter_model(
            pose_lift_model,
            pose_results_2d=pose_results_2d,
            dataset=pose_lift_dataset,
            dataset_info=pose_lift_dataset_info,
            with_track_id=True,
            image_size=video.resolution,
            norm_pose_2d=args.norm_pose_2d)

        # Pose processing
        pose_lift_results_vis = []
        for idx, res in enumerate(pose_lift_results):
            keypoints_3d = res['keypoints_3d']
            # exchange y,z-axis, and then reverse the direction of x,z-axis
            keypoints_3d = keypoints_3d[..., [0, 2, 1]]
            keypoints_3d[..., 0] = -keypoints_3d[..., 0]
            keypoints_3d[..., 2] = -keypoints_3d[..., 2]
            # rebase height (z-axis)
            if args.rebase_keypoint_height:
                keypoints_3d[..., 2] -= np.min(
                    keypoints_3d[..., 2], axis=-1, keepdims=True)
            res['keypoints_3d'] = keypoints_3d
            # add title
            det_res = pose_det_results[idx]
            instance_id = det_res['track_id']
            res['title'] = f'Prediction ({instance_id})'
            # only visualize the target frame
            res['keypoints'] = det_res['keypoints']
            res['bbox'] = det_res['bbox']
            res['track_id'] = instance_id
            pose_lift_results_vis.append(res)

        # Visualization
        if num_instances < 0:
            num_instances = len(pose_lift_results_vis)
        img_vis = vis_3d_pose_result(
            pose_lift_model,
            result=pose_lift_results_vis,
            img=video[i],
            dataset=pose_lift_dataset,
            dataset_info=pose_lift_dataset_info,
            out_file=None,
            radius=args.radius,
            thickness=args.thickness,
            num_instances=num_instances,
            show=args.show)

        if save_out_video:
            if writer is None:
                writer = cv2.VideoWriter(
                    osp.join(args.out_video_root,
                             f'vis_{osp.basename(args.video_path)}'), fourcc,
                    fps, (img_vis.shape[1], img_vis.shape[0]))
            writer.write(img_vis)

    if save_out_video:
        writer.release()
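
# `convert_keypoint_definition` bridges the two stages: the 2D detector emits
# COCO-style keypoints while the lifter expects H36M-style joints. A hedged
# sketch of the kind of mapping involved for the COCO -> H36M case (the joint
# indices below follow the common COCO/H36M orderings and are illustrative
# only; mmpose's own implementation covers more dataset pairs):
import numpy as np


def coco_to_h36m_sketch(keypoints):
    """keypoints: (17, 3) COCO-order array of (x, y, score)."""
    kpts = np.zeros_like(keypoints)
    kpts[0] = (keypoints[11] + keypoints[12]) / 2  # pelvis = mid-hip
    kpts[1] = keypoints[12]                        # right hip
    kpts[2] = keypoints[14]                        # right knee
    kpts[3] = keypoints[16]                        # right ankle
    kpts[4] = keypoints[11]                        # left hip
    kpts[5] = keypoints[13]                        # left knee
    kpts[6] = keypoints[15]                        # left ankle
    kpts[8] = (keypoints[5] + keypoints[6]) / 2    # thorax = mid-shoulder
    kpts[7] = (kpts[0] + kpts[8]) / 2              # spine = mid pelvis/thorax
    kpts[9] = keypoints[0]                         # nose
    kpts[10] = (keypoints[1] + keypoints[2]) / 2   # head = mid-eye
    kpts[11] = keypoints[5]                        # left shoulder
    kpts[12] = keypoints[7]                        # left elbow
    kpts[13] = keypoints[9]                        # left wrist
    kpts[14] = keypoints[6]                        # right shoulder
    kpts[15] = keypoints[8]                        # right elbow
    kpts[16] = keypoints[10]                       # right wrist
    return kpts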