def __init__(self,
             ann_file,
             img_prefix,
             data_cfg,
             pipeline,
             dataset_info=None,
             coco_style=True,
             test_mode=False):
    self.image_info = {}
    self.ann_info = {}

    self.ann_file = ann_file
    self.img_prefix = img_prefix
    self.pipeline = pipeline
    self.test_mode = test_mode

    self.ann_info['image_size'] = np.array(data_cfg['image_size'])
    self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size'])
    self.ann_info['num_joints'] = data_cfg['num_joints']
    self.ann_info['inference_channel'] = data_cfg['inference_channel']
    self.ann_info['num_output_channels'] = data_cfg['num_output_channels']
    self.ann_info['dataset_channel'] = data_cfg['dataset_channel']
    self.ann_info['use_different_joint_weights'] = data_cfg.get(
        'use_different_joint_weights', False)

    if dataset_info is None:
        raise ValueError(
            'Check https://github.com/open-mmlab/mmpose/pull/663 '
            'for details.')

    dataset_info = DatasetInfo(dataset_info)

    assert self.ann_info['num_joints'] == dataset_info.keypoint_num
    self.ann_info['flip_pairs'] = dataset_info.flip_pairs
    self.ann_info['flip_index'] = dataset_info.flip_index
    self.ann_info['upper_body_ids'] = dataset_info.upper_body_ids
    self.ann_info['lower_body_ids'] = dataset_info.lower_body_ids
    self.ann_info['joint_weights'] = dataset_info.joint_weights
    self.ann_info['skeleton'] = dataset_info.skeleton
    self.sigmas = dataset_info.sigmas
    self.dataset_name = dataset_info.dataset_name

    if coco_style:
        self.coco = COCO(ann_file)
        if 'categories' in self.coco.dataset:
            cats = [
                cat['name']
                for cat in self.coco.loadCats(self.coco.getCatIds())
            ]
            self.classes = ['__background__'] + cats
            self.num_classes = len(self.classes)
            self._class_to_ind = dict(
                zip(self.classes, range(self.num_classes)))
            self._class_to_coco_ind = dict(zip(cats, self.coco.getCatIds()))
            self._coco_ind_to_class_ind = dict(
                (self._class_to_coco_ind[cls], self._class_to_ind[cls])
                for cls in self.classes[1:])
        self.img_ids = self.coco.getImgIds()
        self.num_images = len(self.img_ids)
        self.id2name, self.name2id = self._get_mapping_id_name(
            self.coco.imgs)

    self.db = []

    self.pipeline = Compose(self.pipeline)
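
# Illustrative sketch (not part of the original source): a minimal
# `data_cfg` dict providing every field the `__init__` above reads. The
# concrete sizes follow common 17-keypoint COCO-style configs and are
# assumptions, not values taken from this code.
import numpy as np

example_data_cfg = dict(
    image_size=[192, 256],  # model input size (width, height)
    heatmap_size=[48, 64],  # output heatmap resolution
    num_joints=17,  # must equal dataset_info.keypoint_num
    dataset_channel=[list(range(17))],
    inference_channel=list(range(17)),
    num_output_channels=17,
    # optional; read via data_cfg.get(..., False) above
    use_different_joint_weights=False,
)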
def _inference_single_pose_model(model,
                                 img_or_path,
                                 bbox,
                                 dataset,
                                 return_heatmap=False):
    """Inference a single bbox.

    num_keypoints: K

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        bbox (list | np.ndarray): Bounding box (with score), shaped
            (4, ) or (5, ). (left, top, width, height, [score])
        dataset (str): Dataset name.
        return_heatmap (bool): Flag to return heatmap, default: False.

    Returns:
        ndarray[Kx3]: Predicted pose x, y, score.
        heatmap[N, K, H, W]: Model output heatmap.
    """
    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    test_pipeline = [LoadImage()] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    assert len(bbox) in [4, 5]
    center, scale = _box2cs(cfg, bbox)

    flip_pairs = None
    if dataset == 'TopDownCocoDataset' or dataset == 'TopDownOCHumanDataset':
        flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12],
                      [13, 14], [15, 16]]
    elif dataset == 'TopDownCocoWholeBodyDataset':
        body = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14],
                [15, 16]]
        foot = [[17, 20], [18, 21], [19, 22]]
        face = [[23, 39], [24, 38], [25, 37], [26, 36], [27, 35], [28, 34],
                [29, 33], [30, 32], [40, 49], [41, 48], [42, 47], [43, 46],
                [44, 45], [54, 58], [55, 57], [59, 68], [60, 67], [61, 66],
                [62, 65], [63, 70], [64, 69], [71, 77], [72, 76], [73, 75],
                [78, 82], [79, 81], [83, 87], [84, 86], [88, 90]]
        hand = [[91, 112], [92, 113], [93, 114], [94, 115], [95, 116],
                [96, 117], [97, 118], [98, 119], [99, 120], [100, 121],
                [101, 122], [102, 123], [103, 124], [104, 125], [105, 126],
                [106, 127], [107, 128], [108, 129], [109, 130], [110, 131],
                [111, 132]]
        flip_pairs = body + foot + face + hand
    elif dataset == 'TopDownAicDataset':
        flip_pairs = [[0, 3], [1, 4], [2, 5], [6, 9], [7, 10], [8, 11]]
    elif (dataset == 'TopDownOneHand10KDataset'
          or dataset == 'TopDownFreiHandDataset'
          or dataset == 'TopDownPanopticDataset'):
        flip_pairs = []
    else:
        raise NotImplementedError()

    # prepare data
    data = {
        'img_or_path': img_or_path,
        'center': center,
        'scale': scale,
        'bbox_score': bbox[4] if len(bbox) == 5 else 1,
        'dataset': dataset,
        'joints_3d': np.zeros((cfg.data_cfg.num_joints, 3),
                              dtype=np.float32),
        'joints_3d_visible': np.zeros((cfg.data_cfg.num_joints, 3),
                                      dtype=np.float32),
        'rotation': 0,
        'ann_info': {
            'image_size': cfg.data_cfg['image_size'],
            'num_joints': cfg.data_cfg['num_joints'],
            'flip_pairs': flip_pairs
        }
    }
    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)

    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]
    else:
        # just get the actual data from DataContainer
        data['img_metas'] = data['img_metas'].data[0]

    # forward the model
    with torch.no_grad():
        all_preds, _, _, heatmap = model(return_loss=False,
                                         return_heatmap=return_heatmap,
                                         img=data['img'],
                                         img_metas=data['img_metas'])

    return all_preds[0], heatmap
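
# Usage sketch (hedged): run the helper above on one image and box.
# `init_pose_model` is the loader from mmpose.apis; the config, checkpoint
# and image paths here are placeholder names, not files from this repo.
from mmpose.apis import init_pose_model

pose_model = init_pose_model('hrnet_w32_coco_256x192.py',
                             'hrnet_w32_coco_256x192.pth',
                             device='cuda:0')
# bbox is (left, top, width, height, score), as documented above
pred, heatmap = _inference_single_pose_model(pose_model,
                                             'demo.jpg',
                                             [50, 80, 120, 240, 0.95],
                                             dataset='TopDownCocoDataset',
                                             return_heatmap=True)
print(pred.shape)  # (K, 3): x, y, score per keypoint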
def inference_pose_lifter_model(model,
                                pose_results_2d,
                                dataset,
                                with_track_id=True):
    """Inference 3D pose from 2D pose sequences using a pose lifter model.

    Args:
        model (nn.Module): The loaded pose lifter model.
        pose_results_2d (list[list[dict]]): The 2D pose sequences stored in
            a nested list. Each element of the outer list is the 2D pose
            results of a single frame, and each element of the inner list
            is the 2D pose of one person, which contains:

            - "keypoints" (ndarray[K, 2 or 3]): x, y, [score]
            - "track_id" (int)
        dataset (str): Dataset name, e.g. 'Body3DH36MDataset'.
        with_track_id: If True, the element in pose_results_2d is expected
            to contain "track_id", which will be used to gather the pose
            sequence of a person from multiple frames. Otherwise, the pose
            results in each frame are expected to have a consistent number
            and order of identities. Default is True.

    Returns:
        list[dict]: 3D pose inference results. Each element is the result
        of an instance, which contains:

        - "keypoints_3d" (ndarray[K, 3]): predicted 3D keypoints
        - "keypoints" (ndarray[K, 2 or 3]): from the last frame in
          ``pose_results_2d``.
        - "track_id" (int): from the last frame in ``pose_results_2d``.

        If there is no valid instance, an empty list will be returned.
    """
    cfg = model.cfg
    test_pipeline = Compose(cfg.test_pipeline)

    flip_pairs = None
    if dataset == 'Body3DH36MDataset':
        flip_pairs = [[1, 4], [2, 5], [3, 6], [11, 14], [12, 15], [13, 16]]
    else:
        raise NotImplementedError()

    pose_sequences_2d = _collate_pose_sequence(pose_results_2d,
                                               with_track_id)

    if not pose_sequences_2d:
        return []

    batch_data = []
    for seq in pose_sequences_2d:
        pose_2d = seq['keypoints'].astype(np.float32)
        T, K, C = pose_2d.shape

        input_2d = pose_2d[..., :2]
        if C > 2:
            input_2d_visible = pose_2d[..., 2:3]
        else:
            input_2d_visible = np.ones((T, K, 1), dtype=np.float32)

        # Dummy 3D input
        # This is for compatibility with configs in mmpose<=v0.14.0, where a
        # 3D input is required to generate denormalization parameters. This
        # part will be removed in the future.
        target = np.zeros((K, 3), dtype=np.float32)
        target_visible = np.ones((K, 1), dtype=np.float32)

        # Dummy image path
        # This is for compatibility with configs in mmpose<=v0.14.0, where
        # target_image_path is required. This part will be removed in the
        # future.
        target_image_path = None

        data = {
            'input_2d': input_2d,
            'input_2d_visible': input_2d_visible,
            'target': target,
            'target_visible': target_visible,
            'target_image_path': target_image_path,
            'ann_info': {
                'num_joints': K,
                'flip_pairs': flip_pairs
            }
        }

        data = test_pipeline(data)
        batch_data.append(data)

    batch_data = collate(batch_data, samples_per_gpu=len(batch_data))
    if next(model.parameters()).is_cuda:
        device = next(model.parameters()).device
        batch_data = scatter(batch_data, target_gpus=[device.index])[0]
    else:
        batch_data = scatter(batch_data, target_gpus=[-1])[0]

    with torch.no_grad():
        result = model(input=batch_data['input'],
                       metas=batch_data['metas'],
                       return_loss=False)

    poses_3d = result['preds']
    if poses_3d.shape[-1] != 4:
        assert poses_3d.shape[-1] == 3
        # pad a dummy score column to unify the output format
        dummy_score = np.ones(poses_3d.shape[:-1] + (1, ),
                              dtype=poses_3d.dtype)
        poses_3d = np.concatenate((poses_3d, dummy_score), axis=-1)

    pose_results = []
    for pose_2d, pose_3d in zip(pose_sequences_2d, poses_3d):
        pose_result = pose_2d.copy()
        pose_result['keypoints_3d'] = pose_3d
        pose_results.append(pose_result)

    return pose_results
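
# Usage sketch (hedged): lift a short 2D sequence. `lifter_model` is a
# loaded pose lifter (hypothetical variable); the random keypoints stand
# in for real per-frame 2D detections with K=17 H36M-style joints.
import numpy as np

num_frames, K = 9, 17
pose_results_2d = [[{
    'keypoints': np.random.rand(K, 3).astype(np.float32),
    'track_id': 0
}] for _ in range(num_frames)]

results_3d = inference_pose_lifter_model(lifter_model,
                                         pose_results_2d,
                                         dataset='Body3DH36MDataset',
                                         with_track_id=True)
# each result carries 'keypoints_3d'; a score column is appended when the
# model itself outputs only (x, y, z)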
def inference_bottom_up_pose_model(model,
                                   img_or_path,
                                   return_heatmap=False,
                                   outputs=None):
    """Inference a single image.

    num_people: P
    num_keypoints: K
    bbox height: H
    bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        return_heatmap (bool): Flag to return heatmap, default: False.
        outputs (list(str) | tuple(str)): Names of layers whose outputs
            need to be returned, default: None.

    Returns:
        list[ndarray]: The predicted pose info. The length of the list is
            the number of people (P). Each item in the list is a ndarray,
            containing each person's pose (ndarray[Kx3]): x, y, score.
        list[dict[np.ndarray[N, K, H, W] | torch.Tensor[N, K, H, W]]]:
            Output feature maps from layers specified in `outputs`.
            Includes 'heatmap' if `return_heatmap` is True.
    """
    pose_results = []
    returned_outputs = []

    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    test_pipeline = [LoadImage()] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    # prepare data
    data = {
        'img_or_path': img_or_path,
        'dataset': 'coco',
        'ann_info': {
            'image_size': cfg.data_cfg['image_size'],
            'num_joints': cfg.data_cfg['num_joints'],
            'flip_index':
            [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15],
        }
    }

    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)

    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]
    else:
        # just get the actual data from DataContainer
        data['img_metas'] = data['img_metas'].data[0]

    with OutputHook(model, outputs=outputs, as_tensor=False) as h:
        # forward the model
        with torch.no_grad():
            all_preds, _, _, heatmap = model(img=data['img'],
                                             img_metas=data['img_metas'],
                                             return_loss=False,
                                             return_heatmap=return_heatmap)

        if return_heatmap:
            h.layer_outputs['heatmap'] = heatmap

        returned_outputs.append(h.layer_outputs)

    for pred in all_preds:
        pose_results.append({
            'keypoints': pred[:, :3],
        })

    return pose_results, returned_outputs
def test_joint_transforms():
    results = get_data_sample()

    mean = np.random.rand(16, 3).astype(np.float32)
    std = np.random.rand(16, 3).astype(np.float32) + 1e-6

    pipeline = [
        dict(type='RelativeJointRandomFlip',
             item='target',
             flip_cfg=dict(center_mode='root', center_index=0),
             visible_item='target_visible',
             flip_prob=1.,
             flip_camera=True),
        dict(type='GetRootCenteredPose',
             item='target',
             root_index=0,
             root_name='global_position',
             remove_root=True),
        dict(type='NormalizeJointCoordinate',
             item='target',
             mean=mean,
             std=std),
        dict(type='PoseSequenceToTensor', item='target'),
        dict(type='ImageCoordinateNormalization',
             item='input_2d',
             norm_camera=True),
        dict(type='CollectCameraIntrinsics'),
        dict(type='Collect',
             keys=[('input_2d', 'input'), ('target', 'output'),
                   'flip_pairs', 'intrinsics'],
             meta_name='metas',
             meta_keys=['camera_param'])
    ]

    pipeline = Compose(pipeline)
    output = pipeline(copy.deepcopy(results))

    # test transformation of target
    joints_0 = results['target']
    joints_1 = output['output'].numpy()

    # manually do transformations
    flip_pairs = output['flip_pairs']
    _joints_0_flipped = joints_0.copy()
    for _l, _r in flip_pairs:
        _joints_0_flipped[..., _l, :] = joints_0[..., _r, :]
        _joints_0_flipped[..., _r, :] = joints_0[..., _l, :]
    _joints_0_flipped[..., 0] = 2 * joints_0[..., 0:1, 0] - \
        _joints_0_flipped[..., 0]
    joints_0 = _joints_0_flipped

    joints_0 = (joints_0[..., 1:, :] - joints_0[..., 0:1, :] - mean) / std

    # convert to [K*C, T]
    joints_0 = joints_0.reshape(-1)[..., None]

    np.testing.assert_array_almost_equal(joints_0, joints_1)

    # test transformation of input
    joints_0 = results['input_2d']
    joints_1 = output['input']

    # manually do transformations
    center = np.array(
        [0.5 * results['image_width'], 0.5 * results['image_height']],
        dtype=np.float32)
    scale = np.array(0.5 * results['image_width'], dtype=np.float32)
    joints_0 = (joints_0 - center) / scale

    np.testing.assert_array_almost_equal(joints_0, joints_1)

    # test transformation of camera parameters
    camera_param_0 = results['camera_param']
    camera_param_1 = output['metas'].data['camera_param']

    # manually flip and normalize
    camera_param_0['c'][0] *= -1
    camera_param_0['p'][0] *= -1
    camera_param_0['c'] = (camera_param_0['c'] -
                           np.array(center)[:, None]) / scale
    camera_param_0['f'] = camera_param_0['f'] / scale

    np.testing.assert_array_almost_equal(camera_param_0['c'],
                                         camera_param_1['c'])
    np.testing.assert_array_almost_equal(camera_param_0['f'],
                                         camera_param_1['f'])

    # test CollectCameraIntrinsics
    intrinsics_0 = np.concatenate([
        results['camera_param']['f'].reshape(2),
        results['camera_param']['c'].reshape(2),
        results['camera_param']['k'].reshape(3),
        results['camera_param']['p'].reshape(2)
    ])
    intrinsics_1 = output['intrinsics']
    np.testing.assert_array_almost_equal(intrinsics_0, intrinsics_1)

    # test loading mean/std from file
    with tempfile.TemporaryDirectory() as tmpdir:
        norm_param = {'mean': mean, 'std': std}
        norm_param_file = osp.join(tmpdir, 'norm_param.pkl')
        mmcv.dump(norm_param, norm_param_file)

        pipeline = [
            dict(type='NormalizeJointCoordinate',
                 item='target',
                 norm_param_file=norm_param_file),
        ]
        pipeline = Compose(pipeline)
def inference_interhand_3d_model(model,
                                 img_or_path,
                                 det_results,
                                 bbox_thr=None,
                                 format='xywh',
                                 dataset='InterHand3DDataset'):
    """Inference a single image with a list of hand bounding boxes.

    num_bboxes: N
    num_keypoints: K

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        det_results (list[dict]): The 2D bbox sequences stored in a list.
            Each element of the list is the bbox of one person, which
            contains:

            - "bbox" (ndarray[4 or 5]): The person bounding box, which
              contains 4 box coordinates (and score).
        bbox_thr (float | None): Threshold for bounding boxes. Only bboxes
            with higher scores will be fed into the pose detector. If
            bbox_thr is None, all boxes will be used.
        format (str): bbox format ('xyxy' | 'xywh'). Default: 'xywh'.
            'xyxy' means (left, top, right, bottom),
            'xywh' means (left, top, width, height).
        dataset (str): Dataset name.

    Returns:
        list[dict]: 3D pose inference results. Each element is the result
        of an instance, which contains:

        - "keypoints_3d" (ndarray[K, 3]): predicted 3D keypoints

        If there is no valid instance, an empty list will be returned.
    """
    assert format in ['xyxy', 'xywh']

    pose_results = []

    if len(det_results) == 0:
        return pose_results

    # Change for-loop preprocess each bbox to preprocess all bboxes at once.
    bboxes = np.array([box['bbox'] for box in det_results])

    # Select bboxes by score threshold
    if bbox_thr is not None:
        assert bboxes.shape[1] == 5
        valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0]
        bboxes = bboxes[valid_idx]
        det_results = [det_results[i] for i in valid_idx]

    if format == 'xyxy':
        bboxes_xyxy = bboxes
        bboxes_xywh = _xyxy2xywh(bboxes)
    else:
        # format is already 'xywh'
        bboxes_xywh = bboxes
        bboxes_xyxy = _xywh2xyxy(bboxes)

    # if bbox_thr removed all bounding boxes
    if len(bboxes_xywh) == 0:
        return []

    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    channel_order = cfg.test_pipeline[0].get('channel_order', 'rgb')
    test_pipeline = [LoadImage(channel_order=channel_order)
                     ] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    assert len(bboxes[0]) in [4, 5]

    if dataset == 'InterHand3DDataset':
        flip_pairs = [[i, 21 + i] for i in range(21)]
    else:
        raise NotImplementedError()

    batch_data = []
    for bbox in bboxes:
        center, scale = _box2cs(cfg, bbox)

        # prepare data
        data = {
            'img_or_path': img_or_path,
            'center': center,
            'scale': scale,
            'bbox_score': bbox[4] if len(bbox) == 5 else 1,
            'bbox_id': 0,  # need to be assigned if batch_size > 1
            'dataset': dataset,
            'joints_3d': np.zeros((cfg.data_cfg.num_joints, 3),
                                  dtype=np.float32),
            'joints_3d_visible': np.zeros((cfg.data_cfg.num_joints, 3),
                                          dtype=np.float32),
            'rotation': 0,
            'ann_info': {
                'image_size': np.array(cfg.data_cfg['image_size']),
                'num_joints': cfg.data_cfg['num_joints'],
                'flip_pairs': flip_pairs,
                'heatmap3d_depth_bound':
                cfg.data_cfg['heatmap3d_depth_bound'],
                'heatmap_size_root': cfg.data_cfg['heatmap_size_root'],
                'root_depth_bound': cfg.data_cfg['root_depth_bound']
            }
        }
        data = test_pipeline(data)
        batch_data.append(data)

    batch_data = collate(batch_data, samples_per_gpu=1)

    if next(model.parameters()).is_cuda:
        # scatter does not work here, so just move the image to the device
        batch_data['img'] = batch_data['img'].to(device)
    # get all img_metas of each bounding box
    batch_data['img_metas'] = [
        img_metas[0] for img_metas in batch_data['img_metas'].data
    ]

    # forward the model
    with torch.no_grad():
        result = model(img=batch_data['img'],
                       img_metas=batch_data['img_metas'],
                       return_loss=False)

    poses_3d = result['preds']
    rel_root_depth = result['rel_root_depth']
    hand_type = result['hand_type']
    if poses_3d.shape[-1] != 4:
        assert poses_3d.shape[-1] == 3
        # pad a dummy score column to unify the output format
        dummy_score = np.ones(poses_3d.shape[:-1] + (1, ),
                              dtype=poses_3d.dtype)
        poses_3d = np.concatenate((poses_3d, dummy_score), axis=-1)

    # add relative root depth to left hand joints
    poses_3d[:, 21:, 2] += rel_root_depth

    # set joint scores according to hand type
    poses_3d[:, :21, 3] *= hand_type[:, [0]]
    poses_3d[:, 21:, 3] *= hand_type[:, [1]]

    pose_results = []
    for pose_3d, person_res, bbox_xyxy in zip(poses_3d, det_results,
                                              bboxes_xyxy):
        pose_res = person_res.copy()
        pose_res['keypoints_3d'] = pose_3d
        pose_res['bbox'] = bbox_xyxy
        pose_results.append(pose_res)

    return pose_results
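
# Usage sketch (hedged): 3D interacting-hand estimation from two
# hypothetical hand boxes in xywh+score format. `hand_model` is a loaded
# InterHand3D-style model (placeholder name).
import numpy as np

det_results = [
    {'bbox': np.array([100, 120, 180, 180, 0.98])},
    {'bbox': np.array([320, 110, 170, 190, 0.95])},
]
hand_results = inference_interhand_3d_model(hand_model,
                                            'hands.jpg',
                                            det_results,
                                            bbox_thr=0.5,
                                            format='xywh',
                                            dataset='InterHand3DDataset')
# 'keypoints_3d' is (42, 4): two 21-joint hands, with per-joint scores
# gated by the predicted hand type as done at the end of the function.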
def inference_bottom_up_pose_model(model,
                                   img_or_path,
                                   pose_nms_thr=0.9,
                                   return_heatmap=False,
                                   outputs=None):
    """Inference a single image.

    num_people: P
    num_keypoints: K
    bbox height: H
    bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        pose_nms_thr (float): retain oks overlap < pose_nms_thr,
            default: 0.9.
        return_heatmap (bool): Flag to return heatmap, default: False.
        outputs (list(str) | tuple(str)): Names of layers whose outputs
            need to be returned, default: None.

    Returns:
        list[ndarray]: The predicted pose info. The length of the list is
            the number of people (P). Each item in the list is a ndarray,
            containing each person's pose (ndarray[Kx3]): x, y, score.
        list[dict[np.ndarray[N, K, H, W] | torch.Tensor[N, K, H, W]]]:
            Output feature maps from layers specified in `outputs`.
            Includes 'heatmap' if `return_heatmap` is True.
    """
    pose_results = []
    returned_outputs = []

    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    channel_order = cfg.test_pipeline[0].get('channel_order', 'rgb')
    test_pipeline = [LoadImage(channel_order=channel_order)
                     ] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    # prepare data
    data = {
        'img_or_path': img_or_path,
        'dataset': 'coco',
        'ann_info': {
            'image_size': cfg.data_cfg['image_size'],
            'num_joints': cfg.data_cfg['num_joints'],
            'flip_index':
            [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15],
        }
    }

    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)

    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]
    else:
        # just get the actual data from DataContainer
        data['img_metas'] = data['img_metas'].data[0]

    with OutputHook(model, outputs=outputs, as_tensor=False) as h:
        # forward the model
        with torch.no_grad():
            result = model(img=data['img'],
                           img_metas=data['img_metas'],
                           return_loss=False,
                           return_heatmap=return_heatmap)

        if return_heatmap:
            h.layer_outputs['heatmap'] = result['output_heatmap']

        returned_outputs.append(h.layer_outputs)

        for idx, pred in enumerate(result['preds']):
            area = (np.max(pred[:, 0]) - np.min(pred[:, 0])) * (
                np.max(pred[:, 1]) - np.min(pred[:, 1]))
            pose_results.append({
                'keypoints': pred[:, :3],
                'score': result['scores'][idx],
                'area': area,
            })

        # pose nms
        keep = oks_nms(pose_results, pose_nms_thr, sigmas=None)
        pose_results = [pose_results[_keep] for _keep in keep]

    return pose_results, returned_outputs
def process_model(model, dataset, person_results, img_or_path):
    bboxes = np.array([box['bbox'] for box in person_results])

    cfg = model.cfg
    flip_pairs = None
    device = next(model.parameters()).device

    channel_order = cfg.test_pipeline[0].get('channel_order', 'rgb')
    test_pipeline = [LoadImage(channel_order=channel_order)
                     ] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    if dataset in ('TopDownCocoDataset', 'TopDownOCHumanDataset',
                   'AnimalMacaqueDataset'):
        flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12],
                      [13, 14], [15, 16]]
    elif dataset == 'TopDownCocoWholeBodyDataset':
        body = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14],
                [15, 16]]
        foot = [[17, 20], [18, 21], [19, 22]]
        face = [[23, 39], [24, 38], [25, 37], [26, 36], [27, 35], [28, 34],
                [29, 33], [30, 32], [40, 49], [41, 48], [42, 47], [43, 46],
                [44, 45], [54, 58], [55, 57], [59, 68], [60, 67], [61, 66],
                [62, 65], [63, 70], [64, 69], [71, 77], [72, 76], [73, 75],
                [78, 82], [79, 81], [83, 87], [84, 86], [88, 90]]
        hand = [[91, 112], [92, 113], [93, 114], [94, 115], [95, 116],
                [96, 117], [97, 118], [98, 119], [99, 120], [100, 121],
                [101, 122], [102, 123], [103, 124], [104, 125], [105, 126],
                [106, 127], [107, 128], [108, 129], [109, 130], [110, 131],
                [111, 132]]
        flip_pairs = body + foot + face + hand
    elif dataset == 'TopDownAicDataset':
        flip_pairs = [[0, 3], [1, 4], [2, 5], [6, 9], [7, 10], [8, 11]]
    elif dataset == 'TopDownMpiiDataset':
        flip_pairs = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13]]
    elif dataset == 'TopDownMpiiTrbDataset':
        flip_pairs = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11],
                      [14, 15], [16, 22], [28, 34], [17, 23], [29, 35],
                      [18, 24], [30, 36], [19, 25], [31, 37], [20, 26],
                      [32, 38], [21, 27], [33, 39]]
    elif dataset in ('OneHand10KDataset', 'FreiHandDataset',
                     'PanopticDataset', 'InterHand2DDataset'):
        flip_pairs = []
    elif dataset == 'Face300WDataset':
        flip_pairs = [[0, 16], [1, 15], [2, 14], [3, 13], [4, 12], [5, 11],
                      [6, 10], [7, 9], [17, 26], [18, 25], [19, 24],
                      [20, 23], [21, 22], [31, 35], [32, 34], [36, 45],
                      [37, 44], [38, 43], [39, 42], [40, 47], [41, 46],
                      [48, 54], [49, 53], [50, 52], [61, 63], [60, 64],
                      [67, 65], [58, 56], [59, 55]]
    elif dataset == 'FaceAFLWDataset':
        flip_pairs = [[0, 5], [1, 4], [2, 3], [6, 11], [7, 10], [8, 9],
                      [12, 14], [15, 17]]
    elif dataset == 'FaceCOFWDataset':
        flip_pairs = [[0, 1], [4, 6], [2, 3], [5, 7], [8, 9], [10, 11],
                      [12, 14], [16, 17], [13, 15], [18, 19], [22, 23]]
    elif dataset == 'FaceWFLWDataset':
        flip_pairs = [[0, 32], [1, 31], [2, 30], [3, 29], [4, 28], [5, 27],
                      [6, 26], [7, 25], [8, 24], [9, 23], [10, 22], [11, 21],
                      [12, 20], [13, 19], [14, 18], [15, 17], [33, 46],
                      [34, 45], [35, 44], [36, 43], [37, 42], [38, 50],
                      [39, 49], [40, 48], [41, 47], [60, 72], [61, 71],
                      [62, 70], [63, 69], [64, 68], [65, 75], [66, 74],
                      [67, 73], [55, 59], [56, 58], [76, 82], [77, 81],
                      [78, 80], [87, 83], [86, 84], [88, 92], [89, 91],
                      [95, 93], [96, 97]]
    elif dataset == 'AnimalFlyDataset':
        flip_pairs = [[1, 2], [6, 18], [7, 19], [8, 20], [9, 21], [10, 22],
                      [11, 23], [12, 24], [13, 25], [14, 26], [15, 27],
                      [16, 28], [17, 29], [30, 31]]
    elif dataset == 'AnimalHorse10Dataset':
        flip_pairs = []
    elif dataset == 'AnimalLocustDataset':
        flip_pairs = [[5, 20], [6, 21], [7, 22], [8, 23], [9, 24], [10, 25],
                      [11, 26], [12, 27], [13, 28], [14, 29], [15, 30],
                      [16, 31], [17, 32], [18, 33], [19, 34]]
    elif dataset == 'AnimalZebraDataset':
        flip_pairs = [[3, 4], [5, 6]]
    elif dataset == 'AnimalPoseDataset':
        flip_pairs = [[0, 1], [2, 3], [8, 9], [10, 11], [12, 13], [14, 15],
                      [16, 17], [18, 19]]
    else:
        raise NotImplementedError()

    batch_data = []
    for bbox in bboxes:
        center, scale = _box2cs(cfg, bbox)

        # prepare data
        data = {
            'img_or_path': img_or_path,
            'center': center,
            'scale': scale,
            'bbox_score': bbox[4] if len(bbox) == 5 else 1,
            'bbox_id': 0,  # need to be assigned if batch_size > 1
            'dataset': dataset,
            'joints_3d': np.zeros((cfg.data_cfg.num_joints, 3),
                                  dtype=np.float32),
            'joints_3d_visible': np.zeros((cfg.data_cfg.num_joints, 3),
                                          dtype=np.float32),
            'rotation': 0,
            'ann_info': {
                'image_size': np.array(cfg.data_cfg['image_size']),
                'num_joints': cfg.data_cfg['num_joints'],
                'flip_pairs': flip_pairs
            }
        }
        data = test_pipeline(data)
        batch_data.append(data)

    batch_data = collate(batch_data, samples_per_gpu=1)

    if next(model.parameters()).is_cuda:
        # scatter does not work here, so just move the image to the device
        batch_data['img'] = batch_data['img'].to(device)
    # get all img_metas of each bounding box
    batch_data['img_metas'] = [
        img_metas[0] for img_metas in batch_data['img_metas'].data
    ]

    # forward the model
    with torch.no_grad():
        result = model(img=batch_data['img'],
                       img_metas=batch_data['img_metas'],
                       return_loss=False,
                       return_heatmap=False)

    return result['preds'], result['output_heatmap']
def inference_mesh_model(model,
                         img_or_path,
                         det_results,
                         bbox_thr=None,
                         format='xywh',
                         dataset='MeshH36MDataset'):
    """Inference a single image with a list of bounding boxes.

    Note:
        - num_bboxes: N
        - num_keypoints: K
        - num_vertices: V
        - num_faces: F

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        det_results (list[dict]): The 2D bbox sequences stored in a list.
            Each element of the list is the bbox of one person.
            "bbox" (ndarray[4 or 5]): The person bounding box, which
            contains 4 box coordinates (and score).
        bbox_thr (float | None): Threshold for bounding boxes. Only bboxes
            with higher scores will be fed into the pose detector. If
            bbox_thr is None, all boxes will be used.
        format (str): bbox format ('xyxy' | 'xywh'). Default: 'xywh'.

            - 'xyxy' means (left, top, right, bottom),
            - 'xywh' means (left, top, width, height).
        dataset (str): Dataset name.

    Returns:
        list[dict]: 3D pose inference results. Each element \
            is the result of an instance, which contains:

            - 'bbox' (ndarray[4]): instance bounding bbox
            - 'center' (ndarray[2]): bbox center
            - 'scale' (ndarray[2]): bbox scale
            - 'keypoints_3d' (ndarray[K, 3]): predicted 3D keypoints
            - 'camera' (ndarray[3]): camera parameters
            - 'vertices' (ndarray[V, 3]): predicted 3D vertices
            - 'faces' (ndarray[F, 3]): mesh faces

            If there is no valid instance, an empty list will be returned.
    """
    assert format in ['xyxy', 'xywh']

    pose_results = []

    if len(det_results) == 0:
        return pose_results

    # Change for-loop preprocess each bbox to preprocess all bboxes at once.
    bboxes = np.array([box['bbox'] for box in det_results])

    # Select bboxes by score threshold
    if bbox_thr is not None:
        assert bboxes.shape[1] == 5
        valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0]
        bboxes = bboxes[valid_idx]
        det_results = [det_results[i] for i in valid_idx]

    if format == 'xyxy':
        bboxes_xyxy = bboxes
        bboxes_xywh = _xyxy2xywh(bboxes)
    else:
        # format is already 'xywh'
        bboxes_xywh = bboxes
        bboxes_xyxy = _xywh2xyxy(bboxes)

    # if bbox_thr removed all bounding boxes
    if len(bboxes_xywh) == 0:
        return []

    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    channel_order = cfg.test_pipeline[0].get('channel_order', 'rgb')
    test_pipeline = [LoadImage(channel_order=channel_order)
                     ] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    assert len(bboxes[0]) in [4, 5]

    if dataset == 'MeshH36MDataset':
        flip_pairs = [[0, 5], [1, 4], [2, 3], [6, 11], [7, 10], [8, 9],
                      [20, 21], [22, 23]]
    else:
        raise NotImplementedError()

    batch_data = []
    for bbox in bboxes:
        center, scale = _box2cs(cfg, bbox)

        # prepare data
        data = {
            'img_or_path': img_or_path,
            'center': center,
            'scale': scale,
            'rotation': 0,
            'bbox_score': bbox[4] if len(bbox) == 5 else 1,
            'dataset': dataset,
            'joints_2d': np.zeros((cfg.data_cfg.num_joints, 2),
                                  dtype=np.float32),
            'joints_2d_visible': np.zeros((cfg.data_cfg.num_joints, 1),
                                          dtype=np.float32),
            'joints_3d': np.zeros((cfg.data_cfg.num_joints, 3),
                                  dtype=np.float32),
            'joints_3d_visible': np.zeros((cfg.data_cfg.num_joints, 3),
                                          dtype=np.float32),
            'pose': np.zeros(72, dtype=np.float32),
            'beta': np.zeros(10, dtype=np.float32),
            'has_smpl': 0,
            'ann_info': {
                'image_size': np.array(cfg.data_cfg['image_size']),
                'num_joints': cfg.data_cfg['num_joints'],
                'flip_pairs': flip_pairs,
            }
        }
        data = test_pipeline(data)
        batch_data.append(data)

    batch_data = collate(batch_data, samples_per_gpu=1)

    if next(model.parameters()).is_cuda:
        # scatter does not work here, so just move the image to the device
        batch_data['img'] = batch_data['img'].to(device)
    # get all img_metas of each bounding box
    batch_data['img_metas'] = [
        img_metas[0] for img_metas in batch_data['img_metas'].data
    ]

    # forward the model
    with torch.no_grad():
        preds = model(img=batch_data['img'],
                      img_metas=batch_data['img_metas'],
                      return_loss=False,
                      return_vertices=True,
                      return_faces=True)

    for idx in range(len(det_results)):
        pose_res = det_results[idx].copy()
        pose_res['bbox'] = bboxes_xyxy[idx]
        pose_res['center'] = batch_data['img_metas'][idx]['center']
        pose_res['scale'] = batch_data['img_metas'][idx]['scale']
        pose_res['keypoints_3d'] = preds['keypoints_3d'][idx]
        pose_res['camera'] = preds['camera'][idx]
        pose_res['vertices'] = preds['vertices'][idx]
        pose_res['faces'] = preds['faces']
        pose_results.append(pose_res)

    return pose_results
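
# Usage sketch (hedged): recover a human mesh for one detected person.
# `mesh_model` and the file names are placeholders.
import numpy as np

det_results = [{'bbox': np.array([80, 40, 240, 420, 0.99])}]
mesh_results = inference_mesh_model(mesh_model,
                                    'person.jpg',
                                    det_results,
                                    bbox_thr=0.5,
                                    format='xywh',
                                    dataset='MeshH36MDataset')
for res in mesh_results:
    print(res['vertices'].shape,  # (V, 3) predicted mesh vertices
          res['faces'].shape)  # (F, 3) triangle indices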
def _inference_single_pose_model(model,
                                 img_or_path,
                                 bboxes,
                                 dataset,
                                 return_heatmap=False):
    """Inference pose for all bboxes on a single image.

    num_keypoints: K

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        bboxes (list | np.ndarray): All bounding boxes (with scores),
            shaped (N, 4) or (N, 5). (left, top, width, height, [score])
            where N is number of bounding boxes.
        dataset (str): Dataset name.
        return_heatmap (bool): Flag to return heatmap, default: False.

    Returns:
        ndarray[NxKx3]: Predicted poses x, y, score.
        heatmap[N, K, H, W]: Model output heatmap.
    """
    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    channel_order = cfg.test_pipeline[0].get('channel_order', 'rgb')
    test_pipeline = [LoadImage(channel_order=channel_order)
                     ] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    assert len(bboxes[0]) in [4, 5]

    flip_pairs = None
    if dataset in ('TopDownCocoDataset', 'TopDownOCHumanDataset',
                   'AnimalMacaqueDataset'):
        flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12],
                      [13, 14], [15, 16]]
    elif dataset == 'TopDownCocoWholeBodyDataset':
        body = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14],
                [15, 16]]
        foot = [[17, 20], [18, 21], [19, 22]]
        face = [[23, 39], [24, 38], [25, 37], [26, 36], [27, 35], [28, 34],
                [29, 33], [30, 32], [40, 49], [41, 48], [42, 47], [43, 46],
                [44, 45], [54, 58], [55, 57], [59, 68], [60, 67], [61, 66],
                [62, 65], [63, 70], [64, 69], [71, 77], [72, 76], [73, 75],
                [78, 82], [79, 81], [83, 87], [84, 86], [88, 90]]
        hand = [[91, 112], [92, 113], [93, 114], [94, 115], [95, 116],
                [96, 117], [97, 118], [98, 119], [99, 120], [100, 121],
                [101, 122], [102, 123], [103, 124], [104, 125], [105, 126],
                [106, 127], [107, 128], [108, 129], [109, 130], [110, 131],
                [111, 132]]
        flip_pairs = body + foot + face + hand
    elif dataset == 'TopDownAicDataset':
        flip_pairs = [[0, 3], [1, 4], [2, 5], [6, 9], [7, 10], [8, 11]]
    elif dataset == 'TopDownMpiiDataset':
        flip_pairs = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13]]
    elif dataset == 'TopDownMpiiTrbDataset':
        flip_pairs = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11],
                      [14, 15], [16, 22], [28, 34], [17, 23], [29, 35],
                      [18, 24], [30, 36], [19, 25], [31, 37], [20, 26],
                      [32, 38], [21, 27], [33, 39]]
    elif dataset in ('OneHand10KDataset', 'FreiHandDataset',
                     'PanopticDataset', 'InterHand2DDataset'):
        flip_pairs = []
    elif dataset == 'Face300WDataset':
        flip_pairs = [[0, 16], [1, 15], [2, 14], [3, 13], [4, 12], [5, 11],
                      [6, 10], [7, 9], [17, 26], [18, 25], [19, 24],
                      [20, 23], [21, 22], [31, 35], [32, 34], [36, 45],
                      [37, 44], [38, 43], [39, 42], [40, 47], [41, 46],
                      [48, 54], [49, 53], [50, 52], [61, 63], [60, 64],
                      [67, 65], [58, 56], [59, 55]]
    elif dataset == 'FaceAFLWDataset':
        flip_pairs = [[0, 5], [1, 4], [2, 3], [6, 11], [7, 10], [8, 9],
                      [12, 14], [15, 17]]
    elif dataset == 'FaceCOFWDataset':
        flip_pairs = [[0, 1], [4, 6], [2, 3], [5, 7], [8, 9], [10, 11],
                      [12, 14], [16, 17], [13, 15], [18, 19], [22, 23]]
    elif dataset == 'FaceWFLWDataset':
        flip_pairs = [[0, 32], [1, 31], [2, 30], [3, 29], [4, 28], [5, 27],
                      [6, 26], [7, 25], [8, 24], [9, 23], [10, 22], [11, 21],
                      [12, 20], [13, 19], [14, 18], [15, 17], [33, 46],
                      [34, 45], [35, 44], [36, 43], [37, 42], [38, 50],
                      [39, 49], [40, 48], [41, 47], [60, 72], [61, 71],
                      [62, 70], [63, 69], [64, 68], [65, 75], [66, 74],
                      [67, 73], [55, 59], [56, 58], [76, 82], [77, 81],
                      [78, 80], [87, 83], [86, 84], [88, 92], [89, 91],
                      [95, 93], [96, 97]]
    elif dataset == 'AnimalFlyDataset':
        flip_pairs = [[1, 2], [6, 18], [7, 19], [8, 20], [9, 21], [10, 22],
                      [11, 23], [12, 24], [13, 25], [14, 26], [15, 27],
                      [16, 28], [17, 29], [30, 31]]
    elif dataset == 'AnimalHorse10Dataset':
        flip_pairs = []
    elif dataset == 'AnimalLocustDataset':
        flip_pairs = [[5, 20], [6, 21], [7, 22], [8, 23], [9, 24], [10, 25],
                      [11, 26], [12, 27], [13, 28], [14, 29], [15, 30],
                      [16, 31], [17, 32], [18, 33], [19, 34]]
    elif dataset == 'AnimalZebraDataset':
        flip_pairs = [[3, 4], [5, 6]]
    elif dataset == 'AnimalPoseDataset':
        flip_pairs = [[0, 1], [2, 3], [8, 9], [10, 11], [12, 13], [14, 15],
                      [16, 17], [18, 19]]
    else:
        raise NotImplementedError()

    batch_data = []
    for bbox in bboxes:
        center, scale = _box2cs(cfg, bbox)

        # prepare data
        data = {
            'img_or_path': img_or_path,
            'center': center,
            'scale': scale,
            'bbox_score': bbox[4] if len(bbox) == 5 else 1,
            'bbox_id': 0,  # need to be assigned if batch_size > 1
            'dataset': dataset,
            'joints_3d': np.zeros((cfg.data_cfg.num_joints, 3),
                                  dtype=np.float32),
            'joints_3d_visible': np.zeros((cfg.data_cfg.num_joints, 3),
                                          dtype=np.float32),
            'rotation': 0,
            'ann_info': {
                'image_size': np.array(cfg.data_cfg['image_size']),
                'num_joints': cfg.data_cfg['num_joints'],
                'flip_pairs': flip_pairs
            }
        }
        data = test_pipeline(data)
        batch_data.append(data)

    batch_data = collate(batch_data, samples_per_gpu=1)

    if next(model.parameters()).is_cuda:
        # scatter does not work here, so just move the image to the device
        batch_data['img'] = batch_data['img'].to(device)
    # get all img_metas of each bounding box
    batch_data['img_metas'] = [
        img_metas[0] for img_metas in batch_data['img_metas'].data
    ]

    # forward the model
    with torch.no_grad():
        result = model(img=batch_data['img'],
                       img_metas=batch_data['img_metas'],
                       return_loss=False,
                       return_heatmap=return_heatmap)

    return result['preds'], result['output_heatmap']
def inference_pose_lifter_model(model,
                                pose_results_2d,
                                dataset=None,
                                dataset_info=None,
                                with_track_id=True,
                                image_size=None,
                                norm_pose_2d=False):
    """Inference 3D pose from 2D pose sequences using a pose lifter model.

    Args:
        model (nn.Module): The loaded pose lifter model.
        pose_results_2d (list[list[dict]]): The 2D pose sequences stored in
            a nested list. Each element of the outer list is the 2D pose
            results of a single frame, and each element of the inner list
            is the 2D pose of one person, which contains:

            - "keypoints" (ndarray[K, 2 or 3]): x, y, [score]
            - "track_id" (int)
        dataset (str): Dataset name, e.g. 'Body3DH36MDataset'. Deprecated.
        dataset_info (DatasetInfo): A class containing all dataset info.
        with_track_id: If True, the element in pose_results_2d is expected
            to contain "track_id", which will be used to gather the pose
            sequence of a person from multiple frames. Otherwise, the pose
            results in each frame are expected to have a consistent number
            and order of identities. Default is True.
        image_size (tuple | list): image width, image height. If None,
            image size will not be contained in dict ``data``.
        norm_pose_2d (bool): If True, scale the bbox (along with the 2D
            pose) to the average bbox scale of the dataset, and move the
            bbox (along with the 2D pose) to the average bbox center of
            the dataset.

    Returns:
        list[dict]: 3D pose inference results. Each element is the result \
            of an instance, which contains:

            - "keypoints_3d" (ndarray[K, 3]): predicted 3D keypoints
            - "keypoints" (ndarray[K, 2 or 3]): from the last frame in \
                ``pose_results_2d``.
            - "track_id" (int): from the last frame in ``pose_results_2d``.

            If there is no valid instance, an empty list will be returned.
    """
    cfg = model.cfg
    test_pipeline = Compose(cfg.test_pipeline)

    if dataset_info is not None:
        flip_pairs = dataset_info.flip_pairs
        assert 'stats_info' in dataset_info._dataset_info
        bbox_center = dataset_info._dataset_info['stats_info']['bbox_center']
        bbox_scale = dataset_info._dataset_info['stats_info']['bbox_scale']
    else:
        warnings.warn(
            'dataset is deprecated. '
            'Please set `dataset_info` in the config. '
            'Check https://github.com/open-mmlab/mmpose/pull/663 '
            'for details.', DeprecationWarning)
        # TODO: These will be removed in the later versions.
        if dataset == 'Body3DH36MDataset':
            flip_pairs = [[1, 4], [2, 5], [3, 6], [11, 14], [12, 15],
                          [13, 16]]
            bbox_center = np.array([[528, 427]], dtype=np.float32)
            bbox_scale = 400
        else:
            raise NotImplementedError()

    target_idx = -1 if model.causal else len(pose_results_2d) // 2

    pose_lifter_inputs = _gather_pose_lifter_inputs(pose_results_2d,
                                                    bbox_center, bbox_scale,
                                                    norm_pose_2d)
    pose_sequences_2d = _collate_pose_sequence(pose_lifter_inputs,
                                               with_track_id, target_idx)

    if not pose_sequences_2d:
        return []

    batch_data = []
    for seq in pose_sequences_2d:
        pose_2d = seq['keypoints'].astype(np.float32)
        T, K, C = pose_2d.shape

        input_2d = pose_2d[..., :2]
        if C > 2:
            input_2d_visible = pose_2d[..., 2:3]
        else:
            input_2d_visible = np.ones((T, K, 1), dtype=np.float32)

        # TODO: Will be removed in the later versions.
        # Dummy 3D input
        # This is for compatibility with configs in mmpose<=v0.14.0, where a
        # 3D input is required to generate denormalization parameters. This
        # part will be removed in the future.
        target = np.zeros((K, 3), dtype=np.float32)
        target_visible = np.ones((K, 1), dtype=np.float32)

        # Dummy image path
        # This is for compatibility with configs in mmpose<=v0.14.0, where
        # target_image_path is required. This part will be removed in the
        # future.
        target_image_path = None

        data = {
            'input_2d': input_2d,
            'input_2d_visible': input_2d_visible,
            'target': target,
            'target_visible': target_visible,
            'target_image_path': target_image_path,
            'ann_info': {
                'num_joints': K,
                'flip_pairs': flip_pairs
            }
        }

        if image_size is not None:
            assert len(image_size) == 2
            data['image_width'] = image_size[0]
            data['image_height'] = image_size[1]

        data = test_pipeline(data)
        batch_data.append(data)

    batch_data = collate(batch_data, samples_per_gpu=len(batch_data))
    if next(model.parameters()).is_cuda:
        device = next(model.parameters()).device
        batch_data = scatter(batch_data, target_gpus=[device.index])[0]
    else:
        batch_data = scatter(batch_data, target_gpus=[-1])[0]

    with torch.no_grad():
        result = model(input=batch_data['input'],
                       metas=batch_data['metas'],
                       return_loss=False)

    poses_3d = result['preds']
    if poses_3d.shape[-1] != 4:
        assert poses_3d.shape[-1] == 3
        # pad a dummy score column to unify the output format
        dummy_score = np.ones(poses_3d.shape[:-1] + (1, ),
                              dtype=poses_3d.dtype)
        poses_3d = np.concatenate((poses_3d, dummy_score), axis=-1)

    pose_results = []
    for pose_2d, pose_3d in zip(pose_sequences_2d, poses_3d):
        pose_result = pose_2d.copy()
        pose_result['keypoints_3d'] = pose_3d
        pose_results.append(pose_result)

    return pose_results
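
# Usage sketch (hedged): the newer API prefers `dataset_info` over the
# deprecated `dataset` string. `lifter_model` and `pose_results_2d` are
# assumed to exist; the image size values are example numbers only.
dataset_info = DatasetInfo(lifter_model.cfg.dataset_info)
results_3d = inference_pose_lifter_model(lifter_model,
                                         pose_results_2d,
                                         dataset_info=dataset_info,
                                         with_track_id=True,
                                         image_size=(1000, 1000),
                                         norm_pose_2d=True)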
def _inference_single_pose_model(model,
                                 imgs_or_paths,
                                 bboxes,
                                 dataset='TopDownCocoDataset',
                                 dataset_info=None,
                                 return_heatmap=False,
                                 use_multi_frames=False):
    """Inference human bounding boxes.

    Note:
        - num_frames: F
        - num_bboxes: N
        - num_keypoints: K

    Args:
        model (nn.Module): The loaded pose model.
        imgs_or_paths (list(str) | list(np.ndarray)): Image filename(s) or
            loaded image(s).
        bboxes (list | np.ndarray): All bounding boxes (with scores),
            shaped (N, 4) or (N, 5). (left, top, width, height, [score])
            where N is number of bounding boxes.
        dataset (str): Dataset name. Deprecated.
        dataset_info (DatasetInfo): A class containing all dataset info.
        return_heatmap (bool): Flag to return heatmap, default: False.
        use_multi_frames (bool): Flag to use multi frames for inference.

    Returns:
        ndarray[NxKx3]: Predicted pose x, y, score.
        heatmap[N, K, H, W]: Model output heatmap.
    """
    cfg = model.cfg
    device = next(model.parameters()).device
    if device.type == 'cpu':
        device = -1

    if use_multi_frames:
        assert 'frame_weight_test' in cfg.data.test.data_cfg
        # use multi frames for inference
        # the number of input frames must equal to frame weight in the config
        assert len(imgs_or_paths) == len(
            cfg.data.test.data_cfg.frame_weight_test)

    # build the data pipeline
    _test_pipeline = copy.deepcopy(cfg.test_pipeline)

    has_bbox_xywh2cs = False
    for transform in _test_pipeline:
        if transform['type'] == 'TopDownGetBboxCenterScale':
            has_bbox_xywh2cs = True
            break
    if not has_bbox_xywh2cs:
        _test_pipeline.insert(
            0, dict(type='TopDownGetBboxCenterScale', padding=1.25))
    test_pipeline = Compose(_test_pipeline)
    _pipeline_gpu_speedup(test_pipeline, next(model.parameters()).device)

    assert len(bboxes[0]) in [4, 5]

    if dataset_info is not None:
        dataset_name = dataset_info.dataset_name
        flip_pairs = dataset_info.flip_pairs
    else:
        warnings.warn(
            'dataset is deprecated. '
            'Please set `dataset_info` in the config. '
            'Check https://github.com/open-mmlab/mmpose/pull/663 '
            'for details.', DeprecationWarning)
        # TODO: These will be removed in the later versions.
        if dataset in ('TopDownCocoDataset', 'TopDownOCHumanDataset',
                       'AnimalMacaqueDataset'):
            flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12],
                          [13, 14], [15, 16]]
        elif dataset == 'TopDownCocoWholeBodyDataset':
            body = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12],
                    [13, 14], [15, 16]]
            foot = [[17, 20], [18, 21], [19, 22]]
            face = [[23, 39], [24, 38], [25, 37], [26, 36], [27, 35],
                    [28, 34], [29, 33], [30, 32], [40, 49], [41, 48],
                    [42, 47], [43, 46], [44, 45], [54, 58], [55, 57],
                    [59, 68], [60, 67], [61, 66], [62, 65], [63, 70],
                    [64, 69], [71, 77], [72, 76], [73, 75], [78, 82],
                    [79, 81], [83, 87], [84, 86], [88, 90]]
            hand = [[91, 112], [92, 113], [93, 114], [94, 115], [95, 116],
                    [96, 117], [97, 118], [98, 119], [99, 120], [100, 121],
                    [101, 122], [102, 123], [103, 124], [104, 125],
                    [105, 126], [106, 127], [107, 128], [108, 129],
                    [109, 130], [110, 131], [111, 132]]
            flip_pairs = body + foot + face + hand
        elif dataset == 'TopDownAicDataset':
            flip_pairs = [[0, 3], [1, 4], [2, 5], [6, 9], [7, 10], [8, 11]]
        elif dataset == 'TopDownMpiiDataset':
            flip_pairs = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14],
                          [12, 13]]
        elif dataset == 'TopDownMpiiTrbDataset':
            flip_pairs = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11],
                          [14, 15], [16, 22], [28, 34], [17, 23], [29, 35],
                          [18, 24], [30, 36], [19, 25], [31, 37], [20, 26],
                          [32, 38], [21, 27], [33, 39]]
        elif dataset in ('OneHand10KDataset', 'FreiHandDataset',
                         'PanopticDataset', 'InterHand2DDataset'):
            flip_pairs = []
        elif dataset == 'Face300WDataset':
            flip_pairs = [[0, 16], [1, 15], [2, 14], [3, 13], [4, 12],
                          [5, 11], [6, 10], [7, 9], [17, 26], [18, 25],
                          [19, 24], [20, 23], [21, 22], [31, 35], [32, 34],
                          [36, 45], [37, 44], [38, 43], [39, 42], [40, 47],
                          [41, 46], [48, 54], [49, 53], [50, 52], [61, 63],
                          [60, 64], [67, 65], [58, 56], [59, 55]]
        elif dataset == 'FaceAFLWDataset':
            flip_pairs = [[0, 5], [1, 4], [2, 3], [6, 11], [7, 10], [8, 9],
                          [12, 14], [15, 17]]
        elif dataset == 'FaceCOFWDataset':
            flip_pairs = [[0, 1], [4, 6], [2, 3], [5, 7], [8, 9], [10, 11],
                          [12, 14], [16, 17], [13, 15], [18, 19], [22, 23]]
        elif dataset == 'FaceWFLWDataset':
            flip_pairs = [[0, 32], [1, 31], [2, 30], [3, 29], [4, 28],
                          [5, 27], [6, 26], [7, 25], [8, 24], [9, 23],
                          [10, 22], [11, 21], [12, 20], [13, 19], [14, 18],
                          [15, 17], [33, 46], [34, 45], [35, 44], [36, 43],
                          [37, 42], [38, 50], [39, 49], [40, 48], [41, 47],
                          [60, 72], [61, 71], [62, 70], [63, 69], [64, 68],
                          [65, 75], [66, 74], [67, 73], [55, 59], [56, 58],
                          [76, 82], [77, 81], [78, 80], [87, 83], [86, 84],
                          [88, 92], [89, 91], [95, 93], [96, 97]]
        elif dataset == 'AnimalFlyDataset':
            flip_pairs = [[1, 2], [6, 18], [7, 19], [8, 20], [9, 21],
                          [10, 22], [11, 23], [12, 24], [13, 25], [14, 26],
                          [15, 27], [16, 28], [17, 29], [30, 31]]
        elif dataset == 'AnimalHorse10Dataset':
            flip_pairs = []
        elif dataset == 'AnimalLocustDataset':
            flip_pairs = [[5, 20], [6, 21], [7, 22], [8, 23], [9, 24],
                          [10, 25], [11, 26], [12, 27], [13, 28], [14, 29],
                          [15, 30], [16, 31], [17, 32], [18, 33], [19, 34]]
        elif dataset == 'AnimalZebraDataset':
            flip_pairs = [[3, 4], [5, 6]]
        elif dataset == 'AnimalPoseDataset':
            flip_pairs = [[0, 1], [2, 3], [8, 9], [10, 11], [12, 13],
                          [14, 15], [16, 17], [18, 19]]
        else:
            raise NotImplementedError()
        dataset_name = dataset

    batch_data = []
    for bbox in bboxes:
        # prepare data
        data = {
            'bbox': bbox,
            'bbox_score': bbox[4] if len(bbox) == 5 else 1,
            'bbox_id': 0,  # need to be assigned if batch_size > 1
            'dataset': dataset_name,
            'joints_3d': np.zeros((cfg.data_cfg.num_joints, 3),
                                  dtype=np.float32),
            'joints_3d_visible': np.zeros((cfg.data_cfg.num_joints, 3),
                                          dtype=np.float32),
            'rotation': 0,
            'ann_info': {
                'image_size': np.array(cfg.data_cfg['image_size']),
                'num_joints': cfg.data_cfg['num_joints'],
                'flip_pairs': flip_pairs
            }
        }

        if use_multi_frames:
            # weight for different frames in multi-frame inference setting
            data['frame_weight'] = cfg.data.test.data_cfg.frame_weight_test
            if isinstance(imgs_or_paths[0], np.ndarray):
                data['img'] = imgs_or_paths
            else:
                data['image_file'] = imgs_or_paths
        else:
            if isinstance(imgs_or_paths, np.ndarray):
                data['img'] = imgs_or_paths
            else:
                data['image_file'] = imgs_or_paths

        data = test_pipeline(data)
        batch_data.append(data)

    batch_data = collate(batch_data, samples_per_gpu=len(batch_data))
    batch_data = scatter(batch_data, [device])[0]

    # forward the model
    with torch.no_grad():
        result = model(img=batch_data['img'],
                       img_metas=batch_data['img_metas'],
                       return_loss=False,
                       return_heatmap=return_heatmap)

    return result['preds'], result['output_heatmap']
def inference_bottom_up_pose_model(model,
                                   img_or_path,
                                   dataset='BottomUpCocoDataset',
                                   dataset_info=None,
                                   pose_nms_thr=0.9,
                                   return_heatmap=False,
                                   outputs=None):
    """Inference a single image with a bottom-up pose model.

    Note:
        - num_people: P
        - num_keypoints: K
        - bbox height: H
        - bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        dataset (str): Dataset name, e.g. 'BottomUpCocoDataset'.
            It is deprecated. Please use dataset_info instead.
        dataset_info (DatasetInfo): A class containing all dataset info.
        pose_nms_thr (float): retain oks overlap < pose_nms_thr,
            default: 0.9.
        return_heatmap (bool): Flag to return heatmap, default: False.
        outputs (list(str) | tuple(str)): Names of layers whose outputs
            need to be returned, default: None.

    Returns:
        tuple:
        - pose_results (list[np.ndarray]): The predicted pose info. \
            The length of the list is the number of people (P). \
            Each item in the list is a ndarray, containing each \
            person's pose (np.ndarray[Kx3]): x, y, score.
        - returned_outputs (list[dict[np.ndarray[N, K, H, W] | \
            torch.Tensor[N, K, H, W]]]): \
            Output feature maps from layers specified in `outputs`. \
            Includes 'heatmap' if `return_heatmap` is True.
    """
    # get dataset info
    if (dataset_info is None and hasattr(model, 'cfg')
            and 'dataset_info' in model.cfg):
        dataset_info = DatasetInfo(model.cfg.dataset_info)

    if dataset_info is not None:
        dataset_name = dataset_info.dataset_name
        flip_index = dataset_info.flip_index
        sigmas = getattr(dataset_info, 'sigmas', None)
    else:
        warnings.warn(
            'dataset is deprecated. '
            'Please set `dataset_info` in the config. '
            'Check https://github.com/open-mmlab/mmpose/pull/663 '
            'for details.', DeprecationWarning)
        assert (dataset == 'BottomUpCocoDataset')
        dataset_name = dataset
        flip_index = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16,
                      15]
        sigmas = None

    pose_results = []
    returned_outputs = []

    cfg = model.cfg
    device = next(model.parameters()).device
    if device.type == 'cpu':
        device = -1

    # build the data pipeline
    test_pipeline = Compose(cfg.test_pipeline)
    _pipeline_gpu_speedup(test_pipeline, next(model.parameters()).device)

    # prepare data
    data = {
        'dataset': dataset_name,
        'ann_info': {
            'image_size': np.array(cfg.data_cfg['image_size']),
            'num_joints': cfg.data_cfg['num_joints'],
            'flip_index': flip_index,
        }
    }
    if isinstance(img_or_path, np.ndarray):
        data['img'] = img_or_path
    else:
        data['image_file'] = img_or_path

    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    data = scatter(data, [device])[0]

    with OutputHook(model, outputs=outputs, as_tensor=False) as h:
        # forward the model
        with torch.no_grad():
            result = model(img=data['img'],
                           img_metas=data['img_metas'],
                           return_loss=False,
                           return_heatmap=return_heatmap)

        if return_heatmap:
            h.layer_outputs['heatmap'] = result['output_heatmap']

        returned_outputs.append(h.layer_outputs)

        for idx, pred in enumerate(result['preds']):
            area = (np.max(pred[:, 0]) - np.min(pred[:, 0])) * (
                np.max(pred[:, 1]) - np.min(pred[:, 1]))
            pose_results.append({
                'keypoints': pred[:, :3],
                'score': result['scores'][idx],
                'area': area,
            })

        # pose nms
        score_per_joint = cfg.model.test_cfg.get('score_per_joint', False)
        keep = oks_nms(pose_results,
                       pose_nms_thr,
                       sigmas,
                       score_per_joint=score_per_joint)
        pose_results = [pose_results[_keep] for _keep in keep]

    return pose_results, returned_outputs
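
# Usage sketch (hedged): bottom-up inference needs no detector boxes; all
# people are estimated in one pass. `bottom_up_model` and the image name
# are placeholders.
pose_results, returned_outputs = inference_bottom_up_pose_model(
    bottom_up_model,
    'crowd.jpg',
    dataset_info=DatasetInfo(bottom_up_model.cfg.dataset_info),
    pose_nms_thr=0.9,
    return_heatmap=False)
for person in pose_results:
    print(person['score'], person['area'], person['keypoints'].shape)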
def _inference_single_pose_model(model, img_or_path, bbox):
    """Inference a single bbox.

    num_keypoints: K

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        bbox (list | np.ndarray): Bounding box (with score), shaped
            (4, ) or (5, ). (left, top, width, height, [score])

    Returns:
        ndarray[Kx3]: Predicted pose x, y, score.
    """
    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    test_pipeline = [LoadImage()] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    assert len(bbox) in [4, 5]
    center, scale = _box2cs(cfg, bbox)

    # prepare data
    data = {
        'img_or_path': img_or_path,
        'center': center,
        'scale': scale,
        'bbox_score': bbox[4] if len(bbox) == 5 else 1,
        'dataset': 'coco',
        'joints_3d': np.zeros((cfg.data_cfg.num_joints, 3),
                              dtype=np.float32),
        'joints_3d_visible': np.zeros((cfg.data_cfg.num_joints, 3),
                                      dtype=np.float32),
        'rotation': 0,
        'ann_info': {
            'image_size': cfg.data_cfg['image_size'],
            'num_joints': cfg.data_cfg['num_joints'],
            'flip_pairs': [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10],
                           [11, 12], [13, 14], [15, 16]]
        }
    }
    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)

    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]
    else:
        # just get the actual data from DataContainer
        data['img_metas'] = data['img_metas'].data[0]

    # forward the model
    with torch.no_grad():
        all_preds, _, _ = model(return_loss=False,
                                img=data['img'],
                                img_metas=data['img_metas'])

    return all_preds[0]
def test_camera_projection():
    results = get_data_sample()

    pipeline_1 = [
        dict(type='CameraProjection',
             item='input_3d',
             output_name='input_3d_w',
             camera_type='SimpleCamera',
             mode='camera_to_world'),
        dict(type='CameraProjection',
             item='input_3d_w',
             output_name='input_3d_wp',
             camera_type='SimpleCamera',
             mode='world_to_pixel'),
        dict(type='CameraProjection',
             item='input_3d',
             output_name='input_3d_p',
             camera_type='SimpleCamera',
             mode='camera_to_pixel'),
        dict(type='Collect',
             keys=['input_3d_wp', 'input_3d_p'],
             meta_keys=[])
    ]

    camera_param = results['camera_param'].copy()
    camera_param['K'] = np.concatenate(
        (np.diagflat(camera_param['f']), camera_param['c']), axis=-1)
    pipeline_2 = [
        dict(type='CameraProjection',
             item='input_3d',
             output_name='input_3d_w',
             camera_type='SimpleCamera',
             camera_param=camera_param,
             mode='camera_to_world'),
        dict(type='CameraProjection',
             item='input_3d_w',
             output_name='input_3d_wp',
             camera_type='SimpleCamera',
             camera_param=camera_param,
             mode='world_to_pixel'),
        dict(type='CameraProjection',
             item='input_3d',
             output_name='input_3d_p',
             camera_type='SimpleCamera',
             camera_param=camera_param,
             mode='camera_to_pixel'),
        dict(type='CameraProjection',
             item='input_3d_w',
             output_name='input_3d_wc',
             camera_type='SimpleCamera',
             camera_param=camera_param,
             mode='world_to_camera'),
        dict(type='Collect',
             keys=['input_3d_wp', 'input_3d_p', 'input_2d'],
             meta_keys=[])
    ]

    output1 = Compose(pipeline_1)(results)
    output2 = Compose(pipeline_2)(results)

    np.testing.assert_allclose(output1['input_3d_wp'],
                               output1['input_3d_p'],
                               rtol=1e-6)
    np.testing.assert_allclose(output2['input_3d_wp'],
                               output2['input_3d_p'],
                               rtol=1e-6)
    np.testing.assert_allclose(output2['input_3d_p'],
                               output2['input_2d'],
                               rtol=1e-3,
                               atol=1e-1)

    # test invalid camera parameters
    with pytest.raises(ValueError):
        # missing intrinsic parameters
        camera_param_wo_intrinsic = camera_param.copy()
        camera_param_wo_intrinsic.pop('K')
        camera_param_wo_intrinsic.pop('f')
        camera_param_wo_intrinsic.pop('c')
        _ = Compose([
            dict(type='CameraProjection',
                 item='input_3d',
                 camera_type='SimpleCamera',
                 camera_param=camera_param_wo_intrinsic,
                 mode='camera_to_pixel')
        ])

    with pytest.raises(ValueError):
        # invalid mode
        _ = Compose([
            dict(type='CameraProjection',
                 item='input_3d',
                 camera_type='SimpleCamera',
                 camera_param=camera_param,
                 mode='dummy')
        ])

    # test camera without undistortion
    camera_param_wo_undistortion = camera_param.copy()
    camera_param_wo_undistortion.pop('k')
    camera_param_wo_undistortion.pop('p')
    _ = Compose([
        dict(type='CameraProjection',
             item='input_3d',
             camera_type='SimpleCamera',
             camera_param=camera_param_wo_undistortion,
             mode='camera_to_pixel')
    ])

    # test pixel to camera transformation
    camera = SimpleCamera(camera_param_wo_undistortion)
    kpt_camera = np.random.rand(14, 3)
    kpt_pixel = camera.camera_to_pixel(kpt_camera)
    _kpt_camera = camera.pixel_to_camera(
        np.concatenate([kpt_pixel, kpt_camera[:, [2]]], -1))
    assert_array_almost_equal(_kpt_camera, kpt_camera, decimal=4)
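
# A minimal sketch (assuming zero distortion) of the pinhole model the
# test above exercises: a camera-frame point (X, Y, Z) maps to pixel
# coordinates u = fx * X / Z + cx and v = fy * Y / Z + cy, which is what
# SimpleCamera.camera_to_pixel computes once the k/p distortion terms drop
# out. `f` and `c` are taken as flat (2,) arrays here for simplicity.
import numpy as np

def camera_to_pixel_pinhole(kpts_cam, f, c):
    """kpts_cam: (N, 3) camera-frame points; f, c: (2,) focal/center."""
    uv = kpts_cam[:, :2] / kpts_cam[:, 2:3]  # perspective division by depth
    return uv * np.asarray(f) + np.asarray(c)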
def inference_bottom_up_pose_model(model, img_or_path):
    """Inference a single image.

    num_people: P
    num_keypoints: K
    bbox height: H
    bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.

    Returns:
        list[ndarray]: The predicted pose info. The length of the list is
            the number of people (P). Each item in the list is a ndarray,
            containing each person's pose (ndarray[Kx3]): x, y, score.
    """
    pose_results = []

    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    test_pipeline = [LoadImage()] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    # prepare data
    data = {
        'img_or_path': img_or_path,
        'dataset': 'coco',
        'ann_info': {
            'image_size': cfg.data_cfg['image_size'],
            'num_joints': cfg.data_cfg['num_joints'],
            'flip_index':
            [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15],
        }
    }

    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)

    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]
    else:
        # just get the actual data from DataContainer
        data['img_metas'] = data['img_metas'].data[0]

    # forward the model
    with torch.no_grad():
        all_preds, _, _ = model(return_loss=False,
                                img=data['img'],
                                img_metas=data['img_metas'])

    for pred in all_preds:
        pose_results.append({
            'keypoints': pred[:, :3],
        })

    return pose_results
def _inference_single_pose_model(model,
                                 img_or_path_video,
                                 bbox_video,
                                 dataset,
                                 return_heatmap=False):
    """Inference a sequence of bboxes over video frames.

    num_keypoints: K

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path_video (list[str | np.ndarray]): Image filenames or
            loaded images, one per frame.
        bbox_video (list): Per-frame bounding boxes (with scores), each
            shaped (4, ) or (5, ). (left, top, width, height, [score])
        dataset (str): Dataset name.
        return_heatmap (bool): Flag to return heatmap, default: False.

    Returns:
        ndarray[Kx3]: Predicted pose x, y, score.
        heatmap[N, K, H, W]: Model output heatmap.
    """
    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    test_pipeline = [LoadImage()] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    flip_pairs = None
    if dataset in ('TopDownCocoDataset', 'TopDownOCHumanDataset'):
        flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12],
                      [13, 14], [15, 16]]
    elif dataset == 'TopDownCocoWholeBodyDataset':
        body = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14],
                [15, 16]]
        foot = [[17, 20], [18, 21], [19, 22]]
        face = [[23, 39], [24, 38], [25, 37], [26, 36], [27, 35], [28, 34],
                [29, 33], [30, 32], [40, 49], [41, 48], [42, 47], [43, 46],
                [44, 45], [54, 58], [55, 57], [59, 68], [60, 67], [61, 66],
                [62, 65], [63, 70], [64, 69], [71, 77], [72, 76], [73, 75],
                [78, 82], [79, 81], [83, 87], [84, 86], [88, 90]]
        hand = [[91, 112], [92, 113], [93, 114], [94, 115], [95, 116],
                [96, 117], [97, 118], [98, 119], [99, 120], [100, 121],
                [101, 122], [102, 123], [103, 124], [104, 125], [105, 126],
                [106, 127], [107, 128], [108, 129], [109, 130], [110, 131],
                [111, 132]]
        flip_pairs = body + foot + face + hand
    elif dataset == 'TopDownAicDataset':
        flip_pairs = [[0, 3], [1, 4], [2, 5], [6, 9], [7, 10], [8, 11]]
    elif dataset == 'TopDownMpiiDataset':
        flip_pairs = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13]]
    elif dataset == 'TopDownMpiiTrbDataset':
        flip_pairs = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11],
                      [14, 15], [16, 22], [28, 34], [17, 23], [29, 35],
                      [18, 24], [30, 36], [19, 25], [31, 37], [20, 26],
                      [32, 38], [21, 27], [33, 39]]
    elif dataset in ('OneHand10KDataset', 'FreiHandDataset',
                     'PanopticDataset', 'InterHand2DDataset'):
        flip_pairs = []
    elif dataset == 'Face300WDataset':
        flip_pairs = [[0, 16], [1, 15], [2, 14], [3, 13], [4, 12], [5, 11],
                      [6, 10], [7, 9], [17, 26], [18, 25], [19, 24],
                      [20, 23], [21, 22], [31, 35], [32, 34], [36, 45],
                      [37, 44], [38, 43], [39, 42], [40, 47], [41, 46],
                      [48, 54], [49, 53], [50, 52], [61, 63], [60, 64],
                      [67, 65], [58, 56], [59, 55]]
    elif dataset == 'FaceAFLWDataset':
        flip_pairs = [[0, 5], [1, 4], [2, 3], [6, 11], [7, 10], [8, 9],
                      [12, 14], [15, 17]]
    elif dataset == 'FaceCOFWDataset':
        flip_pairs = [[0, 1], [4, 6], [2, 3], [5, 7], [8, 9], [10, 11],
                      [12, 14], [16, 17], [13, 15], [18, 19], [22, 23]]
    elif dataset == 'FaceWFLWDataset':
        flip_pairs = [[0, 32], [1, 31], [2, 30], [3, 29], [4, 28], [5, 27],
                      [6, 26], [7, 25], [8, 24], [9, 23], [10, 22], [11, 21],
                      [12, 20], [13, 19], [14, 18], [15, 17], [33, 46],
                      [34, 45], [35, 44], [36, 43], [37, 42], [38, 50],
                      [39, 49], [40, 48], [41, 47], [60, 72], [61, 71],
                      [62, 70], [63, 69], [64, 68], [65, 75], [66, 74],
                      [67, 73], [55, 59], [56, 58], [76, 82], [77, 81],
                      [78, 80], [87, 83], [86, 84], [88, 92], [89, 91],
                      [95, 93], [96, 97]]
    else:
        raise NotImplementedError()

    # prepare data
    img_list = []
    img_metas_list = []
    for i in range(len(img_or_path_video)):
        bbox = bbox_video[i]
        img_or_path = img_or_path_video[i]

        assert len(bbox) in [4, 5]
        center, scale = _box2cs(cfg, bbox)

        data = {
            'img_or_path': img_or_path,
            'center': center,
            'scale': scale,
            'bbox_score': bbox[4] if len(bbox) == 5 else 1,
            'dataset': dataset,
            'joints_3d': np.zeros((cfg.data_cfg.num_joints, 3),
                                  dtype=np.float32),
            'joints_3d_visible': np.zeros((cfg.data_cfg.num_joints, 3),
                                          dtype=np.float32),
            'rotation': 0,
            'ann_info': {
                'image_size': cfg.data_cfg['image_size'],
                'num_joints': cfg.data_cfg['num_joints'],
                'flip_pairs': flip_pairs
            }
        }
        data = test_pipeline(data)
        data = collate([data], samples_per_gpu=1)

        if next(model.parameters()).is_cuda:
            # scatter to specified GPU
            data = scatter(data, [device])[0]
        else:
            # just get the actual data from DataContainer
            data['img_metas'] = data['img_metas'].data[0]
        data['img_metas'][0]['bbox_id'] = 0

        img_list.append(data['img'])
        img_metas_list += data['img_metas']

    # forward the model
    with torch.no_grad():
        result = model(img=torch.cat(img_list, dim=0),
                       img_metas=img_metas_list,
                       return_loss=False,
                       return_heatmap=return_heatmap)

    return result['preds'], result['output_heatmap']