def inference_top_down_pose_model(model,
                                  img_or_path,
                                  person_results,
                                  bbox_thr=None,
                                  format='xywh',
                                  dataset='TopDownCocoDataset',
                                  return_heatmap=False,
                                  outputs=None):
    """Inference a single image with a list of person bounding boxes.

    All bounding boxes are preprocessed and passed to the pose model in a
    single call.

    num_people: P
    num_keypoints: K
    bbox height: H
    bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str| np.ndarray): Image filename or loaded image.
        person_results (List(dict)): the item in the dict may contain
            'bbox' and/or 'track_id'.
            'bbox' (4, ) or (5, ): The person bounding box, which contains
            4 box coordinates (and score).
            'track_id' (int): The unique id for each human instance.
        bbox_thr: Threshold for bounding boxes. Only bboxes whose score is
            strictly greater than the threshold are fed into the pose
            detector; this requires every bbox to carry a score (5 values).
            If bbox_thr is None, ignore it.
        format: bbox format ('xyxy' | 'xywh'). Default: 'xywh'.
            'xyxy' means (left, top, right, bottom),
            'xywh' means (left, top, width, height).
        dataset (str): Dataset name, e.g. 'TopDownCocoDataset'.
        return_heatmap (bool) : Flag to return heatmap, default: False
        outputs (list(str) | tuple(str)) : Names of layers whose outputs
            need to be returned, default: None

    Returns:
        tuple:
        - pose_results (list[dict]): The bbox & pose info. Each item is a
          shallow copy of the matching input person result with 'bbox'
          replaced by (left, top, right, bottom, [score]) and 'keypoints'
          set to the pose (ndarray[Kx3]): x, y, score.
        - returned_outputs (list[dict[np.ndarray[N, K, H, W] |
          torch.tensor[N, K, H, W]]]): Output feature maps from layers
          specified in `outputs`. Includes 'heatmap' if `return_heatmap`
          is True. Empty when there is nothing to infer.

    Raises:
        AssertionError: If `format` is not supported, if `bbox_thr` is set
            but the bboxes do not have 5 columns, or if the number of
            predicted poses differs from the number of kept persons.
    """
    # only two kinds of bbox format is supported.
    assert format in ['xyxy', 'xywh']

    pose_results = []
    returned_outputs = []

    if len(person_results) == 0:
        return pose_results, returned_outputs

    # Preprocess all bboxes at once instead of looping over each bbox.
    bboxes = np.array([box['bbox'] for box in person_results])

    # Select bboxes by score threshold
    if bbox_thr is not None:
        assert bboxes.shape[1] == 5
        valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0]
        bboxes = bboxes[valid_idx]
        # keep person_results aligned with the surviving bboxes
        person_results = [person_results[i] for i in valid_idx]

    if format == 'xyxy':
        bboxes_xyxy = bboxes
        bboxes_xywh = _xyxy2xywh(bboxes)
    else:
        # format is already 'xywh'
        bboxes_xywh = bboxes
        bboxes_xyxy = _xywh2xyxy(bboxes)

    # if bbox_thr remove all bounding box
    if len(bboxes_xywh) == 0:
        return [], []

    with OutputHook(model, outputs=outputs, as_tensor=False) as h:
        # one pose is expected per bbox (checked by the assert below)
        poses, heatmap = _inference_single_pose_model(
            model,
            img_or_path,
            bboxes_xywh,
            dataset,
            return_heatmap=return_heatmap)

        if return_heatmap:
            h.layer_outputs['heatmap'] = heatmap

        returned_outputs.append(h.layer_outputs)

    # Build a real message: the previous `print(...)` evaluated to None and
    # wrote to stdout instead of describing the failure.
    assert len(poses) == len(person_results), (
        'number of poses (%d) does not match number of person results (%d); '
        'number of bboxes: %d' %
        (len(poses), len(person_results), len(bboxes_xyxy)))

    for pose, person_result, bbox_xyxy in zip(poses, person_results,
                                              bboxes_xyxy):
        # copy so the caller's dicts are left untouched
        pose_result = person_result.copy()
        pose_result['keypoints'] = pose
        pose_result['bbox'] = bbox_xyxy
        pose_results.append(pose_result)

    return pose_results, returned_outputs
def inference_bottom_up_pose_model(model,
                                   img_or_path,
                                   return_heatmap=False,
                                   outputs=None):
    """Run a bottom-up pose model on one image.

    num_people: P
    num_keypoints: K
    bbox height: H
    bbox width: W

    Args:
        model (nn.Module): The loaded pose model. Its `cfg` attribute
            supplies the test pipeline and the data config.
        img_or_path (str| np.ndarray): Image filename or loaded image.
        return_heatmap (bool) : Flag to return heatmap, default: False
        outputs (list(str) | tuple(str)) : Names of layers whose outputs
            need to be returned, default: None

    Returns:
        tuple:
        - pose_results (list[dict]): One dict per predicted person, whose
          'keypoints' entry holds the first three columns of the
          prediction (x, y, score per keypoint).
        - returned_outputs (list[dict[np.ndarray[N, K, H, W] |
          torch.tensor[N, K, H, W]]]): Output feature maps from layers
          specified in `outputs`. Includes 'heatmap' if `return_heatmap`
          is True.
    """
    cfg = model.cfg
    device = next(model.parameters()).device

    # Build the data pipeline: the first configured step is replaced by a
    # LoadImage transform that keeps the configured channel order.
    first_step = cfg.test_pipeline[0]
    loader = LoadImage(channel_order=first_step.get('channel_order', 'rgb'))
    pipeline = Compose([loader] + cfg.test_pipeline[1:])

    # Prepare the single-sample input expected by the pipeline.
    ann_info = {
        'image_size': cfg.data_cfg['image_size'],
        'num_joints': cfg.data_cfg['num_joints'],
        'flip_index':
        [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15],
    }
    sample = {
        'img_or_path': img_or_path,
        'dataset': 'coco',
        'ann_info': ann_info,
    }
    batch = collate([pipeline(sample)], samples_per_gpu=1)

    if not next(model.parameters()).is_cuda:
        # just get the actual data from DataContainer
        batch['img_metas'] = batch['img_metas'].data[0]
    else:
        # scatter to specified GPU
        batch = scatter(batch, [device])[0]

    pose_results = []
    returned_outputs = []
    with OutputHook(model, outputs=outputs, as_tensor=False) as hook:
        # forward the model
        with torch.no_grad():
            result = model(
                img=batch['img'],
                img_metas=batch['img_metas'],
                return_loss=False,
                return_heatmap=return_heatmap)

        if return_heatmap:
            hook.layer_outputs['heatmap'] = result['output_heatmap']
        returned_outputs.append(hook.layer_outputs)

        pose_results.extend(
            {'keypoints': pred[:, :3]} for pred in result['preds'])

    return pose_results, returned_outputs
def inference_top_down_pose_model(model,
                                  img_or_path,
                                  person_bboxes,
                                  bbox_thr=None,
                                  format='xywh',
                                  dataset='TopDownCocoDataset',
                                  return_heatmap=False,
                                  outputs=None):
    """Inference a single image with a list of person bounding boxes.

    Each bounding box is passed to the pose model separately.

    num_people: P
    num_keypoints: K
    bbox height: H
    bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str| np.ndarray): Image filename or loaded image.
        person_bboxes: (np.ndarray[P x 4] or [P x 5], or a nested sequence
            of the same layout): Each person bounding box shaped (4, ) or
            (5, ), contains 4 box coordinates (and score).
        bbox_thr: Threshold for bounding boxes. Only bboxes whose score
            (fifth column) is strictly greater than the threshold are fed
            into the pose detector. If bbox_thr is None, ignore it.
        format: bbox format ('xyxy' | 'xywh'). Default: 'xywh'.
            'xyxy' means (left, top, right, bottom),
            'xywh' means (left, top, width, height).
        dataset (str): Dataset name, e.g. 'TopDownCocoDataset'.
        return_heatmap (bool) : Flag to return heatmap, default: False
        outputs (list(str) | tuple(str)) : Names of layers whose outputs
            need to be returned, default: None

    Returns:
        tuple:
        - pose_results (list[dict]): The bbox & pose info. Each item is a
          dictionary with 'bbox' (the box converted by `_xywh2xyxy` from a
          single-row batch) and 'keypoints' (the pose returned for that
          box).
        - returned_outputs (list): `h.layer_outputs` appended once per
          processed bbox. Includes 'heatmap' if `return_heatmap` is True.

    Raises:
        AssertionError: If `format` is not 'xyxy' or 'xywh'.
    """
    # only two kinds of bbox format is supported.
    assert format in ['xyxy', 'xywh']

    pose_results = []
    returned_outputs = []

    # Always work on an ndarray copy: a plain list would otherwise break
    # the column indexing used by the score threshold in the 'xywh' case.
    person_bboxes = np.array(person_bboxes)
    if len(person_bboxes) == 0:
        # nothing to do; also avoids format conversion of an empty input
        return pose_results, returned_outputs

    # transform the bboxes format to xywh
    if format == 'xyxy':
        person_bboxes = _xyxy2xywh(person_bboxes)

    if bbox_thr is not None:
        person_bboxes = person_bboxes[person_bboxes[:, 4] > bbox_thr]

    with OutputHook(model, outputs=outputs, as_tensor=True) as h:
        for bbox in person_bboxes:
            pose, heatmap = _inference_single_pose_model(
                model,
                img_or_path,
                bbox,
                dataset,
                return_heatmap=return_heatmap)

            if return_heatmap:
                h.layer_outputs['heatmap'] = heatmap

            # NOTE(review): the same `h.layer_outputs` attribute is
            # appended for every bbox; if OutputHook reuses one container
            # the entries alias each other - confirm against OutputHook.
            returned_outputs.append(h.layer_outputs)

            pose_results.append({
                'bbox': _xywh2xyxy(np.expand_dims(np.array(bbox), 0)),
                'keypoints': pose
            })

    return pose_results, returned_outputs
def inference_top_down_pose_model(model,
                                  img_or_path,
                                  person_results,
                                  bbox_thr=None,
                                  format='xywh',
                                  dataset='TopDownCocoDataset',
                                  return_heatmap=False,
                                  outputs=None):
    """Inference a single image with a list of person bounding boxes.

    Each person is passed to the pose model separately.

    num_people: P
    num_keypoints: K
    bbox height: H
    bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str| np.ndarray): Image filename or loaded image.
        person_results (List(dict)): the item in the dict may contain
            'bbox' and/or 'track_id'.
            'bbox' (4, ) or (5, ): The person bounding box, which contains
            4 box coordinates (and score).
            'track_id' (int): The unique id for each human instance.
        bbox_thr: Threshold for bounding boxes. Boxes whose score is below
            the threshold are skipped (a score equal to the threshold is
            kept); this requires the bbox to carry a score (5 values).
            If bbox_thr is None, ignore it.
        format: bbox format ('xyxy' | 'xywh'). Default: 'xywh'.
            'xyxy' means (left, top, right, bottom),
            'xywh' means (left, top, width, height).
        dataset (str): Dataset name, e.g. 'TopDownCocoDataset'.
        return_heatmap (bool) : Flag to return heatmap, default: False
        outputs (list(str) | tuple(str)) : Names of layers whose outputs
            need to be returned, default: None

    Returns:
        tuple:
        - pose_results (list[dict]): The bbox & pose info. Each item is a
          shallow copy of the input person result with 'keypoints' added;
          for 'xywh' input the 'bbox' is replaced by its xyxy conversion,
          for 'xyxy' input the 'bbox' is left as given.
        - returned_outputs (list): `h.layer_outputs` appended once per
          processed person. Includes 'heatmap' if `return_heatmap` is
          True.

    Raises:
        AssertionError: If `format` is not supported, or if `bbox_thr` is
            set but a bbox does not have 5 values.
    """
    # only two kinds of bbox format is supported.
    assert format in ['xyxy', 'xywh']

    pose_results = []
    returned_outputs = []

    # Nothing to infer: return before installing any output hook.
    if len(person_results) == 0:
        return pose_results, returned_outputs

    with OutputHook(model, outputs=outputs, as_tensor=False) as h:
        for person_result in person_results:
            # add a leading axis so the converters see a one-row batch
            bbox = np.expand_dims(np.array(person_result['bbox']), 0)
            if format == 'xyxy':
                bbox_xyxy = bbox
                bbox_xywh = _xyxy2xywh(bbox_xyxy)
            else:
                bbox_xywh = bbox
                bbox_xyxy = _xywh2xyxy(bbox_xywh)

            if bbox_thr is not None:
                assert bbox_xywh.shape[1] == 5
                if bbox_xywh[0, 4] < bbox_thr:
                    continue

            pose, heatmap = _inference_single_pose_model(
                model,
                img_or_path,
                bbox_xywh[0],
                dataset,
                return_heatmap=return_heatmap)

            if return_heatmap:
                h.layer_outputs['heatmap'] = heatmap

            # NOTE(review): the same `h.layer_outputs` attribute is
            # appended for every person; if OutputHook reuses one container
            # the entries alias each other - confirm against OutputHook.
            returned_outputs.append(h.layer_outputs)

            # Work on a shallow copy so the caller's dicts are not modified
            # as a side effect of inference.
            pose_result = person_result.copy()
            pose_result['keypoints'] = pose
            if format == 'xywh':
                pose_result['bbox'] = bbox_xyxy[0]

            pose_results.append(pose_result)

    return pose_results, returned_outputs
def inference_bottom_up_pose_model(model,
                                   img_or_path,
                                   dataset='BottomUpCocoDataset',
                                   dataset_info=None,
                                   pose_nms_thr=0.9,
                                   return_heatmap=False,
                                   outputs=None):
    """Run a bottom-up pose model on one image and apply pose NMS.

    Note:
        - num_people: P
        - num_keypoints: K
        - bbox height: H
        - bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str| np.ndarray): Image filename or loaded image.
        dataset (str): Dataset name, e.g. 'BottomUpCocoDataset'.
            It is deprecated. Please use dataset_info instead.
        dataset_info (DatasetInfo): A class containing all dataset info.
            When omitted, it is built from `model.cfg.dataset_info` if the
            model config provides one.
        pose_nms_thr (float): retain oks overlap < pose_nms_thr,
            default: 0.9.
        return_heatmap (bool) : Flag to return heatmap, default: False.
        outputs (list(str) | tuple(str)) : Names of layers whose outputs
            need to be returned, default: None.

    Returns:
        tuple:
        - pose_results (list[dict]): The poses kept by `oks_nms`. Each
          item holds 'keypoints' (first three prediction columns: x, y,
          score), 'score' (the matching entry of the model's 'scores')
          and 'area' (x extent times y extent of the keypoints).
        - returned_outputs (list[dict[np.ndarray[N, K, H, W] |
          torch.Tensor[N, K, H, W]]]): Output feature maps from layers
          specified in `outputs`. Includes 'heatmap' if `return_heatmap`
          is True.
    """
    # get dataset info, preferring the one stored in the model config
    if dataset_info is None:
        if hasattr(model, 'cfg') and 'dataset_info' in model.cfg:
            dataset_info = DatasetInfo(model.cfg.dataset_info)

    if dataset_info is None:
        # deprecated path: only the COCO bottom-up layout is known here
        warnings.warn(
            'dataset is deprecated.'
            'Please set `dataset_info` in the config.'
            'Check https://github.com/open-mmlab/mmpose/pull/663 for details.',
            DeprecationWarning)
        assert (dataset == 'BottomUpCocoDataset')
        dataset_name = dataset
        flip_index = [
            0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15
        ]
        sigmas = None
    else:
        dataset_name = dataset_info.dataset_name
        flip_index = dataset_info.flip_index
        sigmas = getattr(dataset_info, 'sigmas', None)

    cfg = model.cfg
    device = next(model.parameters()).device
    # a CPU model is scattered with the target -1 instead of a device
    scatter_target = -1 if device.type == 'cpu' else device

    # build the data pipeline
    pipeline = Compose(cfg.test_pipeline)
    _pipeline_gpu_speedup(pipeline, next(model.parameters()).device)

    # prepare data
    ann_info = {
        'image_size': np.array(cfg.data_cfg['image_size']),
        'num_joints': cfg.data_cfg['num_joints'],
        'flip_index': flip_index,
    }
    sample = {'dataset': dataset_name, 'ann_info': ann_info}
    # loaded images and file names travel under different keys
    input_key = 'img' if isinstance(img_or_path, np.ndarray) else 'image_file'
    sample[input_key] = img_or_path

    batch = collate([pipeline(sample)], samples_per_gpu=1)
    batch = scatter(batch, [scatter_target])[0]

    pose_results = []
    returned_outputs = []
    with OutputHook(model, outputs=outputs, as_tensor=False) as hook:
        # forward the model
        with torch.no_grad():
            result = model(
                img=batch['img'],
                img_metas=batch['img_metas'],
                return_loss=False,
                return_heatmap=return_heatmap)

        if return_heatmap:
            hook.layer_outputs['heatmap'] = result['output_heatmap']
        returned_outputs.append(hook.layer_outputs)

        for person_idx, pred in enumerate(result['preds']):
            x_extent = np.max(pred[:, 0]) - np.min(pred[:, 0])
            y_extent = np.max(pred[:, 1]) - np.min(pred[:, 1])
            pose_results.append({
                'keypoints': pred[:, :3],
                'score': result['scores'][person_idx],
                'area': x_extent * y_extent,
            })

        # pose nms
        use_joint_scores = cfg.model.test_cfg.get('score_per_joint', False)
        kept = oks_nms(
            pose_results,
            pose_nms_thr,
            sigmas,
            score_per_joint=use_joint_scores)
        pose_results = [pose_results[kept_idx] for kept_idx in kept]

    return pose_results, returned_outputs
def inference_top_down_pose_model(model,
                                  imgs_or_paths,
                                  person_results=None,
                                  bbox_thr=None,
                                  format='xywh',
                                  dataset='TopDownCocoDataset',
                                  dataset_info=None,
                                  return_heatmap=False,
                                  outputs=None):
    """Inference a single image with a list of person bounding boxes.

    Support single-frame and multi-frame inference setting.

    Note:
        - num_frames: F
        - num_people: P
        - num_keypoints: K
        - bbox height: H
        - bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        imgs_or_paths (str | np.ndarray | list(str) | list(np.ndarray)):
            Image filename(s) or loaded image(s). A list or tuple selects
            multi-frame inference.
        person_results (list(dict), optional): a list of detected persons
            that contains ``bbox`` and/or ``track_id``:

            - ``bbox`` (4, ) or (5, ): The person bounding box, which
              contains 4 box coordinates (and score).
            - ``track_id`` (int): The unique id for each human instance.

            If not provided, a dummy person result with a bbox covering
            the entire image will be used. Default: None.
        bbox_thr (float | None): Threshold for bounding boxes. Only bboxes
            whose score is strictly greater than the threshold are fed
            into the pose detector; this requires every bbox to carry a
            score (5 values). If bbox_thr is None, all boxes will be used.
        format (str): bbox format ('xyxy' | 'xywh'). Default: 'xywh'.

            - `xyxy` means (left, top, right, bottom),
            - `xywh` means (left, top, width, height).
        dataset (str): Dataset name, e.g. 'TopDownCocoDataset'.
            It is deprecated. Please use dataset_info instead.
        dataset_info (DatasetInfo): A class containing all dataset info.
            When omitted, it is built from `model.cfg.dataset_info` if the
            model config provides one.
        return_heatmap (bool) : Flag to return heatmap, default: False
        outputs (list(str) | tuple(str)) : Names of layers whose outputs
            need to be returned. Default: None.

    Returns:
        tuple:
        - pose_results (list[dict]): The bbox & pose info. Each item is a
          shallow copy of the matching person result with 'bbox' replaced
          by (left, top, right, bottom, [score]) and 'keypoints' set to
          the pose (ndarray[Kx3]): x, y, score.
        - returned_outputs (list[dict[np.ndarray[N, K, H, W] |
          torch.Tensor[N, K, H, W]]]): Output feature maps from layers
          specified in `outputs`. Includes 'heatmap' if `return_heatmap`
          is True. Empty when there is nothing to infer.

    Raises:
        AssertionError: If `imgs_or_paths` has an unsupported type, if
            `format` is not supported, if `bbox_thr` is set but the bboxes
            do not have 5 columns, or if the number of predicted poses
            differs from the number of kept persons.
    """
    # decide whether to use multi frames for inference
    if isinstance(imgs_or_paths, (list, tuple)):
        use_multi_frames = True
    else:
        assert isinstance(imgs_or_paths, (str, np.ndarray))
        use_multi_frames = False

    # get dataset info
    if (dataset_info is None and hasattr(model, 'cfg')
            and 'dataset_info' in model.cfg):
        dataset_info = DatasetInfo(model.cfg.dataset_info)
    if dataset_info is None:
        warnings.warn(
            'dataset is deprecated.'
            'Please set `dataset_info` in the config.'
            'Check https://github.com/open-mmlab/mmpose/pull/663'
            ' for details.', DeprecationWarning)

    # only two kinds of bbox format is supported.
    assert format in ['xyxy', 'xywh']

    pose_results = []
    returned_outputs = []

    if person_results is None:
        # create dummy person results covering the whole (first) image
        sample = imgs_or_paths[0] if use_multi_frames else imgs_or_paths
        if isinstance(sample, str):
            # open inside a context manager so the file handle is released
            # as soon as the size has been read
            with Image.open(sample) as image:
                width, height = image.size
        else:
            height, width = sample.shape[:2]
        person_results = [{'bbox': np.array([0, 0, width, height])}]

    if len(person_results) == 0:
        return pose_results, returned_outputs

    # Preprocess all bboxes at once instead of looping over each bbox.
    bboxes = np.array([box['bbox'] for box in person_results])

    # Select bboxes by score threshold
    if bbox_thr is not None:
        assert bboxes.shape[1] == 5
        valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0]
        bboxes = bboxes[valid_idx]
        # keep person_results aligned with the surviving bboxes
        person_results = [person_results[i] for i in valid_idx]

    if format == 'xyxy':
        bboxes_xyxy = bboxes
        bboxes_xywh = bbox_xyxy2xywh(bboxes)
    else:
        # format is already 'xywh'
        bboxes_xywh = bboxes
        bboxes_xyxy = bbox_xywh2xyxy(bboxes)

    # if bbox_thr remove all bounding box
    if len(bboxes_xywh) == 0:
        return [], []

    with OutputHook(model, outputs=outputs, as_tensor=False) as h:
        # one pose is expected per bbox (checked by the assert below)
        poses, heatmap = _inference_single_pose_model(
            model,
            imgs_or_paths,
            bboxes_xywh,
            dataset=dataset,
            dataset_info=dataset_info,
            return_heatmap=return_heatmap,
            use_multi_frames=use_multi_frames)

        if return_heatmap:
            h.layer_outputs['heatmap'] = heatmap

        returned_outputs.append(h.layer_outputs)

    # Build a real message: the previous `print(...)` evaluated to None and
    # wrote to stdout instead of describing the failure.
    assert len(poses) == len(person_results), (
        'number of poses (%d) does not match number of person results (%d); '
        'number of bboxes: %d' %
        (len(poses), len(person_results), len(bboxes_xyxy)))

    for pose, person_result, bbox_xyxy in zip(poses, person_results,
                                              bboxes_xyxy):
        # copy so the caller's dicts are left untouched
        pose_result = person_result.copy()
        pose_result['keypoints'] = pose
        pose_result['bbox'] = bbox_xyxy
        pose_results.append(pose_result)

    return pose_results, returned_outputs
def inference_top_down_pose_model_multi(model,
                                        img_or_path_video,
                                        person_results_video,
                                        bbox_thr=None,
                                        format='xywh',
                                        dataset='TopDownCocoDataset',
                                        return_heatmap=False,
                                        outputs=None):
    """Inference a sequence of frames with per-frame person bounding boxes.

    The bboxes of all frames are collected first; frames that have a bbox
    are then passed to the pose model in one call, and frames without one
    receive an all-zero pose.

    num_people: P
    num_keypoints: K
    bbox height: H
    bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path_video (list): One entry per frame, indexed in parallel
            with the collected bboxes. Presumably image filenames or
            loaded images - confirm against `_inference_single_pose_model`.
        person_results_video (list[list[dict]]): One list of person
            results per frame (may be empty). Each dict must contain
            'bbox' (4, ) or (5, ): The person bounding box, which contains
            4 box coordinates (and score).
        bbox_thr: Threshold for bounding boxes. Boxes whose score is below
            the threshold are skipped; this requires the bbox to carry a
            score (5 values). If bbox_thr is None, ignore it.
        format: bbox format ('xyxy' | 'xywh'). Default: 'xywh'.
            'xyxy' means (left, top, right, bottom),
            'xywh' means (left, top, width, height).
        dataset (str): Dataset name, e.g. 'TopDownCocoDataset'.
        return_heatmap (bool) : Flag forwarded to the pose model,
            default: False. The heatmap itself is not returned.
        outputs (list(str) | tuple(str)) : Names of layers passed to
            `OutputHook`, default: None

    Returns:
        tuple:
        - merge_result (dict): 'keypoints' is the stacked per-frame poses
          (frames without a bbox are zero-filled); 'bbox' is the list of
          input bboxes exactly as given (an empty list for frames without
          persons).
        - returned_outputs (list): Always empty in this function.
        - unvalid_index_list (list[int]): Indices of the collected bbox
          entries that are empty.

    Raises:
        AssertionError: If `format` is not supported, or if `bbox_thr` is
            set but a bbox does not have 5 values.
    """
    # only two kinds of bbox format is supported.
    assert format in ['xyxy', 'xywh']

    returned_outputs = []
    bbox_xyxy_list = []
    bbox_xywh_list = []

    # NOTE(review): the bookkeeping below lines up bbox entries with frame
    # indices, which only holds when every frame contributes exactly one
    # entry (no person, or one person that passes `bbox_thr`) - confirm
    # that callers respect this.
    for person_results in person_results_video:
        if len(person_results) == 0:
            # placeholder keeps this frame's slot in both lists
            bbox_xywh_list.append([])
            bbox_xyxy_list.append([])
            continue

        # NOTE(review): no forward pass runs inside this hook, so it does
        # not appear to capture anything; kept to preserve behavior.
        with OutputHook(model, outputs=outputs, as_tensor=False):
            for person_result in person_results:
                # the bbox is recorded exactly as given, whatever `format`
                bbox_xyxy_list.append(person_result['bbox'])

                # add a leading axis so the converter sees a one-row batch
                bbox = np.expand_dims(np.array(person_result['bbox']), 0)
                if format == 'xyxy':
                    bbox_xywh = _xyxy2xywh(bbox)
                else:
                    bbox_xywh = bbox

                if bbox_thr is not None:
                    assert bbox_xywh.shape[1] == 5
                    if bbox_xywh[0, 4] < bbox_thr:
                        continue

                bbox_xywh_list.append(bbox_xywh[0])

    valid_index_list = []
    unvalid_index_list = []
    valid_img_or_path_video = []
    valid_bbox_xywh_list = []
    for k, bbox_xywh in enumerate(bbox_xywh_list):
        # Use the length: comparing an ndarray with `[]` is an elementwise
        # comparison and does not yield a plain boolean.
        if len(bbox_xywh) == 0:
            unvalid_index_list.append(k)
            continue
        valid_index_list.append(k)
        valid_img_or_path_video.append(img_or_path_video[k])
        valid_bbox_xywh_list.append(bbox_xywh)

    pose_valid = []
    if valid_bbox_xywh_list:
        pose_valid, _ = _inference_single_pose_model(
            model,
            valid_img_or_path_video,
            valid_bbox_xywh_list,
            dataset,
            return_heatmap=return_heatmap)

    # Zero poses must match the predicted poses to be stackable; fall back
    # to the previously hard-coded (133, 3) when nothing was predicted.
    if len(pose_valid) > 0:
        empty_pose_shape = np.shape(pose_valid[0])
    else:
        empty_pose_shape = (133, 3)

    valid_indices = set(valid_index_list)
    pose = []
    index_valid = 0
    for k in range(len(img_or_path_video)):
        if k in valid_indices:
            pose.append(pose_valid[index_valid])
            index_valid += 1
        else:
            pose.append(np.zeros(empty_pose_shape))
    pose = np.stack(pose, axis=0)

    merge_result = {'keypoints': pose, 'bbox': bbox_xyxy_list}
    return merge_result, returned_outputs, unvalid_index_list