Example #1
def inference_top_down_pose_model(model,
                                  img_or_path,
                                  person_results,
                                  bbox_thr=None,
                                  format='xywh',
                                  dataset='TopDownCocoDataset',
                                  return_heatmap=False,
                                  outputs=None):
    """Inference a single image with a list of person bounding boxes.

    num_people: P
    num_keypoints: K
    bbox height: H
    bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        person_results (list(dict)): A list of detected persons. Each item
            is a dict that may contain 'bbox' and/or 'track_id':
            'bbox' (4, ) or (5, ): The person bounding box, which contains
            4 box coordinates (and optionally a score).
            'track_id' (int): The unique id for each human instance.
        bbox_thr (float | None): Threshold for bounding boxes. Only bboxes
            with higher scores will be fed into the pose detector. If
            bbox_thr is None, all boxes are used.
        format (str): bbox format ('xyxy' | 'xywh'). Default: 'xywh'.
            'xyxy' means (left, top, right, bottom),
            'xywh' means (left, top, width, height).
        dataset (str): Dataset name, e.g. 'TopDownCocoDataset'.
        return_heatmap (bool): Flag to return heatmap. Default: False.
        outputs (list(str) | tuple(str)): Names of layers whose outputs
            need to be returned. Default: None.

    Returns:
        list[dict]: The bbox & pose info.
            Each item in the list is a dictionary
            containing the bbox: (left, top, right, bottom, [score])
            and the pose (ndarray[Kx3]): x, y, score.
        list[dict[np.ndarray[N, K, H, W] | torch.Tensor[N, K, H, W]]]:
            Output feature maps from layers specified in `outputs`.
            Includes 'heatmap' if `return_heatmap` is True.
    """
    # only two kinds of bbox formats are supported.
    assert format in ['xyxy', 'xywh']

    pose_results = []
    returned_outputs = []

    if len(person_results) == 0:
        return pose_results, returned_outputs

    # Preprocess all bboxes at once instead of looping over each one.
    bboxes = np.array([box['bbox'] for box in person_results])

    # Select bboxes by score threshold
    if bbox_thr is not None:
        assert bboxes.shape[1] == 5
        valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0]
        bboxes = bboxes[valid_idx]
        person_results = [person_results[i] for i in valid_idx]

    if format == 'xyxy':
        bboxes_xyxy = bboxes
        bboxes_xywh = _xyxy2xywh(bboxes)
    else:
        # format is already 'xywh'
        bboxes_xywh = bboxes
        bboxes_xyxy = _xywh2xyxy(bboxes)

    # if bbox_thr filtered out all the bounding boxes
    if len(bboxes_xywh) == 0:
        return [], []

    with OutputHook(model, outputs=outputs, as_tensor=False) as h:
        # poses is results['pred']  # shape: (N, 17, 3)
        poses, heatmap = _inference_single_pose_model(
            model,
            img_or_path,
            bboxes_xywh,
            dataset,
            return_heatmap=return_heatmap)

        if return_heatmap:
            h.layer_outputs['heatmap'] = heatmap

        returned_outputs.append(h.layer_outputs)

    assert len(poses) == len(person_results), (
        f'{len(poses)} poses vs {len(person_results)} person results '
        f'({len(bboxes_xyxy)} bboxes)')
    for pose, person_result, bbox_xyxy in zip(poses, person_results,
                                              bboxes_xyxy):
        pose_result = person_result.copy()
        pose_result['keypoints'] = pose
        pose_result['bbox'] = bbox_xyxy
        pose_results.append(pose_result)

    return pose_results, returned_outputs
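
For context, a minimal usage sketch of the API above, assuming a model loaded via mmpose's `init_pose_model` (the config/checkpoint paths and the input image are placeholders, not from the original source):

import numpy as np
from mmpose.apis import init_pose_model  # assumed mmpose 0.x-style loader

# Hypothetical config/checkpoint paths; substitute your own.
pose_model = init_pose_model('hrnet_w32_coco_256x192.py', 'hrnet_w32.pth')

# Person detections in xywh format with a confidence score appended.
person_results = [
    {'bbox': np.array([50, 40, 120, 300, 0.98]), 'track_id': 0},
    {'bbox': np.array([300, 60, 110, 280, 0.35]), 'track_id': 1},
]

# With bbox_thr=0.5, the second box (score 0.35) is filtered out.
pose_results, returned_outputs = inference_top_down_pose_model(
    pose_model, 'frame_000.jpg', person_results,
    bbox_thr=0.5, format='xywh')

for res in pose_results:
    print(res['track_id'], res['keypoints'].shape)  # (K, 3): x, y, score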
Example #2
def inference_bottom_up_pose_model(model,
                                   img_or_path,
                                   return_heatmap=False,
                                   outputs=None):
    """Inference a single image.

    num_people: P
    num_keypoints: K
    bbox height: H
    bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        return_heatmap (bool): Flag to return heatmap. Default: False.
        outputs (list(str) | tuple(str)): Names of layers whose outputs
            need to be returned. Default: None.

    Returns:
        list[ndarray]: The predicted pose info.
            The length of the list is the number of people (P).
            Each item in the list is a ndarray, containing each person's
            pose (ndarray[Kx3]): x, y, score.
        list[dict[np.ndarray[N, K, H, W] | torch.Tensor[N, K, H, W]]]:
            Output feature maps from layers specified in `outputs`.
            Includes 'heatmap' if `return_heatmap` is True.
    """
    pose_results = []
    returned_outputs = []

    cfg = model.cfg
    device = next(model.parameters()).device

    # build the data pipeline
    channel_order = cfg.test_pipeline[0].get('channel_order', 'rgb')
    test_pipeline = [LoadImage(channel_order=channel_order)
                     ] + cfg.test_pipeline[1:]
    test_pipeline = Compose(test_pipeline)

    # prepare data
    data = {
        'img_or_path': img_or_path,
        'dataset': 'coco',
        'ann_info': {
            'image_size': cfg.data_cfg['image_size'],
            'num_joints': cfg.data_cfg['num_joints'],
            # COCO 17-keypoint left/right flip mapping
            'flip_index':
            [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15],
        }
    }

    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]
    else:
        # just get the actual data from DataContainer
        data['img_metas'] = data['img_metas'].data[0]

    with OutputHook(model, outputs=outputs, as_tensor=False) as h:
        # forward the model
        with torch.no_grad():
            result = model(
                img=data['img'],
                img_metas=data['img_metas'],
                return_loss=False,
                return_heatmap=return_heatmap)

        if return_heatmap:
            h.layer_outputs['heatmap'] = result['output_heatmap']

        returned_outputs.append(h.layer_outputs)

        for pred in result['preds']:
            pose_results.append({
                'keypoints': pred[:, :3],
            })

    return pose_results, returned_outputs
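
A matching usage sketch for the bottom-up API; unlike the top-down variant it needs no prior person detections (the loaded bottom-up `pose_model` and the image path are assumptions, as before):

# Bottom-up models predict all people in a single pass; no detector needed.
pose_results, returned_outputs = inference_bottom_up_pose_model(
    pose_model, 'frame_000.jpg', return_heatmap=False, outputs=None)

print(len(pose_results))             # number of detected people, P
print(pose_results[0]['keypoints'])  # (K, 3) array: x, y, score per joint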
Example #3
def inference_top_down_pose_model(model,
                                  img_or_path,
                                  person_bboxes,
                                  bbox_thr=None,
                                  format='xywh',
                                  dataset='TopDownCocoDataset',
                                  return_heatmap=False,
                                  outputs=None):
    """Inference a single image with a list of person bounding boxes.

    num_people: P
    num_keypoints: K
    bbox height: H
    bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        person_bboxes (np.ndarray[P x 4] or [P x 5]): Each person bounding
            box shaped (4, ) or (5, ), containing 4 box coordinates (and
            optionally a score).
        bbox_thr (float | None): Threshold for bounding boxes. Only bboxes
            with higher scores will be fed into the pose detector. If
            bbox_thr is None, all boxes are used.
        format: bbox format ('xyxy' | 'xywh'). Default: 'xywh'.
            'xyxy' means (left, top, right, bottom),
            'xywh' means (left, top, width, height).
        dataset (str): Dataset name, e.g. 'TopDownCocoDataset'.
        return_heatmap (bool): Flag to return heatmap. Default: False.
        outputs (list(str) | tuple(str)): Names of layers whose outputs
            need to be returned. Default: None.

    Returns:
        list[dict]: The bbox & pose info.
            Each item in the list is a dictionary
            containing the bbox: (left, top, right, bottom, [score])
            and the pose (ndarray[Kx3]): x, y, score.
        list[dict[np.ndarray[N, K, H, W] | torch.Tensor[N, K, H, W]]]:
            Output feature maps from layers specified in `outputs`.
            Includes 'heatmap' if `return_heatmap` is True.
    """
    # only two kinds of bbox formats are supported.
    assert format in ['xyxy', 'xywh']
    # transform the bboxes format to xywh
    if format == 'xyxy':
        person_bboxes = _xyxy2xywh(np.array(person_bboxes))

    pose_results = []
    returned_outputs = []

    if len(person_bboxes) > 0:
        if bbox_thr is not None:
            person_bboxes = person_bboxes[person_bboxes[:, 4] > bbox_thr]

        with OutputHook(model, outputs=outputs, as_tensor=True) as h:
            for bbox in person_bboxes:
                pose, heatmap = _inference_single_pose_model(
                    model,
                    img_or_path,
                    bbox,
                    dataset,
                    return_heatmap=return_heatmap)

                if return_heatmap:
                    h.layer_outputs['heatmap'] = heatmap

                returned_outputs.append(h.layer_outputs)
                pose_results.append({
                    'bbox': _xywh2xyxy(np.expand_dims(np.array(bbox), 0)),
                    'keypoints': pose,
                })

    return pose_results, returned_outputs
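
These examples lean on `_xyxy2xywh` and `_xywh2xyxy`, which are not shown here. A minimal sketch of what such converters do, under the assumption of plain width/height arithmetic (mmpose's actual helpers may apply a one-pixel offset; the `_sketch` names are mine):

import numpy as np

def _xyxy2xywh_sketch(bbox_xyxy):
    # (left, top, right, bottom[, score]) -> (left, top, width, height[, score])
    bbox_xywh = bbox_xyxy.copy()
    bbox_xywh[:, 2] = bbox_xyxy[:, 2] - bbox_xyxy[:, 0]
    bbox_xywh[:, 3] = bbox_xyxy[:, 3] - bbox_xyxy[:, 1]
    return bbox_xywh

def _xywh2xyxy_sketch(bbox_xywh):
    # (left, top, width, height[, score]) -> (left, top, right, bottom[, score])
    bbox_xyxy = bbox_xywh.copy()
    bbox_xyxy[:, 2] = bbox_xywh[:, 0] + bbox_xywh[:, 2]
    bbox_xyxy[:, 3] = bbox_xywh[:, 1] + bbox_xywh[:, 3]
    return bbox_xyxy

# Round-tripping a box (score column is passed through untouched).
boxes = np.array([[10., 20., 110., 220., 0.9]])
assert np.allclose(_xywh2xyxy_sketch(_xyxy2xywh_sketch(boxes)), boxes)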
Example #4
def inference_top_down_pose_model(model,
                                  img_or_path,
                                  person_results,
                                  bbox_thr=None,
                                  format='xywh',
                                  dataset='TopDownCocoDataset',
                                  return_heatmap=False,
                                  outputs=None):
    """Inference a single image with a list of person bounding boxes.

    num_people: P
    num_keypoints: K
    bbox height: H
    bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str | np.ndarray): Image filename or loaded image.
        person_results (list(dict)): A list of detected persons. Each item
            is a dict that may contain 'bbox' and/or 'track_id':
            'bbox' (4, ) or (5, ): The person bounding box, which contains
            4 box coordinates (and optionally a score).
            'track_id' (int): The unique id for each human instance.
        bbox_thr (float | None): Threshold for bounding boxes. Only bboxes
            with higher scores will be fed into the pose detector. If
            bbox_thr is None, all boxes are used.
        format (str): bbox format ('xyxy' | 'xywh'). Default: 'xywh'.
            'xyxy' means (left, top, right, bottom),
            'xywh' means (left, top, width, height).
        dataset (str): Dataset name, e.g. 'TopDownCocoDataset'.
        return_heatmap (bool): Flag to return heatmap. Default: False.
        outputs (list(str) | tuple(str)): Names of layers whose outputs
            need to be returned. Default: None.

    Returns:
        list[dict]: The bbox & pose info.
            Each item in the list is a dictionary
            containing the bbox: (left, top, right, bottom, [score])
            and the pose (ndarray[Kx3]): x, y, score.
        list[dict[np.ndarray[N, K, H, W] | torch.Tensor[N, K, H, W]]]:
            Output feature maps from layers specified in `outputs`.
            Includes 'heatmap' if `return_heatmap` is True.
    """
    # only two kinds of bbox formats are supported.
    assert format in ['xyxy', 'xywh']

    pose_results = []
    returned_outputs = []

    with OutputHook(model, outputs=outputs, as_tensor=False) as h:
        for person_result in person_results:
            if format == 'xyxy':
                bbox_xyxy = np.expand_dims(np.array(person_result['bbox']), 0)
                bbox_xywh = _xyxy2xywh(bbox_xyxy)
            else:
                bbox_xywh = np.expand_dims(np.array(person_result['bbox']), 0)
                bbox_xyxy = _xywh2xyxy(bbox_xywh)

            if bbox_thr is not None:
                assert bbox_xywh.shape[1] == 5
                if bbox_xywh[0, 4] < bbox_thr:
                    continue

            pose, heatmap = _inference_single_pose_model(
                model,
                img_or_path,
                bbox_xywh[0],
                dataset,
                return_heatmap=return_heatmap)

            if return_heatmap:
                h.layer_outputs['heatmap'] = heatmap

            returned_outputs.append(h.layer_outputs)

            person_result['keypoints'] = pose

            if format == 'xywh':
                person_result['bbox'] = bbox_xyxy[0]

            pose_results.append(person_result)

    return pose_results, returned_outputs
Example #5
def inference_bottom_up_pose_model(model,
                                   img_or_path,
                                   dataset='BottomUpCocoDataset',
                                   dataset_info=None,
                                   pose_nms_thr=0.9,
                                   return_heatmap=False,
                                   outputs=None):
    """Inference a single image with a bottom-up pose model.

    Note:
        - num_people: P
        - num_keypoints: K
        - bbox height: H
        - bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path (str| np.ndarray): Image filename or loaded image.
        dataset (str): Dataset name, e.g. 'BottomUpCocoDataset'.
            It is deprecated. Please use dataset_info instead.
        dataset_info (DatasetInfo): A class containing all dataset info.
        pose_nms_thr (float): retain oks overlap < pose_nms_thr, default: 0.9.
        return_heatmap (bool): Flag to return heatmap. Default: False.
        outputs (list(str) | tuple(str)): Names of layers whose outputs
            need to be returned. Default: None.

    Returns:
        tuple:
        - pose_results (list[np.ndarray]): The predicted pose info. \
            The length of the list is the number of people (P). \
            Each item in the list is a ndarray, containing each \
            person's pose (np.ndarray[Kx3]): x, y, score.
        - returned_outputs (list[dict[np.ndarray[N, K, H, W] | \
            torch.Tensor[N, K, H, W]]]): \
            Output feature maps from layers specified in `outputs`. \
            Includes 'heatmap' if `return_heatmap` is True.
    """
    # get dataset info
    if (dataset_info is None and hasattr(model, 'cfg')
            and 'dataset_info' in model.cfg):
        dataset_info = DatasetInfo(model.cfg.dataset_info)

    if dataset_info is not None:
        dataset_name = dataset_info.dataset_name
        flip_index = dataset_info.flip_index
        sigmas = getattr(dataset_info, 'sigmas', None)
    else:
        warnings.warn(
            'dataset is deprecated. '
            'Please set `dataset_info` in the config. '
            'Check https://github.com/open-mmlab/mmpose/pull/663 for details.',
            DeprecationWarning)
        assert dataset == 'BottomUpCocoDataset'
        dataset_name = dataset
        # COCO 17-keypoint left/right flip mapping
        flip_index = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
        sigmas = None

    pose_results = []
    returned_outputs = []

    cfg = model.cfg
    device = next(model.parameters()).device
    if device.type == 'cpu':
        # mmcv's scatter uses -1 to denote CPU
        device = -1

    # build the data pipeline
    test_pipeline = Compose(cfg.test_pipeline)
    _pipeline_gpu_speedup(test_pipeline, next(model.parameters()).device)

    # prepare data
    data = {
        'dataset': dataset_name,
        'ann_info': {
            'image_size': np.array(cfg.data_cfg['image_size']),
            'num_joints': cfg.data_cfg['num_joints'],
            'flip_index': flip_index,
        }
    }
    if isinstance(img_or_path, np.ndarray):
        data['img'] = img_or_path
    else:
        data['image_file'] = img_or_path

    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    data = scatter(data, [device])[0]

    with OutputHook(model, outputs=outputs, as_tensor=False) as h:
        # forward the model
        with torch.no_grad():
            result = model(img=data['img'],
                           img_metas=data['img_metas'],
                           return_loss=False,
                           return_heatmap=return_heatmap)

        if return_heatmap:
            h.layer_outputs['heatmap'] = result['output_heatmap']

        returned_outputs.append(h.layer_outputs)

        for idx, pred in enumerate(result['preds']):
            # bounding-box area spanned by the keypoints, used for oks_nms
            area = (np.max(pred[:, 0]) - np.min(pred[:, 0])) * (
                np.max(pred[:, 1]) - np.min(pred[:, 1]))
            pose_results.append({
                'keypoints': pred[:, :3],
                'score': result['scores'][idx],
                'area': area,
            })

        # pose nms
        score_per_joint = cfg.model.test_cfg.get('score_per_joint', False)
        keep = oks_nms(pose_results,
                       pose_nms_thr,
                       sigmas,
                       score_per_joint=score_per_joint)
        pose_results = [pose_results[_keep] for _keep in keep]

    return pose_results, returned_outputs
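
A usage sketch for this newer bottom-up API; when the model config carries a `dataset_info` section it is picked up automatically, so the call can stay short (model loading as in the earlier sketches is assumed):

# dataset_info is read from model.cfg when present.
pose_results, returned_outputs = inference_bottom_up_pose_model(
    pose_model,
    'frame_000.jpg',
    pose_nms_thr=0.9,   # suppress poses whose OKS overlap exceeds 0.9
    return_heatmap=False)

for res in pose_results:
    print(res['score'], res['area'], res['keypoints'].shape)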
Example #6
def inference_top_down_pose_model(model,
                                  imgs_or_paths,
                                  person_results=None,
                                  bbox_thr=None,
                                  format='xywh',
                                  dataset='TopDownCocoDataset',
                                  dataset_info=None,
                                  return_heatmap=False,
                                  outputs=None):
    """Inference a single image with a list of person bounding boxes. Support
    single-frame and multi-frame inference setting.

    Note:
        - num_frames: F
        - num_people: P
        - num_keypoints: K
        - bbox height: H
        - bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        imgs_or_paths (str | np.ndarray | list(str) | list(np.ndarray)):
            Image filename(s) or loaded image(s).
        person_results (list(dict), optional): a list of detected persons that
            contains ``bbox`` and/or ``track_id``. If not provided, a dummy
            person result with a bbox covering the entire image will be used.
            Default: None.

            - ``bbox`` (4, ) or (5, ): The person bounding box, which contains
                4 box coordinates (and score).
            - ``track_id`` (int): The unique id for each human instance.
        bbox_thr (float | None): Threshold for bounding boxes. Only bboxes
            with higher scores will be fed into the pose detector.
            If bbox_thr is None, all boxes will be used.
        format (str): bbox format ('xyxy' | 'xywh'). Default: 'xywh'.

            - `xyxy` means (left, top, right, bottom),
            - `xywh` means (left, top, width, height).
        dataset (str): Dataset name, e.g. 'TopDownCocoDataset'.
            It is deprecated. Please use dataset_info instead.
        dataset_info (DatasetInfo): A class containing all dataset info.
        return_heatmap (bool): Flag to return heatmap. Default: False.
        outputs (list(str) | tuple(str)): Names of layers whose outputs
            need to be returned. Default: None.

    Returns:
        tuple:
        - pose_results (list[dict]): The bbox & pose info. \
            Each item in the list is a dictionary, \
            containing the bbox: (left, top, right, bottom, [score]) \
            and the pose (ndarray[Kx3]): x, y, score.
        - returned_outputs (list[dict[np.ndarray[N, K, H, W] | \
            torch.Tensor[N, K, H, W]]]): \
            Output feature maps from layers specified in `outputs`. \
            Includes 'heatmap' if `return_heatmap` is True.
    """
    # decide whether to use multi frames for inference
    if isinstance(imgs_or_paths, (list, tuple)):
        use_multi_frames = True
    else:
        assert isinstance(imgs_or_paths, (str, np.ndarray))
        use_multi_frames = False
    # get dataset info
    if (dataset_info is None and hasattr(model, 'cfg')
            and 'dataset_info' in model.cfg):
        dataset_info = DatasetInfo(model.cfg.dataset_info)
    if dataset_info is None:
        warnings.warn(
            'dataset is deprecated. '
            'Please set `dataset_info` in the config. '
            'Check https://github.com/open-mmlab/mmpose/pull/663'
            ' for details.', DeprecationWarning)

    # only two kinds of bbox formats are supported.
    assert format in ['xyxy', 'xywh']

    pose_results = []
    returned_outputs = []

    if person_results is None:
        # create dummy person results
        sample = imgs_or_paths[0] if use_multi_frames else imgs_or_paths
        if isinstance(sample, str):
            width, height = Image.open(sample).size
        else:
            height, width = sample.shape[:2]
        person_results = [{'bbox': np.array([0, 0, width, height])}]

    if len(person_results) == 0:
        return pose_results, returned_outputs

    # Preprocess all bboxes at once instead of looping over each one.
    bboxes = np.array([box['bbox'] for box in person_results])

    # Select bboxes by score threshold
    if bbox_thr is not None:
        assert bboxes.shape[1] == 5
        valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0]
        bboxes = bboxes[valid_idx]
        person_results = [person_results[i] for i in valid_idx]

    if format == 'xyxy':
        bboxes_xyxy = bboxes
        bboxes_xywh = bbox_xyxy2xywh(bboxes)
    else:
        # format is already 'xywh'
        bboxes_xywh = bboxes
        bboxes_xyxy = bbox_xywh2xyxy(bboxes)

    # if bbox_thr filtered out all the bounding boxes
    if len(bboxes_xywh) == 0:
        return [], []

    with OutputHook(model, outputs=outputs, as_tensor=False) as h:
        # poses is results['pred']  # shape: (N, 17, 3)
        poses, heatmap = _inference_single_pose_model(
            model,
            imgs_or_paths,
            bboxes_xywh,
            dataset=dataset,
            dataset_info=dataset_info,
            return_heatmap=return_heatmap,
            use_multi_frames=use_multi_frames)

        if return_heatmap:
            h.layer_outputs['heatmap'] = heatmap

        returned_outputs.append(h.layer_outputs)

    assert len(poses) == len(person_results), (
        f'{len(poses)} poses vs {len(person_results)} person results '
        f'({len(bboxes_xyxy)} bboxes)')
    for pose, person_result, bbox_xyxy in zip(poses, person_results,
                                              bboxes_xyxy):
        pose_result = person_result.copy()
        pose_result['keypoints'] = pose
        pose_result['bbox'] = bbox_xyxy
        pose_results.append(pose_result)

    return pose_results, returned_outputs
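
A sketch of the multi-frame path of this version: passing a list of images flips `use_multi_frames` on, and omitting `person_results` falls back to a dummy whole-image box (the frame paths are placeholders):

# A list input triggers multi-frame inference; a plain str or ndarray stays
# single-frame. With person_results=None a full-image bbox is synthesized.
frames = ['frame_000.jpg', 'frame_001.jpg', 'frame_002.jpg']
pose_results, returned_outputs = inference_top_down_pose_model(
    pose_model, frames, person_results=None, format='xywh')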
Example #7
def inference_top_down_pose_model_multi(model,
                                        img_or_path_video,
                                        person_results_video,
                                        bbox_thr=None,
                                        format='xywh',
                                        dataset='TopDownCocoDataset',
                                        return_heatmap=False,
                                        outputs=None):
    """Inference a single image with a list of person bounding boxes.

    num_people: P
    num_keypoints: K
    bbox height: H
    bbox width: W

    Args:
        model (nn.Module): The loaded pose model.
        img_or_path_video (list(str | np.ndarray)): Image filenames or
            loaded images, one per frame.
        person_results_video (list(list(dict))): Per-frame person results;
            each item is a dict that may contain 'bbox' and/or 'track_id'.
            'bbox' (4, ) or (5, ): The person bounding box, which contains
            4 box coordinates (and optionally a score).
            'track_id' (int): The unique id for each human instance.
        bbox_thr: Threshold for bounding boxes. Only bboxes with higher scores
            will be fed into the pose detector. If bbox_thr is None, ignore it.
        format: bbox format ('xyxy' | 'xywh'). Default: 'xywh'.
            'xyxy' means (left, top, right, bottom),
            'xywh' means (left, top, width, height).
        dataset (str): Dataset name, e.g. 'TopDownCocoDataset'.
        return_heatmap (bool): Flag to return heatmap. Default: False.
        outputs (list(str) | tuple(str)): Names of layers whose outputs
            need to be returned. Default: None.

    Returns:
        dict: The merged sequence result, containing 'keypoints'
            (ndarray[F x K x 3]: x, y, score, zero-filled for frames
            without a valid bbox) and 'bbox' (the collected input bboxes).
        list[dict]: Output feature maps from layers specified in `outputs`
            (left empty by this variant).
        list[int]: Indices of frames that had no valid bbox.
    """
    # only two kinds of bbox formats are supported.
    assert format in ['xyxy', 'xywh']

    pose_results = []
    returned_outputs = []

    bbox_xyxy_list = []
    bbox_xywh_list = []
    merge_result = {}

    for person_results in person_results_video:
        if not person_results:
            bbox_xywh_list.append([])
            bbox_xyxy_list.append([])
            continue
        with OutputHook(model, outputs=outputs, as_tensor=False) as h:
            for person_result in person_results:
                bbox_xyxy_list.append(person_result['bbox'])
                if format == 'xyxy':
                    try:
                        bbox_xyxy = np.expand_dims(
                            np.array(person_result['bbox']), 0)
                    except Exception:
                        # Surface the offending input instead of silently
                        # continuing with an undefined bbox_xyxy.
                        print(person_results_video)
                        raise
                    bbox_xywh = _xyxy2xywh(bbox_xyxy)
                else:
                    bbox_xywh = np.expand_dims(
                        np.array(person_result['bbox']), 0)
                    bbox_xyxy = _xywh2xyxy(bbox_xywh)

                if bbox_thr is not None:
                    assert bbox_xywh.shape[1] == 5
                    if bbox_xywh[0, 4] < bbox_thr:
                        continue
                bbox_xywh_list.append(bbox_xywh[0])

    valid_index_list = []
    invalid_index_list = []
    valid_img_or_path_video = []
    valid_bbox_xywh_list = []
    for k in range(len(bbox_xywh_list)):
        if len(bbox_xywh_list[k]) == 0:
            invalid_index_list.append(k)
        else:
            valid_index_list.append(k)
            valid_img_or_path_video.append(img_or_path_video[k])
            valid_bbox_xywh_list.append(bbox_xywh_list[k])

    if len(valid_bbox_xywh_list) != 0:
        pose_valid, heatmap = _inference_single_pose_model(
            model,
            valid_img_or_path_video,
            valid_bbox_xywh_list,
            dataset,
            return_heatmap=return_heatmap)

        # Note: layer outputs / heatmaps are not collected in this variant,
        # so returned_outputs stays empty.
    pose = []
    index_valid = 0
    for k in range(len(img_or_path_video)):
        if k in valid_index_list:
            pose.append(pose_valid[index_valid])
            index_valid += 1
        else:
            # 133 keypoints corresponds to the COCO-WholeBody layout;
            # frames without a valid bbox get zero-filled poses.
            pose.append(np.zeros((133, 3)))
    pose = np.stack(pose, axis=0)

    merge_result['keypoints'] = pose
    merge_result['bbox'] = bbox_xyxy_list

    return merge_result, returned_outputs, invalid_index_list
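
A sketch of how this video-level variant might be driven. Note that, unlike the other examples, it returns one merged dict plus the indices of frames without a usable detection (the frame data below is illustrative only):

# One bbox list per frame; an empty list marks a frame with no detection.
frames = ['f0.jpg', 'f1.jpg', 'f2.jpg']
person_results_video = [
    [{'bbox': [10, 10, 200, 400, 0.9]}],    # frame 0
    [],                                     # frame 1: nothing detected
    [{'bbox': [12, 11, 198, 402, 0.88]}],   # frame 2
]

merge_result, _, invalid_idx = inference_top_down_pose_model_multi(
    pose_model, frames, person_results_video,
    bbox_thr=0.5, format='xyxy')

print(merge_result['keypoints'].shape)  # (F, 133, 3); zeros at invalid_idx
print(invalid_idx)                      # e.g. [1]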