def gen_pose_frame(kpts, width, height, model_pos, pad, causal_shift=0):
    # kpts: (M, T, N, 2)
    norm_seqs = []
    for kpt in kpts:
        norm_kpt = normalize_screen_coordinates(kpt, w=width, h=height)
        norm_seqs.append(norm_kpt)

    gen = UnchunkedGenerator(None,
                             None,
                             norm_seqs,
                             pad=pad,
                             causal_shift=causal_shift,
                             augment=True,
                             kps_left=kps_left,
                             kps_right=kps_right,
                             joints_left=joints_left,
                             joints_right=joints_right)
    prediction = evaluate(gen, model_pos)

    prediction_to_world = []
    for i in range(len(prediction)):
        sub_prediction = prediction[i][0]
        sub_prediction = camera_to_world(sub_prediction, R=rot, t=0)
        sub_prediction[:, 2] -= np.amin(sub_prediction[:, 2])
        prediction_to_world.append(sub_prediction)

    return prediction_to_world
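Every snippet in this collection leans on common.camera.normalize_screen_coordinates. As a point of reference, here is a minimal sketch of what that helper and its inverse typically look like in VideoPose3D-style code; the exact implementation in any given repository may differ slightly.

import numpy as np

def normalize_screen_coordinates(X, w, h):
    # Map pixel coordinates so that the x range [0, w] becomes [-1, 1],
    # scaling y by the same factor so the aspect ratio is preserved.
    assert X.shape[-1] == 2
    return X / w * 2 - np.array([1, h / w])

def image_coordinates(X, w, h):
    # Inverse mapping: normalized coordinates back to pixel coordinates.
    assert X.shape[-1] == 2
    return (X + np.array([1, h / w])) * w / 2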
Example #2
def interface(model_pos, keypoints, W, H):
    # input (N, 17, 2) return (N, 17, 3)
    if not isinstance(keypoints, np.ndarray):
        keypoints = np.array(keypoints)

    from common.camera import normalize_screen_coordinates_new, camera_to_world, normalize_screen_coordinates
    #  keypoints = normalize_screen_coordinates_new(keypoints[..., :2], w=W, h=H)
    # NOTE: this snippet hardcodes w=1000, h=1002 here instead of using the W and H arguments
    keypoints = normalize_screen_coordinates(keypoints[..., :2],
                                             w=1000,
                                             h=1002)
    input_keypoints = keypoints.copy()
    # test_time_augmentation True
    from common.generators import UnchunkedGenerator
    gen = UnchunkedGenerator(None,
                             None, [input_keypoints],
                             pad=common.pad,
                             causal_shift=common.causal_shift,
                             augment=True,
                             kps_left=common.kps_left,
                             kps_right=common.kps_right,
                             joints_left=common.joints_left,
                             joints_right=common.joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)
    prediction = camera_to_world(prediction, R=common.rot, t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    return prediction
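A hypothetical call site for interface(), assuming model_pos is an already-loaded temporal model and the detections are in H36M joint order; the array below is purely illustrative.

import numpy as np

kpts_2d = np.random.rand(120, 17, 2) * np.array([1000, 1002])  # fake (N, 17, 2) pixel-space detections
poses_3d = interface(model_pos, kpts_2d, W=1000, H=1002)       # -> (N, 17, 3) world-space joints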
Example #3
    def __init__(self, path, remove_static_joints=True):
        super().__init__(fps=50, skeleton=h36m_skeleton)

        # [SWAPPED ORDER]
        # Load serialized dataset
        data = np.load(path, allow_pickle=True)['positions_3d'].item()

        self._cameras = Human36mDataset.__ext_gen(data)
        # self._cameras = copy.deepcopy(h36m_cameras_extrinsic_params)

        for cameras in self._cameras.values():
            for i, cam in enumerate(cameras):

                # use the same camera for each view and eliminate distortion
                # cam.update(h36m_cameras_intrinsic_params[i])
                cam.update(modified_cameras_intrinsic_params[i])

                for k, v in cam.items():
                    if k not in ['id', 'res_w', 'res_h']:
                        cam[k] = np.array(v, dtype='float32')

                # Normalize camera frame
                cam['center'] = normalize_screen_coordinates(
                    cam['center'], w=cam['res_w'],
                    h=cam['res_h']).astype('float32')
                cam['focal_length'] = cam['focal_length'] / cam['res_w'] * 2
                if 'translation' in cam:
                    cam['translation'] = cam[
                        'translation'] / 1000  # mm to meters

                # Add intrinsic parameters vector
                cam['intrinsic'] = np.concatenate(
                    (cam['focal_length'], cam['center'],
                     cam['radial_distortion'], cam['tangential_distortion']))

        # # Load serialized dataset
        # data = np.load(path, allow_pickle=True)['positions_3d'].item()

        self._data = {}
        for subject, actions in data.items():
            self._data[subject] = {}
            for action_name, positions in actions.items():
                self._data[subject][action_name] = {
                    'positions': positions,
                    'cameras': self._cameras[subject],
                }

        if remove_static_joints:
            # Bring the skeleton to 17 joints instead of the original 32
            self.remove_joints(
                [4, 5, 9, 10, 11, 16, 20, 21, 22, 23, 24, 28, 29, 30, 31])

            # Rewire shoulders to the correct parents
            self._skeleton._parents[11] = 8
            self._skeleton._parents[14] = 8
    def __init__(self, path, keypoints_type='cnp_ft_h36m_dbb', remove_static_joints=True):
        super().__init__(fps=50, skeleton=h36m_skeleton)

        self._cameras = copy.deepcopy(h36m_cameras_extrinsic_params)
        for cameras in self._cameras.values():
            for i, cam in enumerate(cameras):
                cam.update(h36m_cameras_intrinsic_params[i])
                for k, v in cam.items():
                    if k not in ["id", "res_w", "res_h"]:
                        cam[k] = np.array(v, dtype="float32")

                # Normalize camera frame
                cam["center"] = normalize_screen_coordinates(cam["center"], w=cam["res_w"], h=cam["res_h"]).astype("float32")
                cam["focal_length"] = cam["focal_length"] / cam["res_w"] * 2
                if "translation" in cam:
                    cam["translation"] = cam["translation"] / 1000  # mm to meters

                # Add intrinsic parameters vectors
                cam["intrinsic"] = np.concatenate((cam["focal_length"],
                                                   cam["center"],
                                                   cam["radial_distortion"],
                                                   cam["tangential_distortion"]))

        # Load serialized dataset
        data = np.load(path, allow_pickle=True)["positions_3d"].item()

        self._data = {}
        for subject, actions in data.items():
            self._data[subject] = {}
            for action_name, positions in actions.items():
                self._data[subject][action_name] = {
                    "positions": positions,
                    "cameras": self._cameras[subject]
                }

        print(keypoints_type)
        if remove_static_joints and (keypoints_type == 'sh_ft_h36m' or keypoints_type == 'sh_pt_mpii'):
            # Bring the skeleton to 16 joints instead of the original 32
            joints = []
            for i, x in enumerate(H36M_NAMES):
                if x == '' or x == 'Neck/Nose':  # Remove 'Nose' to make SH and H36M 2D poses have the same dimension
                    joints.append(i)
            self.remove_joints(joints)

            # Rewire shoulders to the correct parents
            self._skeleton._parents[10] = 8
            self._skeleton._parents[13] = 8

        else:
            # Bring the skeleton to 17 joints instead of the original 32
            self.remove_joints([4, 5, 9, 10, 11, 16, 20, 21, 22, 23, 24, 28, 29, 30, 31])

            # Rewire shoulders to the correct parents
            self._skeleton._parents[11] = 8
            self._skeleton._parents[14] = 8
Example #5
def preprocess():
    for subject in dataset.subjects():
        assert subject in keypoints, 'Subject {} is missing from the 2D detections dataset'.format(
            subject)
        for action in dataset[subject].keys():
            assert action in keypoints[
                subject], 'Action {} of subject {} is missing from the 2D detections dataset'.format(
                    action, subject)
            if 'positions_3d' not in dataset[subject][action]:
                continue

            for cam_idx in range(len(keypoints[subject][action])):

                # We check for >= instead of == because some videos in H3.6M contain extra frames
                mocap_length = dataset[subject][action]['positions_3d'][
                    cam_idx].shape[0]
                assert keypoints[subject][action][cam_idx].shape[
                    0] >= mocap_length

                if keypoints[subject][action][cam_idx].shape[0] > mocap_length:
                    # Shorten sequence
                    keypoints[subject][action][cam_idx] = keypoints[subject][
                        action][cam_idx][:mocap_length]

            assert len(keypoints[subject][action]) == len(
                dataset[subject][action]['positions_3d'])

    for subject in keypoints.keys():
        for action in keypoints[subject]:
            for cam_idx, kps in enumerate(keypoints[subject][action]):
                # Normalize camera frame
                cam = dataset.cameras()[subject][cam_idx]
                if args.keypoint_probs:
                    kps[..., :2] = normalize_screen_coordinates(kps[..., :2],
                                                                w=cam['res_w'],
                                                                h=cam['res_h'])
                else:
                    kps = normalize_screen_coordinates(kps[..., :2],
                                                       w=cam['res_w'],
                                                       h=cam['res_h'])
                keypoints[subject][action][cam_idx] = kps
    def __init__(self, path, remove_static_joints=True):
        super(Human36mDataset, self).__init__(skeleton=h36m_skeleton, fps=50)

        self._cameras = copy.deepcopy(h36m_cameras_extrinsic_params)
        for cameras in self._cameras.values():
            for i, cam in enumerate(cameras):
                cam.update(h36m_cameras_intrinsic_params[i])
                for k, v in cam.items():
                    if k not in ['id', 'res_w', 'res_h']:
                        cam[k] = np.array(v, dtype='float32')

                # Normalize camera frame
                cam['center'] = normalize_screen_coordinates(cam['center'], w=cam['res_w'], h=cam['res_h']).astype(
                    'float32')
                cam['focal_length'] = cam['focal_length'] / cam['res_w'] * 2.0
                if 'translation' in cam:
                    cam['translation'] = cam['translation'] / 1000  # mm to meters

                # Add intrinsic parameters vector
                cam['intrinsic'] = np.concatenate((cam['focal_length'],
                                                   cam['center'],
                                                   cam['radial_distortion'],
                                                   cam['tangential_distortion']))

        # Load serialized dataset
        data = np.load(path, allow_pickle=True)['positions_3d'].item()

        self._data = {}
        for subject, actions in data.items():
            self._data[subject] = {}
            for action_name, positions in actions.items():
                self._data[subject][action_name] = {
                    'positions': positions,
                    'cameras': self._cameras[subject],
                }

        if remove_static_joints:
            # Bring the skeleton to 16 joints instead of the original 32
            joints = []
            for i, x in enumerate(H36M_NAMES):
                if x == '' or x == 'Neck/Nose':  # Remove 'Nose' to make SH (Stacked Hourglass) and H36M 2D poses have the same dimension
                # if x == '':
                    joints.append(i)
            self.remove_joints(joints)

            # Rewire shoulders to the correct parents
            self._skeleton._parents[10] = 8
            self._skeleton._parents[13] = 8

            # Set joints group
            self._skeleton._joints_group = h36m_skeleton_joints_group
Example #7
    def __init__(self, path, remove_static_joints=True):
        super(Human36mDataset, self).__init__(skeleton=h36m_skeleton, fps=50)   # initialize the parent class

        self._cameras = copy.deepcopy(h36m_cameras_extrinsic_params)    # camera extrinsic parameters of the H36M dataset
        for cameras in self._cameras.values():
            for i, cam in enumerate(cameras):
                cam.update(h36m_cameras_intrinsic_params[i])    # merge the intrinsic parameters into cam (the extrinsics dict)
                for k, v in cam.items():    # iterate over the keys and values of the dict
                    if k not in ['id', 'res_w', 'res_h']:   # skip these metadata keys
                        cam[k] = np.array(v, dtype='float32')   # convert every other value to a float32 np.array

                # Normalize camera frame
                cam['center'] = normalize_screen_coordinates(cam['center'], w=cam['res_w'], h=cam['res_h']).astype(
                    'float32')  # normalize the principal point
                cam['focal_length'] = cam['focal_length'] / cam['res_w'] * 2.0  # normalize the focal length
                if 'translation' in cam:    # only the extrinsic parameters contain a 'translation' key
                    cam['translation'] = cam['translation'] / 1000  # mm to meters

                # Add intrinsic parameters vector (array concatenation)
                cam['intrinsic'] = np.concatenate((cam['focal_length'],     # focal length
                                                   cam['center'],           # principal point
                                                   cam['radial_distortion'],  # radial distortion coefficients
                                                   cam['tangential_distortion']))   # tangential distortion

        # Load serialized dataset
        data = np.load(path, allow_pickle=True)['positions_3d'].item()

        self._data = {}
        for subject, actions in data.items():
            self._data[subject] = {}
            for action_name, positions in actions.items():
                self._data[subject][action_name] = {
                    'positions': positions,     # 3D joint positions
                    'cameras': self._cameras[subject],  # camera parameters
                }

        if remove_static_joints:    # remove static joints
            # Bring the skeleton to 16 joints instead of the original 32
            joints = []
            for i, x in enumerate(H36M_NAMES):
                if x == '' or x == 'Neck/Nose':  # Remove 'Nose' to make SH and H36M 2D poses have the same dimension
                    joints.append(i)
            self.remove_joints(joints)

            # Rewire shoulders to the correct parents
            self._skeleton._parents[10] = 8
            self._skeleton._parents[13] = 8

            # Set joints group
            self._skeleton._joints_group = h36m_skeleton_joints_group
Example #8
def create_2d_data(data_path, dataset):
    keypoints = np.load(data_path, allow_pickle=True)
    keypoints = keypoints['positions_2d'].item()

    for subject in keypoints.keys():
        for action in keypoints[subject]:
            for cam_idx, kps in enumerate(keypoints[subject][action]):
                # Normalize camera frame
                cam = dataset.cameras()[subject][cam_idx]
                kps[..., :2] = normalize_screen_coordinates(kps[..., :2],
                                                            w=cam['res_w'],
                                                            h=cam['res_h'])
                keypoints[subject][action][cam_idx] = kps

    return keypoints
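A hedged usage sketch for create_2d_data(), assuming the usual VideoPose3D data layout; the file names below are illustrative and depend on which 2D detections were exported.

from common.h36m_dataset import Human36mDataset

dataset = Human36mDataset('data/data_3d_h36m.npz')
keypoints = create_2d_data('data/data_2d_h36m_cpn_ft_h36m_dbb.npz', dataset)
# keypoints[subject][action][cam_idx] now holds normalized (N, 17, 2) arrays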
Example #9
    def __init__(self, path, remove_static_joints=True):
        super().__init__(fps=50, skeleton=h36m_skeleton)

        self._cameras = copy.deepcopy(h36m_cameras_extrinsic_params)
        for cameras in self._cameras.values():
            for i, cam in enumerate(cameras):
                cam.update(h36m_cameras_intrinsic_params[i])
                for k, v in cam.items():
                    if k not in ["id", "res_w", "res_h"]:
                        cam[k] = np.array(v, dtype="float32")

                # Normalize camera frame
                cam["center"] = normalize_screen_coordinates(
                    cam["center"], w=cam["res_w"],
                    h=cam["res_h"]).astype("float32")
                cam["focal_length"] = cam["focal_length"] / cam["res_w"] * 2
                if "translation" in cam:
                    cam["translation"] = cam[
                        "translation"] / 1000  # mm to meters

                # Add intrinsic parameters vectors
                cam["intrinsic"] = np.concatenate(
                    (cam["focal_length"], cam["center"],
                     cam["radial_distortion"], cam["tangential_distortion"]))

        # Load serialized dataset
        data = np.load(path, allow_pickle=True)["positions_3d"].item()

        self._data = {}
        for subject, actions in data.items():
            self._data[subject] = {}
            for action_name, positions in actions.items():
                self._data[subject][action_name] = {
                    "positions": positions,
                    "cameras": self._cameras[subject]
                }
        if remove_static_joints:
            # Bring the skeleton to 17 joints instead of the original 32
            self.remove_joints(
                [4, 5, 9, 10, 11, 16, 20, 21, 22, 23, 24, 28, 29, 30, 31])

            # Rewire shoulders to the correct parents
            self._skeleton._parents[11] = 8
            self._skeleton._parents[14] = 8
 def __init__(self, path, remove_static_joints=True):
     super().__init__(fps=50, skeleton=h36m_skeleton)
     
     self._cameras = copy.deepcopy(h36m_cameras_extrinsic_params)
     for cameras in self._cameras.values():
         for i, cam in enumerate(cameras):
             cam.update(h36m_cameras_intrinsic_params[i])
             for k, v in cam.items():
                 if k not in ['id', 'res_w', 'res_h']:
                     cam[k] = np.array(v, dtype='float32')
             
             # Normalize camera frame
             cam['center'] = normalize_screen_coordinates(cam['center'], w=cam['res_w'], h=cam['res_h']).astype('float32')
             cam['focal_length'] = cam['focal_length']/cam['res_w']*2
             if 'translation' in cam:
                 cam['translation'] = cam['translation']/1000 # mm to meters
             
             # Add intrinsic parameters vector
             cam['intrinsic'] = np.concatenate((cam['focal_length'],
                                                cam['center'],
                                                cam['radial_distortion'],
                                                cam['tangential_distortion']))
     
     # Load serialized dataset
      data = np.load(path, allow_pickle=True)['positions_3d'].item()
     
     self._data = {}
     for subject, actions in data.items():
         self._data[subject] = {}
         for action_name, positions in actions.items():
             self._data[subject][action_name] = {
                 'positions': positions,
                 'cameras': self._cameras[subject],
             }
             
     if remove_static_joints:
         # Bring the skeleton to 17 joints instead of the original 32
         self.remove_joints([4, 5, 9, 10, 11, 16, 20, 21, 22, 23, 24, 28, 29, 30, 31])
         
         # Rewire shoulders to the correct parents
         self._skeleton._parents[11] = 8
         self._skeleton._parents[14] = 8
Example #11
    def init_keypoints(self):
        self.keypoints = np.load(self.dataset_path, allow_pickle=True)
        keypoints_metadata = self.keypoints['metadata'].item()
        keypoints_symmetry = keypoints_metadata['keypoints_symmetry']
        self.keypoints_left, self.keypoints_right = list(
            keypoints_symmetry[0]), list(keypoints_symmetry[1])
        self.joints_left, self.joints_right = list(
            self.dataset.skeleton().joints_left()), list(
                self.dataset.skeleton().joints_right())
        self.keypoints = self.keypoints['positions_2d'].item()

        for subject in self.keypoints.keys():
            for action in self.keypoints[subject]:
                for cam_idx, kps in enumerate(self.keypoints[subject][action]):
                    # Normalize camera frame
                    cam = self.dataset.cameras()[subject][cam_idx]
                    kps[..., :2] = normalize_screen_coordinates(kps[..., :2],
                                                                w=cam['res_w'],
                                                                h=cam['res_h'])
                    self.keypoints[subject][action][cam_idx] = kps
def gen_pose(kpts,
             valid_frames,
             width,
             height,
             model_pos,
             pad,
             causal_shift=0):
    assert len(kpts.shape) == 4, 'The shape of kpts: {}'.format(kpts.shape)
    assert kpts.shape[0] == len(valid_frames)

    norm_seqs = []
    for index, frames in enumerate(valid_frames):
        seq_kps = kpts[index, frames]
        norm_seq_kps = normalize_screen_coordinates(seq_kps, w=width, h=height)
        norm_seqs.append(norm_seq_kps)

    gen = UnchunkedGenerator(None,
                             None,
                             norm_seqs,
                             pad=pad,
                             causal_shift=causal_shift,
                             augment=True,
                             kps_left=kps_left,
                             kps_right=kps_right,
                             joints_left=joints_left,
                             joints_right=joints_right)
    prediction = evaluate(gen, model_pos)

    prediction_to_world = []
    for i in range(len(prediction)):
        sub_prediction = prediction[i]

        sub_prediction = camera_to_world(sub_prediction, R=rot, t=0)

        # sub_prediction[:, :, 2] -= np.expand_dims(np.amin(sub_prediction[:, :, 2], axis=1), axis=1).repeat([17], axis=1)
        # sub_prediction[:, :, 2] -= np.amin(sub_prediction[:, :, 2])

        prediction_to_world.append(sub_prediction)

    # prediction_to_world = np.asarray(prediction_to_world, dtype=np.float32)
    return prediction_to_world
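An illustrative call to gen_pose() for two tracked people; valid_frames lists, per person, the frame indices with usable detections. model_pos and pad are assumed to be set up as in the surrounding examples, and the array contents are dummies.

import numpy as np

kpts = np.random.rand(2, 300, 17, 2) * np.array([1920, 1080])  # (M, T, 17, 2) pixel coordinates
valid_frames = [np.arange(300), np.arange(50, 250)]            # per-person valid frame indices
poses_3d = gen_pose(kpts, valid_frames, width=1920, height=1080,
                    model_pos=model_pos, pad=pad, causal_shift=0)
# poses_3d[i] is a (len(valid_frames[i]), 17, 3) sequence in world coordinates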
def gen_pose_frame_(kpts, width, height, model_pos, pad, causal_shift=0):
    # input (N, 17, 2) return (N, 17, 3)
    if not isinstance(kpts, np.ndarray):
        kpts = np.array(kpts)

    keypoints = normalize_screen_coordinates(kpts[..., :2], w=width, h=height)

    input_keypoints = keypoints.copy()
    # test_time_augmentation True
    from common.generators import UnchunkedGenerator
    gen = UnchunkedGenerator(None,
                             None, [input_keypoints],
                             pad=pad,
                             causal_shift=causal_shift,
                             augment=True,
                             kps_left=kps_left,
                             kps_right=kps_right,
                             joints_left=joints_left,
                             joints_right=joints_right)
    prediction = evaluate(gen, model_pos)
    prediction = camera_to_world(prediction[0], R=rot, t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    return prediction
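camera_to_world in these snippets applies a camera-to-world rotation given as a quaternion (the rot constant) plus an optional translation. Below is a numpy sketch of that operation, assuming VideoPose3D's (w, x, y, z) quaternion convention; it only illustrates the math and is not a drop-in replacement for common.camera.camera_to_world.

import numpy as np

def qrot_np(q, v):
    # Rotate points v (..., 3) by unit quaternions q (..., 4) in (w, x, y, z) order:
    # v' = v + 2 * (w * (q_vec x v) + q_vec x (q_vec x v))
    qvec = q[..., 1:]
    uv = np.cross(qvec, v)
    uuv = np.cross(qvec, uv)
    return v + 2 * (q[..., :1] * uv + uuv)

def camera_to_world_np(X, R, t=0):
    # Broadcast the single quaternion R over all frames/joints, rotate, then translate.
    q = np.broadcast_to(R, X.shape[:-1] + (4,))
    return qrot_np(q, X) + t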
Example #14
            if keypoints[subject][action][cam_idx].shape[0] > mocap_length:
                # Shorten sequence
                keypoints[subject][action][cam_idx] = keypoints[subject][
                    action][cam_idx][:mocap_length]

        assert len(keypoints[subject][action]) == len(
            dataset[subject][action]['positions_3d'])

for subject in keypoints.keys():
    for action in keypoints[subject]:
        for cam_idx, kps in enumerate(keypoints[subject][action]):
            # Normalize camera frame
            cam = dataset.cameras()[subject][cam_idx]
            kps[..., :2] = normalize_screen_coordinates(kps[..., :2],
                                                        w=cam['res_w'],
                                                        h=cam['res_h'])
            keypoints[subject][action][cam_idx] = kps

subjects_train = args['subjects_train'].split(',')
subjects_semi = [] if not args['subjects_unlabeled'] else args[
    'subjects_unlabeled'].split(',')
subjects_test = args['subjects_test'].split(',')


def fetch(subjects, action_filter=None, subset=1, parse_3d_poses=True):
    out_poses_3d = []
    out_poses_2d = []
    out_camera_params = []
    for subject in subjects:
        for action in keypoints[subject].keys():
def reconstruction(args):
    """
    Generate 3D poses from 2D keypoints detected from video, and visualize it
        :param chk_file: The file path of model weight
        :param kps_file: The file path of 2D keypoints
        :param viz_output: The output path of animation
        :param video_path: The input video path
        :param kpts_format: The format of 2D keypoints, like MSCOCO, MPII, H36M, OpenPose. The default format is H36M
    """

    print('Loading 2D keypoints ...')
    keypoints, scores, _, _ = load_json(args.keypoints_file)

    # Loading only one person's keypoints
    if len(keypoints.shape) == 4:
        keypoints = keypoints[0]
    assert len(keypoints.shape) == 3

    # Transform the keypoints format from different dataset (MSCOCO, MPII) to h36m format
    if args.kpts_format == 'coco':
        keypoints, valid_frames = coco_h36m(keypoints)
    elif args.kpts_format == 'mpii':
        keypoints, valid_frames = mpii_h36m(keypoints)
    elif args.kpts_format == 'openpose':
        # Convert 'Openpose' format to MSCOCO
        order_coco = [i for i in range(17) if i != 1]
        keypoints = keypoints[:, order_coco]  # drop the joint at index 1 across all frames
        keypoints, valid_frames = coco_h36m(keypoints)
    else:
        valid_frames = np.where(
            np.sum(keypoints.reshape(-1, 34), axis=1) != 0)[0]
        assert args.kpts_format == 'h36m'

    # Get the width and height of video
    cap = cv2.VideoCapture(args.video_path)
    width = int(round(cap.get(cv2.CAP_PROP_FRAME_WIDTH)))
    height = int(round(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))

    # normalize keypoints
    input_keypoints = normalize_screen_coordinates(keypoints[..., :2],
                                                   w=width,
                                                   h=height)

    if args.frames == 27:
        filter_widths = [3, 3, 3]
        channels = 128
    elif args.frames == 81:
        filter_widths = [3, 3, 3, 3]
        channels = 64
    else:
        filter_widths = [3, 3, 3, 3, 3]
        channels = 32

    model_pos = SpatioTemporalModel(adj,
                                    17,
                                    2,
                                    17,
                                    filter_widths=filter_widths,
                                    channels=channels,
                                    dropout=0.05)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    # load trained model
    print('Loading checkpoint', args.weight)
    chk_file = os.path.join('./checkpoint', args.weight)
    checkpoint = torch.load(chk_file,
                            map_location=lambda storage, loc: storage)
    model_pos.load_state_dict(checkpoint['model_pos'])

    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    print('Reconstructing ...')
    gen = UnchunkedGenerator(None,
                             None, [input_keypoints[valid_frames]],
                             pad=pad,
                             causal_shift=causal_shift,
                             augment=True,
                             kps_left=kps_left,
                             kps_right=kps_right,
                             joints_left=joints_left,
                             joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])

    prediction_new = np.zeros((*input_keypoints.shape[:-1], 3),
                              dtype=np.float32)
    prediction_new[valid_frames] = prediction

    print('Rendering ...')
    anim_output = {'Reconstruction': prediction_new}
    render_animation(keypoints,
                     keypoints_metadata,
                     anim_output,
                     h36m_skeleton,
                     25,
                     3000,
                     np.array(70., dtype=np.float32),
                     args.viz_output,
                     limit=-1,
                     downsample=1,
                     size=5,
                     input_video_path=args.video_path,
                     viewport=(width, height),
                     input_video_skip=0)
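A hypothetical command-line driver for reconstruction(); the flag names simply mirror the attributes the function reads (keypoints_file, video_path, kpts_format, frames, weight, viz_output) and are not taken from any particular repository.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--keypoints-file', dest='keypoints_file', required=True)
    parser.add_argument('--video-path', dest='video_path', required=True)
    parser.add_argument('--kpts-format', dest='kpts_format', default='h36m',
                        choices=['h36m', 'coco', 'mpii', 'openpose'])
    parser.add_argument('--frames', type=int, default=27, choices=[27, 81, 243])
    parser.add_argument('--weight', required=True, help='checkpoint file name under ./checkpoint')
    parser.add_argument('--viz-output', dest='viz_output', default='output.mp4')
    reconstruction(parser.parse_args())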
Example #16
    def __init__(self, path, remove_static_joints=True):
        super().__init__(fps=50, skeleton=h36m_skeleton)
        
        self._cameras = copy.deepcopy(h36m_cameras_extrinsic_params)
        for cameras in self._cameras.values():
            for i, cam in enumerate(cameras):
                cam.update(h36m_cameras_intrinsic_params[i])
                for k, v in cam.items():
                    if k not in ['id', 'res_w', 'res_h']:
                        cam[k] = np.array(v, dtype='float32')
                
                # Normalize camera frame
                cam['center'] = normalize_screen_coordinates(cam['center'], w=cam['res_w'], h=cam['res_h']).astype('float32')
                cam['focal_length'] = cam['focal_length']/cam['res_w']*2
                if 'translation' in cam:
                    cam['translation'] = cam['translation']/1000 # mm to meters
                
                # Add intrinsic parameters vector
                cam['intrinsic'] = np.concatenate((cam['focal_length'],
                                                   cam['center'],
                                                   cam['radial_distortion'],
                                                   cam['tangential_distortion']))
        
        # Load serialized dataset
        data = np.load(path, allow_pickle=True)['positions_3d'].item()
        
        self._data = {}
        for subject, actions in data.items():
            self._data[subject] = {}
            for action_name, positions in actions.items():
                self._data[subject][action_name] = {
                    'positions': positions,
                    'cameras': self._cameras[subject],
                }
                
        if remove_static_joints:
            print("Remove static joints")
            # Bring the skeleton to 17 joints instead of the original 32
            self.remove_joints([4, 5, 9, 10, 11, 16, 20, 21, 22, 23, 24, 28, 29, 30, 31])
            
            # Rewire shoulders to the correct parents
            self._skeleton._parents[11] = 8
            self._skeleton._parents[14] = 8

            #print("parents", self._skeleton.parents())

            # 
            self._skeleton._meta[PELVIS] = {"location":"center", "joint":"pelvis"}
            self._skeleton._meta[SPINE] = {"location":"center", "joint":"spine"}
            self._skeleton._meta[CHEST] = {"location":"center", "joint":"chest"}
            self._skeleton._meta[NECK] = {"location":"center", "joint":"neck"}
            self._skeleton._meta[HEAD] = {"location":"center", "joint":"head"}
            
            self._skeleton._meta[L_HIP] = {"location":"left", "joint":"hip"}
            self._skeleton._meta[L_KNEE] = {"location":"left", "joint":"knee"}
            self._skeleton._meta[L_FOOT] = {"location":"left", "joint":"foot"}
            self._skeleton._meta[L_SHOULDER] = {"location":"left", "joint":"shoulder"}
            self._skeleton._meta[L_ELBOW] = {"location":"left", "joint":"elbow"}
            self._skeleton._meta[L_HAND] = {"location":"left", "joint":"hand"}
            
            self._skeleton._meta[R_HIP] = {"location":"right", "joint":"hip"}
            self._skeleton._meta[R_KNEE] = {"location":"right", "joint":"knee"}
            self._skeleton._meta[R_FOOT] = {"location":"right", "joint":"foot"}
            self._skeleton._meta[R_SHOULDER] = {"location":"right", "joint":"shoulder"}
            self._skeleton._meta[R_ELBOW] = {"location":"right", "joint":"elbow"}
            self._skeleton._meta[R_HAND]  = {"location":"right", "joint":"hand"}
Example #17
def videpose_infer(args):
    from common.camera import normalize_screen_coordinates, camera_to_world, image_coordinates
    from common.generators import UnchunkedGenerator
    from common.model import TemporalModel
    from common.utils import Timer, evaluate, add_path
    from videopose import get_detector_2d, ckpt_time, metadata, time0

    import gene_npz

    gene_npz.args.outputpath = str(args.viz_output / "alpha_pose_kunkun_cut")
    print(gene_npz.args)
    # detector_2d = get_detector_2d(args.detector_2d)
    detector_2d = gene_npz.generate_kpts(args.detector_2d)

    assert detector_2d, 'detector_2d should be in ({alpha, hr, open}_pose)'

    # 2D kpts loads or generate
    if not args.input_npz:
        video_name = args.viz_video
        keypoints = detector_2d(video_name)
    else:
        npz = np.load(args.input_npz)
        keypoints = npz['kpts']  # (N, 17, 2)

    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(
        keypoints_symmetry[0]), list(keypoints_symmetry[1])
    joints_left, joints_right = list(
        [4, 5, 6, 11, 12, 13]), list([1, 2, 3, 14, 15, 16])

    # normalize keypoints, assuming fixed camera parameters (w=1000, h=1002)
    keypoints = normalize_screen_coordinates(
        keypoints[..., :2], w=1000, h=1002)

    model_pos = TemporalModel(17, 2, 17, filter_widths=[3, 3, 3, 3, 3], causal=args.causal, dropout=args.dropout, channels=args.channels,
                              dense=args.dense)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    ckpt, time1 = ckpt_time(time0)
    print('-------------- load data spends {:.2f} seconds'.format(ckpt))

    # load trained model
    chk_filename = os.path.join(
        args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(
        chk_filename, map_location=lambda storage, loc: storage)  # map tensors to CPU storage when loading
    model_pos.load_state_dict(checkpoint['model_pos'])

    ckpt, time2 = ckpt_time(time1)
    print('-------------- load 3D model spends {:.2f} seconds'.format(ckpt))

    #  Receptive field: 243 frames for args.arc [3, 3, 3, 3, 3]
    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    print('Rendering...')
    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None, None, [input_keypoints],
                             pad=pad, causal_shift=causal_shift, augment=args.test_time_augmentation,
                             kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)

    # save 3D joint points
    np.save(args.viz_output / "test_3d_output.npy",
            prediction, allow_pickle=True)

    rot = np.array([0.14070565, -0.15007018, -0.7552408,
                   0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    anim_output = {'Reconstruction': prediction}
    input_keypoints = image_coordinates(
        input_keypoints[..., :2], w=1000, h=1002)

    ckpt, time3 = ckpt_time(time2)
    print(
        '-------------- generate reconstruction 3D data spends {:.2f} seconds'.format(ckpt))

    ckpt, time4 = ckpt_time(time3)
    print('total spend {:2f} second'.format(ckpt))
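An illustrative way to invoke videpose_infer(); the attribute names are copied from the function body, while the concrete values are assumptions.

from argparse import Namespace
from pathlib import Path

args = Namespace(
    viz_output=Path('outputs'),              # directory for the .npy output and 2D pose dump
    detector_2d='alpha_pose',                # which 2D detector gene_npz should run
    input_npz='',                            # or a path to precomputed (N, 17, 2) keypoints
    viz_video='input.mp4',
    causal=False, dropout=0.25, channels=1024, dense=False,
    checkpoint='checkpoint', resume='',
    evaluate='pretrained_h36m_detectron_coco.bin',
    test_time_augmentation=True,
)
videpose_infer(args)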
Example #18
def main():
    dataset_path = "./data/data_3d_h36m.npz"    # 加载数据
    from common.h36m_dataset import Human36mDataset
    dataset = Human36mDataset(dataset_path)
    dataset = read_3d_data(dataset)
    cudnn.benchmark = True
    device = torch.device("cpu")
    from models.sem_gcn import SemGCN
    from common.graph_utils import adj_mx_from_skeleton
    p_dropout = None
    adj = adj_mx_from_skeleton(dataset.skeleton())
    model_pos = SemGCN(adj, 128, num_layers=4, p_dropout=p_dropout,
                       nodes_group=dataset.skeleton().joints_group()).to(device)
    ckpt_path = "./checkpoint/pretrained/ckpt_semgcn_nonlocal_sh.pth.tar"
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model_pos.load_state_dict(ckpt['state_dict'], False)
    model_pos.eval()
    # ============ added code ==============
    # a single person's 2D keypoints, exported from the project's 2D data-processing code
    inputs_2d = [[483.0, 450], [503, 450], [503, 539], [496, 622], [469, 450], [462, 546], [469, 622], [483, 347],
                 [483, 326], [489, 264], [448, 347], [448, 408], [441, 463], [517, 347], [524, 408], [538, 463]]

    # # detection result for the OpenPose test sample
    # inputs_2d = [[86.0, 137], [99, 128], [94, 127], [97, 110], [89, 105], [102, 129], [116, 116], [99, 110],
    #              [105, 93], [117, 69], [147, 63], [104, 93], [89, 69], [82, 38], [89, 139], [94, 140]]

    inputs_2d = np.array(inputs_2d)
    # inputs_2d[:, 1] = np.max(inputs_2d[:, 1]) - inputs_2d[:, 1]   # flip into an upright pose; the raw data is upside down

    cam = dataset.cameras()['S1'][0]    # get the camera parameters
    inputs_2d[..., :2] = normalize_screen_coordinates(inputs_2d[..., :2], w=cam['res_w'], h=cam['res_h'])  # normalize the 2D coordinates

    # plot the normalized 2D keypoints and annotate each one with its index
    print(inputs_2d)    # print the normalized 2D keypoint coordinates
    d_x = inputs_2d[:, 0]
    d_y = inputs_2d[:, 1]
    plt.figure()
    plt.scatter(d_x, d_y)
    for i, txt in enumerate(np.arange(inputs_2d.shape[0])):
        plt.annotate(txt, (d_x[i], d_y[i]))     # label with the joint index
    # plt.show()      # show the normalized 2D keypoints

    # get the 3D result
    inputs_2d = torch.tensor(inputs_2d, dtype=torch.float32)    # convert to a tensor
    outputs_3d = model_pos(inputs_2d).cpu()         # run the model
    outputs_3d[:, :, :] -= outputs_3d[:, :1, :]     # Remove global offset
    predictions = [outputs_3d.detach().numpy()]     # prediction results
    prediction = np.concatenate(predictions)[0]     # concatenate and take the first element
    # Invert camera transformation
    prediction = camera_to_world(prediction, R=cam['orientation'], t=0)     # the exact R and t matter little here; several variants exist depending on which camera parameters are chosen (some lack t)
    prediction[:, 2] -= np.min(prediction[:, 2])    # shift up by min(prediction[:, 2]) so the z coordinates are non-negative
    print('prediction')
    print(prediction)   # print the 3D coordinates that will be plotted
    plt.figure()
    ax = plt.subplot(111, projection='3d')  # create a 3D plot
    o_x = prediction[:, 0]
    o_y = prediction[:, 1]
    o_z = prediction[:, 2]
    print(o_x)
    print(o_y)
    print(o_z)
    ax.scatter(o_x, o_y, o_z)

    temp = o_x
    x = [temp[9], temp[8], temp[7], temp[10], temp[11], temp[12]]
    temp = o_y
    y = [temp[9], temp[8], temp[7], temp[10], temp[11], temp[12]]
    temp = o_z
    z = [temp[9], temp[8], temp[7], temp[10], temp[11], temp[12]]
    ax.plot(x, y, z)

    temp = o_x
    x = [temp[7], temp[0], temp[4], temp[5], temp[6]]
    temp = o_y
    y = [temp[7], temp[0], temp[4], temp[5], temp[6]]
    temp = o_z
    z = [temp[7], temp[0], temp[4], temp[5], temp[6]]
    ax.plot(x, y, z)

    temp = o_x
    x = [temp[0], temp[1], temp[2], temp[3]]
    temp = o_y
    y = [temp[0], temp[1], temp[2], temp[3]]
    temp = o_z
    z = [temp[0], temp[1], temp[2], temp[3]]
    ax.plot(x, y, z)

    temp = o_x
    x = [temp[7], temp[13], temp[14], temp[15]]
    temp = o_y
    y = [temp[7], temp[13], temp[14], temp[15]]
    temp = o_z
    z = [temp[7], temp[13], temp[14], temp[15]]
    ax.plot(x, y, z)

    # temp = o_x
    # x = [temp[0], temp[14]]
    # temp = o_y
    # y = [temp[0], temp[14]]
    # temp = o_z
    # z = [temp[0], temp[14]]
    # ax.plot(y, x, z)
    #
    # temp = o_x
    # x = [temp[0], temp[15]]
    # temp = o_y
    # y = [temp[0], temp[15]]
    # temp = o_z
    # z = [temp[0], temp[15]]
    # ax.plot(y, x, z)

    # rescale the axes so that the z axis appears twice as large as the other axes
    from matplotlib.pyplot import MultipleLocator
    from mpl_toolkits.mplot3d import Axes3D
    major_locator = MultipleLocator(0.5)
    ax.xaxis.set_major_locator(major_locator)
    ax.yaxis.set_major_locator(major_locator)
    ax.zaxis.set_major_locator(major_locator)
    ax.get_proj = lambda: np.dot(Axes3D.get_proj(ax), np.diag([0.5, 0.5, 1, 1]))

    plt.show()
Example #19
def analyze_frame(h, frame):

    boxes, keypoints = infer.inference_on_frame(h['predictor'], frame)

    # step 4: prepare data.
    # take 2d keypoints, that's it
    # first element is empty array, second is our actual frame data, a 3d numpy array with first dimension 1, second and third being the 17 joints of 3 doubles each.
    kp = keypoints[1][0][:2, :].T  # extract (x, y) just like in prepare_data_2d_custom code

    # what to do if kp is NaN or missing data or something?
    # I guess just ignore it

    # they do this  at the end of step4. but we keep it simple, and take the data from step2 directly into a variable.
    #     output[canonical_name]['custom'] = [data[0]['keypoints'].astype('float32')]
    #output_custom_canonical_bullshit = kp.astype('float32')

    # this is what happens at the end of step 4: a file is written that is then loaded at the beginning of step 5.
    #     np.savez_compressed(os.path.join(args.dataoutputdir, output_prefix_2d + args.output), positions_2d=output, metadata=metadata)

    # this is what the original script does.
    # confusingly, keypoints is actually just data, until it is set to keypoints[positions_2d]
    # keypoints = np.load('data/data_2d_' + args.dataset + '_' + args.keypoints + '.npz', allow_pickle=True)

    # step 5: ..... all the other shit
    # starting to copy stuff over from run.py

    # extract dataset from the init dictionary
    dataset = h['dataset']
    keypoints_metadata = h['keypoints_metadata']
    keypoints_symmetry = h['keypoints_symmetry']

    kps_left = h['kps_left']
    kps_right = h['kps_right']
    joints_left = h['joints_left']
    joints_right = h['joints_right']

    # normalize
    for i in range(len(kp)):
        koord = kp[i]
        kp[i] = normalize_screen_coordinates(koord, h['frame_metadata']['w'], h['frame_metadata']['h'])
    #for kps in enumerate(keypoints):
    #    kps[..., :2] = normalize_screen_coordinates(kps[..., :2], frame_metadata['w'], frame_metadata['h'])

    # this is taken from the args.architecture and run.py and just hardcoded, skipping a lot of nonsense
    filter_widths = [int(x) for x in "3,3,3,3,3".split(',')]
    skeleton_num_joints = dataset.skeleton().num_joints()
    #skeleton_num_joints = 17

    causal = True
    dropout = 0.25
    channels = 1024
    dense = False

    model_pos_train = TemporalModelOptimized1f(kp.shape[-2], kp.shape[-1], skeleton_num_joints,
                                               filter_widths=filter_widths, causal=causal, dropout=dropout,
                                               channels=channels)
    model_pos = TemporalModel(kp.shape[-2], kp.shape[-1], skeleton_num_joints,
                                         filter_widths=filter_widths, causal=causal, dropout=dropout,
                                         channels=channels, dense=dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    #if args.causal:
    #    print('INFO: Using causal convolutions')
    #    causal_shift = pad
    #else:
    #    causal_shift = 0
    causal_shift = pad

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()
        model_pos_train = model_pos_train.cuda()

    #if args.resume or args.evaluate:
    if True:
        chk_filename = "checkpoint/pretrained_h36m_detectron_coco.bin"
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(checkpoint['epoch']))
        model_pos_train.load_state_dict(checkpoint['model_pos'])
        model_pos.load_state_dict(checkpoint['model_pos'])

        # false in our particular case... we might benefit from getting rid of model_traj,
        # unless it's super fast then we should just keep it in case we ever upgrade
        if 'model_traj' in checkpoint:
            # Load trajectory model if it contained in the checkpoint (e.g. for inference in the wild)
            model_traj = TemporalModel(kp.shape[-2], kp.shape[-1], 1,
                                filter_widths=filter_widths, causal=causal, dropout=dropout, channels=channels,
                                dense=dense)
            if torch.cuda.is_available():
                model_traj = model_traj.cuda()
            model_traj.load_state_dict(checkpoint['model_traj'])
        else:
            model_traj = None

    test_generator = UnchunkedGenerator(None, None, kp,
                                        pad=pad, causal_shift=causal_shift, augment=False,
                                        kps_left=kps_left, kps_right=kps_right,
                                        joints_left=joints_left, joints_right=joints_right)
    print('INFO: Testing on {} frames'.format(test_generator.num_frames()))

    # Evaluate
    def evaluate(eval_generator, action=None, return_predictions=False, use_trajectory_model=False):
        epoch_loss_3d_pos = 0
        epoch_loss_3d_pos_procrustes = 0
        epoch_loss_3d_pos_scale = 0
        epoch_loss_3d_vel = 0
        with torch.no_grad():
            if not use_trajectory_model:
                model_pos.eval()
            else:
                model_traj.eval()
            N = 0
            for _, batch, batch_2d in eval_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if torch.cuda.is_available():
                    inputs_2d = inputs_2d.cuda()

                # Positional model
                if not use_trajectory_model:
                    predicted_3d_pos = model_pos(inputs_2d)
                else:
                    predicted_3d_pos = model_traj(inputs_2d)

                # Test-time augmentation (if enabled)
                if eval_generator.augment_enabled():
                    # Undo flipping and take average with non-flipped version
                    predicted_3d_pos[1, :, :, 0] *= -1
                    if not use_trajectory_model:
                        predicted_3d_pos[1, :, joints_left + joints_right] = predicted_3d_pos[1, :, joints_right + joints_left]
                    predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True)

                if return_predictions:
                    return predicted_3d_pos.squeeze(0).cpu().numpy()

                inputs_3d = torch.from_numpy(batch.astype('float32'))
                if torch.cuda.is_available():
                    inputs_3d = inputs_3d.cuda()
                inputs_3d[:, :, 0] = 0
                if eval_generator.augment_enabled():
                    inputs_3d = inputs_3d[:1]

                error = mpjpe(predicted_3d_pos, inputs_3d)
                epoch_loss_3d_pos_scale += inputs_3d.shape[0]*inputs_3d.shape[1] * n_mpjpe(predicted_3d_pos, inputs_3d).item()

                epoch_loss_3d_pos += inputs_3d.shape[0]*inputs_3d.shape[1] * error.item()
                N += inputs_3d.shape[0] * inputs_3d.shape[1]

                inputs = inputs_3d.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                predicted_3d_pos = predicted_3d_pos.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])

                epoch_loss_3d_pos_procrustes += inputs_3d.shape[0]*inputs_3d.shape[1] * p_mpjpe(predicted_3d_pos, inputs)

                # Compute velocity error
                epoch_loss_3d_vel += inputs_3d.shape[0]*inputs_3d.shape[1] * mean_velocity_error(predicted_3d_pos, inputs)

        if action is None:
            print('----------')
        else:
            print('----'+action+'----')
        e1 = (epoch_loss_3d_pos / N)*1000
        e2 = (epoch_loss_3d_pos_procrustes / N)*1000
        e3 = (epoch_loss_3d_pos_scale / N)*1000
        ev = (epoch_loss_3d_vel / N)*1000
        print('Test time augmentation:', eval_generator.augment_enabled())
        print('Protocol #1 Error (MPJPE):', e1, 'mm')
        print('Protocol #2 Error (P-MPJPE):', e2, 'mm')
        print('Protocol #3 Error (N-MPJPE):', e3, 'mm')
        print('Velocity Error (MPJVE):', ev, 'mm')
        print('----------')

        return e1, e2, e3, ev

    image_keypoints2d = kp
    gen = UnchunkedGenerator(None, None, [[image_keypoints2d]],
                             pad=pad, causal_shift=causal_shift, augment=False,
                             kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, return_predictions=True)

    # here is the data format
    # public enum VideoPose3dJointOrder
    # {
    #     HIP = 0,
    #     R_HIP = 1,
    #     R_KNEE = 2,
    #     R_FOOT = 3,
    #     L_HIP = 4,
    #     L_KNEE = 5,
    #     L_FOOT = 6,
    #     SPINE = 7,
    #     THORAX = 8,
    #     NOSE = 9,
    #     HEAD = 10,
    #     L_SHOULDER = 11,
    #     L_ELBOW = 12,
    #     L_WRIST = 13,
    #     R_SHOULDER = 14,
    #     R_ELBOW = 15,
    #     R_WRIST = 16
    # }

    # this bugs out. dunno what the hell they were trying to do.
    # anyway we can fix it by just getting width/height some other way.

    # Invert camera transformation
    cam = dataset.cameras()

    width = cam['frame'][0]['res_w']
    height = cam['frame'][0]['res_h']

    image_keypoints2d = image_coordinates(image_keypoints2d[..., :2], w=width, h=height)

    viz_camera = 0

    # If the ground truth is not available, take the camera extrinsic params from a random subject.
    # They are almost the same, and anyway, we only need this for visualization purposes.
    for subject in dataset.cameras():
        if 'orientation' in dataset.cameras()[subject][viz_camera]:
            rot = dataset.cameras()[subject][viz_camera]['orientation']
            break
    prediction = camera_to_world(prediction, R=rot, t=0)
    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])

    # because algo was meant for a list of frames, we take the first frame (our only frame)
    prediction3d = prediction[0]

    return prediction3d, image_keypoints2d

    # do we want to visualize? this code used to write to json and create a video for visualization
    #if args.viz_output is not None:
    if True:

        anim_output = {'Reconstruction': prediction}

        # format the data in the same format as mediapipe, so we can load it in unity with the same script
        # we need a list (frames) of lists of 3d landmarks.
        unity_landmarks = prediction.tolist()

        # how to send data? or display it?
        # maybe draw it on the webcam feed....?!?!?!


        #with open(args.output_json, "w") as json_file:
        #    json.dump(unity_landmarks, json_file)

        #if args.rendervideo == "yes":
        #    from common.visualization import render_animation
        #    render_animation(input_keypoints, keypoints_metadata, anim_output,
        #                     dataset.skeleton(), dataset.fps(), args.viz_bitrate, cam['azimuth'], args.viz_output,
        #                     limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
        #                     input_video_path=args.viz_video, viewport=(cam['res_w'], cam['res_h']),
        #                     input_video_skip=args.viz_skip)

    we_re_done_here = 1
Example #20
def main(input_args):
    vp3d_dir = input_args.vp3d_dir
    sys.path.append(vp3d_dir)

    from common.camera import normalize_screen_coordinates
    from common.model import TemporalModel
    from common.generators import UnchunkedGenerator
    from common.arguments import parse_args

    args = parse_args()
    print(args)

    kps_left = [4, 5, 6, 11, 12, 13]
    kps_right = [1, 2, 3, 14, 15, 16]
    joints_left = [4, 5, 6, 11, 12, 13]
    joints_right = [1, 2, 3, 14, 15, 16]

    filter_widths = [int(x) for x in args.architecture.split(',')]

    num_joints_in = 17
    in_features = 2
    num_joints_out = 17
        
    model_pos = TemporalModel(num_joints_in, in_features, num_joints_out,
                                filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels,
                                dense=args.dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2 # Padding on each side
    if args.causal:
        print('INFO: Using causal convolutions')
        causal_shift = pad
    else:
        causal_shift = 0

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()
        
    if args.resume or args.evaluate:
        chk_filename = os.path.join(vp3d_dir, args.checkpoint, args.resume if args.resume else args.evaluate)
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(checkpoint['epoch']))
        model_pos.load_state_dict(checkpoint['model_pos'])

    # Evaluate
    def evaluate(test_generator, action=None, return_predictions=False):
        epoch_loss_3d_pos = 0
        epoch_loss_3d_pos_procrustes = 0
        epoch_loss_3d_pos_scale = 0
        epoch_loss_3d_vel = 0
        with torch.no_grad():
            model_pos.eval()
            N = 0
            for _, batch, batch_2d in test_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if torch.cuda.is_available():
                    inputs_2d = inputs_2d.cuda()
                # Positional model
                predicted_3d_pos = model_pos(inputs_2d)

                # Test-time augmentation (if enabled)
                if test_generator.augment_enabled():
                    # Undo flipping and take average with non-flipped version
                    predicted_3d_pos[1, :, :, 0] *= -1
                    predicted_3d_pos[1, :, joints_left + joints_right] = predicted_3d_pos[1, :, joints_right + joints_left]
                    predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True)
                    
                if return_predictions:
                    return predicted_3d_pos.squeeze(0).cpu().numpy()
                    
                inputs_3d = torch.from_numpy(batch.astype('float32'))
                if torch.cuda.is_available():
                    inputs_3d = inputs_3d.cuda()
                inputs_3d[:, :, 0] = 0    
                if test_generator.augment_enabled():
                    inputs_3d = inputs_3d[:1]

                error = mpjpe(predicted_3d_pos, inputs_3d)
                epoch_loss_3d_pos_scale += inputs_3d.shape[0]*inputs_3d.shape[1] * n_mpjpe(predicted_3d_pos, inputs_3d).item()

                epoch_loss_3d_pos += inputs_3d.shape[0]*inputs_3d.shape[1] * error.item()
                N += inputs_3d.shape[0] * inputs_3d.shape[1]
                
                inputs = inputs_3d.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                predicted_3d_pos = predicted_3d_pos.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])

                epoch_loss_3d_pos_procrustes += inputs_3d.shape[0]*inputs_3d.shape[1] * p_mpjpe(predicted_3d_pos, inputs)

                # Compute velocity error
                epoch_loss_3d_vel += inputs_3d.shape[0]*inputs_3d.shape[1] * mean_velocity_error(predicted_3d_pos, inputs)
                
        if action is None:
            print('----------')
        else:
            print('----'+action+'----')
        e1 = (epoch_loss_3d_pos / N)*1000
        e2 = (epoch_loss_3d_pos_procrustes / N)*1000
        e3 = (epoch_loss_3d_pos_scale / N)*1000
        ev = (epoch_loss_3d_vel / N)*1000
        print('Test time augmentation:', test_generator.augment_enabled())
        print('Protocol #1 Error (MPJPE):', e1, 'mm')
        print('Protocol #2 Error (P-MPJPE):', e2, 'mm')
        print('Protocol #3 Error (N-MPJPE):', e3, 'mm')
        print('Velocity Error (MPJVE):', ev, 'mm')
        print('----------')

        return e1, e2, e3, ev

    def get_gt_dirs(input_path, camera_id='dev3'):
        """Get all directories with ground-truth 2D human pose annotations
        """
        gt_path_list = []
        category_path_list = get_subdirs(input_path)
        for category in category_path_list:
            if os.path.basename(category) != 'Calibration':
                category_scans = get_subdirs(category)
                for category_scan in category_scans:
                    device_list = get_subdirs(category_scan)
                    for device_path in device_list:
                        if camera_id in device_path:
                            if os.path.exists(os.path.join(device_path, 'pose2d')): # 2D annotations exist
                                gt_path_list.append(device_path) # eg <root>/Lack_TV_Bench/0007_white_floor_08_04_2019_08_28_10_47/dev3
        return gt_path_list

    def get_subdirs(input_path):
        '''
        get a list of subdirectories in input_path directory
        :param input_path: parent directory (in which to get the subdirectories)
        :return:
        subdirs: list of subdirectories in input_path
        '''
        subdirs = [os.path.join(input_path, dir_i) for dir_i in os.listdir(input_path)
                   if os.path.isdir(os.path.join(input_path, dir_i))]
        subdirs.sort()
        return subdirs

    fps = 30
    frame_width = 1920.0
    frame_height = 1080.0

    h36m_joint_names = get_h36m_joint_names()
    h36m_joint_names_dict = {name: i for i, name in enumerate(h36m_joint_names)}
    joint_names = get_body25_joint_names()
    joint_names_dict = {name: i for i, name in enumerate(joint_names)}

    dataset_dir = input_args.dataset_dir
    camera_id = input_args.camera_id

    gt_dirs = get_gt_dirs(dataset_dir, camera_id)
    for i, gt_dir in enumerate(gt_dirs):
        print(f"\nProcessing {i} of {len(gt_dirs)}: {' '.join(gt_dir.split('/')[-3:-1])}")
        
        input_dir = os.path.join(gt_dir, 'predictions', 'pose2d', 'openpose')
        output_dir = os.path.join(gt_dir, 'predictions', 'pose3d', 'vp3d')
        os.makedirs(output_dir, exist_ok=True)

        json_mask = os.path.join(input_dir, 'scan_video_00000000????_keypoints.json')
        json_files = sorted(glob(json_mask))
        input_keypoints = []
        for json_file in json_files:
            with open(json_file, 'r') as f:
                pose2d = json.load(f)
            if len(pose2d["people"]) == 0:
                keypoints_op = np.zeros((19, 3))
            else:
                keypoints_op = np.array(pose2d["people"][0]["pose_keypoints_2d"]).reshape(-1, 3) # Takes first detected person every time...
            keypoints = np.zeros((17, 3))
            for i, joint_name in enumerate(h36m_joint_names):
                if joint_name == 'spine' or joint_name == 'head':
                    continue
                joint_id = joint_names_dict[joint_name]
                keypoints[i, :] = keypoints_op[joint_id, :]
            keypoints[h36m_joint_names_dict['mid hip'], :] = np.mean((keypoints[h36m_joint_names_dict['left hip'], :], keypoints[h36m_joint_names_dict['right hip'], :]), axis=0) # mid hip = mean(left hip, right hip)
            keypoints[h36m_joint_names_dict['spine'], :] = np.mean((keypoints[h36m_joint_names_dict['neck'], :], keypoints[h36m_joint_names_dict['mid hip'], :]), axis=0) # spine = mean(neck, mid hip)
            keypoints[h36m_joint_names_dict['head'], :] = np.mean((keypoints_op[joint_names_dict['left ear'], :], keypoints_op[joint_names_dict['right ear'], :]), axis=0) # head = mean(left ear, right ear)
            input_keypoints.append(keypoints)
        input_keypoints = np.array(input_keypoints)

        input_keypoints = input_keypoints[:, :, :2] # For pretrained_h36m_cpn.bin and cpn_ft_h36m_dbb

        input_keypoints[..., :2] = normalize_screen_coordinates(input_keypoints[..., :2], w=frame_width, h=frame_height)

        args.test_time_augmentation=True
        gen = UnchunkedGenerator(None, None, [input_keypoints],
                                 pad=pad, causal_shift=causal_shift, augment=args.test_time_augmentation,
                                 kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right)
        prediction = evaluate(gen, return_predictions=True) # Nx17x3

        pickle.dump(prediction, open(os.path.join(output_dir, 'vp3d_output.pkl'), "wb"))
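A hypothetical entry point for main(); the argument names mirror the attributes read inside the function (vp3d_dir, dataset_dir, camera_id), and the paths are placeholders. Note that main() also re-parses sys.argv through common.arguments.parse_args, so the usual VideoPose3D flags (e.g. --evaluate, --architecture) still come from the command line.

from argparse import Namespace

input_args = Namespace(
    vp3d_dir='/path/to/VideoPose3D',
    dataset_dir='/path/to/dataset_root',
    camera_id='dev3',
)
main(input_args)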