def gen_pose_frame(kpts, width, height, model_pos, pad, causal_shift=0):
    # kpts: (M, T, N, 2)
    norm_seqs = []
    for kpt in kpts:
        norm_kpt = normalize_screen_coordinates(kpt, w=width, h=height)
        norm_seqs.append(norm_kpt)

    gen = UnchunkedGenerator(None, None, norm_seqs, pad=pad, causal_shift=causal_shift,
                             augment=True, kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos)

    prediction_to_world = []
    for i in range(len(prediction)):
        sub_prediction = prediction[i][0]
        sub_prediction = camera_to_world(sub_prediction, R=rot, t=0)
        sub_prediction[:, 2] -= np.amin(sub_prediction[:, 2])
        prediction_to_world.append(sub_prediction)

    return prediction_to_world
def interface(model_pos, keypoints, W, H):
    # input (N, 17, 2), return (N, 17, 3)
    if not isinstance(keypoints, np.ndarray):
        keypoints = np.array(keypoints)

    from common.camera import normalize_screen_coordinates_new, camera_to_world, normalize_screen_coordinates
    # keypoints = normalize_screen_coordinates_new(keypoints[..., :2], w=W, h=H)
    # NOTE: W and H are currently ignored; normalization is hardcoded to the
    # Human3.6M camera resolution (1000 x 1002).
    keypoints = normalize_screen_coordinates(keypoints[..., :2], w=1000, h=1002)
    input_keypoints = keypoints.copy()

    # test_time_augmentation True
    from common.generators import UnchunkedGenerator
    gen = UnchunkedGenerator(None, None, [input_keypoints], pad=common.pad, causal_shift=common.causal_shift,
                             augment=True, kps_left=common.kps_left, kps_right=common.kps_right,
                             joints_left=common.joints_left, joints_right=common.joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)
    prediction = camera_to_world(prediction, R=common.rot, t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    return prediction
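# Hedged usage sketch for interface() (names below are assumptions: it requires
# a loaded `model_pos` and a `common` module exposing pad/causal_shift/rot and
# the left/right symmetry index lists):
# dummy_kpts = np.zeros((100, 17, 2), dtype=np.float32)  # (N, 17, 2) pixel coords
# pose_3d = interface(model_pos, dummy_kpts, W=1000, H=1002)  # -> (N, 17, 3)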
def __init__(self, path, remove_static_joints=True):
    super().__init__(fps=50, skeleton=h36m_skeleton)  # [SWAPPED ORDER]

    # Load serialized dataset
    data = np.load(path, allow_pickle=True)['positions_3d'].item()

    self._cameras = Human36mDataset.__ext_gen(data)
    # self._cameras = copy.deepcopy(h36m_cameras_extrinsic_params)
    for cameras in self._cameras.values():
        for i, cam in enumerate(cameras):
            # Use the same camera for each view and eliminate distortion
            # cam.update(h36m_cameras_intrinsic_params[i])
            cam.update(modified_cameras_intrinsic_params[i])
            for k, v in cam.items():
                if k not in ['id', 'res_w', 'res_h']:
                    cam[k] = np.array(v, dtype='float32')

            # Normalize camera frame
            cam['center'] = normalize_screen_coordinates(cam['center'], w=cam['res_w'], h=cam['res_h']).astype('float32')
            cam['focal_length'] = cam['focal_length'] / cam['res_w'] * 2
            if 'translation' in cam:
                cam['translation'] = cam['translation'] / 1000  # mm to meters

            # Add intrinsic parameters vector
            cam['intrinsic'] = np.concatenate((cam['focal_length'],
                                               cam['center'],
                                               cam['radial_distortion'],
                                               cam['tangential_distortion']))

    # # Load serialized dataset
    # data = np.load(path, allow_pickle=True)['positions_3d'].item()

    self._data = {}
    for subject, actions in data.items():
        self._data[subject] = {}
        for action_name, positions in actions.items():
            self._data[subject][action_name] = {
                'positions': positions,
                'cameras': self._cameras[subject],
            }

    if remove_static_joints:
        # Bring the skeleton to 17 joints instead of the original 32
        self.remove_joints([4, 5, 9, 10, 11, 16, 20, 21, 22, 23, 24, 28, 29, 30, 31])

        # Rewire shoulders to the correct parents
        self._skeleton._parents[11] = 8
        self._skeleton._parents[14] = 8
def __init__(self, path, keypoints_type='cnp_ft_h36m_dbb', remove_static_joints=True):
    super().__init__(fps=50, skeleton=h36m_skeleton)
    self._cameras = copy.deepcopy(h36m_cameras_extrinsic_params)
    for cameras in self._cameras.values():
        for i, cam in enumerate(cameras):
            cam.update(h36m_cameras_intrinsic_params[i])
            for k, v in cam.items():
                if k not in ["id", "res_w", "res_h"]:
                    cam[k] = np.array(v, dtype="float32")

            # Normalize camera frame
            cam["center"] = normalize_screen_coordinates(cam["center"], w=cam["res_w"], h=cam["res_h"]).astype("float32")
            cam["focal_length"] = cam["focal_length"] / cam["res_w"] * 2
            if "translation" in cam:
                cam["translation"] = cam["translation"] / 1000  # mm to meters

            # Add intrinsic parameters vectors
            cam["intrinsic"] = np.concatenate((cam["focal_length"], cam["center"],
                                               cam["radial_distortion"], cam["tangential_distortion"]))

    # Load serialized dataset
    data = np.load(path, allow_pickle=True)["positions_3d"].item()

    self._data = {}
    for subject, actions in data.items():
        self._data[subject] = {}
        for action_name, positions in actions.items():
            self._data[subject][action_name] = {
                "positions": positions,
                "cameras": self._cameras[subject]
            }

    print(keypoints_type)
    if remove_static_joints:
        # NOTE: the original condition mixed `and`/`or` without parentheses, so
        # the joint removal could fire even when remove_static_joints was False;
        # the nested structure below reflects the evident intent.
        if keypoints_type in ('sh_ft_h36m', 'sh_pt_mpii'):
            # Bring the skeleton to 16 joints instead of the original 32
            joints = []
            for i, x in enumerate(H36M_NAMES):
                if x == '' or x == 'Neck/Nose':  # Remove 'Nose' so SH and H36M 2D poses have the same dimension
                    joints.append(i)
            self.remove_joints(joints)

            # Rewire shoulders to the correct parents
            self._skeleton._parents[10] = 8
            self._skeleton._parents[13] = 8
        else:
            # Bring the skeleton to 17 joints instead of the original 32
            self.remove_joints([4, 5, 9, 10, 11, 16, 20, 21, 22, 23, 24, 28, 29, 30, 31])

            # Rewire shoulders to the correct parents
            self._skeleton._parents[11] = 8
            self._skeleton._parents[14] = 8
def preprocess():
    for subject in dataset.subjects():
        assert subject in keypoints, 'Subject {} is missing from the 2D detections dataset'.format(subject)
        for action in dataset[subject].keys():
            assert action in keypoints[subject], 'Action {} of subject {} is missing from the 2D detections dataset'.format(action, subject)

            if 'positions_3d' not in dataset[subject][action]:
                continue

            for cam_idx in range(len(keypoints[subject][action])):
                # We check for >= instead of == because some videos in H3.6M contain extra frames
                mocap_length = dataset[subject][action]['positions_3d'][cam_idx].shape[0]
                assert keypoints[subject][action][cam_idx].shape[0] >= mocap_length

                if keypoints[subject][action][cam_idx].shape[0] > mocap_length:
                    # Shorten sequence
                    keypoints[subject][action][cam_idx] = keypoints[subject][action][cam_idx][:mocap_length]

            assert len(keypoints[subject][action]) == len(dataset[subject][action]['positions_3d'])

    for subject in keypoints.keys():
        for action in keypoints[subject]:
            for cam_idx, kps in enumerate(keypoints[subject][action]):
                # Normalize camera frame
                cam = dataset.cameras()[subject][cam_idx]
                if args.keypoint_probs:
                    kps[..., :2] = normalize_screen_coordinates(kps[..., :2], w=cam['res_w'], h=cam['res_h'])
                else:
                    kps = normalize_screen_coordinates(kps[..., :2], w=cam['res_w'], h=cam['res_h'])
                keypoints[subject][action][cam_idx] = kps
def __init__(self, path, remove_static_joints=True):
    super(Human36mDataset, self).__init__(skeleton=h36m_skeleton, fps=50)

    self._cameras = copy.deepcopy(h36m_cameras_extrinsic_params)
    for cameras in self._cameras.values():
        for i, cam in enumerate(cameras):
            cam.update(h36m_cameras_intrinsic_params[i])
            for k, v in cam.items():
                if k not in ['id', 'res_w', 'res_h']:
                    cam[k] = np.array(v, dtype='float32')

            # Normalize camera frame
            cam['center'] = normalize_screen_coordinates(cam['center'], w=cam['res_w'], h=cam['res_h']).astype('float32')
            cam['focal_length'] = cam['focal_length'] / cam['res_w'] * 2.0
            if 'translation' in cam:
                cam['translation'] = cam['translation'] / 1000  # mm to meters

            # Add intrinsic parameters vector
            cam['intrinsic'] = np.concatenate((cam['focal_length'],
                                               cam['center'],
                                               cam['radial_distortion'],
                                               cam['tangential_distortion']))

    # Load serialized dataset
    data = np.load(path, allow_pickle=True)['positions_3d'].item()

    self._data = {}
    for subject, actions in data.items():
        self._data[subject] = {}
        for action_name, positions in actions.items():
            self._data[subject][action_name] = {
                'positions': positions,
                'cameras': self._cameras[subject],
            }

    if remove_static_joints:
        # Bring the skeleton to 16 joints instead of the original 32
        joints = []
        for i, x in enumerate(H36M_NAMES):
            # Remove 'Nose' to make SH (Stacked Hourglass) and H36M 2D poses have the same dimension
            if x == '' or x == 'Neck/Nose':
                # if x == '':
                joints.append(i)
        self.remove_joints(joints)

        # Rewire shoulders to the correct parents
        self._skeleton._parents[10] = 8
        self._skeleton._parents[13] = 8

        # Set joints group
        self._skeleton._joints_group = h36m_skeleton_joints_group
def __init__(self, path, remove_static_joints=True):
    super(Human36mDataset, self).__init__(skeleton=h36m_skeleton, fps=50)  # Initialize the parent class

    self._cameras = copy.deepcopy(h36m_cameras_extrinsic_params)  # Camera extrinsic parameters of the H3.6M dataset
    for cameras in self._cameras.values():
        for i, cam in enumerate(cameras):
            cam.update(h36m_cameras_intrinsic_params[i])  # Merge the intrinsic parameters into cam (the extrinsics dict)
            for k, v in cam.items():  # Walk the keys and values of the dict
                if k not in ['id', 'res_w', 'res_h']:  # Skip the non-numeric keys
                    cam[k] = np.array(v, dtype='float32')  # Convert every value to a float32 np.array

            # Normalize camera frame
            cam['center'] = normalize_screen_coordinates(cam['center'], w=cam['res_w'], h=cam['res_h']).astype('float32')  # Normalize the principal point
            cam['focal_length'] = cam['focal_length'] / cam['res_w'] * 2.0  # Normalize the focal length
            if 'translation' in cam:  # Extrinsic parameters carry a 'translation' key
                cam['translation'] = cam['translation'] / 1000  # mm to meters

            # Add intrinsic parameters vector (array concatenation)
            cam['intrinsic'] = np.concatenate((cam['focal_length'],            # focal length
                                               cam['center'],                  # principal point
                                               cam['radial_distortion'],       # radial distortion coefficients
                                               cam['tangential_distortion']))  # tangential distortion coefficients

    # Load serialized dataset
    data = np.load(path, allow_pickle=True)['positions_3d'].item()

    self._data = {}
    for subject, actions in data.items():
        self._data[subject] = {}
        for action_name, positions in actions.items():
            self._data[subject][action_name] = {
                'positions': positions,             # joint positions
                'cameras': self._cameras[subject],  # camera parameters
            }

    if remove_static_joints:  # Remove static joints
        # Bring the skeleton to 16 joints instead of the original 32
        joints = []
        for i, x in enumerate(H36M_NAMES):
            if x == '' or x == 'Neck/Nose':  # Remove 'Nose' to make SH and H36M 2D poses have the same dimension
                joints.append(i)
        self.remove_joints(joints)

        # Rewire shoulders to the correct parents
        self._skeleton._parents[10] = 8
        self._skeleton._parents[13] = 8

        # Set joints group
        self._skeleton._joints_group = h36m_skeleton_joints_group
def create_2d_data(data_path, dataset):
    keypoints = np.load(data_path, allow_pickle=True)
    keypoints = keypoints['positions_2d'].item()

    for subject in keypoints.keys():
        for action in keypoints[subject]:
            for cam_idx, kps in enumerate(keypoints[subject][action]):
                # Normalize camera frame
                cam = dataset.cameras()[subject][cam_idx]
                kps[..., :2] = normalize_screen_coordinates(kps[..., :2], w=cam['res_w'], h=cam['res_h'])
                keypoints[subject][action][cam_idx] = kps

    return keypoints
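# Hedged usage sketch for create_2d_data(); the file names and action name are
# illustrative placeholders following the VideoPose3D data conventions:
# dataset = Human36mDataset('data/data_3d_h36m.npz')
# keypoints = create_2d_data('data/data_2d_h36m_cpn_ft_h36m_dbb.npz', dataset)
# kps = keypoints['S1']['Walking 1'][0]  # (T, 17, 2), normalized to [-1, 1] in x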
def __init__(self, path, remove_static_joints=True):
    super().__init__(fps=50, skeleton=h36m_skeleton)
    self._cameras = copy.deepcopy(h36m_cameras_extrinsic_params)
    for cameras in self._cameras.values():
        for i, cam in enumerate(cameras):
            cam.update(h36m_cameras_intrinsic_params[i])
            for k, v in cam.items():
                if k not in ["id", "res_w", "res_h"]:
                    cam[k] = np.array(v, dtype="float32")

            # Normalize camera frame
            cam["center"] = normalize_screen_coordinates(cam["center"], w=cam["res_w"], h=cam["res_h"]).astype("float32")
            cam["focal_length"] = cam["focal_length"] / cam["res_w"] * 2
            if "translation" in cam:
                cam["translation"] = cam["translation"] / 1000  # mm to meters

            # Add intrinsic parameters vectors
            cam["intrinsic"] = np.concatenate((cam["focal_length"], cam["center"],
                                               cam["radial_distortion"], cam["tangential_distortion"]))

    # Load serialized dataset
    data = np.load(path, allow_pickle=True)["positions_3d"].item()

    self._data = {}
    for subject, actions in data.items():
        self._data[subject] = {}
        for action_name, positions in actions.items():
            self._data[subject][action_name] = {
                "positions": positions,
                "cameras": self._cameras[subject]
            }

    if remove_static_joints:
        # Bring the skeleton to 17 joints instead of the original 32
        self.remove_joints([4, 5, 9, 10, 11, 16, 20, 21, 22, 23, 24, 28, 29, 30, 31])

        # Rewire shoulders to the correct parents
        self._skeleton._parents[11] = 8
        self._skeleton._parents[14] = 8
def __init__(self, path, remove_static_joints=True):
    super().__init__(fps=50, skeleton=h36m_skeleton)
    self._cameras = copy.deepcopy(h36m_cameras_extrinsic_params)
    for cameras in self._cameras.values():
        for i, cam in enumerate(cameras):
            cam.update(h36m_cameras_intrinsic_params[i])
            for k, v in cam.items():
                if k not in ['id', 'res_w', 'res_h']:
                    cam[k] = np.array(v, dtype='float32')

            # Normalize camera frame
            cam['center'] = normalize_screen_coordinates(cam['center'], w=cam['res_w'], h=cam['res_h']).astype('float32')
            cam['focal_length'] = cam['focal_length'] / cam['res_w'] * 2
            if 'translation' in cam:
                cam['translation'] = cam['translation'] / 1000  # mm to meters

            # Add intrinsic parameters vector
            cam['intrinsic'] = np.concatenate((cam['focal_length'],
                                               cam['center'],
                                               cam['radial_distortion'],
                                               cam['tangential_distortion']))

    # Load serialized dataset (NumPy >= 1.16.3 requires allow_pickle=True for object arrays)
    data = np.load(path, allow_pickle=True)['positions_3d'].item()

    self._data = {}
    for subject, actions in data.items():
        self._data[subject] = {}
        for action_name, positions in actions.items():
            self._data[subject][action_name] = {
                'positions': positions,
                'cameras': self._cameras[subject],
            }

    if remove_static_joints:
        # Bring the skeleton to 17 joints instead of the original 32
        self.remove_joints([4, 5, 9, 10, 11, 16, 20, 21, 22, 23, 24, 28, 29, 30, 31])

        # Rewire shoulders to the correct parents
        self._skeleton._parents[11] = 8
        self._skeleton._parents[14] = 8
def init_keypoints(self):
    self.keypoints = np.load(self.dataset_path, allow_pickle=True)
    keypoints_metadata = self.keypoints['metadata'].item()
    keypoints_symmetry = keypoints_metadata['keypoints_symmetry']
    self.keypoints_left, self.keypoints_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1])
    self.joints_left, self.joints_right = list(self.dataset.skeleton().joints_left()), list(self.dataset.skeleton().joints_right())
    self.keypoints = self.keypoints['positions_2d'].item()

    for subject in self.keypoints.keys():
        for action in self.keypoints[subject]:
            for cam_idx, kps in enumerate(self.keypoints[subject][action]):
                # Normalize camera frame
                cam = self.dataset.cameras()[subject][cam_idx]
                kps[..., :2] = normalize_screen_coordinates(kps[..., :2], w=cam['res_w'], h=cam['res_h'])
                self.keypoints[subject][action][cam_idx] = kps
def gen_pose(kpts, valid_frames, width, height, model_pos, pad, causal_shift=0):
    assert len(kpts.shape) == 4, 'The shape of kpts: {}'.format(kpts.shape)
    assert kpts.shape[0] == len(valid_frames)

    norm_seqs = []
    for index, frames in enumerate(valid_frames):
        seq_kps = kpts[index, frames]
        norm_seq_kps = normalize_screen_coordinates(seq_kps, w=width, h=height)
        norm_seqs.append(norm_seq_kps)

    gen = UnchunkedGenerator(None, None, norm_seqs, pad=pad, causal_shift=causal_shift,
                             augment=True, kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos)

    prediction_to_world = []
    for i in range(len(prediction)):
        sub_prediction = prediction[i]

        sub_prediction = camera_to_world(sub_prediction, R=rot, t=0)

        # sub_prediction[:, :, 2] -= np.expand_dims(np.amin(sub_prediction[:, :, 2], axis=1), axis=1).repeat([17], axis=1)
        # sub_prediction[:, :, 2] -= np.amin(sub_prediction[:, :, 2])

        prediction_to_world.append(sub_prediction)

    # prediction_to_world = np.asarray(prediction_to_world, dtype=np.float32)

    return prediction_to_world
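# Hedged sketch of one way to build the valid_frames argument for gen_pose(),
# mirroring the all-zero-frame test used in reconstruction() further below
# (`kpts` is assumed to be an (M, T, 17, 2) array of per-person 2D keypoints):
# valid_frames = [
#     np.where(np.sum(person.reshape(person.shape[0], -1), axis=1) != 0)[0]
#     for person in kpts
# ]
# predictions = gen_pose(kpts, valid_frames, width, height, model_pos, pad)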
def gen_pose_frame_(kpts, width, height, model_pos, pad, causal_shift=0):
    # input (N, 17, 2), return (N, 17, 3)
    if not isinstance(kpts, np.ndarray):
        kpts = np.array(kpts)

    keypoints = normalize_screen_coordinates(kpts[..., :2], w=width, h=height)
    input_keypoints = keypoints.copy()

    # test_time_augmentation True
    from common.generators import UnchunkedGenerator
    gen = UnchunkedGenerator(None, None, [input_keypoints], pad=pad, causal_shift=causal_shift,
                             augment=True, kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos)
    prediction = camera_to_world(prediction[0], R=rot, t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    return prediction
            if keypoints[subject][action][cam_idx].shape[0] > mocap_length:
                # Shorten sequence
                keypoints[subject][action][cam_idx] = keypoints[subject][action][cam_idx][:mocap_length]

        assert len(keypoints[subject][action]) == len(dataset[subject][action]['positions_3d'])

for subject in keypoints.keys():
    for action in keypoints[subject]:
        for cam_idx, kps in enumerate(keypoints[subject][action]):
            # Normalize camera frame
            cam = dataset.cameras()[subject][cam_idx]
            kps[..., :2] = normalize_screen_coordinates(kps[..., :2], w=cam['res_w'], h=cam['res_h'])
            keypoints[subject][action][cam_idx] = kps

subjects_train = args['subjects_train'].split(',')
subjects_semi = [] if not args['subjects_unlabeled'] else args['subjects_unlabeled'].split(',')
subjects_test = args['subjects_test'].split(',')


def fetch(subjects, action_filter=None, subset=1, parse_3d_poses=True):
    out_poses_3d = []
    out_poses_2d = []
    out_camera_params = []
    for subject in subjects:
        for action in keypoints[subject].keys():
def reconstruction(args):
    """
    Generate 3D poses from 2D keypoints detected from a video, and visualize them
    :param chk_file: The file path of the model weights
    :param kps_file: The file path of the 2D keypoints
    :param viz_output: The output path of the animation
    :param video_path: The input video path
    :param kpts_format: The format of the 2D keypoints: MSCOCO, MPII, H36M, or OpenPose. The default format is H36M
    """
    print('Loading 2D keypoints ...')
    keypoints, scores, _, _ = load_json(args.keypoints_file)

    # Load only one person's keypoints
    if len(keypoints.shape) == 4:
        keypoints = keypoints[0]
    assert len(keypoints.shape) == 3

    # Transform the keypoints format from other datasets (MSCOCO, MPII) to the h36m format
    if args.kpts_format == 'coco':
        keypoints, valid_frames = coco_h36m(keypoints)
    elif args.kpts_format == 'mpii':
        keypoints, valid_frames = mpii_h36m(keypoints)
    elif args.kpts_format == 'openpose':
        # Convert the OpenPose format to MSCOCO joint order
        # (the original indexed keypoints[:order_coco], a TypeError; the joint axis is the second one)
        order_coco = [i for i in range(17) if i != 1]
        keypoints = keypoints[:, order_coco]
        keypoints, valid_frames = coco_h36m(keypoints)
    else:
        valid_frames = np.where(np.sum(keypoints.reshape(-1, 34), axis=1) != 0)[0]
        assert args.kpts_format == 'h36m'

    # Get the width and height of the video
    cap = cv2.VideoCapture(args.video_path)
    width = int(round(cap.get(cv2.CAP_PROP_FRAME_WIDTH)))
    height = int(round(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))

    # Normalize keypoints
    input_keypoints = normalize_screen_coordinates(keypoints[..., :2], w=width, h=height)

    if args.frames == 27:
        filter_widths = [3, 3, 3]
        channels = 128
    elif args.frames == 81:
        filter_widths = [3, 3, 3, 3]
        channels = 64
    else:
        filter_widths = [3, 3, 3, 3, 3]
        channels = 32

    model_pos = SpatioTemporalModel(adj, 17, 2, 17, filter_widths=filter_widths,
                                    channels=channels, dropout=0.05)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    # Load the trained model
    print('Loading checkpoint', args.weight)
    chk_file = os.path.join('./checkpoint', args.weight)
    checkpoint = torch.load(chk_file, map_location=lambda storage, loc: storage)
    model_pos.load_state_dict(checkpoint['model_pos'])

    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    print('Reconstructing ...')
    gen = UnchunkedGenerator(None, None, [input_keypoints[valid_frames]],
                             pad=pad, causal_shift=causal_shift, augment=True,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)

    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])

    prediction_new = np.zeros((*input_keypoints.shape[:-1], 3), dtype=np.float32)
    prediction_new[valid_frames] = prediction

    print('Rendering ...')
    anim_output = {'Reconstruction': prediction_new}
    render_animation(keypoints, keypoints_metadata, anim_output, h36m_skeleton, 25, 3000,
                     np.array(70., dtype=np.float32), args.viz_output, limit=-1,
                     downsample=1, size=5, input_video_path=args.video_path,
                     viewport=(width, height), input_video_skip=0)
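# Hedged sketch of the attributes reconstruction() reads from `args`, filled
# with hypothetical values (all paths and the weight file name are placeholders):
# from argparse import Namespace
# args = Namespace(
#     keypoints_file='output/keypoints.json',  # 2D keypoints JSON
#     kpts_format='coco',                      # one of: coco, mpii, openpose, h36m
#     video_path='input/video.mp4',
#     frames=81,                               # selects filter widths and channels
#     weight='model.bin',                      # file under ./checkpoint
#     viz_output='output/animation.mp4',
# )
# reconstruction(args)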
def __init__(self, path, remove_static_joints=True):
    super().__init__(fps=50, skeleton=h36m_skeleton)
    self._cameras = copy.deepcopy(h36m_cameras_extrinsic_params)
    for cameras in self._cameras.values():
        for i, cam in enumerate(cameras):
            cam.update(h36m_cameras_intrinsic_params[i])
            for k, v in cam.items():
                if k not in ['id', 'res_w', 'res_h']:
                    cam[k] = np.array(v, dtype='float32')

            # Normalize camera frame
            cam['center'] = normalize_screen_coordinates(cam['center'], w=cam['res_w'], h=cam['res_h']).astype('float32')
            cam['focal_length'] = cam['focal_length'] / cam['res_w'] * 2
            if 'translation' in cam:
                cam['translation'] = cam['translation'] / 1000  # mm to meters

            # Add intrinsic parameters vector
            cam['intrinsic'] = np.concatenate((cam['focal_length'],
                                               cam['center'],
                                               cam['radial_distortion'],
                                               cam['tangential_distortion']))

    # Load serialized dataset (NumPy >= 1.16.3 requires allow_pickle=True for object arrays)
    data = np.load(path, allow_pickle=True)['positions_3d'].item()

    self._data = {}
    for subject, actions in data.items():
        self._data[subject] = {}
        for action_name, positions in actions.items():
            self._data[subject][action_name] = {
                'positions': positions,
                'cameras': self._cameras[subject],
            }

    if remove_static_joints:
        print("Remove static joints")
        # Bring the skeleton to 17 joints instead of the original 32
        self.remove_joints([4, 5, 9, 10, 11, 16, 20, 21, 22, 23, 24, 28, 29, 30, 31])

        # Rewire shoulders to the correct parents
        self._skeleton._parents[11] = 8
        self._skeleton._parents[14] = 8
        # print("parents", self._skeleton.parents())

        # self._skeleton._meta[PELVIS] = {"location": "center", "joint": "pelvis"}
        self._skeleton._meta[SPINE] = {"location": "center", "joint": "spine"}
        self._skeleton._meta[CHEST] = {"location": "center", "joint": "chest"}
        self._skeleton._meta[NECK] = {"location": "center", "joint": "neck"}
        self._skeleton._meta[HEAD] = {"location": "center", "joint": "head"}
        self._skeleton._meta[L_HIP] = {"location": "left", "joint": "hip"}
        self._skeleton._meta[L_KNEE] = {"location": "left", "joint": "knee"}
        self._skeleton._meta[L_FOOT] = {"location": "left", "joint": "foot"}
        self._skeleton._meta[L_SHOULDER] = {"location": "left", "joint": "shoulder"}
        self._skeleton._meta[L_ELBOW] = {"location": "left", "joint": "elbow"}
        self._skeleton._meta[L_HAND] = {"location": "left", "joint": "hand"}
        self._skeleton._meta[R_HIP] = {"location": "right", "joint": "hip"}
        self._skeleton._meta[R_KNEE] = {"location": "right", "joint": "knee"}
        self._skeleton._meta[R_FOOT] = {"location": "right", "joint": "foot"}
        self._skeleton._meta[R_SHOULDER] = {"location": "right", "joint": "shoulder"}
        self._skeleton._meta[R_ELBOW] = {"location": "right", "joint": "elbow"}
        self._skeleton._meta[R_HAND] = {"location": "right", "joint": "hand"}
def videpose_infer(args):
    from common.camera import normalize_screen_coordinates, camera_to_world, image_coordinates
    from common.generators import UnchunkedGenerator
    from common.model import TemporalModel
    from common.utils import Timer, evaluate, add_path
    from videopose import get_detector_2d, ckpt_time, metadata, time0
    import gene_npz

    gene_npz.args.outputpath = str(args.viz_output / "alpha_pose_kunkun_cut")
    print(gene_npz.args)
    # detector_2d = get_detector_2d(args.detector_2d)
    detector_2d = gene_npz.generate_kpts(args.detector_2d)
    assert detector_2d, 'detector_2d should be in ({alpha, hr, open}_pose)'

    # Load or generate the 2D keypoints
    if not args.input_npz:
        video_name = args.viz_video
        keypoints = detector_2d(video_name)
    else:
        npz = np.load(args.input_npz)
        keypoints = npz['kpts']  # (N, 17, 2)

    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1])
    joints_left, joints_right = list([4, 5, 6, 11, 12, 13]), list([1, 2, 3, 14, 15, 16])

    # Normalize keypoints, assuming the Human3.6M camera resolution
    keypoints = normalize_screen_coordinates(keypoints[..., :2], w=1000, h=1002)

    model_pos = TemporalModel(17, 2, 17, filter_widths=[3, 3, 3, 3, 3], causal=args.causal,
                              dropout=args.dropout, channels=args.channels, dense=args.dense)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    ckpt, time1 = ckpt_time(time0)
    print('-------------- load data spends {:.2f} seconds'.format(ckpt))

    # Load the trained model
    chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)  # map loc onto storage (load on CPU)
    model_pos.load_state_dict(checkpoint['model_pos'])

    ckpt, time2 = ckpt_time(time1)
    print('-------------- load 3D model spends {:.2f} seconds'.format(ckpt))

    # Receptive field: 243 frames for args.arc [3, 3, 3, 3, 3]
    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    print('Rendering...')
    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None, None, [input_keypoints],
                             pad=pad, causal_shift=causal_shift, augment=args.test_time_augmentation,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)

    # Save the 3D joint positions
    np.save(args.viz_output / "test_3d_output.npy", prediction, allow_pickle=True)

    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    anim_output = {'Reconstruction': prediction}
    input_keypoints = image_coordinates(input_keypoints[..., :2], w=1000, h=1002)

    ckpt, time3 = ckpt_time(time2)
    print('-------------- generate reconstruction 3D data spends {:.2f} seconds'.format(ckpt))

    ckpt, time4 = ckpt_time(time3)
    print('total spend {:.2f} second'.format(ckpt))
def main():
    dataset_path = "./data/data_3d_h36m.npz"

    # Load the dataset
    from common.h36m_dataset import Human36mDataset
    dataset = Human36mDataset(dataset_path)
    dataset = read_3d_data(dataset)

    cudnn.benchmark = True
    device = torch.device("cpu")
    from models.sem_gcn import SemGCN
    from common.graph_utils import adj_mx_from_skeleton
    p_dropout = None
    adj = adj_mx_from_skeleton(dataset.skeleton())
    model_pos = SemGCN(adj, 128, num_layers=4, p_dropout=p_dropout,
                       nodes_group=dataset.skeleton().joints_group()).to(device)
    ckpt_path = "./checkpoint/pretrained/ckpt_semgcn_nonlocal_sh.pth.tar"
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model_pos.load_state_dict(ckpt['state_dict'], strict=False)
    model_pos.eval()

    # ============ Added code ==============
    # A single person's 2D keypoints, produced by the project's 2D data pipeline
    inputs_2d = [[483.0, 450], [503, 450], [503, 539], [496, 622], [469, 450], [462, 546], [469, 622],
                 [483, 347], [483, 326], [489, 264], [448, 347], [448, 408], [441, 463],
                 [517, 347], [524, 408], [538, 463]]

    # # OpenPose detection result on a test sample
    # inputs_2d = [[86.0, 137], [99, 128], [94, 127], [97, 110], [89, 105], [102, 129], [116, 116], [99, 110],
    #              [105, 93], [117, 69], [147, 63], [104, 93], [89, 69], [82, 38], [89, 139], [94, 140]]

    inputs_2d = np.array(inputs_2d)
    # inputs_2d[:, 1] = np.max(inputs_2d[:, 1]) - inputs_2d[:, 1]  # Flip into an upright pose; the raw data is upside down

    cam = dataset.cameras()['S1'][0]  # Fetch the camera parameters
    inputs_2d[..., :2] = normalize_screen_coordinates(inputs_2d[..., :2], w=cam['res_w'], h=cam['res_h'])  # Normalize the 2D coordinates

    # Plot the normalized screen coordinates with each joint labeled by its index
    print(inputs_2d)  # Print the normalized 2D keypoint coordinates
    d_x = inputs_2d[:, 0]
    d_y = inputs_2d[:, 1]
    plt.figure()
    plt.scatter(d_x, d_y)
    for i, txt in enumerate(np.arange(inputs_2d.shape[0])):
        plt.annotate(txt, (d_x[i], d_y[i]))  # Label the joint index
    # plt.show()  # Show the normalized 2D keypoints

    # Get the 3D result
    inputs_2d = torch.tensor(inputs_2d, dtype=torch.float32)  # Convert to a tensor
    outputs_3d = model_pos(inputs_2d).cpu()  # Run the model
    outputs_3d[:, :, :] -= outputs_3d[:, :1, :]  # Remove global offset
    predictions = [outputs_3d.detach().numpy()]  # Prediction results
    prediction = np.concatenate(predictions)[0]  # Concatenate and take the first item

    # Invert the camera transformation
    # The exact R and t settings matter little here; conventions vary with the
    # chosen camera parameters, and some variants have no t, etc.
    prediction = camera_to_world(prediction, R=cam['orientation'], t=0)
    prediction[:, 2] -= np.min(prediction[:, 2])  # Shift up by min(prediction[:, 2]) so all z coordinates are positive
    print('prediction')
    print(prediction)  # Print the 3D coordinates used for plotting

    plt.figure()
    ax = plt.subplot(111, projection='3d')  # Create a 3D plot
    o_x = prediction[:, 0]
    o_y = prediction[:, 1]
    o_z = prediction[:, 2]
    print(o_x)
    print(o_y)
    print(o_z)
    ax.scatter(o_x, o_y, o_z)

    # Draw the limbs by connecting joint indices
    temp = o_x
    x = [temp[9], temp[8], temp[7], temp[10], temp[11], temp[12]]
    temp = o_y
    y = [temp[9], temp[8], temp[7], temp[10], temp[11], temp[12]]
    temp = o_z
    z = [temp[9], temp[8], temp[7], temp[10], temp[11], temp[12]]
    ax.plot(x, y, z)

    temp = o_x
    x = [temp[7], temp[0], temp[4], temp[5], temp[6]]
    temp = o_y
    y = [temp[7], temp[0], temp[4], temp[5], temp[6]]
    temp = o_z
    z = [temp[7], temp[0], temp[4], temp[5], temp[6]]
    ax.plot(x, y, z)

    temp = o_x
    x = [temp[0], temp[1], temp[2], temp[3]]
    temp = o_y
    y = [temp[0], temp[1], temp[2], temp[3]]
    temp = o_z
    z = [temp[0], temp[1], temp[2], temp[3]]
    ax.plot(x, y, z)

    temp = o_x
    x = [temp[7], temp[13], temp[14], temp[15]]
    temp = o_y
    y = [temp[7], temp[13], temp[14], temp[15]]
    temp = o_z
    z = [temp[7], temp[13], temp[14], temp[15]]
    ax.plot(x, y, z)

    # temp = o_x
    # x = [temp[0], temp[14]]
    # temp = o_y
    # y = [temp[0], temp[14]]
    # temp = o_z
    # z = [temp[0], temp[14]]
    # ax.plot(y, x, z)
    #
    # temp = o_x
    # x = [temp[0], temp[15]]
    # temp = o_y
    # y = [temp[0], temp[15]]
    # temp = o_z
    # z = [temp[0], temp[15]]
    # ax.plot(y, x, z)
    # Rescale the axes so that the z axis is drawn at twice the scale of x and y
    from matplotlib.pyplot import MultipleLocator  # fixed typo: was 'MultipleLocatort'
    from mpl_toolkits.mplot3d import Axes3D  # needed for Axes3D.get_proj below
    major_locator = MultipleLocator(0.5)
    ax.xaxis.set_major_locator(major_locator)
    ax.yaxis.set_major_locator(major_locator)
    ax.zaxis.set_major_locator(major_locator)
    ax.get_proj = lambda: np.dot(Axes3D.get_proj(ax), np.diag([0.5, 0.5, 1, 1]))
    plt.show()
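# Self-contained sketch of the get_proj rescaling trick used above: multiplying
# the projection matrix by a diagonal matrix stretches the z axis relative to
# x and y without touching the data (the toy scatter data is illustrative):
# import numpy as np
# import matplotlib.pyplot as plt
# from mpl_toolkits.mplot3d import Axes3D
# fig = plt.figure()
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter(np.random.rand(10), np.random.rand(10), np.random.rand(10))
# ax.get_proj = lambda: np.dot(Axes3D.get_proj(ax), np.diag([0.5, 0.5, 1, 1]))
# plt.show()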
def analyze_frame(h, frame):
    boxes, keypoints = infer.inference_on_frame(h['predictor'], frame)

    # step 4: prepare data.
    # take the 2D keypoints, that's it
    # the first element is an empty array; the second is our actual frame data, a 3D numpy array
    # with first dimension 1, and the second and third being the 17 joints of 3 doubles each.
    kp = keypoints[1][0][:2, :].T  # extract (x, y) just like in the prepare_data_2d_custom code

    # what to do if kp is NaN or missing data or something?
    # I guess just ignore it

    # they do this at the end of step 4, but we keep it simple and take the data from step 2 directly into a variable:
    # output[canonical_name]['custom'] = [data[0]['keypoints'].astype('float32')]
    # output_custom_canonical_bullshit = kp.astype('float32')

    # this is what happens at the end of step 4, producing a file that is loaded at the beginning of step 5:
    # np.savez_compressed(os.path.join(args.dataoutputdir, output_prefix_2d + args.output), positions_2d=output, metadata=metadata)

    # in the original script, `keypoints` is confusingly just the raw data until it is
    # set to keypoints['positions_2d']:
    # keypoints = np.load('data/data_2d_' + args.dataset + '_' + args.keypoints + '.npz', allow_pickle=True)

    # step 5: everything else
    # starting to copy stuff over from run.py

    # extract the dataset from the init dictionary
    dataset = h['dataset']
    keypoints_metadata = h['keypoints_metadata']
    keypoints_symmetry = h['keypoints_symmetry']

    kps_left = h['kps_left']
    kps_right = h['kps_right']
    joints_left = h['joints_left']
    joints_right = h['joints_right']

    # normalize
    for i in range(len(kp)):
        koord = kp[i]
        kp[i] = normalize_screen_coordinates(koord, h['frame_metadata']['w'], h['frame_metadata']['h'])

    # for kps in enumerate(keypoints):
    #     kps[..., :2] = normalize_screen_coordinates(kps[..., :2], frame_metadata['w'], frame_metadata['h'])

    # taken from args.architecture in run.py and hardcoded here, skipping the argument plumbing
    filter_widths = [int(x) for x in "3,3,3,3,3".split(',')]
    skeleton_num_joints = dataset.skeleton().num_joints()
    # skeleton_num_joints = 17

    causal = True
    dropout = 0.25
    channels = 1024
    dense = False

    model_pos_train = TemporalModelOptimized1f(kp.shape[-2], kp.shape[-1], skeleton_num_joints,
                                               filter_widths=filter_widths, causal=causal,
                                               dropout=dropout, channels=channels)
    model_pos = TemporalModel(kp.shape[-2], kp.shape[-1], skeleton_num_joints,
                              filter_widths=filter_widths, causal=causal,
                              dropout=dropout, channels=channels, dense=dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side

    # if args.causal:
    #     print('INFO: Using causal convolutions')
    #     causal_shift = pad
    # else:
    #     causal_shift = 0
    causal_shift = pad

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()
        model_pos_train = model_pos_train.cuda()

    # if args.resume or args.evaluate:
    if True:
        chk_filename = "checkpoint/pretrained_h36m_detectron_coco.bin"
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(checkpoint['epoch']))
        model_pos_train.load_state_dict(checkpoint['model_pos'])
        model_pos.load_state_dict(checkpoint['model_pos'])

    # false in our particular case...
    # we might benefit from getting rid of model_traj, unless it's super fast,
    # in which case we should just keep it in case we ever upgrade
    if 'model_traj' in checkpoint:
        # Load the trajectory model if it is contained in the checkpoint (e.g. for inference in the wild)
        model_traj = TemporalModel(kp.shape[-2], kp.shape[-1], 1,
                                   filter_widths=filter_widths, causal=causal,
                                   dropout=dropout, channels=channels, dense=dense)
        if torch.cuda.is_available():
            model_traj = model_traj.cuda()
        model_traj.load_state_dict(checkpoint['model_traj'])
    else:
        model_traj = None

    test_generator = UnchunkedGenerator(None, None, kp,
                                        pad=pad, causal_shift=causal_shift, augment=False,
                                        kps_left=kps_left, kps_right=kps_right,
                                        joints_left=joints_left, joints_right=joints_right)
    print('INFO: Testing on {} frames'.format(test_generator.num_frames()))

    # Evaluate
    def evaluate(eval_generator, action=None, return_predictions=False, use_trajectory_model=False):
        epoch_loss_3d_pos = 0
        epoch_loss_3d_pos_procrustes = 0
        epoch_loss_3d_pos_scale = 0
        epoch_loss_3d_vel = 0
        with torch.no_grad():
            if not use_trajectory_model:
                model_pos.eval()
            else:
                model_traj.eval()
            N = 0
            for _, batch, batch_2d in eval_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if torch.cuda.is_available():
                    inputs_2d = inputs_2d.cuda()

                # Positional model
                if not use_trajectory_model:
                    predicted_3d_pos = model_pos(inputs_2d)
                else:
                    predicted_3d_pos = model_traj(inputs_2d)

                # Test-time augmentation (if enabled)
                if eval_generator.augment_enabled():
                    # Undo flipping and take average with non-flipped version
                    predicted_3d_pos[1, :, :, 0] *= -1
                    if not use_trajectory_model:
                        predicted_3d_pos[1, :, joints_left + joints_right] = predicted_3d_pos[1, :, joints_right + joints_left]
                    predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True)

                if return_predictions:
                    return predicted_3d_pos.squeeze(0).cpu().numpy()

                inputs_3d = torch.from_numpy(batch.astype('float32'))
                if torch.cuda.is_available():
                    inputs_3d = inputs_3d.cuda()
                inputs_3d[:, :, 0] = 0
                if eval_generator.augment_enabled():
                    inputs_3d = inputs_3d[:1]

                error = mpjpe(predicted_3d_pos, inputs_3d)
                epoch_loss_3d_pos_scale += inputs_3d.shape[0] * inputs_3d.shape[1] * n_mpjpe(predicted_3d_pos, inputs_3d).item()

                epoch_loss_3d_pos += inputs_3d.shape[0] * inputs_3d.shape[1] * error.item()
                N += inputs_3d.shape[0] * inputs_3d.shape[1]

                inputs = inputs_3d.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                predicted_3d_pos = predicted_3d_pos.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])

                epoch_loss_3d_pos_procrustes += inputs_3d.shape[0] * inputs_3d.shape[1] * p_mpjpe(predicted_3d_pos, inputs)

                # Compute velocity error
                epoch_loss_3d_vel += inputs_3d.shape[0] * inputs_3d.shape[1] * mean_velocity_error(predicted_3d_pos, inputs)

        if action is None:
            print('----------')
        else:
            print('----' + action + '----')
        e1 = (epoch_loss_3d_pos / N) * 1000
        e2 = (epoch_loss_3d_pos_procrustes / N) * 1000
        e3 = (epoch_loss_3d_pos_scale / N) * 1000
        ev = (epoch_loss_3d_vel / N) * 1000
        print('Test time augmentation:', eval_generator.augment_enabled())
        print('Protocol #1 Error (MPJPE):', e1, 'mm')
        print('Protocol #2 Error (P-MPJPE):', e2, 'mm')
        print('Protocol #3 Error (N-MPJPE):', e3, 'mm')
        print('Velocity Error (MPJVE):', ev, 'mm')
        print('----------')

        return e1, e2, e3, ev

    image_keypoints2d = kp
    gen = UnchunkedGenerator(None, None, [[image_keypoints2d]],
                             pad=pad, causal_shift=causal_shift, augment=False,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, return_predictions=True)

    # here is the data format:
    # public enum VideoPose3dJointOrder
    # {
    #     HIP = 0,
    #     R_HIP = 1,
    #     R_KNEE = 2,
    #     R_FOOT = 3,
    #     L_HIP = 4,
    #     L_KNEE = 5,
    #     L_FOOT = 6,
    #     SPINE = 7,
    #     THORAX = 8,
    #     NOSE = 9,
    #     HEAD = 10,
    #     L_SHOULDER = 11,
    #     L_ELBOW = 12,
    #     L_WRIST = 13,
    #     R_SHOULDER = 14,
    #     R_ELBOW = 15,
    #     R_WRIST = 16
    # }

    # the original code errors out here, and it is unclear what it was trying to do;
    # we can work around it by just getting width/height some other way.

    # Invert camera transformation
    cam = dataset.cameras()

    width = cam['frame'][0]['res_w']
    height = cam['frame'][0]['res_h']

    image_keypoints2d = image_coordinates(image_keypoints2d[..., :2], w=width, h=height)

    viz_camera = 0

    # If the ground truth is not available, take the camera extrinsic params from a random subject.
    # They are almost the same, and anyway, we only need this for visualization purposes.
    for subject in dataset.cameras():
        if 'orientation' in dataset.cameras()[subject][viz_camera]:
            rot = dataset.cameras()[subject][viz_camera]['orientation']
            break

    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])

    # because the algorithm was meant for a list of frames, we take the first frame (our only frame)
    prediction3d = prediction[0]

    return prediction3d, image_keypoints2d

    # do we want to visualize? this code used to write to json and create a video for visualization
    # if args.viz_output is not None:
    if True:
        anim_output = {'Reconstruction': prediction}

        # format the data the same way as mediapipe, so we can load it in unity with the same script:
        # we need a list (frames) of lists of 3d landmarks.
        unity_landmarks = prediction.tolist()

        # how to send data? or display it?
        # maybe draw it on the webcam feed?

        # with open(args.output_json, "w") as json_file:
        #     json.dump(unity_landmarks, json_file)

        # if args.rendervideo == "yes":
        #     from common.visualization import render_animation
        #     render_animation(input_keypoints, keypoints_metadata, anim_output,
        #                      dataset.skeleton(), dataset.fps(), args.viz_bitrate, cam['azimuth'], args.viz_output,
        #                      limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
        #                      input_video_path=args.viz_video, viewport=(cam['res_w'], cam['res_h']),
        #                      input_video_skip=args.viz_skip)

    we_re_done_here = 1
def main(input_args):
    vp3d_dir = input_args.vp3d_dir
    sys.path.append(vp3d_dir)

    from common.camera import normalize_screen_coordinates
    from common.model import TemporalModel
    from common.generators import UnchunkedGenerator
    from common.arguments import parse_args

    args = parse_args()
    print(args)

    kps_left = [4, 5, 6, 11, 12, 13]
    kps_right = [1, 2, 3, 14, 15, 16]
    joints_left = [4, 5, 6, 11, 12, 13]
    joints_right = [1, 2, 3, 14, 15, 16]

    filter_widths = [int(x) for x in args.architecture.split(',')]
    num_joints_in = 17
    in_features = 2
    num_joints_out = 17

    model_pos = TemporalModel(num_joints_in, in_features, num_joints_out,
                              filter_widths=filter_widths, causal=args.causal,
                              dropout=args.dropout, channels=args.channels, dense=args.dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    if args.causal:
        print('INFO: Using causal convolutions')
        causal_shift = pad
    else:
        causal_shift = 0

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    if args.resume or args.evaluate:
        chk_filename = os.path.join(vp3d_dir, args.checkpoint, args.resume if args.resume else args.evaluate)
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(checkpoint['epoch']))
        model_pos.load_state_dict(checkpoint['model_pos'])

    # Evaluate
    def evaluate(test_generator, action=None, return_predictions=False):
        epoch_loss_3d_pos = 0
        epoch_loss_3d_pos_procrustes = 0
        epoch_loss_3d_pos_scale = 0
        epoch_loss_3d_vel = 0
        with torch.no_grad():
            model_pos.eval()
            N = 0
            for _, batch, batch_2d in test_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if torch.cuda.is_available():
                    inputs_2d = inputs_2d.cuda()

                # Positional model
                predicted_3d_pos = model_pos(inputs_2d)

                # Test-time augmentation (if enabled)
                if test_generator.augment_enabled():
                    # Undo flipping and take average with non-flipped version
                    predicted_3d_pos[1, :, :, 0] *= -1
                    predicted_3d_pos[1, :, joints_left + joints_right] = predicted_3d_pos[1, :, joints_right + joints_left]
                    predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True)

                if return_predictions:
                    return predicted_3d_pos.squeeze(0).cpu().numpy()

                inputs_3d = torch.from_numpy(batch.astype('float32'))
                if torch.cuda.is_available():
                    inputs_3d = inputs_3d.cuda()
                inputs_3d[:, :, 0] = 0
                if test_generator.augment_enabled():
                    inputs_3d = inputs_3d[:1]

                error = mpjpe(predicted_3d_pos, inputs_3d)
                epoch_loss_3d_pos_scale += inputs_3d.shape[0] * inputs_3d.shape[1] * n_mpjpe(predicted_3d_pos, inputs_3d).item()

                epoch_loss_3d_pos += inputs_3d.shape[0] * inputs_3d.shape[1] * error.item()
                N += inputs_3d.shape[0] * inputs_3d.shape[1]

                inputs = inputs_3d.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                predicted_3d_pos = predicted_3d_pos.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])

                epoch_loss_3d_pos_procrustes += inputs_3d.shape[0] * inputs_3d.shape[1] * p_mpjpe(predicted_3d_pos, inputs)

                # Compute velocity error
                epoch_loss_3d_vel += inputs_3d.shape[0] * inputs_3d.shape[1] * mean_velocity_error(predicted_3d_pos, inputs)

        if action is None:
            print('----------')
        else:
            print('----' + action + '----')
        e1 = (epoch_loss_3d_pos / N) * 1000
        e2 = (epoch_loss_3d_pos_procrustes / N) * 1000
        e3 = (epoch_loss_3d_pos_scale / N) * 1000
        ev = (epoch_loss_3d_vel / N) * 1000
        print('Test time augmentation:', test_generator.augment_enabled())
        print('Protocol #1 Error (MPJPE):', e1, 'mm')
        print('Protocol #2 Error (P-MPJPE):', e2, 'mm')
        print('Protocol #3 Error (N-MPJPE):', e3, 'mm')
        print('Velocity Error (MPJVE):', ev, 'mm')
        print('----------')

        return e1, e2, e3, ev

    def get_gt_dirs(input_path, camera_id='dev3'):
        """Get all directories with ground-truth 2D human pose annotations"""
        gt_path_list = []
        category_path_list = get_subdirs(input_path)
        for category in category_path_list:
            if os.path.basename(category) != 'Calibration':
                category_scans = get_subdirs(category)
                for category_scan in category_scans:
                    device_list = get_subdirs(category_scan)
                    for device_path in device_list:
                        if camera_id in device_path:
                            if os.path.exists(os.path.join(device_path, 'pose2d')):  # 2D annotations exist
                                gt_path_list.append(device_path)  # e.g. <root>/Lack_TV_Bench/0007_white_floor_08_04_2019_08_28_10_47/dev3
        return gt_path_list

    def get_subdirs(input_path):
        """
        Get a list of subdirectories in the input_path directory
        :param input_path: parent directory (in which to get the subdirectories)
        :return: subdirs: list of subdirectories in input_path
        """
        subdirs = [os.path.join(input_path, dir_i) for dir_i in os.listdir(input_path)
                   if os.path.isdir(os.path.join(input_path, dir_i))]
        subdirs.sort()
        return subdirs

    fps = 30
    frame_width = 1920.0
    frame_height = 1080.0

    h36m_joint_names = get_h36m_joint_names()
    h36m_joint_names_dict = {name: i for i, name in enumerate(h36m_joint_names)}
    joint_names = get_body25_joint_names()
    joint_names_dict = {name: i for i, name in enumerate(joint_names)}

    dataset_dir = input_args.dataset_dir
    camera_id = input_args.camera_id

    gt_dirs = get_gt_dirs(dataset_dir, camera_id)
    for i, gt_dir in enumerate(gt_dirs):
        print(f"\nProcessing {i} of {len(gt_dirs)}: {' '.join(gt_dir.split('/')[-3:-1])}")

        input_dir = os.path.join(gt_dir, 'predictions', 'pose2d', 'openpose')
        output_dir = os.path.join(gt_dir, 'predictions', 'pose3d', 'vp3d')
        os.makedirs(output_dir, exist_ok=True)

        json_mask = os.path.join(input_dir, 'scan_video_00000000????_keypoints.json')
        json_files = sorted(glob(json_mask))

        input_keypoints = []
        for json_file in json_files:
            with open(json_file, 'r') as f:
                pose2d = json.load(f)
            if len(pose2d["people"]) == 0:
                keypoints_op = np.zeros((19, 3))
            else:
                keypoints_op = np.array(pose2d["people"][0]["pose_keypoints_2d"]).reshape(-1, 3)  # Takes the first detected person every time...
            keypoints = np.zeros((17, 3))
            # NOTE: this loop variable shadows the outer `i, gt_dir` index
            for i, joint_name in enumerate(h36m_joint_names):
                if joint_name == 'spine' or joint_name == 'head':
                    continue
                joint_id = joint_names_dict[joint_name]
                keypoints[i, :] = keypoints_op[joint_id, :]
            keypoints[h36m_joint_names_dict['mid hip'], :] = np.mean((keypoints[h36m_joint_names_dict['left hip'], :],
                                                                      keypoints[h36m_joint_names_dict['right hip'], :]), axis=0)  # mid hip = mean(left hip, right hip)
            keypoints[h36m_joint_names_dict['spine'], :] = np.mean((keypoints[h36m_joint_names_dict['neck'], :],
                                                                    keypoints[h36m_joint_names_dict['mid hip'], :]), axis=0)  # spine = mean(neck, mid hip)
            keypoints[h36m_joint_names_dict['head'], :] = np.mean((keypoints_op[joint_names_dict['left ear'], :],
                                                                   keypoints_op[joint_names_dict['right ear'], :]), axis=0)  # head = mean(left ear, right ear)

            input_keypoints.append(keypoints)
        input_keypoints = np.array(input_keypoints)

        input_keypoints = input_keypoints[:, :, :2]

        # For pretrained_h36m_cpn.bin and cpn_ft_h36m_dbb
        input_keypoints[..., :2] = normalize_screen_coordinates(input_keypoints[..., :2], w=frame_width, h=frame_height)

        args.test_time_augmentation = True
        gen = UnchunkedGenerator(None, None, [input_keypoints],
                                 pad=pad, causal_shift=causal_shift, augment=args.test_time_augmentation,
                                 kps_left=kps_left, kps_right=kps_right,
                                 joints_left=joints_left, joints_right=joints_right)
        prediction = evaluate(gen, return_predictions=True)  # Nx17x3

        pickle.dump(prediction, open(os.path.join(output_dir, 'vp3d_output.pkl'), "wb"))
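    # Hedged sketch: reading one of the dumped predictions back (the path is
    # assumed to follow the output_dir layout above):
    # with open(os.path.join(output_dir, 'vp3d_output.pkl'), 'rb') as f:
    #     prediction = pickle.load(f)  # (N, 17, 3) 3D joint positions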