# Shared imports assumed by the demo variants below (standard libraries only).
# Project-specific helpers such as Renderer, MPT, VIBE_Demo, Inference,
# prepare_rendering_results, images_to_video, video_to_images, and
# convert_crop_cam_to_orig_img come from the VIBE/TCMR codebases
# (lib.models, lib.utils.demo_utils, multi_person_tracker) and are not
# repeated for each snippet.
import colorsys
import os
import os.path as osp
import shutil
import time

import cv2
import joblib
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm


def renderRes(self, image_path, output_path, vibe_results):
    # Read one frame to recover the original resolution.
    img0 = cv2.imread(osp.join(image_path, os.listdir(image_path)[0]))
    orig_height, orig_width = img0.shape[0:2]

    # ========= Render results as a single video ========= #
    renderer = Renderer(resolution=(orig_width, orig_height),
                        orig_img=True,
                        wireframe=False)

    output_img_folder = f'{image_path}_output'
    os.makedirs(output_img_folder, exist_ok=True)

    print(f'Rendering output video, writing frames to {output_img_folder}')

    # prepare results for rendering
    num_frames = len(os.listdir(image_path))
    frame_results = prepare_rendering_results(vibe_results, num_frames)
    mesh_color = {
        k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
        for k in vibe_results.keys()
    }

    image_file_names = sorted([
        os.path.join(image_path, x) for x in os.listdir(image_path)
        if x.endswith('.png') or x.endswith('.jpg')
    ])

    for frame_idx in range(len(image_file_names)):
        img_fname = image_file_names[frame_idx]
        img = cv2.imread(img_fname)

        for person_id, person_data in frame_results[frame_idx].items():
            frame_verts = person_data['verts']
            frame_cam = person_data['cam']
            mc = mesh_color[person_id]

            mesh_filename = None
            img = renderer.render(
                img,
                frame_verts,
                cam=frame_cam,
                color=mc,
                mesh_filename=mesh_filename,
            )

        cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

    # ========= Save rendered video ========= #
    save_name = 'vibe_result.mp4'
    save_name = os.path.join(output_path, save_name)
    print(f'Saving result video to {save_name}')
    images_to_video(img_folder=output_img_folder, output_vid_file=save_name)
    shutil.rmtree(output_img_folder)
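# Minimal usage sketch for renderRes. The paths and the `demo` object are
# hypothetical; renderRes takes `self`, so it is assumed to live on a demo
# class, and `vibe_results` is the dict saved by the main() variants below:
#
#   vibe_results = joblib.load('output/sample_video/vibe_output.pkl')
#   demo.renderRes('tmp/sample_video',     # folder of extracted frames
#                  'output/sample_video',  # receives vibe_result.mp4
#                  vibe_results)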
def main(args):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    video_file = args.vid_file

    # ========= [Optional] download the youtube video ========= #
    if video_file.startswith('https://www.youtube.com'):
        print(f'Downloading YouTube video \"{video_file}\"')
        video_file = download_youtube_clip(video_file, '/tmp')

        if video_file is None:
            exit('Youtube url is not valid!')

        print(f'YouTube Video has been downloaded to {video_file}...')

    if not os.path.isfile(video_file):
        exit(f'Input video \"{video_file}\" does not exist!')

    output_path = os.path.join(args.output_folder,
                               os.path.basename(video_file).replace('.mp4', ''))
    os.makedirs(output_path, exist_ok=True)

    image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # ========= Run tracking ========= #
    bbox_scale = 1.1
    if args.tracking_method == 'pose':
        if not os.path.isabs(video_file):
            video_file = os.path.join(os.getcwd(), video_file)
        tracking_results = run_posetracker(video_file,
                                           staf_folder=args.staf_dir,
                                           display=args.display)
    else:
        # run multi object tracker
        mot = MPT(
            device=device,
            batch_size=args.tracker_batch_size,
            display=args.display,
            detector_type=args.detector,
            output_format='dict',
            yolo_img_size=args.yolo_img_size,
        )
        tracking_results = mot(image_folder)

    # remove tracklets that are shorter than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    print(f'Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = joints2d = None

        if args.tracking_method == 'bbox':
            bboxes = tracking_results[person_id]['bbox']
        elif args.tracking_method == 'pose':
            joints2d = tracking_results[person_id]['joints2d']

        frames = tracking_results[person_id]['frames']

        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,
            scale=bbox_scale,
        )

        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = True if joints2d is not None else False

        dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16)

        with torch.no_grad():
            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

            for batch in dataloader:
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :, 75:].reshape(batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)

            del batch

        # ========= [Optional] run Temporal SMPLify to refine the results ========= #
        if args.run_smplify and args.tracking_method == 'pose':
            norm_joints2d = np.concatenate(norm_joints2d, axis=0)
            norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
            norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

            # Run Temporal SMPLify
            update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
            new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                pred_rotmat=pred_pose,
                pred_betas=pred_betas,
                pred_cam=pred_cam,
                j2d=norm_joints2d,
                device=device,
                batch_size=norm_joints2d.shape[0],
                pose2aa=False,
            )

            # update the parameters after refinement
            print(f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}')
            pred_verts = pred_verts.cpu()
            pred_cam = pred_cam.cpu()
            pred_pose = pred_pose.cpu()
            pred_betas = pred_betas.cpu()
            pred_joints3d = pred_joints3d.cpu()

            pred_verts[update] = new_opt_vertices[update]
            pred_cam[update] = new_opt_cam[update]
            pred_pose[update] = new_opt_pose[update]
            pred_betas[update] = new_opt_betas[update]
            pred_joints3d[update] = new_opt_joints3d[update]

        elif args.run_smplify and args.tracking_method == 'bbox':
            print('[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!')
            print('[WARNING] Continuing without running Temporal SMPLify!..')

        # ========= Save results to a pickle file ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        # Run a 1 Euro filter to smooth out the results
        if args.smooth:
            min_cutoff = args.smooth_min_cutoff  # 0.004
            beta = args.smooth_beta  # 1.5
            print(f'Running smoothing on person {person_id}, '
                  f'min_cutoff: {min_cutoff}, beta: {beta}')
            pred_verts, pred_pose, pred_joints3d = smooth_pose(
                pred_pose, pred_betas, min_cutoff=min_cutoff, beta=beta)

        orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                bbox=bboxes,
                                                img_width=orig_width,
                                                img_height=orig_height)

        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

        vibe_results[person_id] = output_dict

    del model

    end = time.time()
    fps = num_frames / (end - vibe_time)
    print(f'VIBE FPS: {fps:.2f}')

    total_time = time.time() - total_time
    print(f'Total time spent: {total_time:.2f} seconds (including model loading time).')
    print(f'Total FPS (including model loading time): {num_frames / total_time:.2f}.')

    print(f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".')
    joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl"))

    if not args.no_render:
        # ========= Render results as a single video ========= #
        renderer = Renderer(resolution=(orig_width, orig_height),
                            orig_img=True,
                            wireframe=args.wireframe)

        output_img_folder = f'{image_folder}_output'
        os.makedirs(output_img_folder, exist_ok=True)

        if args.joints3dview:
            output_img_raw_folder = f'{image_folder}_raw_output'
            os.makedirs(output_img_raw_folder, exist_ok=True)
            output_img_joints3d_folder = f'{image_folder}_joints3d_output'
            os.makedirs(output_img_joints3d_folder, exist_ok=True)
            output_img_mesh_folder = f'{image_folder}_mesh_output'
            os.makedirs(output_img_mesh_folder, exist_ok=True)
            output_img_meshside_folder = f'{image_folder}_meshside_output'
            os.makedirs(output_img_meshside_folder, exist_ok=True)
            output_img_all_folder = f'{image_folder}_all_output'
            os.makedirs(output_img_all_folder, exist_ok=True)

        print(f'Rendering output video, writing frames to {output_img_folder}')

        # prepare results for rendering
        frame_results = prepare_rendering_results(vibe_results, num_frames)
        mesh_color = {
            k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
            for k in vibe_results.keys()
        }

        image_file_names = sorted([
            os.path.join(image_folder, x) for x in os.listdir(image_folder)
            if x.endswith('.png') or x.endswith('.jpg')
        ])

        length_image_files = len(image_file_names)
        # length_image_files = 100
        for frame_idx in tqdm(range(length_image_files)):
            img_fname = image_file_names[frame_idx]
            img = cv2.imread(img_fname)

            if args.sideview:
                side_img = np.zeros_like(img)

            if args.joints3dview:
                img_raw = img.copy()
                img_joints3d = np.zeros_like(img)
                joints3d_list = []

            for person_id, person_data in frame_results[frame_idx].items():
                frame_verts = person_data['verts']
                frame_cam = person_data['cam']
                joints3d = person_data['joints3d']
                # print('frame_verts.shape = {}\nframe_cam.shape = {}\njoints3d.shape = {}'.format(
                #     frame_verts.shape, frame_cam.shape, joints3d.shape))

                mc = mesh_color[person_id]

                if args.joints3dview:
                    joints3d_list.append(joints3d)
                    # img_joints3d = render_joints3d(joints3d, img_raw.shape)

                mesh_filename = None
                if args.save_obj:
                    mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}')
                    os.makedirs(mesh_folder, exist_ok=True)
                    mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj')

                img = renderer.render(
                    img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    mesh_filename=mesh_filename,
                )

                if args.sideview:
                    side_img = renderer.render(
                        side_img,
                        frame_verts,
                        cam=frame_cam,
                        color=mc,
                        angle=270,
                        axis=[0, 1, 0],
                    )

            if args.sideview:
                img_mesh = img.copy()
                img = np.concatenate([img, side_img], axis=1)

            cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

            if args.joints3dview:
                # img_joints3d = np.zeros_like(img_raw)
                if len(joints3d_list) == 0:
                    img_joints3d = np.zeros_like(img_raw)
                else:
                    joints3d = np.concatenate(joints3d_list)
                    img_joints3d = render_joints3d(joints3d, img_raw.shape)

            if args.joints3dview:
                # the 2x2 panel assumes --sideview is also enabled, since
                # img_mesh and side_img come from that branch
                img_up = np.concatenate([img_raw, img_joints3d], axis=1)
                img_down = np.concatenate([img_mesh, side_img], axis=1)
                img_all = np.concatenate([img_up, img_down], axis=0)

                cv2.imwrite(os.path.join(output_img_raw_folder, f'{frame_idx:06d}.png'),
                            img_raw)
                cv2.imwrite(os.path.join(output_img_joints3d_folder, f'{frame_idx:06d}.png'),
                            img_joints3d)
                cv2.imwrite(os.path.join(output_img_mesh_folder, f'{frame_idx:06d}.png'),
                            img_mesh)
                cv2.imwrite(os.path.join(output_img_meshside_folder, f'{frame_idx:06d}.png'),
                            side_img)
                cv2.imwrite(os.path.join(output_img_all_folder, f'{frame_idx:06d}.png'),
                            img_all)

            if args.display:
                cv2.imshow('Video', img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        if args.display:
            cv2.destroyAllWindows()

        # ========= Save rendered video ========= #
        vid_name = os.path.basename(video_file)
        save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4'
        save_name = os.path.join(output_path, save_name)
        print(f'Saving result video to {save_name}')
        images_to_video(img_folder=output_img_folder, output_vid_file=save_name)
        shutil.rmtree(output_img_folder)

        if args.joints3dview:
            '''
            save_name_raw = f'{vid_name.replace(".mp4", "")}_raw.mp4'
            save_name_raw = os.path.join(output_path, save_name_raw)
            images_to_video(img_folder=output_img_raw_folder, output_vid_file=save_name_raw)
            shutil.rmtree(output_img_raw_folder)

            save_name_joints3d = f'{vid_name.replace(".mp4", "")}_joints3d.mp4'
            save_name_joints3d = os.path.join(output_path, save_name_joints3d)
            images_to_video(img_folder=output_img_joints3d_folder, output_vid_file=save_name_joints3d)
            shutil.rmtree(output_img_joints3d_folder)

            save_name_mesh = f'{vid_name.replace(".mp4", "")}_mesh.mp4'
            save_name_mesh = os.path.join(output_path, save_name_mesh)
            images_to_video(img_folder=output_img_mesh_folder, output_vid_file=save_name_mesh)
            shutil.rmtree(output_img_mesh_folder)

            save_name_meshside = f'{vid_name.replace(".mp4", "")}_meshside.mp4'
            save_name_meshside = os.path.join(output_path, save_name_meshside)
            images_to_video(img_folder=output_img_meshside_folder, output_vid_file=save_name_meshside)
            shutil.rmtree(output_img_meshside_folder)
            '''
            save_name_all = f'{vid_name.replace(".mp4", "")}_all.mp4'
            save_name_all = os.path.join(output_path, save_name_all)
            images_to_video(img_folder=output_img_all_folder, output_vid_file=save_name_all)
            shutil.rmtree(output_img_all_folder)

    shutil.rmtree(image_folder)
    print('================= END =================')
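# The [:, :, :3], [:, :, 3:75], and [:, :, 75:] slices used throughout these
# demos assume SPIN/VIBE's 85-D `theta` layout per frame: 3 weak-perspective
# camera parameters (s, tx, ty), 72 SMPL pose parameters (24 joints x 3
# axis-angle values), and 10 SMPL shape betas. A sketch of that split:
def split_theta(theta):
    """Split an (N, 85) theta array into (cam, pose, betas).

    Sketch only; mirrors the slices used in the loops above.
    """
    cam = theta[:, :3]      # (N, 3)  scale + 2D translation
    pose = theta[:, 3:75]   # (N, 72) SMPL axis-angle body pose
    betas = theta[:, 75:]   # (N, 10) SMPL shape coefficients
    return cam, pose, betas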
def main(args):
    # Only pin a GPU when CUDA is actually available; calling
    # torch.cuda.set_device on a CPU-only machine raises an error.
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu_id)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    print(f'Loading video list {args.video_list}')
    video_list = [l.strip() for l in open(args.video_list, 'r').readlines()]
    if len(video_list) < 1:
        print('No files were found in video list')
        return

    print('Loading VIBE model')
    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load VIBE pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    num_videos = len(video_list)
    print(f'Processing {num_videos} videos.')
    for video_idx, video_file in enumerate(video_list, start=1):
        if not osp.isfile(video_file):
            print(f'Input video \"{video_file}\" does not exist! Moving on to next file.')
            continue

        filename = osp.splitext(osp.basename(video_file))[0]
        output_path = osp.join(args.output_folder, filename)
        os.makedirs(output_path, exist_ok=True)

        image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True)

        print(f'[{video_idx}/{num_videos}] Processing {num_frames} frames')
        orig_height, orig_width = img_shape[:2]

        # ========= Run tracking ========= #
        bbox_scale = 1.1
        if args.tracking_method == 'pose':
            if not osp.isabs(video_file):
                video_file = osp.join(os.getcwd(), video_file)
            tracking_results = run_posetracker(video_file,
                                               staf_folder=args.staf_dir,
                                               display=args.display)
        else:
            # run multi object tracker
            mot = MPT(
                device=device,
                batch_size=args.tracker_batch_size,
                display=args.display,
                detector_type=args.detector,
                output_format='dict',
                yolo_img_size=args.yolo_img_size,
            )
            tracking_results = mot(image_folder)

        # remove tracklets that are shorter than MIN_NUM_FRAMES
        for person_id in list(tracking_results.keys()):
            if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
                del tracking_results[person_id]

        # ========= Run VIBE on each person ========= #
        print(f'Running VIBE on each tracklet...')
        vibe_results = {}
        for person_id in tqdm(list(tracking_results.keys())):
            bboxes = joints2d = None

            if args.tracking_method == 'bbox':
                bboxes = tracking_results[person_id]['bbox']
            elif args.tracking_method == 'pose':
                joints2d = tracking_results[person_id]['joints2d']

            frames = tracking_results[person_id]['frames']

            dataset = Inference(
                image_folder=image_folder,
                frames=frames,
                bboxes=bboxes,
                joints2d=joints2d,
                scale=bbox_scale,
            )

            bboxes = dataset.bboxes
            frames = dataset.frames
            has_keypoints = True if joints2d is not None else False

            dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16)

            with torch.no_grad():
                pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

                for batch in dataloader:
                    if has_keypoints:
                        batch, nj2d = batch
                        norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                    batch = batch.unsqueeze(0)
                    batch = batch.to(device)

                    batch_size, seqlen = batch.shape[:2]
                    output = model(batch)[-1]

                    pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))
                    pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
                    pred_pose.append(output['theta'][:, :, 3:75].reshape(batch_size * seqlen, -1))
                    pred_betas.append(output['theta'][:, :, 75:].reshape(batch_size * seqlen, -1))
                    pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))

                pred_cam = torch.cat(pred_cam, dim=0)
                pred_verts = torch.cat(pred_verts, dim=0)
                pred_pose = torch.cat(pred_pose, dim=0)
                pred_betas = torch.cat(pred_betas, dim=0)
                pred_joints3d = torch.cat(pred_joints3d, dim=0)

                del batch

            # ========= [Optional] run Temporal SMPLify to refine the results ========= #
            if args.run_smplify and args.tracking_method == 'pose':
                norm_joints2d = np.concatenate(norm_joints2d, axis=0)
                norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
                norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

                # Run Temporal SMPLify
                update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
                new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                    pred_rotmat=pred_pose,
                    pred_betas=pred_betas,
                    pred_cam=pred_cam,
                    j2d=norm_joints2d,
                    device=device,
                    batch_size=norm_joints2d.shape[0],
                    pose2aa=False,
                )

                # update the parameters after refinement
                print(f'Update ratio after Temporal SMPLify: '
                      f'{update.sum()} / {norm_joints2d.shape[0]}')
                pred_verts = pred_verts.cpu()
                pred_cam = pred_cam.cpu()
                pred_pose = pred_pose.cpu()
                pred_betas = pred_betas.cpu()
                pred_joints3d = pred_joints3d.cpu()

                pred_verts[update] = new_opt_vertices[update]
                pred_cam[update] = new_opt_cam[update]
                pred_pose[update] = new_opt_pose[update]
                pred_betas[update] = new_opt_betas[update]
                pred_joints3d[update] = new_opt_joints3d[update]

            elif args.run_smplify and args.tracking_method == 'bbox':
                print('[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!')
                print('[WARNING] Continuing without running Temporal SMPLify!..')

            # ========= Save results to a pickle file ========= #
            pred_cam = pred_cam.cpu().numpy()
            pred_verts = pred_verts.cpu().numpy()
            pred_pose = pred_pose.cpu().numpy()
            pred_betas = pred_betas.cpu().numpy()
            pred_joints3d = pred_joints3d.cpu().numpy()

            orig_cam = convert_crop_cam_to_orig_img(
                cam=pred_cam,
                bbox=bboxes,
                img_width=orig_width,
                img_height=orig_height
            )

            output_dict = {
                'pred_cam': pred_cam,
                'orig_cam': orig_cam,
                'verts': pred_verts,
                'pose': pred_pose,
                'betas': pred_betas,
                'joints3d': pred_joints3d,
                'joints2d': joints2d,
                'bboxes': bboxes,
                'frame_ids': frames,
            }

            vibe_results[person_id] = output_dict

        # Save the outputs to a joblib .pkl file.
        # The file is loaded back through joblib.load(pkl_path).
        output_pkl_path = osp.join(args.output_folder, f'{filename}.pkl')
        print(f'Saving output results to \"{output_pkl_path}\".')
        joblib.dump(vibe_results, output_pkl_path)

        if not args.no_render:
            # ========= Render results as a single video ========= #
            renderer = Renderer(resolution=(orig_width, orig_height),
                                orig_img=True,
                                wireframe=args.wireframe)

            output_img_folder = f'{image_folder}_output'
            os.makedirs(output_img_folder, exist_ok=True)

            print(f'Rendering output video, writing frames to {output_img_folder}')

            # prepare results for rendering
            frame_results = prepare_rendering_results(vibe_results, num_frames)
            mesh_color = {
                k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
                for k in vibe_results.keys()
            }

            image_file_names = sorted([
                os.path.join(image_folder, x) for x in os.listdir(image_folder)
                if x.endswith('.png') or x.endswith('.jpg')
            ])

            for frame_idx in tqdm(range(len(image_file_names))):
                img_fname = image_file_names[frame_idx]
                img = cv2.imread(img_fname)

                if args.sideview:
                    side_img = np.zeros_like(img)

                for person_id, person_data in frame_results[frame_idx].items():
                    frame_verts = person_data['verts']
                    frame_cam = person_data['cam']

                    mc = mesh_color[person_id]

                    mesh_filename = None
                    if args.save_obj:
                        mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}')
                        os.makedirs(mesh_folder, exist_ok=True)
                        mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj')

                    img = renderer.render(
                        img,
                        frame_verts,
                        cam=frame_cam,
                        color=mc,
                        mesh_filename=mesh_filename,
                    )

                    if args.sideview:
                        side_img = renderer.render(
                            side_img,
                            frame_verts,
                            cam=frame_cam,
                            color=mc,
                            angle=270,
                            axis=[0, 1, 0],
                        )

                if args.sideview:
                    img = np.concatenate([img, side_img], axis=1)

                cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

                if args.display:
                    cv2.imshow('Video', img)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break

            if args.display:
                cv2.destroyAllWindows()

            # ========= Save rendered video ========= #
            vid_name = os.path.basename(video_file)
            save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4'
            save_name = os.path.join(output_path, save_name)
            print(f'Saving result video to {save_name}')
            images_to_video(img_folder=output_img_folder, output_vid_file=save_name)
            shutil.rmtree(output_img_folder)

        # Clean up the temporary frame folder after processing this video.
        shutil.rmtree(image_folder)

    del model
    print('================= END =================')
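# As the comment above notes, the saved results can be read back with
# joblib.load. A small inspection sketch (the .pkl path is hypothetical):
def inspect_vibe_pkl(pkl_path='output/sample_video.pkl'):
    results = joblib.load(pkl_path)  # {person_id: output_dict}
    for person_id, d in results.items():
        print(person_id,
              'pose', d['pose'].shape,        # (T, 72)
              'betas', d['betas'].shape,      # (T, 10)
              'frames', d['frame_ids'].shape)
    return results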
def main(args):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    video_file = 'test.mp4'
    output_folder = 'output/'
    output_path = os.path.join(output_folder,
                               os.path.basename(video_file).replace('.mp4', ''))
    os.makedirs(output_path, exist_ok=True)
    print(f'The output path is: {output_path}')

    # generative() is a project-specific helper that prepares the frame folder.
    image_folder, num_frames, img_shape = generative()
    print(f'image_folder is: {image_folder}, num_frames is: {num_frames}, '
          f'img_shape is: {img_shape}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # ========= Run tracking ========= #
    # tracking_method = bbox
    bbox_scale = 1.1
    # run multi object tracker
    mot = MPT(
        device=device,
        batch_size=12,
        display=False,
        detector_type='yolo',
        output_format='dict',
        yolo_img_size=416,  # 416 x 416 detector input
    )
    tracking_results = mot(image_folder)
    print(f'output the result of tracking -> {tracking_results}')

    # remove tracklets that are shorter than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    print(f'Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = joints2d = None

        # bboxes: (n, 4)
        bboxes = tracking_results[person_id]['bbox']
        # frames: (n,)
        frames = tracking_results[person_id]['frames']

        # A dataset of normalized image crops, with overall shape T x 224 x 224 x 3.
        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,  # None: bbox tracking only
            scale=bbox_scale,
        )

        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = True if joints2d is not None else False

        dataloader = DataLoader(dataset, batch_size=64, num_workers=16)

        with torch.no_grad():
            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

            for batch in dataloader:
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                # Add a leading dimension at dim=0: 1 x T x 224 x 224 x 3.
                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                # seqlen is the length of the temporal image sequence.
                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))     # (1*T, 3)
                pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(batch_size * seqlen, -1))  # (1*T, 72)
                pred_betas.append(output['theta'][:, :, 75:].reshape(batch_size * seqlen, -1))  # (1*T, 10)
                pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))

            pred_cam = torch.cat(pred_cam, dim=0)            # (T, 3)
            pred_verts = torch.cat(pred_verts, dim=0)        # (T, V, 3)
            pred_pose = torch.cat(pred_pose, dim=0)          # (T, 72)
            pred_betas = torch.cat(pred_betas, dim=0)        # (T, 10)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)  # (T, J, 3)

            del batch

        # ========= Save results to a pickle file ========= #
        # Convert from torch tensors to numpy arrays.
        pred_cam = pred_cam.cpu().numpy()            # (T, 3)
        pred_verts = pred_verts.cpu().numpy()        # (T, V, 3)
        pred_pose = pred_pose.cpu().numpy()          # (T, 72)
        pred_betas = pred_betas.cpu().numpy()        # (T, 10)
        pred_joints3d = pred_joints3d.cpu().numpy()  # (T, J, 3)

        # Result: (T, 4)
        orig_cam = convert_crop_cam_to_orig_img(
            cam=pred_cam,            # predicted camera, (T, 3)
            bbox=bboxes,             # detector boxes, (T, 4)
            img_width=orig_width,    # original image width
            img_height=orig_height,  # original image height
        )

        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

        vibe_results[person_id] = output_dict

    del model

    end = time.time()
    fps = num_frames / (end - vibe_time)
    print(f'VIBE FPS: {fps:.2f}')

    total_time = time.time() - total_time
    print(f'Total time spent: {total_time:.2f} seconds (including model loading time).')
    print(f'Total FPS (including model loading time): {num_frames / total_time:.2f}.')

    print(f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".')
    joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl"))

    if not args.no_render:
        # ========= Render results as a single video ========= #
        # Renderer is a utility that takes the original resolution; it just
        # needs the parameters passed correctly.
        renderer = Renderer(resolution=(orig_width, orig_height),
                            orig_img=True,
                            wireframe=args.wireframe)

        output_img_folder = f'{image_folder}_output'
        os.makedirs(output_img_folder, exist_ok=True)

        print(f'Rendering output video, writing frames to {output_img_folder}')

        frame_results = prepare_rendering_results(vibe_results, num_frames)
        mesh_color = {
            k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
            for k in vibe_results.keys()
        }

        image_file_names = sorted([
            os.path.join(image_folder, x) for x in os.listdir(image_folder)
            if x.endswith('.png') or x.endswith('.jpg')
        ])

        for frame_idx in tqdm(range(len(image_file_names))):
            img_fname = image_file_names[frame_idx]
            # read the frame
            img = cv2.imread(img_fname)

            if args.sideview:
                side_img = np.zeros_like(img)

            for person_id, person_data in frame_results[frame_idx].items():
                frame_verts = person_data['verts']
                frame_cam = person_data['cam']

                mc = mesh_color[person_id]

                # save the 3D mesh as .obj
                mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}')
                os.makedirs(mesh_folder, exist_ok=True)
                mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj')

                # the output is a rendered image
                img = renderer.render(
                    img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    mesh_filename=mesh_filename,
                )

                # the side view is only rendered when --sideview is set
                if args.sideview:
                    side_img = renderer.render(
                        side_img,
                        frame_verts,
                        cam=frame_cam,
                        color=mc,
                        angle=270,
                        axis=[0, 1, 0],
                    )

            if args.sideview:
                img = np.concatenate([img, side_img], axis=1)

            cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

            if args.display:
                cv2.imshow('Video', img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        if args.display:
            cv2.destroyAllWindows()

    print('================= END =================')
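# For reference, mapping the weak-perspective camera predicted in the crop
# back to the original image is typically implemented along these lines. This
# is a sketch of what convert_crop_cam_to_orig_img is assumed to do; bbox is
# (N, 4) as (cx, cy, h, w) and cam is (N, 3) as (s, tx, ty):
def crop_cam_to_orig_img(cam, bbox, img_width, img_height):
    cx, cy, h = bbox[:, 0], bbox[:, 1], bbox[:, 2]
    hw, hh = img_width / 2., img_height / 2.
    sx = cam[:, 0] * (1. / (img_width / h))   # rescale crop scale to image x
    sy = cam[:, 0] * (1. / (img_height / h))  # rescale crop scale to image y
    tx = ((cx - hw) / hw / sx) + cam[:, 1]    # shift by bbox center in NDC
    ty = ((cy - hh) / hh / sy) + cam[:, 2]
    return np.stack([sx, sy, tx, ty]).T       # (N, 4) orig_cam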
def main(args):
    if args.device == 'cpu':
        device = torch.device('cpu')
        print('Running on CPU')
    else:
        device = torch.device('cuda')
        print('Running on GPU')

    if args.vid_file:
        video_file = args.vid_file
        if not os.path.isfile(video_file):
            exit(f'Input video \"{video_file}\" does not exist!')
    else:
        # NOTE: only the video path is handled downstream; image_file is
        # validated here but not otherwise used.
        image_file = args.img_file
        if not os.path.isfile(image_file):
            exit(f'Input image \"{image_file}\" does not exist!')

    output_path = os.path.join(args.output_folder,
                               os.path.basename(video_file).replace('.mp4', ''))
    # output_path = os.path.join(args.output_folder, os.path.basename(video_file).split('.')[0])
    os.makedirs(output_path, exist_ok=True)

    image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # resize the video if it is too big:
    # ffmpeg -i input.avi -filter:v scale=720:-1 -c:a copy output.mkv

    # ========= Run tracking ========= #
    bbox_scale = 1.1
    if args.tracking_method == 'pose':
        if not os.path.isabs(video_file):
            video_file = os.path.join(os.getcwd(), video_file)
        tracking_results = run_posetracker(video_file,
                                           staf_folder=args.staf_dir,
                                           display=args.display)
    else:
        # run multi object tracker
        mot = MPT(
            device=device,
            batch_size=args.tracker_batch_size,
            display=args.display,
            detector_type=args.detector,
            output_format='dict',
            yolo_img_size=args.yolo_img_size,
        )
        tracking_results = mot(image_folder)

    # import pdb; pdb.set_trace()  # disabled debugging hook

    # remove tracklets that are shorter than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=True)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    print(f'Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = joints2d = None

        if args.tracking_method == 'bbox':
            bboxes = tracking_results[person_id]['bbox']
        elif args.tracking_method == 'pose':
            joints2d = tracking_results[person_id]['joints2d']

        frames = tracking_results[person_id]['frames']

        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,
            scale=bbox_scale,
        )

        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = True if joints2d is not None else False

        dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16)

        with torch.no_grad():
            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

            for batch in dataloader:
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :, 75:].reshape(batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)

            del batch

        # ========= [Optional] run Temporal SMPLify to refine the results ========= #
        if args.run_smplify and args.tracking_method == 'pose':
            norm_joints2d = np.concatenate(norm_joints2d, axis=0)
            norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
            norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

            # Run Temporal SMPLify
            update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
            new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                pred_rotmat=pred_pose,
                pred_betas=pred_betas,
                pred_cam=pred_cam,
                j2d=norm_joints2d,
                device=device,
                batch_size=norm_joints2d.shape[0],
                pose2aa=False,
            )

            # update the parameters after refinement
            print(f'Update ratio after Temporal SMPLify: '
                  f'{update.sum()} / {norm_joints2d.shape[0]}')
            pred_verts = pred_verts.cpu()
            pred_cam = pred_cam.cpu()
            pred_pose = pred_pose.cpu()
            pred_betas = pred_betas.cpu()
            pred_joints3d = pred_joints3d.cpu()

            pred_verts[update] = new_opt_vertices[update]
            pred_cam[update] = new_opt_cam[update]
            pred_pose[update] = new_opt_pose[update]
            pred_betas[update] = new_opt_betas[update]
            pred_joints3d[update] = new_opt_joints3d[update]

        elif args.run_smplify and args.tracking_method == 'bbox':
            print('[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!')
            print('[WARNING] Continuing without running Temporal SMPLify!..')

        # ========= Save results to a pickle file ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                bbox=bboxes,
                                                img_width=orig_width,
                                                img_height=orig_height)

        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

        vibe_results[person_id] = output_dict

    del model

    end = time.time()
    fps = num_frames / (end - vibe_time)
    print(f'VIBE FPS: {fps:.2f}')

    total_time = time.time() - total_time
    print(f'Total time spent: {total_time:.2f} seconds (including model loading time).')
    print(f'Total FPS (including model loading time): {num_frames / total_time:.2f}.')

    print(f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".')
    joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl"))

    if not args.no_render:
        # ========= Render results as a single video ========= #
        renderer = Renderer(resolution=(orig_width, orig_height),
                            orig_img=True,
                            wireframe=args.wireframe)

        output_img_folder = f'{image_folder}_images'
        os.makedirs(output_img_folder, exist_ok=True)
        print(f'Rendering output video, writing frames to {output_img_folder}')

        output_pose_folder = f'{image_folder}_poses'
        os.makedirs(output_pose_folder, exist_ok=True)
        print(f'Saving poses to {output_pose_folder}')

        # prepare results for rendering
        from numpy import save
        # save the first 25 joints of tracklet 1 (assumes that id exists)
        poses_npy = f'{os.path.basename(video_file)}_poses.npy'
        save(poses_npy, vibe_results[1]['joints3d'][:, :25, :])
        print('Saving numpy poses file to ' + poses_npy)

        # returns a list of dicts (one dict per person, per frame)
        frame_results = prepare_rendering_results(vibe_results, num_frames)
        mesh_color = {
            k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
            for k in vibe_results.keys()
        }

        image_file_names = sorted([
            os.path.join(image_folder, x) for x in os.listdir(image_folder)
            if x.endswith('.png') or x.endswith('.jpg')
        ])

        for frame_idx in tqdm(range(len(image_file_names))):
            img_fname = image_file_names[frame_idx]
            img = cv2.imread(img_fname)

            if args.sideview:
                side_img = np.zeros_like(img)

            for person_id, person_data in frame_results[frame_idx].items():
                frame_verts = person_data['verts']
                frame_cam = person_data['cam']
                frame_pose = person_data['joints3d'][:25]

                mc = mesh_color[person_id]

                mesh_filename = None
                if args.save_obj:
                    mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}')
                    os.makedirs(mesh_folder, exist_ok=True)
                    mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj')

                # bgr image (opencv format)
                img = renderer.render(
                    img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    mesh_filename=mesh_filename,
                )

                # import pdb; pdb.set_trace()
                # Create a 3D projection and save as img; the pose is mirrored
                # plot_skeleton(output_pose_folder, frame_idx, frame_pose)

                if args.sideview:
                    side_img = renderer.render(
                        side_img,
                        frame_verts,
                        cam=frame_cam,
                        color=mc,
                        angle=270,
                        axis=[0, 1, 0],
                    )

            if args.sideview:
                img = np.concatenate([img, side_img], axis=1)

            # concatenate the pose img with this image before writing
            cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

            if args.display:
                cv2.imshow('Video', img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        if args.display:
            cv2.destroyAllWindows()

        # ========= Save rendered video ========= #
        vid_name = os.path.basename(video_file)
        save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4'
        save_name = os.path.join(output_path, save_name)
        print(f'Saving result video to {save_name}')
        images_to_video(img_folder=output_img_folder, output_vid_file=save_name)
        # shutil.rmtree(output_img_folder)

    shutil.rmtree(image_folder)
    print('================= END =================')
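# The commented-out plot_skeleton call above is not defined in this snippet.
# A minimal, hypothetical stand-in using matplotlib's 3D projection could look
# like this (joints3d is (25, 3); the axis limits are guesses):
def plot_skeleton(output_folder, frame_idx, joints3d):
    import matplotlib
    matplotlib.use('Agg')  # render off-screen
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers '3d')

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(joints3d[:, 0], joints3d[:, 1], joints3d[:, 2])
    ax.set_xlim(-1, 1)
    ax.set_ylim(-1, 1)
    ax.set_zlim(-1, 1)
    fig.savefig(os.path.join(output_folder, f'{frame_idx:06d}_pose.png'))
    plt.close(fig)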
def runDemo(image_folder, output_folder, pretrained,
            tracker_batch_size=12, vibe_batch_size=450, wireframe=False):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    output_path = os.path.join(output_folder,
                               os.path.basename(image_folder).replace('.mp4', ''))
    os.makedirs(output_path, exist_ok=True)

    num_frames, img_shape = img_folder_Info(image_folder)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # ========= Run tracking ========= #
    bbox_scale = 1.0
    # run multi object tracker
    mot = MPT(
        device=device,
        batch_size=tracker_batch_size,
        display=False,
        detector_type='yolo',
        output_format='dict',
        yolo_img_size=416,
    )
    tracking_results = mot(image_folder)

    # remove tracklets that are shorter than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Define VIBE model ========= #
    model = e2e_VIBE(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = pretrained
    ckpt = torch.load(pretrained_file)
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    print(f'Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    time_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        person_start_time = time.time()
        joints2d = None

        bboxes = tracking_results[person_id]['bbox']
        frames = tracking_results[person_id]['frames']

        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,
            scale=bbox_scale,
        )

        bboxes = dataset.bboxes
        frames = dataset.frames

        dataloader = DataLoader(dataset, batch_size=vibe_batch_size, num_workers=16)

        with torch.no_grad():
            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

            for batch in dataloader:
                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]
                # output = model(batch, J_regressor=J_regressor)[-1]

                pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :, 75:].reshape(batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)

            del batch

        person_time = time.time() - person_start_time
        person_frame = len(frames)
        print(f'Person Time: {person_time:.2f}, Person FPS: {person_frame / person_time:.2f}')

        # ========= Save results to a pickle file ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        orig_cam = convert_crop_cam_to_orig_img(
            cam=pred_cam,
            bbox=bboxes,
            img_width=orig_width,
            img_height=orig_height
        )

        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

        vibe_results[person_id] = output_dict

    del model

    end = time.time()
    fps = num_frames / (end - vibe_time)
    print(f'VIBE FPS: {fps:.2f}')

    total_time = time.time() - total_time
    print(f'Total time spent: {total_time:.2f} seconds (including model loading time).')
    print(f'Total FPS (including model loading time): {num_frames / total_time:.2f}.')

    print(f'Saving output results to \"{os.path.join(output_folder, "vibe_output.pkl")}\".')
    joblib.dump(vibe_results, os.path.join(output_folder, "vibe_output.pkl"))

    # ========= Render results as a single video ========= #
    renderer = Renderer(resolution=(orig_width, orig_height),
                        orig_img=True,
                        wireframe=wireframe)

    output_img_folder = f'{image_folder}_output'
    os.makedirs(output_img_folder, exist_ok=True)

    print(f'Rendering output video, writing frames to {output_img_folder}')

    # prepare results for rendering
    frame_results = prepare_rendering_results(vibe_results, num_frames)
    mesh_color = {
        k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
        for k in vibe_results.keys()
    }

    image_file_names = sorted([
        os.path.join(image_folder, x) for x in os.listdir(image_folder)
        if x.endswith('.png') or x.endswith('.jpg')
    ])

    for frame_idx in tqdm(range(len(image_file_names))):
        img_fname = image_file_names[frame_idx]
        img = cv2.imread(img_fname)

        for person_id, person_data in frame_results[frame_idx].items():
            frame_verts = person_data['verts']
            frame_cam = person_data['cam']
            mc = mesh_color[person_id]

            mesh_filename = None
            img = renderer.render(
                img,
                frame_verts,
                cam=frame_cam,
                color=mc,
                mesh_filename=mesh_filename,
            )

        cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

    # ========= Save rendered video ========= #
    save_name = os.path.join(output_folder, 'vibe_result.mp4')
    print(f'Saving result video to {save_name}')
    images_to_video(img_folder=output_img_folder, output_vid_file=save_name)
    shutil.rmtree(output_img_folder)

    print('================= END =================')
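# Hypothetical invocation of runDemo (the checkpoint path is an assumption;
# e2e_VIBE weights are loaded from it via torch.load above):
#
#   runDemo(image_folder='tmp/sample_frames',
#           output_folder='output/',
#           pretrained='data/vibe_data/vibe_model_w_3dpw.pth.tar',
#           tracker_batch_size=12,
#           vibe_batch_size=450,
#           wireframe=False)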
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Configure depth and color streams
    pipeline = rs.pipeline()
    config = rs.config()
    config.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 30)
    config.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 30)

    # Start streaming
    pipeline.start(config)

    # ========= Run tracking ========= #
    bbox_scale = 1.1
    # run multi object tracker
    mot = MPT(
        device=device,
        batch_size=args.tracker_batch_size,
        display=args.display,
        detector_type=args.detector,
        output_format='dict',
        yolo_img_size=args.yolo_img_size,
    )

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= main loop ======================= #
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    # cap = cv2.VideoCapture('test.avi')
    # out = cv2.VideoWriter('output/test_5people.avi', fourcc, 30.0, (640, 360), True)
    # cap = cv2.VideoCapture('sample_video.mp4')
    out = cv2.VideoWriter('output/test_realsense_2.avi', fourcc, 10.0, (640, 480), True)

    # load renderer
    renderer = Renderer(resolution=(640, 480), orig_img=True, wireframe=args.wireframe)

    i = 0
    time_acc = 0.0
    while True:
        # Capture frame-by-frame
        total_time = time.time()
        frames = pipeline.wait_for_frames()
        frame_orig = frames.get_color_frame()

        # Convert images to numpy arrays
        frame_orig = np.asanyarray(frame_orig.get_data())
        # ret, frame_orig = cap.read()
        if frame_orig is None:
            break

        # for i in range(1, 300):
        #     total_time = time.time()
        #     path = os.path.join('tmp/sample_video/', f'{i:06d}.png')
        #     frame_orig = cv2.imread(path)

        orig_height, orig_width = frame_orig.shape[:2]

        frame = cv2.cvtColor(frame_orig, cv2.COLOR_BGR2RGB)
        frame = frame / 255.
        frame = frame.transpose((2, 0, 1))
        frame = torch.from_numpy(frame)
        frame = frame.unsqueeze(0)

        tracking_results = mot(frame)
        # print('tracking result', tracking_results)

        # print(f'Running VIBE on each tracklet...')
        vibe_time = time.time()
        vibe_results = {}
        for person_id in list(tracking_results.keys()):
            bboxes = joints2d = None

            bboxes = tracking_results[person_id]['bbox']  # shape (1, 4)
            # print('bboxes: ', bboxes)  # same across runs
            frames = tracking_results[person_id]['frames']

            dataset = Inference(frame=frame_orig, bboxes=bboxes, scale=bbox_scale)

            with torch.no_grad():
                pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

                batch = dataset
                batch = batch.unsqueeze(0).unsqueeze(0)
                batch = batch.to(device)
                # print(batch.shape)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :, 75:].reshape(batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))

                pred_cam = torch.cat(pred_cam, dim=0)
                pred_verts = torch.cat(pred_verts, dim=0)
                pred_pose = torch.cat(pred_pose, dim=0)
                pred_betas = torch.cat(pred_betas, dim=0)
                pred_joints3d = torch.cat(pred_joints3d, dim=0)

                del batch

            pred_cam = pred_cam.cpu().numpy()
            # print('pred_cam: ', pred_cam)  # differs across runs
            pred_verts = pred_verts.cpu().numpy()
            pred_pose = pred_pose.cpu().numpy()
            pred_betas = pred_betas.cpu().numpy()
            pred_joints3d = pred_joints3d.cpu().numpy()

            orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                    bbox=bboxes,
                                                    img_width=orig_width,
                                                    img_height=orig_height)
            # print('orig_cam', orig_cam.shape)

            output_dict = {
                'pred_cam': pred_cam,
                'orig_cam': orig_cam,
                'verts': pred_verts,
                'pose': pred_pose,
                'betas': pred_betas,
                'joints3d': pred_joints3d,
                'joints2d': joints2d,
                'bboxes': bboxes,
                'frame_ids': frames,
            }

            vibe_results[person_id] = output_dict

        # print('vibe_results orig_cam: ', vibe_results[1]['orig_cam'])
        # print('vibe_results pose: ', vibe_results[1]['pose'])

        end = time.time()
        fps = 1 / (end - vibe_time)
        print(f'VIBE FPS: {fps:.2f}')

        if not args.no_render:
            render_time = time.time()
            # load renderer
            # renderer = Renderer(resolution=(orig_width, orig_height), orig_img=True, wireframe=args.wireframe)

            # prepare results for rendering
            num_frames = 1
            # print('vibe_results: ', vibe_results)
            # vibe_results[1]['orig_cam'] = vibe_results[1]['orig_cam'][np.newaxis, :]
            # print('orig_cam: ', vibe_results[1]['orig_cam'].shape)
            frame_results = prepare_rendering_results(vibe_results, num_frames)
            # print('frame_results', frame_results)

            img = frame_orig
            mesh_color = {
                k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
                for k in vibe_results.keys()
            }
            # img = frame

            if args.sideview:
                side_img = np.zeros_like(img)

            for person_id, person_data in frame_results[0].items():
                frame_verts = person_data['verts']
                frame_cam = person_data['cam']
                # print('frame_cam', frame_cam)

                mc = mesh_color[person_id]
                mesh_filename = None

                img = renderer.render(
                    img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    mesh_filename=mesh_filename,
                )

            fps = 1 / (time.time() - render_time)
            print(f'RENDER FPS: {fps:.2f}')

            # img = img.numpy()
            out.write(img)

            if args.display:
                cv2.imshow('Video', img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        # per-frame timing accounting
        total_time = time.time() - total_time
        i += 1
        time_acc += total_time
        print('num of frame: ', i,
              f' Total time spent: {total_time:.2f} seconds (detect+track+vibe+render).')
        print(f'FPS: {1 / total_time:.2f}.')
        print('Total average FPS: ', i / time_acc)

    if args.display:
        cv2.destroyAllWindows()

    # ========= Save rendered video ========= #
    print('================= END =================')
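# The pipeline above enables the depth stream but only ever reads the color
# frame. If depth were needed (for example, to place meshes at a metric
# distance), pyrealsense2 can align it to the color viewport; a sketch:
def get_aligned_depth(pipeline):
    align = rs.align(rs.stream.color)  # align depth to the color viewport
    aligned = align.process(pipeline.wait_for_frames())
    depth_frame = aligned.get_depth_frame()
    # (480, 640) uint16 array; multiply by the device depth scale for meters
    return np.asanyarray(depth_frame.get_data())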
def main(args):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    """ Prepare input video (images) """
    video_file = args.vid_file
    if video_file.startswith('https://www.youtube.com'):
        print(f"Downloading YouTube video \'{video_file}\'")
        video_file = download_youtube_clip(video_file, '/tmp')
        if video_file is None:
            exit('Youtube url is not valid!')
        print(f"YouTube Video has been downloaded to {video_file}...")

    if not os.path.isfile(video_file):
        exit(f"Input video \'{video_file}\' does not exist!")

    output_path = osp.join('./output/demo_output',
                           os.path.basename(video_file).replace('.mp4', ''))
    Path(output_path).mkdir(parents=True, exist_ok=True)

    image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True)

    print(f"Input video number of frames {num_frames}\n")
    orig_height, orig_width = img_shape[:2]

    """ Run tracking """
    total_time = time.time()
    bbox_scale = 1.2
    # run multi object tracker
    mot = MPT(
        device=device,
        batch_size=args.tracker_batch_size,
        display=args.display,
        detector_type=args.detector,
        output_format='dict',
        yolo_img_size=args.yolo_img_size,
    )
    tracking_results = mot(image_folder)

    # remove tracklets that are shorter than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    """ Get TCMR model """
    seq_len = 16
    model = TCMR(seqlen=seq_len, n_layers=2, hidden_size=1024).to(device)

    # Load pretrained weights
    pretrained_file = args.model
    ckpt = torch.load(pretrained_file)
    print(f"Load pretrained weights from \'{pretrained_file}\'")
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)

    # Change mesh gender
    gender = args.gender  # 'neutral', 'male', 'female'
    model.regressor.smpl = SMPL(SMPL_MODEL_DIR,
                                batch_size=64,
                                create_transl=False,
                                gender=gender).to(device)
    model.eval()

    # Get feature extractor
    from lib.models.spin import hmr
    hmr = hmr().to(device)
    checkpoint = torch.load(osp.join(BASE_DATA_DIR, 'spin_model_checkpoint.pth.tar'))
    hmr.load_state_dict(checkpoint['model'], strict=False)
    hmr.eval()

    """ Run TCMR on each person """
    print("\nRunning TCMR on each person tracklet...")
    tcmr_time = time.time()
    tcmr_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = joints2d = None
        bboxes = tracking_results[person_id]['bbox']
        frames = tracking_results[person_id]['frames']

        # Prepare static image features
        dataset = CropDataset(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,
            scale=bbox_scale,
        )

        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = True if joints2d is not None else False

        norm_joints2d = []  # only filled when 2D keypoints are available
        crop_dataloader = DataLoader(dataset, batch_size=256, num_workers=16)
        with torch.no_grad():
            feature_list = []
            for i, batch in enumerate(crop_dataloader):
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                batch = batch.to(device)
                feature = hmr.feature_extractor(batch.reshape(-1, 3, 224, 224))
                feature_list.append(feature.cpu())

            del batch
            feature_list = torch.cat(feature_list, dim=0)

        # Encode temporal features and estimate 3D human mesh
        dataset = FeatureDataset(
            image_folder=image_folder,
            frames=frames,
            seq_len=seq_len,
        )
        dataset.feature_list = feature_list

        dataloader = DataLoader(dataset, batch_size=64, num_workers=32)
        with torch.no_grad():
            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

            for i, batch in enumerate(dataloader):
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                batch = batch.to(device)
                output = model(batch)[0][-1]

                pred_cam.append(output['theta'][:, :3])
                pred_verts.append(output['verts'])
                pred_pose.append(output['theta'][:, 3:75])
                pred_betas.append(output['theta'][:, 75:])
                pred_joints3d.append(output['kp_3d'])

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)
            del batch

        # ========= Save results to a pickle file ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        bboxes[:, 2:] = bboxes[:, 2:] * 1.2
        if args.render_plain:
            pred_cam[:, 0], pred_cam[:, 1:] = 1, 0  # np.array([[1, 0, 0]])
        orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                bbox=bboxes,
                                                img_width=orig_width,
                                                img_height=orig_height)

        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

        tcmr_results[person_id] = output_dict

    del model

    end = time.time()
    fps = num_frames / (end - tcmr_time)
    print(f'TCMR FPS: {fps:.2f}')

    total_time = time.time() - total_time
    print(f'Total time spent: {total_time:.2f} seconds (including model loading time).')
    print(f'Total FPS (including model loading time): {num_frames / total_time:.2f}.')

    if args.save_pkl:
        print(f"Saving output results to \'{os.path.join(output_path, 'tcmr_output.pkl')}\'.")
        joblib.dump(tcmr_results, os.path.join(output_path, "tcmr_output.pkl"))

    """ Render results as a single video """
    renderer = Renderer(resolution=(orig_width, orig_height),
                        orig_img=True,
                        wireframe=args.wireframe)

    output_img_folder = f'{image_folder}_output'
    input_img_folder = f'{image_folder}_input'
    os.makedirs(output_img_folder, exist_ok=True)
    os.makedirs(input_img_folder, exist_ok=True)

    print(f"\nRendering output video, writing frames to {output_img_folder}")

    # prepare results for rendering
    frame_results = prepare_rendering_results(tcmr_results, num_frames)
    mesh_color = {
        k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
        for k in tcmr_results.keys()
    }

    image_file_names = sorted([
        os.path.join(image_folder, x) for x in os.listdir(image_folder)
        if x.endswith('.png') or x.endswith('.jpg')
    ])

    for frame_idx in tqdm(range(len(image_file_names))):
        img_fname = image_file_names[frame_idx]
        img = cv2.imread(img_fname)
        input_img = img.copy()
        if args.render_plain:
            img[:] = 0

        if args.sideview:
            side_img = np.zeros_like(img)

        for person_id, person_data in frame_results[frame_idx].items():
            frame_verts = person_data['verts']
            frame_cam = person_data['cam']

            mesh_filename = None
            if args.save_obj:
                mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}')
                Path(mesh_folder).mkdir(parents=True, exist_ok=True)
                mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj')

            mc = mesh_color[person_id]
            img = renderer.render(
                img,
                frame_verts,
                cam=frame_cam,
                color=mc,
                mesh_filename=mesh_filename,
            )
            if args.sideview:
                side_img = renderer.render(
                    side_img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    angle=270,
                    axis=[0, 1, 0],
                )

        if args.sideview:
            img = np.concatenate([img, side_img], axis=1)

        # save output frames
        cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.jpg'), img)
        cv2.imwrite(os.path.join(input_img_folder, f'{frame_idx:06d}.jpg'), input_img)

        if args.display:
            cv2.imshow('Video', img)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    if args.display:
        cv2.destroyAllWindows()

    """ Save rendered video """
    vid_name = os.path.basename(video_file)
    save_name = f'tcmr_{vid_name.replace(".mp4", "")}_output.mp4'
    save_path = os.path.join(output_path, save_name)

    images_to_video(img_folder=output_img_folder, output_vid_file=save_path)
    images_to_video(img_folder=input_img_folder,
                    output_vid_file=os.path.join(output_path, vid_name))
    print(f"Saving result video to {os.path.abspath(save_path)}")

    shutil.rmtree(output_img_folder)
    shutil.rmtree(input_img_folder)
    shutil.rmtree(image_folder)
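# images_to_video is assumed to shell out to ffmpeg; a roughly equivalent
# sketch, matching the %06d.jpg frame naming used above:
import subprocess

def frames_to_mp4(img_folder, output_vid_file, fps=30):
    cmd = ['ffmpeg', '-y', '-framerate', str(fps),
           '-i', f'{img_folder}/%06d.jpg',
           '-c:v', 'libx264', '-pix_fmt', 'yuv420p', '-an',
           output_vid_file]
    subprocess.run(cmd, check=True)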
def render(orig_dim, frame_lis, vibe_results, image_folder, output_path, num_frames, args):
    orig_height, orig_width = orig_dim
    renderer = Renderer(resolution=(orig_width, orig_height),
                        orig_img=True,
                        wireframe=args.wireframe)

    output_img_folder = f'live_result_output'
    os.makedirs(output_img_folder, exist_ok=True)

    print(f'Rendering output video, writing frames to {output_img_folder}')

    # prepare results for rendering
    frame_results = prepare_rendering_results(vibe_results, num_frames)
    mesh_color = {
        k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
        for k in vibe_results.keys()
    }

    image_file_names = sorted([
        os.path.join(image_folder, x) for x in os.listdir(image_folder)
        if x.endswith('.png') or x.endswith('.jpg')
    ])

    for frame_idx in tqdm(range(len(image_file_names))):
        img_fname = image_file_names[frame_idx]
        img = cv2.imread(img_fname)

        if args.sideview:
            side_img = np.zeros_like(img)

        for person_id, person_data in frame_results[frame_idx].items():
            frame_verts = person_data['verts']
            frame_cam = person_data['cam']

            mc = mesh_color[person_id]

            mesh_filename = None
            if args.save_obj:
                mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}')
                os.makedirs(mesh_folder, exist_ok=True)
                mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj')

            img = renderer.render(
                img,
                frame_verts,
                cam=frame_cam,
                color=mc,
                mesh_filename=mesh_filename,
            )

            if args.sideview:
                side_img = renderer.render(
                    side_img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    angle=270,
                    axis=[0, 1, 0],
                )

        if args.sideview:
            img = np.concatenate([img, side_img], axis=1)

        cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

        if args.display:
            cv2.imshow('Video', img)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    if args.display:
        cv2.destroyAllWindows()

    # ========= Save rendered video ========= #
    save_name = f'live_vibe_result.mp4'
    save_name = os.path.join(output_path, save_name)
    print(f'Saving result video to {save_name}')
    images_to_video(img_folder=output_img_folder, output_vid_file=save_name)
    shutil.rmtree(output_img_folder)
    shutil.rmtree(image_folder)
def main(args):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    video_file = args.vid_file

    # ========= [Optional] download the youtube video ========= #
    if video_file.startswith('https://www.youtube.com'):
        print(f'Downloading YouTube video "{video_file}"')
        video_file = download_youtube_clip(video_file, '/tmp')

        if video_file is None:
            exit('Youtube url is not valid!')

        print(f'YouTube Video has been downloaded to {video_file}...')

    if not os.path.isfile(video_file):
        exit(f'Input video "{video_file}" does not exist!')

    output_path = os.path.join(args.output_folder,
                               os.path.basename(video_file).replace('.mp4', ''))
    os.makedirs(output_path, exist_ok=True)

    image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # ========= Run tracking ========= #
    bbox_scale = 1.1
    if args.tracking_method == 'pose':
        if not os.path.isabs(video_file):
            video_file = os.path.join(os.getcwd(), video_file)
        tracking_results = run_posetracker(video_file,
                                           staf_folder=args.staf_dir,
                                           display=args.display)
    else:
        # run multi object tracker
        mot = MPT(
            device=device,
            batch_size=args.tracker_batch_size,
            display=args.display,
            detector_type=args.detector,
            output_format='dict',
            yolo_img_size=args.yolo_img_size,
        )
        tracking_results = mot(image_folder)

    # remove tracklets if num_frames is less than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from "{pretrained_file}"')

    # ========= Run VIBE on each person ========= #
    print('Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = joints2d = None

        if args.tracking_method == 'bbox':
            bboxes = tracking_results[person_id]['bbox']
        elif args.tracking_method == 'pose':
            joints2d = tracking_results[person_id]['joints2d']

        frames = tracking_results[person_id]['frames']

        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,
            scale=bbox_scale,
        )

        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = joints2d is not None

        # reduce num_workers if you encounter the error:
        # "DLL load failed: The paging file is too small for this operation to complete"
        dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=8)

        with torch.no_grad():
            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

            for batch in dataloader:
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :, 75:].reshape(batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)

            del batch

        # ========= [Optional] run Temporal SMPLify to refine the results ========= #
        if args.run_smplify and args.tracking_method == 'pose':
            norm_joints2d = np.concatenate(norm_joints2d, axis=0)
            norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
            norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

            # Run Temporal SMPLify
            update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
                new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                    pred_rotmat=pred_pose,
                    pred_betas=pred_betas,
                    pred_cam=pred_cam,
                    j2d=norm_joints2d,
                    device=device,
                    batch_size=norm_joints2d.shape[0],
                    pose2aa=False,
                )

            # update the parameters after refinement
            print(f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}')
            pred_verts = pred_verts.cpu()
            pred_cam = pred_cam.cpu()
            pred_pose = pred_pose.cpu()
            pred_betas = pred_betas.cpu()
            pred_joints3d = pred_joints3d.cpu()
            pred_verts[update] = new_opt_vertices[update]
            pred_cam[update] = new_opt_cam[update]
            pred_pose[update] = new_opt_pose[update]
            pred_betas[update] = new_opt_betas[update]
            pred_joints3d[update] = new_opt_joints3d[update]

        elif args.run_smplify and args.tracking_method == 'bbox':
            print('[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!')
            print('[WARNING] Continuing without running Temporal SMPLify!..')

        # ========= Save results to a pickle file ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                bbox=bboxes,
                                                img_width=orig_width,
                                                img_height=orig_height)

        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

        vibe_results[person_id] = output_dict

    del model

    end = time.time()
    fps = num_frames / (end - vibe_time)

    print(f'VIBE FPS: {fps:.2f}')
    total_time = time.time() - total_time
    print(f'Total time spent: {total_time:.2f} seconds (including model loading time).')
    print(f'Total FPS (including model loading time): {num_frames / total_time:.2f}.')

    print(f'Saving output results to "{os.path.join(output_path, "vibe_output.pkl")}".')
    joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl"))

    if not args.no_render:
        # ========= Render results as a single video ========= #
        renderer = Renderer(resolution=(orig_width, orig_height),
                            orig_img=True,
                            wireframe=args.wireframe)

        output_img_folder = f'{image_folder}_output'
        os.makedirs(output_img_folder, exist_ok=True)

        print(f'Rendering output video, writing frames to {output_img_folder}')

        # prepare results for rendering
        frame_results = prepare_rendering_results(vibe_results, num_frames)
        mesh_color = {
            k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
            for k in vibe_results.keys()
        }

        image_file_names = sorted([
            os.path.join(image_folder, x) for x in os.listdir(image_folder)
            if x.endswith('.png') or x.endswith('.jpg')
        ])

        for frame_idx in tqdm(range(len(image_file_names))):
            img_fname = image_file_names[frame_idx]
            img = cv2.imread(img_fname)

            if args.sideview:
                side_img = np.zeros_like(img)

            for person_id, person_data in frame_results[frame_idx].items():
                frame_verts = person_data['verts']
                frame_cam = person_data['cam']
                mc = mesh_color[person_id]

                mesh_filename = None
                if args.save_obj:
                    mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}')
                    os.makedirs(mesh_folder, exist_ok=True)
                    mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj')

                img = renderer.render(
                    img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    mesh_filename=mesh_filename,
                )

                if args.sideview:
                    side_img = renderer.render(
                        side_img,
                        frame_verts,
                        cam=frame_cam,
                        color=mc,
                        angle=270,
                        axis=[0, 1, 0],
                    )

            if args.sideview:
                img = np.concatenate([img, side_img], axis=1)

            # overlay the frame index in the top-left corner
            font = cv2.FONT_HERSHEY_SIMPLEX
            x, y = 10, 20  # text position
            cv2.putText(img, str(frame_idx), (x, y), font, 0.55, (0, 255, 0), 1)

            cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

            if args.display:
                cv2.imshow('Video', img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        if args.display:
            cv2.destroyAllWindows()

        # ========= Save rendered video ========= #
        vid_name = os.path.basename(video_file)
        save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4'
        save_name = os.path.join(output_path, save_name)
        print(f'Saving result video to {save_name}')
        images_to_video(img_folder=output_img_folder, output_vid_file=save_name)

        shutil.rmtree(output_img_folder)
        shutil.rmtree(image_folder)

    # generate and save the joints csv file for animating avatars later
    output = joblib.load(os.path.join(output_path, "vibe_output.pkl"))
    for i in output.keys():
        print('Track ids:', i, end='\n\n')
    num_ppl = len(output.keys())

    print('VIBE output file content:', end='\n\n')

    vid_name = os.path.basename(video_file)
    vibe_result_folder = output_path

    # output the pose result as csv; filename format: videoName_personId_numFrames.csv
    pose_filename_list = []
    for i in output.keys():
        pose_filename = os.path.join(
            vibe_result_folder,
            f"{vid_name}_{i}_{output[i]['pose'].shape[0]}.csv")
        pose_filename_list.append(pose_filename)

        field_names = [str(idx) for idx in range(73)]  # 72 pose params + frame_id at column 0

        with open(pose_filename, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(field_names)
            for frame_id in range(len(output[i]['pose'])):
                output_data = [output[i]['frame_ids'][frame_id]]
                output_data.extend(output[i]['pose'][frame_id])
                writer.writerow(output_data)

    print('================= END =================')
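
# A small companion sketch for the csv export above: read one of the
# per-person pose files back into numpy arrays. It assumes only the layout
# written by main() (a header row '0'..'72', then one row per frame with the
# frame id in column 0 followed by the 72 pose parameters).
def load_pose_csv(csv_path):
    data = np.atleast_2d(np.loadtxt(csv_path, delimiter=',', skiprows=1))  # skip the header row
    frame_ids = data[:, 0].astype(int)
    poses = data[:, 1:]  # (num_frames, 72)
    return frame_ids, poses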
def main(args):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        device=device,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file, map_location=device)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from "{pretrained_file}"')

    total_time = time.time()

    # ========= Run VIBE on crops ========= #
    print('Running VIBE on crops...')
    vibe_time = time.time()
    image_folder = args.input_folder

    dataset = InferenceFromCrops(image_folder=image_folder)
    orig_height = orig_width = 512

    dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=0)

    with torch.no_grad():
        pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

        for batch_num, batch in enumerate(dataloader):
            print("BATCH:", batch_num)
            batch = batch.unsqueeze(0)
            batch = batch.to(device)

            batch_size, seqlen = batch.shape[:2]
            output = model(batch)[-1]

            pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))
            pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
            pred_pose.append(output['theta'][:, :, 3:75].reshape(batch_size * seqlen, -1))
            pred_betas.append(output['theta'][:, :, 75:].reshape(batch_size * seqlen, -1))
            pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))

        pred_cam = torch.cat(pred_cam, dim=0)
        pred_verts = torch.cat(pred_verts, dim=0)
        pred_pose = torch.cat(pred_pose, dim=0)
        pred_betas = torch.cat(pred_betas, dim=0)
        pred_joints3d = torch.cat(pred_joints3d, dim=0)

        del batch

    # ========= [Optional] run Temporal SMPLify to refine the results ========= #
    if args.run_smplify and args.tracking_method == 'pose':
        norm_joints2d = np.concatenate(norm_joints2d, axis=0)
        norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
        norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

        # Run Temporal SMPLify
        update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
            new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                pred_rotmat=pred_pose,
                pred_betas=pred_betas,
                pred_cam=pred_cam,
                j2d=norm_joints2d,
                device=device,
                batch_size=norm_joints2d.shape[0],
                pose2aa=False,
            )

        # update the parameters after refinement
        print(f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}')
        pred_verts = pred_verts.cpu()
        pred_cam = pred_cam.cpu()
        pred_pose = pred_pose.cpu()
        pred_betas = pred_betas.cpu()
        pred_joints3d = pred_joints3d.cpu()
        pred_verts[update] = new_opt_vertices[update]
        pred_cam[update] = new_opt_cam[update]
        pred_pose[update] = new_opt_pose[update]
        pred_betas[update] = new_opt_betas[update]
        pred_joints3d[update] = new_opt_joints3d[update]

    elif args.run_smplify and args.tracking_method == 'bbox':
        print('[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!')
        print('[WARNING] Continuing without running Temporal SMPLify!..')

    # ========= Save results to a pickle file ========= #
    output_path = image_folder.replace('cropped_frames', 'vibe_results')
    os.makedirs(output_path, exist_ok=True)

    pred_cam = pred_cam.cpu().numpy()
    pred_verts = pred_verts.cpu().numpy()
    pred_pose = pred_pose.cpu().numpy()
    pred_betas = pred_betas.cpu().numpy()
    pred_joints3d = pred_joints3d.cpu().numpy()

    vibe_results = {
        'pred_cam': pred_cam,
        'verts': pred_verts,
        'pose': pred_pose,
        'betas': pred_betas,
        'joints3d': pred_joints3d,
    }

    del model

    end = time.time()
    fps = len(dataset) / (end - vibe_time)

    print(f'VIBE FPS: {fps:.2f}')
    total_time = time.time() - total_time
    print(f'Total time spent: {total_time:.2f} seconds (including model loading time).')
    print(f'Total FPS (including model loading time): {len(dataset) / total_time:.2f}.')

    print(f'Saving vibe results to "{os.path.join(output_path, "vibe_results.pkl")}".')
    with open(os.path.join(output_path, "vibe_results.pkl"), 'wb') as f_save:
        pickle.dump(vibe_results, f_save)

    if not args.no_render:
        # ========= Render results as a single video ========= #
        renderer = Renderer(resolution=(orig_width, orig_height),
                            orig_img=True,
                            wireframe=args.wireframe)

        output_img_folder = os.path.join(output_path, 'vibe_images')
        os.makedirs(output_img_folder, exist_ok=True)

        print(f'Rendering output video, writing frames to {output_img_folder}')

        image_file_names = sorted([
            os.path.join(image_folder, x) for x in os.listdir(image_folder)
            if x.endswith('.png') or x.endswith('.jpg')
        ])

        for frame_idx in tqdm(range(len(image_file_names))):
            img_fname = image_file_names[frame_idx]
            img = cv2.imread(img_fname)

            frame_verts = vibe_results['verts'][frame_idx]
            frame_cam = vibe_results['pred_cam'][frame_idx]

            mesh_filename = None
            if args.save_obj:
                mesh_folder = os.path.join(output_path, 'vibe_meshes')
                os.makedirs(mesh_folder, exist_ok=True)
                mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj')

            rend_img = renderer.render(
                img,
                frame_verts,
                cam=frame_cam,
                mesh_filename=mesh_filename,
            )

            whole_img = rend_img

            if args.sideview:
                side_img_bg = np.zeros_like(img)
                side_rend_img90 = renderer.render(
                    side_img_bg,
                    frame_verts,
                    cam=frame_cam,
                    angle=90,
                    axis=[0, 1, 0],
                )
                side_rend_img270 = renderer.render(
                    side_img_bg,
                    frame_verts,
                    cam=frame_cam,
                    angle=270,
                    axis=[0, 1, 0],
                )

                if args.reposed_render:
                    smpl = SMPL('data/vibe_data', batch_size=1)
                    zero_pose = torch.from_numpy(np.zeros((1, pred_pose.shape[-1]))).float()
                    zero_pose[:, 0] = np.pi
                    pred_frame_betas = torch.from_numpy(pred_betas[frame_idx][None, :]).float()
                    with torch.no_grad():
                        reposed_smpl_output = smpl(betas=pred_frame_betas,
                                                   body_pose=zero_pose[:, 3:],
                                                   global_orient=zero_pose[:, :3])
                        reposed_verts = reposed_smpl_output.vertices
                        reposed_verts = reposed_verts.cpu().detach().numpy()

                    reposed_cam = np.array([0.9, 0, 0])
                    reposed_rend_img = renderer.render(side_img_bg,
                                                       reposed_verts[0],
                                                       cam=reposed_cam)
                    reposed_rend_img90 = renderer.render(side_img_bg,
                                                         reposed_verts[0],
                                                         cam=reposed_cam,
                                                         angle=90,
                                                         axis=[0, 1, 0])

                    top_row = np.concatenate([img, reposed_rend_img, reposed_rend_img90], axis=1)
                    bot_row = np.concatenate([rend_img, side_rend_img90, side_rend_img270], axis=1)
                    whole_img = np.concatenate([top_row, bot_row], axis=0)
                else:
                    top_row = np.concatenate([img, side_img_bg, side_img_bg], axis=1)
                    bot_row = np.concatenate([rend_img, side_rend_img90, side_rend_img270], axis=1)
                    whole_img = np.concatenate([top_row, bot_row], axis=0)

            # cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), whole_img)
            cv2.imwrite(os.path.join(output_img_folder, os.path.basename(img_fname)), whole_img)

        # ========= Save rendered video ========= #
        save_vid_path = os.path.join(output_path, 'vibe_video.mp4')
        print(f'Saving result video to {save_vid_path}')
        images_to_video(img_folder=output_img_folder, output_vid_file=save_vid_path)

    print('================= END =================')
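
# A possible command-line entry point for the crop-based demo above. The
# original argument parser is not included in this file, so the flag names and
# defaults below are assumptions that mirror the attributes read from `args`
# in main().
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--input_folder', type=str, required=True,
                        help='folder of cropped person frames (512x512 assumed above)')
    parser.add_argument('--vibe_batch_size', type=int, default=32)  # assumed default
    parser.add_argument('--tracking_method', type=str, default='bbox',
                        choices=['bbox', 'pose'])
    parser.add_argument('--run_smplify', action='store_true')
    parser.add_argument('--no_render', action='store_true')
    parser.add_argument('--wireframe', action='store_true')
    parser.add_argument('--sideview', action='store_true')
    parser.add_argument('--save_obj', action='store_true')
    parser.add_argument('--reposed_render', action='store_true')

    main(parser.parse_args())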