def main():
    parser = argparse.ArgumentParser(description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/train_FreiHAND_dataset.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # 1. Load network model
    num_joints = cfg.MODEL.NUM_JOINTS
    net_hm = Net_HM_HG(num_joints,
                       num_stages=cfg.MODEL.HOURGLASS.NUM_STAGES,
                       num_modules=cfg.MODEL.HOURGLASS.NUM_MODULES,
                       num_feats=cfg.MODEL.HOURGLASS.NUM_FEAT_CHANNELS)
    load_net_model(cfg.MODEL.PRETRAIN_WEIGHT.HM_NET_PATH, net_hm)
    device = cfg.MODEL.DEVICE
    net_hm.to(device)
    net_hm = net_hm.train()

    # 2. Load data
    dataset_val = build_dataset(cfg.TRAIN.DATASET, cfg.TRAIN.BACKGROUND_SET, cfg.TRAIN.DATA_SIZE)
    print('Perform dataloader...', end='')
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=cfg.MODEL.BATCH_SIZE,
        shuffle=True,
        num_workers=cfg.MODEL.NUM_WORKERS
    )
    print('done!')

    optimizer = optim.RMSprop(net_hm.parameters(), lr=10 ** -3)
    hm_loss = nn.MSELoss(reduction='sum')

    print('Entering loop...')
    num_epoch = 200
    for epoch in range(num_epoch):
        total_loss_train = 0.0
        tic = time.time()
        for i, batch in enumerate(data_loader_val):
            images, cam_params, pose_roots, pose_scales, image_ids = batch
            images, cam_params, pose_roots, pose_scales = \
                images.to(device), cam_params.to(device), pose_roots.to(device), pose_scales.to(device)

            # ground-truth heatmaps
            gt_heatmap = torch.Tensor().to(device)
            for img_id in image_ids:
                gt_heatmap = torch.cat((gt_heatmap, dataset_val.heatmap_gts_list[img_id].to(device)), 0)
            gt_heatmap = gt_heatmap.view(-1, 21, 64, 64)

            # forward and backward propagation
            optimizer.zero_grad()
            images = BHWC_to_BCHW(images)  # B x C x H x W
            images = normalize_image(images)
            est_hm_list, _ = net_hm(images)
            est_hm_list = est_hm_list[-1].to(device)
            loss = hm_loss(est_hm_list, gt_heatmap)
            loss.backward()
            optimizer.step()
            total_loss_train += loss.item()

        # record time
        toc = time.time()
        print('loss of epoch %2d: %6.2f, time: %0.4f s' % (int(epoch + 1), total_loss_train, toc - tic))

        # save model weights every epoch
        torch.save(net_hm.state_dict(), "net_hm.pth")
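# Note: the training loop above reads precomputed ground-truth heatmaps from
# dataset_val.heatmap_gts_list; the generation code lives in the dataset class.
# The helper below is only an illustrative sketch (the name render_gaussian_heatmap
# and the sigma value are assumptions, not part of this repository) of how a 64x64
# Gaussian heatmap for one joint is typically built from its 2D pixel location.
def render_gaussian_heatmap(uv, heatmap_size=64, image_size=256, sigma=1.0):
    """Sketch: render one joint's (u, v) pixel location as a 64x64 Gaussian heatmap."""
    import numpy as np
    # scale the joint location from image resolution to heatmap resolution
    u = uv[0] * heatmap_size / image_size
    v = uv[1] * heatmap_size / image_size
    xs = np.arange(heatmap_size)
    xx, yy = np.meshgrid(xs, xs)
    heatmap = np.exp(-((xx - u) ** 2 + (yy - v) ** 2) / (2.0 * sigma ** 2))
    return heatmap.astype(np.float32)  # shape (64, 64), peak at the joint location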
from hand_shape_pose.util.vis_pose_only import draw_2d_skeleton, draw_3d_skeleton

import matplotlib.pyplot as plt
import numpy as np

### specify inputs ###
config_file = "configs/eval_webcam.yaml"
K = [[291.00602819, 0, 139.59914484],
     [0, 292.75184403, 111.98793194],
     [0, 0, 1]]               # intrinsic camera matrix
pose_scale = 0.03             # hand pose scale ~3cm
cropped_dim = (480, 480)      # dimensions cropped from the original webcam image
resize_dim = (256, 256)       # input image dimensions accepted by the learning model
avg_per_frame = 1             # number of images averaged to help reduce noise
######################

cfg.merge_from_file(config_file)
cfg.freeze()

# Load trained network model
model = MLPPoseNetwork(cfg)
device = cfg.MODEL.DEVICE
model.to(device)
model.load_model(cfg, load_mlp=True)
model = model.eval()

# intrinsic camera parameter K and pose_scale
K = torch.tensor(K).to(device)
K = K.reshape((1, 3, 3))
pose_scale = torch.tensor(pose_scale).to(device).reshape((1, 1))  # ~3cm

# webcam settings - default image size [640x480]
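# For reference, the intrinsic matrix K above maps camera-space 3D points to pixel
# coordinates via the pinhole model: [u, v, 1]^T ~ K [x, y, z]^T. The commented snippet
# below is only a minimal illustration of that relation; it is not part of the webcam
# script, which continues past this excerpt, and the example point is made up.
# xyz_cam = torch.tensor([[0.0, 0.0, 0.5]]).to(device)   # a point 0.5 m in front of the camera
# uvw = torch.matmul(K[0], xyz_cam.t()).t()              # apply the intrinsics, shape (1, 3)
# uv = uvw[:, :2] / uvw[:, 2:]                           # divide by depth to get pixel coordinates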
def main():
    parser = argparse.ArgumentParser(
        description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # 1. Load network model
    path = 'webcam_output'
    mkdir(path)
    model = ShapePoseNetwork(cfg, path)
    device = cfg.MODEL.DEVICE
    model.to(device)
    model.load_model(cfg)

    mesh_renderer = renderer.MeshRenderer(model.hand_tri.astype('uint32'))

    # 2. Inference
    model.eval()
    cpu_device = torch.device("cpu")
    cap = cv2.VideoCapture(0)
    i = 0
    start = time.time()
    end = 0
    while True:
        ret, images = cap.read()
        images = cv2.resize(images, (256, 256), interpolation=cv2.INTER_AREA)
        images = torch.tensor(images)
        images = torch.unsqueeze(images, 0)
        images = images.to(device)
        with torch.no_grad():
            est_pose_uv = model(images)
        est_pose_uv = est_pose_uv.to(cpu_device)
        images = images.to(cpu_device)
        images = torch.squeeze(images, 0)
        est_pose_uv = np.asarray(est_pose_uv)
        images = images.numpy()
        est_pose_uv = est_pose_uv[0]

        skeleton_overlay = draw_2d_skeleton(images, est_pose_uv)
        cv2.imshow('result', skeleton_overlay)
        name = str(i) + '.jpg'
        cv2.imwrite(osp.join(path, name), skeleton_overlay)
        i = i + 1
        if cv2.waitKey(1) & 0xFF == ord('q'):
            end = time.time()
            break

    # When everything is done, release the capture
    cap.release()
    cv2.destroyAllWindows()
    print('FPS = {}'.format(i / (end - start)))
def main():
    parser = argparse.ArgumentParser(
        description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = osp.join(cfg.EVAL.SAVE_DIR, args.config_file)
    mkdir(output_dir)
    logger = setup_logger("hand_shape_pose_inference", output_dir,
                          filename='eval-' + get_logger_filename())
    logger.info(cfg)

    # 1. Load network model
    model = ShapePoseNetwork(cfg, output_dir)
    device = cfg.MODEL.DEVICE
    model.to(device)
    model.load_model(cfg)

    mesh_renderer = renderer.MeshRenderer(model.hand_tri.astype('uint32'))

    # 2. Load data
    dataset_val = build_dataset(cfg.EVAL.DATASET)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=cfg.MODEL.BATCH_SIZE,
        num_workers=cfg.MODEL.NUM_WORKERS)

    # 3. Inference
    model.eval()
    results_pose_cam_xyz = {}
    cpu_device = torch.device("cpu")
    logger.info("Evaluate on {} frames:".format(len(dataset_val)))
    for i, batch in enumerate(data_loader_val):
        images, cam_params, bboxes, pose_roots, pose_scales, image_ids = batch
        images, cam_params, bboxes, pose_roots, pose_scales = \
            images.to(device), cam_params.to(device), bboxes.to(device), pose_roots.to(device), pose_scales.to(device)
        with torch.no_grad():
            est_mesh_cam_xyz, est_pose_uv, est_pose_cam_xyz = \
                model(images, cam_params, bboxes, pose_roots, pose_scales)
        est_mesh_cam_xyz = [o.to(cpu_device) for o in est_mesh_cam_xyz]
        est_pose_uv = [o.to(cpu_device) for o in est_pose_uv]
        est_pose_cam_xyz = [o.to(cpu_device) for o in est_pose_cam_xyz]

        results_pose_cam_xyz.update({
            img_id.item(): result
            for img_id, result in zip(image_ids, est_pose_cam_xyz)
        })

        if i % cfg.EVAL.PRINT_FREQ == 0:
            # 4. evaluate pose estimation
            avg_est_error = dataset_val.evaluate_pose(results_pose_cam_xyz,
                                                      save_results=False)  # cm
            msg = 'Evaluate: [{0}/{1}]\tAverage pose estimation error: {2:.2f} (mm)'.format(
                len(results_pose_cam_xyz), len(dataset_val), avg_est_error * 10.0)
            logger.info(msg)

            # 5. visualize mesh and pose estimation
            if cfg.EVAL.SAVE_BATCH_IMAGES_PRED:
                file_name = '{}_{}.jpg'.format(osp.join(output_dir, 'pred'), i)
                logger.info("Saving image: {}".format(file_name))
                save_batch_image_with_mesh_joints(mesh_renderer, images.to(cpu_device), cam_params.to(cpu_device),
                                                  bboxes.to(cpu_device), est_mesh_cam_xyz, est_pose_uv,
                                                  est_pose_cam_xyz, file_name)

    # overall pose estimation evaluation
    assert len(results_pose_cam_xyz) == len(dataset_val), \
        "The number of estimation results (%d) is inconsistent with that of the ground truth (%d)." % \
        (len(results_pose_cam_xyz), len(dataset_val))

    avg_est_error = dataset_val.evaluate_pose(results_pose_cam_xyz,
                                              cfg.EVAL.SAVE_POSE_ESTIMATION, output_dir)  # cm
    logger.info("Overall:\tAverage pose estimation error: {0:.2f} (mm)".format(
        avg_est_error * 10.0))
def main():
    pyag.FAILSAFE = False
    parser = argparse.ArgumentParser(
        description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # 1. Load network model
    path = 'webcam_output_bbox'
    mkdir(path)
    model = ShapePoseNetwork(cfg, path)
    device = cfg.MODEL.DEVICE
    model.to(device)
    model.load_model(cfg)

    mesh_renderer = renderer.MeshRenderer(model.hand_tri.astype('uint32'))

    # 2. Load hand detector
    hand = test.hand_detector()

    # 3. Inference
    model.eval()
    cpu_device = torch.device("cpu")
    cap = cv2.VideoCapture(0)
    i = 0
    start = time.time()
    end = 0

    config_dict = {
        'y_up': 1.0,
        'y_low': 0.5,
        'x_up': 0.8,
        'x_low': 0.2,
        'std_threshold': 2e6,
        'buffer_size': 10
    }
    """
    y_up/y_low and x_up/x_low are the fractions of the image height and width within which
    keypoints are mapped to the screen, i.e. keypoints are mapped from a region of size
    (y_up - y_low) * img_height by (x_up - x_low) * img_width. If the keypoint falls outside
    this region, the cursor position does not change.
    std_threshold is the threshold on the spread of recent detections. If the spread is above
    the threshold (the hand is moving), the cursor follows the current detection. If the
    spread is below the threshold (the hand is static), the cursor position is the moving
    average of the last buffer_size detections.
    (A standalone sketch of this smoothing rule follows this script.)
    """
    prev_cursor_x = []
    prev_cursor_y = []
    prev_cursor = None
    screen_size = np.asarray(pyag.size())  # actual screen size; the mapping below assumes a 1920x1080 display

    while True:
        ret, frame = cap.read()
        img_h, img_w, _ = frame.shape
        bbox = hand.detect_hand(frame)
        bbox = bbox.astype(int)
        if bbox.size == 0:
            continue  # if no bbox is detected, skip keypoint detection

        images = frame[bbox[0][1]:bbox[0][3], bbox[0][0]:bbox[0][2]]
        bb_h, bb_w, _ = images.shape
        images = cv2.resize(images, (256, 256), interpolation=cv2.INTER_AREA)
        images = torch.tensor(images)
        images = torch.unsqueeze(images, 0)
        images = images.to(device)
        with torch.no_grad():
            est_pose_uv = model(images)
        est_pose_uv = est_pose_uv.to(cpu_device)
        est_pose_uv = np.asarray(est_pose_uv)
        est_pose_uv = est_pose_uv[0]

        # map keypoints from the resized crop back to the original frame
        est_pose_uv[:, 0] = est_pose_uv[:, 0] * bb_w / 256
        est_pose_uv[:, 1] = est_pose_uv[:, 1] * bb_h / 256
        est_pose_uv[:, 0] += bbox[0][0]
        est_pose_uv[:, 1] += bbox[0][1]

        # skip the frame if the root keypoint lies outside the mapped region
        if ((est_pose_uv[0, 1] > (img_h * config_dict['y_up'])) or
                (est_pose_uv[0, 1] < (img_h * config_dict['y_low'])) or
                (est_pose_uv[0, 0] > (img_w * config_dict['x_up'])) or
                (est_pose_uv[0, 0] < (img_w * config_dict['x_low']))):
            continue

        cursor_x = int((est_pose_uv[0, 0] - (img_w * config_dict['x_low'])) * 1920.
                       / (img_w * (config_dict['x_up'] - config_dict['x_low'])))
        cursor_y = int((est_pose_uv[0, 1] - (img_h * config_dict['y_low'])) * 1080.
                       / (img_h * (config_dict['y_up'] - config_dict['y_low'])))

        if len(prev_cursor_x) <= config_dict['buffer_size']:
            prev_cursor_x.append(cursor_x)
            prev_cursor_y.append(cursor_y)
        elif len(prev_cursor_x) > config_dict['buffer_size']:
            prev_cursor_x.append(cursor_x)
            prev_cursor_y.append(cursor_y)
            _ = prev_cursor_x.pop(0)
            _ = prev_cursor_y.pop(0)

        prev_cursor = np.column_stack((prev_cursor_x, prev_cursor_y))
        mean = np.mean(prev_cursor, 0)
        var_dist = np.var(np.sum(((prev_cursor - mean) / screen_size) ** 2, 1))
        if var_dist > config_dict['std_threshold']:
            pyag.moveTo(cursor_x, cursor_y)
        else:
            pyag.moveTo(int(mean[0]), int(mean[1]))

        skeleton_overlay = draw_2d_skeleton(frame, est_pose_uv)
        cv2.imshow('result', skeleton_overlay)
        # name = str(i) + '.jpg'
        # cv2.imwrite(osp.join(path, name), frame)
        i = i + 1
        if cv2.waitKey(1) & 0xFF == ord('q'):
            end = time.time()
            break

    # When everything is done, release the capture
    cap.release()
    cv2.destroyAllWindows()
    print('no. of frames = {}'.format(i))
    print('FPS = {}'.format(i / (end - start)))
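# The smoothing rule described in the docstring above can be restated as a standalone
# helper. This is only an illustrative sketch (smooth_cursor is not a function in this
# repository): keep a small buffer of recent cursor positions, follow the raw detection
# while the hand is moving, and use the buffer mean while it is static.
def smooth_cursor(buffer_xy, new_xy, screen_size, var_threshold, buffer_size=10):
    """Sketch: return the cursor position to use and the updated buffer."""
    import numpy as np
    buffer_xy = (buffer_xy + [new_xy])[-buffer_size:]  # append and keep the last buffer_size entries
    pts = np.asarray(buffer_xy, dtype=float)
    mean = pts.mean(axis=0)
    # variance of the normalized squared distances from the buffer mean
    var_dist = np.var(np.sum(((pts - mean) / np.asarray(screen_size)) ** 2, axis=1))
    if var_dist > var_threshold:
        return new_xy, buffer_xy                        # hand is moving: follow the raw detection
    return (int(mean[0]), int(mean[1])), buffer_xy      # hand is static: use the moving average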
def main():
    parser = argparse.ArgumentParser(
        description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/train_FreiHAND_dataset.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # Load data
    dataset_val = build_dataset(cfg.TRAIN.DATASET, cfg.TRAIN.BACKGROUND_SET, cfg.TRAIN.DATA_SIZE)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=cfg.MODEL.BATCH_SIZE,
        shuffle=True,
        num_workers=cfg.MODEL.NUM_WORKERS)

    # Load network model
    model = MLPPoseNetwork(cfg)
    device = cfg.MODEL.DEVICE
    model.to(device)
    model.load_model(cfg)
    model = model.train()

    # freeze the heatmap network weights
    for param in model.net_hm.parameters():
        param.requires_grad = False

    optimizer = optim.RMSprop(model.mlp.parameters(), lr=0.0001)
    # optimizer = optim.Adam(model.mlp.parameters(), lr=0.00001)
    pose_loss = nn.MSELoss(reduction='sum')

    num_epoch = 400
    for epoch in range(num_epoch):
        # reduce the learning rate every 50 epochs
        if epoch % 50 == 0 and epoch != 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] /= 10

        tic = time.time()
        total_loss_train = 0.0
        tot_loss_len = 0.0
        tot_loss_dir = 0.0

        # model training per batch
        for i, batch in enumerate(data_loader_val):
            images, cam_params, pose_roots, pose_scales, image_ids = batch
            images, cam_params, pose_roots, pose_scales = \
                images.to(device), cam_params.to(device), pose_roots.to(device), pose_scales.to(device)

            # ground-truth pose
            gt_pose_cam_xyz = torch.Tensor().to(device)
            for img_id in image_ids:
                gt_pose_cam_xyz = torch.cat(
                    (gt_pose_cam_xyz, dataset_val.pose_gts[img_id].to(device)), 0)
            gt_pose_cam_xyz = gt_pose_cam_xyz.view(-1, 21, 3)

            # forward propagation
            optimizer.zero_grad()
            _, est_pose_uv, est_pose_cam_xyz = model(images, cam_params, pose_scales,
                                                     pose_root=pose_roots)

            # bone constraint loss
            len_loss, dir_loss = bone_constraint_loss(est_pose_cam_xyz, gt_pose_cam_xyz, device)
            loss = pose_loss(est_pose_cam_xyz, gt_pose_cam_xyz) + len_loss + dir_loss

            # back propagation
            loss.backward()
            optimizer.step()
            total_loss_train += loss.item()
            tot_loss_len += len_loss.item()
            tot_loss_dir += dir_loss.item()

        # record time
        toc = time.time()
        print('loss of epoch %2d: %6.2f, L_len: %3.2f, L_dir: %5.2f, time: %0.4f s'
              % (int(epoch + 1), total_loss_train, tot_loss_len, tot_loss_dir, toc - tic))

        # save model weights every epoch
        torch.save(model.mlp.state_dict(), "mlp.pth")
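# bone_constraint_loss used above is defined elsewhere in this repository. The function
# below is only an illustrative sketch, not the repository's actual implementation: it
# assumes the standard 21-keypoint hand layout (wrist = 0, then four joints per finger)
# and computes an L1 penalty on bone lengths plus an L1 penalty on unit bone directions.
def bone_constraint_loss_sketch(est_pose, gt_pose, device):
    """Sketch: L1 loss on bone lengths plus L1 loss on unit bone directions."""
    # parent-child joint pairs for a 21-keypoint hand (assumed layout)
    bones = [(0, 1), (1, 2), (2, 3), (3, 4),          # thumb
             (0, 5), (5, 6), (6, 7), (7, 8),          # index
             (0, 9), (9, 10), (10, 11), (11, 12),     # middle
             (0, 13), (13, 14), (14, 15), (15, 16),   # ring
             (0, 17), (17, 18), (18, 19), (19, 20)]   # little
    parents = torch.tensor([b[0] for b in bones], device=device)
    children = torch.tensor([b[1] for b in bones], device=device)

    est_bones = est_pose[:, children] - est_pose[:, parents]   # B x 20 x 3
    gt_bones = gt_pose[:, children] - gt_pose[:, parents]

    est_len = est_bones.norm(dim=-1)
    gt_len = gt_bones.norm(dim=-1)
    len_loss = (est_len - gt_len).abs().sum()

    est_dir = est_bones / (est_len.unsqueeze(-1) + 1e-8)
    gt_dir = gt_bones / (gt_len.unsqueeze(-1) + 1e-8)
    dir_loss = (est_dir - gt_dir).abs().sum()
    return len_loss, dir_loss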
def main():
    parser = argparse.ArgumentParser(
        description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = osp.join(cfg.EVAL.SAVE_DIR, args.config_file)
    mkdir(output_dir)
    logger = setup_logger("hand_shape_pose_inference", output_dir,
                          filename='eval-' + get_logger_filename())
    logger.info(cfg)

    # 1. Load network model
    model = ShapePoseNetwork(cfg, output_dir)
    device = cfg.MODEL.DEVICE
    model.to(device)
    model.load_model(cfg)
    faces = model.hand_tri.astype('uint32')

    mesh_renderer = renderer.MeshRenderer(model.hand_tri.astype('uint32'))

    # 2. Load data
    dataset_val = build_dataset(cfg.EVAL.DATASET)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=cfg.MODEL.BATCH_SIZE,
        num_workers=cfg.MODEL.NUM_WORKERS)

    # try adding skeleton detection
    import os, sys
    lib_path = os.path.abspath(os.path.join('..', 'HandKeyPointDetector'))
    sys.path.append(lib_path)
    from HandKeypointDetector import HandKeypointDetector
    hd = HandKeypointDetector(r'\output')
    count = 0

    # 3. Inference
    model.eval()
    results_pose_cam_xyz = {}
    cpu_device = torch.device("cpu")
    logger.info("Evaluate on {} frames:".format(len(dataset_val)))
    for i, batch in enumerate(data_loader_val):
        images, cam_params, bboxes, pose_roots, pose_scales, image_ids = batch
        new_images = torch.zeros([images.shape[0], 256, 256, images.shape[3]],
                                 dtype=images.dtype)
        import time
        cur = time.time()
        if images[0].shape[0] > 256 and images[0].shape[1] > 256:
            for j in range(len(images)):
                # detect a hand bounding box and pad the shorter side to make it square
                bb = hd.detectKeyPoints(images[j])
                Ydif = bb['maxY'] - bb['minY']
                Xdif = bb['maxX'] - bb['minX']
                if Xdif > Ydif:
                    bb['maxY'] += abs(Ydif - Xdif) // 2
                    bb['minY'] -= abs(Ydif - Xdif) // 2
                else:
                    bb['maxX'] += abs(Ydif - Xdif) // 2
                    bb['minX'] -= abs(Ydif - Xdif) // 2
                t_img = images[j][bb['minY']:bb['maxY'], bb['minX']:bb['maxX'], :]
                # import matplotlib.pyplot as plt
                # plt.close('all')
                # plt.imshow(t_img.detach().cpu().numpy())
                # plt.savefig('a.png')
                import torch.nn.functional as F
                downsampled = F.interpolate(
                    t_img.float().unsqueeze(0).permute(0, 3, 1, 2),
                    size=(256, 256),
                    mode='bilinear').permute(0, 2, 3, 1).byte()
                new_images[j] = downsampled[0]
                import matplotlib.pyplot as plt
                plt.imshow(downsampled.squeeze().detach().cpu().numpy())
                plt.savefig(osp.join('output', '{}.png'.format(j)))
            images = new_images

        images, cam_params, bboxes, pose_roots, pose_scales = \
            images.to(device), cam_params.to(device), bboxes.to(device), pose_roots.to(device), pose_scales.to(device)
        with torch.no_grad():
            est_mesh_cam_xyz, est_pose_uv, est_pose_cam_xyz = \
                model(images, cam_params, bboxes, pose_roots, pose_scales)

        elapsed = (time.time() - cur) / data_loader_val.batch_size
        print('average run time per frame = {:.03f} sec'.format(elapsed))

        est_mesh_cam_xyz = [o.to(cpu_device) for o in est_mesh_cam_xyz]
        est_pose_uv = [o.to(cpu_device) for o in est_pose_uv]
        est_pose_cam_xyz = [o.to(cpu_device) for o in est_pose_cam_xyz]

        results_pose_cam_xyz.update({
            img_id.item(): result
            for img_id, result in zip(image_ids, est_pose_cam_xyz)
        })

        if i % cfg.EVAL.PRINT_FREQ == 0:
            # 4. evaluate pose estimation
            avg_est_error = dataset_val.evaluate_pose(results_pose_cam_xyz,
                                                      save_results=False)  # cm
            msg = 'Evaluate: [{0}/{1}]\tAverage pose estimation error: {2:.2f} (mm)'.format(
                len(results_pose_cam_xyz), len(dataset_val), avg_est_error * 10.0)
            logger.info(msg)

            # 5. visualize mesh and pose estimation
            if cfg.EVAL.SAVE_BATCH_IMAGES_PRED:
                file_name = '{}_{}.jpg'.format(osp.join(output_dir, 'pred'), i)
                logger.info("Saving image: {}".format(file_name))
                # test
                import numpy as np
                import trimesh
                # attach to logger so trimesh messages will be printed to console
                trimesh.util.attach_to_log()
                for j in range(len(images)):
                    # mesh objects can be created from existing faces and vertex data
                    ind = j
                    import matplotlib.pyplot as plt
                    plt.close('all')
                    plt.imshow(images[ind].detach().cpu().numpy()[..., ::-1])
                    plt.savefig('a.png')
                    mesh = trimesh.Trimesh(vertices=est_mesh_cam_xyz[ind], faces=faces)
                    mesh.show()
                # end test
                # save_batch_image_with_mesh_joints(mesh_renderer, images.to(cpu_device), cam_params.to(cpu_device),
                #                                   bboxes.to(cpu_device), est_mesh_cam_xyz, est_pose_uv,
                #                                   est_pose_cam_xyz, file_name)

    # overall pose estimation evaluation
    assert len(results_pose_cam_xyz) == len(dataset_val), \
        "The number of estimation results (%d) is inconsistent with that of the ground truth (%d)." % \
        (len(results_pose_cam_xyz), len(dataset_val))

    avg_est_error = dataset_val.evaluate_pose(results_pose_cam_xyz,
                                              cfg.EVAL.SAVE_POSE_ESTIMATION, output_dir)  # cm
    logger.info("Overall:\tAverage pose estimation error: {0:.2f} (mm)".format(
        avg_est_error * 10.0))
def main():
    parser = argparse.ArgumentParser(
        description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/eval_FreiHAND_dataset.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = osp.join(cfg.EVAL.SAVE_DIR, args.config_file)
    mkdir(output_dir)
    logger = setup_logger("hand_shape_pose_inference", output_dir,
                          filename='eval-' + get_logger_filename())
    logger.info(cfg)

    # Load network model
    model = MLPPoseNetwork(cfg)
    device = cfg.MODEL.DEVICE
    model.to(device)
    model.load_model(cfg, load_mlp=True)

    # Load data
    dataset_val = build_dataset(cfg.EVAL.DATASET)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=cfg.MODEL.BATCH_SIZE,
        num_workers=cfg.MODEL.NUM_WORKERS)

    # Inference
    model = model.eval()
    results_pose_cam_xyz = {}
    cpu_device = torch.device("cpu")
    logger.info("Evaluate on {} frames:".format(len(dataset_val)))
    for i, batch in enumerate(data_loader_val):
        if cfg.EVAL.DATASET == "FreiHAND_train":
            images, cam_params, pose_roots, pose_scales, image_ids = batch
            images, cam_params, pose_roots, pose_scales = \
                images.to(device), cam_params.to(device), pose_roots.to(device), pose_scales.to(device)
            with torch.no_grad():
                _, est_pose_uv, est_pose_cam_xyz = model(images, cam_params, pose_scales,
                                                         pose_root=pose_roots)
            est_pose_uv = [o.to(cpu_device) for o in est_pose_uv]
            est_pose_cam_xyz = [o.to(cpu_device) for o in est_pose_cam_xyz]
        elif cfg.EVAL.DATASET == "FreiHAND_test":
            images, cam_params, pose_scales, image_ids = batch
            images, cam_params, pose_scales = \
                images.to(device), cam_params.to(device), pose_scales.to(device)
            with torch.no_grad():
                _, est_pose_uv, est_pose_cam_xyz = model(images, cam_params, pose_scales)
            est_pose_uv = [o.to(cpu_device) for o in est_pose_uv]
            est_pose_cam_xyz = [o.to(cpu_device) for o in est_pose_cam_xyz]
        else:
            images, cam_params, bboxes, pose_roots, pose_scales, image_ids = batch
            images, cam_params, bboxes, pose_roots, pose_scales = \
                images.to(device), cam_params.to(device), bboxes.to(device), pose_roots.to(device), pose_scales.to(device)
            with torch.no_grad():
                _, est_pose_uv, est_pose_cam_xyz = model(images, cam_params, pose_scales,
                                                         pose_root=pose_roots, bbox=bboxes)
            est_pose_uv = [o.to(cpu_device) for o in est_pose_uv]
            est_pose_cam_xyz = [o.to(cpu_device) for o in est_pose_cam_xyz]

        results_pose_cam_xyz.update({
            img_id.item(): result
            for img_id, result in zip(image_ids, est_pose_cam_xyz)
        })

        if i % cfg.EVAL.PRINT_FREQ == 0:
            # evaluate pose estimation
            if cfg.EVAL.DATASET != "FreiHAND_test":
                avg_est_error = dataset_val.evaluate_pose(
                    results_pose_cam_xyz, save_results=False)  # in m
                msg = 'Evaluate: [{0}/{1}]\tAverage pose estimation error: {2:.2f} (mm)'.format(
                    len(results_pose_cam_xyz), len(dataset_val), avg_est_error * 1000.0)
                logger.info(msg)

                # visualize pose estimation
                if cfg.EVAL.SAVE_BATCH_IMAGES_PRED:
                    file_name = '{}_{}.jpg'.format(osp.join(output_dir, 'pred'), i)
                    logger.info("Saving image: {}".format(file_name))
                    save_batch_image_with_mesh_joints(images.to(cpu_device), est_pose_uv,
                                                      est_pose_cam_xyz, file_name)
            else:
                # no ground truth is available for the test set; only save visualizations
                if cfg.EVAL.SAVE_BATCH_IMAGES_PRED:
                    file_name = '{}_{}.jpg'.format(osp.join(output_dir, 'pred'), i)
                    logger.info("Saving image: {}".format(file_name))
                    save_batch_image_with_mesh_joints(images.to(cpu_device), est_pose_uv,
                                                      est_pose_cam_xyz, file_name)

    # overall pose estimation evaluation
    assert len(results_pose_cam_xyz) == len(dataset_val), \
        "The number of estimation results (%d) is inconsistent with that of the ground truth (%d)." % \
        (len(results_pose_cam_xyz), len(dataset_val))

    if cfg.EVAL.DATASET != "FreiHAND_test":
        avg_est_error = dataset_val.evaluate_pose(
            results_pose_cam_xyz, cfg.EVAL.SAVE_POSE_ESTIMATION, output_dir)  # in m
        logger.info("Overall:\tAverage pose estimation error: {0:.2f} (mm)".format(
            avg_est_error * 1000.0))

        # 3D PCK over a range of error thresholds (in meters)
        threshold_list = 0.0001 * np.array(range(0, 1000, 25))
        pck_list = dataset_val.evaluate_3d_pck(results_pose_cam_xyz, threshold_list)
        np.savetxt('pck_proposed.npy', pck_list)
        np.savetxt('threshold.npy', threshold_list)

        threshold_list = [i * 1000 for i in threshold_list]  # convert to mm for plotting
        plt.figure()
        plt.plot(threshold_list, pck_list, label='proposed method')
        plt.title('Real world testset')
        plt.xlabel('error threshold (mm)')
        plt.ylabel('3D PCK')
        plt.legend(loc='lower right')
        plt.savefig('3D_PCK_proposed.png')
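# dataset_val.evaluate_3d_pck above is provided by the dataset class. As a reference for
# what the plotted metric means, the sketch below (pck_3d_sketch is an assumed helper, not
# repository code) computes 3D PCK: the fraction of keypoints whose Euclidean error falls
# below each threshold, pooled over all evaluated frames.
def pck_3d_sketch(results_pose_cam_xyz, gt_pose_cam_xyz, threshold_list):
    """Sketch: 3D PCK curve from per-image predicted and ground-truth 21x3 joint arrays."""
    import numpy as np
    errors = []
    for img_id, est in results_pose_cam_xyz.items():
        gt = gt_pose_cam_xyz[img_id]
        errors.append(np.linalg.norm(np.asarray(est) - np.asarray(gt), axis=-1))  # 21 errors per frame
    errors = np.concatenate(errors)
    return [float(np.mean(errors <= t)) for t in threshold_list]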