Example #1
def main():
    parser = argparse.ArgumentParser(description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/train_FreiHAND_dataset.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # 1. load network model
    num_joints = cfg.MODEL.NUM_JOINTS
    net_hm = Net_HM_HG(num_joints,
                       num_stages=cfg.MODEL.HOURGLASS.NUM_STAGES,
                       num_modules=cfg.MODEL.HOURGLASS.NUM_MODULES,
                       num_feats=cfg.MODEL.HOURGLASS.NUM_FEAT_CHANNELS)
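    # stacked hourglass network: one 64x64 heatmap is regressed per joint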
    load_net_model(cfg.MODEL.PRETRAIN_WEIGHT.HM_NET_PATH, net_hm)
    device = cfg.MODEL.DEVICE
    net_hm.to(device)
    net_hm = net_hm.train()

    # 2. Load data

    dataset_val = build_dataset(cfg.TRAIN.DATASET, cfg.TRAIN.BACKGROUND_SET, cfg.TRAIN.DATA_SIZE)
    print('Building dataloader...', end='')
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=cfg.MODEL.BATCH_SIZE,
        shuffle=True,
        num_workers=cfg.MODEL.NUM_WORKERS
    )
    print('done!')

    optimizer = optim.RMSprop(net_hm.parameters(), lr=10**-3)
    hm_loss = nn.MSELoss(reduction='sum')
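    # with reduction='sum' the loss is summed over every pixel of every joint heatmap
    # in the batch, so the per-epoch totals printed below are unnormalised sums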

    print('Entering loop...')
    num_epoch = 200
    for epoch in range(num_epoch):
        total_loss_train = 0.0
        tic = time.time()
        for i, batch in enumerate(data_loader_val):
            images, cam_params, pose_roots, pose_scales, image_ids = batch
            images, cam_params, pose_roots, pose_scales = \
                images.to(device), cam_params.to(device), pose_roots.to(device), pose_scales.to(device)

            # ground truth heatmap
            gt_heatmap = torch.Tensor().to(device)
            for img_id in image_ids:
                gt_heatmap = torch.cat((gt_heatmap, dataset_val.heatmap_gts_list[img_id].to(device)), 0)
            gt_heatmap = gt_heatmap.view(-1, 21, 64, 64)
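            # gt_heatmap: (batch_size, 21 joints, 64, 64), matching the resolution of the
            # final hourglass output used in the loss below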

            # backpropagation
            optimizer.zero_grad()
            images = BHWC_to_BCHW(images)  # B x C x H x W
            images = normalize_image(images)
            est_hm_list, _ = net_hm(images)
            est_hm_list = est_hm_list[-1].to(device)
            loss = hm_loss(est_hm_list, gt_heatmap)
            loss.backward()
            optimizer.step()
            total_loss_train += loss.item()

        # record time
        toc = time.time()
        print('loss of epoch %2d: %6.2f, time: %0.4f s' %(int(epoch+1), total_loss_train, toc-tic))

        # save model weight every epoch
        torch.save(net_hm.state_dict(), "net_hm.pth")
Example #2
import matplotlib.pyplot as plt
import numpy as np

### specify inputs ###
config_file = "configs/eval_webcam.yaml"
K = [[291.00602819, 0, 139.59914484], [0, 292.75184403, 111.98793194], [0, 0, 1]]   # intrinsic camera parameter
pose_scale = 0.03           # hand pose scale ~3cm
cropped_dim = (480, 480)    # cropped dimension from the original webcam image
resize_dim = (256, 256)     # input image dim accepted by the learning model
avg_per_frame = 1           # number of images averaged to help reduce noise

######################
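# Assuming K follows the standard pinhole model, a camera-space point (X, Y, Z) projects to
# pixel coordinates roughly as u = fx*X/Z + cx and v = fy*Y/Z + cy, where fx = K[0][0],
# fy = K[1][1] and (cx, cy) = (K[0][2], K[1][2]).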
 
cfg.merge_from_file(config_file)
cfg.freeze()

# Load trained network model
model = MLPPoseNetwork(cfg)
device = cfg.MODEL.DEVICE
model.to(device)
model.load_model(cfg, load_mlp=True)
model = model.eval()

# intrinsic camera parameter K and pose_scale
K = torch.tensor(K).to(device)
K = K.reshape((1,3,3))
pose_scale = torch.tensor(pose_scale).to(device).reshape((1,1)) # ~3cm
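# K and pose_scale get a leading batch dimension of 1 so they match a single-frame batch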

# webcam settings - default image size [640x480]
cap = cv2.VideoCapture(0)
Example #3
def main():
    parser = argparse.ArgumentParser(
        description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # 1. Load network model
    path = 'webcam_output'
    mkdir(path)
    model = ShapePoseNetwork(cfg, path)
    device = cfg.MODEL.DEVICE
    model.to(device)
    model.load_model(cfg)

    mesh_renderer = renderer.MeshRenderer(model.hand_tri.astype('uint32'))

    # 3. Inference
    model.eval()
    cpu_device = torch.device("cpu")
    cap = cv2.VideoCapture(0)
    i = 0
    start = time.time()
    end = 0
    while True:
        ret, images = cap.read()

        images = cv2.resize(images, (256, 256), interpolation=cv2.INTER_AREA)
        images = torch.tensor(images)
        images = torch.unsqueeze(images, 0)
        images = images.to(device)
        with torch.no_grad():
            est_pose_uv = model(images)
            est_pose_uv = est_pose_uv.to(cpu_device)
        images = images.to(cpu_device)
        images = torch.squeeze(images, 0)
        est_pose_uv = np.asarray(est_pose_uv)
        images = images.numpy()
        est_pose_uv = est_pose_uv[0]
        skeleton_overlay = draw_2d_skeleton(images, est_pose_uv)
        cv2.imshow('result', skeleton_overlay)
        name = str(i) + '.jpg'
        cv2.imwrite(osp.join(path, name), skeleton_overlay)
        i = i + 1
        if cv2.waitKey(1) & 0xFF == ord('q'):
            end = time.time()
            break


    # When everything is done, release the capture
    cap.release()
    cv2.destroyAllWindows()
    print('FPS = {}'.format(i / (end - start)))
Example #4
def main():
    parser = argparse.ArgumentParser(
        description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = osp.join(cfg.EVAL.SAVE_DIR, args.config_file)
    mkdir(output_dir)
    logger = setup_logger("hand_shape_pose_inference",
                          output_dir,
                          filename='eval-' + get_logger_filename())
    logger.info(cfg)

    # 1. Load network model
    model = ShapePoseNetwork(cfg, output_dir)
    device = cfg.MODEL.DEVICE
    model.to(device)
    model.load_model(cfg)

    mesh_renderer = renderer.MeshRenderer(model.hand_tri.astype('uint32'))

    # 2. Load data
    dataset_val = build_dataset(cfg.EVAL.DATASET)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=cfg.MODEL.BATCH_SIZE,
        num_workers=cfg.MODEL.NUM_WORKERS)

    # 3. Inference
    model.eval()
    results_pose_cam_xyz = {}
    cpu_device = torch.device("cuda:0")
    logger.info("Evaluate on {} frames:".format(len(dataset_val)))
    for i, batch in enumerate(data_loader_val):
        images, cam_params, bboxes, pose_roots, pose_scales, image_ids = batch
        images, cam_params, bboxes, pose_roots, pose_scales = \
            images.to(device), cam_params.to(device), bboxes.to(device), pose_roots.to(device), pose_scales.to(device)
        with torch.no_grad():
            est_mesh_cam_xyz, est_pose_uv, est_pose_cam_xyz = \
                model(images, cam_params, bboxes, pose_roots, pose_scales)

            est_mesh_cam_xyz = [o.to(cpu_device) for o in est_mesh_cam_xyz]
            est_pose_uv = [o.to(cpu_device) for o in est_pose_uv]
            est_pose_cam_xyz = [o.to(cpu_device) for o in est_pose_cam_xyz]

        results_pose_cam_xyz.update({
            img_id.item(): result
            for img_id, result in zip(image_ids, est_pose_cam_xyz)
        })

        if i % cfg.EVAL.PRINT_FREQ == 0:
            # 4. evaluate pose estimation
            avg_est_error = dataset_val.evaluate_pose(results_pose_cam_xyz,
                                                      save_results=False)  # cm
            msg = 'Evaluate: [{0}/{1}]\t' 'Average pose estimation error: {2:.2f} (mm)'.format(
                len(results_pose_cam_xyz), len(dataset_val),
                avg_est_error * 10.0)
            logger.info(msg)

            # 5. visualize mesh and pose estimation
            if cfg.EVAL.SAVE_BATCH_IMAGES_PRED:
                file_name = '{}_{}.jpg'.format(osp.join(output_dir, 'pred'), i)
                logger.info("Saving image: {}".format(file_name))
                save_batch_image_with_mesh_joints(mesh_renderer,
                                                  images.to(cpu_device),
                                                  cam_params.to(cpu_device),
                                                  bboxes.to(cpu_device),
                                                  est_mesh_cam_xyz,
                                                  est_pose_uv,
                                                  est_pose_cam_xyz, file_name)

    # overall evaluate pose estimation
    assert len(results_pose_cam_xyz) == len(dataset_val), \
        "The number of estimation results (%d) is inconsistent with that of the ground truth (%d)." % \
        (len(results_pose_cam_xyz), len(dataset_val))

    avg_est_error = dataset_val.evaluate_pose(results_pose_cam_xyz,
                                              cfg.EVAL.SAVE_POSE_ESTIMATION,
                                              output_dir)  # cm
    logger.info("Overall:\tAverage pose estimation error: {0:.2f} (mm)".format(
        avg_est_error * 10.0))
Example #5
def main():
    pyag.FAILSAFE = False

    parser = argparse.ArgumentParser(
        description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # 1. Load network model
    path = 'webcam_output_bbox'
    mkdir(path)
    model = ShapePoseNetwork(cfg, path)
    device = cfg.MODEL.DEVICE
    model.to(device)
    model.load_model(cfg)

    mesh_renderer = renderer.MeshRenderer(model.hand_tri.astype('uint32'))

    #2. Load hand_detector
    hand = test.hand_detector()

    # 3. Inference
    model.eval()
    cpu_device = torch.device("cpu")
    cap = cv2.VideoCapture(0)
    i = 0
    start = time.time()
    end = 0
    config_dict = {
        'y_up': 1.0,
        'y_low': 0.5,
        'x_up': 0.8,
        'x_low': 0.2,
        'std_threshold': 2e6,
        'buffer_size': 10
    }
    """
	y_up and y_low are the height thresholds within which the keypoints are mapped to the screen.
	ie., keypoints are mapped from a space which is (y_max-y_low)*img_height. If keypoints are outside this region cursor position does not change.
	std_threshold is the threshold for std deviation. If std_deviation is above the threshold (implies hand is moving),
	the cursor position is the location as detected in the previous image. If stddeviation is below the threshold (hand is static),
	the cursor position is the moving average of buffer_size previous images.
	"""
    prev_cursor_x = []
    prev_cursor_y = []
    prev_cursor = None
    screen_size = np.asarray(pyag.size())
    while True:
        ret, frame = cap.read()
        img_h, img_w, _ = frame.shape
        bbox = hand.detect_hand(frame)
        bbox = bbox.astype(int)
        if bbox.size == 0:
            continue  #If no bbox detected skip keypoint detection
        images = frame[bbox[0][1]:bbox[0][3], bbox[0][0]:bbox[0][2]]
        bb_h, bb_w, _ = images.shape
        images = cv2.resize(images, (256, 256), interpolation=cv2.INTER_AREA)
        images = torch.tensor(images)
        images = torch.unsqueeze(images, 0)
        images = images.to(device)
        with torch.no_grad():
            est_pose_uv = model(images)
            est_pose_uv = est_pose_uv.to(cpu_device)
        est_pose_uv = np.asarray(est_pose_uv)
        est_pose_uv = est_pose_uv[0]
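        # rescale predictions from the 256x256 crop back to bounding-box pixels, then shift
        # them into full-frame coordinates using the bbox origin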
        est_pose_uv[:, 0] = est_pose_uv[:, 0] * bb_w / 256
        est_pose_uv[:, 1] = est_pose_uv[:, 1] * bb_h / 256

        est_pose_uv[:, 0] += bbox[0][0]
        est_pose_uv[:, 1] += bbox[0][1]
        if ((est_pose_uv[0, 1] > (img_h * config_dict['y_up'])) or
            (est_pose_uv[0, 1] < (img_h * config_dict['y_low']))) or (
                (est_pose_uv[0, 0] > (img_w * config_dict['x_up'])) or
                (est_pose_uv[0, 0] < (img_w * config_dict['x_low']))):
            continue
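        # map keypoint 0 (the root/wrist joint) linearly from the active region
        # [x_low, x_up] x [y_low, y_up] of the frame onto a 1920x1080 screen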
        cursor_x = int(
            (est_pose_uv[0, 0] - (img_w * config_dict['x_low'])) * 1920. /
            (img_w * (config_dict['x_up'] - config_dict['x_low'])))
        cursor_y = int(
            (est_pose_uv[0, 1] - (img_h * config_dict['y_low'])) * 1080. /
            (img_h * (config_dict['y_up'] - config_dict['y_low'])))
        if len(prev_cursor_x) <= config_dict['buffer_size']:
            prev_cursor_x.append(cursor_x)
            prev_cursor_y.append(cursor_y)
        elif len(prev_cursor_x) > config_dict['buffer_size']:
            prev_cursor_x.append(cursor_x)
            prev_cursor_y.append(cursor_y)
            _ = prev_cursor_x.pop(0)
            _ = prev_cursor_y.pop(0)
        prev_cursor = np.column_stack((prev_cursor_x, prev_cursor_y))
        mean = np.mean(prev_cursor, 0)
        var_dist = np.var(np.sum(((prev_cursor - mean) / screen_size)**2, 1))
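        # simple jitter filter: if the variance of recent normalised cursor positions exceeds
        # the threshold the hand is treated as moving and the raw detection is used; otherwise
        # the buffer mean smooths the cursor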
        if var_dist > config_dict['std_threshold']:
            pyag.moveTo(cursor_x, cursor_y)
        else:
            pyag.moveTo(int(mean[0]), int(mean[1]))
        skeleton_overlay = draw_2d_skeleton(frame, est_pose_uv)
        cv2.imshow('result', skeleton_overlay)
        #name=str(i)+'.jpg'
        #cv2.imwrite(osp.join(path , name), frame)
        i = i + 1
        if cv2.waitKey(1) & 0xFF == ord('q'):
            end = time.time()
            break


    # When everything is done, release the capture
    cap.release()
    cv2.destroyAllWindows()
    print('no.of frames ={}'.format(i))
    print('FPS = {}'.format(i / (end - start)))
Example #6
def main():
    parser = argparse.ArgumentParser(
        description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/train_FreiHAND_dataset.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # Load data
    dataset_val = build_dataset(cfg.TRAIN.DATASET, cfg.TRAIN.BACKGROUND_SET,
                                cfg.TRAIN.DATA_SIZE)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=cfg.MODEL.BATCH_SIZE,
        shuffle=True,
        num_workers=cfg.MODEL.NUM_WORKERS)

    # Load network model
    model = MLPPoseNetwork(cfg)
    device = cfg.MODEL.DEVICE
    model.to(device)
    model.load_model(cfg)
    model = model.train()

    # fix hm model weight
    for param in model.net_hm.parameters():
        param.requires_grad = False

    optimizer = optim.RMSprop(model.mlp.parameters(), lr=0.0001)
    #optimizer = optim.Adam(model.mlp.parameters(), lr=0.00001)
    pose_loss = nn.MSELoss(reduction='sum')

    num_epoch = 400
    for epoch in range(num_epoch):

        # reduce the learning rate every 50 epochs
        if epoch % 50 == 0 and epoch != 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] /= 10

        tic = time.time()
        total_loss_train = 0.0
        tot_loss_len = 0.0
        tot_loss_dir = 0.0

        # model training per batch
        for i, batch in enumerate(data_loader_val):
            images, cam_params, pose_roots, pose_scales, image_ids = batch
            images, cam_params, pose_roots, pose_scales = \
                images.to(device), cam_params.to(device), pose_roots.to(device), pose_scales.to(device)

            # ground truth pose
            gt_pose_cam_xyz = torch.Tensor().to(device)
            for img_id in image_ids:
                gt_pose_cam_xyz = torch.cat(
                    (gt_pose_cam_xyz, dataset_val.pose_gts[img_id].to(device)),
                    0)
            gt_pose_cam_xyz = gt_pose_cam_xyz.view(-1, 21, 3)

            # forward propagation
            optimizer.zero_grad()
            _, est_pose_uv, est_pose_cam_xyz = model(images,
                                                     cam_params,
                                                     pose_scales,
                                                     pose_root=pose_roots)

            # bone constraint loss
            len_loss, dir_loss = bone_constraint_loss(est_pose_cam_xyz,
                                                      gt_pose_cam_xyz, device)
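            # total objective: MSE on the 3D joint positions plus the bone-length and
            # bone-direction constraint terms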
            loss = pose_loss(est_pose_cam_xyz,
                             gt_pose_cam_xyz) + len_loss + dir_loss

            # back propagation
            loss.backward()
            optimizer.step()
            total_loss_train += loss.item()
            tot_loss_len += len_loss.item()
            tot_loss_dir += dir_loss.item()

        # record time
        toc = time.time()
        print(
            'loss of epoch %2d: %6.2f, L_len: %3.2f, L_dir: %5.2f, time: %0.4f s'
            % (int(epoch + 1), total_loss_train, tot_loss_len, tot_loss_dir,
               toc - tic))

        # save model weight every epoch
        torch.save(model.mlp.state_dict(), "mlp.pth")
Example #7
def main():
    parser = argparse.ArgumentParser(
        description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = osp.join(cfg.EVAL.SAVE_DIR, args.config_file)
    mkdir(output_dir)
    logger = setup_logger("hand_shape_pose_inference",
                          output_dir,
                          filename='eval-' + get_logger_filename())
    logger.info(cfg)

    # 1. Load network model
    model = ShapePoseNetwork(cfg, output_dir)
    device = cfg.MODEL.DEVICE
    model.to(device)
    model.load_model(cfg)
    faces = model.hand_tri.astype('uint32')
    mesh_renderer = renderer.MeshRenderer(model.hand_tri.astype('uint32'))

    # 2. Load data
    dataset_val = build_dataset(cfg.EVAL.DATASET)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=cfg.MODEL.BATCH_SIZE,
        num_workers=cfg.MODEL.NUM_WORKERS)
    # try adding skeleton detection
    import os, sys
    lib_path = os.path.abspath(os.path.join('..', 'HandKeyPointDetector'))
    sys.path.append(lib_path)
    from HandKeypointDetector import HandKeypointDetector
    hd = HandKeypointDetector(r'\output')  # raw string avoids the invalid '\o' escape warning
    count = 0

    # 3. Inference
    model.eval()
    results_pose_cam_xyz = {}
    cpu_device = torch.device("cpu")
    logger.info("Evaluate on {} frames:".format(len(dataset_val)))
    for i, batch in enumerate(data_loader_val):
        images, cam_params, bboxes, pose_roots, pose_scales, image_ids = batch
        new_images = torch.zeros([images.shape[0], 256, 256, images.shape[3]],
                                 dtype=images.dtype)
        import time
        cur = time.time()
        if images[0].shape[0] > 256 and images[0].shape[1] > 256:

            for j in range(len(images)):
                bb = hd.detectKeyPoints(images[j])
                Ydif = bb['maxY'] - bb['minY']
                Xdif = bb['maxX'] - bb['minX']
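                # pad the shorter side of the detected box so the hand crop is square
                # before resizing it to 256x256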
                if (Xdif > Ydif):
                    bb['maxY'] += abs(Ydif - Xdif) // 2
                    bb['minY'] -= abs(Ydif - Xdif) // 2

                else:
                    bb['maxX'] += abs(Ydif - Xdif) // 2
                    bb['minX'] -= abs(Ydif - Xdif) // 2
                t_img = images[j][bb['minY']:bb['maxY'],
                                  bb['minX']:bb['maxX'], :]
                # import matplotlib.pyplot as plt
                # plt.close('all')
                # plt.imshow(t_img.detach().cpu().numpy())
                # plt.savefig('a.png')
                import torch.nn.functional as F
                downsampled = F.interpolate(
                    t_img.float().unsqueeze(0).permute(0, 3, 1, 2),
                    size=(256, 256),
                    mode='bilinear',
                    align_corners=False).permute(0, 2, 3, 1).byte()
                new_images[j] = downsampled[0]
                import matplotlib.pyplot as plt

                plt.imshow(downsampled.squeeze().detach().cpu().numpy())
                plt.savefig(osp.join('output', '{}.png'.format(j)))

            images = new_images

        images, cam_params, bboxes, pose_roots, pose_scales = \
            images.to(device), cam_params.to(device), bboxes.to(device), pose_roots.to(device), pose_scales.to(device)
        with torch.no_grad():

            est_mesh_cam_xyz, est_pose_uv, est_pose_cam_xyz = \
                model(images, cam_params, bboxes, pose_roots, pose_scales)
            elapsed = (time.time() - cur) / data_loader_val.batch_size
            print('average run time per frame = {:.03f} sec'.format(elapsed))
            est_mesh_cam_xyz = [o.to(cpu_device) for o in est_mesh_cam_xyz]
            est_pose_uv = [o.to(cpu_device) for o in est_pose_uv]
            est_pose_cam_xyz = [o.to(cpu_device) for o in est_pose_cam_xyz]

        results_pose_cam_xyz.update({
            img_id.item(): result
            for img_id, result in zip(image_ids, est_pose_cam_xyz)
        })

        if i % cfg.EVAL.PRINT_FREQ == 0:
            # 4. evaluate pose estimation
            avg_est_error = dataset_val.evaluate_pose(results_pose_cam_xyz,
                                                      save_results=False)  # cm
            msg = 'Evaluate: [{0}/{1}]\t' 'Average pose estimation error: {2:.2f} (mm)'.format(
                len(results_pose_cam_xyz), len(dataset_val),
                avg_est_error * 10.0)
            logger.info(msg)

            # 5. visualize mesh and pose estimation
            if cfg.EVAL.SAVE_BATCH_IMAGES_PRED:
                file_name = '{}_{}.jpg'.format(osp.join(output_dir, 'pred'), i)
                logger.info("Saving image: {}".format(file_name))
                # test
                import numpy as np
                import trimesh

                # attach to logger so trimesh messages will be printed to console
                trimesh.util.attach_to_log()
                for j in range(len(images)):

                    # mesh objects can be created from existing faces and vertex data
                    ind = j
                    import matplotlib.pyplot as plt
                    plt.close('all')
                    plt.imshow(images[ind].detach().cpu().numpy()[..., ::-1])
                    plt.savefig('a.png')
                    mesh = trimesh.Trimesh(vertices=est_mesh_cam_xyz[ind],
                                           faces=faces)
                    mesh.show()
                # end test
                # save_batch_image_with_mesh_joints(mesh_renderer, images.to(cpu_device), cam_params.to(cpu_device),
                #                                   bboxes.to(cpu_device), est_mesh_cam_xyz, est_pose_uv,
                #                                   est_pose_cam_xyz, file_name)

    # overall evaluate pose estimation
    assert len(results_pose_cam_xyz) == len(dataset_val), \
        "The number of estimation results (%d) is inconsistent with that of the ground truth (%d)." % \
        (len(results_pose_cam_xyz), len(dataset_val))

    avg_est_error = dataset_val.evaluate_pose(results_pose_cam_xyz,
                                              cfg.EVAL.SAVE_POSE_ESTIMATION,
                                              output_dir)  # cm
    logger.info("Overall:\tAverage pose estimation error: {0:.2f} (mm)".format(
        avg_est_error * 10.0))
Example #8
def main():
    parser = argparse.ArgumentParser(
        description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/eval_FreiHAND_dataset.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = osp.join(cfg.EVAL.SAVE_DIR, args.config_file)
    mkdir(output_dir)
    logger = setup_logger("hand_shape_pose_inference",
                          output_dir,
                          filename='eval-' + get_logger_filename())
    logger.info(cfg)

    # Load network model
    model = MLPPoseNetwork(cfg)
    device = cfg.MODEL.DEVICE
    model.to(device)
    model.load_model(cfg, load_mlp=True)

    # Load data
    dataset_val = build_dataset(cfg.EVAL.DATASET)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=cfg.MODEL.BATCH_SIZE,
        num_workers=cfg.MODEL.NUM_WORKERS)

    # Inference
    model = model.eval()
    results_pose_cam_xyz = {}
    cpu_device = torch.device("cpu")
    logger.info("Evaluate on {} frames:".format(len(dataset_val)))
    for i, batch in enumerate(data_loader_val):
        if cfg.EVAL.DATASET == "FreiHAND_train":
            images, cam_params, pose_roots, pose_scales, image_ids = batch
            images, cam_params, pose_roots, pose_scales = \
                images.to(device), cam_params.to(device), pose_roots.to(device), pose_scales.to(device)
            with torch.no_grad():
                _, est_pose_uv, est_pose_cam_xyz = model(images,
                                                         cam_params,
                                                         pose_scales,
                                                         pose_root=pose_roots)
                est_pose_uv = [o.to(cpu_device) for o in est_pose_uv]
                est_pose_cam_xyz = [o.to(cpu_device) for o in est_pose_cam_xyz]
        elif cfg.EVAL.DATASET == "FreiHAND_test":
            images, cam_params, pose_scales, image_ids = batch
            images, cam_params, pose_scales = \
                images.to(device), cam_params.to(device), pose_scales.to(device)
            with torch.no_grad():
                _, est_pose_uv, est_pose_cam_xyz = model(
                    images, cam_params, pose_scales)
                est_pose_uv = [o.to(cpu_device) for o in est_pose_uv]
                est_pose_cam_xyz = [o.to(cpu_device) for o in est_pose_cam_xyz]
        else:
            images, cam_params, bboxes, pose_roots, pose_scales, image_ids = batch
            images, cam_params, bboxes, pose_roots, pose_scales = \
                images.to(device), cam_params.to(device), bboxes.to(device), pose_roots.to(device), pose_scales.to(device)
            with torch.no_grad():
                _, est_pose_uv, est_pose_cam_xyz = model(images,
                                                         cam_params,
                                                         pose_scales,
                                                         pose_root=pose_roots,
                                                         bbox=bboxes)
                est_pose_uv = [o.to(cpu_device) for o in est_pose_uv]
                est_pose_cam_xyz = [o.to(cpu_device) for o in est_pose_cam_xyz]

        results_pose_cam_xyz.update({
            img_id.item(): result
            for img_id, result in zip(image_ids, est_pose_cam_xyz)
        })

        if i % cfg.EVAL.PRINT_FREQ == 0:
            # evaluate pose estimation
            if cfg.EVAL.DATASET != "FreiHAND_test":
                avg_est_error = dataset_val.evaluate_pose(
                    results_pose_cam_xyz, save_results=False)  # cm
                msg = 'Evaluate: [{0}/{1}]\t' 'Average pose estimation error: {2:.2f} (mm)'.format(
                    len(results_pose_cam_xyz), len(dataset_val),
                    avg_est_error * 1000.0)
                logger.info(msg)

                # visualize mesh and pose estimation
                if cfg.EVAL.SAVE_BATCH_IMAGES_PRED:
                    file_name = '{}_{}.jpg'.format(
                        osp.join(output_dir, 'pred'), i)
                    logger.info("Saving image: {}".format(file_name))
                    save_batch_image_with_mesh_joints(images.to(cpu_device),
                                                      est_pose_uv,
                                                      est_pose_cam_xyz,
                                                      file_name)
            else:
                if cfg.EVAL.SAVE_BATCH_IMAGES_PRED:
                    file_name = '{}_{}.jpg'.format(
                        osp.join(output_dir, 'pred'), i)
                    logger.info("Saving image: {}".format(file_name))
                    save_batch_image_with_mesh_joints(images.to(cpu_device),
                                                      est_pose_uv,
                                                      est_pose_cam_xyz,
                                                      file_name)

    # overall evaluate pose estimation
    assert len(results_pose_cam_xyz) == len(dataset_val), \
        "The number of estimation results (%d) is inconsistent with that of the ground truth (%d)." % \
        (len(results_pose_cam_xyz), len(dataset_val))
    if cfg.EVAL.DATASET != "FreiHAND_test":
        avg_est_error = dataset_val.evaluate_pose(
            results_pose_cam_xyz, cfg.EVAL.SAVE_POSE_ESTIMATION,
            output_dir)  # cm
        logger.info(
            "Overall:\tAverage pose estimation error: {0:.2f} (mm)".format(
                avg_est_error * 1000.0))

        threshold_list = 0.0001 * np.array(range(0, 1000, 25))
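        # thresholds run from 0 to ~0.1 m in 2.5 mm steps (0.0001 m * 25)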
        pck_list = dataset_val.evaluate_3d_pck(results_pose_cam_xyz,
                                               threshold_list)
        np.savetxt('pck_proposed.npy', pck_list)
        np.savetxt('threshold.npy', threshold_list)

        threshold_list = [i * 1000 for i in threshold_list]
        plt.figure()
        plt.plot(threshold_list, pck_list, label='proposed method')
        plt.title('Real world testset')
        plt.xlabel('error threshold (mm)')
        plt.ylabel('3D PCK')
        plt.legend(loc='lower right')
        plt.savefig('3D_PCK_proposed.png')