Example #1
def main():
    parser = argparse.ArgumentParser(
        description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # 1. Load network model
    path = 'webcam_output'
    mkdir(path)
    model = ShapePoseNetwork(cfg, path)
    device = cfg.MODEL.DEVICE
    model.to(device)
    model.load_model(cfg)

    mesh_renderer = renderer.MeshRenderer(model.hand_tri.astype('uint32'))

    # 2. Inference
    model.eval()
    cpu_device = torch.device("cpu")
    cap = cv2.VideoCapture(0)
    i = 0
    start = time.time()
    end = 0
    while True:
        ret, images = cap.read()
        if not ret:  # the camera frame could not be read
            end = time.time()
            break

        images = cv2.resize(images, (256, 256), interpolation=cv2.INTER_AREA)
        images = torch.tensor(images)
        images = torch.unsqueeze(images, 0)
        images = images.to(device)
        with torch.no_grad():
            est_pose_uv = model(images)
            est_pose_uv = est_pose_uv.to(cpu_device)
        images = images.to(cpu_device)
        images = torch.squeeze(images, 0)
        est_pose_uv = np.asarray(est_pose_uv)
        images = images.numpy()
        est_pose_uv = est_pose_uv[0]
        skeleton_overlay = draw_2d_skeleton(images, est_pose_uv)
        cv2.imshow('result', skeleton_overlay)
        name = str(i) + '.jpg'
        cv2.imwrite(osp.join(path, name), skeleton_overlay)
        i = i + 1
        if cv2.waitKey(1) & 0xFF == ord('q'):
            end = time.time()
            break


    # When everything is done, release the capture
    cap.release()
    cv2.destroyAllWindows()
    print('FPS = {}'.format(i / (end - start)))
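
These functions come from demo/eval scripts in a hand-pose repository and rely on module-level imports that the listing does not show. The block below is a sketch of the imports Example #1 appears to need, reconstructed from the identifiers it uses; the hand_shape_pose module paths are assumptions, not taken verbatim from the project.

# Sketch of the module-level imports assumed by Example #1. The exact
# hand_shape_pose.* paths are guesses based on the identifiers used above.
import argparse
import os.path as osp
import time

import cv2
import numpy as np
import torch

from hand_shape_pose.config import cfg                     # yacs config node (assumed path)
from hand_shape_pose.model.shape_pose_network import ShapePoseNetwork  # assumed path
from hand_shape_pose.util import renderer                  # mesh renderer module (assumed path)
from hand_shape_pose.util.miscellaneous import mkdir       # assumed path
from hand_shape_pose.util.vis import draw_2d_skeleton      # assumed path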
Example #2
def main():
    pyag.FAILSAFE = False  # pyag appears to be pyautogui; disable its corner fail-safe abort

    parser = argparse.ArgumentParser(
        description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # 1. Load network model
    path = 'webcam_output_bbox'
    mkdir(path)
    model = ShapePoseNetwork(cfg, path)
    device = cfg.MODEL.DEVICE
    model.to(device)
    model.load_model(cfg)

    mesh_renderer = renderer.MeshRenderer(model.hand_tri.astype('uint32'))

    # 2. Load hand_detector
    hand = test.hand_detector()

    # 3. Inference
    model.eval()
    cpu_device = torch.device("cpu")
    cap = cv2.VideoCapture(0)
    i = 0
    start = time.time()
    end = 0
    config_dict = {
        'y_up': 1.0,
        'y_low': 0.5,
        'x_up': 0.8,
        'x_low': 0.2,
        'std_threshold': 2e6,
        'buffer_size': 10
    }
    """
	y_up and y_low are the height thresholds within which the keypoints are mapped to the screen.
	ie., keypoints are mapped from a space which is (y_max-y_low)*img_height. If keypoints are outside this region cursor position does not change.
	std_threshold is the threshold for std deviation. If std_deviation is above the threshold (implies hand is moving),
	the cursor position is the location as detected in the previous image. If stddeviation is below the threshold (hand is static),
	the cursor position is the moving average of buffer_size previous images.
	"""
    prev_cursor_x = []
    prev_cursor_y = []
    prev_cursor = None
    screen_size = np.asarray(pyag.size())
    while True:
        ret, frame = cap.read()
        if not ret:  # the camera frame could not be read
            end = time.time()
            break
        img_h, img_w, _ = frame.shape
        bbox = hand.detect_hand(frame)
        bbox = bbox.astype(int)
        if bbox.size == 0:
            continue  # if no bbox was detected, skip keypoint detection for this frame
        images = frame[bbox[0][1]:bbox[0][3], bbox[0][0]:bbox[0][2]]
        bb_h, bb_w, _ = images.shape
        images = cv2.resize(images, (256, 256), interpolation=cv2.INTER_AREA)
        images = torch.tensor(images)
        images = torch.unsqueeze(images, 0)
        images = images.to(device)
        with torch.no_grad():
            est_pose_uv = model(images)
            est_pose_uv = est_pose_uv.to(cpu_device)
        est_pose_uv = np.asarray(est_pose_uv)
        est_pose_uv = est_pose_uv[0]
        est_pose_uv[:, 0] = est_pose_uv[:, 0] * bb_w / 256
        est_pose_uv[:, 1] = est_pose_uv[:, 1] * bb_h / 256

        est_pose_uv[:, 0] += bbox[0][0]
        est_pose_uv[:, 1] += bbox[0][1]
        if ((est_pose_uv[0, 1] > (img_h * config_dict['y_up'])) or
            (est_pose_uv[0, 1] < (img_h * config_dict['y_low']))) or (
                (est_pose_uv[0, 0] > (img_w * config_dict['x_up'])) or
                (est_pose_uv[0, 0] < (img_w * config_dict['x_low']))):
            continue
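        # Map the first keypoint (index 0, presumably the hand root/wrist) from the
        # configured sub-region of the frame to screen coordinates. The target
        # resolution is hard-coded to 1920x1080 here even though the actual screen
        # size was queried into screen_size above.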
        cursor_x = int(
            (est_pose_uv[0, 0] - (img_w * config_dict['x_low'])) * 1920. /
            (img_w * (config_dict['x_up'] - config_dict['x_low'])))
        cursor_y = int(
            (est_pose_uv[0, 1] - (img_h * config_dict['y_low'])) * 1080. /
            (img_h * (config_dict['y_up'] - config_dict['y_low'])))
        prev_cursor_x.append(cursor_x)
        prev_cursor_y.append(cursor_y)
        if len(prev_cursor_x) > config_dict['buffer_size'] + 1:
            prev_cursor_x.pop(0)
            prev_cursor_y.pop(0)
        prev_cursor = np.column_stack((prev_cursor_x, prev_cursor_y))
        mean = np.mean(prev_cursor, 0)
        var_dist = np.var(np.sum(((prev_cursor - mean) / screen_size)**2, 1))
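        # Jitter suppression: if the recent cursor positions are spread out (the hand
        # is moving), follow the raw estimate for this frame; otherwise move to the
        # buffer mean so a static hand does not make the cursor tremble.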
        if var_dist > config_dict['std_threshold']:
            pyag.moveTo(cursor_x, cursor_y)
        else:
            pyag.moveTo(int(mean[0]), int(mean[1]))
        skeleton_overlay = draw_2d_skeleton(frame, est_pose_uv)
        cv2.imshow('result', skeleton_overlay)
        #name=str(i)+'.jpg'
        #cv2.imwrite(osp.join(path , name), frame)
        i = i + 1
        if cv2.waitKey(1) & 0xFF == ord('q'):
            end = time.time()
            break


    # When everything is done, release the capture
    cap.release()
    cv2.destroyAllWindows()
    print('no. of frames = {}'.format(i))
    print('FPS = {}'.format(i / (end - start)))
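
The docstring in Example #2 describes the cursor-smoothing heuristic in prose. The following is a minimal standalone sketch of the same idea; the class and parameter names (CursorSmoother, var_threshold) are introduced here for illustration only and do not appear in the original script.

# Minimal sketch of the jitter-suppression heuristic used in Example #2.
# All names here are illustrative; the original script implements the same
# idea inline with plain lists.
import numpy as np


class CursorSmoother:
    def __init__(self, buffer_size=10, var_threshold=2e6, screen_size=(1920, 1080)):
        self.buffer = []  # recent (x, y) cursor positions
        self.buffer_size = buffer_size
        self.var_threshold = var_threshold
        self.screen_size = np.asarray(screen_size, dtype=float)

    def update(self, x, y):
        """Return the cursor position to use given the newest raw estimate (x, y)."""
        self.buffer.append((x, y))
        if len(self.buffer) > self.buffer_size:
            self.buffer.pop(0)
        pts = np.asarray(self.buffer, dtype=float)
        mean = pts.mean(axis=0)
        # Spread of the recent positions, normalized by the screen size.
        var_dist = np.var(np.sum(((pts - mean) / self.screen_size) ** 2, axis=1))
        if var_dist > self.var_threshold:
            return int(x), int(y)  # hand is moving: follow the raw estimate
        return int(mean[0]), int(mean[1])  # hand is static: use the moving average


# Example usage with synthetic positions (a small threshold so both branches trigger):
smoother = CursorSmoother(buffer_size=5, var_threshold=1e-4)
for raw in [(100, 100), (102, 98), (101, 101), (400, 300)]:
    print(smoother.update(*raw))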
Example #3
def main():
    parser = argparse.ArgumentParser(
        description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = osp.join(cfg.EVAL.SAVE_DIR, args.config_file)
    mkdir(output_dir)
    logger = setup_logger("hand_shape_pose_inference",
                          output_dir,
                          filename='eval-' + get_logger_filename())
    logger.info(cfg)

    # 1. Load network model
    model = ShapePoseNetwork(cfg, output_dir)
    device = cfg.MODEL.DEVICE
    model.to(device)
    model.load_model(cfg)

    mesh_renderer = renderer.MeshRenderer(model.hand_tri.astype('uint32'))

    # 2. Load data
    dataset_val = build_dataset(cfg.EVAL.DATASET)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=cfg.MODEL.BATCH_SIZE,
        num_workers=cfg.MODEL.NUM_WORKERS)

    # 3. Inference
    model.eval()
    results_pose_cam_xyz = {}
    cpu_device = torch.device("cuda:0")
    logger.info("Evaluate on {} frames:".format(len(dataset_val)))
    for i, batch in enumerate(data_loader_val):
        images, cam_params, bboxes, pose_roots, pose_scales, image_ids = batch
        images, cam_params, bboxes, pose_roots, pose_scales = \
            images.to(device), cam_params.to(device), bboxes.to(device), pose_roots.to(device), pose_scales.to(device)
        with torch.no_grad():
            est_mesh_cam_xyz, est_pose_uv, est_pose_cam_xyz = \
                model(images, cam_params, bboxes, pose_roots, pose_scales)

            est_mesh_cam_xyz = [o.to(cpu_device) for o in est_mesh_cam_xyz]
            est_pose_uv = [o.to(cpu_device) for o in est_pose_uv]
            est_pose_cam_xyz = [o.to(cpu_device) for o in est_pose_cam_xyz]

        results_pose_cam_xyz.update({
            img_id.item(): result
            for img_id, result in zip(image_ids, est_pose_cam_xyz)
        })

        if i % cfg.EVAL.PRINT_FREQ == 0:
            # 4. evaluate pose estimation
            avg_est_error = dataset_val.evaluate_pose(results_pose_cam_xyz,
                                                      save_results=False)  # cm
            msg = 'Evaluate: [{0}/{1}]\t' 'Average pose estimation error: {2:.2f} (mm)'.format(
                len(results_pose_cam_xyz), len(dataset_val),
                avg_est_error * 10.0)
            logger.info(msg)

            # 5. visualize mesh and pose estimation
            if cfg.EVAL.SAVE_BATCH_IMAGES_PRED:
                file_name = '{}_{}.jpg'.format(osp.join(output_dir, 'pred'), i)
                logger.info("Saving image: {}".format(file_name))
                save_batch_image_with_mesh_joints(mesh_renderer,
                                                  images.to(cpu_device),
                                                  cam_params.to(cpu_device),
                                                  bboxes.to(cpu_device),
                                                  est_mesh_cam_xyz,
                                                  est_pose_uv,
                                                  est_pose_cam_xyz, file_name)

    # overall evaluate pose estimation
    assert len(results_pose_cam_xyz) == len(dataset_val), \
        "The number of estimation results (%d) is inconsistent with that of the ground truth (%d)." % \
        (len(results_pose_cam_xyz), len(dataset_val))

    avg_est_error = dataset_val.evaluate_pose(results_pose_cam_xyz,
                                              cfg.EVAL.SAVE_POSE_ESTIMATION,
                                              output_dir)  # cm
    logger.info("Overall:\tAverage pose estimation error: {0:.2f} (mm)".format(
        avg_est_error * 10.0))
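
Every example uses the same yacs-style configuration flow: merge a YAML file into cfg, override individual keys from the trailing opts captured via argparse.REMAINDER, then freeze. Below is a minimal self-contained sketch of that pattern; the config keys are illustrative and merge_from_file is omitted so no YAML file is needed.

# Sketch of the yacs config-override pattern shared by the examples above.
# The config keys below are illustrative only.
import argparse
from yacs.config import CfgNode as CN

cfg = CN()
cfg.MODEL = CN()
cfg.MODEL.DEVICE = "cpu"
cfg.MODEL.BATCH_SIZE = 8

parser = argparse.ArgumentParser()
parser.add_argument("opts",
                    help="Modify config options using the command-line",
                    default=None,
                    nargs=argparse.REMAINDER)
# Pass an explicit list here so the sketch runs without a real command line.
args = parser.parse_args(["MODEL.DEVICE", "cuda", "MODEL.BATCH_SIZE", "16"])

cfg.merge_from_list(args.opts)  # alternating KEY, VALUE pairs from the command line
cfg.freeze()                    # make the config immutable from here on
print(cfg.MODEL.DEVICE, cfg.MODEL.BATCH_SIZE)  # -> cuda 16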
Example #4
def main():
    parser = argparse.ArgumentParser(
        description="3D Hand Shape and Pose Inference")
    parser.add_argument(
        "--config-file",
        default="configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = osp.join(cfg.EVAL.SAVE_DIR, args.config_file)
    mkdir(output_dir)
    logger = setup_logger("hand_shape_pose_inference",
                          output_dir,
                          filename='eval-' + get_logger_filename())
    logger.info(cfg)

    # 1. Load network model
    model = ShapePoseNetwork(cfg, output_dir)
    device = cfg.MODEL.DEVICE
    model.to(device)
    model.load_model(cfg)
    faces = model.hand_tri.astype('uint32')
    mesh_renderer = renderer.MeshRenderer(faces)

    # 2. Load data
    dataset_val = build_dataset(cfg.EVAL.DATASET)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=cfg.MODEL.BATCH_SIZE,
        num_workers=cfg.MODEL.NUM_WORKERS)
    # try adding skeleton detection
    import os
    import sys
    lib_path = os.path.abspath(os.path.join('..', 'HandKeyPointDetector'))
    sys.path.append(lib_path)
    from HandKeypointDetector import HandKeypointDetector
    hd = HandKeypointDetector('output')  # output directory (originally the Windows-style path '\output')
    count = 0

    # 3. Inference
    model.eval()
    results_pose_cam_xyz = {}
    cpu_device = torch.device("cpu")
    logger.info("Evaluate on {} frames:".format(len(dataset_val)))
    for i, batch in enumerate(data_loader_val):
        images, cam_params, bboxes, pose_roots, pose_scales, image_ids = batch
        new_images = torch.zeros([images.shape[0], 256, 256, images.shape[3]],
                                 dtype=images.dtype)
        import time
        cur = time.time()
        if images[0].shape[0] > 256 and images[0].shape[1] > 256:

            for j in range(len(images)):
                bb = hd.detectKeyPoints(images[j])
                Ydif = bb['maxY'] - bb['minY']
                Xdif = bb['maxX'] - bb['minX']
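                # Expand the shorter side of the detected bounding box so the crop is
                # square before resizing to 256x256 (this preserves the aspect ratio).
                # Note that the expansion is not clamped to the image borders, so a
                # detection near the edge could produce negative slice indices.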
                if Xdif > Ydif:
                    bb['maxY'] += abs(Ydif - Xdif) // 2
                    bb['minY'] -= abs(Ydif - Xdif) // 2

                else:
                    bb['maxX'] += abs(Ydif - Xdif) // 2
                    bb['minX'] -= abs(Ydif - Xdif) // 2
                t_img = images[j][bb['minY']:bb['maxY'],
                                  bb['minX']:bb['maxX'], :]
                # import matplotlib.pyplot as plt
                # plt.close('all')
                # plt.imshow(t_img.detach().cpu().numpy())
                # plt.savefig('a.png')
                import torch.nn.functional as F
                # Resize the crop to 256x256: permute NHWC uint8 -> NCHW float for
                # F.interpolate (which replaces the deprecated F.upsample), then
                # permute back and convert to uint8.
                downsampled = F.interpolate(
                    t_img.float().unsqueeze(0).permute(0, 3, 1, 2),
                    size=(256, 256),
                    mode='bilinear',
                    align_corners=False).permute(0, 2, 3, 1).byte()
                new_images[j] = downsampled[0]
                import matplotlib.pyplot as plt

                plt.imshow(downsampled.squeeze().detach().cpu().numpy())
                plt.savefig(osp.join('output', '{}.png'.format(j)))

            images = new_images

        images, cam_params, bboxes, pose_roots, pose_scales = \
            images.to(device), cam_params.to(device), bboxes.to(device), pose_roots.to(device), pose_scales.to(device)
        with torch.no_grad():

            est_mesh_cam_xyz, est_pose_uv, est_pose_cam_xyz = \
                model(images, cam_params, bboxes, pose_roots, pose_scales)
            elapsed = (time.time() - cur) / data_loader_val.batch_size
            print('average run time per frame = {:.03f}sec'.format(elapsed))
            est_mesh_cam_xyz = [o.to(cpu_device) for o in est_mesh_cam_xyz]
            est_pose_uv = [o.to(cpu_device) for o in est_pose_uv]
            est_pose_cam_xyz = [o.to(cpu_device) for o in est_pose_cam_xyz]

        results_pose_cam_xyz.update({
            img_id.item(): result
            for img_id, result in zip(image_ids, est_pose_cam_xyz)
        })

        if i % cfg.EVAL.PRINT_FREQ == 0:
            # 4. evaluate pose estimation
            avg_est_error = dataset_val.evaluate_pose(results_pose_cam_xyz,
                                                      save_results=False)  # cm
            msg = 'Evaluate: [{0}/{1}]\t' 'Average pose estimation error: {2:.2f} (mm)'.format(
                len(results_pose_cam_xyz), len(dataset_val),
                avg_est_error * 10.0)
            logger.info(msg)

            # 5. visualize mesh and pose estimation
            if cfg.EVAL.SAVE_BATCH_IMAGES_PRED:
                file_name = '{}_{}.jpg'.format(osp.join(output_dir, 'pred'), i)
                logger.info("Saving image: {}".format(file_name))
                # test
                import numpy as np
                import trimesh

                # attach to logger so trimesh messages will be printed to console
                trimesh.util.attach_to_log()
                for j in range(len(images)):

                    # mesh objects can be created from existing faces and vertex data
                    ind = j
                    import matplotlib.pyplot as plt
                    plt.close('all')
                    plt.imshow(images[ind].detach().cpu().numpy()[..., ::-1])
                    plt.savefig('a.png')
                    mesh = trimesh.Trimesh(vertices=est_mesh_cam_xyz[ind],
                                           faces=faces)
                    mesh.show()
                # end test
                # save_batch_image_with_mesh_joints(mesh_renderer, images.to(cpu_device), cam_params.to(cpu_device),
                #                                   bboxes.to(cpu_device), est_mesh_cam_xyz, est_pose_uv,
                #                                   est_pose_cam_xyz, file_name)

    # overall evaluate pose estimation
    assert len(results_pose_cam_xyz) == len(dataset_val), \
        "The number of estimation results (%d) is inconsistent with that of the ground truth (%d)." % \
        (len(results_pose_cam_xyz), len(dataset_val))

    avg_est_error = dataset_val.evaluate_pose(results_pose_cam_xyz,
                                              cfg.EVAL.SAVE_POSE_ESTIMATION,
                                              output_dir)  # cm
    logger.info("Overall:\tAverage pose estimation error: {0:.2f} (mm)".format(
        avg_est_error * 10.0))