import math

import cv2
import numpy as np
import torch
import torchvision.transforms as transforms

# cfg, rootnet_cfg, generate_patch_image, process_bbox and model are assumed to
# be provided by the surrounding RootNet / PoseNet codebase.


def prepare_input(imgpath, bbox):
    cvimg = cv2.imread(imgpath,
                       cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
    if not isinstance(cvimg, np.ndarray):
        raise IOError("Fail to read %s" % imgpath)
    img_height, img_width, img_channels = cvimg.shape

    # Fixed (no-augmentation) settings: no scaling, rotation, flip,
    # color jitter or synthetic occlusion.
    scale, rot, do_flip, do_occlusion = 1.0, 0.0, False, False
    color_scale = [1.0, 1.0, 1.0]

    # Crop the bbox patch from the image. The patch is padded with black where
    # needed and resized to the fixed network input size, which may introduce
    # stretching.
    img_patch, trans = generate_patch_image(cvimg, bbox, do_flip, scale, rot,
                                            do_occlusion)

    # Apply per-channel color scaling (a no-op here since color_scale is all
    # ones) and clamp to the valid 8-bit pixel range.
    for i in range(img_channels):
        img_patch[:, :, i] = np.clip(img_patch[:, :, i] * color_scale[i], 0, 255)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=cfg.pixel_mean, std=cfg.pixel_std),
    ])

    img_patch = transform(img_patch)

    input_img = img_patch.unsqueeze(0)

    return input_img
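

# A minimal usage sketch, not part of the original snippet: the image path and
# the bbox values are hypothetical placeholders; bbox is assumed to be
# [x, y, w, h] in pixel coordinates, as produced by a person detector.
def _demo_prepare_input():
    net_input = prepare_input('person.jpg', [100, 50, 200, 400])
    # net_input is a (1, 3, H, W) float tensor ready to be fed to the network.
    return net_input
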
def get_input(image, person_boxes):
    person_images = np.zeros((len(person_boxes), 3, rootnet_cfg.input_shape[0], rootnet_cfg.input_shape[1]))
    k_values = np.zeros((len(person_boxes), 1))

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=rootnet_cfg.pixel_mean, std=rootnet_cfg.pixel_std)]
    )

    for i, box in enumerate(person_boxes):
        patch_image, _ = generate_patch_image(image, box, False, 0)
        person_images[i] = transform(patch_image)
        # As in dataset.py: k = sqrt(bbox_real_w * bbox_real_h * f_x * f_y / bbox_area),
        # an approximation of the person's absolute depth. The focal lengths are
        # approximated as half the image width and height (intrinsics unknown).
        k_values[i] = np.array([
            math.sqrt(rootnet_cfg.bbox_real[0] * rootnet_cfg.bbox_real[1] *
                      (image.shape[1] / 2) * (image.shape[0] / 2) /
                      (box[2] * box[3]))
        ]).astype(np.float32)

    person_images = torch.Tensor(person_images)
    k_values = torch.Tensor(k_values)

    return person_images, k_values
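

# A hedged usage sketch, not part of the original snippet: `detector_boxes`,
# `rootnet_model` and the CUDA placement are hypothetical; boxes are assumed to
# be [x, y, w, h] in pixels and `image` a BGR numpy array, matching get_input.
def _demo_get_input(image, detector_boxes, rootnet_model):
    person_images, k_values = get_input(image, detector_boxes)
    with torch.no_grad():
        # A RootNet-style model takes the image patches together with the
        # k-values and predicts each person's root joint
        # (x, y in patch pixels, z as absolute depth in mm).
        roots = rootnet_model(person_images.cuda(), k_values.cuda())
    return roots
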
# Example 3
def rootnet(img, bboxlist):
    # prepare input image
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=cfg.pixel_mean, std=cfg.pixel_std)
    ])
    original_img = img
    original_img_height, original_img_width = original_img.shape[:2]

    bbox_list = bboxlist
    person_num = len(bbox_list)
    root_list = []
    # normalized camera intrinsics
    focal = [1500, 1500]  # x-axis, y-axis
    for n in range(person_num):
        bbox = process_bbox(np.array(bbox_list[n]), original_img_width, original_img_height)
        img, img2bb_trans = generate_patch_image(original_img, bbox, False, 0.0)
        img = transform(img).cuda()[None, :, :, :]
        k_value = np.array(
            [math.sqrt(cfg.bbox_real[0] * cfg.bbox_real[1] * focal[0] * focal[1] / (bbox[2] * bbox[3]))]).astype(np.float32)
        k_value = torch.FloatTensor([k_value]).cuda()[None, :]

        # forward
        with torch.no_grad():
            root_3d = model(img, k_value)  # x,y: pixel, z: absolute root depth (mm)
        img = img[0].cpu().numpy()
        root_3d = root_3d[0].cpu().numpy()

        # Visualize the estimated root in 2D (x, y in patch pixels): undo the
        # normalization, flip the channel order and convert the layout from
        # (C, H, W) to (H, W, C) for OpenCV drawing.
        vis_img = img.copy()
        vis_img = vis_img * np.array(cfg.pixel_std).reshape(3, 1, 1) + np.array(cfg.pixel_mean).reshape(3, 1, 1)
        vis_img = vis_img.astype(np.uint8)
        vis_img = vis_img[::-1, :, :]
        vis_img = np.transpose(vis_img, (1, 2, 0)).copy()
        # Map the predicted root from output (heatmap) resolution to input
        # patch resolution.
        vis_root = np.zeros((2))
        vis_root[0] = root_3d[0] / cfg.output_shape[1] * cfg.input_shape[1]
        vis_root[1] = root_3d[1] / cfg.output_shape[0] * cfg.input_shape[0]
        cv2.circle(vis_img, (int(vis_root[0]), int(vis_root[1])), radius=5, color=(0, 255, 0), thickness=-1,
                   lineType=cv2.LINE_AA)
        root_list.append(root_3d[2])
    return root_list
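

# A hedged usage sketch, not part of the original snippet: `frame` and
# `detections` are hypothetical placeholders; detections are assumed to be
# [x, y, w, h] person boxes in pixel coordinates.
def _demo_rootnet(frame, detections):
    # Returns one absolute root depth (mm) per detected person.
    root_depths = rootnet(frame, detections)
    return root_depths
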
# Example 4
assert len(bbox_list) == len(root_depth_list)
person_num = len(bbox_list)

# normalized camera intrinsics
focal = [1500, 1500]  # x-axis, y-axis
princpt = [original_img_width / 2, original_img_height / 2]  # x-axis, y-axis
print('focal length: (' + str(focal[0]) + ', ' + str(focal[1]) + ')')
print('principal points: (' + str(princpt[0]) + ', ' + str(princpt[1]) + ')')

# for each cropped and resized human image, forward it to PoseNet
output_pose_2d_list = []
output_pose_3d_list = []
for n in range(person_num):
    bbox = process_bbox(np.array(bbox_list[n]), original_img_width,
                        original_img_height)
    img, img2bb_trans = generate_patch_image(original_img, bbox, False, 1.0,
                                             0.0, False)
    print(img.shape, person_num)
    img = transform(img).cuda()[None, :, :, :]

    # forward
    with torch.no_grad():
        pose_3d = model(img)  # x,y: pixel, z: root-relative depth (mm)

    # inverse affine transform (restore the crop and resize)
    pose_3d = pose_3d[0].cpu().numpy()
    pose_3d[:, 0] = pose_3d[:, 0] / cfg.output_shape[1] * cfg.input_shape[1]
    pose_3d[:, 1] = pose_3d[:, 1] / cfg.output_shape[0] * cfg.input_shape[0]
    pose_3d_xy1 = np.concatenate(
        (pose_3d[:, :2], np.ones_like(pose_3d[:, :1])), 1)
    img2bb_trans_001 = np.concatenate(
        (img2bb_trans, np.array([0, 0, 1]).reshape(1, 3)))
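    # The original snippet is cut off here; the natural next step (a sketch,
    # following the variable names defined above) is to invert the homogeneous
    # 3x3 transform and map the predicted (x, y) from patch coordinates back
    # to original image coordinates:
    pose_3d[:, :2] = np.dot(np.linalg.inv(img2bb_trans_001),
                            pose_3d_xy1.transpose(1, 0)).transpose(1, 0)[:, :2]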