Example #1
model = DataParallel(model).cuda()
ckpt = torch.load(model_path)
model.load_state_dict(ckpt['network'], strict=False)
model.eval()

# prepare input image
transform = transforms.ToTensor()
img_path = 'input.jpg'
original_img = cv2.imread(img_path)
original_img_height, original_img_width = original_img.shape[:2]

# prepare bbox
bbox = [139.41, 102.25, 222.39, 241.57] # xmin, ymin, width, height
bbox = process_bbox(bbox, original_img_width, original_img_height)
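# process_bbox (repo utility, not shown here) is assumed to sanitize the box and pad it to the aspect ratio of cfg.input_img_shape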
img, img2bb_trans, bb2img_trans = generate_patch_image(original_img, bbox, 1.0, 0.0, False, cfg.input_img_shape) 
img = transform(img.astype(np.float32))/255
img = img.cuda()[None,:,:,:]

# forward
inputs = {'img': img}
targets = {}
meta_info = {'bb2img_trans': bb2img_trans}
with torch.no_grad():
    out = model(inputs, targets, meta_info, 'test')
img = img[0].cpu().numpy().transpose(1,2,0) # cfg.input_img_shape[1], cfg.input_img_shape[0], 3
mesh_lixel_img = out['mesh_coord_img'][0].cpu().numpy()
mesh_param_cam = out['mesh_coord_cam'][0].cpu().numpy()

# restore mesh_lixel_img to original image space and continuous depth space
mesh_lixel_img[:,0] = mesh_lixel_img[:,0] / cfg.output_hm_shape[2] * cfg.input_img_shape[1]
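The snippet above is cut off after the x-axis line; the remaining restoration steps would follow the same pattern as in Example #2 below (a sketch reusing the same cfg fields and bb2img_trans):

mesh_lixel_img[:,1] = mesh_lixel_img[:,1] / cfg.output_hm_shape[1] * cfg.input_img_shape[0]
mesh_lixel_img[:,:2] = np.dot(bb2img_trans, np.concatenate((mesh_lixel_img[:,:2], np.ones_like(mesh_lixel_img[:,:1])),1).transpose(1,0)).transpose(1,0)
mesh_lixel_img[:,2] = (mesh_lixel_img[:,2] / cfg.output_hm_shape[0] * 2. - 1) * (cfg.bbox_3d_size / 2)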
Example #2
def main():

    # input_size=416
    # iou_threshold=0.45
    # score_threshold=0.3

    # Yolo = Load_Yolo_model()
    times = []
    output_path = "output"
    vid = cv2.VideoCapture(0)
    vid.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
    vid.set(cv2.CAP_PROP_FRAME_HEIGHT, 1024)
    # by default VideoCapture returns float instead of int
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))

    focal = (1500, 1500)
    princpt = (width / 2, height / 2)
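    # Note: focal here is a rough hard-coded guess in pixels and princpt is simply the frame center;
    # calibrated camera intrinsics should replace these values when available (assumption).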

    print(f"Width {width} Height {height}")

    bbox = [0, 0, width, height]
    bbox = process_bbox(bbox, width, height)

    root_depth = 11250.5732421875  # obtain this from RootNet (https://github.com/mks0601/3DMPPE_ROOTNET_RELEASE/tree/master/demo)
    root_depth /= 1000  # RootNet outputs millimeters; convert to meters
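    # Note: this root depth is a fixed example value; for a live camera feed it would have to be
    # re-estimated per frame (e.g. by RootNet, linked above) rather than hard-coded (assumption).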
    with torch.no_grad():

        while True:
            _, frame = vid.read()

            t1 = time.time()
            try:
                # OpenCV returns BGR frames; convert to RGB once for the model
                original_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            except cv2.error:
                # vid.read() returned no frame (e.g. the camera was closed), so stop
                break
            # image_data = image_preprocess(np.copy(original_frame), [input_size, input_size])
            # image_data = image_data[np.newaxis, ...].astype(np.float32)

            # if YOLO_FRAMEWORK == "tf":
            #     pred_bbox = Yolo.predict(image_data)
            # elif YOLO_FRAMEWORK == "trt":
            #     batched_input = tf.constant(image_data)
            #     result = Yolo(batched_input)
            #     pred_bbox = []
            #     for key, value in result.items():
            #         value = value.numpy()
            #         pred_bbox.append(value)

            # pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
            # pred_bbox = tf.concat(pred_bbox, axis=0)

            # bboxes = postprocess_boxes(pred_bbox, original_frame, input_size, score_threshold)
            # bboxes = nms(bboxes, iou_threshold, method='nms')
            # frame = draw_bbox(original_frame, bboxes)
            #----------------------------------------i2l meshnet---------------------------------------

            # original_img_height, original_img_width = original_frame.shape[:2]

            # bbox = bboxes[0][:4]

            img, img2bb_trans, bb2img_trans = generate_patch_image(
                original_frame, bbox, 1.0, 0.0, False, cfg.input_img_shape)
            img = transform(img.astype(np.float32)) / 255
            img = img.cuda()[None, :, :, :]

            # forward
            inputs = {'img': img}
            targets = {}
            meta_info = {'bb2img_trans': bb2img_trans}
            out = model(inputs, targets, meta_info, 'test')
            img = img[0].cpu().numpy().transpose(1, 2, 0)  # cfg.input_img_shape[1], cfg.input_img_shape[0], 3
            mesh_lixel_img = out['mesh_coord_img'][0].cpu().numpy()
            mesh_param_cam = out['mesh_coord_cam'][0].cpu().numpy()

            # restore mesh_lixel_img to original image space and continuous depth space
            mesh_lixel_img[:, 0] = mesh_lixel_img[:, 0] / cfg.output_hm_shape[2] * cfg.input_img_shape[1]
            mesh_lixel_img[:, 1] = mesh_lixel_img[:, 1] / cfg.output_hm_shape[1] * cfg.input_img_shape[0]
            mesh_lixel_img[:, :2] = np.dot(
                bb2img_trans,
                np.concatenate((mesh_lixel_img[:, :2], np.ones_like(mesh_lixel_img[:, :1])), 1).transpose(1, 0)
            ).transpose(1, 0)
            mesh_lixel_img[:, 2] = (mesh_lixel_img[:, 2] / cfg.output_hm_shape[0] * 2. - 1) * (cfg.bbox_3d_size / 2)

            # root-relative 3D coordinates -> absolute 3D coordinates

            root_xy = np.dot(joint_regressor,
                             mesh_lixel_img)[root_joint_idx, :2]

            root_img = np.array([root_xy[0], root_xy[1], root_depth])
            root_cam = pixel2cam(root_img[None, :], focal, princpt)
            mesh_lixel_img[:, 2] += root_depth
            mesh_lixel_cam = pixel2cam(mesh_lixel_img, focal, princpt)
            mesh_param_cam += root_cam.reshape(1, 3)

            # visualize lixel mesh in 2D space
            # vis_img = frame.copy()
            # vis_img = vis_mesh(vis_img, mesh_lixel_img)
            # cv2.imwrite('output_mesh_lixel.jpg', vis_img)

            # visualize lixel mesh in 2D space
            # vis_img = frame.copy()
            # mesh_param_img = cam2pixel(mesh_param_cam, focal, princpt)
            # vis_img = vis_mesh(vis_img, mesh_param_img)
            # cv2.imwrite('output_mesh_param.jpg', vis_img)

            # save mesh (obj)
            # save_obj(mesh_lixel_cam, face, 'output_mesh_lixel.obj')
            # save_obj(mesh_param_cam, face, 'output_mesh_param.obj')

            # render mesh from lixel
            vis_img = frame.copy()
            rendered_img = render_mesh(vis_img, mesh_lixel_cam, face, {
                'focal': focal,
                'princpt': princpt
            })
            # cv2.imwrite('rendered_mesh_lixel.jpg', rendered_img)
            cv2.imshow('output', rendered_img / 255)
            if cv2.waitKey(25) & 0xFF == ord("q"):
                cv2.destroyAllWindows()
                break

            # render mesh from param
            # vis_img = frame.copy()
            # rendered_img = render_mesh(vis_img, mesh_param_cam, face, {'focal': focal, 'princpt': princpt})
            # cv2.imwrite('rendered_mesh_param.jpg', rendered_img)

            #----------------------------------------i2l meshnet---------------------------------------
            t2 = time.time()
            times.append(t2 - t1)
            times = times[-20:]

            ms = sum(times) / len(times) * 1000
            fps = 1000 / ms

            print("Time: {:.2f}ms, {:.1f} FPS".format(ms, fps))
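pixel2cam above comes from the repo's utilities and is not shown here; under the usual pinhole camera model it is assumed to back-project (x, y) pixel coordinates plus metric depth into camera space, roughly as in this sketch:

def pixel2cam(pixel_coord, focal, princpt):
    # pixel_coord: (N, 3) array with x, y in pixels and z in meters
    x = (pixel_coord[:, 0] - princpt[0]) / focal[0] * pixel_coord[:, 2]
    y = (pixel_coord[:, 1] - princpt[1]) / focal[1] * pixel_coord[:, 2]
    z = pixel_coord[:, 2]
    return np.stack((x, y, z), 1)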
Example #3
model = DataParallel(model).cuda()
ckpt = torch.load(model_path)
model.load_state_dict(ckpt['network'], strict=False)
model.eval()

# prepare input image
transform = transforms.ToTensor()
img_path = 'input.jpg'
original_img = cv2.imread(img_path)
original_img_height, original_img_width = original_img.shape[:2]

# prepare bbox
bbox = [69, 137, 165, 153]  # xmin, ymin, width, height
bbox = process_bbox(
    bbox, (original_img_height, original_img_width, original_img_height))
img, trans, inv_trans = generate_patch_image(original_img, bbox, False, 1.0,
                                             0.0, cfg.input_img_shape)
img = transform(img.astype(np.float32)) / 255
img = img.cuda()[None, :, :, :]

# forward
inputs = {'img': img}
targets = {}
meta_info = {}
with torch.no_grad():
    out = model(inputs, targets, meta_info, 'test')
img = img[0].cpu().numpy().transpose(1, 2, 0)  # cfg.input_img_shape[1], cfg.input_img_shape[0], 3
joint_coord = out['joint_coord'][0].cpu().numpy()  # x,y pixel, z root-relative discretized depth
rel_root_depth = out['rel_root_depth'][0].cpu().numpy()  # discretized depth
hand_type = out['hand_type'][0].cpu().numpy()  # handedness probability
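The outputs above are still in the network's cropped heatmap space. Mapping joint_coord back to the original image would follow the same pattern as the mesh restoration in Examples #1 and #2 (a sketch, assuming the same cfg fields and the inv_trans returned by generate_patch_image):

joint_coord[:, 0] = joint_coord[:, 0] / cfg.output_hm_shape[2] * cfg.input_img_shape[1]
joint_coord[:, 1] = joint_coord[:, 1] / cfg.output_hm_shape[1] * cfg.input_img_shape[0]
joint_coord[:, :2] = np.dot(inv_trans, np.concatenate((joint_coord[:, :2], np.ones_like(joint_coord[:, :1])), 1).transpose(1, 0)).transpose(1, 0)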
Example #4
    def __getitem__(self, idx):
        frame = self.framelist[idx]
        seq_name, cam, frame_idx, joint = frame['seq_name'], frame['cam'], frame['frame_idx'], frame['joint']
        joint_coord, joint_valid = joint['world_coord'], joint['valid']

        # input data
        # bbox calculate
        bbox = get_bbox(joint_coord, joint_valid, self.camrot[cam],
                        self.campos[cam], self.focal[cam], self.princpt[cam])
        xmin, ymin, xmax, ymax = bbox
        xmin = max(xmin, 0)
        ymin = max(ymin, 0)
        xmax = min(xmax, self.original_img_shape[1] - 1)
        ymax = min(ymax, self.original_img_shape[0] - 1)
        bbox = np.array([xmin, ymin, xmax, ymax])

        # image read
        img_path = osp.join(self.root_path, seq_name, 'images', 'cam' + cam,
                            'image' + "{:04d}".format(frame_idx) + '.png')
        img = load_img(img_path)
        xmin, ymin, xmax, ymax = bbox
        xmin, xmax = np.array([xmin, xmax]) / self.original_img_shape[1] * img.shape[1]
        ymin, ymax = np.array([ymin, ymax]) / self.original_img_shape[0] * img.shape[0]
        bbox_img = np.array([xmin, ymin, xmax - xmin + 1, ymax - ymin + 1])
        img = generate_patch_image(img, bbox_img, False, 1.0, 0.0,
                                   cfg.input_img_shape)
        input_img = self.transform(img) / 255.

        target_depthmaps = []
        cam_params = []
        affine_transes = []
        for cam in random.sample(self.selected_cameras, cfg.render_view_num):
            # bbox calculate
            bbox = get_bbox(joint_coord, joint_valid, self.camrot[cam],
                            self.campos[cam], self.focal[cam],
                            self.princpt[cam])
            xmin, ymin, xmax, ymax = bbox
            xmin = max(xmin, 0)
            ymin = max(ymin, 0)
            xmax = min(xmax, self.original_img_shape[1] - 1)
            ymax = min(ymax, self.original_img_shape[0] - 1)
            bbox = np.array([xmin, ymin, xmax, ymax])

            # depthmap read
            depthmap_path = osp.join(self.depthmap_root_path,
                                     "{:06d}".format(frame_idx),
                                     'depthmap' + cam + '.pkl')
            with open(depthmap_path, 'rb') as f:
                depthmap = pickle.load(f).astype(np.float32)
            xmin, ymin, xmax, ymax = bbox
            xmin, xmax = np.array([xmin, xmax]) / self.original_img_shape[1] * depthmap.shape[1]
            ymin, ymax = np.array([ymin, ymax]) / self.original_img_shape[0] * depthmap.shape[0]
            bbox_depthmap = np.array(
                [xmin, ymin, xmax - xmin + 1, ymax - ymin + 1])
            depthmap = generate_patch_image(depthmap[:, :, None],
                                            bbox_depthmap, False, 1.0, 0.0,
                                            cfg.rendered_img_shape)
            target_depthmaps.append(self.transform(depthmap))

            xmin, ymin, xmax, ymax = bbox
            affine_transes.append(
                gen_trans_from_patch_cv(
                    (xmin + xmax + 1) / 2., (ymin + ymax + 1) / 2.,
                    xmax - xmin + 1, ymax - ymin + 1,
                    cfg.rendered_img_shape[1], cfg.rendered_img_shape[0], 1.0,
                    0.0).astype(np.float32))
            cam_params.append({
                'camrot': self.camrot[cam],
                'campos': self.campos[cam],
                'focal': self.focal[cam],
                'princpt': self.princpt[cam]
            })

        inputs = {'img': input_img}
        targets = {'depthmap': target_depthmaps, 'joint': joint}
        meta_info = {'cam_param': cam_params, 'affine_trans': affine_transes}

        return inputs, targets, meta_info
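As a usage sketch, this __getitem__ would normally be consumed through a standard torch DataLoader; the class and argument values below are placeholders rather than names taken from the snippet:

from torch.utils.data import DataLoader

# 'DepthmapDataset' stands in for whatever Dataset class defines the __getitem__ above
dataset = DepthmapDataset()
loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)
for inputs, targets, meta_info in loader:
    pass  # feed inputs['img'], targets['depthmap'] and meta_info['affine_trans'] to the model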