def main():
    # model files check and download
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)
    check_file_existance(FILE_PATH)

    # prepare input data
    canvas_3d = np.zeros((720, 1280, 3), dtype=np.uint8)
    plotter = Plotter3d(canvas_3d.shape[:2])
    canvas_3d_window_name = 'Canvas3D'
    cv2.namedWindow(canvas_3d_window_name)
    cv2.setMouseCallback(canvas_3d_window_name, Plotter3d.mouse_callback)

    with open(FILE_PATH, 'r') as f:
        extrinsics = json.load(f)

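    # camera extrinsics: rotation R and translation t, consumed below by
    # rotate_poses() to map estimated poses into world coordinates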
    R = np.array(extrinsics['R'], dtype=np.float32)
    t = np.array(extrinsics['t'], dtype=np.float32)

    if args.video is None:
        frame_provider = ImageReader([args.input])
        is_video = False
    else:
        frame_provider = VideoReader(args.video)
        is_video = True

    fx = -1
    delay = 1
    esc_code = 27
    p_code = 112
    space_code = 32
    mean_time = 0
    img_mean = np.array([128, 128, 128], dtype=np.float32)
    base_width_calculated = False

    # net initialize
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    net = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=env_id)

    # inference
    for frame_id, frame in enumerate(frame_provider):
        current_time = cv2.getTickCount()
        if frame is None:
            break

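        # on the first frame, derive an input width that keeps the aspect
        # ratio and is a multiple of STRIDE, then fix the network input shape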
        if not base_width_calculated:
            IMAGE_WIDTH = frame.shape[1] * (IMAGE_HEIGHT / frame.shape[0])
            IMAGE_WIDTH = int(IMAGE_WIDTH / STRIDE) * STRIDE
            net.set_input_shape((1, 3, IMAGE_HEIGHT, IMAGE_WIDTH))
            base_width_calculated = True

        input_scale = IMAGE_HEIGHT / frame.shape[0]
        scaled_img = cv2.resize(frame,
                                dsize=None,
                                fx=input_scale,
                                fy=input_scale)
        # better to pad, but cut out for demo
        scaled_img = scaled_img[:, 0:scaled_img.shape[1] -
                                (scaled_img.shape[1] % STRIDE)]

        if fx < 0:  # Focal length is unknown
            fx = np.float32(0.8 * frame.shape[1])

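        # subtract the channel mean, scale to roughly [-0.5, 0.5], and
        # convert HWC -> NCHW for the network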
        normalized_img = (scaled_img.astype(np.float32) - img_mean) / 255.0
        normalized_img = np.expand_dims(normalized_img.transpose(2, 0, 1),
                                        axis=0)

        # execution
        if is_video:
            input_blobs = net.get_input_blob_list()
            net.set_input_blob_data(normalized_img, input_blobs[0])
            net.update()
            features, heatmaps, pafs = net.get_results()

        else:
            print('Start inference...')
            if args.benchmark:
                print('BENCHMARK mode')
                for i in range(5):
                    start = int(round(time.time() * 1000))
                    features, heatmaps, pafs = net.predict([normalized_img])
                    end = int(round(time.time() * 1000))
                    print(f'\tailia processing time {end - start} ms')
            else:
                features, heatmaps, pafs = net.predict([normalized_img])

        inference_result = (features[-1].squeeze(), heatmaps[-1].squeeze(),
                            pafs[-1].squeeze())

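        # decode features, heatmaps and PAFs into 3D and 2D poses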
        poses_3d, poses_2d = parse_poses(inference_result, input_scale, STRIDE,
                                         fx, is_video)
        edges = []
        if len(poses_3d):
            poses_3d = rotate_poses(poses_3d, R, t)
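            # remap axes for the 3D plotter: (x, y, z) -> (-z, x, -y)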
            poses_3d_copy = poses_3d.copy()
            x = poses_3d_copy[:, 0::4]
            y = poses_3d_copy[:, 1::4]
            z = poses_3d_copy[:, 2::4]
            poses_3d[:, 0::4], poses_3d[:, 1::4], poses_3d[:, 2::4] = -z, x, -y

            poses_3d = poses_3d.reshape(poses_3d.shape[0], 19, -1)[:, :, 0:3]
            edges = (Plotter3d.SKELETON_EDGES +
                     19 * np.arange(poses_3d.shape[0]).reshape(
                         (-1, 1, 1))).reshape((-1, 2))
        plotter.plot(canvas_3d, poses_3d, edges)

        if is_video:
            cv2.imshow(canvas_3d_window_name, canvas_3d)
        else:
            cv2.imwrite(f'Canvas3D_{frame_id}.png', canvas_3d)

        draw_poses(frame, poses_2d)
        current_time = (cv2.getTickCount() -
                        current_time) / cv2.getTickFrequency()
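        # exponential moving average (alpha = 0.05) smooths the FPS readout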
        if mean_time == 0:
            mean_time = current_time
        else:
            mean_time = mean_time * 0.95 + current_time * 0.05
        cv2.putText(frame, 'FPS: {}'.format(int(1 / mean_time * 10) / 10),
                    (40, 80), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 255))

        if is_video:
            cv2.imshow('ICV 3D Human Pose Estimation', frame)
        else:
            cv2.imwrite(args.savepath, frame)

        key = cv2.waitKey(delay)
        if key == esc_code:
            break
        if key == p_code:
            if delay == 1:
                delay = 0
            else:
                delay = 1

        if delay == 0 and args.rotate3d:
            key = 0
            while (key != p_code and key != esc_code and key != space_code):
                plotter.plot(canvas_3d, poses_3d, edges)
                cv2.imshow(canvas_3d_window_name, canvas_3d)
                key = cv2.waitKey(33)
            if key == esc_code:
                break
            else:
                delay = 1

    print('Script finished successfully.')
Example #3
def run_inference(args):
    from modules.inference_engine_pytorch import InferenceEnginePyTorch

    socket_server = SocketServer(args.port)
    joint_angle_calculator = JointAngleCalculator()

    stride = 8

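    # load the PyTorch checkpoint; "GPU" is the device string used by the
    # upstream demo's InferenceEnginePyTorch (presumably "CPU" selects CPU)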
    model_path = os.path.join('models', 'human-pose-estimation-3d.pth')
    net = InferenceEnginePyTorch(model_path, "GPU")

    canvas_3d = np.zeros((720, 1280, 3), dtype=np.uint8)
    plotter = Plotter3d(canvas_3d.shape[:2])
    canvas_3d_window_name = 'Canvas 3D'
    cv2.namedWindow(canvas_3d_window_name)
    cv2.setMouseCallback(canvas_3d_window_name, Plotter3d.mouse_callback)

    file_path = os.path.join('data', 'extrinsics.json')
    with open(file_path, 'r') as f:
        extrinsics = json.load(f)
    R = np.array(extrinsics['R'], dtype=np.float32)
    t = np.array(extrinsics['t'], dtype=np.float32)

    frame_provider = ImageReader(args.images)
    is_video = False
    if args.video != '':
        frame_provider = VideoReader(args.video)
        is_video = True
    base_height = args.height_size
    fx = 1  # focal length; a negative value triggers the 0.8 * width fallback below

    delay = 1
    esc_code = 27
    p_code = 112
    space_code = 32
    mean_time = 0

    for frame in frame_provider:
        current_time = cv2.getTickCount()
        if frame is None:
            break
        input_scale = base_height / frame.shape[0]
        scaled_img = cv2.resize(frame, dsize=None, fx=input_scale, fy=input_scale)
        scaled_img = scaled_img[:, 0:scaled_img.shape[1] - (scaled_img.shape[1] % stride)]  # better to pad, but cut out for demo
        if fx < 0:  # Focal length is unknown
            fx = np.float32(0.8 * frame.shape[1])

        inference_result = net.infer(scaled_img)
        poses_3d, poses_2d = parse_poses(inference_result, input_scale, stride, fx, is_video)
        edges = []

        if len(poses_3d):
            poses_3d = rotate_poses(poses_3d, R, t)
            poses_3d_copy = poses_3d.copy()
            x = poses_3d_copy[:, 0::4]
            y = poses_3d_copy[:, 1::4]
            z = poses_3d_copy[:, 2::4]
            poses_3d[:, 0::4], poses_3d[:, 1::4], poses_3d[:, 2::4] = -z, x, -y

            poses_3d = poses_3d.reshape(poses_3d.shape[0], 19, -1)[:, :, 0:3]
            edges = (Plotter3d.SKELETON_EDGES + 19 * np.arange(poses_3d.shape[0]).reshape((-1, 1, 1))).reshape((-1, 2))

        plotter.plot(canvas_3d, poses_3d, edges)
        cv2.imshow(canvas_3d_window_name, canvas_3d)

        draw_poses(frame, poses_2d)
        current_time = (cv2.getTickCount() - current_time) / cv2.getTickFrequency()
        if mean_time == 0:
            mean_time = current_time
        else:
            mean_time = mean_time * 0.95 + current_time * 0.05
        cv2.putText(frame, 'FPS: {}'.format(int(1 / mean_time * 10) / 10),
                    (40, 80), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 255))
        cv2.imshow('ICV 3D Human Pose Estimation', frame)

        key = cv2.waitKey(delay)
        if key == esc_code:
            break
        if key == p_code:
            if delay == 1:
                delay = 0
            else:
                delay = 1
        if delay == 0 or not is_video:  # allow rotating the 3D canvas while paused
            key = 0
            while (key != p_code
                   and key != esc_code
                   and key != space_code):
                plotter.plot(canvas_3d, poses_3d, edges)
                cv2.imshow(canvas_3d_window_name, canvas_3d)
                key = cv2.waitKey(33)
            if key == esc_code:
                break
            else:
                delay = 1
        
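        # compute joint angles from the 3D poses and stream them to the
        # client connected to the socket server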
        joint_angles = joint_angle_calculator.calculate_angles(poses_3d)
        if joint_angles:
            socket_server.send_data(joint_angles)
Example #4
def createUI():
    global MODEL_COUNT
    usedModel = 0
    time_part = time.perf_counter()
    time_all = time.perf_counter()

    print("Starting UI... Press 'Esc' to exit")
    webcam_image = np.zeros((200, 150, 3), np.uint8)
    webcam_image_rgb = np.zeros((200, 150, 3),
                                np.uint8)  # BGR -> RGB for pytorch

    MODEL_COUNT = len(getModels())
    print("Found " + str(MODEL_COUNT) + " models to use")
    dataloader.loadModel(MODEL_FOLDER + getModels()[usedModel])

    # Create window and UI
    cv2.namedWindow(WINDOW_TITLE)
    cv2.createTrackbar('Model', WINDOW_TITLE, usedModel,
                       max(MODEL_COUNT - 1, 1), empty)
    cv2.createTrackbar('Height', WINDOW_TITLE, 256, 512, empty)
    cv2.createTrackbar('FX', WINDOW_TITLE, 0, 50, empty)
    cv2.createTrackbar('Screenshot', WINDOW_TITLE, 0, 1, empty)
    cv2.createTrackbar('Sync Draw', WINDOW_TITLE, 0, 1, empty)

    is_using_video_file = False
    if len(sys.argv) > 1:
        print("Using the provided video file:", sys.argv[1])
        video_reader = VideoReader(sys.argv[1])
        is_using_video_file = True
        video_iter = iter(video_reader)
    else:
        print("Connecting to Webcam (this may take a few seconds...)")
        cam = cv2.VideoCapture(0)  # Opens the default camera

    print("Running")

    while True:
        # Get image data
        time_part = time.perf_counter()

        if is_using_video_file:
            try:
                webcam_image = next(video_iter)
            except StopIteration:  # end of video: loop back to the start
                video_iter = iter(video_reader)
                webcam_image = next(video_iter)
        else:
            _, webcam_image = cam.read()

        webcam_image = cv2.flip(webcam_image, 1)  # Mirror
        drawCalcTime(webcam_image, time_part, "WEBCAM", 1)
        time_part = time.perf_counter()

        # Read variables
        height = cv2.getTrackbarPos('Height', WINDOW_TITLE)
        if height < 16:
            height = 16
            cv2.setTrackbarPos('Height', WINDOW_TITLE, 16)
        fx = cv2.getTrackbarPos('FX', WINDOW_TITLE)
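        # trackbar value 0 means "auto": a negative fx marks the focal
        # length as unknown for the downstream pose code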
        if fx == 0:
            fx = -1

        # Prepare Image
        image, input_scale, fx = dataloader.prepareImage(
            webcam_image, height, fx)
        drawCalcTime(webcam_image, time_part, "CV2", 2)
        time_part = time.perf_counter()

        # Get Poses
        pose_3d, pose_2d = dataloader.calcPoses(image, input_scale, fx)
        drawCalcTime(webcam_image, time_part, "POSE", 3)
        time_part = time.perf_counter()

        # Draw Poses
        for pid in range(len(pose_2d)):
            # reshape the flat keypoint list into (3, N): rows are x, y, confidence
            pose = np.array(pose_2d[pid][0:-1]).reshape((-1, 3)).transpose()
            has_pose = pose[2, :] > 0
            for eid in range(len(body_edges)):  # go through all defined edges
                edge = body_edges[eid]
                # if both endpoints were detected, draw the connecting line
                if has_pose[edge[0]] and has_pose[edge[1]]:
                    # HSV color space gives each edge a distinct hue
                    color = colorsys.hsv_to_rgb(eid / 17.0, 1, 1)
                    color = [e * 255 for e in color]  # [0, 1] -> [0, 255] for OpenCV
                    cv2.line(webcam_image,
                             tuple(pose[0:2, edge[0]].astype(int)),
                             tuple(pose[0:2, edge[1]].astype(int)),
                             color, 4, cv2.LINE_AA)

        sync = cv2.getTrackbarPos('Sync Draw', WINDOW_TITLE)
        chart.updateData(pose_3d, sync)

        cv2.putText(webcam_image, "Model: " + getModels()[usedModel], (10, 20),
                    cv2.FONT_HERSHEY_SIMPLEX, .6, (192, 192, 192), 2)

        drawCalcTime(webcam_image, time_part, "DRAW", 4)
        time_part = time.perf_counter()
        drawCalcTime(webcam_image, time_all, "All", 5, True)
        time_all = time.perf_counter()

        # Draw to screen

        cv2.imshow(WINDOW_TITLE, webcam_image)

        # Model Change Event
        model = cv2.getTrackbarPos('Model', WINDOW_TITLE)
        if usedModel != model and model < MODEL_COUNT:
            file = MODEL_FOLDER + getModels()[model]
            dataloader.loadModel(file)
            usedModel = model

        # Screenshot event
        # FIXME This event gets called twice but the position is reset inside the block
        if cv2.getTrackbarPos('Screenshot', WINDOW_TITLE) == 1:
            if not os.path.exists('output'):  # Create folder
                os.makedirs('output')
            dt = datetime.datetime.today().strftime('%Y%m%d-%H.%M.%S')
            cv2.imwrite('output/img_' + dt + '.png',
                        webcam_image)  # Save to folder
            cv2.setTrackbarPos('Screenshot', WINDOW_TITLE, 0)  # Reset Trackbar
            print("Screenshot saved")
            time.sleep(1)  # User can preview the saved frame

        # Exit
        if cv2.waitKey(1) == ESCAPE_KEY:
            break  # Esc to quit

    cv2.destroyAllWindows()
def main():
    parser = ArgumentParser(
        description='Lightweight 3D human pose estimation demo. '
        'Press esc to exit, "p" to (un)pause video or process next image.')
    parser.add_argument(
        '-m',
        '--model',
        help='Required. Path to checkpoint with a trained model '
        '(or an .xml file in case of OpenVINO inference).',
        type=str,
        required=True)
    parser.add_argument('--video',
                        help='Optional. Path to video file or camera id.',
                        type=str,
                        default='')
    parser.add_argument('-o',
                        '--output',
                        help='output directory for estimated results',
                        default='./output')

    parser.add_argument(
        '-d',
        '--device',
        help='Optional. Specify the target device to infer on: CPU or GPU. '
        'The demo will look for a suitable plugin for device specified '
        '(by default, it is GPU).',
        type=str,
        default='GPU')
    parser.add_argument(
        '--use-openvino',
        help='Optional. Run network with OpenVINO as inference engine. '
        'CPU, GPU, FPGA, HDDL or MYRIAD devices are supported.',
        action='store_true')
    parser.add_argument(
        '--use-tensorrt',
        help='Optional. Run network with TensorRT as inference engine.',
        action='store_true')
    parser.add_argument('--images',
                        help='Optional. Path to input image(s).',
                        nargs='+',
                        default='')
    parser.add_argument('--height-size',
                        help='Optional. Network input layer height size.',
                        type=int,
                        default=256)
    parser.add_argument('--extrinsics-path',
                        help='Optional. Path to file with camera extrinsics.',
                        type=str,
                        default=None)
    parser.add_argument('--fx',
                        type=np.float32,
                        default=-1,
                        help='Optional. Camera focal length.')
    args = parser.parse_args()

    if args.video == '' and args.images == '':
        raise ValueError('Either --video or --images has to be provided')

    infer_ctrl = InferCtrl(args.model,
                           args.height_size,
                           device=args.device,
                           openvino=args.use_openvino,
                           tensorrt=args.use_tensorrt,
                           extrinsics_path=args.extrinsics_path,
                           fx=args.fx)

    frame_provider = ImageReader(args.images)
    is_video = False

    # args.images is a list (nargs='+'); derive the output name from the first image
    if args.images:
        outname = os.path.splitext(os.path.basename(args.images[0]))[0]
    else:
        outname = "output"

    if args.video != '':
        frame_provider = VideoReader(args.video)
        is_video = True
        try:
            cam_index = int(args.video)
            outname = "output-cam-{}".format(cam_index)
        except ValueError:  # not a camera index; use the video file name
            outname = ".".join(args.video.split(os.sep)[-1].split(".")[:-1])

    fx = args.fx

    mean_time = 0
    i = 0
    frame_size = None, None

    dir_name = os.path.join(args.output, outname)
    os.makedirs(dir_name, exist_ok=True)

    outname_3d = os.path.join(dir_name, "{}-temp-3D.mp4".format(outname))
    outname_frames = os.path.join(dir_name, "{}-temp.mp4".format(outname))
    outname_combined_frames = os.path.join(
        dir_name, "{}-combined-temp.mp4".format(outname))

    outname_3d_compressed = os.path.join(dir_name, "{}-3D.mp4".format(outname))
    outname_frames_compressed = os.path.join(dir_name,
                                             "{}.mp4".format(outname))
    outname_frames_combined_compressed = os.path.join(
        dir_name, "{}-combined.mp4".format(outname))

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = None
    out_3d = None
    out_combined = None
    fps = None
    fps_out = 15
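    # the three writers are created lazily inside the loop, once the first
    # frame reveals the actual output sizes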

    try:
        results_json = []
        for frame in frame_provider:
            current_time = cv2.getTickCount()
            if frame is None:
                break

            inference_result = infer_ctrl.infer(frame, is_video=False, fx=fx)

            # interpret results
            frame, canvas_3d = infer_ctrl.process_frame(
                frame, inference_result)
            combined_frame = infer_ctrl.process_frame(frame,
                                                      inference_result,
                                                      merged=True)

            for key in inference_result.keys():
                value = inference_result[key].get("value", [])

                if "numpy" in str(type(value)):
                    inference_result[key]["value"] = getattr(
                        value, "tolist", lambda: value)()

            results_json.append(inference_result)

            current_time = (cv2.getTickCount() -
                            current_time) / cv2.getTickFrequency()

            if mean_time == 0:
                mean_time = current_time
            else:
                mean_time = mean_time * 0.95 + current_time * 0.05

            fps = int(1 / mean_time * 10) / 10

            cv2.putText(frame, 'processing FPS: {}'.format(fps), (40, 80),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 255))
            i += 1

            if out is None or out_3d is None:
                # VideoWriter expects (width, height)
                frame_size = frame.shape[1::-1]
                frame_size_3d = canvas_3d.shape[1::-1]
                frame_size_combined = combined_frame.shape[1::-1]

                print(frame_size_3d, frame_size, frame_size_combined)

                out_3d = cv2.VideoWriter(outname_3d, fourcc, fps_out,
                                         frame_size_3d, True)
                out = cv2.VideoWriter(outname_frames, fourcc, fps_out,
                                      frame_size, True)
                out_combined = cv2.VideoWriter(outname_combined_frames, fourcc,
                                               fps_out, frame_size_combined,
                                               True)

            if out is not None:
                out.write(frame)

            if out_3d is not None:
                out_3d.write(canvas_3d)

            if out_combined is not None:
                out_combined.write(combined_frame)

    except KeyboardInterrupt:
        print("[INFO] interrupted")

    if out is not None:
        out.release()

    if out_3d is not None:
        out_3d.release()

    if out_combined is not None:
        out_combined.release()

    with open(os.path.join(dir_name, "results.json"), "w") as fp:
        json.dump(results_json, fp)

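    # re-encode the mp4v intermediates to H.264; assumes ffmpeg is on PATH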
    try:
        os.system(
            f"ffmpeg -i {outname_frames} -loglevel error -vcodec libx264 {outname_frames_compressed}"
        )
        os.system(
            f"ffmpeg -i {outname_3d} -loglevel error -vcodec libx264 {outname_3d_compressed}"
        )
        os.system(
            f"ffmpeg -i {outname_combined_frames} -loglevel error -vcodec libx264 {outname_frames_combined_compressed}"
        )

        # os.system(f"rm -rf {outname_frames} {outname_3d}")
    except Exception:
        traceback.print_exc()

    print("[INFO] finished .... ")