# Shared imports for the three examples below. The standard-library and
# OpenCV imports are certain from usage; the project-module imports are
# commented out because their exact paths are an assumption based on how the
# classes are used (they may differ in your repository layout).
import logging
import os
import sys
import time

import cv2

# from face_detection import FaceDetectionModel
# from facial_landmarks_detection import FacialLandMarksDetectionModel
# from gaze_estimation import GazeEstimationModel
# from head_pose_estimation import HeadPoseEstimationModel
# from input_feeder import InputFeeder
# from mouse_controller import MouseController


# Example 1
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.
    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    try:
        logging.basicConfig(level=logging.DEBUG,
                            format="%(asctime)s [%(levelname)s] %(message)s",
                            handlers=[
                                logging.FileHandler("debug.log"),
                                logging.StreamHandler()
                            ])

        # Initialise the mouse controller and the four model wrappers
        mc = MouseController("low", "fast")
        fdnet = FaceDetectionModel(args.fdmodel)
        lmnet = FacialLandMarksDetectionModel(args.lmmodel)
        hpnet = HeadPoseEstimationModel(args.hpmodel)
        genet = GazeEstimationModel(args.gemodel)

        start_time = time.time()
        fdnet.load_model()
        logging.info(
            f"Face Detection Model: {1000 * (time.time() - start_time):.1f}ms")

        start_time = time.time()
        lmnet.load_model()
        logging.info(
            f"Facial Landmarks Detection Model: {1000 * (time.time() - start_time):.1f}ms"
        )

        start_time = time.time()
        hpnet.load_model()
        logging.info(
            f"Headpose Estimation Model: {1000 * (time.time() - start_time):.1f}ms"
        )

        start_time = time.time()
        genet.load_model()
        logging.info(
            f"Gaze Estimation Model: {1000 * (time.time() - start_time):.1f}ms"
        )

        # Get and open video capture
        feeder = InputFeeder('video', args.input)
        feeder.load_data()

        frame_count = 0

        # accumulated inference time per model (seconds)
        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0

        while True:
            # Read the next frame. next_batch() is a generator over the shared
            # capture, so pulling a single frame per call still advances the
            # stream (assuming the usual InputFeeder implementation).
            try:
                frame = next(feeder.next_batch())
            except StopIteration:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1

            # face detection
            p_frame = fdnet.preprocess_input(frame)
            start_time = time.time()
            fd_output = fdnet.predict(p_frame)
            fd_infertime += time.time() - start_time
            out_frame, bboxes = fdnet.preprocess_output(
                fd_output, frame, args.print)

            for bbox in bboxes:
                # bbox = (xmin, ymin, xmax, ymax); crop the face from the frame
                face = frame[bbox[1]:bbox[3], bbox[0]:bbox[2]]
                p_frame = lmnet.preprocess_input(face)

                start_time = time.time()
                lm_output = lmnet.predict(p_frame)
                lm_infertime += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output(
                    lm_output, bbox, out_frame, args.print)

                # get head pose estimation
                p_frame = hpnet.preprocess_input(face)
                start_time = time.time()
                hp_output = hpnet.predict(p_frame)
                hp_infertime += time.time() - start_time
                out_frame, headpose_angles = hpnet.preprocess_output(
                    hp_output, out_frame, face, bbox, args.print)

                # get gaze estimation
                out_frame, left_eye, right_eye = genet.preprocess_input(
                    out_frame, face, left_eye_point, right_eye_point,
                    args.print)
                start_time = time.time()
                ge_output = genet.predict(left_eye, right_eye, headpose_angles)
                ge_infertime += time.time() - start_time
                out_frame, gaze_vector = genet.preprocess_output(
                    ge_output, out_frame, bbox, left_eye_point,
                    right_eye_point, args.print)

                if not args.no_video:
                    cv2.imshow('image', out_frame)

                if not args.no_move:
                    mc.move(gaze_vector[0], gaze_vector[1])

                # consider only the first detected face in the frame
                break

            # break if the Escape key was pressed
            if key_pressed == 27:
                break

        # log average inference time per frame for each model
        if frame_count > 0:
            logging.info(
                f"Face Detection: {1000 * fd_infertime / frame_count:.1f}ms")
            logging.info(
                f"Facial Landmarks Detection: {1000 * lm_infertime / frame_count:.1f}ms"
            )
            logging.info(
                f"Headpose Estimation: {1000 * hp_infertime / frame_count:.1f}ms")
            logging.info(
                f"Gaze Estimation: {1000 * ge_infertime / frame_count:.1f}ms")

        feeder.close()
        cv2.destroyAllWindows()
    except Exception as ex:
        logging.exception(f"Error during inference: {ex}")
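

# The docstrings above reference build_argparser(), which is not shown in
# these examples. Below is a minimal sketch of a parser that satisfies the
# attributes Examples 1 and 2 actually read (fdmodel, lmmodel, hpmodel,
# gemodel, input, print, no_video, no_move); the flag spellings, defaults,
# and help strings are assumptions, not the original project's CLI.
import argparse


def build_argparser():
    parser = argparse.ArgumentParser(description="Gaze-based pointer control")
    parser.add_argument("--fdmodel", required=True,
                        help="path to the face detection model XML")
    parser.add_argument("--lmmodel", required=True,
                        help="path to the facial landmarks model XML")
    parser.add_argument("--hpmodel", required=True,
                        help="path to the head pose estimation model XML")
    parser.add_argument("--gemodel", required=True,
                        help="path to the gaze estimation model XML")
    parser.add_argument("--input", required=True,
                        help="path to the input video file")
    parser.add_argument("--print", action="store_true",
                        help="draw intermediate model outputs on the frame")
    parser.add_argument("--no_video", action="store_true",
                        help="do not display the output window")
    parser.add_argument("--no_move", action="store_true",
                        help="do not move the mouse pointer")
    return parser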


# Example 2
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    try:
        logging.basicConfig(level=logging.INFO,
                            format="%(asctime)s [%(levelname)s] %(message)s",
                            handlers=[
                                logging.FileHandler("gaze-app.log"),
                                logging.StreamHandler()
                            ])

        # Initialise the class
        mc = MouseController("low", "fast")
        fdnet = FaceDetectionModel(args.fdmodel)
        lmnet = FacialLandMarksDetectionModel(args.lmmodel)
        hpnet = HeadPoseEstimationModel(args.hpmodel)
        genet = GazeEstimationModel(args.gemodel)

        ### Load the models ###
        logging.info("============== Models Load time ===============")
        start_time = time.time()
        fdnet.load_model()
        logging.info("Face Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        lmnet.load_model()
        logging.info("Facial Landmarks Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        hpnet.load_model()
        logging.info("Headpose Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        genet.load_model()
        logging.info("Gaze Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        logging.info("==============  End =====================")
        # Get and open video capture
        feeder = InputFeeder('video', args.input)
        feeder.load_data()

        # init scene variables
        frame_count = 0

        ### Loop until stream is over ###
        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0
        while True:
            # Read the next frame
            try:
                frame = next(feeder.next_batch())
            except StopIteration:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1

            # face detection
            p_frame = fdnet.preprocess_input(frame)
            start_time = time.time()
            fdoutput = fdnet.predict(p_frame)
            fd_infertime += time.time() - start_time
            out_frame, fboxes = fdnet.preprocess_output(
                fdoutput, frame, args.print)

            # for each detected face
            for fbox in fboxes:
                # fbox = (xmin, ymin, xmax, ymax); crop the face from the frame
                face = frame[fbox[1]:fbox[3], fbox[0]:fbox[2]]
                p_frame = lmnet.preprocess_input(face)

                start_time = time.time()
                lmoutput = lmnet.predict(p_frame)
                lm_infertime += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output(
                    lmoutput, fbox, out_frame, args.print)

                # get head pose estimation
                p_frame = hpnet.preprocess_input(face)
                start_time = time.time()
                hpoutput = hpnet.predict(p_frame)
                hp_infertime += time.time() - start_time
                out_frame, headpose_angles = hpnet.preprocess_output(
                    hpoutput, out_frame, face, fbox, args.print)

                # get gaze estimation
                out_frame, left_eye, right_eye = genet.preprocess_input(
                    out_frame, face, left_eye_point, right_eye_point,
                    args.print)
                start_time = time.time()
                geoutput = genet.predict(left_eye, right_eye, headpose_angles)
                ge_infertime += time.time() - start_time
                out_frame, gazevector = genet.preprocess_output(
                    geoutput, out_frame, fbox, left_eye_point, right_eye_point,
                    args.print)

                if not args.no_video:
                    cv2.imshow('im', out_frame)

                if not args.no_move:
                    mc.move(gazevector[0], gazevector[1])

                # consider only the first detected face in the frame
                break

            # Break if escape key pressed
            if key_pressed == 27:
                break

        # log average inference time per frame for each model
        if frame_count > 0:
            logging.info(
                "============== Models Inference time ===============")
            logging.info("Face Detection: {:.1f}ms".format(
                1000 * fd_infertime / frame_count))
            logging.info("Facial Landmarks Detection: {:.1f}ms".format(
                1000 * lm_infertime / frame_count))
            logging.info("Headpose Estimation: {:.1f}ms".format(
                1000 * hp_infertime / frame_count))
            logging.info("Gaze Estimation: {:.1f}ms".format(
                1000 * ge_infertime / frame_count))
            logging.info("============== End ===============================")

        # Release the capture and destroy any OpenCV windows
        feeder.close()
        cv2.destroyAllWindows()
    except Exception as ex:
        logging.exception("Error in inference: " + str(ex))
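

# All three examples pull frames with next(feeder.next_batch()), creating a
# fresh generator on every iteration and taking one frame from it. That only
# works if next_batch() reads from a capture object shared across calls. A
# minimal sketch of a feeder with that behaviour, matching how the examples
# use it (load_data, next_batch, fps, close); this is an assumption about the
# helper class, not its actual implementation, so it is named accordingly.
class InputFeederSketch:

    def __init__(self, input_type, input_file=None):
        self.input_type = input_type
        self.input_file = input_file
        self.cap = None

    def load_data(self):
        # 'cam' would open the default webcam; the examples only use 'video'
        source = 0 if self.input_type == 'cam' else self.input_file
        self.cap = cv2.VideoCapture(source)

    def fps(self):
        return self.cap.get(cv2.CAP_PROP_FPS)

    def next_batch(self):
        # Generator over the shared capture: each new generator still
        # advances the same underlying stream by one frame per next().
        while True:
            flag, frame = self.cap.read()
            if not flag:
                return  # ends the generator -> StopIteration in the caller
            yield frame

    def close(self):
        if self.cap is not None:
            self.cap.release()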


# Example 3
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    face_detection_model_file = args.faceDetectionModel
    facial_landmarks_detection_model_file = args.facialLandmarksModel
    head_pose_estimation_model_file = args.headPoseModel
    gaze_estimation_model_file = args.gazeModel

    video_file = args.input
    device_name = args.device
    cpu_extension = args.cpu_extension
    prob_threshold = args.prob_threshold
    preview_flag = args.preview_flag

    output_path = args.output_path
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    mouse_control = MouseController("low", "fast")

    try:
        logging.info("*********** Model Load Time ***************")
        start_model_load_time = time.time()

        start_time = time.time()
        face_detection_model = FaceDetectionModel(face_detection_model_file,
                                                  device_name, cpu_extension)
        logging.info("Face Detection Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        facial_landmarks_detection_model = FacialLandmarksDetectionModel(
            facial_landmarks_detection_model_file, device_name, cpu_extension)
        logging.info("Facial Landmarks Detection Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        head_pose_estimation_model = HeadPoseEstimationModel(
            head_pose_estimation_model_file, device_name, cpu_extension)
        logging.info("Head Pose Estimation Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        gaze_estimation_model = GazeEstimationModel(gaze_estimation_model_file,
                                                    device_name, cpu_extension)
        logging.info("Gaze Estimation Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        total_model_load_time = time.time() - start_model_load_time
        logging.info("*********** Model Load Completed ***********")
    except Exception as e:
        logging.error("ERROR in model loading: " + str(e))
        sys.exit(1)

    feeder = InputFeeder('video', video_file)
    feeder.load_data()

    # the writer runs at one tenth of the source FPS; note the frame size is
    # hard-coded, so writes silently fail unless frames are 1920x1080
    out_video = cv2.VideoWriter(os.path.join(output_path, 'output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                int(feeder.fps() / 10), (1920, 1080), True)

    start_inference_time = time.time()
    frame_count = 0
    face_detect_infer_time = 0
    facial_landmarks_infer_time = 0
    head_pose_infer_time = 0
    gaze_infer_time = 0

    while True:
        try:
            frame = next(feeder.next_batch())
        except StopIteration:
            break

        key_pressed = cv2.waitKey(60)
        frame_count += 1

        ## Face Detection Model
        image = face_detection_model.preprocess_input(frame)

        start_time = time.time()
        outputs = face_detection_model.predict(image)
        face_detect_infer_time += (time.time() - start_time)
        out_frame, faces = face_detection_model.preprocess_output(
            outputs, frame, preview_flag, prob_threshold)

        for face in faces:
            crop_image = frame[face[1]:face[3], face[0]:face[2]]

            ## Facial Landmarks Detection Model
            image = facial_landmarks_detection_model.preprocess_input(
                crop_image)

            start_time = time.time()
            outputs = facial_landmarks_detection_model.predict(image)
            facial_landmarks_infer_time += (time.time() - start_time)
            out_frame, left_eye_point, right_eye_point = facial_landmarks_detection_model.preprocess_output(
                outputs, out_frame, face, preview_flag)

            ## Head Pose Estimation Model
            image = head_pose_estimation_model.preprocess_input(crop_image)

            start_time = time.time()
            outputs = head_pose_estimation_model.predict(image)
            head_pose_infer_time += (time.time() - start_time)
            out_frame, headpose_angles_list = head_pose_estimation_model.preprocess_output(
                outputs, out_frame, preview_flag)

            ## Gaze Estimation Model
            out_frame, left_eye, right_eye = gaze_estimation_model.preprocess_input(
                out_frame, crop_image, left_eye_point, right_eye_point)

            start_time = time.time()
            outputs = gaze_estimation_model.predict(left_eye, right_eye,
                                                    headpose_angles_list)
            gaze_infer_time += (time.time() - start_time)
            out_frame, gazevector = gaze_estimation_model.preprocess_output(
                outputs, out_frame, face, left_eye_point, right_eye_point,
                preview_flag)

            cv2.imshow("Computer Pointer Control", out_frame)
            out_video.write(out_frame)
            mouse_control.move(gazevector[0], gazevector[1])

        if key_pressed == 27:
            break

    if frame_count > 0:
        logging.info("*********** Model Inference Time ****************")
        logging.info("Face Detection Model: {:.1f} ms.".format(
            1000 * face_detect_infer_time / frame_count))
        logging.info("Facial Landmarks Detection Model: {:.1f} ms.".format(
            1000 * facial_landmarks_infer_time / frame_count))
        logging.info("Head Pose Estimation Model: {:.1f} ms.".format(
            1000 * head_pose_infer_time / frame_count))
        logging.info("Gaze Estimation Model: {:.1f} ms.".format(
            1000 * gaze_infer_time / frame_count))
        logging.info("*********** Model Inference Completed ***********")

    total_infer_time = time.time() - start_inference_time
    total_inference_time = round(total_infer_time, 1)
    # guard against a zero denominator on very short runs
    fps = frame_count / total_inference_time if total_inference_time > 0 else 0

    with open(os.path.join(output_path, 'stats.txt'), 'w') as f:
        f.write(str(total_inference_time) + '\n')
        f.write(str(fps) + '\n')
        f.write(str(total_model_load_time) + '\n')

    logging.info("*********** Total Summary ****************")
    logging.info(f"Total Model Load Time: {total_model_load_time:.1f} s")
    logging.info(f"Total Inference Time: {total_inference_time} s")
    logging.info(f"FPS: {fps:.1f}")
    logging.info("*********** End of Summary ***********")

    # release the writer and capture, and close any OpenCV windows
    out_video.release()
    feeder.close()
    cv2.destroyAllWindows()
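

# Example 3 reads a different set of argument names than the first two. A
# hypothetical parser and entry point wired to it; the attribute names come
# from the code above, while the flag spellings, defaults, the function name
# build_argparser_example3, and the __main__ guard are all assumptions.
import argparse


def build_argparser_example3():
    parser = argparse.ArgumentParser(description="Computer Pointer Control")
    parser.add_argument("--faceDetectionModel", required=True,
                        help="path to the face detection model XML")
    parser.add_argument("--facialLandmarksModel", required=True,
                        help="path to the facial landmarks model XML")
    parser.add_argument("--headPoseModel", required=True,
                        help="path to the head pose estimation model XML")
    parser.add_argument("--gazeModel", required=True,
                        help="path to the gaze estimation model XML")
    parser.add_argument("--input", required=True,
                        help="path to the input video file")
    parser.add_argument("--device", default="CPU",
                        help="target device: CPU, GPU, FPGA or MYRIAD")
    parser.add_argument("--cpu_extension", default=None,
                        help="path to a CPU extension library, if required")
    parser.add_argument("--prob_threshold", type=float, default=0.6,
                        help="probability threshold for face detection")
    parser.add_argument("--preview_flag", action="store_true",
                        help="draw intermediate model outputs on the frame")
    parser.add_argument("--output_path", default="results",
                        help="directory for the output video and stats.txt")
    return parser


if __name__ == '__main__':
    infer_on_stream(build_argparser_example3().parse_args())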