Example #1
0
def main():
    """Run the gaze-controlled mouse application from start to finish.

    Reads the command-line parameters, creates the mouse controller and the
    four detector/estimator objects on one shared ``IECore`` instance, loads
    every model, and then pushes each frame delivered by the input feeder
    through ``pipline``. The frame returned by ``pipline`` is shown in a
    window and the mouse is moved to the returned gaze position. Pressing
    ``q`` or receiving a ``None`` frame ends the loop.

    The detectors, estimators and mouse controller are stored in module
    globals (presumably so ``pipline`` can reach them -- verify against its
    definition). The input feeder is always closed, even if the loop raises.
    """
    global face_detector, landmark_dectector, head_pose_estimator, gaze_estimator, mouse_controller

    ticks_at_start = cv2.getTickCount()

    set_logging()
    logging.info('start APP')
    pyautogui.FAILSAFE = False
    cmd_paras = get_comand_line_parameters()

    # Project-level helpers.
    mouse_controller = createMouseController()

    # Model wrappers.
    logging.info('setUp dectors and estimators started')

    # NC2 can only handle one IECore instance, so every model shares it.
    plugin = IECore()

    face_detector = FaceDetector(cmd_paras.fd, cmd_paras.device, plugin)
    face_detector.load_model()

    landmark_dectector = FacialLandmarksDetector(
        cmd_paras.lr, cmd_paras.device, plugin)
    landmark_dectector.load_model()

    head_pose_estimator = HeadPoseEstimator(
        cmd_paras.hp, cmd_paras.device, plugin)
    head_pose_estimator.load_model()

    gaze_estimator = GazeEstimatior(cmd_paras.ge, cmd_paras.device, plugin)
    gaze_estimator.load_model()

    logging.info('setUp dectors estimators ends')

    logging.info('start inputFeeder read stream')
    input_feeder = createInputFeeder(cmd_paras)
    input_feeder.load_data()

    # Run the pipeline frame by frame, counting the frames that were processed.
    pipeline_runs = 0
    try:
        for frame in input_feeder.next_batch():
            if frame is None:
                break

            pipeline_runs += 1
            frame, gaze_position = pipline(frame, pipeline_runs)
            cv2.imshow('frame', frame)

            quit_requested = cv2.waitKey(1) & 0xFF == ord('q')
            if quit_requested:
                break

            mouse_controller.move(gaze_position[0], gaze_position[1])
    finally:
        input_feeder.close()
        logging.info('end inputFeeder end stream')

    elapsed_seconds = (
        cv2.getTickCount() - ticks_at_start) / cv2.getTickFrequency()
    logging.info('App Runtime: {:.4f} seconds pipline runs {:d}'.format(
        elapsed_seconds, pipeline_runs))
def main():
    """Load the four models, run gaze inference on the input and move the mouse.

    Command-line arguments come from the module-level ``parser``. Model files
    are looked up under ``models/intel/`` using the precision chosen for each
    model on the command line.

    Throttling, as implemented by the loop:
      * camera input (``inputs == 0``): inference runs at most once per
        second; the mouse is moved whenever a gaze estimate is available.
      * any other input: inference runs on every frame, but the mouse is
        moved only if at least 0.5 s passed since the timer was last reset.

    The loop ends when the capture closes, a frame cannot be read, or
    ``run_inference`` returns ``None``. The input feeder is closed even when
    the loop raises. Returns ``None``; returns early (after logging an error)
    when ``input_feeder.load_data()`` raises ``TypeError``.
    """
    # Get command line arguments
    args = parser.parse_args()
    device = args.device
    cpu_extensions = args.extensions
    input_feeder = InputFeeder(args)
    control_mouse = MouseController(args)

    gaze_model = 'models/intel/gaze-estimation-adas-0002/{}/gaze-estimation-adas-0002'.format(
        args.gaze_estimation_precision)
    face_detector_model = 'models/intel/face-detection-adas-binary-0001/{}/face-detection-adas-binary-0001'.format(
        args.face_detection_precision)
    facial_landmark_model = 'models/intel/landmarks-regression-retail-0009/{}/landmarks-regression-retail-0009'.format(
        args.landmarks_precision)
    head_pose_model = 'models/intel/head-pose-estimation-adas-0001/{}/head-pose-estimation-adas-0001'.format(
        args.head_pose_precision)

    # Initialize the models
    face_detector = FaceDetector(face_detector_model, args)
    facial_landmarks = FacialLandmarksDetector(
        model_name=facial_landmark_model,
        device=device,
        extensions=cpu_extensions)
    head_pose_estimation = HeadPoseEstimation(model_name=head_pose_model,
                                              device=device,
                                              extensions=cpu_extensions)
    gaze_estimation = GazeEstimation(model_name=gaze_model,
                                     device=device,
                                     extensions=cpu_extensions)

    def timed_load(model):
        """Call ``model.load_model()`` and return the elapsed seconds."""
        started = time.time()
        model.load_model()
        return time.time() - started

    # Load the models (same order as before: face, landmarks, pose, gaze)
    face_detector_loadtime = timed_load(face_detector)
    facial_landmark_loadtime = timed_load(facial_landmarks)
    head_pose_estimation_loadtime = timed_load(head_pose_estimation)
    gaze_estimation_loadtime = timed_load(gaze_estimation)
    logging.info('FINISH LOADING MODELS')

    try:
        width, height = input_feeder.load_data()
    except TypeError:
        logging.error('Invalid file type.')
        return

    frame_count = 0
    try:
        output_handler = OutputHandler(args)
        output_handler.initalize_video_writer(width, height)

        # 0 doubles as the "timer not running" marker.
        start_time = 0
        capture = input_feeder.cap
        inputs = args.input
        if input_feeder.input_type == 'cam':
            inputs = 0
        else:
            capture.open(inputs)
        from_camera = inputs == 0

        while capture.isOpened():
            flag, frame = capture.read()
            if not flag:
                # End of stream or failed grab: there is no frame to process.
                break

            if start_time == 0:
                start_time = time.time()

            if from_camera and time.time() - start_time < 1:
                # Camera input: wait until a full second has elapsed.
                continue

            gaze_estimate = run_inference(frame, face_detector,
                                          facial_landmarks,
                                          head_pose_estimation,
                                          gaze_estimation, output_handler)
            if gaze_estimate is None:
                break

            if from_camera:
                if gaze_estimate[0][0]:
                    x, y = gaze_estimate[0][:2]
                    control_mouse.move(x, y)
                # The camera timer restarts after every inference.
                start_time = 0
            elif gaze_estimate[0][0] and time.time() - start_time >= 0.5:
                x, y = gaze_estimate[0][:2]
                control_mouse.move(x, y)
                # The file timer restarts only after the mouse was moved.
                start_time = 0
            frame_count += 1
    finally:
        input_feeder.close()

    logging.info('TOTOAL FRAMES PROCESSED: {}'.format(frame_count))
    logging.info('Time to load face detector model is {:.5f}'.format(
        face_detector_loadtime))
    logging.info('Time to load head pose estimation model is {:.5f}'.format(
        head_pose_estimation_loadtime))
    logging.info('Time to load facial landmarks model model is {:.5f}'.format(
        facial_landmark_loadtime))
    logging.info('Time to load gaze estimation model is {:.5f}'.format(
        gaze_estimation_loadtime))
Example #3
0
def infer_on_stream(args):
    """Run the gaze pipeline over the input and write timing statistics.

    Steps:
      1. Configure logging to ``app.log`` and the console.
      2. Build and load the face, landmark, head-pose and gaze models,
         logging the combined load time.
      3. For every frame: detect faces, process only the first detected
         face, draw the gaze line, and optionally show the frame
         (``args.show_input``) and move the mouse (``args.move_mouse``).
      4. Write the frame to the output video (if one was created) and, for
         ``args.input_type == "image"``, to ``output_image.jpg``.
      5. Write inference time, preprocess time, FPS and model load time to
         ``stats.txt`` in ``<args.output_path>/<args.device>``.

    The loop stops on ``StopIteration``, on a ``None`` frame, or when ESC
    (key code 27) is pressed. Any exception is logged with its traceback and
    swallowed, so the function always returns ``None``. Once the feed has
    been loaded it is closed, and the OpenCV windows destroyed, even on error.
    """
    try:
        log.basicConfig(
            level=log.INFO,
            format="%(asctime)s [%(levelname)s] %(message)s",
            handlers=[log.FileHandler("app.log"),
                      log.StreamHandler()])

        mouse_controller = MouseController(precision="low", speed="fast")

        start_model_load_time = time.time()

        face_detector = FaceDetector(args.model_face_detection)
        facial_landmarks_detector = FacialLandmarksDetector(
            args.model_facial_landmarks_detection)
        head_pose_estimator = HeadPoseEstimator(
            args.model_head_pose_estimation)
        gaze_estimator = GazeEstimator(args.model_gaze_estimation)
        face_detector.load_model()
        facial_landmarks_detector.load_model()
        head_pose_estimator.load_model()
        gaze_estimator.load_model()

        total_model_load_time = time.time() - start_model_load_time
        log.info("Model load time: {:.1f}ms".format(1000 *
                                                    total_model_load_time))

        # Join the components properly instead of hard-coding a backslash,
        # which only formed a nested directory on Windows.
        output_directory = os.path.join(args.output_path, args.device)
        os.makedirs(output_directory, exist_ok=True)

        feed = InputFeeder(args.input_type, args.input_path)
        feed.load_data()
        try:
            out_video = feed.get_out_video(output_directory)

            frame_counter = 0
            start_inference_time = time.time()
            total_prepocess_time = 0

            while True:
                # NOTE(review): a fresh generator is requested for every
                # frame; this only advances if next_batch() keeps its state
                # on the feeder -- confirm against InputFeeder.
                try:
                    frame = next(feed.next_batch())
                except StopIteration:
                    break
                if frame is None:
                    # No frame delivered: treat it as the end of the input.
                    break
                frame_counter += 1

                face_boxes = face_detector.predict(frame)
                for face_box in face_boxes:
                    face_image = get_crop_image(frame, face_box)
                    eye_boxes, eye_centers = facial_landmarks_detector.predict(
                        face_image)
                    left_eye_image, right_eye_image = [
                        get_crop_image(face_image, eye_box)
                        for eye_box in eye_boxes
                    ]
                    head_pose_angles = head_pose_estimator.predict(face_image)
                    gaze_x, gaze_y = gaze_estimator.predict(
                        right_eye_image, head_pose_angles, left_eye_image)
                    draw_gaze_line(frame, face_box, eye_centers, gaze_x,
                                   gaze_y)
                    if args.show_input:
                        cv2.imshow('im', frame)
                    if args.move_mouse:
                        mouse_controller.move(gaze_x, gaze_y)
                    total_prepocess_time += (
                        face_detector.preprocess_time +
                        facial_landmarks_detector.preprocess_time +
                        head_pose_estimator.preprocess_time +
                        gaze_estimator.preprocess_time)
                    # Only the first detected face drives the pointer.
                    break

                if out_video is not None:
                    out_video.write(frame)
                if args.input_type == "image":
                    cv2.imwrite(
                        os.path.join(output_directory, 'output_image.jpg'),
                        frame)

                key_pressed = cv2.waitKey(60)
                if key_pressed == 27:
                    break

            total_time = time.time() - start_inference_time
            total_inference_time = round(total_time, 1)
            # The rounded duration is 0.0 for very short runs (e.g. no
            # frames at all); report 0 FPS instead of dividing by zero.
            if total_inference_time:
                fps = frame_counter / total_inference_time
            else:
                fps = 0.0
            log.info("Inference time:{:.1f}ms".format(1000 *
                                                      total_inference_time))
            log.info("Input/output preprocess time:{:.1f}ms".format(
                1000 * total_prepocess_time))
            log.info("FPS:{}".format(fps))

            stats_path = os.path.join(output_directory, 'stats.txt')
            with open(stats_path, 'w') as f:
                f.write(str(total_inference_time) + '\n')
                f.write(str(total_prepocess_time) + '\n')
                f.write(str(fps) + '\n')
                f.write(str(total_model_load_time) + '\n')
        finally:
            feed.close()
            cv2.destroyAllWindows()
    except Exception as e:
        log.exception("Something wrong when running inference:" + str(e))
def main(args):
    """Run face, landmark, head-pose and gaze inference on every input frame.

    Per-stage inference times are written to ``output/stats_<device>_...``.
    For ``args.input_type == 'image'`` the intermediate images are saved
    under ``output/``, shown in windows, the load/pre/post-processing times
    are appended to the stats file, and the function waits for a key press.
    For any other input type the mouse is moved according to the gaze
    estimate and the loop can be left with ``q``.

    Frames without a detected face or without both eye boxes are skipped.
    The input feed, the stats file and all OpenCV windows are released when
    the function exits, including when an exception is raised.

    Args:
        args: parsed command-line namespace; the attributes read here are
            ``device``, ``hpe``, ``fld``, ``ge``, ``v``, ``input_type`` and
            ``input_file``.
    """
    from contextlib import ExitStack

    print("Main script running...")
    log_name = 'stats_' + args.device + '_' + args.hpe + args.fld + args.ge

    os.makedirs('output', exist_ok=True)
    print(f"Logging to: output/{log_name}")

    # Callbacks run in reverse order of registration on exit:
    # feed.close(), stats file close, cv2.destroyAllWindows().
    with ExitStack() as cleanup:
        cleanup.callback(cv2.destroyAllWindows)
        log_file = cleanup.enter_context(open('output/' + log_name, 'w+'))

        print("Initializing models...")

        fd = FaceDetector(
            model_name=
            'models/intel/face-detection-adas-binary-0001/FP32-INT1/face-detection-adas-binary-0001',
            device=args.device,
            extensions=None)
        fd.load_model()
        if args.v:
            print(f"Face Detection Load Time: {fd.load_time}")

        hpe = HeadPoseEstimator(
            model_name=
            f'models/intel/head-pose-estimation-adas-0001/{args.hpe}/head-pose-estimation-adas-0001',
            device=args.device,
            extensions=None)
        hpe.load_model()
        if args.v:
            print(f"Head Pose Estimation Load Time: {hpe.load_time}")

        fld = FacialLandmarkDetector(
            model_name=
            f'models/intel/landmarks-regression-retail-0009/{args.fld}/landmarks-regression-retail-0009',
            device=args.device,
            extensions=None)
        fld.load_model()
        if args.v:
            print(f"Facial Landmarks Detection Load Time: {fld.load_time}")

        ge = GazeEstimator(
            model_name=
            f'models/intel/gaze-estimation-adas-0002/{args.ge}/gaze-estimation-adas-0002',
            device=args.device,
            extensions=None)
        ge.load_model()
        if args.v:
            print(f"Gaze Estimation Load Time: {ge.load_time}")

        print("Initializing source feed...")
        feed = InputFeeder(input_type=args.input_type,
                           input_file=args.input_file)
        image = args.input_type == 'image'

        feed.load_data()
        cleanup.callback(feed.close)

        # Created on first use and then reused for every following frame.
        mouse = None

        for batch in feed.next_batch():
            if args.v:
                print()
            cv2.imshow('Batch', batch)
            if image:
                cv2.imwrite('output/Batch.png', batch)

            # Face detection
            coords, bounding_face = fd.predict(batch)
            if not coords:
                print("No face")
                continue
            if image:
                cv2.imwrite('output/Face.png', bounding_face)
            # Only the first reported face box is used.
            box = coords[0]
            face = bounding_face[box[1]:box[3], box[0]:box[2]]

            if args.v:
                print(f"Face Time: {fd.infer_time}")
            log_file.write("FD_infer: " + str(fd.infer_time) + "\n")
            if image:
                cv2.imshow('Cropped Face', face)

            # Landmark detection
            coords, landmark_detection, landmark_points = fld.predict(face)
            if image:
                cv2.imwrite('output/Landmarks.png', landmark_detection)
                cv2.imshow('Landmark Detection', landmark_detection)
            if args.v:
                print(f"Landmark Time: {fld.infer_time}")
            log_file.write("FLD_infer: " + str(fld.infer_time) + "\n")
            right_box, left_box = coords[0:2]
            if args.v:
                print(f"Eye Coords: {coords}")

            if left_box is None or right_box is None:
                print("No eyes")
                continue

            # Each eye is cropped before its own label is drawn onto `face`.
            left_eye = face[left_box[1]:left_box[3], left_box[0]:left_box[2]]
            cv2.putText(face, 'L', (left_box[0], left_box[3]),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 2)

            right_eye = face[right_box[1]:right_box[3],
                             right_box[0]:right_box[2]]
            cv2.putText(face, 'R', (right_box[0], right_box[3]),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 2)

            if args.v:
                print(f"Eye Shape: {left_eye.shape} :: {right_eye.shape}")

            # Head pose estimation
            head_yaw, head_pitch, head_roll = hpe.predict(face)
            if args.v:
                print(f"Head Pose Time: {hpe.infer_time}")
            log_file.write("HPE_infer: " + str(hpe.infer_time) + "\n")
            head_angles = [head_yaw[0][0], head_pitch[0][0], head_roll[0][0]]

            # Gaze estimation; expects the pose as (yaw, pitch, roll).
            gaze = ge.predict(left_eye, right_eye, head_angles)
            if args.v:
                print(f"Gaze Time: {ge.infer_time}")
            log_file.write("GE_infer: " + str(ge.infer_time) + "\n")

            # Scale the gaze vector to pixels for drawing; the y component
            # is subtracted when the arrow end points are computed.
            gaze_point = (int(gaze[0][0] * 50), int(gaze[0][1] * 50))
            arrows = cv2.arrowedLine(face, landmark_points[0],
                                     (landmark_points[0][0] + gaze_point[0],
                                      landmark_points[0][1] - gaze_point[1]),
                                     (0, 0, 255), 2)
            arrows = cv2.arrowedLine(face, landmark_points[1],
                                     (landmark_points[1][0] + gaze_point[0],
                                      landmark_points[1][1] - gaze_point[1]),
                                     (0, 0, 255), 2)

            if image:
                cv2.imwrite('output/Gaze.png', arrows)
                cv2.imshow('Arrows', arrows)

                log_file.write("FD_LoadTime: " + str(fd.load_time) + "\n")
                log_file.write("FD_PreprocessTime: " +
                               str(fd.preprocess_input_time) + "\n")
                log_file.write("FD_PostrocessTime: " +
                               str(fd.preprocess_output_time) + "\n")

                log_file.write("FLD_LoadTime: " + str(fld.load_time) + "\n")
                log_file.write("FLD_PreprocessTime: " +
                               str(fld.preprocess_input_time) + "\n")
                log_file.write("FLD_PostprocessTime: " +
                               str(fld.preprocess_output_time) + "\n")

                log_file.write("HPE_LoadTime: " + str(hpe.load_time) + "\n")
                log_file.write("HPE_PreprocessTime: " +
                               str(hpe.preprocess_input_time) + "\n")

                log_file.write("GE_LoadTime: " + str(ge.load_time) + "\n")
                log_file.write("GE_PreprocessTime: " +
                               str(ge.preprocess_input_time) + "\n")

                # Keep the image windows open until a key is pressed.
                cv2.waitKey(0)
            else:
                if mouse is None:
                    mouse = MouseController(precision='medium',
                                            speed='medium')
                mouse.move(gaze[0][0], gaze[0][1])

                if cv2.waitKey(15) & 0xFF == ord('q'):
                    break
Example #5
0
def main():
    """Drive the mouse pointer from gaze estimated on a camera or video feed.

    The input is the camera when ``--input`` is ``"CAM"``, otherwise a video
    file (the process exits with status 1 if the path is not a file). For
    every frame the face, eye landmarks, head pose and gaze are estimated,
    the overlays named in ``args.visualization`` (``fd``, ``fl``, ``hp``,
    ``ge``) are drawn, the frame is displayed and the mouse is moved.

    Frames without a detected face are skipped. ESC (key code 27) ends the
    loop on any frame. The input feeder is closed and the OpenCV windows are
    destroyed on exit, including when an exception is raised.
    """
    args = build_argparser().parse_args()
    input_file = args.input
    logger = log.getLogger()
    if input_file == "CAM":
        input_feeder = InputFeeder("cam")
    else:
        if not os.path.isfile(input_file):
            logger.error("Path should be file")
            # Same effect as exit(1) without depending on the site builtin.
            raise SystemExit(1)
        input_feeder = InputFeeder("video", input_file)

    # All four model wrappers take the same keyword options.
    model_options = dict(
        device=args.device,
        threshold=args.threshold,
        extensions=args.extensions,
    )
    face_detector = FaceDetector(args.face_detection_model, **model_options)
    face_landmark_detector = FaceLandmarkDetector(
        args.face_landmark_model, **model_options
    )
    head_pose_estimator = HeadPoseEstimator(
        args.head_pose_model, **model_options
    )
    gaze_estimator = GazeEstimator(args.gaze_estimation_model, **model_options)
    mouse_controller = MouseController("medium", "fast")

    face_detector.load_model()
    face_landmark_detector.load_model()
    head_pose_estimator.load_model()
    gaze_estimator.load_model()

    input_feeder.load_data()

    # Display size: fixed width, 16:9 aspect ratio.
    width = 1000
    height = int(width * 9 / 16)
    box_color = (36, 255, 12)

    try:
        for flag, frame in input_feeder.next_batch():
            if not flag:
                break

            pressed_key = cv2.waitKey(60)
            # Checked before any `continue` so ESC also works while no face
            # is being detected.
            if pressed_key == 27:
                logger.error("exit key is pressed..")
                break

            face_detected = face_detector.predict(frame)
            if not face_detected:
                continue
            face_coordinates, face_image = face_detected
            if not face_coordinates:
                continue

            if "fd" in args.visualization:
                cv2.rectangle(
                    frame,
                    (face_coordinates[0], face_coordinates[1]),
                    (face_coordinates[2], face_coordinates[3]),
                    box_color,
                    2,
                )
                cv2.putText(
                    frame,
                    "Face Detected",
                    (face_coordinates[0], face_coordinates[1] - 10),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.9,
                    box_color,
                    2,
                )

            left_eye_img, righ_eye_img, eye_coords = (
                face_landmark_detector.predict(face_image)
            )
            if "fl" in args.visualization:
                # Eye boxes are relative to the face crop; shift them by the
                # face's top-left corner to get frame coordinates.
                face_origin = np.array(face_coordinates)[:2]
                eye_boxes = np.array(eye_coords)
                frame_eye_coords_min = eye_boxes[:, :2] + face_origin
                frame_eye_coords_max = eye_boxes[:, 2:] + face_origin
                for eye_index in (0, 1):
                    cv2.rectangle(
                        frame,
                        (
                            frame_eye_coords_min[eye_index][0],
                            frame_eye_coords_min[eye_index][1],
                        ),
                        (
                            frame_eye_coords_max[eye_index][0],
                            frame_eye_coords_max[eye_index][1],
                        ),
                        box_color,
                        2,
                    )

            head_pose_estimate = head_pose_estimator.predict(face_image)
            if "hp" in args.visualization:
                cv2.putText(
                    frame,
                    "yaw:{:.1f}|pitch:{:.1f}|roll:{:.1f}".format(
                        *head_pose_estimate
                    ),
                    (20, 35),
                    cv2.FONT_HERSHEY_COMPLEX,
                    1.2,
                    box_color,
                    3,
                )

            mouse_coordinate, gaze_vector = gaze_estimator.predict(
                left_eye_img, righ_eye_img, head_pose_estimate
            )
            if "ge" in args.visualization:
                # Head pose angles converted from degrees to radians.
                yaw, pitch, roll = np.array(head_pose_estimate) * np.pi / 180.0

                focal_length = 950
                scale = 100

                # Centre of the face box.
                origin = (
                    int(
                        face_coordinates[0]
                        + (face_coordinates[2] - face_coordinates[0]) / 2
                    ),
                    int(
                        face_coordinates[1]
                        + (face_coordinates[3] - face_coordinates[1]) / 2
                    ),
                )

                r_x = np.array(
                    [
                        [1, 0, 0],
                        [0, math.cos(pitch), -math.sin(pitch)],
                        [0, math.sin(pitch), math.cos(pitch)],
                    ]
                )
                r_y = np.array(
                    [
                        [math.cos(yaw), 0, -math.sin(yaw)],
                        [0, 1, 0],
                        [math.sin(yaw), 0, math.cos(yaw)],
                    ]
                )
                r_z = np.array(
                    [
                        [math.cos(roll), -math.sin(roll), 0],
                        [math.sin(roll), math.cos(roll), 0],
                        [0, 0, 1],
                    ]
                )
                r = r_z @ r_y @ r_x

                # Rotate a vector along -z, push it out by the focal length
                # and project it back onto the image plane.
                zaxis = np.array(([0, 0, -1 * scale]), dtype="float32")
                offset = np.array(([0, 0, focal_length]), dtype="float32")
                zaxis = np.dot(r, zaxis) + offset
                tip = (
                    int(zaxis[0] / zaxis[2] * focal_length) + origin[0],
                    int(zaxis[1] / zaxis[2] * focal_length) + origin[1],
                )

                cv2.arrowedLine(
                    frame, origin, tip, (0, 0, 255), 3, tipLength=0.3
                )

            cv2.imshow("frame", cv2.resize(frame, (width, height)))
            mouse_controller.move(mouse_coordinate[0], mouse_coordinate[1])
    finally:
        input_feeder.close()
        cv2.destroyAllWindows()
Example #6
0
def main():
    """Control the mouse pointer from gaze estimated on a camera or video.

    The input is the camera when ``args.input`` is ``"cam"`` (any case),
    otherwise a video file; the process exits with status 1 if the file does
    not exist. The four models are loaded and checked, then every frame goes
    through face detection, landmark detection, head-pose estimation and
    gaze estimation. The mouse is moved on every fifth frame. When
    ``args.visual_flag == 1`` bounding boxes, landmarks, the gaze and the
    head-pose axes are drawn onto the preview.

    When ``args.stats == 1`` the per-model load times, the total load/check
    duration and the average per-frame inference time are reported.

    Frames without a detected face (``face_coords`` returned as an ``int``)
    are skipped. ESC (key code 27) ends the loop. An exception raised while
    processing the landmarks/pose/gaze of a frame is logged and the frame is
    skipped. The OpenCV windows are destroyed and the feed is closed on exit,
    including when an exception propagates.
    """
    args = get_args()

    log.basicConfig(filename='example.log', level=log.DEBUG)

    inputFile = args.input

    mouse = MouseController("high", "fast")

    frame_count = 0
    focal_length = 950.0
    scale = 50
    collect_stats = args.stats == 1

    if inputFile.lower() == "cam":
        feed = InputFeeder('cam')
        log.info("Video source: " + str(inputFile))
    else:
        if not os.path.isfile(inputFile):
            log.error("Unable to find file: " + inputFile)
            # Same effect as exit(1) without depending on the site builtin.
            raise SystemExit(1)
        feed = InputFeeder("video", inputFile)
        log.info("Video source: " + str(inputFile))
        log.info("InputFeeder initialized")

    log.info("Device: " + str(args.device))
    log.info("Face detection model: " + str(args.facedetectionmodel))
    log.info("Facial landmarks model: " + str(args.faciallandmarksmodel))
    log.info("Head pose estimation model: " + str(args.headposemodel))
    log.info("Gaze estimation model: " + str(args.gazeestimationmodel))

    inference_times = []
    if collect_stats:
        print("Running statistics...")
    # Kept separate from the per-model timers so the reported total really
    # covers loading and checking all four models.
    loading_started = time.time()

    def load_and_check(model):
        """Load then check ``model``; return the seconds spent loading."""
        started = time.time()
        model.load_model()
        elapsed = time.time() - started
        model.check_model()
        return elapsed

    # Create, load and check the models one after the other.
    fdm = FaceDetector(args.facedetectionmodel, args.device,
                       args.cpu_extension)
    fdm_load_time = load_and_check(fdm)

    hpm = HeadPoseEstimator(args.headposemodel, args.device,
                            args.cpu_extension)
    hpm_load_time = load_and_check(hpm)

    flm = FacialLandmarksDetector(args.faciallandmarksmodel, args.device,
                                  args.cpu_extension)
    flm_load_time = load_and_check(flm)

    gem = GazeEstimator(args.gazeestimationmodel, args.device,
                        args.cpu_extension)
    gem_load_time = load_and_check(gem)

    if collect_stats:
        duration_loading = time.time() - loading_started
        print(
            f"Duration for loading and checking the models: {duration_loading}"
        )
        log.info(
            f"Duration for loading and checking the models: {duration_loading}"
        )

    cv2.namedWindow('preview', cv2.WINDOW_NORMAL)
    cv2.resizeWindow('preview', 600, 600)

    feed.load_data()
    try:
        for ret, frame in feed.next_batch():
            if not ret:
                break
            if frame is None:
                continue

            frame_count += 1
            key = cv2.waitKey(60)
            # Honour ESC on every frame, not only when no face is found.
            if key == 27:
                break

            start_time = time.time()

            # Run face detection
            face_crop, face_coords = fdm.predict(frame.copy())

            # The detector reports "no face" by returning an int instead of
            # box coordinates; check this before touching the results.
            if isinstance(face_coords, int):
                print("Unable to detect face")
                continue

            print("Face crop shape: " + str(face_crop.shape))
            (xmin, ymin, _xmax, _ymax) = face_coords

            try:
                # Facial landmark detection
                left_eye_crop, right_eye_crop, landmarks, crop_coords = flm.predict(
                    face_crop.copy())

                # Shift the landmarks from face-crop to full-frame
                # coordinates. This aliases and updates `landmarks` in place.
                landmarks_viz = landmarks
                landmarks_viz[0] = landmarks_viz[0] + xmin
                landmarks_viz[1] = landmarks_viz[1] + ymin
                landmarks_viz[2] = landmarks_viz[2] + xmin
                landmarks_viz[3] = landmarks_viz[3] + ymin

                # Eight values alternating x, y; even indices get the x
                # offset of the face box, odd indices the y offset.
                crop_coords_viz = tuple(
                    crop_coords[index] + (xmin, ymin)[index % 2]
                    for index in range(8))

                left_eye_viz = (landmarks_viz[0], landmarks_viz[1])
                right_eye_viz = (landmarks_viz[2], landmarks_viz[3])

                # Midpoint between the two eye landmarks.
                third_eye_viz_x = (landmarks_viz[2] -
                                   landmarks_viz[0]) / 2 + landmarks_viz[0]
                third_eye_viz_y = (landmarks_viz[3] -
                                   landmarks_viz[1]) / 2 + landmarks_viz[1]
                third_eye_viz = (third_eye_viz_x, third_eye_viz_y)

                # Head pose estimation
                head_pose = hpm.predict(face_crop.copy())
                print("Head pose: " + str(head_pose))
                (yaw, pitch, roll) = head_pose
                frame = display_head_pose(frame, pitch, roll, yaw)

                # Send inputs to GazeEstimator
                gaze_vector = gem.predict(head_pose, left_eye_crop,
                                          right_eye_crop)

                if collect_stats:
                    inference_times.append(time.time() - start_time)

                print(gaze_vector)
                frame = display_gaze(frame, gaze_vector)

                # Control the mouse on every fifth frame only.
                if frame_count % 5 == 0:
                    mouse_x, mouse_y = get_mouse_vector(gaze_vector, roll)
                    print("Mouse vector:" + str(mouse_x) + " - " +
                          str(mouse_y))
                    mouse.move(mouse_x, mouse_y)
                    currentMouseX, currentMouseY = pyautogui.position()
                    print("Mouse coordinates: " + str(currentMouseX) + ", " +
                          str(currentMouseY))

                if args.visual_flag == 1:
                    frame = draw_bounding_box(frame, face_coords)

                    left_eye_frame = crop_coords_viz[0:4]
                    right_eye_frame = crop_coords_viz[4:]
                    frame = draw_bounding_box(frame, left_eye_frame)
                    frame = draw_bounding_box(frame, right_eye_frame)

                    frame = visualize_landmark(frame, left_eye_viz)
                    frame = visualize_landmark(frame,
                                               right_eye_viz,
                                               color=(0, 0, 255))

                    frame = visualize_gaze(frame, gaze_vector, landmarks_viz)

                    # Visualize the axes of the HeadPoseEstimator results,
                    # anchored at the midpoint between the eyes.
                    frame = hpm.draw_axes(frame.copy(), third_eye_viz, yaw,
                                          pitch, roll, scale, focal_length)

                cv2.imshow('preview', frame)
                cv2.imshow('left eye', left_eye_crop)
                cv2.imshow('right eye', right_eye_crop)

            except Exception as e:
                # Best effort per frame: report the failure and move on.
                print("Unable to predict using model" + str(e) +
                      " for frame " + str(frame_count))
                log.error("Unable to predict using model" + str(e) +
                          " for frame " + str(frame_count))
    finally:
        cv2.destroyAllWindows()
        feed.close()

    if collect_stats:
        # No average can be computed if no frame completed the pipeline.
        if inference_times:
            avg_inference_time = sum(inference_times) / len(inference_times)
            print("Average inference time: " + str(avg_inference_time))
            log.info("Average inference time: " + str(avg_inference_time))
        log.info("Load time for face detection model: " + str(fdm_load_time))
        log.info("Load time for facial landmarks model: " + str(flm_load_time))
        log.info("Load time for head pose detection model: " +
                 str(hpm_load_time))
        log.info("Load time for gaze estimation model: " + str(gem_load_time))