def load_all_models(args):
    model_path_dict = {
        'FaceDetectionModel': args.face_detection_model,
        'FacialLandmarksDetectionModel': args.facial_landmarks_model,
        'GazeEstimationModel': args.gaze_estimation_model,
        'HeadPoseEstimationModel': args.head_pose_model
    }
    for model_name, model_path in model_path_dict.items():
        if not os.path.isfile(model_path + ".xml"):
            logging.error("Unable to find specified " + model_name +
                          " xml file")
            exit(1)
    fd_model = FaceDetectionModel(model_path_dict['FaceDetectionModel'],
                                  args.threshold, args.device,
                                  args.cpu_extension)
    fld_model = FacialLandmarksDetectionModel(
        model_path_dict['FacialLandmarksDetectionModel'], args.threshold,
        args.device, args.cpu_extension)
    ge_model = GazeEstimationModel(model_path_dict['GazeEstimationModel'],
                                   args.threshold, args.device,
                                   args.cpu_extension)
    hpe_model = HeadPoseEstimationModel(
        model_path_dict['HeadPoseEstimationModel'], args.threshold,
        args.device, args.cpu_extension)
    start_time = time.time()

    fd_model.load_model()
    fld_model.load_model()
    ge_model.load_model()
    hpe_model.load_model()

    total_model_load_time = time.time() - start_time
    return fd_model, fld_model, ge_model, hpe_model, total_model_load_time
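
# A minimal sketch of the argument parser that `load_all_models` assumes; the
# flag names below are inferred from the attributes used above and may differ
# from the original project's build_argparser().
import argparse


def build_argparser_sketch():
    parser = argparse.ArgumentParser()
    parser.add_argument('--face_detection_model', required=True,
                        help="Path to the face detection model IR (without .xml)")
    parser.add_argument('--facial_landmarks_model', required=True,
                        help="Path to the facial landmarks model IR (without .xml)")
    parser.add_argument('--gaze_estimation_model', required=True,
                        help="Path to the gaze estimation model IR (without .xml)")
    parser.add_argument('--head_pose_model', required=True,
                        help="Path to the head pose model IR (without .xml)")
    parser.add_argument('--device', default='CPU')
    parser.add_argument('--cpu_extension', default=None)
    parser.add_argument('--threshold', type=float, default=0.6)
    return parser
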
class GazePipeline:
    # NOTE: the enclosing class definition was missing from this example;
    # "GazePipeline" is a placeholder, not the original author's class name.
    def __init__(self,
                 device='CPU',
                 mouse_con=False,
                 face_dec=None,
                 fac_land=None,
                 head_pose=None,
                 gaze=None,
                 show_video=False,
                 save_video=False):
        '''
        All four models are instantiated and loaded here.
        '''
        if face_dec and fac_land and head_pose and gaze:
            self.face_dec = FaceDetectionModel(face_dec, device=device)
            self.fac_land = FacialLandmarksDetection(fac_land, device=device)
            self.head_pose = Head_Pose_Estimation(head_pose, device=device)
            self.gaze = Gaze_Estimation(gaze, device=device)
            self.face_dec.load_model()
            self.fac_land.load_model()
            self.head_pose.load_model()
            self.gaze.load_model()
        else:
            raise ValueError('Missing Arguments')

        if mouse_con:
            self.mouse_con = MouseController("low", "fast")
        else:
            self.mouse_con = None

        self.show_video, self.save_video = show_video, save_video
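
# Usage sketch for the class above (model paths are hypothetical placeholders):
#
#     app = GazePipeline(device='CPU',
#                        mouse_con=True,
#                        face_dec='models/face-detection-adas-0001',
#                        fac_land='models/landmarks-regression-retail-0009',
#                        head_pose='models/head-pose-estimation-adas-0001',
#                        gaze='models/gaze-estimation-adas-0002',
#                        show_video=True)
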
def init_models(args, logger):

    face_detec = None
    fac_land = None
    head_pose = None
    gaze_est = None

    models_info = {
        'face_detection_model': args.face_detection_model,
        'facial_landmarks_detection_model':
        args.facial_landmarks_detection_model,
        'head_pose_estimation_model': args.head_pose_estimation_model,
        'gaze_estimation_model': args.gaze_estimation_model
    }

    for model_name, model_path in models_info.items():
        if not os.path.isfile(model_path):
            logger.error("Unable to find the model file for " + str(model_name))
            exit(1)

    # Initialize the model classes
    face_detec = FaceDetectionModel(models_info['face_detection_model'],
                                    args.device, args.cpu_extension,
                                    args.threshold)
    fac_land = FacialLandmarksDetectionModel(
        models_info['facial_landmarks_detection_model'], args.device,
        args.cpu_extension)
    head_pose = HeadPoseEstimationModel(
        models_info['head_pose_estimation_model'], args.device,
        args.cpu_extension)
    gaze_est = GazeEstimationModel(models_info['gaze_estimation_model'],
                                   args.device, args.cpu_extension)

    return face_detec, fac_land, head_pose, gaze_est
Example #4
def model_instances(args):

    face_detection_instance = FaceDetectionModel(model_name=args.face_detection,
                                                 device=args.device,
                                                 threshold=args.prob_threshold,
                                                 extensions=args.cpu_extension)

    head_pose_estimation_instance = HeadPoseEstimationModel(
        model_name=args.head_pose_estimation,
        device=args.device,
        extensions=args.cpu_extension)

    facial_landmarks_instance = FacialLandmarksDetectionModel(
        model_name=args.facial_landmarks_detection,
        device=args.device,
        extensions=args.cpu_extension)

    gaze_estimation_instance = GazeEstimationModel(
        model_name=args.gaze_estimation,
        device=args.device,
        extensions=args.cpu_extension)

    mouse_controller_instance = MouseController('medium', 'fast')

    return (face_detection_instance, head_pose_estimation_instance,
            facial_landmarks_instance, gaze_estimation_instance,
            mouse_controller_instance)
def main(args):
    feed = InputFeeder(input_type=args.it, input_file=args.i)

    face_model = FaceDetectionModel(args.fm, args.d, args.c, float(args.p))
    face_model.load_model()

    landmarks_model = LandmarksDetectionModel(args.lm, args.d, args.c)
    landmarks_model.load_model()

    headpose_model = HeadPoseDetectionModel(args.hpm, args.d, args.c)
    headpose_model.load_model()

    gaze_model = GazeEstimationModel(args.gem, args.d, args.c)
    gaze_model.load_model()

    mouse = MouseController("medium", "fast")

    feed.load_data()
    for batch in feed.next_batch():
        try:
            cropped_face, coords, _ = face_model.predict(batch)
            cv2.rectangle(batch, (coords[0], coords[1]), (coords[2], coords[3]),
                          (255, 0, 0), 2)

            left_eye, right_eye, eyes_coords, _ = landmarks_model.predict(
                cropped_face)

            head_pose_angles, _ = headpose_model.predict(cropped_face)
            x, y, z, _ = gaze_model.predict(left_eye, right_eye, head_pose_angles,
                                            cropped_face, eyes_coords)

            mouse.move(x, y)

            cv2.imshow("img", batch)
            if cv2.waitKey(25) & 0xFF == ord('q'):
                break
        except Exception:
            # Restored from the commented-out handler in the original example
            # (assumes `sys` and `log` are imported).
            print("Frame without prediction. Error: ", sys.exc_info()[0])
            log.error(sys.exc_info()[0])
    feed.close()
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.
    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    try:
        logging.basicConfig(level=logging.DEBUG,
                            format="%(asctime)s [%(levelname)s] %(message)s",
                            handlers=[
                                logging.FileHandler("debug.log"),
                                logging.StreamHandler()
                            ])

        # Initialise the class
        mc = MouseController("low", "fast")
        fdnet = FaceDetectionModel(args.fdmodel)
        lmnet = FacialLandMarksDetectionModel(args.lmmodel)
        hpnet = HeadPoseEstimationModel(args.hpmodel)
        genet = GazeEstimationModel(args.gemodel)

        start_time = time.time()
        fdnet.load_model()
        logging.info(
            f"Face Detection Model: {1000 * (time.time() - start_time):.1f}ms")

        start_time = time.time()
        lmnet.load_model()
        logging.info(
            f"Facial Landmarks Detection Model: {1000 * (time.time() - start_time):.1f}ms"
        )

        start_time = time.time()
        hpnet.load_model()
        logging.info(
            f"Headpose Estimation Model: {1000 * (time.time() - start_time):.1f}ms"
        )

        start_time = time.time()
        genet.load_model()
        logging.info(
            f"Gaze Estimation Model: {1000 * (time.time() - start_time):.1f}ms"
        )

        # Get and open video capture
        feeder = InputFeeder('video', args.input)
        feeder.load_data()

        frame_count = 0

        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0

        while True:
            # Read the next frame
            try:
                frame = next(feeder.next_batch())
            except StopIteration:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1

            # face detection
            p_frame = fdnet.preprocess_input(frame)
            start_time = time.time()
            fd_output = fdnet.predict(p_frame)
            fd_infertime += time.time() - start_time
            out_frame, bboxes = fdnet.preprocess_output(
                fd_output, frame, args.print)

            for bbox in bboxes:

                face = frame[bbox[1]:bbox[3], bbox[0]:bbox[2]]
                p_frame = lmnet.preprocess_input(face)

                start_time = time.time()
                lm_output = lmnet.predict(p_frame)
                lm_infertime += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output(
                    lm_output, bbox, out_frame, args.print)

                # get head pose estimation
                p_frame = hpnet.preprocess_input(face)
                start_time = time.time()
                hp_output = hpnet.predict(p_frame)
                hp_infertime += time.time() - start_time
                out_frame, headpose_angles = hpnet.preprocess_output(
                    hp_output, out_frame, face, bbox, args.print)

                # get gaze estimation
                out_frame, left_eye, right_eye = genet.preprocess_input(
                    out_frame, face, left_eye_point, right_eye_point,
                    args.print)
                start_time = time.time()
                ge_output = genet.predict(left_eye, right_eye, headpose_angles)
                ge_infertime += time.time() - start_time
                out_frame, gaze_vector = genet.preprocess_output(
                    ge_output, out_frame, bbox, left_eye_point,
                    right_eye_point, args.print)

                if not args.no_video:
                    cv2.imshow('image', out_frame)

                if not args.no_move:
                    mc.move(gaze_vector[0], gaze_vector[1])

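                # Only the first detected face is processed per frame.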
                break

            if key_pressed == 27:
                break

        if frame_count > 0:
            logging.info(
                f"Face Detection:{1000* fd_infertime/frame_count:.1f}ms")
            logging.info(
                f"Facial Landmarks Detection:{1000* lm_infertime/frame_count:.1f}ms"
            )
            logging.info(
                f"Headpose Estimation:{1000* hp_infertime/frame_count:.1f}ms")
            logging.info(
                f"Gaze Estimation:{1000* ge_infertime/frame_count:.1f}ms")

        feeder.close()
        cv2.destroyAllWindows()
    except Exception as ex:
        logging.exception(f"Error during inference: {ex}")
def main():
    # Grab command line args
    args = build_args().parse_args()
    # Config Logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger()
    #os.system('clear')
    print("\n")
    logger.info("starting app ...")
    print("\n==========<COMPUTER POINTER CONTROLLER>==========")
    print("============>(c) Ibrahim Ishaka 2020<============\n")

    # initialize model object for each class
    FDModel = FaceDetectionModel(model=args.face_detection_model,
                                 device=args.device,
                                 extensions=args.extension,
                                 threshold=args.prob_threshold)
    FLDModel = FacialLandmarksDetectionModel(model=args.facial_landmark_model,
                                             device=args.device,
                                             extensions=args.extension)
    HPEModel = HeadPoseEstimationModel(model=args.head_pose_model,
                                       device=args.device,
                                       extensions=args.extension)
    GEModel = GazeEstimationModel(model=args.gaze_estimation_model,
                                  device=args.device,
                                  extensions=args.extension)

    models = {'fd': FDModel, 'fl': FLDModel, 'hp': HPEModel, 'ge': GEModel}

    models_loading_time = 0
    for k in models:
        # load model
        logger.info("Loading {} Model".format(models[k].model_name))
        model_loading_start = time.time()
        models[k].load_model()
        model_loading_finish = (time.time() - model_loading_start)
        models_loading_time = models_loading_time + model_loading_finish
        logger.info("time taken to load Model: {:.3f}secs".format(
            model_loading_finish))

        # check if model output visualization is specified in the command-line args
        if k in args.show_output or args.show_output == 'all':
            models[k].show = True
        logger.info("show {} outputs: {} \n".format(models[k].model_name,
                                                    models[k].show))

    logger.info("time taken to load All Models: {:.3f}secs\n".format(
        models_loading_time))

    # setting for mouse controller
    _precision = "medium"
    _speed = "fast"
    mouse_controller = MouseController(precision=_precision, speed=_speed)

    # verify and handle input stream
    input_source = args.input
    input_feeder = None
    input_type = ""
    if input_source.lower() != "cam":
        # check if the input file exists
        if os.path.exists(input_source) and os.path.isfile(input_source):
            image_formats = [".png", ".jpg", ".bmp", ".jpeg"]
            is_image = any(input_source.endswith(x) for x in image_formats)
            if is_image:
                input_type = "image"
            else:
                input_type = "video"
            input_feeder = InputFeeder(input_type=input_type,
                                       input_file=input_source)
        else:
            logger.error("Input file is not a file, or doesn't exist")
            sys.exit(1)
    else:
        input_type = "cam"
        input_feeder = InputFeeder(input_type=input_type)

    input_feeder.load_data()
    frame_count = 0
    total_inference_time_all = 0
    window_closed = False

    for flag, frame in input_feeder.next_batch():
        if flag is False:
            # no frame to read
            break
        frame_count = frame_count + 1
        key_pressed = cv2.waitKey(60)

        if input_source.lower() == 'cam':
            # preprocess frame as webcam is backwards/inverted
            frame = cv2.flip(frame, 1)

        face_detection_result = FDModel.predict(frame)
        # predict returns None if no face is detected
        if face_detection_result is None:
            if not window_closed:
                cv2.imshow(input_type, cv2.resize(frame, (500, 500)))
            logger.info("NO FACE DETECTED... skipping")
            continue
        cropped_face = face_detection_result[0]
        face_coords = face_detection_result[1]
        hp_result = HPEModel.predict(cropped_face)
        left_eye, right_eye = FLDModel.predict(cropped_face)
        new_mouse_coords, gaze_vector = GEModel.predict(
            left_eye, right_eye, hp_result)

        total_inference_time = 0
        for key in models:
            total_inference_time += models[key].inference_time
        # accumulate once per frame, outside the per-model loop
        total_inference_time_all += total_inference_time

        #uncomment the following line to see the inference time for each frame
        #logger.info("Inference Time : {:.3f}".format(total_inference_time))

        try:
            x, y = new_mouse_coords
        except (TypeError, ValueError):
            logger.error(
                "unable to get mouse coordinates for current frame\nReading Next Frame..."
            )
            continue

        if GEModel.show:
            GEModel.show_gaze(left_eye, right_eye, gaze_vector)
        if HPEModel.show:
            frame = HPEModel.show_hp(frame, hp_result)

        '''
        Wait before moving the mouse again.
        This is recommended to avoid pyautogui's failsafe exception,
        but you can change this setting.
        '''
        if input_type == "image":
            cv2.imshow(input_type, cv2.resize(frame, (500, 500)))
            mouse_controller.move(x, y)
            break

        if frame_count % 5 == 0:
            try:
                logger.info("changing mouse position... moving")
                mouse_controller.move(x, y)
            except pyautogui.FailSafeException:
                logger.error("safe exception From pyautogui")
                continue

        if not window_closed:
            cv2.imshow(input_type, cv2.resize(frame, (500, 500)))

        # Break if escape key pressed
        if key_pressed == 27:
            break

        # close the OpenCV window if q key pressed
        if key_pressed == ord('q'):
            window_closed = True
            cv2.destroyWindow(input_type)
            logger.info(input_type +
                        " window closed... to exit app, press CTRL+Z")

    if frame_count != 0:
        # Release the capture and destroy any OpenCV window
        input_feeder.close()
        cv2.destroyAllWindows()

        logger.info("Stream ended !")

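        # NOTE: this FPS figure is based on accumulated inference time only,
        # not end-to-end wall-clock time.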
        fps = round(frame_count / total_inference_time_all, 2)
        print("\n==========SUMMARY===========")
        print("models loading time  : ", round(models_loading_time, 2))
        print("frames per seconds   : ", fps)
        print("total inference time : ", round(total_inference_time_all, 2))
        print("============================")

    else:
        logger.error("Unable to handle unsupported input file")
        sys.exit(1)
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.
    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    # --- INPUT ---
    # Initialize the input_type
    input_type = None

    # Check if the input is a webcam
    if args.input == 'CAM':
        input_type = 'cam'

    # Check if the input is an image
    elif args.input.endswith(('.jpg', '.bmp', '.png')):
        input_type = 'image'

    # Check if the input is a video
    elif args.input.endswith(('.mp4', '.avi')):
        input_type = 'video'

    else:
        sys.exit(
            f"[ ERROR ] The format of the input file '{args.input}' is not supported."
        )

    # Initialize the InputFeeder
    input_feeder = InputFeeder(input_type, args.input)
    input_feeder.load_data()

    # --- MODELS ---
    # Load the Face Detection Model
    face_detection_model = FaceDetectionModel(
        model_xml_path=args.model_face_detection,
        device=args.device,
        extensions_path=args.cpu_extension,
    )

    face_detection_model.load_model()

    # Load the Head Pose Estimation Model
    head_pose_estimation_model = HeadPoseEstimationModel(
        model_xml_path=args.model_head_pose,
        device=args.device,
        extensions_path=args.cpu_extension,
    )

    head_pose_estimation_model.load_model()

    # Load the Facial Landmarks Detection Model
    facial_landmarks_detection_model = FacialLandmarksDetectionModel(
        model_xml_path=args.model_face_landmark,
        device=args.device,
        extensions_path=args.cpu_extension,
    )

    facial_landmarks_detection_model.load_model()

    # Load the Gaze Estimation Model
    gaze_estimation_model = GazeEstimationModel(
        model_xml_path=args.model_gaze_estimation,
        device=args.device,
        extensions_path=args.cpu_extension,
    )

    gaze_estimation_model.load_model()

    # --- POINTER CONTROLLER ---
    pointer_controller = MouseController(
        precision='medium',
        speed='medium',
    )

    # --- WINDOW ---
    # Set the window to fullscreen
    # cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
    # cv2.setWindowProperty(WINDOW_NAME, cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)

    # Initialize list to track the inference time
    list_inference_time = []

    # Loop until the stream is over
    for frame in input_feeder.next_batch():
        # If there is no frame break the loop
        if frame is None:
            break

        # start the timer
        start_time = time.time()

        # Initialize the frame to be displayed
        display_frame = frame

        # --- DETECT HEAD ---
        # Detect the head on the frame
        list_heads = face_detection_model.predict(frame)

        # Draw the outputs of the head detection algorithm
        if args.display_outputs:
            display_frame = face_detection_model.display_output(
                frame, list_heads)

        # --- HEAD POSE ESTIMATION ---
        # Extract the roi of the head with the highest confidence score
        head = list_heads[0]
        head_x_max = head.x + head.w
        head_y_max = head.y + head.h

        head_roi = frame[head.y:head_y_max, head.x:head_x_max, :]

        # Estimate the pose of the best head
        head_angles = head_pose_estimation_model.predict(head_roi)

        # Draw the pose of the best head
        if args.display_outputs:
            display_head_pose = head_pose_estimation_model.display_output(
                head_roi, head_angles)
            display_frame[head.y:head_y_max,
                          head.x:head_x_max, :] = display_head_pose

        # --- FACIAL LANDMARKS DETECTION ---
        # Detect the facial landmarks on the head with the highest confidence score
        face_landmarks = facial_landmarks_detection_model.predict(head_roi)

        # Draw the facial landmarks of the best head
        if args.display_outputs:
            # Set display_name to True to display the name of the landmarks
            display_facial_landmarks = facial_landmarks_detection_model.display_output(
                display_head_pose, face_landmarks, display_name=True)
            display_frame[head.y:head_y_max,
                          head.x:head_x_max, :] = display_facial_landmarks

        # --- GAZE ESTIMATION ---
        # Calculate the eye ROI size
        eye_roi_size = int(head_roi.shape[1] / 3)

        # Extract the ROI of the left eye
        left_eye_roi, left_eye_bbox = extract_landmark_roi(
            name='left_eye',
            landmarks=face_landmarks,
            roi_size=eye_roi_size,
            image=frame,
            origin_x=head.x,
            origin_y=head.y,
        )

        # Extract the ROI of the right eye
        right_eye_roi, right_eye_bbox = extract_landmark_roi(
            name='right_eye',
            landmarks=face_landmarks,
            roi_size=eye_roi_size,
            image=frame,
            origin_x=head.x,
            origin_y=head.y,
        )

        # Predict the gaze
        gaze_vector = gaze_estimation_model.predict(
            left_eye_image=left_eye_roi,
            right_eye_image=right_eye_roi,
            head_angles=head_angles,
        )

        # normalize the gaze vector based on the left eye
        left_eye_x_center = left_eye_bbox.x + int(left_eye_bbox.w / 2)
        left_eye_y_center = left_eye_bbox.y + int(left_eye_bbox.h / 2)
        start_vector = np.array([left_eye_x_center, left_eye_y_center, 0])

        end_vector = np.array([
            left_eye_x_center + gaze_vector.x,
            left_eye_y_center - gaze_vector.y, 0 + gaze_vector.z
        ])

        vector = end_vector - start_vector
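        # np.sqrt(np.dot(vector, vector)) is the vector's Euclidean norm, so
        # the next line yields a unit-length gaze direction (equivalent to
        # vector / np.linalg.norm(vector)).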
        norm_gaze_vector = vector / np.sqrt(np.dot(vector, vector))

        # Draw the gaze output and the eyes ROI
        if args.display_outputs:
            # draw the bbox around each eyes
            display_frame = face_detection_model.display_output(
                display_frame,
                [left_eye_bbox, right_eye_bbox],
                color=(255, 255, 255),
                display_conf=False,
            )

            # draw the gaze from both eyes
            display_frame = gaze_estimation_model.display_output(
                display_frame,
                norm_gaze_vector,
                [left_eye_bbox, right_eye_bbox],
            )

        # Update position of the Computer Pointer
        if not args.disable_pointer_controller:
            pointer_controller.move(gaze_vector.x, gaze_vector.y)

        # Calculate the inference time
        stop_time = time.time()
        list_inference_time.append(stop_time - start_time)

        # Calculate and print the FPS
        fps = round(1 / (stop_time - start_time), 2)
        cv2.rectangle(display_frame, (10, 2), (120, 20), (255, 255, 255), -1)
        cv2.putText(display_frame, f"{fps} FPS", (15, 15),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))

        # Display the frame
        cv2.imshow(WINDOW_NAME, display_frame)

        # Wait for 'ESC' or 'q' to exit the program
        keyboard = cv2.waitKey(30)
        if keyboard == ord('q') or keyboard == 27:
            break

    # Release the input feeder
    input_feeder.close()

    # Destroy any OpenCV windows
    cv2.destroyAllWindows()

    # Display the average inference time and fps
    average_fps = round(1 / (mean(list_inference_time)), 2)
    print(
        f"[ INFO ] Average inference time was {mean(list_inference_time)}s ({average_fps} FPS)."
    )

    print("[ INFO ] Successfully exited the program.")
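
# A minimal sketch of the `extract_landmark_roi` helper used in
# `infer_on_stream` above, inferred from its call sites; the real
# implementation in the source project may differ. It assumes `landmarks` is
# dict-like, mapping landmark names to points with `x`/`y` attributes given
# relative to the head ROI.
from collections import namedtuple

BBox = namedtuple('BBox', ['x', 'y', 'w', 'h'])


def extract_landmark_roi(name, landmarks, roi_size, image, origin_x, origin_y):
    # Convert the named landmark to full-frame coordinates.
    point = landmarks[name]  # assumed dict-like access by landmark name
    center_x = origin_x + int(point.x)
    center_y = origin_y + int(point.y)

    # Build a square ROI of roi_size pixels centered on the landmark,
    # clamped to the image borders.
    half = roi_size // 2
    x_min = max(center_x - half, 0)
    y_min = max(center_y - half, 0)
    x_max = min(center_x + half, image.shape[1])
    y_max = min(center_y + half, image.shape[0])

    roi = image[y_min:y_max, x_min:x_max]
    bbox = BBox(x=x_min, y=y_min, w=x_max - x_min, h=y_max - y_min)
    return roi, bbox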
def main():

    # Grab command line args
    args = build_argparser().parse_args()

    inputFilePath = args.input
    inputFeeder = None

    if args.input == "CAM":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(args.input):
            log.error("Unable to find the specified video file")
            sys.exit(1)
        inputFeeder = InputFeeder("video", args.input)

    modelPathDict = {
        'FaceDetectionModel': args.face_detection_model,
        'FacialLandmarksDetectionModel': args.facial_landmark_model,
        'GazeEstimationModel': args.gaze_estimation_model,
        'HeadPoseEstimationModel': args.head_pose_model
    }

    for fileNameKey in modelPathDict.keys():
        if not os.path.isfile(modelPathDict[fileNameKey]):
            log.error("Unable to find specified " + fileNameKey + " xml file")
            sys.exit(1)

    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device,
                             args.cpu_extension)
    fldm = FacialLandmarksDetectionModel(
        modelPathDict['FacialLandmarksDetectionModel'], args.device,
        args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'],
                              args.device, args.cpu_extension)
    hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'],
                                   args.device, args.cpu_extension)

    mc = MouseController('medium', 'fast')

    inputFeeder.load_data()
    fdm.load_model()
    fldm.load_model()
    hpem.load_model()
    gem.load_model()

    frame_count = 0
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        croppedFace, face_coords = fdm.predict(frame.copy(),
                                               args.prob_threshold)
        if isinstance(croppedFace, int):
            log.info("Unable to detect the face.")
            if key == 27:
                break
            continue

        hp_out = hpem.predict(croppedFace.copy())

        left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy())

        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out)
        '''
        if (not len(previewFlags)==0):
            preview_frame = frame.copy()
            if 'fd' in previewFlags:
                #cv2.rectangle(preview_frame, (face_coords[0], face_coords[1]), (face_coords[2], face_coords[3]), (255,0,0), 3)
                preview_frame = croppedFace
            if 'fld' in previewFlags:
                cv2.rectangle(croppedFace, (eye_coords[0][0]-10, eye_coords[0][1]-10), (eye_coords[0][2]+10, eye_coords[0][3]+10), (0,255,0), 3)
                cv2.rectangle(croppedFace, (eye_coords[1][0]-10, eye_coords[1][1]-10), (eye_coords[1][2]+10, eye_coords[1][3]+10), (0,255,0), 3)
                #preview_frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = croppedFace
                
            if 'hp' in previewFlags:
                cv2.putText(preview_frame, "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".format(hp_out[0],hp_out[1],hp_out[2]), (10, 20), cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)
            if 'ge' in previewFlags:
                x, y, w = int(gaze_vector[0]*12), int(gaze_vector[1]*12), 160
                le =cv2.line(left_eye.copy(), (x-w, y-w), (x+w, y+w), (255,0,255), 2)
                cv2.line(le, (x-w, y+w), (x+w, y-w), (255,0,255), 2)
                re = cv2.line(right_eye.copy(), (x-w, y-w), (x+w, y+w), (255,0,255), 2)
                cv2.line(re, (x-w, y+w), (x+w, y-w), (255,0,255), 2)
                croppedFace[eye_coords[0][1]:eye_coords[0][3],eye_coords[0][0]:eye_coords[0][2]] = le
                croppedFace[eye_coords[1][1]:eye_coords[1][3],eye_coords[1][0]:eye_coords[1][2]] = re
                #preview_frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = croppedFace
                
            cv2.imshow("visualization",cv2.resize(preview_frame,(500,500)))
        '''
        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break
    log.info("VideoStream ended...")
    cv2.destroyAllWindows()
    inputFeeder.close()
Example #10
def main(args):
    # models
    face_detection_model = args.face_detection_model
    head_pose_estimation_model = args.head_pose_estimation_model
    facial_landmarks_detection_model = args.facial_landmarks_detection_model
    gaze_estimation_model = args.gaze_estimation_model

    # toggles
    toggle_face_detect = int(args.toggle_face_detect)
    toggle_eye_detection = int(args.toggle_eye_detection)
    toggle_head_pose_euler_angles = int(args.toggle_head_pose_euler_angles)
    toggle_gaze_estimation_direction_lines = int(
        args.toggle_gaze_estimation_direction_lines)

    device = args.device
    video_file = args.video
    threshold = args.threshold
    output_path = args.output_path

    # model load times
    fd_start_model_load_time = time.time()
    fd = FaceDetectionModel(face_detection_model, device, threshold)
    fd.load_model()
    fd_total_model_load_time = time.time() - fd_start_model_load_time

    fld_start_model_load_time = time.time()
    fld = FacialLandmarksDetectionModel(facial_landmarks_detection_model,
                                        device)
    fld.load_model()
    fld_total_model_load_time = time.time() - fld_start_model_load_time

    hpe_start_model_load_time = time.time()
    hpe = HeadPoseEstimationModel(head_pose_estimation_model, device)
    hpe.load_model()
    hpe_total_model_load_time = time.time() - hpe_start_model_load_time

    ge_start_model_load_time = time.time()
    ge = GazeEstimationModel(gaze_estimation_model, device)
    ge.load_model()
    ge_total_model_load_time = time.time() - ge_start_model_load_time

    # mouse controller
    mouse_controller = MouseController('medium', 'fast')

    # Handle the input stream
    # see https://github.com/anvillasoto/people-counter-edge-application/blob/master/main.py
    if video_file == 'CAM':
        input_stream = 0
        single_image_mode = False
    # Checks for input image
    elif video_file.endswith('.jpg') or video_file.endswith('.bmp'):
        single_image_mode = True
        input_stream = video_file
    # Otherwise treat the input as a video file (the original elif condition
    # was always true, which made the unsupported-file branch unreachable)
    else:
        single_image_mode = False
        input_stream = video_file
        assert os.path.isfile(video_file), "Input file does not exist"

    try:
        cap = cv2.VideoCapture(input_stream)
    except Exception as e:
        log.error(f"Something else went wrong with the video file: {e}")

    initial_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    initial_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video_len = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    out_video = cv2.VideoWriter(os.path.join(output_path, 'output.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'), fps,
                                (initial_w, initial_h), True)
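    # NOTE: the 'avc1' FourCC needs an H.264-capable OpenCV build; 'mp4v' is a
    # common fallback if the writer fails to open.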

    counter = 0
    start_inference_time = time.time()

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            counter += 1

            # detect face
            face_location, image = fd.predict(frame, toggle_face_detect)
            xmin, ymin, xmax, ymax = face_location[0]
            face_image = image[ymin:ymax, xmin:xmax].copy()

            # detect eyes
            eye_locations, eye_images, face_image_drawn = fld.predict(
                face_image, toggle_eye_detection)

            # detect head pose
            head_pose_angles, face_image_drawn = hpe.predict(
                face_image, face_image_drawn, toggle_head_pose_euler_angles)

            # gaze estimation
            gaze_vector, face_image_drawn = ge.predict(
                face_image_drawn, eye_images, head_pose_angles, eye_locations,
                toggle_gaze_estimation_direction_lines)

            # replace face with face image drawn (depending on toggle)
            image[ymin:ymax, xmin:xmax] = face_image_drawn

            x, y, z = gaze_vector

            if toggle_gaze_estimation_direction_lines == 1:
                # frame message to add gaze vector x, y, and z
                frame_message = "Gaze Coordinates: {:.2f}, {:.2f}, {:.2f}".format(
                    x, y, z)
                image = cv2.putText(image, frame_message, (20, 20),
                                    cv2.FONT_HERSHEY_COMPLEX, 1,
                                    COLOR_WHITE_BGR, 2)

            out_video.write(image)

            # move the mouse every five frames
            if counter % 5 == 0:
                mouse_controller.move(x, y)

        total_time = time.time() - start_inference_time
        total_inference_time = round(total_time, 1)
        fps = counter / total_inference_time

        with open(os.path.join(output_path, 'stats.txt'), 'w') as f:
            f.write("Total Inference Time for Four Models: " +
                    str(total_inference_time) + '\n')
            f.write("Frames Per Second for Four Models: " + str(fps) + '\n\n')
            f.write("Model Load Time (Face Detection): " +
                    str(fd_total_model_load_time) + '\n')
            f.write("Model Load Time (Facial Landmark Detection): " +
                    str(fld_total_model_load_time) + '\n')
            f.write("Model Load Time (Head Pose Estimation): " +
                    str(hpe_total_model_load_time) + '\n')
            f.write("Model Load Time (Gaze Estimation): " +
                    str(ge_total_model_load_time) + '\n')

        cap.release()
        cv2.destroyAllWindows()
    except Exception as e:
        print("Could not run Inference: ", e)
Example #11
def main():
    """
    Initialise the inference network, stream video to network
    and output stats and video
    :param args: Command line arguments parsed by build_argsparser()
    :return: None
    """
    # mouse movement ("low", "medium", "fast")
    global POSE_CHECKED
    mouse_movement = MouseController("low", "fast")

    logging.basicConfig(format="[ %(levelname)s ] %(message)s",
                        level=logging.INFO,
                        stream=sys.stdout)
    args = args_parser().parse_args()
    logging_message = logging.getLogger()

    if args.input == 'cam':
        input_feed = 0
    else:
        input_feed = args.input
        assert os.path.isfile(
            args.input
        ), "Specified input file doesn't exist or the path was entered incorrectly"

    # Ref: source code: https://stackoverflow.com/questions/33834708/cant-write-video-by-opencv-in-python/33836463
    # Ref: source code: https://knowledge.udacity.com/questions/275173
    cap = cv2.VideoCapture(input_feed)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    duration = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    vout = cv2.VideoWriter(os.path.join(args.out_dir, "vout.mp4"),
                           cv2.VideoWriter_fourcc(*"MP4V"), fps,
                           (width, height), True)

    if args.save_output == 'yes':
        vout_fd = cv2.VideoWriter(os.path.join(args.out_dir, "vout_fd.mp4"),
                                  cv2.VideoWriter_fourcc(*"MP4V"), fps,
                                  (width, height), True)
        vout_fl = cv2.VideoWriter(os.path.join(args.out_dir, "vout_fl.mp4"),
                                  cv2.VideoWriter_fourcc(*"MP4V"), fps,
                                  (width, height), True)
        vout_hp = cv2.VideoWriter(os.path.join(args.out_dir, "vout_hp.mp4"),
                                  cv2.VideoWriter_fourcc(*"MP4V"), fps,
                                  (width, height), True)
        vout_ge = cv2.VideoWriter(os.path.join(args.out_dir, "vout_g.mp4"),
                                  cv2.VideoWriter_fourcc(*"MP4V"), fps,
                                  (width, height), True)

    box_count = 0

    working = 1

    infer_time_start = time.time()

    if input_feed:
        cap.open(args.input)
        # Adjust delays to match the video file's frames per second
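        # (e.g. a per-frame delay of int(1000 / fps) ms passed to cv2.waitKey)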

    if not cap.isOpened():
        logging_message.error("ERROR MESSAGE! Corrupt video file")
        return

    if args.mode == 'sync':
        async_mode = False
    else:
        async_mode = True

    # Initialising the class variables
    # ref: https://github.com/gauravshelangia/computer-pointer-controller/blob/master/src/main.py
    if args.cpu_extension:
        fd_model = FaceDetectionModel(args.fdmodel,
                                      args.threshold,
                                      extensions=args.cpu_extension,
                                      async_mode=async_mode)
        hp_model = HeadPoseEstimationModel(args.hpmodel,
                                           args.threshold,
                                           extensions=args.cpu_extension,
                                           async_mode=async_mode)
        fl_model = FaceLandmarksDetectionModel(args.flmodel,
                                               args.threshold,
                                               extensions=args.cpu_extension,
                                               async_mode=async_mode)
        ge_model = GazeEstimationModel(args.gemodel,
                                       args.threshold,
                                       extensions=args.cpu_extension,
                                       async_mode=async_mode)
    else:
        fd_model = FaceDetectionModel(args.fdmodel,
                                      args.threshold,
                                      async_mode=async_mode)
        hp_model = HeadPoseEstimationModel(args.hpmodel,
                                           args.threshold,
                                           async_mode=async_mode)
        fl_model = FaceLandmarksDetectionModel(args.flmodel,
                                               args.threshold,
                                               async_mode=async_mode)
        ge_model = GazeEstimationModel(args.gemodel,
                                       args.threshold,
                                       async_mode=async_mode)

    # Load each model and log its load time
    logging_message.info(
        "================ Models loading time ======================")
    start_time = time.time()
    fd_model.load_model()
    logging_message.info("Face Detection Model: {:.1f}ms".format(
        1000 * (time.time() - start_time)))

    start_time = time.time()
    hp_model.load_model()
    logging_message.info("Headpose Estimation Model: {:.1f}ms".format(
        1000 * (time.time() - start_time)))

    start_time = time.time()
    fl_model.load_model()
    logging_message.info("Facial Landmarks Detection Model: {:.1f}ms".format(
        1000 * (time.time() - start_time)))

    start_time = time.time()
    ge_model.load_model()
    logging_message.info("Gaze Estimation Model: {:.1f}ms".format(
        1000 * (time.time() - start_time)))
    logging_message.info(
        "========================== End ============================")

    model_load_time = time.time() - infer_time_start

    logging.info("All models are loaded successfully")

    while cap.isOpened():
        flag, img_frame = cap.read()
        if not flag:
            print("checkpoint *UNRECORDED")
            break

        box_count += 1
        gazing = 0
        POSE_CHECKED = False

        if img_frame is None:
            logging.error("checkpoint ERROR! EMPTY FRAME")
            break

        width = int(cap.get(3))
        height = int(cap.get(4))

        # Asynchronous Request
        inf_start_fd = time.time()

        # Display the results of the output layer of the model network
        # ref source code: https://knowledge.udacity.com/questions/285095
        values, img_frame = fd_model.predict(img_frame)

        if args.save_output == 'yes':
            vout_fd.write(img_frame)

        fd_dur_time = time.time() - inf_start_fd

        if len(values) > 0:
            [xmin, ymin, xmax, ymax] = values[0]
            head_is_moving = img_frame[ymin:ymax, xmin:xmax]
            inf_start_hp = time.time()
            person_in_frame, target_gaze = hp_model.predict(head_is_moving)
            if args.save_output == 'yes':
                p = "Target Gaze {}, Person in Frame? {}".format(
                    target_gaze, person_in_frame)
                cv2.putText(img_frame, p, (50, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5,
                            (0, 0, 255), 2)
                vout_hp.write(img_frame)

            if person_in_frame:
                hp_dur_time = time.time() - inf_start_hp
                POSE_CHECKED = True
                inf_start_fl = time.time()
                values, marking = fl_model.predict(head_is_moving)

                img_frame[ymin:ymax, xmin:xmax] = marking

                if args.save_output == "yes":
                    vout_fl.write(img_frame)

                fl_dur_time = time.time() - inf_start_fl
                [[xlmin, ylmin, xlmax, ylmax], [xrmin, yrmin, xrmax,
                                                yrmax]] = values
                l_eye_img = marking[ylmin:ylmax, xlmin:xlmax]
                r_eye_img = marking[yrmin:yrmax, xrmin:xrmax]

                output, gaze_vector = ge_model.predict(l_eye_img, r_eye_img,
                                                       target_gaze)
                #ref: source code: https://knowledge.udacity.com/questions/264973
                if args.save_output == 'yes':
                    p = "Gaze Vector {}".format(gaze_vector)
                    cv2.putText(img_frame, p, (50, 15), cv2.FONT_HERSHEY_COMPLEX,
                                0.5, (255, 0, 0), 1)
                    left_frame = draw_gaze(l_eye_img, gaze_vector)
                    right_frame = draw_gaze(r_eye_img, gaze_vector)
                    marking[ylmin:ylmax, xlmin:xlmax] = left_frame
                    marking[yrmin:yrmax, xrmin:xrmax] = right_frame
                    # cv2.arrowedLine(f, (xlmin, ylmin), (xrmin, yrmin), (0,0,255), 5)
                    vout_ge.write(img_frame)

                if box_count % 10 == 0:
                    mouse_movement.move(output[0], output[1])
        # Drawing and documenting performance stat
        # ref: https://github.com/gauravshelangia/computer-pointer-controller/blob/master/src/main.py
        # ref source code: https://knowledge.udacity.com/questions/257795
        inf_time_message = "Face Detection Inference time: {:.3f} ms.".format(
            fd_dur_time * 1000)
        if POSE_CHECKED:
            cv2.putText(
                img_frame,
                "Head Pose Estimation Inference time: {:.3f} ms.".format(
                    hp_dur_time * 1000), (0, 35), cv2.FONT_HERSHEY_SIMPLEX,
                0.5, (0, 255, 0), 2)
            cv2.putText(img_frame, inf_time_message, (0, 15),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (0, 0, 255), 2)
        vout.write(img_frame)
        if box_count % 10 == 0:
            print("Inference time = ", int(time.time() - infer_time_start))
            print('Box count {} and duration {}'.format(box_count, duration))
        if args.out_dir:
            final_infer_time = time.time() - infer_time_start
            with open(os.path.join(args.out_dir, 'stats.txt'), 'w') as marking:
                marking.write(str(round(final_infer_time, 1)) + '\n')
                marking.write(str(box_count) + '\n')

    if args.out_dir:
        with open(os.path.join(args.out_dir, 'stats.txt'), 'a') as marking:
            marking.write(str(round(model_load_time)) + '\n')

    # Clean all models
    fd_model.clean()
    hp_model.clean()
    fl_model.clean()
    ge_model.clean()
    # release cv2 cap
    cap.release()
    cv2.destroyAllWindows()
    # release all resulting output writers
    vout.release()
    if args.save_output == 'yes':
        vout_fd.release()
        vout_hp.release()
        vout_fl.release()
        vout_ge.release()
def main():
    # create log file
    log.basicConfig(filename='logs/cpc.log', level=log.INFO, format='%(asctime)s %(message)s')

    # Parse the argument
    args = parse_arguments().parse_args()

    print('Input arguments:')
    for key, value in vars(args).items():
        print('\t{}: {}'.format(key, value))
    print('')

    # lists used to track model load and inference times
    m_fd_load_time = []
    m_hpe_load_time = []
    m_fd_infer_time = []
    m_hpe_infer_time = []
    m_fld_infer_time = []
    m_ge_infer_time = []
    m_fld_load_time = []
    m_ge_load_time = []

    if args.input == 'CAM':
        input_feeder = InputFeeder("cam")
    else:
        # get the input value
        input_stream = args.input
        if not os.path.isfile(input_stream):
            log.error("Provided input video file doesn't exist/video path is wrong!")
            exit(1)
        # load the video file
        input_feeder = InputFeeder("video", input_stream)

    # get model path
    head_face_detection_model_name = args.face_detection
    head_pose_estimation_model_name = args.head_pose_estimation
    face_landmarks_detection_model_name = args.facial_landmarks_detection
    gaze_estimation_model_name = args.gaze_estimation

    # mouse controller
    mouse_controller = MouseController(precision='medium', speed='fast')

    # load the required models
    m_fd_load_start_time = time.time()
    # create and load face_detection model
    face_detection = FaceDetectionModel(model_name=head_face_detection_model_name, device=args.device,
                                        probs_threshold=args.prob_threshold)
    face_detection.load_model()
    m_fd_load_time.append(round(time.time() - m_fd_load_start_time, 5))
    log.debug("Loading the face detection model took {} seconds.".format(m_fd_load_time))

    # create and load head_pose estimation model
    m_hpe_load_start_time = time.time()
    head_pose_estimation = HeadPoseEstimationModel(model_name=head_pose_estimation_model_name, device=args.device)
    head_pose_estimation.load_model()
    m_hpe_load_time.append(round(time.time() - m_hpe_load_start_time, 5))
    log.debug("Loading the head pose estimation model took {} seconds.".format(m_hpe_load_time))

    # create and load face landmarks detection model
    m_fld_load_start_time = time.time()
    face_landmark_detection = FacialLandmarksDetectionModel(model_name=face_landmarks_detection_model_name,
                                                            device=args.device)
    face_landmark_detection.load_model()
    m_fld_load_time.append(round(time.time() - m_fld_load_start_time, 5))
    log.debug("Loading the face landmark detection model took {} seconds.".format(m_fld_load_time))

    # create and load gaze estimation model
    m_ge_load_start_time = time.time()
    gaze_estimation = GazeEstimationModel(model_name=gaze_estimation_model_name, device=args.device)
    gaze_estimation.load_model()
    m_ge_load_time.append(round(time.time() - m_ge_load_start_time, 5))
    log.debug("Loading the gaze estimation model took {} seconds.".format(m_ge_load_time))

    # load the image data
    input_feeder.load_data()
    frame_count = 0
    threshold_frame = 5

    log.info("Video stream for gaze estimation started.")
    for flag, frame in input_feeder.next_batch():
        if not flag:
            break
        # process only every threshold_frame-th frame for smoother mouse control
        if frame_count % threshold_frame == 0:
            key_pressed = cv2.waitKey(60)
            if key_pressed == 27 or key_pressed & 0xFF == ord('q'):
                break
            # invoke face detection prediction
            m_fd_infer_start_time = time.time()
            detected_face_image, detected_box = face_detection.predict_face_detection(frame, args.visualization_fd)
            m_fd_infer_end_time = time.time()
            m_fd_infer_time.append(m_fd_infer_end_time - m_fd_infer_start_time)

            # invoke head pose estimation prediction
            head_pose_estimation_output, frame = head_pose_estimation.predict_head_pose_estimation(frame,
                                                                                                   detected_face_image,
                                                                                                   args.visualization_hpe)
            m_hpe_infer_end_time = time.time()
            m_hpe_infer_time.append(m_hpe_infer_end_time - m_fd_infer_end_time)

            # invoke face landmark detection prediction
            left_eye_image, right_eye_image = face_landmark_detection.predict_facial_landmarks_detection(
                detected_face_image, args.visualization_fld)
            m_fld_infer_end_time = time.time()
            m_fld_infer_time.append(m_fld_infer_end_time - m_hpe_infer_end_time)

            # invoke gaze estimation prediction
            mouse_coordinate, predicted_gaze_output = gaze_estimation.predict_gaze_estimation(left_eye_image,
                                                                                              right_eye_image,
                                                                                              head_pose_estimation_output)
            m_ge_infer_end_time = time.time()
            m_ge_infer_time.append(m_ge_infer_end_time - m_fld_infer_end_time)

            if args.visualization_ge:
                # get the output from face landmark detection
                outputs = face_landmark_detection.get_outputs()
                # get back the bounding box
                height = detected_face_image.shape[0]
                width = detected_face_image.shape[1]
                left_eye_x = int(outputs[0] * width + detected_box[0])
                left_eye_y = int(outputs[1] * height + detected_box[1])
                right_eye_x = int(outputs[2] * width + detected_box[0])
                right_eye_y = int(outputs[3] * height + detected_box[1])
                eye_bounding_box = [left_eye_x, left_eye_y, right_eye_x, right_eye_y]
                gaze_estimation.draw_gaze_estimation(eye_bounding_box, predicted_gaze_output, frame)

            # show the results
            cv2.imshow('ComputerPointer', frame)
            mouse_controller.move(mouse_coordinate[0], mouse_coordinate[1])
        frame_count = frame_count + 1

    log.info("Completed gaze estimation for the provided video.")
    log.info("Mean face detection inference time: {} seconds.".format(statistics.mean(m_fd_infer_time)))
    log.info("Mean head pose estimation inference time: {} seconds.".format(
        statistics.mean(m_hpe_infer_time)))
    log.info("Mean face landmark detection inference time: {} seconds.".format(
        statistics.mean(m_fld_infer_time)))
    log.info("Mean gaze estimation inference time: {} seconds.".format(statistics.mean(m_ge_infer_time)))
    # to perform model inference analysis
    # analyze_model_inference_time(m_fd_infer_time, m_hpe_infer_time, m_fld_infer_time, m_ge_infer_time, "FP32")
    # clean up resources
    input_feeder.close()
    cv2.destroyAllWindows()
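
# A hypothetical sketch of the commented-out `analyze_model_inference_time`
# helper referenced above; the original implementation is not shown in this
# example, so this version simply summarizes the per-model timing lists.
import statistics


def analyze_model_inference_time(fd_times, hpe_times, fld_times, ge_times,
                                 precision):
    # Print the mean inference time per model for the given precision label.
    for label, samples in [('Face detection', fd_times),
                           ('Head pose estimation', hpe_times),
                           ('Face landmark detection', fld_times),
                           ('Gaze estimation', ge_times)]:
        print('{} ({}): mean {:.4f}s over {} frames'.format(
            label, precision, statistics.mean(samples), len(samples)))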
Example #13
def main():
    args = build_argparser().parse_args()
    previewFlags = args.previewFlags

    logger = logging.getLogger()
    inputFilePath = args.input
    inputFeeder = None
    if inputFilePath.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFilePath):
            logger.error("Unable to find specified video file")
            exit(1)
        inputFeeder = InputFeeder("video", inputFilePath)

    modelPathDict = {
        'FaceDetectionModel': args.facedetectionmodel,
        'FacialLandmarksDetectionModel': args.faciallandmarkmodel,
        'GazeEstimationModel': args.gazeestimationmodel,
        'HeadPoseEstimationModel': args.headposemodel
    }

    for fileNameKey in modelPathDict.keys():
        if not os.path.isfile(modelPathDict[fileNameKey]):
            logger.error("Unable to find specified " + fileNameKey +
                         " xml file")
            exit(1)

    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device,
                             args.cpu_extension)
    fldm = FacialLandmarksDetectionModel(
        modelPathDict['FacialLandmarksDetectionModel'], args.device,
        args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'],
                              args.device, args.cpu_extension)
    hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'],
                                   args.device, args.cpu_extension)

    mc = MouseController('medium', 'fast')

    inputFeeder.load_data()
    start = time.time()
    fdm.load_model()
    fdmload_time = time.time() - start
    load_time_message = "Loading time for Face Detection Model: {:.3f}ms".format(
        fdmload_time * 1000)
    print(load_time_message)
    start = time.time()
    fldm.load_model()
    fldmload_time = time.time() - start
    load_time_message = "Loading time for Facial Landmark Model: {:.3f}ms".format(
        fldmload_time * 1000)
    print(load_time_message)
    start = time.time()
    hpem.load_model()
    hpemload_time = time.time() - start
    load_time_message = "Loading time for Head Pose Estimation Model: {:.3f}ms".format(
        hpemload_time * 1000)
    print(load_time_message)
    start = time.time()
    gem.load_model()
    gemload_time = time.time() - start
    load_time_message = "Loading time for Gaze Estimation Model: {:.3f}ms".format(
        gemload_time * 1000)
    print(load_time_message)
    total_load_time = gemload_time + fdmload_time + hpemload_time + fldmload_time
    load_time_message = "Loading time for all the Models: {:.3f}ms".format(
        total_load_time * 1000)
    print(load_time_message)

    frame_count = 0
    inf_start = time.time()
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        croppedFace, face_coords = fdm.predict(frame.copy(),
                                               args.prob_threshold)
        if isinstance(croppedFace, int):
            logger.error("Unable to detect the face.")
            if key == 27:
                break
            continue

        hp_out = hpem.predict(croppedFace.copy())

        left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy())
        total_inf_time = time.time() - inf_start
        inf_time_message = "Total Inference Time: {:.3f}s".format(
            total_inf_time)
        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out)

        if (not len(previewFlags) == 0):
            preview_frame = frame.copy()
            if 'fd' in previewFlags:
                #cv2.rectangle(preview_frame, (face_coords[0], face_coords[1]), (face_coords[2], face_coords[3]), (255,0,0), 3)
                preview_frame = croppedFace
            if 'fld' in previewFlags:
                cv2.rectangle(croppedFace,
                              (eye_coords[0][0] - 10, eye_coords[0][1] - 10),
                              (eye_coords[0][2] + 10, eye_coords[0][3] + 10),
                              (0, 255, 0), 3)
                cv2.rectangle(croppedFace,
                              (eye_coords[1][0] - 10, eye_coords[1][1] - 10),
                              (eye_coords[1][2] + 10, eye_coords[1][3] + 10),
                              (0, 255, 0), 3)
                #preview_frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = croppedFace

            if 'hp' in previewFlags:
                cv2.putText(
                    preview_frame,
                    "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                    format(hp_out[0], hp_out[1], hp_out[2]), (10, 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)
            if 'ge' in previewFlags:
                x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] *
                                                        12), 160
                le = cv2.line(left_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                re = cv2.line(right_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                croppedFace[eye_coords[0][1]:eye_coords[0][3],
                            eye_coords[0][0]:eye_coords[0][2]] = le
                croppedFace[eye_coords[1][1]:eye_coords[1][3],
                            eye_coords[1][0]:eye_coords[1][2]] = re
                #preview_frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = croppedFace

            cv2.imshow("visualization", cv2.resize(preview_frame, (500, 500)))
        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break
    logger.error("VideoStream ended...")
    cv2.destroyAllWindows()
    inputFeeder.close()
    print(inf_time_message)
    fps = frame_count / total_inf_time
    fps_message = "Total FPS: {:.3f} fps".format(fps)
    print(fps_message)
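The per-model load timing in Example #13 repeats the same start/stop pattern four times. A minimal helper that factors it out, as a sketch (the timed helper is ours, not part of the original code):

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # Measure wall-clock time for the enclosed block and report it in ms.
    start = time.time()
    yield
    print("{}: {:.3f}ms".format(label, (time.time() - start) * 1000))

# Hypothetical usage, mirroring the loads above:
# with timed("Loading time for Face Detection Model"):
#     fdm.load_model()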
Example #14
def main():
    args = build_argparser().parse_args()

    logger = logging.getLogger()
    # video file
    inputFilePath = args.input
    inputFeeder = None
    # check to see if the user wants to use a video or camera feed
    if inputFilePath.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFilePath):
            logger.error("Unable to find specified video file")
            exit(1)
        inputFeeder = InputFeeder("video", inputFilePath)

    # get model files from the command line
    modelPathDict = {
        'FaceDetectionModel': args.facedetectionmodel,
        'FacialLandmarksDetectionModel': args.faciallandmarkmodel,
        'GazeEstimationModel': args.gazeestimationmodel,
        'HeadPoseEstimationModel': args.headposemodel
    }

    # check if all files are accessible and correct
    for fileNameKey in modelPathDict.keys():
        if not os.path.isfile(modelPathDict[fileNameKey]):
            logger.error("Unable to find specified " + fileNameKey +
                         " xml file")
            exit(1)

    # initializing the 4 models
    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device,
                             args.cpu_extension)
    fldm = FacialLandmarksDetectionModel(
        modelPathDict['FacialLandmarksDetectionModel'], args.device,
        args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'],
                              args.device, args.cpu_extension)
    hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'],
                                   args.device, args.cpu_extension)

    mc = MouseController('medium', 'fast')

    # loading models
    inputFeeder.load_data()
    fdm.load_model()
    fldm.load_model()
    hpem.load_model()
    gem.load_model()

    # starting frame by frame inference
    frame_count = 0
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        # getting the cropped face from the Face Detection model
        croppedFace, face_coords = fdm.predict(frame.copy(),
                                               args.prob_threshold)
        if type(croppedFace) == int:
            logger.error("Unable to detect the face.")
            if key == 27:
                break
            continue

        # getting the head pose estimation output
        hp_out = hpem.predict(croppedFace.copy())

        # getting the eye crops and coordinates from the facial landmarks model, using the cropped face from face detection as input
        left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy())

        # getting the mouse coordinates by feeding the landmark and head-pose outputs
        # into the gaze estimation model (one possible mapping is sketched after this function)
        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out)

        # moving the mouse to the new position
        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break
    logger.error("Video ended, exiting...")
    cv2.destroyAllWindows()
    inputFeeder.close()
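In Example #14, gem.predict() turns the eye crops and head-pose angles into mouse coordinates and a gaze vector. The gaze-estimation class itself is not shown in these examples; one common mapping (an assumption here, not necessarily what each class does) rotates the gaze vector by the head roll angle so that cursor motion stays level when the head tilts:

import math

def gaze_to_mouse(gaze_vector, roll_degrees):
    # Rotate the (x, y) gaze components by the head roll angle.
    cs = math.cos(roll_degrees * math.pi / 180.0)
    sn = math.sin(roll_degrees * math.pi / 180.0)
    x = gaze_vector[0] * cs + gaze_vector[1] * sn
    y = -gaze_vector[0] * sn + gaze_vector[1] * cs
    return x, y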
def start_infer(args):
    face_model = args.facemodel
    gaze_model = args.gazemodel
    head_model = args.headmodel
    landmarks_model = args.landmarksmodel

    device = args.device
    extensions = args.cpu_extension
    input_type = args.inputtype.lower()
    input_file = args.inputfile
    threshold = args.threshold
    benchmark = args.benchmark
    preview = args.preview

    feed = None
    key_pressed = None

    if input_type == "cam":
        feed = InputFeeder(input_type="cam")
    else:
        if not os.path.isfile(input_file):
            log.error("cannot find file {}".format(input_file))
            exit(1)

        feed = InputFeeder(input_type="video", input_file=input_file)

    face_network = FaceDetectionModel(model_name=face_model,
                                      device=device,
                                      threshold=threshold,
                                      extensions=extensions)

    head_network = HeadPoseModel(model_name=head_model,
                                 device=device,
                                 threshold=threshold,
                                 extensions=extensions)

    landmarks_network = FacialLandmarksModel(model_name=landmarks_model,
                                             device=device,
                                             threshold=threshold,
                                             extensions=extensions)

    gaze_network = GazeModel(model_name=gaze_model,
                             device=device,
                             threshold=threshold,
                             extensions=extensions)

    mouse_control = MouseController("medium", "fast")

    face_network.load_model()
    head_network.load_model()
    landmarks_network.load_model()
    gaze_network.load_model()

    feed.load_data()

    try:
        for flag, frame in feed.next_batch():
            if not flag:
                break

            if not benchmark:
                key_pressed = cv2.waitKey(60)

            face_output, cropped_face_frame = face_network.predict([frame])
            head_output, cropped_face_frame = head_network.predict(
                [cropped_face_frame])
            landmarks_output, cropped_eyes = landmarks_network.predict(
                [cropped_face_frame])
            mouse_coords, gaze_output = gaze_network.predict(
                [head_output, cropped_eyes[0], cropped_eyes[1]])

            # disable preview and mouse control while benchmarking
            # to make it more accurate
            if not benchmark:
                # Input user from preview argument
                if preview:
                    nframe = draw_output(
                        cropped_face_frame,
                        head_output,
                        landmarks_output,
                        gaze_output,
                    )
                    cv2.imshow("preview", nframe)

                # pyautogui.FAILSAFE is set to False in the MouseController class
                # to prevent PyAutoGUI fail-safe exceptions when the mouse reaches
                # a screen corner (see the sketch after this function)
                mouse_control.move(mouse_coords[0], mouse_coords[1])

                if key_pressed == 27:
                    break
    except Exception as e:
        log.error(
            "error while running inference on the input source; details below:\n{}".
            format(e))

    # save benchmarks values to output directory
    if benchmark:
        face_network.print_benchmark()
        head_network.print_benchmark()
        landmarks_network.print_benchmark()
        gaze_network.print_benchmark()

    cv2.destroyAllWindows()
    feed.close()
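As noted in the comment inside start_infer(), the MouseController disables PyAutoGUI's fail-safe so that driving the pointer into a screen corner does not raise an exception. A minimal sketch of such a controller (an illustration; the real MouseController ships with the project starter code):

import pyautogui

class SketchMouseController:
    def __init__(self, precision_px=100, speed_s=0.1):
        # Disable the corner fail-safe, as described above.
        pyautogui.FAILSAFE = False
        self.precision = precision_px
        self.speed = speed_s

    def move(self, x, y):
        # Move relative to the current pointer position; the gaze vector's
        # y axis points up, while screen y coordinates grow downward.
        pyautogui.moveRel(x * self.precision, -y * self.precision,
                          duration=self.speed)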
Example #16
def main():

    # Grab command line args
    args = build_argparser().parse_args()
    logger = logging.getLogger()
    inputFilePath = args.input
    inputFeeder = None
    inference_time = None

    if inputFilePath.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFilePath):
            logger.error("Unable to find specified video file")
            exit(1)
        inputFeeder = InputFeeder("video", inputFilePath)

    # Initialize variables with the input arguments
    modelPathDict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'FacialLandmarksDetectionModel': args.FacialLandmarksDetectionModel,
        'GazeEstimationModel': args.gazeEstimationModel,
        'HeadPoseEstimationModel': args.HeadPoseEstimationModel
    }

    for fileNameKey in modelPathDict.keys():
        if not os.path.isfile(modelPathDict[fileNameKey]):
            logger.error("Unable to find specified " + fileNameKey +
                         " xml file")
            exit(1)

    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device,
                             args.cpu_extension)
    flm = FacialLandmarksDetectionModel(
        modelPathDict['FacialLandmarksDetectionModel'], args.device,
        args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'],
                              args.device, args.cpu_extension)
    hpe = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'],
                                  args.device, args.cpu_extension)
    mc = MouseController('high', 'fast')

    inputFeeder.load_data()

    # Load Models and generate load times

    start_time = time.time()
    fdm.load_model()
    logger.error("Face detection model loaded: time: {:.3f} ms".format(
        (time.time() - start_time) * 1000))
    first_mark = time.time()
    flm.load_model()
    logger.error(
        "Facial landmarks detection model loaded: time: {:.3f} ms".format(
            (time.time() - first_mark) * 1000))
    second_mark = time.time()
    hpe.load_model()
    logger.error("Head pose estimation model loaded: time: {:.3f} ms".format(
        (time.time() - second_mark) * 1000))
    third_mark = time.time()
    gem.load_model()
    logger.error("Gaze estimation model loaded: time: {:.3f} ms".format(
        (time.time() - third_mark) * 1000))
    load_total_time = time.time() - start_time
    logger.error("Total loading time: time: {:.3f} ms".format(load_total_time *
                                                              1000))
    logger.error("Required models have been loaded..")

    frame_count = 0
    start_inf_time = time.time()
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (600, 800)))

        key = cv2.waitKey(60)
        croppedFace, face_coords = fdm.predict(frame.copy(),
                                               args.prob_threshold)
        if type(croppedFace) == int:
            logger.error("Unable to detect the face.")
            if key == 27:
                break
            continue
        hp_out = hpe.predict(croppedFace.copy())
        left_eye, right_eye, eye_coords = flm.predict(croppedFace.copy())
        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out)

        # move the mouse only after the new coordinates have been computed
        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])

    inference_time = round(time.time() - start_inf_time, 1)
    total_frames = int(frame_count)
    fps = int(frame_count) / (inference_time)
    logger.error("count {} seconds".format(frame_count))
    logger.error("total inference time {} seconds".format(inference_time))
    logger.error("total frames {} frames".format(frame_count))
    logger.error("fps {} frame/second".format(fps))

    with open(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'RunReport.txt'), 'w') as R:
        R.write('Load Time: ' + str(load_total_time) + '\n')
        R.write('Inference Time: ' + str(inference_time) + '\n')
        R.write('Total frames processed: ' + str(total_frames) + '\n')
        R.write('fps: ' + str(fps) + '\n')
    logger.error("VideoStream ended...")
    cv2.destroyAllWindows()
    inputFeeder.close()
    atexit.register(profile.print_stats)
Example #17
def main():

    # Grabbing command line args
    args = build_argparser().parse_args()
    # Getting the input file path
    inputFilePath = args.input
    # For visualization
    visual_flag = args.visualization_flag
    # Initialize input feeder
    inputFeeder = None

    # Handle video file or CAM (webcam)
    if args.input == "CAM":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(args.input):
            log.info("Unable to find specified video file")
            sys.exit(1)
        inputFeeder = InputFeeder("video", args.input)

    
    # Now define the model path dictionary for all four Intel pre-trained models
    modelPathDict = {
        'FaceDetectionModel': args.face_detection_model,
        'FacialLandmarksDetectionModel': args.facial_landmark_model,
        'GazeEstimationModel': args.gaze_estimation_model,
        'HeadPoseEstimationModel': args.head_pose_model
    }

    # Check the model XML files
    for fileNameKey in modelPathDict.keys():
        if not os.path.isfile(modelPathDict[fileNameKey]):
            log.info("Unable to find specified " + fileNameKey + " xml file")
            sys.exit(1)

    # Defining the Intel pre-trained model objects
    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device, args.cpu_extension)
    fldm = FacialLandmarksDetectionModel(modelPathDict['FacialLandmarksDetectionModel'], args.device, args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'], args.device, args.cpu_extension)
    hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'], args.device, args.cpu_extension)

    # Choosing precision and speed for the mouse controller
    mc = MouseController('medium', 'fast')
    
    # Loading the input feeder
    inputFeeder.load_data()

    # Loading our four pre-trained models and calculating the total model load time.
    # This helps compare load times across model precisions such as FP32, FP16 & INT8.
    start_time_1 = time.time()
    fdm.load_model()
    fldm.load_model()
    hpem.load_model()
    gem.load_model()
    total_model_load_time = time.time() - start_time_1
    print("Total Model Load Time for All our Intel Pre Trained Models is (in seconds): {:.3f}".format(total_model_load_time))
    
    
    frame_count = 0
    start_time = time.time()

    # Loop over the input feeder until the stream ends
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (450, 450)))

        key = cv2.waitKey(60)
        # Extracting the cropped face and its coordinates
        croppedFace, face_coords = fdm.predict(frame.copy(), args.prob_threshold)
        if type(croppedFace) == int:
            log.info("Unable to detect the face.")
            if key == 27:
                break
            continue
        
        # Head pose detection
        hp_out = hpem.predict(croppedFace.copy())

        # Landmark detection (left eye, right eye, eye coordinates)
        left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy())

        # Mouse coordinates and gaze vector estimation
        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out)
        
        # Creating variables for visualization
        # Extracting the four face coordinates for the rectangle (xmin, ymin, xmax, ymax)
        x_minimum = face_coords[0]
        y_minimum = face_coords[1]
        x_maximum = face_coords[2]
        y_maximum = face_coords[3]

        # Margin around each eye
        eye_surrounding_area = 10

        # Extracting the four coordinates of the left eye
        l_l1 = eye_coords[0][0]
        l_l2 = eye_coords[0][1]
        l_l3 = eye_coords[0][2]
        l_l4 = eye_coords[0][3]

        # Extracting the four coordinates of the right eye
        r_r1 = eye_coords[1][0]
        r_r2 = eye_coords[1][1]
        r_r3 = eye_coords[1][2]
        r_r4 = eye_coords[1][3]

        # Extracting yaw (pose angle), pitch and roll from the head pose output
        pose_angle = hp_out[0]
        pitch = hp_out[1]
        roll = hp_out[2]
            
        # Visualizing face, landmarks, head pose and gaze
        if not len(visual_flag) == 0:
            preview_frame = frame.copy()
            if 'fd' in visual_flag:
                # Drawing a rectangle with the four face coordinates (xmin, ymin, xmax, ymax)
                cv2.rectangle(preview_frame, (x_minimum, y_minimum), (x_maximum, y_maximum), (20, 20, 150), 3)

            if 'fld' in visual_flag:
                # Drawing a rectangle around each eye using the eye coordinates and the surrounding margin
                # Left eye
                cv2.rectangle(preview_frame, (l_l1 - eye_surrounding_area, l_l2 - eye_surrounding_area), (l_l3 + eye_surrounding_area, l_l4 + eye_surrounding_area), (60, 255, 0), 2)
                # Right eye
                cv2.rectangle(preview_frame, (r_r1 - eye_surrounding_area, r_r2 - eye_surrounding_area), (r_r3 + eye_surrounding_area, r_r4 + eye_surrounding_area), (60, 255, 0), 2)

            if 'hp' in visual_flag:
                # Overlaying yaw (pose angle), pitch and roll from the head pose output
                cv2.putText(preview_frame, "Pose Angles:{:.2f} | pitch:{:.2f} | roll:{:.2f}".format(pose_angle, pitch, roll), (20, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 60), 1)
                
                
            if 'ge' in visual_flag:
                # Left eye midpoint within the cropped face
                le_x = (l_l1 + l_l3) / 2
                le_y = (l_l2 + l_l4) / 2
                # Right eye midpoint within the cropped face
                re_x = (r_r1 + r_r3) / 2
                re_y = (r_r2 + r_r4) / 2
                # Eye centers in full-frame coordinates
                le_center = int(x_minimum + le_x), int(y_minimum + le_y)
                re_center = int(x_minimum + re_x), int(y_minimum + re_y)
                eyes_center = [le_center, re_center]
                # Left eye center x and y
                le_center_x = int(eyes_center[0][0])
                le_center_y = int(eyes_center[0][1])
                # Right eye center x and y
                re_center_x = int(eyes_center[1][0])
                re_center_y = int(eyes_center[1][1])
                # x and y (first and second) components of the gaze vector
                g_x, g_y = gaze_vector[0:2]

                # Drawing arrowed lines for the gaze direction from each eye center
                cv2.arrowedLine(preview_frame, (le_center_x, le_center_y), (le_center_x + int(g_x * 100), le_center_y + int(-g_y * 100)), (0, 50, 160), 1)
                cv2.arrowedLine(preview_frame, (re_center_x, re_center_y), (re_center_x + int(g_x * 100), re_center_y + int(-g_y * 100)), (0, 50, 160), 1)

            cv2.imshow("visualization", cv2.resize(preview_frame, (450, 450)))
        
        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break
    log.info("VideoStream has ended")
    cv2.destroyAllWindows()
    inputFeeder.close()

    # Calculating inference time and frames per second
    total_time = time.time() - start_time
    total_inference_time = total_time
    fps = frame_count / total_inference_time
    print("Inference time: {:.3f}".format(total_inference_time))
    print("FPS: {}".format(fps))
Example #18
def main():

    args = build_argparser().parse_args()
    inputFilePath = args.input
    inputFeeder = None

    if args.input == "CAM":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(args.input):
            log.info("Unable to find specified video file")
            sys.exit(1)
        inputFeeder = InputFeeder("video", args.input)

    modelPathDict = {
        'FaceDetectionModel': args.face_detection_model,
        'FacialLandmarksDetectionModel': args.facial_landmark_model,
        'GazeEstimationModel': args.gaze_estimation_model,
        'HeadPoseEstimationModel': args.head_pose_model
    }

    for fileNameKey in modelPathDict.keys():
        if not os.path.isfile(modelPathDict[fileNameKey]):
            log.info("Unable to find specified " + fileNameKey + " xml file")
            sys.exit(1)

    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device,
                             args.cpu_extension)
    fldm = FacialLandmarksDetectionModel(
        modelPathDict['FacialLandmarksDetectionModel'], args.device,
        args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'],
                              args.device, args.cpu_extension)
    hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'],
                                   args.device, args.cpu_extension)

    mc = MouseController('medium', 'fast')

    inputFeeder.load_data()

    # start the timer after the input feeder is ready, so that only
    # model loading is measured
    start_time_1 = time.time()
    fdm.load_model()
    fldm.load_model()
    hpem.load_model()
    gem.load_model()
    total_model_load_time = (time.time() - start_time_1)
    print("Model Load Time: {:.3f}".format(total_model_load_time))

    frame_count = 0
    start_time = time.time()

    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (450, 450)))

        key = cv2.waitKey(60)
        croppedFace, face_coords = fdm.predict(frame.copy(),
                                               args.prob_threshold)
        if type(croppedFace) == int:
            log.info("Unable to detect the face.")
            if key == 27:
                break
            continue

        hp_out = hpem.predict(croppedFace.copy())

        left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy())

        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out)

        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break
    log.info("VideoStream has ended.")
    cv2.destroyAllWindows()
    inputFeeder.close()

    total_time = time.time() - start_time
    total_inference_time = total_time
    fps = frame_count / total_inference_time
    print("Inference Time: {:.3f}".format(total_inference_time))
    print("FPS: {}".format(fps))
class Inferencer:
    def __init__(self,
                 device='CPU',
                 mouse_con=False,
                 face_dec=None,
                 fac_land=None,
                 head_pose=None,
                 gaze=None,
                 show_video=False,
                 save_video=False):
        '''
        All four model paths must be provided; each model is loaded here.
        '''
        if face_dec and fac_land and head_pose and gaze:
            self.face_dec, self.fac_land, self.head_pose, self.gaze = FaceDetectionModel(
                face_dec, device=device), FacialLandmarksDetection(
                    fac_land, device=device), Head_Pose_Estimation(
                        head_pose,
                        device=device), Gaze_Estimation(gaze, device=device)
            self.face_dec.load_model()
            self.fac_land.load_model()
            self.head_pose.load_model()
            self.gaze.load_model()
        else:
            raise ValueError('Missing Arguments')

        if mouse_con:
            self.mouse_con = MouseController("low", "fast")
        else:
            # keep the attribute defined so run() can test it safely
            self.mouse_con = None

        self.show_video, self.save_video = show_video, save_video

    def __call__(
        self,
        input_type=None,
        input_file=None,
    ):
        self.run(input_type=input_type, input_file=input_file)

    def run(
        self,
        input_type=None,
        input_file=None,
    ):
        if not input_type:
            raise ValueError('Missing input type')
        self.input_ = InputFeeder(input_type, input_file)
        self.input_.load_data()
        if self.save_video:
            out = cv2.VideoWriter(
                'output.mp4', 0x00000021, 30,
                (int(self.input_.cap.get(3)), int(self.input_.cap.get(4))))
        try:
            fc_dec_inf_time = 0
            landmark_inf_time = 0
            pose_inf_time = 0
            gaze_inf_time = 0
            frame_counter = 0
            while True:
                # Read the next frame; next_batch() builds a fresh generator each
                # call but reads from the shared capture (see the sketch after this class)
                try:
                    frame = next(self.input_.next_batch())
                    frame_counter += 1
                except StopIteration:
                    break

                key_pressed = cv2.waitKey(60)

                # face detection
                start = time.time()
                out_frame, boxes = self.face_dec.predict(frame,
                                                         display_output=True)
                fc_dec_inf_time += (time.time() - start)

                #for each box
                for box in boxes:
                    face = out_frame[box[1]:box[3], box[0]:box[2]]

                    start = time.time()
                    out_frame, left_eye_point, right_eye_point = self.fac_land.predict(
                        out_frame, face, box, display_output=True)
                    landmark_inf_time += (time.time() - start)

                    start = time.time()
                    out_frame, headpose_angels = self.head_pose.predict(
                        out_frame, face, box, display_output=True)
                    pose_inf_time += (time.time() - start)

                    start = time.time()
                    out_frame, gazevector = self.gaze.predict(
                        out_frame,
                        face,
                        box,
                        left_eye_point,
                        right_eye_point,
                        headpose_angels,
                        display_output=True)
                    gaze_inf_time += (time.time() - start)

                    if self.show_video:
                        cv2.imshow('im', out_frame)

                    if self.save_video:
                        out.write(out_frame)

                    if self.mouse_con:
                        self.mouse_con.move(gazevector[0], gazevector[1])

                    time.sleep(1)

                    #consider only first detected face in the frame
                    break

                # Break if escape key pressed
                if key_pressed == 27:
                    break

            if self.save_video:
                out.release()
            self.input_.close()
            cv2.destroyAllWindows()
            print(
                'average inference time for face detection model is :- {:.2f}ms'
                .format((fc_dec_inf_time / frame_counter) * 1000))
            print(
                'average inference time for facial landmark model is :- {:.2f}ms'
                .format((landmark_inf_time / frame_counter) * 1000))
            print(
                'average inference time for head pose estimation model is :- {:.2f}ms'
                .format((pose_inf_time / frame_counter) * 1000))
            print(
                'average inference time for gaze estimation model is :- {:.2f}ms'
                .format((gaze_inf_time / frame_counter) * 1000))
        except Exception as ex:
            logging.exception("Error in inference: " + str(ex))
Example #20
def main():
    """
    """

    # Grab command line args
    args = build_argparser().parse_args()

    input_src = args.input
    device = args.device
    extension = args.cpu_extension
    prob_threshold = args.prob_threshold

    face_detection_model = args.facedetectionmodel
    head_pose_model = args.headposemodel
    landmarks_model = args.facelandmarksnmodel
    gaze_estimation_model = args.gazeestimationmodel

    # Create log object set for console output and set log level
    log_obj = log.getLogger()
    log_obj.setLevel(LOGLEVEL)

    console_handler = log.StreamHandler()
    console_handler.setLevel(LOGLEVEL)
    log_obj.addHandler(console_handler)

    # Create detection objects
    face_detection_obj = FaceDetectionModel(face_detection_model, device,
                                            extension)
    head_pose_obj = HeadPoseModel(head_pose_model, device, extension)
    landmarks_obj = LandmarksModel(landmarks_model, device, extension)
    gaze_estimation_obj = GazeEstimationModel(gaze_estimation_model, device,
                                              extension)

    # Create mouse controller object
    mouse_controller = MouseController('medium', 'fast')
    # Place mouse at the center of the screen (one possible init_position()
    # implementation is sketched after this example)
    mouse_controller.init_position()
    log_obj.info("[Info]: Place mouse at the center of the screen")

    # Place holder for total inferencing time
    total_inference_time = 0

    # Load models and get the model loading times
    start_time = time.time()
    face_detection_obj.load_model()
    end_time = time.time()
    face_detection_loading_time = end_time - start_time

    start_time = time.time()
    head_pose_obj.load_model()
    end_time = time.time()
    head_pose_loading_time = end_time - start_time

    start_time = time.time()
    landmarks_obj.load_model()
    end_time = time.time()
    landmarks_detection_loading_time = end_time - start_time

    start_time = time.time()
    gaze_estimation_obj.load_model()
    end_time = time.time()
    gaze_estimation_loading_time = end_time - start_time

    # Configure input video source
    if input_src.lower() == 'cam':
        input_channel = InputFeeder(input_type='cam')
    elif not os.path.exists(input_src):
        log.error("Video file not found! Exiting....")
        exit(1)
    else:
        input_channel = InputFeeder(input_type='video', input_file=input_src)
        log_obj.info("[Info]: Opening video file ...")

    input_channel.load_data()
    video_width = int(input_channel.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    video_height = int(input_channel.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(input_channel.cap.get(cv2.CAP_PROP_FPS))

    frame_counter = 0
    total_face_inf_time = 0
    total_head_inf_time = 0
    total_landmarks_inf_time = 0
    total_gaze_inf_time = 0
    frame_processing_time = 0

    # Process each frame
    try:
        for frame in input_channel.next_batch():
            frame_processing_start_time = time.time()

            frame_counter = frame_counter + 1
            key = cv2.waitKey(60)

            # Use face detection to find cropped face and provide face coordinates
            cropped_face, face_coords, face_inference_time = face_detection_obj.predict(
                frame, prob_threshold)
            total_face_inf_time = total_face_inf_time + face_inference_time

            #  Now use cropped face for head pose detection
            head_pose_estimate, head_inference_time = head_pose_obj.predict(
                cropped_face, prob_threshold)
            total_head_inf_time = total_head_inf_time + head_inference_time

            #  Now use cropped face for landmarks detection
            cropped_left_eye, cropped_right_eye, eyes_coords, converted_landmarks, landmarks_inference_time = landmarks_obj.predict(
                cropped_face, prob_threshold)
            total_landmarks_inf_time = total_landmarks_inf_time + landmarks_inference_time

            #  Finally gaze estimation
            gaze_vector, gaze_estimate_time = gaze_estimation_obj.predict(
                cropped_left_eye, cropped_right_eye, head_pose_estimate)
            total_gaze_inf_time = total_gaze_inf_time + gaze_estimate_time

            # Move the mouse
            #mouse_controller.move(gaze_vector[0], gaze_vector[1])

            # Show size-reduced frame for visual comparison

            # Check potential visualize flags: 'F', 'H', 'L', 'G'
            # If flag exist, process image to show inference results
            if args.visualize is not None:

                visualize_flag = str(args.visualize)

                # Draw bounding box around detected face
                if 'F' in visualize_flag:
                    cv2.rectangle(frame,
                                  (face_coords[0][0], face_coords[0][1]),
                                  (face_coords[0][2], face_coords[0][3]),
                                  (0, 255, 0), 2)

                # Show head pose parameters
                if 'H' in visualize_flag:
                    cv2.putText(
                        frame,
                        "Head pose: yaw: {:.3f}, pitch: {:.3f}, roll: {:.3f}".
                        format(head_pose_estimate[0], head_pose_estimate[1],
                               head_pose_estimate[2]), (10, 20),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 5)

                # Draw dots on detected facial landmarks
                if 'L' in visualize_flag:
                    cv2.circle(frame,
                               (converted_landmarks[0] + face_coords[0][0],
                                converted_landmarks[1] + face_coords[0][1]),
                               10, (0, 255, 0), 5)
                    cv2.circle(frame,
                               (converted_landmarks[2] + face_coords[0][0],
                                converted_landmarks[3] + face_coords[0][1]),
                               10, (0, 255, 0), 5)
                    cv2.circle(frame,
                               (converted_landmarks[4] + face_coords[0][0],
                                converted_landmarks[5] + face_coords[0][1]),
                               10, (0, 255, 0), 5)
                    cv2.circle(frame,
                               (converted_landmarks[6] + face_coords[0][0],
                                converted_landmarks[7] + face_coords[0][1]),
                               10, (0, 255, 0), 5)
                    cv2.circle(frame,
                               (converted_landmarks[8] + face_coords[0][0],
                                converted_landmarks[9] + face_coords[0][1]),
                               10, (0, 255, 0), 5)

                # Display gaze parameters
                if 'G' in visualize_flag:
                    cv2.putText(
                        frame,
                        "Gaze estimate: x: {:.3f}, y: {:.3f}, z: {:.3f}".
                        format(gaze_vector[0], gaze_vector[1], gaze_vector[2]),
                        (10, 100), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 5)

            resized_frame = cv2.resize(frame, (640, 360))
            cv2.imshow('frame', resized_frame)

            if frame_counter % 4 == 0:
                mouse_controller.move(gaze_vector[0], gaze_vector[1])

            frame_processing_time = frame_processing_time + (
                time.time() - frame_processing_start_time) * 1000

            if key == 27:
                break

    except Exception as e:
        #traceback.print_exc()
        if 'shape' in str(e):
            log_obj.info("Video feed finished")
        else:
            log_obj.error("[ERROR]: " + str(e))
        pass

    # All done, cleaning up
    cv2.destroyAllWindows()
    input_channel.close()

    # Print out statistics
    log_obj.info("[Info]: Video source FPS: " + str(fps))
    log_obj.info("[Info]: Total frame count: " + str(frame_counter))
    log_obj.info("")
    log_obj.info("[Info]: Face detection model loading time: {:.3f} ms".format(
        face_detection_loading_time * 1000))
    log_obj.info("[Info]: Head pose model loading time: {:.3f} ms".format(
        head_pose_loading_time * 1000))
    log_obj.info(
        "[Info]: Facial landmarks detection model loading time: {:.3f} ms".
        format(landmarks_detection_loading_time * 1000))
    log_obj.info(
        "[Info]: Gaze estimation model loading time: {:.3f} ms".format(
            gaze_estimation_loading_time * 1000))
    log_obj.info("")
    log_obj.info(
        "[Info]: Average  per frame total processing time : {:.3f} ms".format(
            frame_processing_time / frame_counter))
    log_obj.info("[Info]: Average face inferencing  time: {:.3f} ms".format(
        total_face_inf_time / frame_counter))
    log_obj.info(
        "[Info]: Average head pose  inferencing  time: {:.3f} ms".format(
            total_head_inf_time / frame_counter))
    log_obj.info(
        "[Info]: Average facial landmarks inferencing  time: {:.3f} ms".format(
            total_landmarks_inf_time / frame_counter))
    log_obj.info("[Info]: Average gaze estimate  time: {:.3f} ms".format(
        total_gaze_inf_time / frame_counter))
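Example #20 calls mouse_controller.init_position() to park the pointer at the screen center before inference starts. That method is not part of the stock MouseController; a plausible implementation, assuming a pyautogui-backed controller (an assumption, not the example's actual code):

import pyautogui

def init_position():
    # Move the pointer to the center of the primary screen.
    width, height = pyautogui.size()
    pyautogui.moveTo(width // 2, height // 2)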
Example #21
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    try:
        logging.basicConfig(level=logging.INFO,
                            format="%(asctime)s [%(levelname)s] %(message)s",
                            handlers=[
                                logging.FileHandler("gaze-app.log"),
                                logging.StreamHandler()
                            ])

        # Initialise the class
        mc = MouseController("low", "fast")
        #mc.move(100,100)
        fdnet = FaceDetectionModel(args.fdmodel)
        lmnet = FacialLandMarksDetectionModel(args.lmmodel)
        hpnet = HeadPoseEstimationModel(args.hpmodel)
        genet = GazeEstimationModel(args.gemodel)

        ### Load the models; the per-model wrapper pattern is sketched after this function ###
        logging.info("============== Models Load time ===============")
        start_time = time.time()
        fdnet.load_model()
        logging.info("Face Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        lmnet.load_model()
        logging.info("Facial Landmarks Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        hpnet.load_model()
        logging.info("Headpose Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        genet.load_model()
        logging.info("Gaze Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        logging.info("==============  End =====================")
        # Get and open video capture
        feeder = InputFeeder('video', args.input)
        feeder.load_data()
        # FPS = feeder.get_fps()

        # Grab the shape of the input
        # width = feeder.get_width()
        # height = feeder.get_height()

        # init scene variables
        frame_count = 0

        ### Loop until stream is over ###
        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0
        while True:
            # Read the next frame
            try:
                frame = next(feeder.next_batch())
            except StopIteration:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1
            #print(int((frame_count) % int(FPS)))

            # face detection
            p_frame = fdnet.preprocess_input(frame)
            start_time = time.time()
            fnoutput = fdnet.predict(p_frame)
            fd_infertime += time.time() - start_time
            out_frame, fboxes = fdnet.preprocess_output(
                fnoutput, frame, args.print)

            #for each face
            for fbox in fboxes:

                # fbox = (xmin,ymin,xmax,ymax)
                # get face landmarks
                # crop face from frame
                face = frame[fbox[1]:fbox[3], fbox[0]:fbox[2]]
                p_frame = lmnet.preprocess_input(face)

                start_time = time.time()
                lmoutput = lmnet.predict(p_frame)
                lm_infertime += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output(
                    lmoutput, fbox, out_frame, args.print)

                # get head pose estimation
                p_frame = hpnet.preprocess_input(face)
                start_time = time.time()
                hpoutput = hpnet.predict(p_frame)
                hp_infertime += time.time() - start_time
                out_frame, headpose_angels = hpnet.preprocess_output(
                    hpoutput, out_frame, face, fbox, args.print)

                # get gaze  estimation
                out_frame, left_eye, right_eye = genet.preprocess_input(
                    out_frame, face, left_eye_point, right_eye_point,
                    args.print)
                start_time = time.time()
                geoutput = genet.predict(left_eye, right_eye, headpose_angels)
                ge_infertime += time.time() - start_time
                out_frame, gazevector = genet.preprocess_output(
                    geoutput, out_frame, fbox, left_eye_point, right_eye_point,
                    args.print)

                if (not args.no_video):
                    cv2.imshow('im', out_frame)

                if (not args.no_move):
                    mc.move(gazevector[0], gazevector[1])

                #consider only first detected face in the frame
                break

            # Break if escape key pressed
            if key_pressed == 27:
                break

        #logging inference times
        if (frame_count > 0):
            logging.info(
                "============== Models Inference time ===============")
            logging.info("Face Detection:{:.1f}ms".format(1000 * fd_infertime /
                                                          frame_count))
            logging.info("Facial Landmarks Detection:{:.1f}ms".format(
                1000 * lm_infertime / frame_count))
            logging.info("Headpose Estimation:{:.1f}ms".format(
                1000 * hp_infertime / frame_count))
            logging.info("Gaze Estimation:{:.1f}ms".format(
                1000 * ge_infertime / frame_count))
            logging.info("============== End ===============================")

        # Release the capture and destroy any OpenCV windows
        feeder.close()
        cv2.destroyAllWindows()
    except Exception as ex:
        logging.exception("Error in inference:" + str(ex))
def infer_on_stream(args):
    face_detection_model_file = args.faceDetectionModel
    facial_landmarks_detection_model_file = args.facialLandmarksModel
    head_pose_estimation_model_file = args.headPoseModel
    gaze_estimation_model_file = args.gazeModel

    video_file = args.input
    device_name = args.device
    cpu_extension = args.cpu_extension
    prob_threshold = args.prob_threshold
    preview_flag = args.preview_flag

    output_path = args.output_path
    if not os.path.exists(output_path):
        os.mkdir(output_path)

    mouse_control = MouseController("low", "fast")

    try:
        logging.info("*********** Model Load Time ***************")
        start_model_load_time = time.time()

        start_time = time.time()
        face_detection_model = FaceDetectionModel(face_detection_model_file,
                                                  device_name, cpu_extension)
        logging.info("Face Detection Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        facial_landmarks_detection_model = FacialLandmarksDetectionModel(
            facial_landmarks_detection_model_file, device_name, cpu_extension)
        logging.info("Facial Landmarks Detection Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        head_pose_estimation_model = HeadPoseEstimationModel(
            head_pose_estimation_model_file, device_name, cpu_extension)
        logging.info("Head Pose Estimation Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        gaze_estimation_model = GazeEstimationModel(gaze_estimation_model_file,
                                                    device_name, cpu_extension)
        logging.info("Gaze Estimation Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        total_model_load_time = time.time() - start_model_load_time
        logging.info("*********** Model Load Completed ***********")
    except Exception as e:
        logging.error("ERROR in model loading: " + str(e))
        sys.exit(1)

    feeder = InputFeeder('video', video_file)
    feeder.load_data()

    out_video = cv2.VideoWriter(os.path.join(output_path, 'output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                int(feeder.fps() / 10), (1920, 1080), True)

    start_inference_time = time.time()
    frame_count = 0
    face_detect_infer_time = 0
    facial_landmarks_infer_time = 0
    head_pose_infer_time = 0
    gaze_infer_time = 0

    while True:
        try:
            frame = next(feeder.next_batch())
        except StopIteration:
            break

        key_pressed = cv2.waitKey(60)
        frame_count += 1

        ## Face Detecton Model
        image = face_detection_model.preprocess_input(frame)

        start_time = time.time()
        outputs = face_detection_model.predict(image)
        face_detect_infer_time += (time.time() - start_time)
        out_frame, faces = face_detection_model.preprocess_output(
            outputs, frame, preview_flag, prob_threshold)

        for face in faces:
            crop_image = frame[face[1]:face[3], face[0]:face[2]]

            ## Facial Landmarks Detecton Model
            image = facial_landmarks_detection_model.preprocess_input(
                crop_image)

            start_time = time.time()
            outputs = facial_landmarks_detection_model.predict(image)
            facial_landmarks_infer_time += (time.time() - start_time)
            out_frame, left_eye_point, right_eye_point = facial_landmarks_detection_model.preprocess_output(
                outputs, out_frame, face, preview_flag)

            ## Head Pose Estimation Model
            image = head_pose_estimation_model.preprocess_input(crop_image)

            start_time = time.time()
            outputs = head_pose_estimation_model.predict(image)
            head_pose_infer_time += (time.time() - start_time)
            out_frame, headpose_angels_list = head_pose_estimation_model.preprocess_output(
                outputs, out_frame, preview_flag)

            ## Gaze Estimation Model
            out_frame, left_eye, right_eye = gaze_estimation_model.preprocess_input(
                out_frame, crop_image, left_eye_point, right_eye_point)

            start_time = time.time()
            outputs = gaze_estimation_model.predict(left_eye, right_eye,
                                                    headpose_angels_list)
            gaze_infer_time += (time.time() - start_time)
            out_frame, gazevector = gaze_estimation_model.preprocess_output(
                outputs, out_frame, face, left_eye_point, right_eye_point,
                preview_flag)

            cv2.imshow("Computer Pointer Control", out_frame)
            out_video.write(out_frame)
            mouse_control.move(gazevector[0], gazevector[1])

        if key_pressed == 27:
            break

    if frame_count > 0:
        logging.info("*********** Model Inference Time ****************")
        logging.info("Face Detection Model: {:.1f} ms.".format(
            1000 * face_detect_infer_time / frame_count))
        logging.info("Facial Landmarks Detection Model: {:.1f} ms.".format(
            1000 * facial_landmarks_infer_time / frame_count))
        logging.info("Head Pose Detection Model: {:.1f} ms.".format(
            1000 * head_pose_infer_time / frame_count))
        logging.info("Gaze Detection Model: {:.1f} ms.".format(
            1000 * gaze_infer_time / frame_count))
        logging.info("*********** Model Inference Completed ***********")

    total_infer_time = time.time() - start_inference_time
    total_inference_time = round(total_infer_time, 1)
    fps = frame_count / total_inference_time

    with open(os.path.join(output_path, 'stats.txt'), 'w') as f:
        f.write(str(total_inference_time) + '\n')
        f.write(str(fps) + '\n')
        f.write(str(total_model_load_time) + '\n')

    logging.info("*********** Total Summary ****************")
    logging.info(f"Total Model Load Time: {total_model_load_time}")
    logging.info(f"Total Inference Time: {total_inference_time}")
    logging.info(f"FPS: {fps}")
    logging.info("*********** Total Summary ***********")
    logging.info("*********** ************************* ***********")

    feeder.close()
    cv2.destroyAllWindows()
def main():
    args = build_argparser().parse_args()

    preview_flags = args.preview_flags

    logger = logging.getLogger()
    input_path = args.input

    if input_path.lower() == 'cam':
        input_feed = InputFeeder('cam')
    else:
        if not os.path.isfile(input_path):
            logger.error('Unable to find specified video file')
            exit(1)
        file_extension = input_path.split(".")[-1]
        if (file_extension in ['jpg', 'jpeg', 'bmp']):
            input_feed = InputFeeder('image', input_path)
        elif (file_extension in ['avi', 'mp4']):
            input_feed = InputFeeder('video', input_path)
        else:
            logger.error(
                "Unsupported file Extension. Allowed ['jpg', 'jpeg', 'bmp', 'avi', 'mp4']"
            )
            exit(1)

    if sys.platform == "linux" or sys.platform == "linux2":
        #CODEC = 0x00000021
        CODEC = cv2.VideoWriter_fourcc(*"mp4v")
    elif sys.platform == "darwin":
        CODEC = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
    else:
        print("Unsupported OS.")
        exit(1)

    file_flag = False
    if args.output_file.lower() == 'y':
        file_flag = True
        out = cv2.VideoWriter('output.mp4', CODEC, 30,
                              (FRAME_WIDTH, FRAME_HEIGHT))

    modelPathDict = {
        'face_detect': args.face_detection_model,
        'face_landmark_regress': args.facial_landmark_model,
        'head_pose': args.head_pose_model,
        'gaze_estimate': args.gaze_estimation_model
    }

    for pathname in modelPathDict:
        if not os.path.isfile(modelPathDict[pathname]):
            logger.error('Unable to find specified ' + pathname + ' xml file')
            exit(1)

    #initializing models
    fdm = FaceDetectionModel(modelPathDict['face_detect'], args.device,
                             args.cpu_extension)
    fldm = FacialLandmarksDetectionModel(
        modelPathDict['face_landmark_regress'], args.device,
        args.cpu_extension)
    hpem = HeadPoseEstimationModel(modelPathDict['head_pose'], args.device,
                                   args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['gaze_estimate'], args.device,
                              args.cpu_extension)

    #initializing mouse controller
    mouse_controller = MouseController('medium', 'fast')

    input_feed.load_data()

    #checking models
    fdm.check_model()
    fldm.check_model()
    hpem.check_model()
    gem.check_model()

    #loading models / creating executable network
    fdm.load_model()
    fldm.load_model()
    hpem.load_model()
    gem.load_model()

    frame_count = 0
    for ret, frame in input_feed.next_batch():
        if not ret:
            break

        frame_count += 1

        key = cv2.waitKey(60)
        """
        Sequence of model execution:-
        1. Predict from each model.
        2. Preprocess of outputs from each model.
        3. Send the processed output to the next model.

        Model Sequence:- 
                                -   Head Pose Estimation Model      -
        Face Detection Model <(First Head Pose and Then Facial Landmark)>Gaze Estimation Model 
                                -   Facial Landmark Detection Model -  
        """

        cropped_face, face_coords = fdm.preprocess_output(
            frame.copy(), fdm.predict(frame.copy()), args.prob_threshold)

        if type(cropped_face) == int:
            logger.error('Unable to detect the face.')
            if key == 27:
                break
            continue

        hp_out = hpem.preprocess_output(hpem.predict(cropped_face.copy()))

        left_eye, right_eye, eye_coords = fldm.preprocess_output(
            cropped_face.copy(), fldm.predict(cropped_face.copy()))

        new_mouse_coord, gaze_vector = gem.preprocess_output(
            gem.predict(left_eye, right_eye, hp_out), hp_out)

        if (not len(preview_flags) == 0) or file_flag:
            preview_frame = frame.copy()

            if 'fd' in preview_flags:
                preview_frame = cv2.rectangle(preview_frame,
                                              (face_coords[0], face_coords[1]),
                                              (face_coords[2], face_coords[3]),
                                              (0, 0, 255), 3)
                cropped_face = preview_frame[face_coords[1]:face_coords[3],
                                             face_coords[0]:face_coords[2]]

            if 'fld' in preview_flags:
                cropped_face = cv2.rectangle(
                    cropped_face,
                    (eye_coords[0][0] - 10, eye_coords[0][1] - 10),
                    (eye_coords[0][2] + 10, eye_coords[0][3] + 10),
                    (0, 255, 0), 3)
                cropped_face = cv2.rectangle(
                    cropped_face,
                    (eye_coords[1][0] - 10, eye_coords[1][1] - 10),
                    (eye_coords[1][2] + 10, eye_coords[1][3] + 10),
                    (0, 255, 0), 3)

                preview_frame[face_coords[1]:face_coords[3],
                              face_coords[0]:face_coords[2]] = cropped_face

            if 'hp' in preview_flags:
                cv2.putText(
                    preview_frame,
                    'Pose Angles: yaw: {:.2f} | pitch: {:.2f} | roll: {:.2f}'.
                    format(hp_out[0], hp_out[1], hp_out[2]), (20, 40),
                    cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 2)

            if 'ge' in preview_flags:
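                # Project the gaze vector's (x, y) into image space; y is
                # negated because image coordinates grow downward while the
                # gaze vector's y axis points up.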

                x, y = int(gaze_vector[0] * GAZE_ARROW_LENGTH), -int(
                    gaze_vector[1] * GAZE_ARROW_LENGTH)

                le_mid_x = int((eye_coords[0][0] + eye_coords[0][2]) / 2)
                le_mid_y = int((eye_coords[0][1] + eye_coords[0][3]) / 2)
                re_mid_x = int((eye_coords[1][0] + eye_coords[1][2]) / 2)
                re_mid_y = int((eye_coords[1][1] + eye_coords[1][3]) / 2)

                cv2.arrowedLine(cropped_face, (le_mid_x, le_mid_y),
                                ((le_mid_x + x), (le_mid_y + y)), (255, 0, 0),
                                GAZE_ARROW_WIDTH)
                cv2.arrowedLine(cropped_face, (re_mid_x, re_mid_y),
                                ((re_mid_x + x), (re_mid_y + y)), (255, 0, 0),
                                GAZE_ARROW_WIDTH)

                preview_frame[face_coords[1]:face_coords[3],
                              face_coords[0]:face_coords[2]] = cropped_face

            if preview_flags and frame_count % 2 == 0:
                if args.zoomed:
                    cv2.imshow(
                        'Cropped Face',
                        cv2.resize(cropped_face, (FRAME_WIDTH, FRAME_HEIGHT)))
                else:
                    cv2.imshow(
                        'Preview',
                        cv2.resize(preview_frame, (FRAME_WIDTH, FRAME_HEIGHT)))

            if file_flag:
                out.write(
                    cv2.resize(preview_frame, (FRAME_WIDTH, FRAME_HEIGHT)))

        #move the mouse pointer
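        # pyautogui's fail-safe raises FailSafeException when the pointer
        # reaches a screen corner; it is swallowed so the demo keeps running.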
        try:
            mouse_controller.move(new_mouse_coord[0], new_mouse_coord[1])
        except pyautogui.FailSafeException:
            pass

        if frame_count % 2 == 0 and not preview_flags:
            cv2.imshow('Video', cv2.resize(frame, (FRAME_WIDTH, FRAME_HEIGHT)))

        if key == 27:
            break

    logger.info('VideoStream ended.')
    if args.output_file.lower() == 'y':
        out.release()
    input_feed.close()
    cv2.destroyAllWindows()
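
A minimal sketch of the MouseController wrapper shared by these examples, assuming pyautogui; the precision/speed values below are illustrative, not the project's exact numbers.

import pyautogui


class MouseController:
    def __init__(self, precision, speed):
        # precision scales each relative move; speed sets the move duration
        precision_dict = {'high': 100, 'low': 1000, 'medium': 500}
        speed_dict = {'fast': 1, 'slow': 10, 'medium': 5}
        self.precision = precision_dict[precision]
        self.speed = speed_dict[speed]

    def move(self, x, y):
        # Move relative to the current pointer; y is negated so a positive
        # gaze y moves the pointer up the screen.
        pyautogui.moveRel(x * self.precision, -1 * y * self.precision,
                          duration=self.speed)
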
Example #24
def main():
    args = build_argparser().parse_args()
    logger = logging.getLogger()

    # Visualization flags requested on the CLI; the attribute name on args is
    # assumed here, defaulting to no previews.
    previewFlags = getattr(args, 'previewFlags', [])

    if args.input_type in ('video', 'image'):
        feeder = InputFeeder(args.input_type, args.input)
    elif args.input_type == 'cam':
        feeder = InputFeeder(args.input_type)

    mc = MouseController("medium", "fast")
    feeder.load_data()

    face_model = FaceDetectionModel(args.facedetectionmodel, args.device,
                                    args.cpu_extension)
    face_model.check_model()

    landmark_model = Landmark_Model(args.facelandmarkmodel, args.device,
                                    args.cpu_extension)
    landmark_model.check_model()

    gaze_model = Gaze_Estimation_Model(args.gazeestimationmodel, args.device,
                                       args.cpu_extension)
    gaze_model.check_model()

    head_model = Head_Pose_Model(args.headposemodel, args.device,
                                 args.cpu_extension)
    head_model.check_model()

    face_model.load_model()
    logger.info("Face Detection Model Loaded...")

    landmark_model.load_model()
    logger.info("Landmark Detection Model Loaded...")

    head_model.load_model()
    logger.info("Head Pose Detection Model Loaded...")

    gaze_model.load_model()
    logger.info("Gaze Estimation Model Loaded...")

    logger.info('All Models are loaded\n\n')
    out = cv2.VideoWriter('output_video.mp4', 0x00000021, 30,
                          (500, 500))  # 0x00000021 is a numeric MP4 codec tag

    frame_count = 0
    for ret, frame in feeder.next_batch():
        if not ret:
            break
        frame_count += 1

        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))
        key = cv2.waitKey(60)

        faceROI, box = face_model.predict(frame.copy(), args.prob_threshold)
        if faceROI is None:
            logger.error("Unable to detect the face.")
            if key == 27:
                break
            continue

        (lefteye_x, lefteye_y), (
            righteye_x, righteye_y
        ), eye_coords, left_eye, right_eye = landmark_model.predict(
            faceROI.copy(), EYE_ROI=10)
        head_position = head_model.predict(faceROI.copy())
        new_mouse_coord, gaze_vector = gaze_model.predict(
            left_eye.copy(), right_eye.copy(), head_position)

        if previewFlags:
            preview_frame = frame.copy()
            if 'fd' in previewFlags:
                #cv2.rectangle(preview_frame, (box[0], box[1]), (box[2], box[3]), (255,0,0), 3)
                preview_frame = faceROI
            if 'fld' in previewFlags:
                cv2.rectangle(faceROI,
                              (eye_coords[0][0] - 10, eye_coords[0][1] - 10),
                              (eye_coords[0][2] + 10, eye_coords[0][3] + 10),
                              (0, 255, 0), 3)
                cv2.rectangle(faceROI,
                              (eye_coords[1][0] - 10, eye_coords[1][1] - 10),
                              (eye_coords[1][2] + 10, eye_coords[1][3] + 10),
                              (0, 255, 0), 3)
                #preview_frame[box[1]:box[3], box[0]:box[2]] = faceROI
            if 'hp' in previewFlags:
                cv2.putText(
                    preview_frame,
                    "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                    format(head_position[0], head_position[1],
                           head_position[2]), (10, 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)
            if 'ge' in previewFlags:
                x, y, w = (int(gaze_vector[0] * 12),
                           int(gaze_vector[1] * 12), 160)
                le = cv2.line(left_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                re = cv2.line(right_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                faceROI[eye_coords[0][1]:eye_coords[0][3],
                        eye_coords[0][0]:eye_coords[0][2]] = le
                faceROI[eye_coords[1][1]:eye_coords[1][3],
                        eye_coords[1][0]:eye_coords[1][2]] = re
                #preview_frame[box[1]:box[3], box[0]:box[2]] = faceROI
            #cv2.imshow("visualization", cv2.resize(preview_frame, (500, 500)))
            out.write(frame)

        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break

    logger.info("VideoStream ended...")
    out.release()
    cv2.destroyAllWindows()
    feeder.close()
Example #25
def main():

    args = build_argparser().parse_args()
    Flags = args.Flags

    logger = logging.getLogger()
    inputFilePath = args.input
    inputFeeder = None

    if inputFilePath.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFilePath):
            logger.error("Unable to find specified video file")
            exit(1)
        inputFeeder = InputFeeder("video", inputFilePath)

    Dir = {
        'FaceDetectionModel': args.facedetectionmodel,
        'FacialLandmarksDetectionModel': args.faciallandmarkmodel,
        'GazeEstimationModel': args.gazeestimationmodel,
        'HeadPoseEstimationModel': args.headposemodel
    }

    for fileKey in Dir.keys():
        if not os.path.isfile(Dir[fileKey]):
            logger.error("Unable to find specified " + fileKey + " xml file")
            exit(1)

    Fd = FaceDetectionModel(Dir['FaceDetectionModel'], args.device,
                            args.cpu_extension)
    Fl = FacialLandmarksDetectionModel(Dir['FacialLandmarksDetectionModel'],
                                       args.device, args.cpu_extension)
    Ge = GazeEstimationModel(Dir['GazeEstimationModel'], args.device,
                             args.cpu_extension)
    Hp = HeadPoseEstimationModel(Dir['HeadPoseEstimationModel'], args.device,
                                 args.cpu_extension)
    Mc = MouseController('medium', 'fast')

    ## Loading part starts here
    #start_model_load_time=time.time()

    inputFeeder.load_data()
    Fd.load_model()
    Fl.load_model()
    Hp.load_model()
    Ge.load_model()

    #total_model_load_time = time.time() - start_model_load_time

    count = 0
    #start_inference_time=time.time()

    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        count += 1

        if count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        croppedFace, face_coords = Fd.predict(frame.copy(),
                                              args.prob_threshold)

        if isinstance(croppedFace, int):  # Fd.predict signals "no face" with an int
            logger.error("Unable to detect the face.")
            if key == 27:
                break
            continue

        hp_out = Hp.predict(croppedFace.copy())

        l_eye, r_eye, eye_coords = Fl.predict(
            croppedFace.copy())  # returns the cropped eyes and their coordinates

        new_coord, gaze_vector = Ge.predict(l_eye, r_eye, hp_out)

        #total_time=time.time()-start_inference_time
        #total_inference_time=round(total_time, 1)

        #fps=count/total_inference_time

        ## Render whichever visualization flags were requested

        if Flags:
            new_frame = frame.copy()
            if 'fd' in Flags:
                new_frame = croppedFace

            if 'fl' in Flags:
                cv2.rectangle(croppedFace,
                              (eye_coords[0][0] - 10, eye_coords[0][1] - 10),
                              (eye_coords[0][2] + 10, eye_coords[0][3] + 10),
                              (0, 255, 0), 3)
                cv2.rectangle(croppedFace,
                              (eye_coords[1][0] - 10, eye_coords[1][1] - 10),
                              (eye_coords[1][2] + 10, eye_coords[1][3] + 10),
                              (0, 255, 0), 3)

            if 'hp' in Flags:
                cv2.putText(
                    new_frame,
                    "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                    format(hp_out[0], hp_out[1], hp_out[2]), (10, 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)

            if 'ge' in Flags:
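                # Scale the gaze vector's (x, y) and draw an X across each
                # eye crop to visualize the gaze direction.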
                x, y, w = (int(gaze_vector[0] * 12),
                           int(gaze_vector[1] * 12), 160)
                le = cv2.line(l_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                re = cv2.line(r_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                croppedFace[eye_coords[0][1]:eye_coords[0][3],
                            eye_coords[0][0]:eye_coords[0][2]] = le
                croppedFace[eye_coords[1][1]:eye_coords[1][3],
                            eye_coords[1][0]:eye_coords[1][2]] = re

            cv2.imshow("visualization", cv2.resize(new_frame, (500, 500)))

        if count % 5 == 0:
            Mc.move(new_coord[0], new_coord[1])
        if key == 27:
            break

    logger.info("Video Done...")
    # print(total_inference_time)
    # print(fps)
    #print(total_model_load_time)

    cv2.destroyAllWindows()
    inputFeeder.close()
def main_benchmark(args):
    feed = InputFeeder(input_type=args.it, input_file=args.i)

    face_model = FaceDetectionModel(args.fm, args.d, args.c, float(args.p))
    start_time = time.time()
    face_model.load_model()
    face_load_model_time = time.time() - start_time

    landmarks_model = LandmarksDetectionModel(args.lm, args.d, args.c)
    start_time = time.time()
    landmarks_model.load_model()
    landmarks_model_time = time.time() - start_time

    headpose_model = HeadPoseDetectionModel(args.hpm, args.d, args.c)
    start_time = time.time()
    headpose_model.load_model()
    headpose_model_time = time.time() - start_time

    gaze_model = GazeEstimationModel(args.gem, args.d, args.c)
    start_time = time.time()
    gaze_model.load_model()
    gaze_model_time = time.time() - start_time

    feed.load_data()
    for batch in feed.next_batch():
        try:
            start_time = time.time()
            cropped_face, coords, face_time_prediction = face_model.predict(
                batch)
            cv2.rectangle(batch, (coords[0], coords[1]),
                          (coords[2], coords[3]), (255, 0, 0), 2)
            io_face_model_time = time.time() - start_time

            start_time = time.time()
            left_eye, right_eye, eyes_coords, landmarks_time_prediction = landmarks_model.predict(
                cropped_face)
            io_landmarks_model_time = time.time() - start_time

            start_time = time.time()
            head_pose_angles, headpose_time_prediction = headpose_model.predict(
                cropped_face)
            io_head_pose_model_time = time.time() - start_time

            start_time = time.time()
            x, y, z, gaze_time_prediction = gaze_model.predict(
                left_eye, right_eye, head_pose_angles, cropped_face,
                eyes_coords)
            io_gaze_model_time = time.time() - start_time

            print("Graphing loading time...")
            graph_loading_time(face_load_model_time, landmarks_model_time,
                               headpose_model_time, gaze_model_time, args.bm)
            print("Graphing io processing time...")
            graph_io_processing_time(io_face_model_time,
                                     io_landmarks_model_time,
                                     io_head_pose_model_time,
                                     io_gaze_model_time, args.bm)
            print("Graphing inference time...")
            graph_model_inference_time(face_time_prediction,
                                       landmarks_time_prediction,
                                       headpose_time_prediction,
                                       gaze_time_prediction, args.bm)
            print("Done")

            break

        except Exception:
            print("Frame without prediction. Error: ", sys.exc_info()[0])
            log.error(sys.exc_info()[0])
    feed.close()
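
The graph_* helpers called above are not defined in this section; a minimal sketch of one, assuming matplotlib and that args.bm names the output directory:

import os

import matplotlib.pyplot as plt


def graph_loading_time(face_t, landmarks_t, headpose_t, gaze_t, out_dir):
    # One bar per model; load time in seconds on the y axis.
    plt.figure()
    plt.bar(['face', 'landmarks', 'head pose', 'gaze'],
            [face_t, landmarks_t, headpose_t, gaze_t])
    plt.ylabel('load time (s)')
    plt.title('Model loading time')
    plt.savefig(os.path.join(out_dir, 'loading_time.png'))
    plt.close()
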
Example #27
def main():
    args = build_argparser().parse_args()
    logging.basicConfig(filename=os.path.join(args.output, 'app.log'),
                        filemode='w')

    print("Begin: Try not to move mouse with your hands")
    mc = MouseController("low", "fast")
    if args.input == "cam":
        frames = InputFeeder("cam")
    else:
        frames = InputFeeder("video", args.input)
    cap = frames.load_data()

    if args.display:
        initial_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        initial_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        video_len = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        out_video = cv2.VideoWriter(
            os.path.join(args.output, 'output_video.mp4'),
            cv2.VideoWriter_fourcc('m', 'p', '4', 'v'), fps,
            (initial_w, initial_h))

    face_model = FaceDetectionModel(args.face_model, args.output, args.device)
    pose_model = HeadPoseEstimationModel(args.pose_model, args.output, args.device)
    landmarks_model = FacialLandmarksDetectionModel(args.landmarks_model, args.output, args.device)
    gaze_model = GazeEstimationModel(args.gaze_model, args.output, args.device)
    avg_out = 0
    avg = 0
    tmlt_face_avg = 0
    tinpt_face_avg = 0
    tint_face_avg = 0
    toutt_face_avg = 0

    tmlt_pose_avg = 0
    tinpt_pose_avg = 0
    tint_pose_avg = 0
    toutt_pose_avg = 0

    tmlt_landmarks_avg = 0
    tinpt_landmarks_avg = 0
    tint_landmarks_avg = 0
    toutt_landmarks_avg = 0

    tmlt_gaze_avg = 0
    tinpt_gaze_avg = 0
    tint_gaze_avg = 0
    toutt_gaze_avg = 0
    logging.info("Frames starting")
    for frame in frames.next_batch():
        if frame is None:
            logging.error("Received an empty frame, skipping")
            continue
        output_image = frame.copy()
        cropped_faces, tmlt_face, tinpt_face, tint_face, toutt_face = face_model.predict(frame)
        try:
            largest_face = cropped_faces[0]
            for face in cropped_faces:
                if largest_face.size < face.size:
                    largest_face = face
            pose, tmlt_pose, tinpt_pose, tint_pose, toutt_pose = pose_model.predict(largest_face)
            landmarks, tmlt_landmarks, tinpt_landmarks, tint_landmarks, toutt_landmarks = landmarks_model.predict(largest_face)
            gaze_vector, tmlt_gaze, tinpt_gaze, tint_gaze, toutt_gaze = gaze_model.predict(largest_face, landmarks, pose)
        except Exception as e:
            logging.error("Model inference failed: " + str(e))
            # print(e)
            continue
        if args.display:
            output_image, xmin, ymin = face_model.draw_crop_outputs(output_image, args.display)
            output_image = gaze_model.display_eye_boxes(output_image, landmarks, xmin, ymin, args.display)
            out_video.write(output_image)
        cv2.imshow("output_image", output_image)
        cv2.waitKey(15)
        face_model.coords = []
        tmlt_face_avg += tmlt_face
        tinpt_face_avg += tinpt_face
        tint_face_avg += tint_face
        toutt_face_avg += toutt_face

        tmlt_pose_avg += tmlt_pose
        tinpt_pose_avg += tinpt_pose
        tint_pose_avg += tint_pose
        toutt_pose_avg += toutt_pose

        tmlt_landmarks_avg += tmlt_landmarks
        tinpt_landmarks_avg+= tinpt_landmarks
        tint_landmarks_avg += tint_landmarks
        toutt_landmarks_avg += toutt_landmarks

        if gaze_vector is None:
            avg_out += 1
            continue
        tmlt_gaze_avg += tmlt_gaze
        tinpt_gaze_avg += tinpt_gaze
        tint_gaze_avg += tint_gaze
        toutt_gaze_avg += toutt_gaze
        avg += 1
        gaze_vector_norm = gaze_vector / np.linalg.norm(gaze_vector)
        try:
            mc.move(gaze_vector_norm[0], gaze_vector_norm[1])
        except Exception as e:
            logging.error("Gaze failed: " + str(e))
            # print(e)
            continue

    file_name = "stats_" + args.precision + ".txt"
    save_path = os.path.join(os.getcwd(), args.output)

    # face/pose/landmarks stats were accumulated on every predicted frame,
    # gaze stats only on frames that produced a gaze vector; guard against
    # division by zero when nothing was processed.
    n_frames = max(avg + avg_out, 1)
    n_gaze_frames = max(avg, 1)

    with open(os.path.join(save_path, file_name), "w") as f:
        f.write("Benchmark Start:" + "\n\n")

        f.write("Face Detection Model stats" + "\n")
        f.write("Total model Load Time:" + str(tmlt_face_avg / n_frames) + "\n")
        f.write("Total Input Time:" + str(tinpt_face_avg / n_frames) + "\n")
        f.write("Total Inference Time:" + str(tint_face_avg / n_frames) + "\n")
        f.write("Total Output Time:" + str(toutt_face_avg / n_frames) + "\n\n")

        f.write("Head Pose Estimation Model stats" + "\n")
        f.write("Total model Load Time:" + str(tmlt_pose_avg / n_frames) + "\n")
        f.write("Total Input Time:" + str(tinpt_pose_avg / n_frames) + "\n")
        f.write("Total Inference Time:" + str(tint_pose_avg / n_frames) + "\n")
        f.write("Total Output Time:" + str(toutt_pose_avg / n_frames) + "\n\n")

        f.write("Facial Landmarks Detection Model stats" + "\n")
        f.write("Total model Load Time:" + str(tmlt_landmarks_avg / n_frames) + "\n")
        f.write("Total Input Time:" + str(tinpt_landmarks_avg / n_frames) + "\n")
        f.write("Total Inference Time:" + str(tint_landmarks_avg / n_frames) + "\n")
        f.write("Total Output Time:" + str(toutt_landmarks_avg / n_frames) + "\n\n")

        f.write("Gaze Estimation Model stats" + "\n")
        f.write("Total model Load Time:" + str(tmlt_gaze_avg / n_gaze_frames) + "\n")
        f.write("Total Input Time:" + str(tinpt_gaze_avg / n_gaze_frames) + "\n")
        f.write("Total Inference Time:" + str(tint_gaze_avg / n_gaze_frames) + "\n")
        f.write("Total Output Time:" + str(toutt_gaze_avg / n_gaze_frames) + "\n\n")
        f.write("Benchmark end" + "\n")

    print("Thank you, Goodbye")
    frames.close()
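
A minimal sketch of the InputFeeder interface most of these examples rely on, yielding (ret, frame) tuples; note the example above iterates over bare frames instead, so this shape is an assumption:

import cv2


class InputFeeder:
    def __init__(self, input_type, input_file=None):
        # input_type: 'cam' for a webcam, 'video' for a file on disk
        self.input_type = input_type
        self.input_file = input_file

    def load_data(self):
        source = 0 if self.input_type == 'cam' else self.input_file
        self.cap = cv2.VideoCapture(source)
        return self.cap

    def next_batch(self):
        # Yield (ret, frame) until the stream is exhausted.
        while self.cap.isOpened():
            ret, frame = self.cap.read()
            yield ret, frame
            if not ret:
                break

    def close(self):
        self.cap.release()
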
Example #28
def main():
    # get command line args
    args = build_argparser().parse_args()

    logger = log.getLogger()

    type_input = args.input

    if type_input.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        inputFeeder = InputFeeder("video", args.input)

    inputFeeder.load_data()

    mc = MouseController("medium", "fast")

    fdm = FaceDetectionModel(model_name=args.face_dectection_model,
                             device=args.device,
                             extensions=args.cpu_extension,
                             threshold=args.prob_threshold)
    fldm = FacialLandmarksModel(model_name=args.face_landmarks_model,
                                device=args.device,
                                extensions=args.cpu_extension)
    gem = GazeEstimationModel(model_name=args.gaze_estimation_model,
                              device=args.device,
                              extensions=args.cpu_extension)
    hpem = HeadPoseEstimationModel(model_name=args.head_pose_model,
                                   device=args.device,
                                   extensions=args.cpu_extension)
    data_capture = {}

    start_time = time.time()
    fdm.load_model()
    fdm_load_time = time.time()
    fldm.load_model()
    fldm_load_time = time.time()
    hpem.load_model()
    hpem_load_time = time.time()
    gem.load_model()
    gem_load_time = time.time()

    data_capture['FaceDetectionModel_loadtime'] = round(
        (fdm_load_time - start_time) * 1000, 3)
    data_capture['FacialLandmarksModel_loadtime'] = round(
        (fldm_load_time - fdm_load_time) * 1000, 3)
    data_capture['HeadPoseEstimationModel_loadtime'] = round(
        (hpem_load_time - fldm_load_time) * 1000, 3)
    data_capture['GazeEstimationModel_loadtime'] = round(
        (gem_load_time - hpem_load_time) * 1000, 3)

    for flag, frame in inputFeeder.next_batch():
        if not flag:
            break

        pressedKey = cv2.waitKey(60)

        start_infer_time = time.time()  # time to start inference
        face_coords, face_img = fdm.predict(frame)
        fdm_infertime = time.time()

        if face_coords == 0:  # if face not detected
            continue

        hpem_out = hpem.predict(face_img)
        hpem_infertime = time.time()

        left_eye, right_eye, eye_coord = fldm.predict(face_img)
        fldm_infertime = time.time()

        if not left_eye.any() or not right_eye.any():  # eyes not detected
            continue

        mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hpem_out)
        gem_infertime = time.time()

        if args.preview:
            output_boxes(frame, (face_coords[0], face_coords[1]),
                         (face_coords[2], face_coords[3]))

            bound_boxes(frame, eye_coord, 45, 25, face_coords[0],
                        face_coords[1])

            text = "Yaw: {:.2f}, Pitch: {:+.2f}, Roll: {:.2f}".format(
                hpem_out[0], hpem_out[1], hpem_out[2])

            output_text(frame, text, (100, 100))

            h = frame.shape[0]
            w = frame.shape[1]

            center_of_face = (h / 2, w / 2, 0)

            draw_axes(frame,
                      center_of_face,
                      hpem_out[0],
                      hpem_out[1],
                      hpem_out[2],
                      scale=50,
                      focal_length=950)

        cv2.imshow('video', cv2.resize(frame, (500, 500)))
        mc.move(mouse_coord[0], mouse_coord[1])

        if pressedKey == 27:
            break

    # Note: these per-model timings reflect only the last processed frame.
    data_capture['FaceDetectionModel_Inferencetime'] = round(
        (fdm_infertime - start_infer_time) * 1000, 3)
    data_capture['HeadPoseEstimationModel_Inferencetime'] = round(
        (hpem_infertime - fdm_infertime) * 1000, 3)
    data_capture['FacialLandmarksModel_Inferencetime'] = round(
        (fldm_infertime - hpem_infertime) * 1000, 3)
    data_capture['GazeEstimationModel_Inferencetime'] = round(
        (gem_infertime - fldm_infertime) * 1000, 3)

    total_time = round((time.time() - start_infer_time) * 1000, 3)
    data_capture['Total_time'] = total_time

    df = pd.DataFrame.from_dict(data_capture,
                                orient='index',
                                columns=['time(msecs)'])
    df.to_csv("results.csv")

    logger.info("Video has ended...")
    cv2.destroyAllWindows()
    inputFeeder.close()
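
The draw_axes helper used in the preview above is not defined in this section; a minimal sketch under the same signature, assuming angles in degrees and a simple pinhole projection:

import cv2
import numpy as np


def draw_axes(frame, center, yaw, pitch, roll, scale=50, focal_length=950):
    yaw, pitch, roll = (np.radians(a) for a in (yaw, pitch, roll))
    # Rotation matrices about the x (pitch), y (yaw) and z (roll) axes.
    rx = np.array([[1, 0, 0],
                   [0, np.cos(pitch), -np.sin(pitch)],
                   [0, np.sin(pitch), np.cos(pitch)]])
    ry = np.array([[np.cos(yaw), 0, np.sin(yaw)],
                   [0, 1, 0],
                   [-np.sin(yaw), 0, np.cos(yaw)]])
    rz = np.array([[np.cos(roll), -np.sin(roll), 0],
                   [np.sin(roll), np.cos(roll), 0],
                   [0, 0, 1]])
    r = rz @ ry @ rx
    cx, cy = int(center[1]), int(center[0])  # center arrives as (h/2, w/2, 0)
    axes = np.float32([[scale, 0, 0], [0, -scale, 0], [0, 0, -scale]])
    colors = [(0, 0, 255), (0, 255, 0), (255, 0, 0)]  # x red, y green, z blue
    for axis, color in zip(axes, colors):
        p = r @ axis
        # Project the rotated axis endpoint onto the image plane.
        x2 = int(cx + focal_length * p[0] / (p[2] + focal_length))
        y2 = int(cy + focal_length * p[1] / (p[2] + focal_length))
        cv2.line(frame, (cx, cy), (x2, y2), color, 2)
    return frame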