Esempio n. 1
0
def main():
    """Drive the mouse pointer from gaze estimated on a video file or webcam.

    Parses the command line, verifies that the input video and the four
    model files exist, loads the models and then, for every ``(ret, frame)``
    pair yielded by the feeder, chains face detection, facial-landmark
    detection, head-pose estimation and gaze estimation.  Each processed
    frame is previewed and written to ``output_video.mp4``; every fifth
    frame moves the mouse.  Timing statistics are written to ``stats.txt``
    inside ``args.output_path`` and logged.

    Raises:
        SystemExit: with status 1 when the input video or a model file
            cannot be found.
    """
    args = build_argparser().parse_args()
    logger = logging.getLogger('main')

    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'FacialLandmarksModel': args.facialLandmarksModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }

    bbox_flag = args.bbox_flag
    input_filename = args.input
    device_name = args.device
    prob_threshold = args.prob_threshold
    output_path = args.output_path

    if input_filename.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger.error("Unable to find specified video file")
            raise SystemExit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)

    for model_path in model_path_dict.values():
        if not os.path.isfile(model_path):
            logger.error("Unable to find specified model file: %s",
                         model_path)
            raise SystemExit(1)

    face_detection_model = Face_detection(
        model_path_dict['FaceDetectionModel'],
        device_name,
        threshold=prob_threshold)
    facial_landmarks_detection_model = Landmark_Detection(
        model_path_dict['FacialLandmarksModel'],
        device_name,
        threshold=prob_threshold)
    head_pose_estimation_model = Head_pose(
        model_path_dict['HeadPoseEstimationModel'],
        device_name,
        threshold=prob_threshold)
    gaze_estimation_model = Gaze_estimation(
        model_path_dict['GazeEstimationModel'],
        device_name,
        threshold=prob_threshold)

    # Flip to True to measure inference without touching the mouse.
    is_benchmarking = False
    mouse_controller = None
    if not is_benchmarking:
        mouse_controller = MouseController('medium', 'fast')

    start_model_load_time = time.time()
    face_detection_model.load_model()
    facial_landmarks_detection_model.load_model()
    head_pose_estimation_model.load_model()
    gaze_estimation_model.load_model()
    total_model_load_time = time.time() - start_model_load_time

    feeder.load_data()

    # NOTE(review): the writer is opened at a fixed 1920x1080 while raw
    # frames are written below; confirm the input really has that size.
    # max(1, ...) keeps the writer usable for sources slower than 10 FPS.
    out_video = cv2.VideoWriter(os.path.join('output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                max(1, int(feeder.get_fps() / 10)),
                                (1920, 1080), True)

    frame_count = 0
    start_inference_time = time.time()
    try:
        for ret, frame in feeder.next_batch():

            if not ret:
                break

            frame_count += 1

            key = cv2.waitKey(60)

            try:
                face_coords, image_copy = face_detection_model.predict(frame)

                # An int in place of the cropped face means no detection.
                if isinstance(image_copy, int):
                    logger.warning("Unable to detect the face")
                    if key == 27:
                        break
                    continue

                left_eye, right_eye, eye_coords = (
                    facial_landmarks_detection_model.predict(image_copy))
                hp_output = head_pose_estimation_model.predict(image_copy)
                mouse_coords, gaze_coords = gaze_estimation_model.predict(
                    left_eye, right_eye, hp_output)

            except Exception as e:
                # Best effort: a frame that fails inference is skipped.
                logger.warning("Could not predict using model " + str(e) +
                               " for frame " + str(frame_count))
                continue

            image = cv2.resize(frame, (500, 500))

            if bbox_flag:
                bbox_frame = draw_bbox(frame, bbox_flag, image_copy, left_eye,
                                       right_eye, face_coords, eye_coords,
                                       hp_output, gaze_coords)
                # Side-by-side preview: raw frame | annotated frame.
                image = np.hstack(
                    (image, cv2.resize(bbox_frame, (500, 500))))

            cv2.imshow('preview', image)
            out_video.write(frame)

            if mouse_controller is not None and frame_count % 5 == 0:
                mouse_controller.move(mouse_coords[0], mouse_coords[1])

            if key == 27:  # Esc
                break

        total_time = time.time() - start_inference_time
    finally:
        # Release the writer so the container is finalised, and free the
        # windows and the capture even when the loop raises.
        out_video.release()
        cv2.destroyAllWindows()
        feeder.close()

    total_inference_time = round(total_time, 1)
    # Very short runs round down to 0.0 seconds; avoid dividing by zero.
    fps = frame_count / total_inference_time if total_inference_time > 0 else 0.0

    try:
        os.makedirs(output_path, exist_ok=True)
    except OSError as error:
        logger.error(error)

    with open(os.path.join(output_path, 'stats.txt'), 'w') as f:
        f.write(str(total_inference_time) + '\n')
        f.write(str(fps) + '\n')
        f.write(str(total_model_load_time) + '\n')

    logger.info('Model load time: ' + str(total_model_load_time))
    logger.info('Inference time: ' + str(total_inference_time))
    logger.info('FPS: ' + str(fps))

    logger.info('Video stream ended')
def run_app(options):
    """Load the four models and run the gaze pipeline over the input stream.

    Each model is constructed and loaded inside its own ``elapsed_timer``
    so its load time is recorded on the metrics builder.  Frames from the
    feeder are then passed through face detection, landmark detection,
    head-pose estimation and gaze estimation; per-model inference times are
    accumulated, the annotated frame is optionally shown and is written to
    ``./results/output_video.mp4``, and the pointer is optionally moved.

    Args:
        options: parsed options; this function reads ``precision``,
            ``fdmodel``, ``ldmodel``, ``hpemodel``, ``gemodel``, ``device``,
            ``prob_threshold``, ``is_visual``, ``extension``, ``is_cam``,
            ``input``, ``is_show_frame`` and ``is_move_pointer``.

    Any exception raised while streaming is logged with its traceback and
    swallowed; the feeder, the video writer and the windows are released
    either way.
    """
    metrics_builder = MetricsBuilder(options.precision)

    with elapsed_timer() as et:
        fdmodel = FaceDetectionModel(options.fdmodel, options.device,
                                     options.prob_threshold, options.is_visual,
                                     options.extension)
        fdmodel.load_model()
        fdmodel_loadtime = et()
        metrics_builder.face_detection.load_time = fdmodel_loadtime
        logging.info(f'face detection loading time taken: {fdmodel_loadtime}')

    with elapsed_timer() as et:
        ldmodel = LandmarkDetectionModel(options.ldmodel, options.device,
                                         options.prob_threshold,
                                         options.is_visual, options.extension)
        ldmodel.load_model()
        ldmodel_loadtime = et()
        metrics_builder.landmarks_detection.load_time = ldmodel_loadtime
        logging.info(
            f'Landmark detection loading time taken: {ldmodel_loadtime}')

    with elapsed_timer() as et:
        hpemodel = HeadPoseEstimationModel(options.hpemodel, options.device,
                                           options.prob_threshold,
                                           options.is_visual,
                                           options.extension)
        hpemodel.load_model()
        hpemodel_loadtime = et()
        metrics_builder.head_pose_estimation.load_time = hpemodel_loadtime
        logging.info(
            f'Head Position Estimation loading time taken: {hpemodel_loadtime}'
        )

    with elapsed_timer() as et:
        gemodel = GazeEstimationModel(options.gemodel, options.device,
                                      options.prob_threshold,
                                      options.is_visual, options.extension)
        gemodel.load_model()
        gemodel_loadtime = et()
        metrics_builder.gaze_estimation.load_time = gemodel_loadtime
        logging.info(
            f'Gazer Estimation loading time taken: {gemodel_loadtime}')

    # Bound before the try so the finally clause can tell what was opened.
    feeder = None
    out_video = None

    try:

        # Get and open video capture
        if options.is_cam:
            feeder = InputFeeder('cam')
        else:
            feeder = InputFeeder('video', options.input)
        feeder.load_data()

        initial_w, initial_h = feeder.get_size()
        fps = feeder.get_fps()

        fdmodel.set_inputsize(initial_w, initial_h)
        ldmodel.set_inputsize(initial_w, initial_h)
        hpemodel.set_inputsize(initial_w, initial_h)
        gemodel.set_inputsize(initial_w, initial_h)

        frame_count = 0

        mouse_controller = MouseController("low", "fast")

        window_name = 'computer pointer controller'
        cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
        cv2.resizeWindow(window_name, initial_w, initial_h)

        out_path = str(pathlib.Path('./results/output_video.mp4'))
        print(out_path)
        out_video = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*'avc1'),
                                    fps, (initial_w, initial_h), True)

        for frame in feeder.next_batch():

            if frame is None:
                break

            # exit video for escape key
            key_pressed = cv2.waitKey(60)
            if key_pressed == 27:
                break

            frame_count += 1

            # detect face
            p_frame = fdmodel.preprocess_input(frame)
            with elapsed_timer() as et:
                fdmodel_output = fdmodel.predict(p_frame)
                metrics_builder.face_detection.add_infer_time(et())
            out_frame, fboxes = fdmodel.preprocess_output(
                fdmodel_output, frame)

            # A frame without a face is not fatal: skip it instead of
            # letting fboxes[0] raise and abort the whole stream.
            if fboxes is None or len(fboxes) == 0:
                logging.warning(f'No face detected in frame {frame_count}')
                continue

            # Take first face - (xmin,ymin,xmax,ymax)
            fbox = fboxes[0]

            # landmarks estimation
            xmin, ymin, xmax, ymax = fbox
            face = frame[ymin:ymax, xmin:xmax]

            # A degenerate or out-of-frame box yields an empty crop that the
            # downstream models cannot consume.
            if face.size == 0:
                logging.warning(f'Empty face crop in frame {frame_count}')
                continue

            p_frame = ldmodel.preprocess_input(face)
            with elapsed_timer() as et:
                lmoutput = ldmodel.predict(p_frame)
                metrics_builder.landmarks_detection.add_infer_time(et())
            out_frame, left_eye_point, right_eye_point = (
                ldmodel.preprocess_output(lmoutput, fbox, out_frame))

            # head pose estimation
            p_frame = hpemodel.preprocess_input(face)
            with elapsed_timer() as et:
                hpoutput = hpemodel.predict(p_frame)
                metrics_builder.head_pose_estimation.add_infer_time(et())
            out_frame, headpose_angles = hpemodel.preprocess_output(
                hpoutput, out_frame, face, fbox)

            # gaze estimation
            out_frame, left_eye, right_eye = gemodel.preprocess_input(
                out_frame, face, left_eye_point, right_eye_point)
            with elapsed_timer() as et:
                geoutput = gemodel.predict(headpose_angles, left_eye,
                                           right_eye)
                metrics_builder.gaze_estimation.add_infer_time(et())
            out_frame, gazevector = gemodel.preprocess_output(
                geoutput, out_frame, fbox, left_eye_point, right_eye_point)

            # show frame
            if options.is_show_frame:
                cv2.imshow(window_name, out_frame)

            # mouse controller
            if options.is_move_pointer:
                x, y, _ = gazevector
                mouse_controller.move(x, y)

            out_video.write(out_frame)

        # performance metrics
        metrics_builder.save_metrics(frame_count)

    except Exception:
        logging.error("Fatal error in main loop", exc_info=True)
    finally:
        # Release the writer so the output file is finalised, and free the
        # capture and the windows on both the success and the error path.
        if out_video is not None:
            out_video.release()
        if feeder is not None:
            feeder.close()
        cv2.destroyAllWindows()
def main():
    """Run the gaze-controlled pointer pipeline and print timing figures.

    Parses the command line, validates the requested preview flags, builds
    and loads the four models (timing each load), then processes the
    ``(ret, frame)`` pairs yielded by the feeder: face detection, landmark
    detection, head-pose estimation and gaze estimation.  Requested overlays
    are drawn on the frame, the frame is shown, and every fifth frame moves
    the mouse.  Load times and the average per-frame inference time are
    printed at the end.

    Raises:
        SystemExit: with status 1 when the input video is missing or an
            unknown preview flag is given.
    """
    args = build_argparser().parse_args()

    # initialize variables with the input arguments for easy access
    fdm = args.face_detection_model
    ldm = args.facial_landmarks_detection_model
    hpem = args.head_pose_estimation_model
    gem = args.gaze_estimation_model
    output_flags = args.output_flags or []
    input_filename = args.input
    device_name = args.device
    prob_threshold = args.prob_threshold
    cpu_extension = args.cpu_extension

    # Reject bad flags before any expensive model loading happens.
    for flag in output_flags:
        if flag not in ('fdm', 'lrm', 'hp', 'gze'):
            log.error("Flag '" + flag + "' is not a valid preview flag.")
            sys.exit(1)

    if input_filename.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            log.error("Unable to find specified video file")
            sys.exit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)

    # initialize model
    face_detection_model = FaceDetect(fdm, device_name, cpu_extension,
                                      prob_threshold)
    landmark_detection_model = FacialLandmarks(ldm, device_name, cpu_extension,
                                               prob_threshold)
    head_pose_estimation_model = HeadPose(hpem, device_name, cpu_extension,
                                          prob_threshold)
    gaze_estimation_model = GazeEstimation(gem, device_name, cpu_extension,
                                           prob_threshold)

    mouse_controller = MouseController('medium', 'fast')

    # load Models, timing each one separately
    start_model_load_time = time.time()
    face_detection_model.load_model()
    log.info("Face Detection Model Loaded...")
    fd_load_time = time.time() - start_model_load_time

    load_start = time.time()
    landmark_detection_model.load_model()
    log.info("landmark_estimation Model Loaded...")
    ld_load_time = time.time() - load_start

    load_start = time.time()
    head_pose_estimation_model.load_model()
    log.info("Head pose estimation model Loaded...")
    hp_load_time = time.time() - load_start

    load_start = time.time()
    gaze_estimation_model.load_model()
    log.info("Gaze_estimation model loaded..")
    ge_load_time = time.time() - load_start

    total_load_time = time.time() - start_model_load_time

    feeder.load_data()
    # Read while the capture is open; it is closed before the report.
    source_fps = int(feeder.get_fps())

    eye_buffer = 10
    frame_count = 0
    start_inference_time = time.time()
    try:
        for ret, frame in feeder.next_batch():

            if not ret:
                break

            frame_count += 1
            key = cv2.waitKey(60)
            try:
                image, _face_coords = face_detection_model.predict(
                    frame, args.prob_threshold)

                # An int in place of the cropped face means no detection.
                if isinstance(image, int):
                    log.warning("Unable to detect the face")
                    if key == 27:
                        break
                    continue

                if 'fdm' in output_flags:
                    cv2.putText(frame, "face detected", (10, 140),
                                cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 255), 4)

                # landmark detection works on the face-detector output
                left_eye_image, right_eye_image, eye_coords = (
                    landmark_detection_model.predict(image))
                log.debug("eye coords: %s", eye_coords)

                if 'lrm' in output_flags:
                    view_eye_rectangle(eye_coords, eye_buffer, image)

                log.debug("eye crops: %s %s", left_eye_image.shape,
                          right_eye_image.shape)

                # head pose estimation
                pose_output = head_pose_estimation_model.predict(image)
                yaw = pose_output[0]
                pitch = pose_output[1]
                roll = pose_output[2]

                if "hp" in output_flags:
                    cv2.putText(
                        frame,
                        "Pose Angles: yaw:{:.2f},  pitch:{:.2f},  roll:{:.2f}".
                        format(yaw, pitch, roll), (10, 40),
                        cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 0), 4)

                mouse_coord, gaze_vector = gaze_estimation_model.predict(
                    left_eye_image, right_eye_image, pose_output)
                if "gze" in output_flags:
                    cv2.putText(
                        frame,
                        "Gaze Cords: x= {:.2f} , y= {:.2f} , z= {:.2f}".format(
                            gaze_vector[0], gaze_vector[1], gaze_vector[2]),
                        (10, 90), cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 255), 4)

            except Exception as e:
                # Best effort: a frame that fails inference is skipped.
                log.warning("Could not predict using model " + str(e) +
                            " for frame " + str(frame_count))
                continue

            cv2.imshow("Visualization", cv2.resize(frame, (500, 500)))

            if frame_count % 5 == 0:
                # The y component is negated before being passed to move().
                mouse_controller.move(mouse_coord[0], -1 * mouse_coord[1])
            if key == 27:  # Esc
                break

        # Measured after the loop so it is always bound, even when no frame
        # was processed successfully.
        total_inference_time = time.time() - start_inference_time
    finally:
        cv2.destroyAllWindows()
        feeder.close()

    log.info("VideoStream ended...")
    # Load times are one-off costs, so they are reported as absolute values.
    print("total_model_load time is {:} ms".format(1000 * total_load_time))
    print("fps is {:}".format(source_fps))
    if frame_count:
        print("average inference time per frame is {:} ms".format(
            1000 * total_inference_time / frame_count))
    else:
        print("no frames were processed")
    print("fdmt loading time is {:} ms".format(1000 * fd_load_time))
    print("ldmt loading time is {:} ms".format(1000 * ld_load_time))
    print("hpem loading time is {:} ms".format(1000 * hp_load_time))
    print("gzem loading time is {:} ms".format(1000 * ge_load_time))
Esempio n. 4
0
def main():
    """Compare the user's gaze with recorded instructor gaze while a demo plays.

    Loads the instructor gaze vectors from ``../bin/demo.csv`` and plays
    ``../bin/demo.mp4`` alongside the user's input (video file or webcam).
    Each user frame is mirrored and passed through face detection, landmark
    detection, head-pose estimation and gaze estimation; the resulting gaze
    vector is compared with the instructor vector for the same frame and a
    running score is drawn on the demo frame.  The combined preview is shown
    and written to ``output_video.mp4``; the collected user gaze vectors are
    saved to ``<input name>.csv`` (``cam.csv`` for the webcam).

    Raises:
        SystemExit: with status 1 when the input video or a model file
            cannot be found.
    """
    args = build_argparser().parse_args()
    logger = logging.getLogger('main')

    total_score = 0

    # initialize variables with the input arguments for easy access
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'LandmarkRegressionModel': args.landmarkRegressionModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }
    preview_flags = args.previewFlags
    input_filename = args.input
    device_name = args.device
    prob_threshold = args.prob_threshold

    # add path for exercise video data
    exercise_video_path = '../bin/demo.mp4'
    exercise_gaze_path = '../bin/demo.csv'

    exercise_gaze_df = pd.read_csv(exercise_gaze_path)

    is_cam = input_filename.lower() == 'cam'
    if is_cam:
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger.error("Unable to find specified video file")
            raise SystemExit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)

    exercise_feeder = InputFeeder(input_type='video',
                                  input_file=exercise_video_path)

    for model_path in model_path_dict.values():
        if not os.path.isfile(model_path):
            logger.error("Unable to find specified model file: %s",
                         model_path)
            raise SystemExit(1)

    # instantiate model
    face_detection_model = FaceDetectionModel(
        model_path_dict['FaceDetectionModel'],
        device_name,
        threshold=prob_threshold)
    landmark_detection_model = LandmarkDetectionModel(
        model_path_dict['LandmarkRegressionModel'],
        device_name,
        threshold=prob_threshold)
    head_pose_estimation_model = HeadPoseEstimationModel(
        model_path_dict['HeadPoseEstimationModel'],
        device_name,
        threshold=prob_threshold)
    gaze_estimation_model = GazeEstimationModel(
        model_path_dict['GazeEstimationModel'],
        device_name,
        threshold=prob_threshold)

    # load Models
    start_model_load_time = time.time()
    face_detection_model.load_model()
    landmark_detection_model.load_model()
    head_pose_estimation_model.load_model()
    gaze_estimation_model.load_model()
    total_model_load_time = time.time() - start_model_load_time

    feeder.load_data()
    exercise_feeder.load_data()

    # (1000, 500) matches the two 500x500 panes stacked side by side below.
    out_video = cv2.VideoWriter(os.path.join('output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                max(1, int(feeder.get_fps() / 10)),
                                (1000, 500), True)

    frame_count = 0
    gaze_vectors = []
    # One iterator for the whole run instead of a new generator per frame.
    exercise_frames = exercise_feeder.next_batch()
    start_inference_time = time.time()
    try:
        for ret, frame in feeder.next_batch():

            # Check the flag before touching the frame.
            if not ret:
                break

            # Stop once every instructor row has been consumed; this also
            # keeps iloc[frame_count - 1] below inside the table.
            if frame_count >= len(exercise_gaze_df):
                break

            ex_ret, ex_frame = next(exercise_frames, (False, None))
            if not ex_ret:
                break

            # flip the image to make it similar to video image
            frame = np.flip(frame, 1)

            frame_count += 1

            key = cv2.waitKey(60)

            try:
                face_cords, cropped_image = face_detection_model.predict(frame)

                # An int in place of the cropped face means no detection.
                if isinstance(cropped_image, int):
                    logger.warning("Unable to detect the face")
                    if key == 27:
                        break
                    continue

                left_eye_image, right_eye_image, eye_cords = (
                    landmark_detection_model.predict(cropped_image))
                pose_output = head_pose_estimation_model.predict(cropped_image)
                mouse_cord, gaze_vector = gaze_estimation_model.predict(
                    left_eye_image, right_eye_image, pose_output)
                gaze_vectors.append(gaze_vector)

            except Exception as e:
                # Best effort: a frame that fails inference is skipped.
                logger.warning("Could not predict using model " + str(e) +
                               " for frame " + str(frame_count))
                continue

            if preview_flags:
                # NOTE(review): the flag passed is the literal 'ff', not
                # preview_flags; kept as-is, confirm it is intentional.
                preview_frame = draw_preview(frame, 'ff', cropped_image,
                                             left_eye_image, right_eye_image,
                                             face_cords, eye_cords,
                                             pose_output, gaze_vector)
                cropped_image = np.hstack(
                    (cv2.resize(ex_frame, (500, 500)),
                     cv2.resize(preview_frame, (500, 500))))

            instructor_gaze_vector = (
                exercise_gaze_df.iloc[frame_count - 1].values)
            # NOTE(review): whether `cosine` is a distance or a similarity
            # is not visible here; confirm the 0.1 threshold direction.
            score = cosine(instructor_gaze_vector, gaze_vector)
            if score > 0.1:
                total_score += 1

            # show score on output video
            cv2.putText(
                ex_frame,
                "Instructor Gaze Vector: {} ".format(instructor_gaze_vector),
                (40, 60), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2)
            cv2.putText(ex_frame, "User Gaze Vector: {}".format(gaze_vector),
                        (40, 100), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2)
            cv2.putText(ex_frame, "Gaze Match Score : {}".format(total_score),
                        (40, 145), cv2.FONT_HERSHEY_COMPLEX, 1.5, (0, 0, 0), 2)
            ex_frame = cv2.rectangle(ex_frame, (20, 20), (1200, 160),
                                     (0, 0, 0), 2)

            image = np.hstack(
                (cv2.resize(ex_frame, (500, 500)),
                 cv2.resize(cropped_image, (500, 500))))

            cv2.imshow('preview', image)
            out_video.write(image)

            # Esc, matching the no-face branch above.
            if key == 27:
                break

        total_time = time.time() - start_inference_time
    finally:
        # Finalise the output file and free both captures and the windows
        # even when the loop raises.
        out_video.release()
        cv2.destroyAllWindows()
        feeder.close()
        exercise_feeder.close()

    total_inference_time = round(total_time, 1)
    # Very short runs round down to 0.0 seconds; avoid dividing by zero.
    fps = frame_count / total_inference_time if total_inference_time > 0 else 0.0

    if is_cam:
        filename = "cam.csv"
    else:
        filename = os.path.basename(input_filename).split(".")[0] + ".csv"

    gaze_df = pd.DataFrame(gaze_vectors,
                           columns=['vector_x', 'vector_y', 'vector_z'])
    gaze_df.to_csv(filename, index=False)
    logger.info('Model load time: ' + str(total_model_load_time))
    logger.info('Inference time: ' + str(total_inference_time))
    logger.info('FPS: ' + str(fps))
    logger.info('Video stream ended')
    """
Esempio n. 5
0
def infer_on_stream(args):
    """Run gaze inference on a video or webcam stream and move the mouse.

    Loads the face-detection, head-pose, facial-landmark and gaze models,
    then processes the ``(flag, frame)`` pairs yielded by the feeder.  When
    the left-eye crop has the expected ``(40, 40, 3)`` shape the gaze is
    estimated and the pointer is moved; with ``args.visual == "yes"`` the
    annotated frame is also written to ``../output.mp4``.  Timing statistics
    are logged when the stream ends or Esc is pressed.

    Args:
        args: parsed options; this function reads ``face_model``,
            ``head_model``, ``landmarks_model``, ``gaze_model``, ``input``
            and ``visual``.
    """
    start_model_load_time = time.time()

    # initiate and load models
    face_det_net = Face_Detection_Model(args.face_model)
    face_det_net.load_model()
    head_pose_net = Head_Pose_Model(args.head_model)
    head_pose_net.load_model()
    facial_landmarks_net = Facial_Landmarks_Model(args.landmarks_model)
    facial_landmarks_net.load_model()
    gaze_est_net = Gaze_Estimation_Model(args.gaze_model)
    gaze_est_net.load_model()
    total_model_load_time = time.time() - start_model_load_time

    # initiate stream
    if args.input.lower() == "cam":
        frame_feeder = InputFeeder(input_type='cam')
    else:
        frame_feeder = InputFeeder(input_type='video', input_file=args.input)
    frame_feeder.load_data()
    fps = frame_feeder.get_fps()
    log.info('Video started')

    # initiate mouse controller
    mouse_controller = MouseController('medium', 'fast')

    out_video = cv2.VideoWriter('../output.mp4',
                                cv2.VideoWriter_fourcc(*'avc1'),
                                fps, (frame_feeder.get_size()), True)

    show_visuals = args.visual.lower() == "yes"
    counter = 0
    # Started here so stream set-up is not counted as inference time.
    start_inference_time = time.time()
    try:
        for flag, frame in frame_feeder.next_batch():
            if not flag:
                break

            key = cv2.waitKey(60)
            counter += 1
            coords, image, face = face_det_net.predict(frame)
            pose = head_pose_net.predict(face)
            land, left_eye_image, right_eye_image, eye_coords = (
                facial_landmarks_net.predict(face))

            # Gaze is only estimated for a well-formed eye crop.  Moving and
            # drawing stay inside this branch so they never use coordinates
            # that are unbound or left over from an earlier frame.
            if left_eye_image.shape == (40, 40, 3):
                mouse_coords, gaze = gaze_est_net.predict(
                    left_eye_image, right_eye_image, pose)
                mouse_controller.move(mouse_coords[0], mouse_coords[1])

                if show_visuals:
                    frame = draw_outputs(coords, eye_coords, pose, gaze,
                                         mouse_coords[0], mouse_coords[1],
                                         image)
                    out_video.write(frame)

            cv2.imshow('video', frame)

            if key == 27:  # Esc
                break

        total_time = time.time() - start_inference_time
    finally:
        # Runs for end of stream, Esc and errors alike, so the output file
        # is finalised and the capture is always freed.
        out_video.release()
        cv2.destroyAllWindows()
        frame_feeder.close()

    log.info('Video ended')
    total_inference_time = round(total_time, 1)
    # Very short runs round down to 0.0 seconds; avoid dividing by zero.
    f_ps = counter / total_inference_time if total_inference_time > 0 else 0.0
    log.info("Models load time {:.2f}.".format(total_model_load_time))
    log.info("Total inference time {:.2f}.".format(total_inference_time))
    log.info("Inference frames per second {:.2f}.".format(f_ps))
Esempio n. 6
0
def main():
    """Record gaze vectors for every frame of a video file or webcam stream.

    Parses the command line, verifies that the input video and the four
    model files exist, loads the models and then, for every ``(ret, frame)``
    pair yielded by the feeder, chains face detection, landmark detection,
    head-pose estimation and gaze estimation.  Each processed frame is
    previewed (with an annotated pane when preview flags are given) and
    written to ``output_video.mp4``.  The collected gaze vectors are saved
    to ``gaze_vectors_excercise_video.csv`` and timing figures are logged.

    Raises:
        SystemExit: with status 1 when the input video or a model file
            cannot be found.
    """
    args = build_argparser().parse_args()
    logger = logging.getLogger('main')

    # initialize variables with the input arguments for easy access
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'LandmarkRegressionModel': args.landmarkRegressionModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }
    preview_flags = args.previewFlags
    input_filename = args.input
    device_name = args.device
    prob_threshold = args.prob_threshold

    if input_filename.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger.error("Unable to find specified video file")
            raise SystemExit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)

    for model_path in model_path_dict.values():
        if not os.path.isfile(model_path):
            logger.error("Unable to find specified model file: %s",
                         model_path)
            raise SystemExit(1)

    # instantiate model
    face_detection_model = FaceDetectionModel(
        model_path_dict['FaceDetectionModel'],
        device_name,
        threshold=prob_threshold)
    landmark_detection_model = LandmarkDetectionModel(
        model_path_dict['LandmarkRegressionModel'],
        device_name,
        threshold=prob_threshold)
    head_pose_estimation_model = HeadPoseEstimationModel(
        model_path_dict['HeadPoseEstimationModel'],
        device_name,
        threshold=prob_threshold)
    gaze_estimation_model = GazeEstimationModel(
        model_path_dict['GazeEstimationModel'],
        device_name,
        threshold=prob_threshold)

    # load Models
    start_model_load_time = time.time()
    face_detection_model.load_model()
    landmark_detection_model.load_model()
    head_pose_estimation_model.load_model()
    gaze_estimation_model.load_model()
    total_model_load_time = time.time() - start_model_load_time

    feeder.load_data()

    # NOTE(review): the writer is opened at a fixed 1920x1080 while raw
    # frames are written below; confirm the input really has that size.
    # max(1, ...) keeps the writer usable for sources slower than 10 FPS.
    out_video = cv2.VideoWriter(os.path.join('output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                max(1, int(feeder.get_fps() / 10)),
                                (1920, 1080), True)

    frame_count = 0
    gaze_vectors = []
    start_inference_time = time.time()
    try:
        for ret, frame in feeder.next_batch():

            if not ret:
                break

            frame_count += 1

            key = cv2.waitKey(60)

            try:
                face_cords, cropped_image = face_detection_model.predict(frame)

                # An int in place of the cropped face means no detection.
                if isinstance(cropped_image, int):
                    logger.warning("Unable to detect the face")
                    if key == 27:
                        break
                    continue

                left_eye_image, right_eye_image, eye_cords = (
                    landmark_detection_model.predict(cropped_image))
                pose_output = head_pose_estimation_model.predict(cropped_image)
                mouse_cord, gaze_vector = gaze_estimation_model.predict(
                    left_eye_image, right_eye_image, pose_output)
                gaze_vectors.append(gaze_vector)

            except Exception as e:
                # Best effort: a frame that fails inference is skipped.
                logger.warning("Could not predict using model " + str(e) +
                               " for frame " + str(frame_count))
                continue

            image = cv2.resize(frame, (500, 500))

            if preview_flags:
                preview_frame = draw_preview(frame, preview_flags,
                                             cropped_image, left_eye_image,
                                             right_eye_image, face_cords,
                                             eye_cords, pose_output,
                                             gaze_vector)
                # Side-by-side preview: raw frame | annotated frame.
                image = np.hstack(
                    (image, cv2.resize(preview_frame, (500, 500))))

            cv2.imshow('preview', image)
            out_video.write(frame)

            if key == 27:  # Esc
                break

        total_time = time.time() - start_inference_time
    finally:
        # Release the writer so the container is finalised, and free the
        # windows and the capture even when the loop raises.
        out_video.release()
        cv2.destroyAllWindows()
        feeder.close()

    total_inference_time = round(total_time, 1)
    # Very short runs round down to 0.0 seconds; avoid dividing by zero.
    fps = frame_count / total_inference_time if total_inference_time > 0 else 0.0

    gaze_df = pd.DataFrame(gaze_vectors,
                           columns=['vector_x', 'vector_y', 'vector_z'])
    gaze_df.to_csv("gaze_vectors_excercise_video.csv", index=False)
    logger.info('Model load time: ' + str(total_model_load_time))
    logger.info('Inference time: ' + str(total_inference_time))
    logger.info('FPS: ' + str(fps))

    logger.info('Video stream ended')
    """