    def run(
        self,
        input_type=None,
        input_file=None,
    ):
        if input_type and input_file:
            self.input_ = InputFeeder(input_type, input_file)
            self.input_.load_data()
            if self.save_video:
                out = cv2.VideoWriter(
                    'output.mp4', 0x00000021, 30,  # magic fourcc literal some OpenCV builds accept for MP4
                    (int(self.input_.cap.get(3)),   # CAP_PROP_FRAME_WIDTH
                     int(self.input_.cap.get(4))))  # CAP_PROP_FRAME_HEIGHT
        try:
            fc_dec_inf_time = 0
            landmark_inf_time = 0
            pose_inf_time = 0
            gaze_inf_time = 0
            frame_counter = 0
            while True:
                # Read the next frame
                try:
                    frame = next(self.input_.next_batch())
                    frame_counter += 1
                except StopIteration:
                    break

                key_pressed = cv2.waitKey(60)

                # face detection
                start = time.time()
                out_frame, boxes = self.face_dec.predict(frame,
                                                         display_output=True)
                fc_dec_inf_time += (time.time() - start)

                # For each detected face box
                for box in boxes:
                    face = out_frame[box[1]:box[3], box[0]:box[2]]

                    start = time.time()
                    out_frame, left_eye_point, right_eye_point = self.fac_land.predict(
                        out_frame, face, box, display_output=True)
                    landmark_inf_time += (time.time() - start)

                    start = time.time()
                    out_frame, headpose_angles = self.head_pose.predict(
                        out_frame, face, box, display_output=True)
                    pose_inf_time += (time.time() - start)

                    start = time.time()
                    out_frame, gazevector = self.gaze.predict(
                        out_frame,
                        face,
                        box,
                        left_eye_point,
                        right_eye_point,
                        headpose_angles,
                        display_output=True)
                    gaze_inf_time += (time.time() - start)

                    if self.show_video:
                        cv2.imshow('im', out_frame)

                    if self.save_video:
                        out.write(out_frame)

                    if self.mouse_con:
                        self.mouse_con.move(gazevector[0], gazevector[1])

                    time.sleep(1)  # Pause one second between processed frames

                    # Consider only the first detected face in the frame
                    break

                # Break if escape key pressed
                if key_pressed == 27:
                    break

            if self.save_video:
                out.release()
            self.input_.close()
            cv2.destroyAllWindows()
            print(
                'average inference time for face detection model is: {:.2f} ms'
                .format((fc_dec_inf_time / frame_counter) * 1000))
            print(
                'average inference time for facial landmark model is: {:.2f} ms'
                .format((landmark_inf_time / frame_counter) * 1000))
            print(
                'average inference time for head pose estimation model is: {:.2f} ms'
                .format((pose_inf_time / frame_counter) * 1000))
            print(
                'average inference time for gaze estimation model is: {:.2f} ms'
                .format((gaze_inf_time / frame_counter) * 1000))
        except Exception as ex:
            logging.exception("Error in inference: " + str(ex))
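
# Every example in this collection drives its pipeline through an InputFeeder
# helper whose implementation is not shown. The sketch below is a minimal
# reconstruction inferred purely from usage (constructor taking
# input_type/input_file, load_data(), a next_batch() generator, a .cap
# attribute, and close()). Individual projects extend it with extra methods
# (get_info, save_to_video, ...) and some versions yield (ret, frame) pairs,
# so treat this as an assumption, not the original class.
import cv2


class InputFeeder:

    def __init__(self, input_type, input_file=None):
        self.input_type = input_type  # 'video', 'image', or 'cam'
        self.input_file = input_file

    def load_data(self):
        if self.input_type == 'video':
            self.cap = cv2.VideoCapture(self.input_file)
        elif self.input_type == 'cam':
            self.cap = cv2.VideoCapture(0)
        else:  # 'image': .cap holds the decoded image itself
            self.cap = cv2.imread(self.input_file)

    def next_batch(self):
        # Yield frames one at a time (video/cam); image inputs are read
        # directly from .cap by some of the examples instead.
        while True:
            ret, frame = self.cap.read()
            if not ret:
                break
            yield frame

    def close(self):
        if self.input_type != 'image':
            self.cap.release()
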
# Example 2
def main(args):
    #model=args.model
    fd_model = args.face
    flmd_model = args.landmarks
    hp_model = args.head
    ge_model = args.gaze
    device = args.device
    display_flag = args.display

    # Init and load models
    fd = FaceDetection(fd_model, device)
    logger.info("######## Model loading Time #######")
    start = time.time()
    fd.load_model()
    logger.info("Face Detection Model: {:.1f}ms".format(1000 *
                                                        (time.time() - start)))

    flmd = FacialLandMarksDetection(flmd_model, device)
    start = time.time()
    flmd.load_model()
    logger.info("Facial Landmarks Detection Model: {:.1f}ms".format(
        1000 * (time.time() - start)))

    hpe = HeadPoseEstimation(hp_model, device)
    start = time.time()
    hpe.load_model()
    logger.info("HeadPose Estimation Model: {:.1f}ms".format(
        1000 * (time.time() - start)))

    ge = GazeEstimation(ge_model, device)
    start = time.time()
    ge.load_model()
    logger.info("Gaze Estimation Model: {:.1f}ms".format(
        1000 * (time.time() - start)))

    # Mouse controller
    mc = MouseController("low", "fast")

    feed = InputFeeder(input_type=args.input_type, input_file=args.input_file)
    feed.load_data()

    frame_count = 0
    fd_inference_time = 0
    lm_inference_time = 0
    hp_inference_time = 0
    ge_inference_time = 0
    move_mouse = False

    for batch in feed.next_batch():
        frame_count += 1
        # Preprocessed output from face detection
        face_boxes, image, fd_time = fd.predict(batch, display_flag)
        fd_inference_time += fd_time

        for face in face_boxes:
            cropped_face = batch[face[1]:face[3], face[0]:face[2]]
            #print(f"Face boxe = {face}")
            # Get preprocessed result from landmarks
            image, left_eye, right_eye, lm_time = flmd.predict(
                image, cropped_face, face, display_flag)
            lm_inference_time += lm_time

            # Get preprocessed result from pose estimation
            image, headpose_angles, hp_time = hpe.predict(
                image, cropped_face, face, display_flag)
            hp_inference_time += hp_time

            # Get preprocessed result from the gaze estimation model
            image, gazevector, ge_time = ge.predict(image, cropped_face, face,
                                                    left_eye, right_eye,
                                                    headpose_angles,
                                                    display_flag)
            #cv2.imshow('Face', cropped_face)
            ge_inference_time += ge_time
            #print(f"Gaze vect {gazevector[0],gazevector[1]}")
            cv2.imshow('img', image)
            if (not move_mouse):
                mc.move(gazevector[0], gazevector[1])
            break

        if cv2.waitKey(1) & 0xFF == ord("k"):
            break
    if (frame_count > 0):
        logger.info("###### Models Inference time ######")
        logger.info(
            f"Face Detection inference time = {(fd_inference_time * 1000) / frame_count:.2f} ms"
        )
        logger.info(
            f"Facial Landmarks Detection inference time = {(lm_inference_time * 1000) / frame_count:.2f} ms"
        )
        logger.info(
            f"Headpose Estimation inference time = {(hp_inference_time * 1000) / frame_count:.2f} ms"
        )
        logger.info(
            f"Gaze estimation inference time = {(ge_inference_time * 1000) / frame_count:.2f} ms"
        )
    feed.close()
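
# The MouseController shared by these examples is likewise not shown. Below is
# a minimal sketch assuming a pyautogui backend (pyautogui is used directly by
# a later example); the precision/speed mappings are illustrative assumptions,
# not the projects' exact values.
import pyautogui


class MouseController:

    def __init__(self, precision, speed):
        # Map the string settings seen in the calls above to numbers (assumed).
        self.precision = {'high': 100, 'low': 1000, 'medium': 500}[precision]
        self.speed = {'fast': 1, 'slow': 10, 'medium': 5}[speed]

    def move(self, x, y):
        # Treat (x, y) as a gaze direction; negate y because screen
        # coordinates grow downward.
        pyautogui.moveRel(x * self.precision, -1 * y * self.precision,
                          duration=self.speed)
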
def main(args):
    start_model_load_time = time.time()

    # load model
    class_face_detection = ModelFaceDetection(args.model_face_detection, args.device, args.threshold)
    class_face_detection.load_model()

    class_head_pose_estimation = ModelHeadPoseEstimation(args.model_head_pose_estimation, args.device)
    class_head_pose_estimation.load_model()

    class_facial_landmarks_detection = ModelFacialLandmarksDetection(args.model_facial_landmarks_detection, args.device)
    class_facial_landmarks_detection.load_model()

    class_gaze_estimation = ModelGazeEstimation(args.model_gaze_estimation, args.device)
    class_gaze_estimation.load_model()

    total_model_load_time = time.time() - start_model_load_time

    # input image
    feed = InputFeeder(input_type='video', input_file=args.input_path)
    feed.load_data()

    # output
    initial_w, initial_h, initial_fps = feed.get_info()

    counter = 0
    start_inference_time = time.time()

    # debug
    #print("initial_w:{}, initial_h:{}, initial_fps:{}".format(initial_w, initial_h, initial_fps))

    #out_video = cv2.VideoWriter('output_video.mp4', cv2.VideoWriter_fourcc(*'mp4v'), initial_fps, (initial_w, initial_h), True)
    out_video = cv2.VideoWriter('output_video.mp4', cv2.VideoWriter_fourcc(*'mp4v'), 10, (initial_w, initial_h), True)

    class_face_detection.initial_size(initial_w, initial_h)

    #mc = MouseController(precision='low', speed='slow')
    mc = MouseController(precision='high', speed='fast')

    for flag, batch in feed.next_batch():
        if not flag:
            break

        counter += 1

        # debug
        #print("batch.shape:{}".format(batch.shape))
        # if batch is not None:

        # face_detection
        cropped_face = class_face_detection.predict(batch)

        # head_pose_estimation
        head_pose_angles = class_head_pose_estimation.predict(cropped_face)

        # debug
        #print("angle_y_fc:{}, angle_p_fc:{}, angle_r_fc:{}".format(head_pose_angles[0], head_pose_angles[1], head_pose_angles[2]))

        # facial_landmarks_detection
        left_eye_image, right_eye_image, left_eye_center, right_eye_center = class_facial_landmarks_detection.predict(cropped_face)

        # gaze_estimation
        x, y, gaze_vector = class_gaze_estimation.predict(left_eye_image, right_eye_image, head_pose_angles)

        cv2.line(cropped_face, left_eye_center, (int(left_eye_center[0] + gaze_vector[0] * 100), int(left_eye_center[1] - gaze_vector[1] * 100)), (255,255,255), 2)
        cv2.line(cropped_face, right_eye_center, (int(right_eye_center[0] + gaze_vector[0] * 100), int(right_eye_center[1] - gaze_vector[1] * 100)), (255,255,255), 2)

        # output
        cv2.imshow('output', batch)
        cv2.waitKey(30)
        cv2.imwrite('output.jpg', batch)  # keep overwriting the latest frame as a still

        out_video.write(batch)

        # MouseController
        mc.move(x, y)

    total_time = time.time() - start_inference_time
    total_inference_time = round(total_time, 1)
    fps = counter/total_inference_time

    print("total_model_load_time:{}, total_inference_time:{}, fps:{}".format(total_model_load_time, total_inference_time, fps))

    feed.close()
    cv2.destroyAllWindows()
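
# Note on the codec arguments used above: cv2.VideoWriter takes a fourcc
# integer. The first example passes the raw literal 0x00000021, a magic value
# some OpenCV builds accept for MP4 output; this example spells it portably
# with the fourcc helper. A small sketch of the two common spellings:
import cv2

fourcc_mp4v = cv2.VideoWriter_fourcc(*'mp4v')  # MPEG-4 Part 2
fourcc_avc1 = cv2.VideoWriter_fourcc(*'avc1')  # H.264, if the build supports it
writer = cv2.VideoWriter('output_video.mp4', fourcc_mp4v, 30, (1920, 1080), True)
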
# Example 4
def initialize_feed(self):
    self.feed = InputFeeder(self.args.input_type, self.args.input)
    self.feed.load_data()
def main():
    args = build_argparser().parse_args()
    previewFlags = args.previewFlags

    logger = logging.getLogger()
    inputFile = args.input
    inputFeeder = None

    if inputFile.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFile):
            logger.error("Unable to find input file")
            exit(1)

        inputFeeder = InputFeeder("video", inputFile)

    start_loading = time.time()

    mfd = Model_Face_Detection(args.facedetectionmodel, args.device,
                               args.cpu_extension)
    mfld = Model_Facial_Landmarks_Detection(args.faciallandmarkmodel,
                                            args.device, args.cpu_extension)
    mge = Model_Gaze_Estimation(args.gazeestimationmodel, args.device,
                                args.cpu_extension)
    mhpe = Model_Head_Pose_Estimation(args.headposemodel, args.device,
                                      args.cpu_extension)

    mc = MouseController('medium', 'fast')

    inputFeeder.load_data()

    mfd.load_model()
    mfld.load_model()
    mge.load_model()
    mhpe.load_model()

    model_loading_time = time.time() - start_loading

    counter = 0
    frame_count = 0
    inference_time = 0
    start_inf_time = time.time()
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        if frame is not None:
            frame_count += 1
            if frame_count % 5 == 0:
                cv2.imshow('video', cv2.resize(frame, (500, 500)))
            key = cv2.waitKey(60)
            start_inference = time.time()
            croppedFace, face_coords = mfd.predict(frame.copy(),
                                                   args.prob_threshold)
            if type(croppedFace) == int:
                logger.error("No face detected.")
                if key == 27:
                    break
                continue

            hp_out = mhpe.predict(croppedFace.copy())

            left_eye, right_eye, eye_coords = mfld.predict(croppedFace.copy())

            new_mouse_coord, gaze_vector = mge.predict(left_eye, right_eye,
                                                       hp_out)

            stop_inference = time.time()
            inference_time = inference_time + stop_inference - start_inference
            counter += 1
            if len(previewFlags) != 0:
                preview_window = frame.copy()
                if 'fd' in previewFlags:
                    preview_window = croppedFace
                if 'fld' in previewFlags:
                    cv2.rectangle(
                        croppedFace,
                        (eye_coords[0][0] - 10, eye_coords[0][1] - 10),
                        (eye_coords[0][2] + 10, eye_coords[0][3] + 10),
                        (0, 255, 0), 3)
                    cv2.rectangle(
                        croppedFace,
                        (eye_coords[1][0] - 10, eye_coords[1][1] - 10),
                        (eye_coords[1][2] + 10, eye_coords[1][3] + 10),
                        (0, 255, 0), 3)
                if 'hp' in previewFlags:
                    cv2.putText(
                        preview_window,
                        "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                        format(hp_out[0], hp_out[1], hp_out[2]), (50, 50),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 1)
                if 'ge' in previewFlags:
                    x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] *
                                                            12), 160
                    le = cv2.line(left_eye.copy(), (x - w, y - w),
                                  (x + w, y + w), (255, 0, 255), 2)
                    cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255),
                             2)
                    re = cv2.line(right_eye.copy(), (x - w, y - w),
                                  (x + w, y + w), (255, 0, 255), 2)
                    cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255),
                             2)
                    croppedFace[eye_coords[0][1]:eye_coords[0][3],
                                eye_coords[0][0]:eye_coords[0][2]] = le
                    croppedFace[eye_coords[1][1]:eye_coords[1][3],
                                eye_coords[1][0]:eye_coords[1][2]] = re

                cv2.imshow("visualization",
                           cv2.resize(preview_window, (500, 500)))
            if frame_count % 5 == 0:
                mc.move(new_mouse_coord[0], new_mouse_coord[1])
            if key == 27:
                break

    logger.error("Total loading time: " + str(model_loading_time) + " seconds")
    if frame_count > 0 and inference_time > 0:
        fps = frame_count / inference_time
        logger.error("Total inference time: {} seconds".format(inference_time))
        logger.error("Average inference time: " +
                     str(inference_time / frame_count) + " seconds")
        logger.error("{} fps".format(fps / 5))

    cv2.destroyAllWindows()
    inputFeeder.close()
def main(args):

    #init mouse controller class
    mouse_controller = MouseController('low', 'medium')

    log.debug('Init model classes')
    #init model classes
    face_Detect = FaceDetection(args.face_detection, args.device)
    land_Marks = FaceLandmarks(args.landmarks, args.device)
    head_PoseEstimat = headPoseEstimation(args.head_pose_estimation,
                                          args.device)
    gaze_Estimation = gazeEstimation(args.gaze_estimation, args.device)

    #init input feeder class
    feed = InputFeeder(input_type=args.input_feed, input_file=args.path_feed)

    log.info('load input source ...')
    # Load the input source (image, video, or cam) according to the user's
    # parameters, defaulting to video
    cap = feed.load_data()

    #load video save parameters
    feed.load_video_save_params(name_export_video='output_video.mp4')

    #get the Height and width from the input source
    initial_w, initial_h = feed.get_input_size()

    #Facedetection threshold prob from args
    THRESHOLD = args.prob_threshold

    log.info('Run models inferences ...')
    while cap.isOpened():

        for ret, frame in feed.next_batch():

            if ret:
                # Flip the image
                frame = utils.flip_image_vertical(frame)

                # Copy the unmodified frame
                original_frame = np.copy(frame)

                # Set face detection parameters
                face_Detect.set_params(frame, THRESHOLD, initial_w, initial_h)

                #Run facedetection inference
                confidence, data_face_detection_points = face_Detect.get_inference_outputs(
                )

                if confidence >= THRESHOLD:
                    #Crop main frame with face detection coordinates use to draw the rectangle
                    cropped_frame, cropped_h, cropped_w = utils.crop_frame(
                        frame, data_face_detection_points[1],
                        data_face_detection_points[3],
                        data_face_detection_points[0],
                        data_face_detection_points[2])

                    land_Marks.set_params(cropped_frame, cropped_h, cropped_w)

                    left_eye_center_points, right_eye_center_points, data_l_eye, data_r_eye, data_points_marks = land_Marks.get_inference_outputs(
                    )

                    #Use the x,y points from face detection to display visualisation at the right position
                    xomin = data_face_detection_points[0]
                    yomin = data_face_detection_points[1]

                    #Crop left eye from data generated by landmarks detection model
                    img_left_eye, _, _ = utils.crop_frame(
                        frame, data_l_eye[1] + yomin, data_l_eye[3] + yomin,
                        data_l_eye[0] + xomin, data_l_eye[2] + xomin)

                    #Crop right eye from data generated by landmarks detection model
                    img_right_eye, _, _ = utils.crop_frame(
                        frame, data_r_eye[1] + yomin, data_r_eye[3] + yomin,
                        data_r_eye[0] + xomin, data_r_eye[2] + xomin)

                    # Head pose estimation on the face-detection cropped_frame (roll, pitch, yaw)
                    head_PoseEstimat.set_params(cropped_frame, cropped_w,
                                                cropped_h)
                    head_pose_angles = head_PoseEstimat.get_inference_outputs()

                    #Gaze estimation model output vector for eyes direction
                    gaze_Estimation.set_params(img_left_eye, img_right_eye,
                                               head_pose_angles)
                    gaze_vector_output = gaze_Estimation.get_inference_outputs(
                    )

                    ####
                    #eyes_concat = np.concatenate((img_left_eye,img_right_eye), axis=0)
                    #eyes_concat_resized = cv2.resize(eyes_concat,(cropped_frame.shape[1] -200 ,cropped_frame.shape[0]), interpolation=cv2.INTER_AREA)
                    #eyes_crop_out = np.concatenate((cropped_frame, eyes_concat_resized), axis=1)
                    #display_visual = True

                    #Display visualisation according to user cli arguments
                    if args.display_visual == "True":
                        #original_frame = cv2.resize(original_frame,(cropped_frame.shape[1] +400 ,cropped_frame.shape[0]), interpolation=cv2.INTER_AREA)
                        #img_output = np.concatenate((original_frame,cropped_frame), axis=1)
                        frame = utils.draw_visualisation(
                            frame, data_face_detection_points,
                            data_points_marks, head_pose_angles, data_l_eye,
                            data_r_eye, gaze_vector_output)
                    else:
                        frame = original_frame

                    #show the frame(s) in realtime
                    cv2.imshow('frame', frame)

                    # If the user chose "image" as input, the frame will be saved
                    if args.input_feed == 'image':
                        cv2.imwrite("../bin/output.jpg", frame)

                    # If the user chose "video" or "cam" as input, the feed will be saved
                    if args.input_feed in ('video', 'cam'):
                        # Save the feed to video
                        feed.save_to_video(frame)

                    if args.mouse_move == "True":
                        mouse_controller.move(*gaze_vector_output[:2])

                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break
            else:
                break
        # Release everything if job is finished
        feed.close()
    log.info('End inferences ...')
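
# The utils.crop_frame helper used above is not shown. From its call sites it
# takes (frame, ymin, ymax, xmin, xmax) and returns the crop plus its height
# and width; a hedged sketch under that assumption:
def crop_frame(frame, ymin, ymax, xmin, xmax):
    cropped = frame[int(ymin):int(ymax), int(xmin):int(xmax)]
    h, w = cropped.shape[:2]
    return cropped, h, w
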
def infer_on_stream(args):
    face_detection_model_file = args.faceDetectionModel
    facial_landmarks_detection_model_file = args.facialLandmarksModel
    head_pose_estimation_model_file = args.headPoseModel
    gaze_estimation_model_file = args.gazeModel

    video_file = args.input
    device_name = args.device
    cpu_extension = args.cpu_extension
    prob_threshold = args.prob_threshold
    preview_flag = args.preview_flag

    output_path = args.output_path
    if not os.path.exists(output_path):
        os.mkdir(output_path)

    mouse_control = MouseController("low", "fast")

    try:
        logging.info("*********** Model Load Time ***************")
        start_model_load_time = time.time()

        start_time = time.time()
        face_detection_model = FaceDetectionModel(face_detection_model_file,
                                                  device_name, cpu_extension)
        logging.info("Face Detection Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        facial_landmarks_detection_model = FacialLandmarksDetectionModel(
            facial_landmarks_detection_model_file, device_name, cpu_extension)
        logging.info("Facial Landmarks Detection Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        head_pose_estimation_model = HeadPoseEstimationModel(
            head_pose_estimation_model_file, device_name, cpu_extension)
        logging.info("Head Pose Estimation Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        gaze_estimation_model = GazeEstimationModel(gaze_estimation_model_file,
                                                    device_name, cpu_extension)
        logging.info("Gaze Estimation Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        total_model_load_time = time.time() - start_model_load_time
        logging.info("*********** Model Load Completed ***********")
    except Exception as e:
        logging.error("ERROR in model loading: " + str(e))
        sys.exit(1)

    feeder = InputFeeder('video', video_file)
    feeder.load_data()

    out_video = cv2.VideoWriter(os.path.join(output_path, 'output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                int(feeder.fps() / 10), (1920, 1080), True)

    start_inference_time = time.time()
    frame_count = 0
    face_detect_infer_time = 0
    facial_landmarks_infer_time = 0
    head_pose_infer_time = 0
    gaze_infer_time = 0

    while True:
        try:
            frame = next(feeder.next_batch())
        except StopIteration:
            break

        key_pressed = cv2.waitKey(60)
        frame_count += 1

        ## Face Detection Model
        image = face_detection_model.preprocess_input(frame)

        start_time = time.time()
        outputs = face_detection_model.predict(image)
        face_detect_infer_time += (time.time() - start_time)
        out_frame, faces = face_detection_model.preprocess_output(
            outputs, frame, preview_flag, prob_threshold)

        for face in faces:
            crop_image = frame[face[1]:face[3], face[0]:face[2]]

            ## Facial Landmarks Detection Model
            image = facial_landmarks_detection_model.preprocess_input(
                crop_image)

            start_time = time.time()
            outputs = facial_landmarks_detection_model.predict(image)
            facial_landmarks_infer_time += (time.time() - start_time)
            out_frame, left_eye_point, right_eye_point = facial_landmarks_detection_model.preprocess_output(
                outputs, out_frame, face, preview_flag)

            ## Head Pose Estimation Model
            image = head_pose_estimation_model.preprocess_input(crop_image)

            start_time = time.time()
            outputs = head_pose_estimation_model.predict(image)
            head_pose_infer_time += (time.time() - start_time)
            out_frame, headpose_angles_list = head_pose_estimation_model.preprocess_output(
                outputs, out_frame, preview_flag)

            ## Gaze Estimation Model
            out_frame, left_eye, right_eye = gaze_estimation_model.preprocess_input(
                out_frame, crop_image, left_eye_point, right_eye_point)

            start_time = time.time()
            outputs = gaze_estimation_model.predict(left_eye, right_eye,
                                                    headpose_angles_list)
            gaze_infer_time += (time.time() - start_time)
            out_frame, gazevector = gaze_estimation_model.preprocess_output(
                outputs, out_frame, face, left_eye_point, right_eye_point,
                preview_flag)

            cv2.imshow("Computer Pointer Control", out_frame)
            out_video.write(out_frame)
            mouse_control.move(gazevector[0], gazevector[1])

        if key_pressed == 27:
            break

    if frame_count > 0:
        logging.info("*********** Model Inference Time ****************")
        logging.info("Face Detection Model: {:.1f} ms.".format(
            1000 * face_detect_infer_time / frame_count))
        logging.info("Facial Landmarks Detection Model: {:.1f} ms.".format(
            1000 * facial_landmarks_infer_time / frame_count))
        logging.info("Head Pose Detection Model: {:.1f} ms.".format(
            1000 * head_pose_infer_time / frame_count))
        logging.info("Gaze Detection Model: {:.1f} ms.".format(
            1000 * gaze_infer_time / frame_count))
        logging.info("*********** Model Inference Completed ***********")

    total_infer_time = time.time() - start_inference_time
    total_inference_time = round(total_infer_time, 1)
    fps = frame_count / total_inference_time

    with open(os.path.join(output_path, 'stats.txt'), 'w') as f:
        f.write(str(total_inference_time) + '\n')
        f.write(str(fps) + '\n')
        f.write(str(total_model_load_time) + '\n')

    logging.info("*********** Total Summary ****************")
    logging.info(f"Total Model Load Time: {total_model_load_time}")
    logging.info(f"Total Inference Time: {total_inference_time}")
    logging.info(f"FPS: {fps}")
    logging.info("*********** Total Summary ***********")
    logging.info("*********** ************************* ***********")

    feeder.close()
    cv2.destroyAllWindows()
    hd = head_pose_estimation.head_pose_estimation(head_pose_estimation_model,
                                                   args.device)
    hd.load_model()
    hd.check_model()
    hd.get_input_name()

    gaze_estimation_model = args.gaze_estimation_model
    ge = gaze_estimation.gaze_estimation(gaze_estimation_model, args.device)
    ge.load_model()
    ge.check_model()
    ge.get_input_name()

    #initialize mouse controller
    mouse_controller = MouseController(args.mouse_precision, args.mouse_speed)

    if (args.inputType == 'image'):
        input_image = args.input_path
        feed = InputFeeder(input_type='image', input_file=input_image)
        feed.load_data()
        frame = feed.cap
        _, output_img = process_frame(frame, args.visualize)
        cv2.imshow("Preview", output_img)
        cv2.imwrite(args.output_path, output_img)

    elif (args.inputType == 'video'):
        process_video(args.input_path, args.output_path, args.visualize)
    elif (args.inputType == 'cam'):
        process_video(None, args.output_path, args.visualize)
    else:
        print("Invalid input type")
# Example 9
def main():

    args = build_argparser().parse_args()
    inputFilePath = args.input
    inputFeeder = None

    if args.input == "CAM":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(args.input):
            log.info("Unable to find specified video file")
            sys.exit(1)
        inputFeeder = InputFeeder("video", args.input)

    modelPathDict = {
        'FaceDetectionModel': args.face_detection_model,
        'FacialLandmarksDetectionModel': args.facial_landmark_model,
        'GazeEstimationModel': args.gaze_estimation_model,
        'HeadPoseEstimationModel': args.head_pose_model
    }

    for fileNameKey in modelPathDict.keys():
        if not os.path.isfile(modelPathDict[fileNameKey]):
            log.info("Unable to find specified " + fileNameKey + " xml file")
            sys.exit(1)

    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device,
                             args.cpu_extension)
    fldm = FacialLandmarksDetectionModel(
        modelPathDict['FacialLandmarksDetectionModel'], args.device,
        args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'],
                              args.device, args.cpu_extension)
    hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'],
                                   args.device, args.cpu_extension)

    mc = MouseController('medium', 'fast')

    start_time_1 = time.time()
    inputFeeder.load_data()

    fdm.load_model()
    fldm.load_model()
    hpem.load_model()
    gem.load_model()
    total_model_load_time = (time.time() - start_time_1)
    print("Model Load Time: {:.3f}".format(total_model_load_time))

    frame_count = 0
    start_time = time.time()

    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (450, 450)))

        key = cv2.waitKey(60)
        croppedFace, face_coords = fdm.predict(frame.copy(),
                                               args.prob_threshold)
        if type(croppedFace) == int:
            log.info("Unable to detect the face.")
            if key == 27:
                break
            continue

        hp_out = hpem.predict(croppedFace.copy())

        left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy())

        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out)

        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break
    log.info("VideoStream has ended.")
    cv2.destroyAllWindows()
    inputFeeder.close()

    total_inference_time = time.time() - start_time
    fps = frame_count / total_inference_time
    print("Inference Time: {:.3f}".format(total_inference_time))
    print("FPS: {}".format(fps))
# Example 10
def main():
    args = get_args()

    log.basicConfig(filename='example.log', level=log.DEBUG)

    inputFile = args.input
    #inputFile = "./bin/demo.mp4"

    mouse = MouseController("high", "fast")

    frame_count = 0
    focal_length = 950.0
    scale = 50

    #print(f"Visual flag: {args.visual_flag}")

    if inputFile.lower() == "cam":
        feed = InputFeeder('cam')
        log.info("Video source: " + str(inputFile))

    else:
        if not os.path.isfile(inputFile):
            log.error("Unable to find file: " + inputFile)
            exit(1)
        feed = InputFeeder("video", inputFile)
        log.info("Video source: " + str(inputFile))
        log.info("InputFeeder initialized")

    log.info("Device: " + str(args.device))
    log.info("Face detection model: " + str(args.facedetectionmodel))
    log.info("Facial landmarks model: " + str(args.faciallandmarksmodel))
    log.info("Head pose estimation model: " + str(args.headposemodel))
    log.info("Gaze estimation model: " + str(args.gazeestimationmodel))

    if args.stats == 1:
        print("Running statistics...")
        inference_times = []
        fdm_inference_times = []
        hpm_inference_times = []
        flm_inference_times = []
        gem_inference_times = []
        loading_start_time = time.time()

    # Create instances of the different models
    fdm = FaceDetector(args.facedetectionmodel, args.device,
                       args.cpu_extension)
    if args.stats == 1:
        start_time = time.time()
        fdm.load_model()
        fdm_load_time = time.time() - start_time
    else:
        fdm.load_model()
    fdm.check_model()

    hpm = HeadPoseEstimator(args.headposemodel, args.device,
                            args.cpu_extension)
    if args.stats == 1:
        start_time = time.time()
        hpm.load_model()
        hpm_load_time = time.time() - start_time
    else:
        hpm.load_model()
    hpm.check_model()

    flm = FacialLandmarksDetector(args.faciallandmarksmodel, args.device,
                                  args.cpu_extension)
    if args.stats == 1:
        start_time = time.time()
        flm.load_model()
        flm_load_time = time.time() - start_time
    else:
        flm.load_model()
    flm.check_model()

    gem = GazeEstimator(args.gazeestimationmodel, args.device,
                        args.cpu_extension)
    if args.stats == 1:
        start_time = time.time()
        gem.load_model()
        gem_load_time = time.time() - start_time
    else:
        gem.load_model()
    gem.check_model()

    if args.stats == 1:
        duration_loading = time.time() - loading_start_time
        print(
            f"Duration for loading and checking the models: {duration_loading}"
        )
        log.info(
            f"Duration for loading and checking the models: {duration_loading}"
        )

    cv2.namedWindow('preview', cv2.WINDOW_NORMAL)
    cv2.resizeWindow('preview', 600, 600)

    feed.load_data()
    for ret, frame in feed.next_batch():
        if not ret:
            break

        if frame is not None:
            frame_count += 1
            key = cv2.waitKey(60)

            if args.stats == 1:
                start_time = time.time()

            # Run face detection
            face_crop, face_coords = fdm.predict(frame.copy())
            frame_h, frame_w = frame.shape[:2]

            try:
                # Check if a face was detected before using the crop
                if type(face_coords) == int:
                    print("Unable to detect face")
                    if key == 27:
                        break
                    continue

                print("Face crop shape: " + str(face_crop.shape))
                (xmin, ymin, xmax, ymax) = face_coords
                face_frame = frame[ymin:ymax, xmin:xmax]
                #center_of_face = (xmin + face_frame.shape[1] / 2, ymin + face_frame.shape[0] / 2, 0) # 0 for colour channel
                #print("Center of face " + str(center_of_face))

                # Facial landmark detection
                left_eye_crop, right_eye_crop, landmarks, crop_coords = flm.predict(
                    face_crop.copy())
                #print("Landmarks" +str(landmarks))
                left_eye = (landmarks[0], landmarks[1])
                right_eye = (landmarks[2], landmarks[3])

                # Landmark position based on complete frame
                landmarks_viz = landmarks
                landmarks_viz[0] = landmarks_viz[0] + xmin
                landmarks_viz[1] = landmarks_viz[1] + ymin
                landmarks_viz[2] = landmarks_viz[2] + xmin
                landmarks_viz[3] = landmarks_viz[3] + ymin

                crop_coords_viz = (crop_coords[0] + xmin, crop_coords[1] +
                                   ymin, crop_coords[2] + xmin,
                                   crop_coords[3] + ymin, crop_coords[4] +
                                   xmin, crop_coords[5] + ymin,
                                   crop_coords[6] + xmin,
                                   crop_coords[7] + ymin)

                left_eye_viz = (landmarks_viz[0], landmarks_viz[1])
                right_eye_viz = (landmarks_viz[2], landmarks_viz[3])

                third_eye_viz_x = (landmarks_viz[2] -
                                   landmarks_viz[0]) / 2 + landmarks_viz[0]
                third_eye_viz_y = (landmarks_viz[3] -
                                   landmarks_viz[1]) / 2 + landmarks_viz[1]
                third_eye_viz = (third_eye_viz_x, third_eye_viz_y)
                #print(landmarks_viz[0], landmarks_viz[2], third_eye_viz_x)

                # Head pose estimation
                head_pose = hpm.predict(face_crop.copy())
                print("Head pose: " + str(head_pose))
                (yaw, pitch, roll) = head_pose
                frame = display_head_pose(frame, pitch, roll, yaw)

                # Send inputs to GazeEstimator
                gaze_vector = gem.predict(head_pose, left_eye_crop,
                                          right_eye_crop)

                if args.stats == 1:
                    inference_time = time.time() - start_time
                    inference_times.append(inference_time)

                print(gaze_vector)
                frame = display_gaze(frame, gaze_vector)

                # Control the mouse
                if frame_count % 5 == 0:
                    mouse_x, mouse_y = get_mouse_vector(gaze_vector, roll)
                    print("Mouse vector:" + str(mouse_x) + " - " +
                          str(mouse_y))
                    mouse.move(mouse_x, mouse_y)
                    currentMouseX, currentMouseY = pyautogui.position()
                    print("Mouse coordinates: " + str(currentMouseX) + ", " +
                          str(currentMouseY))

                if args.visual_flag == 1:

                    frame = draw_bounding_box(frame, face_coords)

                    left_eye_frame = crop_coords_viz[0:4]
                    right_eye_frame = crop_coords_viz[4:]
                    frame = draw_bounding_box(frame, left_eye_frame)
                    frame = draw_bounding_box(frame, right_eye_frame)

                    frame = visualize_landmark(frame, left_eye_viz)
                    frame = visualize_landmark(frame,
                                               right_eye_viz,
                                               color=(0, 0, 255))

                    frame = visualize_gaze(frame, gaze_vector, landmarks_viz)

                    # visualize the axes of the HeadPoseEstimator results
                    #frame = hpm.draw_axes(frame.copy(), center_of_face, yaw, pitch, roll, scale, focal_length)
                    frame = hpm.draw_axes(frame.copy(), third_eye_viz, yaw,
                                          pitch, roll, scale, focal_length)
                    #hdm.draw_axes(frame.copy(), center_of_face, yaw, pitch, roll, scale, focal_length)

                cv2.imshow('preview', frame)
                cv2.imshow('left eye', left_eye_crop)
                cv2.imshow('right eye', right_eye_crop)

            except Exception as e:
                print("Unable to predict using model: " + str(e) +
                      " for frame " + str(frame_count))
                log.error("Unable to predict using model: " + str(e) +
                          " for frame " + str(frame_count))
            continue

    if args.stats == 1 and inference_times:
        avg_inference_time = sum(inference_times) / len(inference_times)
        print("Average inference time: " + str(avg_inference_time))
        log.info("Average inference time: " + str(avg_inference_time))
        log.info("Load time for face detection model: " + str(fdm_load_time))
        log.info("Load time for facial landmarks model: " + str(flm_load_time))
        log.info("Load time for head pose detection model: " +
                 str(hpm_load_time))
        log.info("Load time for gaze estimation model: " + str(gem_load_time))
    cv2.destroyAllWindows()
    feed.close()
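
# The example above maps the gaze vector through get_mouse_vector(gaze_vector,
# roll) before moving the mouse. A common implementation, sketched here as an
# assumption, rotates the gaze (x, y) by the head's roll angle (degrees, as
# the head-pose models output) so cursor motion stays aligned with the screen:
import math


def get_mouse_vector(gaze_vector, roll):
    cos_r = math.cos(roll * math.pi / 180.0)
    sin_r = math.sin(roll * math.pi / 180.0)
    mouse_x = gaze_vector[0] * cos_r + gaze_vector[1] * sin_r
    mouse_y = -gaze_vector[0] * sin_r + gaze_vector[1] * cos_r
    return mouse_x, mouse_y
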
def main(args):
    # set log level
    levels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR
    }

    log_level = levels.get(args.log_level, logging.ERROR)

    logging.basicConfig(level=log_level)

    mouse_control = MouseController('high', 'fast')

    logging.info("Model Loading Please Wait ..")
    face_det = FaceDetection(args.face_detection, args.device)
    facial_det = FaceLandmark(args.face_landmark, args.device)
    head_pose_est = HeadPoseEstimation(args.head_pose, args.device)
    gaze_est = GazeEstimation(args.gaze_estimation, args.device)
    logging.info("Model loading successfully")

    inp = InputFeeder(input_type='video', input_file=args.input)
    inp.load_data()

    face_det.load_model()
    facial_det.load_model()
    head_pose_est.load_model()
    gaze_est.load_model()

    video_writer = cv2.VideoWriter(args.output_dir + '/demo_output11.mp4',
                                   cv2.VideoWriter_fourcc(*'MPEG'), 15,
                                   (1920, 1080), True)

    cv2.namedWindow('gaze')
    for frame in inp.next_batch():
        # Stop when the feeder runs out of frames
        if frame is None:
            break
        crop_face, crop_coords = face_det.predict(frame,
                                                  visualize=args.visualize)

        left_eye, right_eye, left_eye_crop, right_eye_crop = facial_det.predict(
            crop_face, visualize=args.visualize)
        head_pose = head_pose_est.predict(crop_face, visualize=args.visualize)

        (new_x, new_y), gaze_vector = gaze_est.predict(left_eye_crop,
                                                       right_eye_crop,
                                                       head_pose)

        left_eye_gaze = int(left_eye[0] +
                            gaze_vector[0] * 100), int(left_eye[1] -
                                                       gaze_vector[1] * 100)
        right_eye_gaze = int(right_eye[0] +
                             gaze_vector[0] * 100), int(right_eye[1] -
                                                        gaze_vector[1] * 100)

        cv2.arrowedLine(crop_face, left_eye, left_eye_gaze, (0, 0, 255), 2)
        cv2.arrowedLine(crop_face, right_eye, right_eye_gaze, (0, 0, 255), 2)

        video_writer.write(frame)
        mouse_control.move(new_x, new_y)

        if args.show_result:
            cv2.imshow('gaze', frame)
            cv2.waitKey(1)

    inp.close()
    video_writer.release()
    cv2.destroyAllWindows()
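
# The arrowed-line drawing above projects the gaze vector out from each eye
# landmark. A reusable sketch of that projection (draw_gaze_arrow is a
# hypothetical helper, not part of the original code):
import cv2


def draw_gaze_arrow(image, eye_point, gaze_vector, scale=100,
                    color=(0, 0, 255), thickness=2):
    # Negate the y component because image coordinates grow downward.
    start = (int(eye_point[0]), int(eye_point[1]))
    end = (int(eye_point[0] + gaze_vector[0] * scale),
           int(eye_point[1] - gaze_vector[1] * scale))
    cv2.arrowedLine(image, start, end, color, thickness)
    return image
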
# Example 12
def main():
    """
    """

    # Grab command line args
    args = build_argparser().parse_args()

    input_src = args.input
    device = args.device
    extension = args.cpu_extension
    prob_threshold = args.prob_threshold

    face_detection_model = args.facedetectionmodel
    head_pose_model = args.headposemodel
    landmarks_model = args.facelandmarksnmodel
    gaze_estimation_model = args.gazeestimationmodel

    # Create log object set for console output and set log level
    log_obj = log.getLogger()
    log_obj.setLevel(LOGLEVEL)

    console_handler = log.StreamHandler()
    console_handler.setLevel(LOGLEVEL)
    log_obj.addHandler(console_handler)

    # Create detection objects
    face_detection_obj = FaceDetectionModel(face_detection_model, device,
                                            extension)
    head_pose_obj = HeadPoseModel(head_pose_model, device, extension)
    landmarks_obj = LandmarksModel(landmarks_model, device, extension)
    gaze_estimation_obj = GazeEstimationModel(gaze_estimation_model, device,
                                              extension)

    # Create mouse controller object
    mouse_controller = MouseController('medium', 'fast')
    # Place mouse at the center of the screen
    mouse_controller.init_position()
    log_obj.info("[Info]: Place mouse at the center of the screen")

    # Place holder for total inferencing time
    total_inference_time = 0

    # Load models and get the model loading times
    start_time = time.time()
    face_detection_obj.load_model()
    end_time = time.time()
    face_detection_loading_time = end_time - start_time

    start_time = time.time()
    head_pose_obj.load_model()
    end_time = time.time()
    head_pose_loading_time = end_time - start_time

    start_time = time.time()
    landmarks_obj.load_model()
    end_time = time.time()
    landmarks_detection_loading_time = end_time - start_time

    start_time = time.time()
    gaze_estimation_obj.load_model()
    end_time = time.time()
    gaze_estimation_loading_time = end_time - start_time

    # Configure input video source
    if input_src.lower() == 'cam':
        input_channel = InputFeeder(input_type='cam')
    elif not os.path.exists(input_src):
        log.error("Video file not found! Exiting....")
        exit(1)
    else:
        input_channel = InputFeeder(input_type='video', input_file=input_src)
        log_obj.info("[Info]: Opening video file ...")

    input_channel.load_data()
    video_width = int(input_channel.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    video_height = int(input_channel.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(input_channel.cap.get(cv2.CAP_PROP_FPS))

    frame_counter = 0
    total_face_inf_time = 0
    total_head_inf_time = 0
    total_landmarks_inf_time = 0
    total_gaze_inf_time = 0
    frame_processing_time = 0

    # Process each frame
    try:
        for frame in input_channel.next_batch():
            frame_processing_start_time = time.time()

            frame_counter = frame_counter + 1
            key = cv2.waitKey(60)

            # Use face detection to find cropped face and provide face coordinates
            cropped_face, face_coords, face_inference_time = face_detection_obj.predict(
                frame, prob_threshold)
            total_face_inf_time = total_face_inf_time + face_inference_time

            #  Now use cropped face for head pose detection
            head_pose_estimate, head_inference_time = head_pose_obj.predict(
                cropped_face, prob_threshold)
            total_head_inf_time = total_head_inf_time + head_inference_time

            #  Now use cropped face for landmarks detection
            cropped_left_eye, cropped_right_eye, eyes_coords, converted_landmarks, landmarks_inference_time = landmarks_obj.predict(
                cropped_face, prob_threshold)
            total_landmarks_inf_time = total_landmarks_inf_time + landmarks_inference_time

            #  Finally gaze estimation
            gaze_vector, gaze_estimate_time = gaze_estimation_obj.predict(
                cropped_left_eye, cropped_right_eye, head_pose_estimate)
            total_gaze_inf_time = total_gaze_inf_time + gaze_estimate_time

            # Move the mouse
            #mouse_controller.move(gaze_vector[0], gaze_vector[1])

            # Check potential visualize flags: 'F', 'H', 'L', 'G'
            # If a flag exists, process the image to show inference results
            if args.visualize is not None:

                visualize_flag = str(args.visualize)

                # Draw bounding box around detected face
                if 'F' in visualize_flag:
                    cv2.rectangle(frame,
                                  (face_coords[0][0], face_coords[0][1]),
                                  (face_coords[0][2], face_coords[0][3]),
                                  (0, 255, 0), 2)

                # Show head pose parameters
                if 'H' in visualize_flag:
                    cv2.putText(
                        frame,
                        "Head pose: yaw: {:.3f}, pitch: {:.3f}, roll: {:.3f}".
                        format(head_pose_estimate[0], head_pose_estimate[1],
                               head_pose_estimate[2]), (10, 20),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 5)

                # Draw dots on detected facial landmarks
                if 'L' in visualize_flag:
                    cv2.circle(frame,
                               (converted_landmarks[0] + face_coords[0][0],
                                converted_landmarks[1] + face_coords[0][1]),
                               10, (0, 255, 0), 5)
                    cv2.circle(frame,
                               (converted_landmarks[2] + face_coords[0][0],
                                converted_landmarks[3] + face_coords[0][1]),
                               10, (0, 255, 0), 5)
                    cv2.circle(frame,
                               (converted_landmarks[4] + face_coords[0][0],
                                converted_landmarks[5] + face_coords[0][1]),
                               10, (0, 255, 0), 5)
                    cv2.circle(frame,
                               (converted_landmarks[6] + face_coords[0][0],
                                converted_landmarks[7] + face_coords[0][1]),
                               10, (0, 255, 0), 5)
                    cv2.circle(frame,
                               (converted_landmarks[8] + face_coords[0][0],
                                converted_landmarks[9] + face_coords[0][1]),
                               10, (0, 255, 0), 5)

                # Display gaze parameters
                if 'G' in visualize_flag:
                    cv2.putText(
                        frame,
                        "Gaze estimate: x: {:.3f}, y: {:.3f}, z: {:.3f}".
                        format(gaze_vector[0], gaze_vector[1], gaze_vector[2]),
                        (10, 100), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 5)

            # Show a size-reduced frame for visual comparison
            resized_frame = cv2.resize(frame, (640, 360))
            cv2.imshow('frame', resized_frame)

            if frame_counter % 4 == 0:
                mouse_controller.move(gaze_vector[0], gaze_vector[1])

            frame_processing_time = frame_processing_time + (
                time.time() - frame_processing_start_time) * 1000

            if key == 27:
                break

    except Exception as e:
        #traceback.print_exc()
        if 'shape' in str(e):
            log_obj.info("Video feed finished")
        else:
            log_obj.error("[ERROR]: " + str(e))

    # All done, cleaning up
    cv2.destroyAllWindows()
    input_channel.close()

    # Print out statistics
    log_obj.info("[Info]: Video source FPS: " + str(fps))
    log_obj.info("[Info]: Total frame count: " + str(frame_counter))
    log_obj.info("")
    log_obj.info("[Info]: Face detection model loading time: {:.3f} ms".format(
        face_detection_loading_time * 1000))
    log_obj.info("[Info]: Head pose model loading time: {:.3f} ms".format(
        head_pose_loading_time * 1000))
    log_obj.info(
        "[Info]: Facial landmarks detection model loading time: {:.3f} ms".
        format(landmarks_detection_loading_time * 1000))
    log_obj.info(
        "[Info]: Gaze estimation model loading time: {:.3f} ms".format(
            gaze_estimation_loading_time * 1000))
    log_obj.info("")
    log_obj.info(
        "[Info]: Average per-frame total processing time: {:.3f} ms".format(
            frame_processing_time / frame_counter))
    log_obj.info("[Info]: Average face inferencing time: {:.3f} ms".format(
        total_face_inf_time / frame_counter))
    log_obj.info(
        "[Info]: Average head pose inferencing time: {:.3f} ms".format(
            total_head_inf_time / frame_counter))
    log_obj.info(
        "[Info]: Average facial landmarks inferencing time: {:.3f} ms".format(
            total_landmarks_inf_time / frame_counter))
    log_obj.info("[Info]: Average gaze estimate time: {:.3f} ms".format(
        total_gaze_inf_time / frame_counter))
# Example 13
def run_controller(args):
    #     print(args.save)
    feeder = None

    if args.input == "cam":
        feeder = InputFeeder("cam")

    elif args.input.endswith('.jpg') or args.input.endswith('.bmp'):
        if not os.path.isfile(args.input):
            log.error("Unable to find specified image file")
            exit(1)
        feeder = InputFeeder("image", args.input, args.save)

    else:
        if not os.path.isfile(args.input):
            log.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder("video", args.input, args.save)

    feeder.load_data()

    mc = MouseController('medium', 'fast')

    model_face = Face_Detector()
    model_face.load_model(args.model_fd, args.device, args.extension)

    model_pose = Pose_Estimator()
    model_pose.load_model(args.model_pe, args.device, args.extension)

    model_landmark = Facial_Landmarks()
    model_landmark.load_model(args.model_fl, args.device, args.extension)

    model_gaze = Gaze_Estimator()
    model_gaze.load_model(args.model_ge, args.device, args.extension)

    frame_count = 0
    for b, frame in feeder.next_batch():
        frame_count += 1
        preview = np.copy(frame)
        crop_face, face_count, points = model_face.predict(
            preview, args.thres_fd)

        key_pressed = cv2.waitKey(30)
        if (face_count == 0):
            if (b or key_pressed == 27):
                break

            log.error('No face detected')
            feeder.save_file(preview)
            continue

        angles = model_pose.predict(preview, crop_face)
        left_eye, right_eye, eye_points = model_landmark.predict(
            preview, crop_face, points)

        mx, my = model_gaze.predict(preview, left_eye, right_eye, angles,
                                    eye_points)
        feeder.save_file(preview)

        if key_pressed == 27:
            break

        if frame_count % 5 == 0:
            if args.draw_lines:
                cv2.imshow('video', cv2.resize(preview, (500, 500)))
            else:
                cv2.imshow('video', cv2.resize(frame, (500, 500)))
            mc.move(mx, my)

    feeder.close()
    cv2.destroyAllWindows()
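
# The final example below projects the head-pose angles onto the image plane
# to draw orientation axes (the x1/y1/x2 math near the end of the listing).
# A self-contained sketch of that projection, with the second axis completed
# the way the standard OpenVINO head-pose demo does it (an assumption, since
# the listing is truncated):
import math


def head_pose_axis_points(yaw, pitch, roll, origin, length=100):
    yaw = math.radians(yaw)
    pitch = math.radians(-pitch)  # flip pitch so positive looks up on screen
    roll = math.radians(roll)
    ox, oy = origin
    x1 = ox + int(length * (math.cos(yaw) * math.cos(roll)))
    y1 = oy + int(length * (math.cos(pitch) * math.sin(roll) +
                            math.cos(roll) * math.sin(pitch) * math.sin(yaw)))
    x2 = ox + int(length * (-math.cos(yaw) * math.sin(roll)))
    y2 = oy + int(length * (math.cos(pitch) * math.cos(roll) -
                            math.sin(pitch) * math.sin(yaw) * math.sin(roll)))
    return (x1, y1), (x2, y2)
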
def main():
    args = build_argparser().parse_args()

    log.debug(args)

    # Load face detection model
    faceDetection = ModelFaceDetection(args.face_detection_model,
                                       args.prob_threshold, args.device,
                                       args.cpu_extension)
    start_model_load_time = time.time()
    faceDetection.load_model()
    facedetection_model_load_time = time.time() - start_model_load_time

    log.debug('Facedetection model load time. {}'.format(
        facedetection_model_load_time))

    #Load Head pose estimation model
    headPoseEstimation = ModelHeadPoseEstimation(
        args.headpose_estimation_model, args.prob_threshold, args.device,
        args.cpu_extension)
    start_model_load_time = time.time()
    headPoseEstimation.load_model()
    headposeestimation_model_load_time = time.time() - start_model_load_time

    log.debug('Head pose estimation model load time. {}'.format(
        headposeestimation_model_load_time))

    #Facial landmark model
    facialLandmarkDetection = ModelFacialLandmarkDetection(
        args.landmarks_regression_model, args.prob_threshold, args.device,
        args.cpu_extension)
    start_model_load_time = time.time()
    facialLandmarkDetection.load_model()
    facialLandmarkDetection_model_load_time = time.time(
    ) - start_model_load_time

    log.debug('Facial landmarks detection model load time. {}'.format(
        facialLandmarkDetection_model_load_time))

    #Gaze estimation model
    gazeEstimation = ModelGazeEstimation(args.gaze_estimation_model,
                                         args.prob_threshold, args.device,
                                         args.cpu_extension)
    start_model_load_time = time.time()
    gazeEstimation.load_model()
    gazeEstimation_model_load_time = time.time() - start_model_load_time

    log.debug('Gaze estimation model load time. {}'.format(
        gazeEstimation_model_load_time))

    # Feeder
    feeder = InputFeeder(args.input)
    feeder.load_data()

    counter = 0
    window_name = 'frame'
    # Create the mouse controller once rather than on every second frame
    mouse = MouseController('high', 'fast')

    facedetection_inference_time_sum = 0
    headpose_inference_time_sum = 0
    faciallandmark_inference_time_sum = 0
    gazeestimation_inference_time_sum = 0

    # Process frames
    for frame in feeder.next_batch():
        if frame is None:
            break

        key_pressed = cv2.waitKey(1)
        if key_pressed == 27:
            break

        #Face detection
        start_inference_time = time.time()
        face_image, face_coords = faceDetection.predict(frame)
        facedetection_inference_time = time.time() - start_inference_time
        facedetection_inference_time_sum += facedetection_inference_time

        #Head pose estimation
        start_inference_time = time.time()
        yaw, pitch, roll = headPoseEstimation.predict(face_image)
        headpose_inference_time = time.time() - start_inference_time
        headpose_inference_time_sum += headpose_inference_time

        # log.debug('Head pose yaw, pitch, roll {}, {}, {}'.format(yaw, pitch, roll))

        #Facial landmarks detection
        start_inference_time = time.time()
        left_eye_image, right_eye_image, eye_coords = facialLandmarkDetection.predict(
            face_image)
        faciallandmark_inference_time = time.time() - start_inference_time
        faciallandmark_inference_time_sum += faciallandmark_inference_time

        # cv2.imwrite('left_eye.png', left_eye_image)
        # cv2.imwrite('right_eye.png', right_eye_image)
        # cv2.imwrite('face.png', face_image)

        #Gaze estimation
        start_inference_time = time.time()
        gaze_vector = gazeEstimation.predict(left_eye_image, right_eye_image,
                                             [yaw, pitch, roll])
        gazeestimation_inference_time = time.time() - start_inference_time
        gazeestimation_inference_time_sum += gazeestimation_inference_time

        #log.debug('Gaze Vector {}, {}'.format(gaze_vector[0], gaze_vector[1]))

        #Mouse: move every other frame
        if (counter % 2 == 0):
            mouse.move(gaze_vector[0], gaze_vector[1])

        #Display frame
        if (args.show):
            font = cv2.FONT_HERSHEY_SIMPLEX

            if len(face_coords) > 0:
                #face rect
                fxmin = face_coords[0][0]
                fymin = face_coords[0][1]
                fxmax = face_coords[0][2]
                fymax = face_coords[0][3]

                cv2.rectangle(frame, (fxmin, fymin), (fxmax, fymax),
                              (200, 0, 0), 2)

                #eye rect
                cv2.rectangle(
                    frame,
                    (fxmin + eye_coords[0][0], fymin + eye_coords[0][1]),
                    (fxmin + eye_coords[0][2], fymin + eye_coords[0][3]),
                    (0, 200, 0), 2)
                cv2.rectangle(
                    frame,
                    (fxmin + eye_coords[1][0], fymin + eye_coords[1][1]),
                    (fxmin + eye_coords[1][2], fymin + eye_coords[1][3]),
                    (0, 200, 0), 2)

                #Face position
                length = 100
                yaw = math.radians(yaw)
                pitch = math.radians(-pitch)
                roll = math.radians(roll)
                x1 = int(length * (math.cos(yaw) * math.cos(roll)))
                y1 = int(length *
                         (math.cos(pitch) * math.sin(roll) +
                          math.cos(roll) * math.sin(pitch) * math.sin(yaw)))

                x2 = int(length * (-math.cos(yaw) * math.sin(roll)))
                y2 = int(length *
                         (math.cos(pitch) * math.cos(roll) +
                          math.sin(pitch) * math.sin(yaw) * math.sin(roll)))

                x3 = int(length * (math.sin(yaw)))
                y3 = int(length * (-math.cos(yaw) * math.sin(pitch)))

                cv2.line(frame, (fxmin, fymin), (fxmin + x1, fymin + y1),
                         (0, 255, 0), 2)
                cv2.line(frame, (fxmin, fymin), (fxmin + x2, fymin + y2),
                         (255, 0, 0), 2)
                cv2.line(frame, (fxmin, fymin), (fxmin + x3, fymin + y3),
                         (0, 0, 255), 2)

                #gaze
                x = int(length * gaze_vector[0])
                y = -int(length * gaze_vector[1])

                cv2.line(frame, (fxmax, fymax), (fxmax + x, fymax + y),
                         (0, 255, 255), 5)

            else:
                cv2.putText(frame, 'Face not detected', (10, 10), font, 1,
                            (255, 255, 255), 1)

            cv2.imshow(
                window_name,
                cv2.resize(frame,
                           (int(frame.shape[1] / 3), int(frame.shape[0] / 3))))

        counter += 1

    log.debug("Face detection inference time average {}".format(
        facedetection_inference_time_sum / counter))
    log.debug("Headpose inference time average  {}".format(
        headpose_inference_time_sum / counter))
    log.debug("Faciallandmark inference time average {}".format(
        faciallandmark_inference_time_sum / counter))
    log.debug("Gazeestimation inference time average {}".format(
        gazeestimation_inference_time_sum / counter))

    if (args.show):
        cv2.destroyWindow(window_name)
def main():

    # Grab command line args
    args = build_argparser().parse_args()
    previewFlags = args.previewFlags

    logger = logging.getLogger()
    inputFilePath = args.input
    inputFeeder = None
    if inputFilePath.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFilePath):
            logger.error("Unable to find specified video file")
            exit(1)
        inputFeeder = InputFeeder("video", inputFilePath)

    modelPathDict = {
        'FaceDetectionModel': args.facedetectionmodel,
        'FacialLandmarksDetectionModel': args.faciallandmarkmodel,
        'GazeEstimationModel': args.gazeestimationmodel,
        'HeadPoseEstimationModel': args.headposemodel
    }

    for fileNameKey in modelPathDict.keys():
        if not os.path.isfile(modelPathDict[fileNameKey]):
            logger.error("Unable to find specified " + fileNameKey +
                         " xml file")
            exit(1)

    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device,
                             args.cpu_extension)
    fldm = FacialLandmarksDetectionModel(
        modelPathDict['FacialLandmarksDetectionModel'], args.device,
        args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'],
                              args.device, args.cpu_extension)
    hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'],
                                   args.device, args.cpu_extension)

    mc = MouseController('medium', 'fast')

    inputFeeder.load_data()
    fdm.load_model()
    fldm.load_model()
    hpem.load_model()
    gem.load_model()

    frame_count = 0
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        croppedFace, face_coords = fdm.predict(frame.copy(),
                                               args.prob_threshold)
        if type(croppedFace) == int:
            logger.error("Unable to detect the face.")
            if key == 27:
                break
            continue

        hp_out = hpem.predict(croppedFace.copy())

        left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy())

        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out)

        if len(previewFlags) != 0:
            preview_frame = frame.copy()
            if 'fd' in previewFlags:
                #cv2.rectangle(preview_frame, (face_coords[0], face_coords[1]), (face_coords[2], face_coords[3]), (255,0,0), 3)
                preview_frame = croppedFace
            if 'fld' in previewFlags:
                cv2.rectangle(croppedFace,
                              (eye_coords[0][0] - 10, eye_coords[0][1] - 10),
                              (eye_coords[0][2] + 10, eye_coords[0][3] + 10),
                              (0, 255, 0), 3)
                cv2.rectangle(croppedFace,
                              (eye_coords[1][0] - 10, eye_coords[1][1] - 10),
                              (eye_coords[1][2] + 10, eye_coords[1][3] + 10),
                              (0, 255, 0), 3)
                #preview_frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = croppedFace

            if 'hp' in previewFlags:
                cv2.putText(
                    preview_frame,
                    "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                    format(hp_out[0], hp_out[1], hp_out[2]), (10, 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)
            if 'ge' in previewFlags:
                x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] *
                                                        12), 160
                le = cv2.line(left_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                re = cv2.line(right_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                croppedFace[eye_coords[0][1]:eye_coords[0][3],
                            eye_coords[0][0]:eye_coords[0][2]] = le
                croppedFace[eye_coords[1][1]:eye_coords[1][3],
                            eye_coords[1][0]:eye_coords[1][2]] = re
                #preview_frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = croppedFace

            cv2.imshow("visualization", cv2.resize(preview_frame, (500, 500)))

        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break
    logger.error("VideoStream ended...")
    cv2.destroyAllWindows()
    inputFeeder.close()
Example n. 16
def main():

    # Grab command line args
    args = build_argparser().parse_args()
    # Get the input file path
    inputFilePath = args.input
    # Visualization flags
    visual_flag = args.visualization_flag
    # Initialize the input feeder
    inputFeeder = None

    # Handle video file or CAM (like webcam)
    if args.input =="CAM":
            inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(args.input):
            log.info("Unable to find specified video file")
            sys.exit(1)
        inputFeeder = InputFeeder("video",args.input)

    
    # Define the model path dictionary for the four Intel pre-trained models
    modelPathDict = {'FaceDetectionModel': args.face_detection_model, 'FacialLandmarksDetectionModel': args.facial_landmark_model,
                     'GazeEstimationModel': args.gaze_estimation_model, 'HeadPoseEstimationModel': args.head_pose_model}
    
    # Check that each model XML file exists
    for fileNameKey in modelPathDict.keys():
        if not os.path.isfile(modelPathDict[fileNameKey]):
            log.info("Unable to find specified " + fileNameKey + " xml file")
            sys.exit(1)
    
    # Define the Intel pre-trained model objects
    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device, args.cpu_extension)
    fldm = FacialLandmarksDetectionModel(modelPathDict['FacialLandmarksDetectionModel'], args.device, args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'], args.device, args.cpu_extension)
    hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'], args.device, args.cpu_extension)

    # Precision and speed presets for the mouse controller
    mc = MouseController('medium', 'fast')
    
    # Loading Input Feeder
    inputFeeder.load_data()
    
    # Load the four pre-trained models and measure the total model loading time.
    # This makes it easy to compare load times across precisions (FP32, FP16 and INT8).
    start_time_1 = time.time()
    fdm.load_model()
    fldm.load_model()
    hpem.load_model()
    gem.load_model()
    total_model_load_time = time.time() - start_time_1
    print("Total model load time for all four Intel pre-trained models (seconds): {:.3f}".format(total_model_load_time))
    
    
    frame_count = 0
    start_time = time.time()
    
    # Loop over the input feeder until the stream ends or ESC is pressed
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (450, 450)))

        key = cv2.waitKey(60)
        # Extract the detected face
        croppedFace, face_coords = fdm.predict(frame.copy(), args.prob_threshold)
        if type(croppedFace) == int:
            log.info("Unable to detect the face.")
            if key == 27:
                break
            continue
        
        # Head pose detection
        hp_out = hpem.predict(croppedFace.copy())

        # Landmarks detection (left eye, right eye, eye coordinates)
        left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy())

        # Mouse coordinates and gaze vector detection
        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out)

        # Variables for visualisation
        # Four face coordinates for the rectangle (xmin, ymin, xmax, ymax)
        x_minimum = face_coords[0]
        y_minimum = face_coords[1]
        x_maximum = face_coords[2]
        y_maximum = face_coords[3]

        # Padding around each eye
        eye_surrounding_area = 10

        # Four coordinates of the left eye from the eye coordinates
        l_l1 = eye_coords[0][0]
        l_l2 = eye_coords[0][1]
        l_l3 = eye_coords[0][2]
        l_l4 = eye_coords[0][3]

        # Four coordinates of the right eye from the eye coordinates
        r_r1 = eye_coords[1][0]
        r_r2 = eye_coords[1][1]
        r_r3 = eye_coords[1][2]
        r_r4 = eye_coords[1][3]

        # Yaw (pose angle), pitch and roll from the head pose output
        pose_angle = hp_out[0]
        pitch = hp_out[1]
        roll = hp_out[2]
            
        # Visualize face, landmarks, head pose and gaze
        if len(visual_flag) != 0:
            preview_frame = frame.copy()
            if 'fd' in visual_flag:
                # Draw a rectangle from the four face coordinates (xmin, ymin, xmax, ymax)
                cv2.rectangle(preview_frame, (x_minimum, y_minimum), (x_maximum, y_maximum), (20, 20, 150), 3)

            if 'fld' in visual_flag:
                # Draw a rectangle around each eye using the eye coordinates plus padding
                # Left eye
                cv2.rectangle(preview_frame, (l_l1 - eye_surrounding_area, l_l2 - eye_surrounding_area), (l_l3 + eye_surrounding_area, l_l4 + eye_surrounding_area), (60, 255, 0), 2)
                # Right eye
                cv2.rectangle(preview_frame, (r_r1 - eye_surrounding_area, r_r2 - eye_surrounding_area), (r_r3 + eye_surrounding_area, r_r4 + eye_surrounding_area), (60, 255, 0), 2)

            if 'hp' in visual_flag:
                # Overlay the yaw (pose angle), pitch and roll on the preview frame
                cv2.putText(preview_frame, "Pose Angles:{:.2f} | pitch:{:.2f} | roll:{:.2f}".format(pose_angle, pitch, roll), (20, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 60), 1)

            if 'ge' in visual_flag:
                # Centers of the left and right eyes, relative to the face crop
                le_x = (l_l1 + l_l3) / 2
                le_y = (l_l2 + l_l4) / 2
                re_x = (r_r1 + r_r3) / 2
                re_y = (r_r2 + r_r4) / 2
                # Shift into frame coordinates using the face box origin
                le_center = int(x_minimum + le_x), int(y_minimum + le_y)
                re_center = int(x_minimum + re_x), int(y_minimum + re_y)
                eyes_center = [le_center, re_center]
                le_center_x = int(eyes_center[0][0])
                le_center_y = int(eyes_center[0][1])
                re_center_x = int(eyes_center[1][0])
                re_center_y = int(eyes_center[1][1])
                # x and y components of the gaze vector
                g_x, g_y = gaze_vector[0:2]

                # Draw arrowed gaze lines from each eye center
                # (image y grows downward, so g_y is negated)
                cv2.arrowedLine(preview_frame, (le_center_x, le_center_y), (le_center_x + int(g_x * 100), le_center_y + int(-g_y * 100)), (0, 50, 160), 1)
                cv2.arrowedLine(preview_frame, (re_center_x, re_center_y), (re_center_x + int(g_x * 100), re_center_y + int(-g_y * 100)), (0, 50, 160), 1)

            cv2.imshow("visualization", cv2.resize(preview_frame, (450, 450)))
        
        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break
    log.info("VideoStream has ended")
    cv2.destroyAllWindows()
    inputFeeder.close()

    # Total wall-clock time and frames per second
    # (includes display and mouse moves, not just inference)
    total_time = time.time() - start_time
    total_inference_time = total_time
    fps = frame_count / total_inference_time
    print("Inference time: {:.3f}".format(total_inference_time))
    print("FPS: {}".format(fps))
Example n. 17
def model_pipelines(args):
    
    # Parameters which were parsed are assigned
    
    #device = args.dev
    #customLayers = args.lay
    inputFile = args.inp
    visual_flag = args.vf
    
    faceDetectionModel = args.mfd
    landmarksDetectionModel = args.mld
    headPoseEstimationModel = args.mhp
    gazeDetectionModel = args.mgd
    start_time = time.time()
    # Logging is enabled
    log = logging.getLogger(__name__)

    log.info('----------THE BEGINNING----------')
    log.info('Start Time: {0}'.format(start_time))

    # The feed is initialised
    single_image = ['jpg','tif','png','jpeg', 'bmp']
    if inputFile.split(".")[-1].lower() in single_image:
        input_feed = InputFeeder('image', inputFile)
    elif args.inp == 'cam':
        input_feed = InputFeeder('cam')
    else:
        input_feed = InputFeeder('video', inputFile)

    # Feed data is loaded
    log.info('Loading data...')
    input_feed.load_data()
    log.info('Data Loaded. Beginning inference...')

    # The models are initialised and loaded here; each load is timed separately
    # (timing them all from the same start point would make every figure
    # include the loads that came before it)

    face_model_load_start_time = time.time()
    ppl_fd = Face_Detection(faceDetectionModel)
    face_model_load_time = time.time() - face_model_load_start_time
    log.info('Face Detection object initialized')

    landmark_model_load_start_time = time.time()
    ppl_fl = Facial_Landmarks_Detection(landmarksDetectionModel)
    landmark_model_load_time = time.time() - landmark_model_load_start_time
    log.info('Facial Landmarks object initialized')

    headpose_model_load_start_time = time.time()
    ppl_hd = Head_Pose_Estimation(headPoseEstimationModel)
    headpose_model_load_time = time.time() - headpose_model_load_start_time
    log.info('Head Pose object initialized')

    gaze_model_load_start_time = time.time()
    ppl_ge = Gaze_Estimation(gazeDetectionModel)
    gaze_model_load_time = time.time() - gaze_model_load_start_time
    log.info('Gaze object initialized')

    log.info('All models loaded and checked')
    
    load_time = [face_model_load_time, landmark_model_load_time, headpose_model_load_time, gaze_model_load_time]
      
    # count the number of frames
    frameCount = 0

    # collate frames from the feeder and feed into the detection pipelines
    for flag, frame in input_feed.next_batch():

        if not flag:
            break
        frameCount += 1
        
        if frameCount % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(100)
        
        # Get the time for the model inference
        face_inference_start_time = time.time()
        face_crop = ppl_fd.predict(frame)
        face_inference_time = time.time() - face_inference_start_time
        
        if 'mfd' in visual_flag:
            cv2.imshow('The cropped face', face_crop)
            
        if type(face_crop) == int:
            log.info("No face can be detected")
            
            if key == 27:
                break
            
            continue
        
        # Get the time for the model inference
        landmark_inference_start_time = time.time()
        eye_image_left, eye_image_right, face_landmarked = ppl_fl.predict(face_crop.copy())
        landmark_inference_time = time.time() - landmark_inference_start_time
       
        # Get face landmark results
        if 'mld' in visual_flag:
            cv2.imshow('Face output', face_landmarked)
            
        if eye_image_left is None or eye_image_right is None:
            log.info("Landmarks could not be detected; check that the eyes are visible and the image is bright")
            continue
        
        # Get the time for the model inference
        headpose_inference_start_time = time.time()
        head_pose_angles, head_pose_image = ppl_hd.predict(face_crop.copy())   
        headpose_inference_time = time.time() - headpose_inference_start_time
        
        # Get head pose results
        if 'mhp' in visual_flag:
            cv2.imshow('Head Pose Angles', head_pose_image)
        
        # Get the time for the model inference
        gaze_inference_start_time = time.time()
        coord_x, coord_y = ppl_ge.predict(eye_image_left, eye_image_right, head_pose_angles)
        gaze_inference_time = time.time() - gaze_inference_start_time

        # Get gaze detection results
        if 'mgd' in visual_flag:
            cv2.putText(face_landmarked, "Estimated x:{:.2f} | Estimated y:{:.2f}".format(coord_x, coord_y), (10,20), cv2.FONT_HERSHEY_COMPLEX, 0.25, (0,255,0),1)
            cv2.imshow('Gaze Estimation', face_landmarked)


        mCoord = MouseController('medium','fast')
        
        # Move the mouse based on the coordinates received
        if frameCount % 5 == 0:
            mCoord.move(coord_x, coord_y)

        if key == 27:
            break
        
        inference_time = [face_inference_time, landmark_inference_time, headpose_inference_time, gaze_inference_time]
        results(args, inference_time, load_time)
        
        if key == ord('x'):
            log.warning('KeyboardInterrupt: `X` was pressed')
            results(args, inference_time, load_time)
            sys.exit()
        
        
    log.info('Elapsed Time: {0}'.format(time.time() - start_time))
    log.info('----------THE END----------')
    cv2.destroyAllWindows()
    input_feed.close()
Example n. 18
def main():

    # Grab command line args
    args = build_argparser().parse_args()
    logger = logging.getLogger()
    inputFilePath = args.input
    inputFeeder = None
    inference_time = None

    if inputFilePath.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFilePath):
            logger.error("Unable to find specified video file")
            exit(1)
        inputFeeder = InputFeeder("video", inputFilePath)
    #else:
    #	if not os.path.isfile(inputFilePath):
    #       logger.error("Unable to find specified image file")
    #       exit(1)
    #   inputFeeder = InputFeeder("image",inputFilePath)

    # Initialize variables with the input arguments
    modelPathDict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'FacialLandmarksDetectionModel': args.FacialLandmarksDetectionModel,
        'GazeEstimationModel': args.gazeEstimationModel,
        'HeadPoseEstimationModel': args.HeadPoseEstimationModel
    }

    for fileNameKey in modelPathDict.keys():
        if not os.path.isfile(modelPathDict[fileNameKey]):
            logger.error("Unable to find specified " + fileNameKey +
                         " xml file")
            exit(1)

    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device,
                             args.cpu_extension)
    flm = FacialLandmarksDetectionModel(
        modelPathDict['FacialLandmarksDetectionModel'], args.device,
        args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'],
                              args.device, args.cpu_extension)
    hpe = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'],
                                  args.device, args.cpu_extension)
    mc = MouseController('high', 'fast')

    inputFeeder.load_data()

    # Load Models and generate load times

    start_time = time.time()
    fdm.load_model()
    logger.error("Face detection model loaded: time: {:.3f} ms".format(
        (time.time() - start_time) * 1000))
    first_mark = time.time()
    flm.load_model()
    logger.error(
        "Facial landmarks detection model loaded: time: {:.3f} ms".format(
            (time.time() - first_mark) * 1000))
    second_mark = time.time()
    hpe.load_model()
    logger.error("Head pose estimation model loaded: time: {:.3f} ms".format(
        (time.time() - second_mark) * 1000))
    third_mark = time.time()
    gem.load_model()
    logger.error("Gaze estimation model loaded: time: {:.3f} ms".format(
        (time.time() - third_mark) * 1000))
    load_total_time = time.time() - start_time
    logger.error("Total loading time: time: {:.3f} ms".format(load_total_time *
                                                              1000))
    logger.error("Required models have been loaded..")

    frame_count = 0
    start_inf_time = time.time()
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (600, 800)))

        key = cv2.waitKey(60)
        croppedFace, face_coords = fdm.predict(frame.copy(),
                                               args.prob_threshold)
        if type(croppedFace) == int:
            logger.error("Unable to detect the face.")
            if key == 27:
                break
            continue
        hp_out = hpe.predict(croppedFace.copy())
        left_eye, right_eye, eye_coords = flm.predict(croppedFace.copy())
        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out)

        # Move the mouse only after new_mouse_coord has been computed
        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])

    inference_time = round(time.time() - start_inf_time, 1)
    total_frames = int(frame_count)
    fps = total_frames / inference_time
    logger.error("total inference time {} seconds".format(inference_time))
    logger.error("total frames {} frames".format(total_frames))
    logger.error("fps {} frame/second".format(fps))

    with open(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'RunReport.txt'), 'w') as R:
        R.write('Load Time: ' + str(load_total_time) + '\n')
        R.write('Inference Time: ' + str(inference_time) + '\n')
        R.write('Total frames processed: ' + str(total_frames) + '\n')
        R.write('fps: ' + str(fps) + '\n')
    logger.error("VideoStream ended...")
    cv2.destroyAllWindows()
    inputFeeder.close()
    atexit.register(profile.print_stats)
Example n. 19
    facial_landmark_model.load_model()
    facial_landmark_time = (time.time() - facial_landmark_start) * 1000

    gaze_model_start = time.time()
    gaze_estimation_model.load_model()
    gaze_model_time = (time.time() - gaze_model_start) * 1000

    total_loading_time = (time.time() - start_time) * 1000

    face_model.check_model()
    head_pose_model.check_model()
    facial_landmark_model.check_model()
    gaze_estimation_model.check_model()

    if input_file.lower() == 'cam':
        input_feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_file):
            logger.error("Unable to find video file for input")
            exit(1)
        input_feeder = InputFeeder(input_type='video', input_file=input_file)

    input_feeder.load_data()
    width = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(input_feeder.cap.get(cv2.CAP_PROP_FPS))

    writer = None
    green_color = (0, 255, 0)
    blue_color = (255, 0, 0)
    red_color = (0, 0, 255)
Example n. 20
def test_run(args):
    logging.getLogger().setLevel(logging.INFO)
    feeder = None
    activate_frame_count = 10
    logging.warning("Running default value activate frame count = 10")
    if args.input_type == 'video' or args.input_type == 'image':
        feeder = InputFeeder(args.input_type, args.input)
        if args.input == '../bin/demo.mp4':
            logging.warning("Running default setting and input")
    elif args.input_type == 'webcam':
        feeder = InputFeeder(args.input_type, args.input)
    else:
        logging.error("Input not found")
        exit(1)

    mouse_controller = MouseController(args.precision, args.speed)

    feeder.load_data()
    start_time = 0

    face_model_load_time = 0
    start_time = time.time()
    face_model = FaceDetection(args.face, args.device, args.cpu_extension)
    face_model.load_model()
    face_model_load_time = time.time() - start_time
    logging.info("Face Detection Model Loaded...")

    head_pose_estimation_load_time = 0
    start_time = time.time()
    head_pose_estimation = HeadPoseEstimation(args.headpose, args.device,
                                              args.cpu_extension)
    head_pose_estimation.load_model()
    head_pose_estimation_load_time = time.time() - start_time
    logging.info("Head Pose Detection Model Loaded...")

    facial_landmarks_detection_load_time = 0
    start_time = time.time()
    facial_landmarks_detection = FacialLandmarksDetection(
        args.landmarks, args.device, args.cpu_extension)
    facial_landmarks_detection.load_model()
    facial_landmarks_detection_load_time = time.time() - start_time
    logging.info("Facial Landmark Detection Model Loaded...")

    gaze_model_load_time = 0
    start_time = time.time()
    gaze_model = GazeEstimation(args.gazeestimation, args.device,
                                args.cpu_extension)
    gaze_model.load_model()
    gaze_model_load_time = time.time() - start_time
    logging.info("Gaze Estimation Model Loaded...")

    frame_count = 0

    total_face_model_inference_time = 0
    total_head_pose_estimation_inference_time = 0
    total_facial_landmarks_detection_inference_time = 0
    total_gaze_model_inference_time = 0
    start_time = 0
    for frame in feeder.next_batch():
        if frame is None:
            break
        frame_count += 1
        key = cv2.waitKey(60)

        start_time = time.time()
        first_face_box, first_face = face_model.predict(frame.copy())
        total_face_model_inference_time = total_face_model_inference_time + (
            time.time() - start_time)

        start_time = time.time()
        head_pose_output = head_pose_estimation.predict(first_face_box.copy())
        total_head_pose_estimation_inference_time = total_head_pose_estimation_inference_time + (
            time.time() - start_time)

        start_time = time.time()
        left_eye, right_eye, eye_coords = facial_landmarks_detection.predict(
            first_face_box.copy())
        total_facial_landmarks_detection_inference_time = total_facial_landmarks_detection_inference_time + (
            time.time() - start_time)

        start_time = time.time()
        move_to_coors_mouse = gaze_model.predict(left_eye, right_eye,
                                                 head_pose_output)
        total_gaze_model_inference_time = total_gaze_model_inference_time + (
            time.time() - start_time)

        if frame_count % activate_frame_count == 0 and (args.flag == "3"
                                                        or args.flag == "4"):
            mouse_controller.move(move_to_coors_mouse[0],
                                  move_to_coors_mouse[1])
            cv2.imshow('video', frame)
            key = cv2.waitKey(60)
        if key == 27:
            break

        if args.flag == "1":
            cv2.rectangle(frame, (first_face[0], first_face[1]),
                          (first_face[2], first_face[3]), (255, 0, 0))
            cv2.imshow('video', frame)
            key = cv2.waitKey(60)
        elif args.flag == "2":
            cv2.rectangle(facial_landmarks_detection.image,
                          (eye_coords[0], eye_coords[1]),
                          (eye_coords[2], eye_coords[3]), (255, 0, 0))
            cv2.imshow('video', facial_landmarks_detection.image)
            key = cv2.waitKey(60)
        elif args.flag == "3":
            if frame_count == 1:
                logging.info("Printing mouse coors: ")
            logging.info(move_to_coors_mouse)

    #Print Report
    if args.flag == "0":
        print('------------- BEGIN REPORT -------------')
        avg_inference_face_model = total_face_model_inference_time / frame_count
        avg_inference_headpose = total_head_pose_estimation_inference_time / frame_count
        avg_inference_facial_landmark = total_facial_landmarks_detection_inference_time / frame_count
        avg_inference_gaze_model = total_gaze_model_inference_time / frame_count

        print("Face Detection Model Load Time: ", args.face)
        print("Loading time: ", face_model_load_time)
        print("Inference time: ", avg_inference_face_model)

        print("Head Pose Detection Model: ", args.headpose)
        print("Loading time: ", head_pose_estimation_load_time)
        print("Inference time:", avg_inference_headpose)

        print("Facial Landmark Detection Model Load Time: ", args.landmarks)
        print("Loading time: ", facial_landmarks_detection_load_time)
        print("Inference time:", avg_inference_facial_landmark)

        print("Gaze Estimation Model Load Time: ", args.gazeestimation)
        print("Loading time: ", gaze_model_load_time)
        print("Inference time:", avg_inference_gaze_model)

        print('------------- END REPORT -------------')
Example n. 21
class Application:
    def __init__(self):
        self.args = None
        self.feed = None
        self.face_detection_model = None
        self.facial_landmark_detection_model = None
        self.gaze_estimation_model = None
        self.head_pose_estimation_model = None
        self.frame = None
        self.width = None
        self.height = None
        self.mc = MouseController("high", "fast")
        self.face_detection_load_time = 0
        self.facial_landmark_detection_load_time = 0
        self.gaze_estimation_load_time = 0
        self.head_pose_estimation_load_time = 0
        self.face_detection_infer_time = 0
        self.facial_landmark_detection_infer_time = 0
        self.gaze_estimation_infer_time = 0
        self.head_pose_estimation_infer_time = 0
        self.frames = 0

    def initialize_argparser(self):
        """
        Parse command line arguments.

        :return: command line arguments
        """
        parser = ArgumentParser()
        parser.add_argument("-t",
                            "--input-type",
                            required=True,
                            type=str,
                            help="Type of input (video or cam)")
        parser.add_argument("-i",
                            "--input",
                            required=True,
                            type=str,
                            help="Input file")
        parser.add_argument("-o",
                            "--out",
                            type=str,
                            default=None,
                            help="Output file with the processed content")
        parser.add_argument("-p",
                            "--preview",
                            action='store_true',
                            default=False,
                            help="Should preview face and eyes")
        parser.add_argument("--notmove",
                            action='store_true',
                            default=False,
                            help="Should not move mouse")
        parser.add_argument(
            "-m",
            "--model",
            type=str,
            default="FP32",
            help="Model precision to use. One of FP32, FP16 or FP16-INT8")
        parser.add_argument(
            "-d",
            "--device",
            type=str,
            default="CPU",
            help="Device used to process model. One or CPU or GPU")
        parser.add_argument("-v",
                            "--verbose",
                            action='store_true',
                            default=False,
                            help="Enable DEBUG messages")

        self.args = parser.parse_args()

    def initialize_logging(self):
        if self.args.verbose:
            log.basicConfig(level=log.DEBUG)
        else:
            log.basicConfig(level=log.ERROR)

    def initialize_feed(self):
        self.feed = InputFeeder(self.args.input_type, self.args.input)
        self.feed.load_data()

    def initialize_window(self):
        if self.args.preview:
            cv2.namedWindow('preview')
            cv2.namedWindow('face')
            cv2.namedWindow('left eye')
            cv2.namedWindow('right eye')
            cv2.namedWindow('gaze')

    def show_main_frame(self):
        cv2.imshow('preview', self.frame)

    def esc_key_pressed(self):
        key_pressed = cv2.waitKey(1)
        if key_pressed == 27:
            return True

    def infer_face(self):
        start = time.time()
        face_frame = self.face_detection_model.predict(self.frame)
        self.face_detection_infer_time += time.time() - start
        return face_frame

    def infer_eyes(self, face_frame, show=False):
        start = time.time()
        left_eye_pos, right_eye_pos, left_eye, right_eye = self.facial_landmark_detection_model.predict(
            face_frame)
        self.facial_landmark_detection_infer_time += time.time() - start

        if show:
            tmp_face = face_frame.copy()
            cv2.circle(tmp_face, (left_eye_pos[0], left_eye_pos[1]), 5,
                       (0, 255, 0))
            cv2.circle(tmp_face, (right_eye_pos[0], right_eye_pos[1]), 5,
                       (0, 255, 0))
            cv2.imshow('face', tmp_face)
            cv2.imshow('left eye', left_eye)
            cv2.imshow('right eye', right_eye)

        return left_eye, right_eye

    def infer_pose(self, face_frame, show=False):
        start = time.time()
        yaw, pitch, roll = self.head_pose_estimation_model.predict(face_frame)
        self.head_pose_estimation_infer_time += time.time() - start
        return yaw, pitch, roll

    def infer_gaze(self,
                   cropped_left_eye,
                   cropped_right_eye,
                   yaw,
                   pitch,
                   roll,
                   show=False):
        start = time.time()
        gaze = self.gaze_estimation_model.predict(cropped_left_eye,
                                                  cropped_right_eye, yaw,
                                                  pitch, roll)
        self.gaze_estimation_infer_time += time.time() - start
        if show:
            img = np.zeros([100, 100, 3], dtype=np.uint8)
            img.fill(255)
            cv2.circle(img, (50, 50), 50, (0, 255, 0))
            cv2.arrowedLine(img, (50, 50),
                            (50 + int(gaze[0] * 70), 50 + int(-gaze[1] * 70)),
                            (255, 0, 0), 2)
            cv2.imshow('gaze', img)
        return gaze

    def infer_frame(self):
        self.show_main_frame()
        if self.esc_key_pressed():
            return False
        self.frames += 1
        face_frame = self.infer_face()
        if face_frame is not None:
            cropped_left_eye, cropped_right_eye = self.infer_eyes(
                face_frame, self.args.preview)
            yaw, pitch, roll = self.infer_pose(face_frame, self.args.preview)
            gaze = self.infer_gaze(cropped_left_eye, cropped_right_eye, yaw,
                                   pitch, roll, self.args.preview)
            if not self.args.notmove:
                self.mc.move(gaze[0], gaze[1])

    def process_feed(self):
        try:
            for batch in self.feed.next_batch():
                self.frame = batch
                if batch is not None:
                    if self.infer_frame() is False:
                        break
                else:
                    break

            log.info("Face detection model load time: {:.2f}ms".format(
                1000 * self.face_detection_infer_time))
            log.info(
                "Facial landmark detection model load time: {:.2f}ms".format(
                    1000 * self.facial_landmark_detection_infer_time))
            log.info("Head Pose estimation model load: {:.2f}ms".format(
                1000 * self.head_pose_estimation_infer_time))
            log.info("Gaze estimation model load time: {:.2f}ms".format(
                1000 * self.gaze_estimation_infer_time))

            log.info(
                "Face detection model inference mean time: {:.2f}ms".format(
                    1000 * self.face_detection_infer_time / self.frames))
            log.info(
                "Facial landmark detection model inference mean time: {:.2f}ms"
                .format(1000 * self.facial_landmark_detection_infer_time /
                        self.frames))
            log.info(
                "Head Pose estimation model inference mean time: {:.2f}ms".
                format(1000 * self.head_pose_estimation_infer_time /
                       self.frames))
            log.info(
                "Gaze estimation model inference mean time: {:.2f}ms".format(
                    1000 * self.gaze_estimation_infer_time / self.frames))

        except Exception as err:
            log.error("Could not infer. Cause: ", str(err))

    def initialize_models(self):
        try:
            model_precision = self.args.model.upper()

            self.face_detection_model = Model_Face_Detection(
                "models/intel/face-detection-adas-binary-0001/FP32-INT1/face-detection-adas-binary-0001"
            )
            start = time.time()
            self.face_detection_model.load_model()
            self.face_detection_load_time = time.time() - start

            self.facial_landmark_detection_model = Model_Facial_Landmark_Detection(
                f"models/intel/landmarks-regression-retail-0009/{model_precision}/landmarks-regression-retail-0009",
                self.args.device.upper())
            start = time.time()
            self.facial_landmark_detection_model.load_model()
            self.facial_landmark_detection_load_time = time.time() - start

            self.head_pose_estimation_model = Model_Head_Pose_estimation(
                f"models/intel/head-pose-estimation-adas-0001/{model_precision}/head-pose-estimation-adas-0001",
                self.args.device.upper())
            start = time.time()
            self.head_pose_estimation_model.load_model()
            self.head_pose_estimation_load_time = time.time() - start

            self.gaze_estimation_model = Model_Gaze_Estimation(
                f"models/intel/gaze-estimation-adas-0002/{model_precision}/gaze-estimation-adas-0002",
                self.args.device.upper())
            start = time.time()
            self.gaze_estimation_model.load_model()
            self.gaze_estimation_load_time = time.time() - start
        except Exception as err:
            log.error("Could not load model. Cause: ", str(err))

    def run(self):
        self.initialize_argparser()
        self.initialize_logging()
        self.initialize_models()
        self.initialize_feed()
        self.initialize_window()
        self.process_feed()
        self.feed.close()
Example n. 22
def main():
    args = build_argparser().parse_args()
    input_file = args.input
    logger = log.getLogger()
    if input_file == "CAM":
        input_feeder = InputFeeder("cam")
    else:
        if not os.path.isfile(input_file):
            logger.error("Path should be file")
            exit(1)
        input_feeder = InputFeeder("video", input_file)

    face_detector = FaceDetector(
        args.face_detection_model,
        device=args.device,
        threshold=args.threshold,
        extensions=args.extensions,
    )
    face_landmark_detector = FaceLandmarkDetector(
        args.face_landmark_model,
        device=args.device,
        threshold=args.threshold,
        extensions=args.extensions,
    )
    head_pose_estimator = HeadPoseEstimator(
        args.head_pose_model,
        device=args.device,
        threshold=args.threshold,
        extensions=args.extensions,
    )
    gaze_estimator = GazeEstimator(
        args.gaze_estimation_model,
        device=args.device,
        threshold=args.threshold,
        extensions=args.extensions,
    )
    mouse_controller = MouseController("medium", "fast")

    face_detector.load_model()
    face_landmark_detector.load_model()
    head_pose_estimator.load_model()
    gaze_estimator.load_model()

    input_feeder.load_data()

    width = 1000
    height = int(width * 9 / 16)

    for flag, frame in input_feeder.next_batch():

        if not flag:
            break
        pressed_key = cv2.waitKey(60)

        face_detected = face_detector.predict(frame)
        if face_detected:
            face_coordinates, face_image = face_detected
            if not face_coordinates:
                continue
        else:
            continue
        if "fd" in args.visualization:
            cv2.rectangle(
                frame,
                (face_coordinates[0], face_coordinates[1]),
                (face_coordinates[2], face_coordinates[3]),
                (36, 255, 12),
                2,
            )
            cv2.putText(
                frame,
                "Face Detected",
                (face_coordinates[0], face_coordinates[1] - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.9,
                (36, 255, 12),
                2,
            )

        left_eye_img, right_eye_img, eye_coords = face_landmark_detector.predict(
            face_image
        )
        if "fl" in args.visualization:
            frame_eye_coords_min = (
                np.array(eye_coords)[:, :2] + np.array(face_coordinates)[:2]
            )
            frame_eye_coords_max = (
                np.array(eye_coords)[:, 2:] + np.array(face_coordinates)[:2]
            )
            cv2.rectangle(
                frame,
                (frame_eye_coords_min[0][0], frame_eye_coords_min[0][1]),
                (frame_eye_coords_max[0][0], frame_eye_coords_max[0][1]),
                (36, 255, 12),
                2,
            )
            cv2.rectangle(
                frame,
                (frame_eye_coords_min[1][0], frame_eye_coords_min[1][1]),
                (frame_eye_coords_max[1][0], frame_eye_coords_max[1][1]),
                (36, 255, 12),
                2,
            )

        head_pose_estimate = head_pose_estimator.predict(face_image)
        if "hp" in args.visualization:
            cv2.putText(
                frame,
                "yaw:{:.1f}|pitch:{:.1f}|roll:{:.1f}".format(*head_pose_estimate),
                (20, 35),
                cv2.FONT_HERSHEY_COMPLEX,
                1.2,
                (36, 255, 12),
                3,
            )

        mouse_coordinate, gaze_vector = gaze_estimator.predict(
            left_eye_img, right_eye_img, head_pose_estimate
        )
        if "ge" in args.visualization:
            head_pose_estimate = np.array(head_pose_estimate)
            yaw, pitch, roll = head_pose_estimate * np.pi / 180.0

            focal_length = 950
            scale = 100

            origin = (
                int(
                    face_coordinates[0]
                    + (face_coordinates[2] - face_coordinates[0]) / 2
                ),
                int(
                    face_coordinates[1]
                    + (face_coordinates[3] - face_coordinates[1]) / 2
                ),
            )

            r_x = np.array(
                [
                    [1, 0, 0],
                    [0, math.cos(pitch), -math.sin(pitch)],
                    [0, math.sin(pitch), math.cos(pitch)],
                ]
            )
            r_y = np.array(
                [
                    [math.cos(yaw), 0, -math.sin(yaw)],
                    [0, 1, 0],
                    [math.sin(yaw), 0, math.cos(yaw)],
                ]
            )
            r_z = np.array(
                [
                    [math.cos(roll), -math.sin(roll), 0],
                    [math.sin(roll), math.cos(roll), 0],
                    [0, 0, 1],
                ]
            )
            r = r_z @ r_y @ r_x

            zaxis = np.array(([0, 0, -1 * scale]), dtype="float32")
            offset = np.array(([0, 0, focal_length]), dtype="float32")
            zaxis = np.dot(r, zaxis) + offset
            tip = (
                int(zaxis[0] / zaxis[2] * focal_length) + origin[0],
                int(zaxis[1] / zaxis[2] * focal_length) + origin[1],
            )

            cv2.arrowedLine(frame, origin, tip, (0, 0, 255), 3, tipLength=0.3)

        cv2.imshow("frame", cv2.resize(frame, (width, height)))
        mouse_controller.move(mouse_coordinate[0], mouse_coordinate[1])

        if pressed_key == 27:
            logger.error("Exit key pressed...")
            break
Example n. 23
def main(args):
    device = args.device
    video_file = args.video
    input_type = args.input_type
    toggle = args.toggle
    stats = args.stats
    model = args.model

    stats = (stats == 'true')
    toggle = (toggle == 'true')

    # Start model loading
    start_model_load_time = time.time()
    print('[INFO] Started model loading...')

    face_model = FaceDetection(parse_models_file(
        label='face_detection', path=model),
        device)
    face_model.load_model()

    # Load Landmark model
    landmark_model = LandMarksDetection(
        parse_models_file(label='facial_landmarks_detection', path=model),
        device)
    landmark_model.load_model()
    pose_estimation_model = HeadPoseEstimation(
        parse_models_file(label='head_pose_estimation', path=model),
        device)
    pose_estimation_model.load_model()

    gaze_estimation_model = GazeEstimation(
        parse_models_file(label='gaze_estimation', path=model), device)
    gaze_estimation_model.load_model()

    total_model_load_time = time.time() - start_model_load_time
    print('[TOTAL] Loaded in {:.3f} s'.format(total_model_load_time))

    # End Model Loading
    mouse = MouseController('high', 'fast')
    if not toggle:
        cv2.namedWindow(MAIN_WINDOW_NAME, cv2.WINDOW_AUTOSIZE)
    try:
        feed = InputFeeder(input_type=input_type, input_file=video_file)
        feed.load_data()
        initial_w = int(feed.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        initial_h = int(feed.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        counter = 0
        if not toggle:
            cv2.namedWindow(MAIN_WINDOW_NAME, cv2.WINDOW_NORMAL)
        for frame, valid in feed.next_batch():
            if not valid:
                break
            try:
                counter += 1
                # Start Inferences
                coord = face_model.predict(frame, (initial_w, initial_h))

                for i in range(len(coord)):
                    xmin, ymin, xmax, ymax = coord[i]
                    cropped_image = frame[ymin:ymax, xmin:xmax]
                    # Landmark Inference
                    cropped_left, cropped_right = landmark_model.predict(cropped_image)
                    if cropped_right.shape[0] < 60 or cropped_left.shape[1] < 60:
                        break
                    if cropped_right.shape[1] < 60 or cropped_left.shape[0] < 60:
                        break
                    # Pose Estimation Inference
                    poses = pose_estimation_model.predict(cropped_image)
                    # Gaze Estimation Inference
                    gz = gaze_estimation_model.predict(poses, cropped_left, cropped_right)
                    # Mouse Controller
                    mouse.move(gz[0][0], gz[0][1])
                    # If user pass statistics argument to true
                    if stats:
                        # Print performance
                        performance_counts(
                            face_model.performance_counter(0)
                        )
                        performance_counts(
                            pose_estimation_model.performance_counter(0)
                        )
                        performance_counts(
                            landmark_model.performance_counter(0)
                        )
                        performance_counts(
                            gaze_estimation_model.performance_counter(0)
                        )

                if not toggle:
                    # Output Camera or Video
                    #cv2.resizeWindow(MAIN_WINDOW_NAME, 480, 320)
                    cv2.imshow(MAIN_WINDOW_NAME, frame)

                else:
                    # Print Statistics only no camera or video
                    performance_counts(
                        face_model.performance_counter(0)
                    )
                    performance_counts(
                        pose_estimation_model.performance_counter(0)
                    )
                    performance_counts(
                        landmark_model.performance_counter(0)
                    )
                    performance_counts(
                        gaze_estimation_model.performance_counter(0)
                    )

                cv2.waitKey(1)
            except Exception as e:
                print('Could not run Inference', e)

        feed.close()
    except Exception as e:
        print("Could not run Inference: ", e)
Example n. 24

                            default=[],
                            help="Optional model visualization flags. "
                                 "fd = Face Detection, fld = Facial Landmark Detection, hp = Head Pose Estimation, ge = Gaze Estimation. "
                                 "Flags should be separated by space.")
    return parser




args = build_argparser().parse_args()
visualizationFlags = args.visualizationFlags

inputFilePath = args.input
inputFeeder = None
if inputFilePath.lower()=="cam":
        inputFeeder = InputFeeder("cam")
else:
    if not os.path.isfile(inputFilePath):
        print("Unable to find specified video file")
        exit(1)
    inputFeeder = InputFeeder("video",inputFilePath)

modelPathDict = {'FaceDetectionModel': args.facedetectionmodel, 'FacialLandmarksDetectionModel': args.faciallandmarkmodel,
                 'GazeEstimationModel': args.gazeestimationmodel, 'HeadPoseEstimationModel': args.headposemodel}

for fileNameKey in modelPathDict.keys():
    if not os.path.isfile(modelPathDict[fileNameKey]):
        print("Unable to find specified " + fileNameKey + " xml file")
        exit(1)

face_detection = Model_FaceDetection(modelPathDict['FaceDetectionModel'], args.device, args.cpu_extension)
Example n. 25
def main():
    args = build_argparser().parse_args()
    logger = logging.getLogger()

    if args.input_type == 'video' or args.input_type == 'image':
        extension = str(args.input).split('.')[-1]
        feeder = InputFeeder(args.input_type, args.input)
    elif args.input_type == 'cam':
        feeder = InputFeeder(args.input_type)

    mc = MouseController("medium", "fast")
    feeder.load_data()

    face_model = FaceDetectionModel(args.facedetectionmodel, args.device,
                                    args.cpu_extension)
    face_model.check_model()

    landmark_model = Landmark_Model(args.facelandmarkmodel, args.device,
                                    args.cpu_extension)
    landmark_model.check_model()

    gaze_model = Gaze_Estimation_Model(args.gazeestimationmodel, args.device,
                                       args.cpu_extension)
    gaze_model.check_model()

    head_model = Head_Pose_Model(args.headposemodel, args.device,
                                 args.cpu_extension)
    head_model.check_model()

    face_model.load_model()
    logger.info("Face Detection Model Loaded...")

    landmark_model.load_model()
    logger.info("Landmark Detection Model Loaded...")

    head_model.load_model()
    logger.info("Head Pose Detection Model Loaded...")

    gaze_model.load_model()
    logger.info("Gaze Estimation Model Loaded...")

    logger.info('All Models are loaded\n\n')
    out = cv2.VideoWriter('output_video.mp4', 0x00000021, 30, (500, 500))

    frame_count = 0
    for ret, frame in feeder.next_batch():
        if not ret:
            break
        frame_count += 1

        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))
        key = cv2.waitKey(60)
        faceROI = None

        if True:
            faceROI, box = face_model.predict(frame.copy(),
                                              args.prob_threshold)
            if faceROI is None:
                logger.error("Unable to detect the face.")
                if key == 27:
                    break
                continue

            (lefteye_x, lefteye_y), (
                righteye_x, righteye_y
            ), eye_coords, left_eye, right_eye = landmark_model.predict(
                faceROI.copy(), EYE_ROI=10)
            head_position = head_model.predict(faceROI.copy())
            new_mouse_coord, gaze_vector = gaze_model.predict(
                left_eye.copy(), right_eye.copy(), head_position)

            if (not len(previewFlags) == 0):
                preview_frame = frame.copy()
                if 'fd' in previewFlags:
                    #cv2.rectangle(preview_frame, (box[0], box[1]), (box[2], box[3]), (255,0,0), 3)
                    preview_frame = faceROI
                if 'fld' in previewFlags:
                    cv2.rectangle(
                        faceROI,
                        (eye_coords[0][0] - 10, eye_coords[0][1] - 10),
                        (eye_coords[0][2] + 10, eye_coords[0][3] + 10),
                        (0, 255, 0), 3)
                    cv2.rectangle(
                        faceROI,
                        (eye_coords[1][0] - 10, eye_coords[1][1] - 10),
                        (eye_coords[1][2] + 10, eye_coords[1][3] + 10),
                        (0, 255, 0), 3)
                    #preview_frame[box[1]:box[3], box[0]:box[2]] = faceROI
                if 'hp' in previewFlags:
                    cv2.putText(
                        preview_frame,
                        "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                        format(head_position[0], head_position[1],
                               head_position[2]), (10, 20),
                        cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)
                if 'ge' in previewFlags:
                    x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] *
                                                            12), 160
                    le = cv2.line(left_eye.copy(), (x - w, y - w),
                                  (x + w, y + w), (255, 0, 255), 2)
                    cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255),
                             2)
                    re = cv2.line(right_eye.copy(), (x - w, y - w),
                                  (x + w, y + w), (255, 0, 255), 2)
                    cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255),
                             2)
                    faceROI[eye_coords[0][1]:eye_coords[0][3],
                            eye_coords[0][0]:eye_coords[0][2]] = le
                    faceROI[eye_coords[1][1]:eye_coords[1][3],
                            eye_coords[1][0]:eye_coords[1][2]] = re
                    #preview_frame[box[1]:box[3], box[0]:box[2]] = faceROI
                #cv2.imshow("visualization", cv2.resize(preview_frame, (500, 500)))
                out.write(frame)

            if frame_count % 5 == 0:
                mc.move(new_mouse_coord[0], new_mouse_coord[1])
            if key == 27:
                break

    logger.error("VideoStream ended...")
    out.release()
    cv2.destroyAllWindows()
    feeder.close()
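
A minimal entry point for the example above, assuming its build_argparser() is defined in the same module:

if __name__ == '__main__':
    main()
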
Example 26
def main(args):
    mouse_controller = MouseController('medium', 'fast')
    device = args.device
    extension = args.cpu_extension
    input_path = args.input
    prob = args.prob_threshold
    if input_path.lower() == "cam":
        input_image = InputFeeder("cam")
    else:
        if os.path.isfile(input_path):
            input_image = InputFeeder("video", input_path)
        else:
            print("Invalid path to file used: {}".format(input_path))
            exit(1)
    fd_model = Model_Face_Detection()
    fl_model = Model_Landmark()
    g_model = Model_Gaze()
    p_model = Model_Head_Pose()

    fd_model.load_model(args.face_detection, extension, device)
    fl_model.load_model(args.face_landmark, extension, device)
    g_model.load_model(args.gaze_detection, extension, device)
    p_model.load_model(args.pose_detection, extension, device)

    input_image.load_data()
    frame_count = 0

    for flag, frame in input_image.next_batch():
        if not flag:
            break
        frame_count += 1
        pressed_key = cv2.waitKey(60)
        # Get image crop of image from face detection
        fd_coords = fd_model.predict(frame, prob)
        if len(fd_coords) == 0:
            print("No face found...")
            if pressed_key == 27:
                break
            else:
                continue
        # Get first face available
        fd_coords = fd_coords[0]
        # Crop image [ymin:ymax, xmin:xmax]
        cropped_image = frame[fd_coords[1]:fd_coords[3],
                              fd_coords[0]:fd_coords[2]]
        yaw, pitch, roll = p_model.predict(cropped_image)
        left_eye, right_eye = fl_model.predict(cropped_image)
        left_eye_img = cropped_image[left_eye[1]:left_eye[3],
                                     left_eye[0]:left_eye[2]]
        right_eye_img = cropped_image[right_eye[1]:right_eye[3],
                                      right_eye[0]:right_eye[2]]
        if left_eye_img.shape != (20, 20,
                                  3) and right_eye_img.shape != (20, 20, 3):
            print("Could not find eyes...")
            continue
        if left_eye_img.shape != (20, 20, 3):
            print("Could not find left eye..")
            left_eye_img = right_eye_img
        elif right_eye_img.shape != (20, 20, 3):
            print("Could not find right eye..")
            right_eye_img = left_eye_img
        #Estimate gaze
        mouse_x, mouse_y = g_model.predict(left_eye_img, right_eye_img,
                                           [yaw, pitch, roll])
        if args.visual:
            # Face Outline
            cv2.rectangle(frame, (fd_coords[0], fd_coords[1]),
                          (fd_coords[2], fd_coords[3]), (0, 255, 100))
            # Eye Outlines
            size = 20
            left_cornerx = left_eye[0] + fd_coords[0]
            left_cornery = left_eye[1] + fd_coords[1]
            left_eye = [
                left_cornerx, left_cornery, left_cornerx + size,
                left_cornery + size
            ]
            right_cornerx = right_eye[0] + fd_coords[0]
            right_cornery = right_eye[1] + fd_coords[1]
            right_eye = [
                right_cornerx, right_cornery, right_cornerx + size,
                right_cornery + size
            ]
            cv2.rectangle(frame, (left_eye[0], left_eye[1]),
                          (left_eye[2], left_eye[3]), (0, 10, 200),
                          thickness=4)
            cv2.rectangle(frame, (right_eye[0], right_eye[1]),
                          (right_eye[2], right_eye[3]), (0, 10, 200),
                          thickness=4)

        cv2.imshow("Image", frame)
        # Performance dependency: move the mouse only every 5th frame
        if frame_count % 5 == 0 and args.no_move:
            mouse_controller.move(mouse_x, mouse_y)
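
Every example in this file drives the cursor through a MouseController helper from the project starter code. A minimal sketch built on pyautogui (the precision and speed mappings here are assumptions, not the original source):

import pyautogui


class MouseController:
    def __init__(self, precision, speed):
        # map the string settings used above ('medium', 'fast', ...) to
        # pixel offsets and movement durations (values are assumptions)
        precision_dict = {'high': 100, 'low': 1000, 'medium': 500}
        speed_dict = {'fast': 1, 'slow': 10, 'medium': 5}
        self.precision = precision_dict[precision]
        self.speed = speed_dict[speed]

    def move(self, x, y):
        # translate a normalized gaze vector into a relative cursor move;
        # y is flipped because screen coordinates grow downward
        pyautogui.moveRel(x * self.precision, -1 * y * self.precision,
                          duration=self.speed)
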
Example 27
def main(args):
        ## loading models
        try:
            input_file = args.input
            mode_visualization = args.mode_visualization

            if input_file == "CAM":
                input_feeder = InputFeeder("cam")
            else:
                if not os.path.isfile(input_file):
                    log.error("ERROR: INPUT PATH IS NOT VALID")
                    exit(1)
                input_feeder = InputFeeder("video", input_file)

            face_detection_class = Face_Detection(
                model=args.face_detection,
                device=args.device,
                extensions=args.cpu_extension)
            face_landmarks_class = Landmarks_Detection(
                model=args.face_landmark,
                device=args.device,
                extensions=args.cpu_extension)
            head_pose_class = Head_Pose(model=args.head_pose,
                                        device=args.device,
                                        extensions=args.cpu_extension)
            gaze_estimation_class = Gaze_Estimation(
                model=args.gaze_estimation,
                device=args.device,
                extensions=args.cpu_extension)

            mouse_control = MouseController('medium', 'fast')
            start_time = time.time()

            ## Load the models one by one and all necessary info

            face_det_time = time.time()
            face_detection_class.load_model()
            print("Face Detection Load Time: time: {:.3f} ms".format(
                (time.time() - face_det_time) * 1000))

            face_land_time = time.time()
            face_landmarks_class.load_model()
            print("Facial landmarks load Time: time: {:.3f} ms".format(
                (time.time() - face_land_time) * 1000))

            head_po_time = time.time()
            head_pose_class.load_model()
            print("Head pose load time: time: {:.3f} ms".format(
                (time.time() - head_po_time) * 1000))

            gaze_est_time = time.time()
            gaze_estimation_class.load_model()
            print("Gaze estimation load time: time: {:.3f} ms".format(
                (time.time() - gaze_est_time) * 1000))

            total_time = time.time() - start_time
            print("Total loading time taken: time: {:.3f} ms".format(
                total_time * 1000))

            print("All models are loaded successfully..")

            input_feeder.load_data()
            print("Feeder is loaded")
        except Exception as e:
            print('Error occurred while loading models in app:', e)

        ## performing inferences
        try:
            start_inference_time = time.time()
            frame_count = 0
            for flag, frame in input_feeder.next_batch():
                if not flag:
                    break
                frame_count += 1
                if frame_count % 5 == 0:
                    cv2.imshow('video', cv2.resize(frame, (700, 700)))

                key = cv2.waitKey(60)
                crop_face, face_coords = face_detection_class.predict(
                    frame.copy(), args.conf_threshold)
                if type(crop_face) == int:
                    log.error("Unable to detect the face.")
                    if key == 27:
                        break
                    continue

                ## perform inference
                head_angle = head_pose_class.predict(crop_face.copy())
                left_eye, right_eye, eye_coords = face_landmarks_class.predict(
                    crop_face.copy())
                mouse_position, gaze_vector = gaze_estimation_class.predict(
                    left_eye, right_eye, head_angle)

                ## checking for extra flags
                if (not len(mode_visualization) == 0):
                    p_frame = frame.copy()
                    if ('fd' in mode_visualization):
                        p_frame = crop_face
                    if ('fl' in mode_visualization):
                        cv2.rectangle(
                            crop_face,
                            (eye_coords[0][0] - 10, eye_coords[0][1] - 10),
                            (eye_coords[0][2] + 10, eye_coords[0][3] + 10),
                            (0, 255, 0), 1)
                        cv2.rectangle(
                            crop_face,
                            (eye_coords[1][0] - 10, eye_coords[1][1] - 10),
                            (eye_coords[1][2] + 10, eye_coords[1][3] + 10), (
                                0,
                                255,
                                0,
                            ), 1)

                    if ('hp' in mode_visualization):
                        cv2.putText(
                            p_frame,
                            "Head pose: yaw {:.2f}, pitch {:.2f}, roll {:.2f}".format(
                                head_angle[0], head_angle[1],
                                head_angle[2]), (10, 20),
                            cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)

                    if ('ge' in mode_visualization):
                        i, j, k = int(gaze_vector[0] * 12), int(
                            gaze_vector[1] * 12), 160

                        l_eye = cv2.line(left_eye.copy(), (i - k, j - k),
                                         (i + k, j + k), (0, 255, 255), 2)
                        cv2.line(l_eye, (i - k, j + k), (i + k, j - k),
                                 (255, 0, 255), 2)

                        r_eye = cv2.line(right_eye.copy(), (i - k, j - k),
                                         (i + k, j + k), (0, 255, 255), 2)
                        cv2.line(r_eye, (i - k, j + k), (i + k, j - k),
                                 (0, 255, 255), 2)

                        # paste the annotated eyes back into the face crop
                        crop_face[eye_coords[0][1]:eye_coords[0][3],
                                  eye_coords[0][0]:eye_coords[0][2]] = l_eye
                        crop_face[eye_coords[1][1]:eye_coords[1][3],
                                  eye_coords[1][0]:eye_coords[1][2]] = r_eye

                    cv2.imshow("visual for client",
                               cv2.resize(p_frame, (700, 700)))

                # move the mouse on every frame
                mouse_control.move(mouse_position[0], mouse_position[1])
                if key == 27:
                    break
            ## working on inference time and frames per second
            total_infer_time = time.time() - start_inference_time
            frames_per_sec = int(frame_count) / total_infer_time

            print("Time counter: {:.3f} seconds".format(frame_count))
            print("Total inference time: {:.3f} seconds".format(
                total_infer_time))
            print("FPs: {:.3f} fps ".format(frames_per_sec))
        except Exception as e:
            print('Error while performing inference in app file:', e)

        print("All Done...")

        cv2.destroyAllWindows()
        input_feeder.close()
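
The InputFeeder used throughout these examples also comes from the starter code. A sketch consistent with how it is called here (load_data() returning the capture object, next_batch() yielding (flag, frame) tuples) might be:

import cv2


class InputFeeder:
    def __init__(self, input_type, input_file=None):
        # input_type is 'video', 'image', or 'cam'
        self.input_type = input_type
        self.input_file = input_file

    def load_data(self):
        if self.input_type == 'video':
            self.cap = cv2.VideoCapture(self.input_file)
        elif self.input_type == 'cam':
            self.cap = cv2.VideoCapture(0)
        else:
            self.cap = cv2.imread(self.input_file)
        return self.cap

    def next_batch(self):
        # yield (success_flag, frame) pairs until the stream ends
        if self.input_type == 'image':
            yield True, self.cap
            return
        while True:
            flag, frame = self.cap.read()
            yield flag, frame
            if not flag:
                return

    def close(self):
        if self.input_type != 'image':
            self.cap.release()
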
def main():

    try:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s [%(levelname)s] %(message)s",
            handlers=[
                logging.FileHandler("Computer_Pointer_Controller.log"),
                logging.StreamHandler()
            ])
    except Exception as e:
        print("Log file cannot be created:", e)

    args = build_argparser().parse_args()
    video_path = args.i
    visualize = args.flags
    count = 0
    fd_inference_time = 0
    fld_inference_time = 0
    hp_inference_time = 0
    ge_inference_time = 0

    MC = MouseController('medium', 'fast')

    logging.info("############## Model Load Time #############")

    start_time = time.time()
    first_model_time = start_time
    FD = Face_Detection(device=args.d, threshold=args.prob, extensions=args.l)
    FD.load_model(model_path=args.f)
    logging.info("Face Detection Model: {:.3f}ms".format(
        1000 * (time.time() - first_model_time)))

    second_model_time = time.time()
    FLD = Facial_Landmarks_Detection(device=args.d, extensions=args.l)
    FLD.load_model(model_path=args.fl)
    logging.info("Facial Landmarks Detection Model: {:.3f}ms".format(
        1000 * (time.time() - second_model_time)))

    third_model_time = time.time()
    HPE = Head_Pose_Estimation(device=args.d, extensions=args.l)
    HPE.load_model(model_path=args.hp)
    logging.info("Head Pose Estimation Model: {:.3f}ms".format(
        1000 * (time.time() - third_model_time)))

    fourth_model_time = time.time()
    GE = Gaze_Estimation(device=args.d, extensions=args.l)
    GE.load_model(model_path=args.g)
    logging.info("Gaze Estimation Model: {:.3f}ms".format(
        1000 * (time.time() - fourth_model_time)))
    logging.info("############## End ######################### ")

    Total_Model_Load_Time = 1000 * (time.time() - start_time)

    ##### LOADING VIDEO FILE #####

    if (video_path == "cam"):
        IF = InputFeeder("cam")
    else:
        IF = InputFeeder("video", video_path)
    IF.load_data()

    ##### MODEL INFERENCE #####

    start_inf_time = time.time()
    for flag, frame in IF.next_batch():

        if not flag:
            break

        if (count % 5 == 0):
            cv2.imshow('frame', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)

        count = count + 1

        start_time_1 = time.time()
        face, face_coordinates = FD.predict(frame, args.it)
        fd_inference_time += (time.time() - start_time_1)

        start_time_2 = time.time()
        left_eye_image, right_eye_image, eye_coordinates = FLD.predict(
            face, args.it)
        fld_inference_time += (time.time() - start_time_2)

        start_time_3 = time.time()
        head_pose_angles = HPE.predict(face, args.it)
        hp_inference_time += (time.time() - start_time_3)

        start_time_4 = time.time()
        mouse_coordinates, gaze_vector = GE.predict(left_eye_image,
                                                    right_eye_image,
                                                    head_pose_angles, args.it)
        ge_inference_time += (time.time() - start_time_4)

        if (len(visualize) != 0):
            frame_visualize = frame.copy()

            if ("fd" in visualize):
                if (len(visualize) == 1):
                    cv2.rectangle(frame_visualize,
                                  (face_coordinates[0], face_coordinates[1]),
                                  (face_coordinates[2], face_coordinates[3]),
                                  (255, 0, 255), 2)
                else:
                    frame_visualize = face.copy()

            if ("fld" in visualize):
                if not "fd" in visualize:
                    frame_visualize = face.copy()

                cv2.circle(frame_visualize, (eye_coordinates['left_eye'][0],
                                             eye_coordinates['left_eye'][1]),
                           25, (0, 0, 255), 2)
                cv2.circle(frame_visualize, (eye_coordinates['right_eye'][0],
                                             eye_coordinates['right_eye'][1]),
                           25, (0, 0, 255), 2)

            if ("hp" in visualize):
                cv2.putText(
                    frame_visualize,
                    "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                    format(head_pose_angles[0], head_pose_angles[1],
                           head_pose_angles[2]), (10, 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)

            if ("ge" in visualize):
                h = face.shape[0]
                arrow = h * 0.7
                arrow_X = gaze_vector[0] * arrow
                arrow_Y = -gaze_vector[1] * arrow
                cv2.arrowedLine(
                    frame_visualize, (eye_coordinates['left_eye'][0],
                                      eye_coordinates['left_eye'][1]),
                    (int(eye_coordinates['left_eye'][0] + arrow_X),
                     int(eye_coordinates['left_eye'][1] + arrow_Y)),
                    (255, 0, 0), 2)
                cv2.arrowedLine(
                    frame_visualize, (eye_coordinates['right_eye'][0],
                                      eye_coordinates['right_eye'][1]),
                    (int(eye_coordinates['right_eye'][0] + arrow_X),
                     int(eye_coordinates['right_eye'][1] + arrow_Y)),
                    (255, 0, 0), 2)
            if (count % 5 == 0):

                cv2.imshow('Visualization',
                           cv2.resize(frame_visualize, (500, 500)))

        if (count % 5 == 0):
            MC.move(mouse_coordinates[0], mouse_coordinates[1])

        if key == 27:
            break

    Total_Inference_Time = time.time() - start_inf_time
    if (count > 0):
        logging.info("############## Models Inference time #######")
        logging.info("Face Detection:{:.3f}ms".format(
            1000 * fd_inference_time / count))
        logging.info("Facial Landmarks Detection:{:.3f}ms".format(
            1000 * fld_inference_time / count))
        logging.info("Headpose Estimation:{:.3f}ms".format(
            1000 * hp_inference_time / count))
        logging.info("Gaze Estimation:{:.3f}ms".format(
            1000 * ge_inference_time / count))
        logging.info("############## End #########################")

    logging.info("############## Summarized Results ##########")
    logging.info(
        "Total Model Load Time: {:.3f}ms".format(Total_Model_Load_Time))
    logging.info("Total Inference Time: {:.3f}s".format(Total_Inference_Time))
    logging.info("FPS:{}".format(count / Total_Inference_Time))
    logging.info("############ End ###########################")
    cv2.destroyAllWindows()
    IF.close()
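
The build_argparser() for the example above is not included in this excerpt. A sketch inferred from the attribute names it reads (every flag name and purpose here is an assumption based on usage):

from argparse import ArgumentParser


def build_argparser():
    parser = ArgumentParser()
    parser.add_argument('-f', required=True,
                        help='Path to the Face Detection model .xml')
    parser.add_argument('-fl', required=True,
                        help='Path to the Facial Landmarks Detection model .xml')
    parser.add_argument('-hp', required=True,
                        help='Path to the Head Pose Estimation model .xml')
    parser.add_argument('-g', required=True,
                        help='Path to the Gaze Estimation model .xml')
    parser.add_argument('-i', default='cam',
                        help='Path to a video file, or "cam" for the webcam')
    parser.add_argument('-d', default='CPU', help='Target device')
    parser.add_argument('-l', default=None,
                        help='Optional path to a CPU extension library')
    parser.add_argument('-prob', type=float, default=0.6,
                        help='Probability threshold for face detection')
    parser.add_argument('-it', default=None,
                        help='Extra option passed through to each predict() call')
    parser.add_argument('-flags', nargs='+', default=[],
                        help='Visualization flags: fd fld hp ge')
    return parser
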
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """

    #if args.input == 'cam':
    #    args.input = 0
    output_intermediate_model = args.output_intermediate_model

    ### TODO: Handle the input stream ###
    feed = InputFeeder(input_type=args.input_type, input_file=args.input_file)
    cap = feed.load_data()
    width = int(cap.get(3))
    height = int(cap.get(4))
    fps = int(cap.get(5))

    # Initialise the class
    try:
        infer_network_face_detection = BasePointer()
        infer_network_head_pose_estimation = BasePointer()
        infer_network_landmarks_regression_retail = BasePointer()
        infer_network_gaze_estimation = GazeEstimation()
    except:
        logging.error("Error in initializing models")
        exit(1)
    ### TODO: Load the model through `infer_network_face_detection` ###
    try:
        start_loading_time_face_detection = time.time()
        infer_network_face_detection.load_model(args.model1, args.device)
        load_model_face_detection_time_taken = time.time(
        ) - start_loading_time_face_detection

        start_loading_time_head_pose_estimation = time.time()
        infer_network_head_pose_estimation.load_model(args.model2, args.device)
        load_model_head_pose_estimation_time_taken = time.time(
        ) - start_loading_time_head_pose_estimation

        start_loading_time_landmarks_regression_retail = time.time()
        infer_network_landmarks_regression_retail.load_model(
            args.model3, args.device)
        load_model_landmarks_regression_retail_time_taken = time.time(
        ) - start_loading_time_landmarks_regression_retail

        start_loading_time_gaze_estimation = time.time()
        infer_network_gaze_estimation.load_model(args.model4, args.device)
        load_model_gaze_estimation_time_taken = time.time(
        ) - start_loading_time_gaze_estimation
    except:
        logging.error("Error in loading the models")
        exit(1)

    logging.debug(
        "Loading times for facial detection : {} , landmark detection : {} , head pose detection : {} , gaze estimation : {} "
        .format(load_model_face_detection_time_taken,
                load_model_landmarks_regression_retail_time_taken,
                load_model_head_pose_estimation_time_taken,
                load_model_gaze_estimation_time_taken))

    if output_intermediate_model == 'true':
        out = cv2.VideoWriter('out.mp4', CODEC, fps, (width, height))

    total_time_taken_to_infer_inf_face_detection = 0
    total_time_taken_to_infer_landmarks_regression_retail = 0
    total_time_taken_to_infer_inf_head_pose_estimation = 0
    total_time_taken_to_infer_gaze_estimation = 0

    # create the mouse controller once, outside the frame loop
    mouse_controler_pc = MouseController("high", "fast")

    ### TODO: Loop until stream is over ###
    for batch in feed.next_batch():
        ### TODO: Read from the video capture ###

        flag, frame = batch
        if not flag:
            break
        key_pressed = cv2.waitKey(60)

        ### TODO: Start inference for face detection ###
        start_inf_face_detection = time.time()
        outputs_face_detection = infer_network_face_detection.predict(frame)
        time_taken_to_infer_inf_face_detection = time.time(
        ) - start_inf_face_detection
        coords, frame = infer_network_face_detection.preprocess_output_face_detection(
            outputs_face_detection, width, height, args.prob_threshold, frame)
        if output_intermediate_model == 'true':
            out.write(frame)

        frame_crop_face = crop_face(coords, frame, output_intermediate_model)

        start_inf_head_pose_estimation = time.time()
        outputs_head_pose_estimation = infer_network_head_pose_estimation.predict(
            frame_crop_face)
        time_taken_to_infer_inf_head_pose_estimation = time.time(
        ) - start_inf_head_pose_estimation

        yaw, pitch, roll = infer_network_head_pose_estimation.preprocess_output_head_pose_estimation(
            outputs_head_pose_estimation, frame_crop_face)
        head_pose_angles = [yaw, pitch, roll]

        if output_intermediate_model == 'true':
            cv2.putText(frame, ("Yaw: " + str(int(yaw))), (100, 100),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)
            cv2.putText(frame, ("Pitch: " + str(int(pitсh))), (100, 140),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)
            cv2.putText(frame, ("Roll: " + str(int(roll))), (100, 180),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)

        height_crop_face = coords[0][3] - coords[0][1]
        width_crop_face = coords[0][2] - coords[0][0]

        start_inf_landmarks_regression_retail = time.time()
        outputs_landmarks_regression_retail = infer_network_landmarks_regression_retail.predict(
            frame_crop_face)
        time_taken_to_infer_landmarks_regression_retail = time.time(
        ) - start_inf_landmarks_regression_retail

        coord_landmarks_regression_retail = infer_network_landmarks_regression_retail.preprocess_output_landmarks_regression_retail(
            outputs_landmarks_regression_retail, width_crop_face,
            height_crop_face, args.prob_threshold, frame)
        center_left_eye = ((coords[0][0] +
                            coord_landmarks_regression_retail[0]),
                           coords[0][1] + coord_landmarks_regression_retail[1])
        center_right_eye = ((coords[0][0] +
                             coord_landmarks_regression_retail[2]),
                            coords[0][1] +
                            coord_landmarks_regression_retail[3])

        xmin_left_eye = center_left_eye[0] - 30
        ymin_left_eye = center_left_eye[1] - 30
        xmax_left_eye = center_left_eye[0] + 30
        ymax_left_eye = center_left_eye[1] + 30
        xmin_right_eye = center_right_eye[0] - 30
        ymin_right_eye = center_right_eye[1] - 30
        xmax_right_eye = center_right_eye[0] + 30
        ymax_right_eye = center_right_eye[1] + 30

        frame_landmarks_regression_retail = cv2.circle(frame,
                                                       center_left_eye,
                                                       2, (0, 255, 0),
                                                       thickness=3)
        frame_landmarks_regression_retail = cv2.circle(frame,
                                                       center_right_eye,
                                                       2, (0, 255, 0),
                                                       thickness=3)
        # crop the eye regions before drawing, so the gaze model receives
        # clean eye images rather than a frame with rectangles drawn on it
        left_eye_crop = frame[ymin_left_eye:ymax_left_eye,
                              xmin_left_eye:xmax_left_eye].copy()
        right_eye_crop = frame[ymin_right_eye:ymax_right_eye,
                               xmin_right_eye:xmax_right_eye].copy()
        cv2.rectangle(frame, (xmin_left_eye, ymin_left_eye),
                      (xmax_left_eye, ymax_left_eye), (0, 255, 0), 3)
        cv2.rectangle(frame, (xmin_right_eye, ymin_right_eye),
                      (xmax_right_eye, ymax_right_eye), (0, 255, 0), 3)
        if output_intermediate_model == 'true':
            out.write(frame_landmarks_regression_retail)

        ### TODO: Start inference for gaze estimation ###
        start_inf_gaze_estimation = time.time()
        outputs_gaze_estimation = infer_network_gaze_estimation.predict(
            left_eye_crop, right_eye_crop, head_pose_angles)
        time_taken_to_infer_gaze_estimation = time.time(
        ) - start_inf_gaze_estimation

        total_time_taken_to_infer_inf_face_detection += time_taken_to_infer_inf_face_detection
        total_time_taken_to_infer_landmarks_regression_retail += time_taken_to_infer_landmarks_regression_retail
        total_time_taken_to_infer_inf_head_pose_estimation += time_taken_to_infer_inf_head_pose_estimation
        total_time_taken_to_infer_gaze_estimation += time_taken_to_infer_gaze_estimation

        arrow = 100
        g_x = int(outputs_gaze_estimation[0] * arrow)
        g_y = int(-(outputs_gaze_estimation[1]) * arrow)

        frame = cv2.arrowedLine(frame, (center_left_eye),
                                ((center_left_eye[0] + g_x),
                                 (center_left_eye[1] + g_y)), (0, 0, 255), 3)
        frame = cv2.arrowedLine(frame, (center_right_eye),
                                ((center_right_eye[0] + g_x),
                                 (center_right_eye[1] + g_y)), (0, 0, 255), 3)

        if output_intermediate_model == 'true':
            out.write(frame)

        mouse_controler_pc.move(outputs_gaze_estimation[0],
                                outputs_gaze_estimation[1])

        if key_pressed == 27:
            break
    feed.close()

    logging.debug(
        "total inference times for facial detection : {} , landmark detection : {} , head pose detection : {} , gaze estimation : {} "
        .format(total_time_taken_to_infer_inf_face_detection,
                total_time_taken_to_infer_landmarks_regression_retail,
                total_time_taken_to_infer_inf_head_pose_estimation,
                total_time_taken_to_infer_gaze_estimation))
    if output_intermediate_model == 'true':
        out.release()
    #cap.release()
    cv2.destroyAllWindows()
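
The crop_face helper called in infer_on_stream() is also not shown. A hypothetical implementation matching its call signature:

def crop_face(coords, frame, output_intermediate_model):
    # coords holds face boxes as (xmin, ymin, xmax, ymax); only the first
    # detected face is used. output_intermediate_model presumably triggers
    # writing the crop to the intermediate video; that part is omitted here.
    xmin, ymin, xmax, ymax = coords[0]
    return frame[ymin:ymax, xmin:xmax]
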
class Inferencer:
    def __init__(self,
                 device='CPU',
                 mouse_con=False,
                 face_dec=None,
                 fac_land=None,
                 head_pose=None,
                 gaze=None,
                 show_video=False,
                 save_video=False):
        '''
        All model paths should be passed in here.
        '''
        if face_dec and fac_land and head_pose and gaze:
            self.face_dec = FaceDetectionModel(face_dec, device=device)
            self.fac_land = FacialLandmarksDetection(fac_land, device=device)
            self.head_pose = Head_Pose_Estimation(head_pose, device=device)
            self.gaze = Gaze_Estimation(gaze, device=device)
            self.face_dec.load_model()
            self.fac_land.load_model()
            self.head_pose.load_model()
            self.gaze.load_model()
        else:
            raise ValueError('Missing Arguments')

        if mouse_con:
            self.mouse_con = MouseController("low", "fast")
        else:
            self.mouse_con = None

        self.show_video, self.save_video = show_video, save_video

    def __call__(
        self,
        input_type=None,
        input_file=None,
    ):
        self.run(input_type=input_type, input_file=input_file)

    def run(
        self,
        input_type=None,
        input_file=None,
    ):
        if input_type and input_file:
            self.input_ = InputFeeder(input_type, input_file)
            self.input_.load_data()
            if self.save_video:
                out = cv2.VideoWriter(
                    'output.mp4', 0x00000021, 30,
                    (int(self.input_.cap.get(3)), int(self.input_.cap.get(4))))
        try:
            fc_dec_inf_time = 0
            landmark_inf_time = 0
            pose_inf_time = 0
            gaze_inf_time = 0
            frame_counter = 0
            # create the frame generator once so iteration advances through
            # the stream instead of restarting it on every pass
            batch_generator = self.input_.next_batch()
            while True:
                # Read the next frame
                try:
                    frame = next(batch_generator)
                    frame_counter += 1
                except StopIteration:
                    break

                key_pressed = cv2.waitKey(60)

                # face detection
                start = time.time()
                out_frame, boxes = self.face_dec.predict(frame,
                                                         display_output=True)
                fc_dec_inf_time += (time.time() - start)

                # for each detected face box
                for box in boxes:
                    face = out_frame[box[1]:box[3], box[0]:box[2]]

                    start = time.time()
                    out_frame, left_eye_point, right_eye_point = self.fac_land.predict(
                        out_frame, face, box, display_output=True)
                    landmark_inf_time += (time.time() - start)

                    start = time.time()
                    out_frame, headpose_angels = self.head_pose.predict(
                        out_frame, face, box, display_output=True)
                    pose_inf_time += (time.time() - start)

                    start = time.time()
                    out_frame, gazevector = self.gaze.predict(
                        out_frame,
                        face,
                        box,
                        left_eye_point,
                        right_eye_point,
                        headpose_angels,
                        display_output=True)
                    gaze_inf_time += (time.time() - start)

                    if self.show_video:
                        cv2.imshow('im', out_frame)

                    if self.save_video:
                        out.write(out_frame)

                    if self.mouse_con:
                        self.mouse_con.move(gazevector[0], gazevector[1])

                    time.sleep(1)

                    # consider only the first detected face in the frame
                    break

                # Break if escape key pressed
                if key_pressed == 27:
                    break

            if self.save_video:
                out.release()
            self.input_.close()
            cv2.destroyAllWindows()
            print(
                'average inference time for face detection model is :- {:.2f}ms'
                .format((fc_dec_inf_time / frame_counter) * 1000))
            print(
                'average inference time for facial landmark model is :- {:.2f}ms'
                .format((landmark_inf_time / frame_counter) * 1000))
            print(
                'average inference time for head pose estimation model is :- {:.2f}ms'
                .format((pose_inf_time / frame_counter) * 1000))
            print(
                'average inference time for gaze estimation model is :- {:.2f}ms'
                .format((gaze_inf_time / frame_counter) * 1000))
        except Exception as ex:
            logging.exception("Error in inference: " + str(ex))
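
A usage sketch for the Inferencer class above; the model paths are placeholders for the Open Model Zoo files this project typically uses, and demo.mp4 stands in for any input video:

if __name__ == '__main__':
    inferencer = Inferencer(
        device='CPU',
        mouse_con=True,
        face_dec='models/face-detection-adas-binary-0001.xml',
        fac_land='models/landmarks-regression-retail-0009.xml',
        head_pose='models/head-pose-estimation-adas-0001.xml',
        gaze='models/gaze-estimation-adas-0002.xml',
        show_video=True,
        save_video=False)
    inferencer(input_type='video', input_file='demo.mp4')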