Example #1
def run_inference(args):

    # Load all four models once, before the frame loop, so they are not
    # re-instantiated and re-loaded on every single frame.
    faceDetection = FaceDetection(model_name=args.face_detection_model)
    faceDetection.load_model()
    facialLandmarksDetection = FacialLandmarksDetection(
        args.facial_landmarks_detection_model)
    facialLandmarksDetection.load_model()
    headPoseEstimation = HeadPoseEstimation(
        args.head_pose_estimation_model)
    headPoseEstimation.load_model()
    gazeEstimation = GazeEstimation(args.gaze_estimation_model)
    gazeEstimation.load_model()

    feed = InputFeeder(input_type='video', input_file=args.input)
    feed.load_data()
    for batch in feed.next_batch():
        cv2.imshow("Output", cv2.resize(batch, (500, 500)))
        key = cv2.waitKey(60)

        if key == 27:  # Esc ends the stream
            break

        # getting face
        face = faceDetection.predict(batch)

        # getting eyes
        left_eye, right_eye = facialLandmarksDetection.predict(face)

        # getting head pose angles
        head_pose = headPoseEstimation.predict(face)
        print("head pose angles: ", head_pose)

        # get mouse points
        mouse_coords = gazeEstimation.predict(left_eye, right_eye, head_pose)
        print("gaze output: ", mouse_coords)
    feed.close()
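Every example in this list leans on an InputFeeder helper from the project's starter code; its exact interface varies between examples (some loops yield bare frames, others yield (ret, frame) tuples). A minimal sketch of the frame-yielding variant, built on OpenCV's VideoCapture and assuming the 'video'/'cam'/'image' input types used above:

import cv2

class InputFeeder:
    def __init__(self, input_type, input_file=None):
        self.input_type = input_type
        self.input_file = input_file

    def load_data(self):
        if self.input_type == 'video':
            self.cap = cv2.VideoCapture(self.input_file)
        elif self.input_type == 'cam':
            self.cap = cv2.VideoCapture(0)
        else:  # single image
            self.frame = cv2.imread(self.input_file)

    def next_batch(self):
        # Yield frames one at a time until the stream is exhausted.
        if self.input_type == 'image':
            yield self.frame
            return
        while True:
            ret, frame = self.cap.read()
            if not ret:
                break
            yield frame

    def close(self):
        if self.input_type != 'image':
            self.cap.release()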
Example #2
class MoveMouse:
    '''
    Main Class for the Mouse Controller app. 
    This is the class where all the models are stitched together to control the mouse pointer
    '''
    def __init__(self, args):
        '''
        This method initializes the instance variables for the mouse controller app.

        Args:
        args = All arguments parsed by the arguments parser function

        Return:
        None
        '''

        init_start_time = time.time()
        self.output_path = args.output_path
        self.show_output = args.show_output
        self.total_processing_time = 0
        self.count_batch = 0
        self.inference_speed = []
        self.avg_inference_speed = 0

        if args.all_devices != 'CPU':
            args.face_device = args.all_devices
            args.face_landmark_device = args.all_devices
            args.head_pose_device = args.all_devices
            args.gaze_device = args.all_devices

        model_init_start = time.time()
        self.face_model = FaceDetection(args.face_model, args.face_device,
                                        args.face_device_ext,
                                        args.face_prob_threshold)
        self.landmarks_model = FacialLandmarksDetection(
            args.face_landmark_model, args.face_landmark_device,
            args.face_landmark_device_ext, args.face_landmark_prob_threshold)
        self.head_pose_model = HeadPoseEstimation(
            args.head_pose_model, args.head_pose_device,
            args.head_pose_device_ext, args.head_pose_prob_threshold)
        self.gaze_model = GazeEstimation(args.gaze_model, args.gaze_device,
                                         args.gaze_device_ext,
                                         args.gaze_prob_threshold)
        self.model_init_time = time.time() - model_init_start
        log.info('[ Main ] All required models initialized')

        self.mouse_control = MouseController(args.precision, args.speed)
        log.info('[ Main ] Mouse controller successfully initialized')

        self.input_feeder = InputFeeder(args.batch_size, args.input_type,
                                        args.input_file)
        log.info('[ Main ] Initialized input feeder')

        model_load_start = time.time()
        self.face_model.load_model()
        self.landmarks_model.load_model()
        self.head_pose_model.load_model()
        self.gaze_model.load_model()

        self.model_load_time = time.time() - model_load_start
        self.app_init_time = time.time() - init_start_time
        log.info('[ Main ] All models loaded to the Inference Engine\n')

    def draw_face_box(self, frame, face_coords):
        '''
        Draws the face's bounding box on the input frame.
        Args:
        frame = Input frame from video or camera feed. It could also be an input image
        face_coords = Coordinates of the detected face's bounding box

        Return:
        frame = Frame with bounding box of faces drawn on it
        '''

        start_point = (face_coords[0][0], face_coords[0][1])
        end_point = (face_coords[0][2], face_coords[0][3])
        thickness = 5
        color = (255, 86, 0)

        frame = cv2.rectangle(frame, start_point, end_point, color, thickness)

        return frame

    def draw_eyes_boxes(self, frame, left_eye_coords, right_eye_coords):
        '''
        Draws the bounding boxes of the left and right eyes on the input frame.
        Args:
        frame = Input frame from video or camera feed. It could also be an input image
        left_eye_coords / right_eye_coords = Coordinates of the detected eyes

        Return:
        frame = Frame with bounding box of left and right eyes drawn on it
        '''

        left_eye_start_point = (left_eye_coords[0], left_eye_coords[1])
        left_eye_end_point = (left_eye_coords[2], left_eye_coords[3])
        right_eye_start_point = (right_eye_coords[0], right_eye_coords[1])
        right_eye_end_point = (right_eye_coords[2], right_eye_coords[3])
        thickness = 5
        color = (0, 210, 0)

        frame = cv2.rectangle(frame, left_eye_start_point, left_eye_end_point,
                              color, thickness)
        frame = cv2.rectangle(frame, right_eye_start_point,
                              right_eye_end_point, color, thickness)

        return frame

    def draw_outputs(self, frame):
        '''
        Draws the inference outputs (the bounding boxes of the face and both
        eyes) and the running inference stats onto the frame.

        Args:
        frame = Input frame from video or camera feed. It could also be an input image

        Return:
        frame = Frame with all inference outputs drawn on it
        '''

        frame = self.draw_face_box(frame, self.face_coords)
        frame = self.draw_eyes_boxes(frame, self.left_eye_coords,
                                     self.right_eye_coords)

        frame_id = f'Batch id = {self.count_batch}'
        avg_inference_speed = f'Avg. inference speed = {self.avg_inference_speed:.3f}fps'
        total_processing_time = f'Total infer. time = {self.total_processing_time:.3f}s'

        cv2.putText(frame, frame_id, (15, 15), cv2.FONT_HERSHEY_COMPLEX, 0.45,
                    (255, 86, 0), 1)
        cv2.putText(frame, avg_inference_speed, (15, 30),
                    cv2.FONT_HERSHEY_COMPLEX, 0.45, (255, 86, 0), 1)
        cv2.putText(frame, total_processing_time, (15, 45),
                    cv2.FONT_HERSHEY_COMPLEX, 0.45, (255, 86, 0), 1)

        return frame

    def run_inference(self, frame):
        '''
        Performs inference on the input video or image by passing it through all four
        models to get the desired coordinates for moving the mouse pointer.

        Args:
        frame = Input image, frame from video or camera feed

        Return:
        None
        '''

        self.input_feeder.load_data()

        for frame in self.input_feeder.next_batch():

            if self.input_feeder.frame_flag:
                log.info('[ Main ] Started processing a new batch')
                start_inference = time.time()
                self.face_coords, self.face_crop = self.face_model.predict(
                    frame)

                if not self.face_coords:
                    log.error(
                        '[ Main ] No face detected. Waiting for you to stare at the camera'
                    )

                else:
                    self.head_pose_angles = self.head_pose_model.predict(
                        self.face_crop)
                    self.left_eye_coords, self.left_eye_image, self.right_eye_coords, self.right_eye_image = self.landmarks_model.predict(
                        self.face_crop)
                    self.x, self.y = self.gaze_model.predict(
                        self.left_eye_image, self.right_eye_image,
                        self.head_pose_angles)
                    log.info(
                        f'[ Main ] Relative pointer coordinates: [{self.x:.2f}, {self.y:.2f}]'
                    )

                    batch_process_time = time.time() - start_inference
                    self.total_processing_time += batch_process_time
                    self.count_batch += 1
                    log.info(
                        f'[ Main ] Finished processing batch. Time taken = {batch_process_time}s\n'
                    )

                    self.mouse_control.move(self.x, self.y)

                    if self.show_output:
                        frame = self.draw_outputs(frame)

                    cv2.imshow('Computer Pointer Controller Output', frame)
                    self.inference_speed.append(self.count_batch /
                                                self.total_processing_time)
                    self.avg_inference_speed = sum(self.inference_speed) / len(
                        self.inference_speed)

                with open(os.path.join(self.output_path, 'outputs.txt'),
                          'w+') as f:
                    f.write('INFERENCE STATS\n')
                    f.write(
                        f'Total model initialization time : {self.model_init_time:.2f}s\n'
                    )
                    f.write(
                        f'Total model load time: {self.model_load_time:.2f}s\n'
                    )
                    f.write(
                        f'App initialization time: {self.app_init_time:.2f}s\n'
                    )
                    f.write(
                        f'Total processing time: {self.total_processing_time:.2f}s\n'
                    )
                    f.write(
                        f'Average inference speed: {self.avg_inference_speed:.2f}FPS\n'
                    )
                    f.write(f'Batch count: {self.count_batch}\n\n')

                    f.write('LAST OUTPUTS\n')
                    f.write(f'Face coordinates: {self.face_coords}\n')
                    f.write(f'Left eye coordinates: {self.left_eye_coords}\n')
                    f.write(
                        f'Right eye coordinates: {self.right_eye_coords}\n')
                    f.write(f'Head pose angles: {self.head_pose_angles}\n')
                    f.write(
                        f'Relative pointer coordinates/ Gaze vector: [{self.x:.2f}, {self.y:.2f}]'
                    )

            else:
                self.input_feeder.close()
                cv2.destroyAllWindows()

                log.info(
                    f'[ Main ] All input Batches processed in {self.total_processing_time:.2f}s'
                )
                log.info('[ Main ] Shutting down app...')
                log.info('[ Main ] Mouse controller app has been shut down.')
                break

        return
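MoveMouse delegates the actual pointer movement to a MouseController built from a precision and a speed string. A minimal sketch of such a controller on top of pyautogui, in the spirit of the Udacity starter code (the exact mapping values below are illustrative assumptions):

import pyautogui

class MouseController:
    def __init__(self, precision, speed):
        # Map the string settings to pixel offsets and move durations.
        precision_dict = {'high': 100, 'low': 1000, 'medium': 500}
        speed_dict = {'fast': 1, 'slow': 10, 'medium': 5}
        self.precision = precision_dict[precision]
        self.speed = speed_dict[speed]

    def move(self, x, y):
        # x and y are relative gaze coordinates; the y axis is inverted
        # because screen coordinates grow downwards.
        pyautogui.moveRel(x * self.precision, -1 * y * self.precision,
                          duration=self.speed)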
Example #3
def main():

    args = get_args().parse_args()
    path_filender = args.input
    four_flags = args.flags_checker
    loger = logging.getLogger()
    feeder_in = None
    out_path = args.out_path

    if path_filender.lower() == "cam":
        feeder_in = InputFeeder("cam")
    else:
        if not os.path.isfile(path_filender):
            loger.error("The video was not found")
            exit(1)
        feeder_in = InputFeeder("video", path_filender)

    model_locations = {
        'FaceDetection': args.face_detection_model,
        'HeadPoseEstimation': args.head_pose_estimation_model,
        'FacialLandmarksDetection': args.facial_landmarks_detection_model,
        'GazeEstimation': args.gaze_estimation_model
    }

    for key_name in model_locations.keys():
        if not os.path.isfile(model_locations[key_name]):
            loger.error("The system cannot find the " + key_name + " xml file")
            exit(1)

    dt = FaceDetection(model_locations['FaceDetection'], args.device,
                       args.cpu_extension)
    pe = HeadPoseEstimation(model_locations['HeadPoseEstimation'], args.device,
                            args.cpu_extension)
    ld = FacialLandmarksDetection(model_locations['FacialLandmarksDetection'],
                                  args.device, args.cpu_extension)
    ge = GazeEstimation(model_locations['GazeEstimation'], args.device,
                        args.cpu_extension)

    cursor = MouseController('medium', 'fast')

    feeder_in.load_data()
    model_load_time_start = time.time()
    dt.load_model()
    pe.load_model()
    ld.load_model()
    ge.load_model()
    total_load_time = time.time() - model_load_time_start

    frame_counter = 0
    inference_time_start = time.time()
    for ret, frame in feeder_in.next_batch():
        if not ret:
            break
        frame_counter = frame_counter + 1
        cv2.imshow('video', cv2.resize(frame, (600, 600)))

        key = cv2.waitKey(60)

        face_detected, coords_face = dt.predict(frame, args.p_th)
        if type(face_detected) == int:
            loger.error("The system cannot detect any face.")
            if key == 27:
                break
            continue

        head_pose_output = pe.predict(face_detected)
        eye_left_detect, eye_right_detect, eye_coordinates_detect = ld.predict(
            face_detected)
        coordi_update_pointer, coordi_gaze = ge.predict(
            eye_left_detect, eye_right_detect, head_pose_output)

        if four_flags:
            result_app = frame
            if 'fad' in four_flags:
                result_app = face_detected
            if 'hpe' in four_flags:
                cv2.putText(
                    result_app,
                    "HP Angles: YAW:{:.3f} * PITCH:{:.3f} * ROLL:{:.3f}".
                    format(head_pose_output[0], head_pose_output[1],
                           head_pose_output[2]), (5, 40),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (153, 76, 0), 0)
            if 'fld' in four_flags:
                cv2.rectangle(face_detected,
                              (eye_coordinates_detect[0][0] - 4,
                               eye_coordinates_detect[0][1] - 4),
                              (eye_coordinates_detect[0][2] + 4,
                               eye_coordinates_detect[0][3] + 4),
                              (255, 255, 0), 4)
                cv2.rectangle(face_detected,
                              (eye_coordinates_detect[1][0] - 4,
                               eye_coordinates_detect[1][1] - 4),
                              (eye_coordinates_detect[1][2] + 4,
                               eye_coordinates_detect[1][3] + 4),
                              (255, 255, 0), 4)
            if 'gae' in four_flags:
                x = int(coordi_gaze[0] * 2)
                y = int(coordi_gaze[1] * 2)
                w = 150
                right_E = cv2.line(eye_right_detect, (x - w, y - w),
                                   (x + w, y + w), (51, 255, 153), 1)
                cv2.line(right_E, (x - w, y + w), (x + w, y - w),
                         (51, 255, 253), 1)
                left_E = cv2.line(eye_left_detect, (x - w, y - w),
                                  (x + w, y + w), (51, 255, 153), 1)
                cv2.line(left_E, (x - w, y + w), (x + w, y - w),
                         (51, 255, 253), 1)
                face_detected[
                    eye_coordinates_detect[1][1]:eye_coordinates_detect[1][3],
                    eye_coordinates_detect[1][0]:eye_coordinates_detect[1]
                    [2]] = right_E
                face_detected[
                    eye_coordinates_detect[0][1]:eye_coordinates_detect[0][3],
                    eye_coordinates_detect[0][0]:eye_coordinates_detect[0]
                    [2]] = left_E

            cv2.imshow("Result of the App", cv2.resize(result_app, (600, 600)))

        if frame_counter % 5 == 0:
            cursor.move(coordi_update_pointer[0], coordi_update_pointer[1])
        if key == 27:
            break

    total_time = time.time() - inference_time_start
    total_time_for_inference = round(total_time, 1)
    fps = frame_counter / total_time_for_inference

    with open(out_path + 'stats.txt', 'w') as f:
        f.write('Inference time: ' + str(total_time_for_inference) + '\n')
        f.write('FPS: ' + str(fps) + '\n')
        f.write('Model load time: ' + str(total_load_time) + '\n')

    loger.error("The video stream is over...")
    cv2.destroyAllWindows()
    feeder_in.close()
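The gaze model used throughout these examples follows OpenVINO's gaze-estimation-adas-0002, which takes both 60x60 eye crops plus the three head pose angles and returns a 3-D gaze vector. A sketch of what a predict() method plausibly does under the hood (the layer names come from that model's documented inputs/outputs; the self.exec_net inference-engine plumbing is an assumption):

import cv2
import numpy as np

def predict(self, left_eye, right_eye, head_pose_angles):
    # gaze-estimation-adas-0002 expects 1x3x60x60 eye crops (BGR, NCHW)
    # and a 1x3 vector of head pose angles (yaw, pitch, roll).
    def prep(eye):
        blob = cv2.resize(eye, (60, 60)).transpose((2, 0, 1))
        return blob.reshape(1, 3, 60, 60)

    outputs = self.exec_net.infer({
        'left_eye_image': prep(left_eye),
        'right_eye_image': prep(right_eye),
        'head_pose_angles': np.array(head_pose_angles).reshape(1, 3),
    })
    gaze_vector = outputs['gaze_vector'][0]
    # The x/y components of the gaze vector drive the pointer.
    return (gaze_vector[0], gaze_vector[1]), gaze_vector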
Example #4
def main():
    args = build_argparser().parse_args()
    device_name = args.device
    prob_threshold = args.prob_threshold
    logger_object = log.getLogger()

    # Initialize variables with the input arguments
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'FacialLandmarkModel': args.facialLandmarksModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }

    # Instantiate model
    face_model = FaceDetection(model_path_dict['FaceDetectionModel'], device_name, threshold=prob_threshold)
    landmark_model = FacialLandmarksDetection(model_path_dict['FacialLandmarkModel'], device_name,
                                              threshold=prob_threshold)
    head_pose_model = HeadPoseEstimation(model_path_dict['HeadPoseEstimationModel'], device_name,
                                         threshold=prob_threshold)
    gaze_model = GazeEstimation(model_path_dict['GazeEstimationModel'], device_name, threshold=prob_threshold)
    mouse_controller = MouseController('medium', 'fast')

    # Load Models and get time
    start_time = time.time()
    face_model.load_model()
    logger_object.error("Face detection model loaded: time: {:.3f} ms".format((time.time() - start_time) * 1000))

    first_mark = time.time()
    landmark_model.load_model()
    logger_object.error(
        "Facial landmarks detection model loaded: time: {:.3f} ms".format((time.time() - first_mark) * 1000))

    second_mark = time.time()
    head_pose_model.load_model()
    logger_object.error("Head pose estimation model loaded: time: {:.3f} ms".format((time.time() - second_mark) * 1000))

    third_mark = time.time()
    gaze_model.load_model()
    logger_object.error("Gaze estimation model loaded: time: {:.3f} ms".format((time.time() - third_mark) * 1000))
    load_total_time = time.time() - start_time
    logger_object.error("Total loading time: time: {:.3f} ms".format(load_total_time * 1000))
    logger_object.error("All models are loaded successfully..")

    # Check the models for unsupported layers
    face_model.check_model()
    landmark_model.check_model()
    head_pose_model.check_model()
    gaze_model.check_model()

    preview_flags = args.previewFlags
    input_filename = args.input
    output_path = args.output_path
    prob_threshold = args.prob_threshold

    if input_filename.lower() == 'cam':
        input_feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger_object.error("Unable to find specified video file")
            exit(1)
        input_feeder = InputFeeder(input_type='video', input_file=input_filename)

    for model_path in list(model_path_dict.values()):
        if not os.path.isfile(model_path):
            logger_object.error("Unable to find specified model file" + str(model_path))
            exit(1)

    input_feeder.load_data()
    width = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(input_feeder.cap.get(cv2.CAP_PROP_FPS))
    out_video = cv2.VideoWriter(os.path.join(output_path, 'output_video.mp4'), cv2.VideoWriter_fourcc(*'avc1'), fps,
                                (width, height), True)

    frame_counter = 0
    start_inf_time = time.time()
    for ret, frame in input_feeder.next_batch():
        if not ret:
            break
        frame_counter += 1
        key = cv2.waitKey(60)

        try:
            cropped_image, face_cords = face_model.predict(frame, prob_threshold)

            if isinstance(cropped_image, int):
                print("Unable to detect the face")
                if key == 27:
                    break
                continue

            left_eye, right_eye, eye_cords = landmark_model.predict(cropped_image)
            pose_output = head_pose_model.predict(cropped_image)
            x, y, z = gaze_model.predict(left_eye, right_eye, pose_output, cropped_image, eye_cords)

        except Exception as e:
            print(str(e) + " for frame " + str(frame_counter))
            continue

        image = cv2.resize(frame, (width, height))
        if not len(preview_flags) == 0:
            preview_frame = frame.copy()

            if 'fd' in preview_flags:
                if len(preview_flags) != 1:
                    preview_frame = cropped_image
                    cv2.rectangle(frame, (face_cords[0], face_cords[1]), (face_cords[2], face_cords[3]), (0, 0, 255), 3)

            if 'hp' in preview_flags:
                cv2.putText(
                    frame,
                    "Pose Angles: yaw= {:.2f} , pitch= {:.2f} , roll= {:.2f}".format(
                        pose_output[0], pose_output[1], pose_output[2]),
                    (20, 40),
                    cv2.FONT_HERSHEY_DUPLEX,
                    1, (255, 0, 0), 3)

            if 'ge' in preview_flags:
                cv2.putText(
                    frame,
                    "Gaze vector: x= {:.2f} , y= {:.2f} , z= {:.2f}".format(
                        x, y, z),
                    (15, 100),
                    cv2.FONT_HERSHEY_COMPLEX,
                    1, (0, 255, 0), 3)

            image = np.hstack((cv2.resize(frame, (500, 500)), cv2.resize(preview_frame, (500, 500))))

        cv2.imshow('preview', image)
        out_video.write(cv2.resize(image, (width, height)))

        if frame_counter % 5 == 0:
            mouse_controller.move(x, y)

        if key == 27:
            break

    inference_time = round(time.time() - start_inf_time, 1)
    fps = frame_counter / inference_time
    logger_object.error("counter {} seconds".format(frame_counter))
    logger_object.error("total inference time {} seconds".format(inference_time))
    logger_object.error("fps {} frame/second".format(fps))
    with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'stats.txt'), 'w') as f:
        f.write('inference time : ' + str(inference_time) + '\n')
        f.write('fps: ' + str(fps) + '\n')
        f.write('Models Loading: '+ str(load_total_time) + '\n')
    logger_object.error('Video stream ended')
    cv2.destroyAllWindows()
    input_feeder.close()
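Example #4 calls check_model() on each model to verify layer support before inference. A sketch of that check against the pre-2022 Inference Engine API (IECore.query_network); self.core, self.network and self.device are assumed attributes set in each model's __init__:

import logging

def check_model(self):
    # Query which layers the target device supports and flag the rest.
    supported_layers = self.core.query_network(network=self.network,
                                               device_name=self.device)
    unsupported_layers = [layer for layer in self.network.layers.keys()
                          if layer not in supported_layers]
    if unsupported_layers:
        logging.error("Unsupported layers found: %s", unsupported_layers)
        logging.error("Consider specifying a CPU extension")
        exit(1)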
Example #5
def main():

    args = argparser().parse_args()
    device = args.device
    input_feed = args.input

    log = logging.getLogger()

    model_paths = {
        'facedet': args.face_detection_model + '.xml',
        'faceldmdet': args.landmark_detection_model + '.xml',
        'headpose': args.pose_estimation_model + '.xml',
        'gaze': args.gaze_estimation_model + '.xml'
    }

    for mp in model_paths.keys():
        if not os.path.isfile(model_paths[mp]):
            print(model_paths[mp])
            print('Recheck file path and try again')
            log.error("Not a file")
            raise FileNotFoundError

    if input_feed == 'cam':

        feed = InputFeeder(input_type='cam')

    elif not os.path.isfile(input_feed):

        print('Recheck file path and try again')
        log.error("Unable to find specified video file")
        raise FileNotFoundError

    else:
        feed = InputFeeder(input_type='video', input_file=input_feed)

    facedet = FaceDetection(args.face_detection_model, args.device,
                            args.extensions, args.async_mode)
    faceldmdet = FacialLandmarksDetection(args.landmark_detection_model,
                                          args.device, args.extensions,
                                          args.async_mode)
    headpose = HeadPose(args.pose_estimation_model, args.device,
                        args.extensions, args.async_mode)
    gaze = GazeEstimation(args.gaze_estimation_model, args.device,
                          args.extensions, args.async_mode)

    try:
        log.info('Loading models...')
        facedet.load_model()
        faceldmdet.load_model()
        headpose.load_model()
        gaze.load_model()
        feed.load_data()
        log.info('Models loaded successfully!')
    except Exception:
        log.error('One or more of the models failed to load...')
        exit(1)

    log.info('Initializing mouse controller')
    mouse = MouseController(precision='medium', speed='fast')

    for batch in feed.next_batch():
        if batch is None:
            break
        face = facedet.predict(batch)
        eyes, eye_coords = faceldmdet.predict(face)
        pose = headpose.predict(face)

        point = gaze.predict(pose, eyes)
        #print('Gaze values = ', point[0], point[1])

        log.info('All inference complete')

        #print('view_inter = ', args.view_intermediate)
        if args.input == 'cam':
            point[0] = -point[0]

        mouse.move(point[0], point[1])
        if args.view_intermediate:
            visualize(pose, face, eye_coords, point)
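The visualize() helper invoked above is not shown in this example. A plausible sketch that draws the eye boxes, head pose angles and gaze point on the detected face (the signature is taken from the call site; the drawing details and the eye_coords box format are assumptions):

import cv2

def visualize(pose, face, eye_coords, point):
    # Draw a box around each eye on the cropped face.
    for (xmin, ymin, xmax, ymax) in eye_coords:
        cv2.rectangle(face, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
    # Overlay the head pose angles and the gaze point.
    cv2.putText(face, "yaw {:.1f} pitch {:.1f} roll {:.1f}".format(*pose),
                (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
    h, w = face.shape[:2]
    cx = int(w / 2 + point[0] * w / 2)
    cy = int(h / 2 - point[1] * h / 2)
    cv2.circle(face, (cx, cy), 5, (0, 0, 255), -1)
    cv2.imshow('Intermediate outputs', face)
    cv2.waitKey(1)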
Example #6
def main():
    arg_parser = ArgParser()
    args = arg_parser.get_args()

    input_file = args.input

    # If an input file is defined, use it; otherwise use the webcam
    if input_file:
        if not os.path.isfile(input_file):
            log.error("Input file cannot be found")
            exit()
        input_feeder = InputFeeder("video", input_file)
    else:
        input_feeder = InputFeeder("cam")

    face_detection_model = FaceDetection(args.face_detection_model,
                                         args.device, args.extensions)
    face_detection_model.load_model()

    facial_landmarks_model = FacialLandmarksDetection(
        args.facial_landmark_detection_model, args.device, args.extensions)
    facial_landmarks_model.load_model()

    gaze_model = GazeEstimation(args.gaze_estimation_model, args.device,
                                args.extensions)
    gaze_model.load_model()

    head_pose_model = HeadPoseEstimation(args.head_pose_estimation_model,
                                         args.device, args.extensions)
    head_pose_model.load_model()

    mouse_controller = MouseController('medium', 'fast')

    input_feeder.load_data()

    frame_count = 0
    total_face_detection_inference_time = 0
    total_facial_landmark_inference_time = 0
    total_head_pose_inference_time = 0
    total_gaze_estimation_inference_time = 0
    total_inference_time = 0
    for ret, frame in input_feeder.next_batch():

        if not ret:
            log.info("End of input stream")
            break

        frame_count += 1

        if frame_count % args.mouse_update_interval == 0:
            cv2.imshow('Input', frame)

        key_pressed = cv2.waitKey(60)

        # Run inference on the face detection model
        start_time = time.time()
        cropped_face, face_coordinates = face_detection_model.predict(
            frame.copy(), args.probability_threshold)
        finish_time = time.time()
        total_face_detection_inference_time += finish_time - start_time
        total_inference_time += finish_time - start_time

        # If no face detected get the next frame
        if len(face_coordinates) == 0:
            continue

        # Run inference on the facial landmark detection model
        start_time = time.time()
        results = facial_landmarks_model.predict(cropped_face.copy())
        finish_time = time.time()
        left_eye_coordinates = results[0]
        right_eye_coordinates = results[1]
        left_eye_image = results[2]
        right_eye_image = results[3]
        left_eye_crop_coordinates = results[4]
        right_eye_crop_coordinates = results[5]
        total_facial_landmark_inference_time += finish_time - start_time
        total_inference_time += finish_time - start_time

        # Run inference on the head pose estimation model
        start_time = time.time()
        head_pose = head_pose_model.predict(cropped_face.copy())
        finish_time = time.time()
        total_head_pose_inference_time += finish_time - start_time
        total_inference_time += finish_time - start_time

        # Run inference on the gaze estimation model
        start_time = time.time()
        new_mouse_x_coordinate, new_mouse_y_coordinate, gaze_vector = gaze_model.predict(
            left_eye_image, right_eye_image, head_pose)
        finish_time = time.time()
        total_gaze_estimation_inference_time += finish_time - start_time
        total_inference_time += finish_time - start_time

        if frame_count % args.mouse_update_interval == 0:
            log.info("Mouse controller new coordinates: x = {}, y = {}".format(
                new_mouse_x_coordinate, new_mouse_y_coordinate))
            mouse_controller.move(new_mouse_x_coordinate,
                                  new_mouse_y_coordinate)

            # Optional visualization configuration:
            if args.show_detected_face:
                showDetectedFace(frame, face_coordinates)

            if args.show_head_pose:
                showHeadPose(frame, head_pose)

            if args.show_facial_landmarks:
                showFacialLandmarks(cropped_face, left_eye_crop_coordinates,
                                    right_eye_crop_coordinates)

            if args.show_gaze_estimation:
                showGazeEstimation(frame, right_eye_coordinates,
                                   left_eye_coordinates, gaze_vector,
                                   cropped_face, face_coordinates)

        # Break if escape key pressed
        if key_pressed == 27:
            log.warning("Keyboard interrupt triggered")
            break

    # Release the capture and destroy any OpenCV windows
    cv2.destroyAllWindows()
    input_feeder.close()
    log.info("Average face detection inference time: {} seconds".format(
        total_face_detection_inference_time / frame_count))
    log.info(
        "Average facial landmark detection inference time: {} seconds".format(
            total_facial_landmark_inference_time / frame_count))
    log.info("Average head pose estimation inference time: {} seconds".format(
        total_head_pose_inference_time / frame_count))
    log.info("Average gaze estimation inference time: {} seconds".format(
        total_gaze_estimation_inference_time / frame_count))
    log.info("Average total inference time: {} seconds".format(
        total_inference_time / frame_count))
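The landmarks model in these examples returns eye crops plus their coordinates. With landmarks-regression-retail-0009, the raw output is ten normalized values, five (x, y) facial points of which the first two are the eyes, so the eye images are typically cut out as fixed-size squares around those points. A sketch of that post-processing, with the crop half-size (30 px) as an assumed choice:

def eyes_from_landmarks(face, landmarks, half_size=30):
    # landmarks: flat array of 5 normalized (x, y) pairs; the first two
    # points are the eyes (per the landmarks-regression-retail-0009 docs).
    h, w = face.shape[:2]
    eyes, coords = [], []
    for i in (0, 1):
        x = int(landmarks[2 * i] * w)
        y = int(landmarks[2 * i + 1] * h)
        xmin, ymin = max(x - half_size, 0), max(y - half_size, 0)
        xmax, ymax = min(x + half_size, w), min(y + half_size, h)
        eyes.append(face[ymin:ymax, xmin:xmax])
        coords.append((xmin, ymin, xmax, ymax))
    return eyes[0], eyes[1], coords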
Example #7
def test_run(args):
    logging.getLogger().setLevel(logging.INFO)
    feeder = None
    activate_frame_count = 10
    logging.warning("Running default value activate frame count = 10")
    if args.input_type == 'video' or args.input_type == 'image':
        feeder = InputFeeder(args.input_type, args.input)
        if args.input == '../bin/demo.mp4':
            logging.warning("Running default setting and input")
    elif args.input_type == 'webcam':
        feeder = InputFeeder(args.input_type, args.input)
    else:
        logging.error("Input not found")
        exit(1)

    mouse_controller = MouseController(args.precision, args.speed)

    feeder.load_data()
    start_time = 0

    face_model_load_time = 0
    start_time = time.time()
    face_model = FaceDetection(args.face, args.device, args.cpu_extension)
    face_model.load_model()
    face_model_load_time = time.time() - start_time
    logging.info("Face Detection Model Loaded...")

    head_pose_estimation_load_time = 0
    start_time = time.time()
    head_pose_estimation = HeadPoseEstimation(args.headpose, args.device,
                                              args.cpu_extension)
    head_pose_estimation.load_model()
    head_pose_estimation_load_time = time.time() - start_time
    logging.info("Head Pose Detection Model Loaded...")

    facial_landmarks_detection_load_time = 0
    start_time = time.time()
    facial_landmarks_detection = FacialLandmarksDetection(
        args.landmarks, args.device, args.cpu_extension)
    facial_landmarks_detection.load_model()
    facial_landmarks_detection_load_time = time.time() - start_time
    logging.info("Facial Landmark Detection Model Loaded...")

    gaze_model_load_time = 0
    start_time = time.time()
    gaze_model = GazeEstimation(args.gazeestimation, args.device,
                                args.cpu_extension)
    gaze_model.load_model()
    gaze_model_load_time = time.time() - start_time
    logging.info("Gaze Estimation Model Loaded...")

    frame_count = 0

    total_face_model_inference_time = 0
    total_head_pose_estimation_inference_time = 0
    total_facial_landmarks_detection_inference_time = 0
    total_gaze_model_inference_time = 0
    start_time = 0
    for frame in feeder.next_batch():
        if frame is None:
            break
        frame_count += 1
        key = cv2.waitKey(60)

        start_time = time.time()
        first_face_box, first_face = face_model.predict(frame.copy())
        total_face_model_inference_time = total_face_model_inference_time + (
            time.time() - start_time)

        start_time = time.time()
        head_pose_output = head_pose_estimation.predict(first_face_box.copy())
        total_head_pose_estimation_inference_time = total_head_pose_estimation_inference_time + (
            time.time() - start_time)

        start_time = time.time()
        left_eye, right_eye, eye_coords = facial_landmarks_detection.predict(
            first_face_box.copy())
        total_facial_landmarks_detection_inference_time = total_facial_landmarks_detection_inference_time + (
            time.time() - start_time)

        start_time = time.time()
        move_to_coors_mouse = gaze_model.predict(left_eye, right_eye,
                                                 head_pose_output)
        total_gaze_model_inference_time = total_gaze_model_inference_time + (
            time.time() - start_time)

        if frame_count % activate_frame_count == 0 and (args.flag == "3"
                                                        or args.flag == "4"):
            mouse_controller.move(move_to_coors_mouse[0],
                                  move_to_coors_mouse[1])
            cv2.imshow('video', frame)
            key = cv2.waitKey(60)
        if key == 27:
            break

        if args.flag == "1":
            cv2.rectangle(frame, (first_face[0], first_face[1]),
                          (first_face[2], first_face[3]), (255, 0, 0))
            cv2.imshow('video', frame)
            key = cv2.waitKey(60)
        elif args.flag == "2":
            cv2.rectangle(facial_landmarks_detection.image,
                          (eye_coords[0], eye_coords[1]),
                          (eye_coords[2], eye_coords[3]), (255, 0, 0))
            cv2.imshow('video', facial_landmarks_detection.image)
            key = cv2.waitKey(60)
        elif args.flag == "3":
            if frame_count == 1:
                logging.info("Printing mouse coors: ")
            logging.info(move_to_coors_mouse)

    #Print Report
    if args.flag == "0":
        print('------------- BEGIN REPORT -------------')
        avg_inference_face_model = total_face_model_inference_time / frame_count
        avg_inference_headpose = total_head_pose_estimation_inference_time / frame_count
        avg_inference_facial_landmark = total_facial_landmarks_detection_inference_time / frame_count
        avg_inference_gaze_model = total_gaze_model_inference_time / frame_count

        print("Face Detection Model Load Time: ", args.face)
        print("Loading time: ", face_model_load_time)
        print("Inference time: ", avg_inference_face_model)

        print("Head Pose Detection Model: ", args.headpose)
        print("Loading time: ", head_pose_estimation_load_time)
        print("Inference time:", avg_inference_headpose)

        print("Facial Landmark Detection Model Load Time: ", args.landmarks)
        print("Loading time: ", facial_landmarks_detection_load_time)
        print("Inference time:", avg_inference_facial_landmark)

        print("Gaze Estimation Model Load Time: ", args.gazeestimation)
        print("Loading time: ", gaze_model_load_time)
        print("Inference time:", avg_inference_gaze_model)

        print('------------- END REPORT -------------')
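Example #7 reads a fairly large set of attributes off args. A minimal argparse builder matching exactly the flags this function accesses (flag names are taken from the attribute accesses above; defaults are illustrative):

import argparse

def build_argparser():
    parser = argparse.ArgumentParser(description='Computer Pointer Controller')
    parser.add_argument('--face', required=True,
                        help='Path to the face detection model XML')
    parser.add_argument('--headpose', required=True,
                        help='Path to the head pose estimation model XML')
    parser.add_argument('--landmarks', required=True,
                        help='Path to the facial landmarks model XML')
    parser.add_argument('--gazeestimation', required=True,
                        help='Path to the gaze estimation model XML')
    parser.add_argument('--input_type', default='video',
                        choices=['video', 'image', 'webcam'])
    parser.add_argument('--input', default='../bin/demo.mp4')
    parser.add_argument('--device', default='CPU')
    parser.add_argument('--cpu_extension', default=None)
    parser.add_argument('--precision', default='medium')
    parser.add_argument('--speed', default='fast')
    parser.add_argument('--flag', default='0')
    return parser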
Example #8
def main():
    """
    Load inference networks, stream video to network,
    and output stats and video.
    :return: None
    """

    # Logger init
    logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")

    # Get command line args
    args = get_arg()

    # Load preferences
    with open(args.config_file, "r") as yamlfile:
        cfg = yaml.load(yamlfile, Loader=yaml.FullLoader)
    models = cfg['models']
    input_source = args.input
    video_path = cfg['video_path']
    face_model = FaceDetection(models['face_detection'])
    head_pose_model = HeadPoseEstimation(models['head_pose_estimation'])
    facial_landmarks_model = FacialLandmarksDetection(models['facial_landmarks_detection'])
    gaze_estimation_model = GazeEstimation(models['gaze_estimation'])

    # Initialise the MouseController
    mouse_contr = MouseController("low","fast")

    # Load the models and log timing
    start_time = time.time()
    face_model.load_model(args.device)
    logging.info("Load Face Detection model: {:.1f}ms".format(1000 * (time.time() - start_time)) )

    start_time = time.time()
    facial_landmarks_model.load_model(args.device)
    logging.info("Load Facial Landmarks Detection model: {:.1f}ms".format(1000 * (time.time() - start_time)) )

    start_time = time.time()
    head_pose_model.load_model(args.device)
    logging.info("Load Head Pose Estimation model: {:.1f}ms".format(1000 * (time.time() - start_time)) )

    start_time = time.time()
    gaze_estimation_model.load_model(args.device) 
    logging.info("Load Gaze Estimation model: {:.1f}ms".format(1000 * (time.time() - start_time)) )

    # Get and open video or camera capture
    #input_feed = InputFeeder('video', args.input)
    #input_feed.load_data()

    input_feed = InputFeeder(input_type=input_source, input_file=video_path)
    input_feed.load_data()

    if not input_feed.cap.isOpened():
        logging.critical('Error opening input, check --video_path parameter')
        sys.exit(1)
    # FPS = input_feed.get_fps()

    # Grab the shape of the input 
    # width = input_feed.get_width()
    # height = input_feed.get_height()

    # init scene variables
    frame_count = 0

    ### Loop until stream is over ###
    facedetect_infer_time = 0
    landmark_infer_time = 0
    headpose_infer_time = 0
    gaze_infer_time = 0
    while True:
        # Read the next frame
        try:
            frame = next(input_feed.next_batch())
        except StopIteration:
            break

        if frame is None:
            break


        key_pressed = cv2.waitKey(60)
        frame_count += 1
        input_height, input_width, _ = frame.shape
        logging.info("frame {count} size {w}, {h}".format(count= frame_count, w = input_width, h =input_height)) 
        
        # face detection
        p_frame = face_model.preprocess_input(frame)
        start_time = time.time()
        fnoutput = face_model.predict(p_frame)
        facedetect_infer_time += time.time() - start_time
        out_frame,fboxes = face_model.preprocess_output(fnoutput,frame,args.overlay, args.prob_threshold)
        
        #for each face
        for fbox in fboxes:

            face = frame[fbox[1]:fbox[3],fbox[0]:fbox[2]]
            p_frame = facial_landmarks_model.preprocess_input(face)
            
            start_time = time.time()
            lmoutput = facial_landmarks_model.predict(p_frame)
            landmark_infer_time += time.time() - start_time
            out_frame,left_eye_point,right_eye_point = facial_landmarks_model.preprocess_output(lmoutput, fbox, out_frame,args.overlay, args.prob_threshold)

            # get head pose estimation
            p_frame  = head_pose_model.preprocess_input(face)
            start_time = time.time()
            hpoutput = head_pose_model.predict(p_frame)
            headpose_infer_time += time.time() - start_time
            out_frame, headpose_angels = head_pose_model.preprocess_output(hpoutput,out_frame, face,fbox,args.overlay, args.prob_threshold)

            # get gaze  estimation
            out_frame, left_eye, right_eye  = gaze_estimation_model.preprocess_input(out_frame,face,left_eye_point,right_eye_point,args.overlay)
            start_time = time.time()
            geoutput = gaze_estimation_model.predict(left_eye, right_eye, headpose_angels)
            gaze_infer_time += time.time() - start_time
            out_frame, gazevector = gaze_estimation_model.preprocess_output(geoutput,out_frame,fbox, left_eye_point,right_eye_point,args.overlay, args.prob_threshold)

            cv2.imshow('im', out_frame)
            
            if args.mouse_move:
                logging.info("mouse move vector : x ={}, y={}".format(gazevector[0], gazevector[1])) 
                mouse_contr.move(gazevector[0], gazevector[1])
            
            #use only first detected face in the frame
            break
        
        # Break if escape key pressed
        if key_pressed == 27:
            break

    #logging inference times
    if frame_count > 0:
        logging.info("***** Models Inference time *****") 
        logging.info("Face Detection:{:.1f}ms".format(1000* facedetect_infer_time/frame_count))
        logging.info("Facial Landmarks Detection:{:.1f}ms".format(1000* landmark_infer_time/frame_count))
        logging.info("Headpose Estimation:{:.1f}ms".format(1000* headpose_infer_time/frame_count))
        logging.info("Gaze Estimation:{:.1f}ms".format(1000* gaze_infer_time/frame_count))


    # Release the capture and destroy any OpenCV windows
    input_feed.close()
    cv2.destroyAllWindows()
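Example #8 pulls its model paths from a YAML file instead of CLI flags. The keys it reads (models.* and video_path) imply a config shaped roughly like the sample below; the file paths are placeholders:

models:
  face_detection: models/face-detection-adas-0001.xml
  head_pose_estimation: models/head-pose-estimation-adas-0001.xml
  facial_landmarks_detection: models/landmarks-regression-retail-0009.xml
  gaze_estimation: models/gaze-estimation-adas-0002.xml
video_path: bin/demo.mp4

The --input argument then only selects the InputFeeder type ('video' or 'cam'), while video_path supplies the actual file.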
Example #9
def main():

    try:
        args = build_argparser().parse_args()

        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s [%(levelname)s] %(message)s",
            handlers=[
                logging.FileHandler("computer-pointer-controller.log"),
                logging.StreamHandler()
            ])

        print_output_frame = args.print_output_frame

        logger = logging.getLogger()

        input_file_path = args.input
        feeder = None

        if input_file_path.lower() == "CAM":
            feeder = InputFeeder("cam")
        else:
            if not os.path.isfile(input_file_path):
                logger.error("Unable to find specified video file")
                exit(1)
            feeder = InputFeeder("video", input_file_path)

        mc = MouseController('low', 'fast')
        feeder.load_data()

        modelPathDict = {
            'FaceDetectionModel': args.face,
            'FacialLandmarksDetectionModel': args.landmark,
            'GazeEstimationModel': args.gazeestimation,
            'HeadPoseEstimationModel': args.headpose
        }

        for fileNameKey in modelPathDict.keys():
            if not os.path.isfile(modelPathDict[fileNameKey] + '.xml'):
                logger.error("Unable to find specified " + fileNameKey +
                             " xml file")
                exit(1)

        logging.info("============== Models Load time ===============")
        face_detection = FaceDetection(args.face, args.device,
                                       args.prob_threshold, args.cpu_extension)
        start_time = time.time()
        face_detection.load_model()
        logging.info("Face Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        landmarks_detection = FacialLandmarksDetection(args.landmark,
                                                       args.device,
                                                       args.cpu_extension)
        start_time = time.time()
        landmarks_detection.load_model()
        logging.info("Facial Landmarks Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        gaze_estimation = GazeEstimation(args.gazeestimation, args.device,
                                         args.cpu_extension)
        start_time = time.time()
        gaze_estimation.load_model()
        logging.info("Gaze Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        headpose_estimation = HeadPoseEstimation(args.headpose, args.device,
                                                 args.cpu_extension)
        start_time = time.time()
        headpose_estimation.load_model()
        logging.info("Headpose Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        logging.info("==============  End =====================")

        frame_count = 0
        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0

        for ret, frame in feeder.next_batch():
            if not ret:
                break
            frame_count += 1
            key = cv2.waitKey(60)

            start_time = time.time()
            cropped_face, face_coords = face_detection.predict(frame.copy())
            fd_infertime += time.time() - start_time

            if len(cropped_face) == 0:
                logger.error("Unable to detect the face.")
                continue

            start_time = time.time()
            headpose_out = headpose_estimation.predict(cropped_face.copy())
            hp_infertime += time.time() - start_time

            start_time = time.time()
            left_eye, right_eye, eye_coords = landmarks_detection.predict(
                cropped_face.copy())
            lm_infertime += time.time() - start_time

            start_time = time.time()
            new_mouse_coord, gaze_vector = gaze_estimation.predict(
                left_eye, right_eye, headpose_out)
            ge_infertime += time.time() - start_time

            if print_output_frame:
                preview_frame = frame.copy()
                if 'fd' in print_output_frame:
                    preview_frame = cropped_face
                    cv2.rectangle(frame, (face_coords[0], face_coords[1]),
                                  (face_coords[2], face_coords[3]),
                                  (255, 0, 0), 3)

                if 'fl' in print_output_frame:
                    cv2.rectangle(cropped_face,
                                  (eye_coords[0][0], eye_coords[0][1]),
                                  (eye_coords[0][2], eye_coords[0][3]),
                                  (0, 255, 0), 2)
                    cv2.rectangle(cropped_face,
                                  (eye_coords[1][0], eye_coords[1][1]),
                                  (eye_coords[1][2], eye_coords[1][3]),
                                  (0, 255, 0), 2)

                if 'hp' in print_output_frame:
                    cv2.putText(
                        cropped_face,
                        "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                        format(headpose_out[0], headpose_out[1],
                               headpose_out[2]), (0, 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.25, (0, 0, 0), 1)

                    face = frame[face_coords[1]:face_coords[3],
                                 face_coords[0]:face_coords[2]]
                    xmin, ymin, _, _ = face_coords
                    face_center = (xmin + face.shape[1] / 2,
                                   ymin + face.shape[0] / 2, 0)
                    headpose_estimation.draw_axes(frame, face_center,
                                                  headpose_out[0],
                                                  headpose_out[1],
                                                  headpose_out[2])

                if 'ge' in print_output_frame:

                    cropped_h, cropped_w = cropped_face.shape[:2]
                    arrow_length = 0.3 * cropped_h

                    gaze_arrow_x = gaze_vector[0] * arrow_length
                    gaze_arrow_y = -gaze_vector[1] * arrow_length

                    cv2.arrowedLine(cropped_face,
                                    (eye_coords[0][0], eye_coords[0][1]),
                                    (int(eye_coords[0][2] + gaze_arrow_x),
                                     int(eye_coords[0][3] + gaze_arrow_y)),
                                    (0, 255, 0), 2)
                    cv2.arrowedLine(cropped_face,
                                    (eye_coords[1][0], eye_coords[1][1]),
                                    (int(eye_coords[1][2] + gaze_arrow_x),
                                     int(eye_coords[1][3] + gaze_arrow_y)),
                                    (0, 255, 0), 2)

                    #frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = cropped_face

                if len(preview_frame) != 0:
                    img_hor = np.hstack((cv2.resize(preview_frame, (800, 800)),
                                         cv2.resize(frame, (800, 800))))
                else:
                    img_hor = cv2.resize(frame, (800, 800))

                cv2.imshow("Monitor", img_hor)

            if frame_count % 5 == 0:
                mc.move(new_mouse_coord[0], new_mouse_coord[1])

            if key == 27:
                break

        #logging inference times
        if (frame_count > 0):
            logging.info(
                "============== Models Inference time ===============")
            logging.info("Face Detection:{:.1f}ms".format(1000 * fd_infertime /
                                                          frame_count))
            logging.info("Facial Landmarks Detection:{:.1f}ms".format(
                1000 * lm_infertime / frame_count))
            logging.info("Headpose Estimation:{:.1f}ms".format(
                1000 * hp_infertime / frame_count))
            logging.info("Gaze Estimation:{:.1f}ms".format(
                1000 * ge_infertime / frame_count))
            logging.info("============== End ===============================")

        logger.info("Video stream ended...")
        cv2.destroyAllWindows()
        feeder.close()

    except Exception as ex:
        # logging.exception already records the traceback and exception type.
        logging.exception("Error in inference: %s", ex)
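The draw_axes() call in the 'hp' branch above visualizes the head pose on the frame. A sketch of how such axes can be drawn from yaw/pitch/roll (angles in degrees, as the head pose model outputs them; the simple orthographic projection here ignores camera intrinsics and is an assumption, not the project's exact method):

import cv2
import numpy as np

def draw_axes(frame, center, yaw, pitch, roll, length=50):
    # Convert the model's degree outputs to radians.
    yaw, pitch, roll = np.radians([yaw, pitch, roll])
    # Rotations around the y (yaw), x (pitch) and z (roll) axes.
    ry = np.array([[np.cos(yaw), 0, -np.sin(yaw)],
                   [0, 1, 0],
                   [np.sin(yaw), 0, np.cos(yaw)]])
    rx = np.array([[1, 0, 0],
                   [0, np.cos(pitch), -np.sin(pitch)],
                   [0, np.sin(pitch), np.cos(pitch)]])
    rz = np.array([[np.cos(roll), -np.sin(roll), 0],
                   [np.sin(roll), np.cos(roll), 0],
                   [0, 0, 1]])
    r = rz @ ry @ rx
    ox, oy = int(center[0]), int(center[1])
    colors = [(0, 0, 255), (0, 255, 0), (255, 0, 0)]  # x, y, z axes
    for axis, color in zip(np.eye(3), colors):
        tip = r @ (axis * length)
        # Orthographic projection: drop the z component.
        cv2.line(frame, (ox, oy), (ox + int(tip[0]), oy + int(tip[1])),
                 color, 2)
    return frame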