Example 1
def run_inference(args):

    feed = InputFeeder(input_type='video', input_file=args.input)
    feed.load_data()

    # load the four models once, before the frame loop,
    # instead of re-creating them for every frame
    faceDetection = FaceDetection(model_name=args.face_detection_model)
    faceDetection.load_model()
    facialLandmarksDetection = FacialLandmarksDetection(
        args.facial_landmarks_detection_model)
    facialLandmarksDetection.load_model()
    headPoseEstimation = HeadPoseEstimation(
        args.head_pose_estimation_model)
    headPoseEstimation.load_model()
    gazeEstimation = GazeEstimation(args.gaze_estimation_model)
    gazeEstimation.load_model()

    for batch in feed.next_batch():
        cv2.imshow("Output", cv2.resize(batch, (500, 500)))
        key = cv2.waitKey(60)

        if key == 27:  # Esc ends the stream
            break

        # getting face
        face = faceDetection.predict(batch)

        # getting eyes
        left_eye, right_eye = facialLandmarksDetection.predict(face)

        # getting head pose angles
        head_pose = headPoseEstimation.predict(face)
        print("head pose angles: ", head_pose)

        # get mouse points
        mouse_coords = gazeEstimation.predict(left_eye, right_eye, head_pose)
        print("gaze output: ", mouse_coords)
    feed.close()
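Every example on this page drives its frame loop through the same InputFeeder helper from the Udacity starter code, which is never shown here. The following is a minimal sketch of the interface these examples assume; note that some projects modified next_batch() to yield (ret, frame) tuples, which is why several examples below unpack two values. Details such as frame skipping vary between projects.

import cv2

class InputFeeder:
    def __init__(self, input_type, input_file=None):
        # input_type is 'video', 'image' or 'cam'; input_file is unused for 'cam'
        self.input_type = input_type
        self.input_file = input_file

    def load_data(self):
        if self.input_type == 'video':
            self.cap = cv2.VideoCapture(self.input_file)
        elif self.input_type == 'cam':
            self.cap = cv2.VideoCapture(0)
        else:  # a single image is just read into memory
            self.cap = cv2.imread(self.input_file)

    def next_batch(self):
        # yield frames until the stream ends; a single image is yielded once
        if self.input_type == 'image':
            yield self.cap
            return
        while True:
            ret, frame = self.cap.read()
            if not ret:
                break
            yield frame

    def close(self):
        if self.input_type != 'image':
            self.cap.release()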
Example 2
def main():    
    args = build_argparser().parse_args()        
    inputFile = args.input
    inputFeeder = None
    if inputFile.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFile):
            print("Unable to find input file")
            exit(1)        
        inputFeeder = InputFeeder("video",inputFile)
    
    start_model_loading = time.time()
    detect, landmark, gaze, pose = init_models(args)
    inputFeeder.load_data()
    LoadModel(detect, landmark, gaze, pose)
    model_loading_time = time.time() - start_model_loading
    frame_count, inference_time = inference_frame(detect, pose, landmark, gaze, inputFeeder, args)
    fps = frame_count / inference_time

    print("Video is complete!")
    print(f'Models took {model_loading_time} s to load')
    print(f'Total inference time: {inference_time} s')
    print(f'Average inference time per frame: {inference_time/frame_count} s')
    print(f'FPS: {fps} frames/second')

    cv2.destroyAllWindows()
    inputFeeder.close()
Example 3
def main():
    # Load parameters
    params = get_args()

    mouse_prec = params['mouse_prec']
    mouse_speed = params['mouse_speed']
    mouse = MouseController(mouse_prec, mouse_speed)
    models = load_models(params)

    # Load input feed
    input_type = params['input_type']
    if input_type == 'cam':
        input_file = None
    else:
        input_file = params['input_file_path']

    feed = InputFeeder(input_type=input_type, input_file=input_file)
    feed.load_data()
    for batch in feed.next_batch():
        if batch is not None:
            image, pos = main_loop(batch, models)
            cv2.imshow('frame', image)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
            mouse.move(pos[0], pos[1])
        else:
            break
    feed.close()
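The MouseController used here and in most of the later examples is a thin wrapper around pyautogui from the same starter code. A minimal sketch, assuming the common preset mapping (the numeric values mirror the comments in a later example; projects that pass raw numbers such as MouseController(500, 4) skip the preset lookup):

import pyautogui

class MouseController:
    def __init__(self, precision, speed):
        # named presets assumed from the starter code; real projects vary
        precision_dict = {'high': 100, 'low': 1000, 'medium': 500}
        speed_dict = {'fast': 1, 'slow': 10, 'medium': 5}
        self.precision = precision_dict[precision]
        self.speed = speed_dict[speed]

    def move(self, x, y):
        # gaze x grows rightwards and y upwards, while the screen
        # y axis points down, hence the sign flip on y
        pyautogui.moveRel(x * self.precision,
                          -1 * y * self.precision,
                          duration=self.speed)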
Example 4
def process_video(file_input, file_output, display_intermediate_output):
    if file_input is None:
        feed = InputFeeder(input_type='cam')
    else:
        feed = InputFeeder(input_type='video', input_file=file_input)

    feed.load_data()

    w = int(feed.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(feed.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(feed.cap.get(cv2.CAP_PROP_FPS))
    out = cv2.VideoWriter(file_output, cv2.VideoWriter_fourcc(*'avc1'), fps, (w, h), True)

    frame_counter = 0
    for batch in feed.next_batch():
        frame_counter += 1
        result, frame = process_single_frame(batch, display_intermediate_output)
        out.write(frame)

        logging.debug(f'Frame #{frame_counter} result: {result}')
        if result == 'No face detected':
            logging.warning('Frame %s: No face detected', frame_counter)
            continue  # nothing to move the mouse with on this frame

        if mouse_controller is not None:
            mouse_controller.move(result[0], result[1])

    out.release()
    feed.close()
Example 5
def process_image(file_path, file_output, display_intermediate_output):
    feed = InputFeeder(input_type='image', input_file=file_path)
    feed.load_data()
    for batch in feed.next_batch():
        result, image = process_single_frame(batch, display_intermediate_output)
        cv2.imshow('demo image', image)
        cv2.imwrite(file_output, image)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    feed.close()
Example 6
def main(args):

    mouse_controller = MouseController('medium', 'fast')

    print("Model Loading..")

    face_detection = Model_FaceDetection(args.face_detection, args.device)
    face_landmark = Model_FacialLandmarksDetection(args.face_landmark, args.device)
    head_pose = Model_HeadPoseEstimation(args.head_pose, args.device)
    gaze_estimation = Model_GazeEstimation(args.gaze_estimation, args.device)
    
    print("Model loaded successfully")

    input_feeder  = InputFeeder(input_type='video', input_file=args.input)
    input_feeder.load_data()

    face_detection.load_model()
    head_pose.load_model()
    face_landmark.load_model()
    gaze_estimation.load_model()

    for frame in input_feeder.next_batch():
        if frame is None:  # end of stream
            break

        key = cv2.waitKey(60)

        face,face_coord = face_detection.predict(frame.copy(), args.prob_threshold)

        if type(face)==int:
            print("Unable to detect the face.")
            if key==27:
                break
            continue
        
        headPose = head_pose.predict(face.copy())
        
        left_eye, right_eye, eye_coord  = face_landmark.predict(face.copy())
        
        mouse_coord, gaze_vector = gaze_estimation.predict(left_eye, right_eye, headPose)
        
        cv2.imshow('video',frame)
        mouse_controller.move(mouse_coord[0], mouse_coord[1])


    input_feeder.close()
    cv2.destroyAllWindows()
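Example 7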
def process_video(input_video, video_output, visualize):
    if input_video is None:
        feed = InputFeeder(input_type='cam')
    else:
        feed = InputFeeder(input_type='video', input_file=input_video)

    feed.load_data()

    w = int(feed.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(feed.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(feed.cap.get(cv2.CAP_PROP_FPS))
    fps = int(fps / 4)
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(video_output, fourcc, fps, (w, h), True)

    frame_counter = 0
    for frame in feed.next_batch():
        if frame is not None:
            frame_counter += 1
            key = cv2.waitKey(10)
            result, output_frame = process_frame(frame, visualize)

            out.write(output_frame)

            print("Frame: {} result: {}".format(frame_counter, result))
            logger.info("Frame: {} result: {}".format(frame_counter, result))

            esc_code = 27
            if key == esc_code:
                break

            if mouse_controller is not None:
                try:
                    mouse_controller.move(result[0], result[1])

                except Exception as e:
                    print("Mouse controller exception:\n", e)
                    logger.info("Mouse controller exception:{}".format(e))

        else:
            break

    cv2.destroyAllWindows()
    out.release()
    feed.close()
    print("Saved the video")
    logger.info("Saved the video")
Example 8
def main(model_dir, device, precision, input_type, input_file, inspect):
    mouse_controller = MouseController("medium", "fast")
    input_feeder = InputFeeder(input_type=input_type, input_file=input_file)
    input_feeder.load_data()

    gaze_detect = GazeDetect(model_dir=model_dir, device=device, precision=precision)
    gaze_detect.load_model()

    for image in input_feeder.next_batch():
        with Timer() as t:
            outputs = gaze_detect.predict(image)
        if outputs is not None:
            angle_y_fc, angle_p_fc, angle_r_fc = outputs.reshape(3)
            mouse_controller.move(-angle_y_fc, angle_p_fc)
            print(
                f"Mouse move x: {-angle_y_fc}, y: {angle_p_fc}, execution time: {t.elapsed}"
            )
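Example 8 times each prediction with a Timer context manager that is not shown. A minimal sketch of what it must provide (elapsed is the only attribute the example reads):

import time

class Timer:
    # context manager that records wall-clock elapsed time in .elapsed
    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.elapsed = time.perf_counter() - self.start

Example 9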
def start_pipeline(cla, codec):
    """
    Initializes the input feed, runs inputs through the models, and moves the mouse cursor based on the final gaze estimation.
    :param cla: Command line arguments for configuring the pipeline.
    :param codec: Depending on the platform this is run on, OpenCV requires a codec to be specified. Supply it here.
    :return: None
    """
    preview_flags = cla.preview_flags

    logger = logging.getLogger()
    input_file_path = cla.input

    if input_file_path.lower() == "cam":
        in_feeder = InputFeeder("cam")
    elif not os.path.isfile(input_file_path):
        logger.error("Cannot locate video file provided. Exiting..")
        sys.exit(1)
    else:
        in_feeder = InputFeeder("video", input_file_path)

    start_model_load_time = time.time()
    fdm, fldm, hpem, gem = prep_models(cla)
    total_model_load_time = time.time() - start_model_load_time

    mc = None
    if not cla.is_benchmark:
        mc = MouseController('medium', 'fast')

    in_feeder.load_data()

    fps, total_inference_time, total_time = handle_input_feed(
        logger, preview_flags, fdm, fldm, hpem, gem, mc, in_feeder,
        cla.frame_out_rate, codec, cla.output_path)

    with open(os.path.join(cla.output_path, 'stats.txt'), 'w') as f:
        f.write("Total inference time, " + str(total_inference_time) + '\n')
        f.write("FPS, " + str(fps) + '\n')
        f.write("Total model load time, " + str(total_model_load_time) + '\n')
        f.write("Total time, " + str(total_time) + '\n')

    logger.error("Video stream ended...")
    cv2.destroyAllWindows()
    in_feeder.close()
Example 10
def main():
    """
    Load the network and parse the output.
    :return: None
    """
    # Grab command line args
    args = build_argparser().parse_args()
        
    start_time = time.time()
    face_detector = FaceDetect(model_name=args.face, device=args.device, output=args.output)
    face_detector.load_model()
    print("Time taken to load face detection model (in seconds):", time.time()-start_time)

    start_time = time.time()
    eyes_detector = EyesDetect(model_name=args.eyes, device=args.device, output=args.output)
    eyes_detector.load_model()
    print("Time taken to load landmark detection model (in seconds):", time.time()-start_time)

    start_time = time.time()
    angle_detector = AngleDetect(model_name=args.angle, device=args.device)
    angle_detector.load_model()
    print("Time taken to load head pose estimation model (in seconds):", time.time()-start_time)

    start_time = time.time()
    gaze_detector = GazeDetect(model_name=args.gaze, device=args.device)
    gaze_detector.load_model()
    print("Time taken to load gaze estimation model (in seconds):", time.time()-start_time)

    mouse_controller = MouseController('medium','medium')
    
    feed = InputFeeder(input_type=args.video, input_file=args.input)
    feed.load_data()
    for batch in feed.next_batch():
        if batch is None: # catch last frame
            break
        face = face_detector.predict(batch)
        left_eye, right_eye = eyes_detector.predict(face)
        angles = angle_detector.predict(face)
        x, y = gaze_detector.predict(left_eye, right_eye, angles)
        mouse_controller.move(x, y)

    feed.close()
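Most entry points on this page begin with args = build_argparser().parse_args(). The parser is project-specific and never shown; a representative sketch for Example 10, with flag names inferred from the attribute accesses above (real projects vary):

from argparse import ArgumentParser

def build_argparser():
    # flags assumed from how Example 10 reads args; not the original parser
    parser = ArgumentParser()
    parser.add_argument('--face', required=True, help='face detection model')
    parser.add_argument('--eyes', required=True, help='facial landmarks model')
    parser.add_argument('--angle', required=True, help='head pose estimation model')
    parser.add_argument('--gaze', required=True, help='gaze estimation model')
    parser.add_argument('--device', default='CPU', help='target device')
    parser.add_argument('--video', default='video', help="input type: 'video', 'image' or 'cam'")
    parser.add_argument('--input', default=None, help='path to the input file')
    parser.add_argument('--output', default=None, help='output directory')
    return parser

Example 11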
def setup(args):
    global input_path, output_path, device, cpu_extension, prob_threshold, flags, mouse_controller, feeder, video_writer, model_dict, model_loading_total_time
    model_args = [
        args.face_detection_model,
        args.facial_landmarks_detection_model,
        args.head_pose_estimation_model,
        args.gaze_estimation_model,
    ]
    model_class = [
        Model_FaceDetection,
        Model_FacialLandMarkDetection,
        Model_HeadPoseEstimation,
        Model_GazeEstimation,
    ]
    input_path = input_path_generator(args.input) if args.input != "CAM" else None
    output_path = output_path_generator(args.output)
    device = args.device
    cpu_extension = args.cpu_extension
    prob_threshold = args.prob_threshold
    flags = args.flags
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    mouse_controller = MouseController("low", "fast")
    if input_path:
        if input_path.endswith(".jpg"):
            feeder = InputFeeder("image", input_path)
        else:
            feeder = InputFeeder("video", input_path)
    else:
        feeder = InputFeeder("cam")
    feeder.load_data()
    fps = feeder.fps()
    initial_w, initial_h, video_len = feeder.frame_initials_and_length()
    video_writer = cv2.VideoWriter(
        os.path.join(output_path, "output_video.mp4"),
        cv2.VideoWriter_fourcc(*"avc1"),
        fps / 10,
        (initial_w, initial_h),
        True,
    )
    model_dict, model_loading_total_time = generate_model_dict(model_args, model_class)
    return
Example 12
def main(args):
    inference = Inference(args.model)
    inference.load_model()

    # avoid shadowing the built-in input(); is_image must be set on every path
    input_path = args.input
    is_image = False

    if input_path == 0:
        input_feeder = InputFeeder('cam', input_path)
    elif input_path.endswith(('.jpg', '.jpeg', '.bmp')):
        input_feeder = InputFeeder('image', input_path)
        is_image = True
    else:
        input_feeder = InputFeeder('video', input_path)

    input_feeder.load_data()

    if is_image:
        outputs = inference.predict(input_feeder.cap)

        inference.preprocess_output(outputs)
        return 0

    frames = 0
    for ret, frame in input_feeder.next_batch():

        if not ret:
            break

        frames += 1

        key = cv2.waitKey(60)
        if key == 27:
            break

        outputs = inference.predict(frame)

        inference.preprocess_output(outputs)

    input_feeder.close()
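Example 13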
def main(args):
    feed = InputFeeder(input_type=args.it, input_file=args.i)

    face_model = FaceDetectionModel(args.fm, args.d, args.c, float(args.p))
    face_model.load_model()

    landmarks_model = LandmarksDetectionModel(args.lm, args.d, args.c)
    landmarks_model.load_model()

    headpose_model = HeadPoseDetectionModel(args.hpm, args.d, args.c)
    headpose_model.load_model()

    gaze_model = GazeEstimationModel(args.gem, args.d, args.c)
    gaze_model.load_model()

    mouse = MouseController("medium", "fast")

    feed.load_data()
    for batch in feed.next_batch():
        cropped_face, coords, _ = face_model.predict(batch)
        cv2.rectangle(batch, (coords[0], coords[1]), (coords[2], coords[3]),
                      (255, 0, 0), 2)

        left_eye, right_eye, eyes_coords, _ = landmarks_model.predict(
            cropped_face)

        head_pose_angles, _ = headpose_model.predict(cropped_face)
        x, y, z, _ = gaze_model.predict(left_eye, right_eye, head_pose_angles,
                                        cropped_face, eyes_coords)

        mouse.move(x, y)

        cv2.imshow("img", batch)
        if cv2.waitKey(25) & 0xFF == ord('q'):
            break
    feed.close()
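Example 14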
def main():

    args = build_argparser().parse_args()
    visual = args.visual_flag
    log = logging.getLogger()
    input_source = args.input_source

    try:
        video_path = args.input_path
    except Exception as e:
        video_path = None
    feed = None
    if input_source.lower() == 'cam':
        feed = InputFeeder('cam')
    elif input_source.lower() == 'video' and os.path.isfile(video_path):
        feed = InputFeeder('video', video_path)
    else:
        log.error('Wrong input feed. (check the video path).')
        exit(1)

    fd = Model_Face(args.face_detection_model, args.device, args.extension)
    hp = Model_HeadPose(args.head_pose_model, args.device, args.extension)
    fl = Model_Faciallandmark(args.facial_landmarks_model, args.device,
                              args.extension)
    ga = Model_Gaze(args.gaze_model, args.device, args.extension)
    ### You can specify the value of precision and speed directly.
    ##  OR
    ## 'high'(100),'low'(1000),'medium','low-med' - precision
    ## 'fast'(1), 'slow'(10), 'medium', 'slow-med' - speed
    #     mouse = MouseController('low-med', 'slow-med')
    mouse = MouseController(500, 4)

    feed.load_data()

    # load models
    fd.load_model()
    hp.load_model()
    fl.load_model()
    ga.load_model()
    count = 0
    for ret, frame in feed.next_batch():
        if not ret:
            break
        count += 1
        if count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))
        key = cv2.waitKey(60)
        frame_cp = frame.copy()
        face, face_position = fd.predict(frame_cp, args.threshold)
        if type(face) == int:
            log.error("Prediction Error: Can't find face.")
            if key == 27:
                break
            continue
        face_cp = face.copy()
        hp_output = hp.predict(face_cp)
        left_eye, right_eye, facial = fl.predict(face_cp)
        mouse_coord, gaze_vector = ga.predict(left_eye, right_eye, hp_output)

        if (not len(visual) == 0):
            visual_frame = frame.copy()
            ### Visual FLAGS
            # face detection
            if 'fd' in visual:
                visual_frame = face
            # Head pose
            if 'hp' in visual:
                cv2.putText(
                    visual_frame,
                    "Yaw: {:.2f} Pitch: {:.2f} Roll: {:.2f}".format(
                        hp_output[0], hp_output[1], hp_output[2]), (10, 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.3, (0, 255, 50), 1)
            # Facial landmarks
            if 'fl' in visual:
                cv2.rectangle(face, (facial[0][0] - 10, facial[0][1] - 10),
                              (facial[0][2] + 10, facial[0][3] + 10),
                              (255, 0, 0), 3)
                cv2.rectangle(face, (facial[1][0] - 10, facial[1][1] - 10),
                              (facial[1][2] + 10, facial[1][3] + 10),
                              (255, 0, 0), 3)
            # Gaze estimation
            if 'ga' in visual:
                x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] *
                                                        12), 160
                le = cv2.line(left_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 255, 0), 2)
                cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 50, 150), 2)
                re = cv2.line(right_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 255, 0), 2)
                cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 50, 150), 2)
                face[facial[0][1]:facial[0][3], facial[0][0]:facial[0][2]] = le
                face[facial[1][1]:facial[1][3], facial[1][0]:facial[1][2]] = re
            cv2.namedWindow('Visualization', cv2.WINDOW_AUTOSIZE)
            cv2.moveWindow('Visualization', 900, 900)
            cv2.imshow('Visualization', cv2.resize(visual_frame, (500, 500)))
            if args.visual_save.lower() == 'y':
                if count % 10 == 0:
                    cv2.imwrite(str(count) + '_visual.jpg', visual_frame)
        if count % 5 == 0:
            mouse.move(mouse_coord[0], mouse_coord[1])
        if key == 27:
            break

    log.error('INFO: Ended!')
    cv2.destroyAllWindows()
    feed.close()
Example 15
def main():
    args = build_argparser().parse_args()
    device_name = args.device
    prob_threshold = args.prob_threshold
    logger_object = log.getLogger()

    # Initialize variables with the input arguments
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'FacialLandmarkModel': args.facialLandmarksModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }

    # Instantiate model
    face_model = FaceDetection(model_path_dict['FaceDetectionModel'], device_name, threshold=prob_threshold)
    landmark_model = FacialLandmarksDetection(model_path_dict['FacialLandmarkModel'], device_name,
                                              threshold=prob_threshold)
    head_pose_model = HeadPoseEstimation(model_path_dict['HeadPoseEstimationModel'], device_name,
                                         threshold=prob_threshold)
    gaze_model = GazeEstimation(model_path_dict['GazeEstimationModel'], device_name, threshold=prob_threshold)
    mouse_controller = MouseController('medium', 'fast')

    # Load Models and get time
    start_time = time.time()
    face_model.load_model()
    logger_object.error("Face detection model loaded: time: {:.3f} ms".format((time.time() - start_time) * 1000))

    first_mark = time.time()
    landmark_model.load_model()
    logger_object.error(
        "Facial landmarks detection model loaded: time: {:.3f} ms".format((time.time() - first_mark) * 1000))

    second_mark = time.time()
    head_pose_model.load_model()
    logger_object.error("Head pose estimation model loaded: time: {:.3f} ms".format((time.time() - second_mark) * 1000))

    third_mark = time.time()
    gaze_model.load_model()
    logger_object.error("Gaze estimation model loaded: time: {:.3f} ms".format((time.time() - third_mark) * 1000))
    load_total_time = time.time() - start_time
    logger_object.error("Total loading time: time: {:.3f} ms".format(load_total_time * 1000))
    logger_object.error("All models are loaded successfully..")

    # Check for unsupported layers
    face_model.check_model()
    landmark_model.check_model()
    head_pose_model.check_model()
    gaze_model.check_model()

    preview_flags = args.previewFlags
    input_filename = args.input
    output_path = args.output_path
    prob_threshold = args.prob_threshold

    if input_filename.lower() == 'cam':
        input_feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger_object.error("Unable to find specified video file")
            exit(1)
        input_feeder = InputFeeder(input_type='video', input_file=input_filename)

    for model_path in list(model_path_dict.values()):
        if not os.path.isfile(model_path):
            logger_object.error("Unable to find specified model file" + str(model_path))
            exit(1)

    input_feeder.load_data()
    width = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(input_feeder.cap.get(cv2.CAP_PROP_FPS))
    out_video = cv2.VideoWriter(os.path.join('output_video.mp4'), cv2.VideoWriter_fourcc(*'avc1'), fps,
                                (width, height), True)

    frame_counter = 0
    start_inf_time = time.time()
    for ret, frame in input_feeder.next_batch():
        if not ret:
            break
        frame_counter += 1
        key = cv2.waitKey(60)

        try:
            cropped_image, face_cords = face_model.predict(frame, prob_threshold)

            if type(cropped_image) == int:
                print("Unable to detect the face")
                if key == 27:
                    break
                continue

            left_eye, right_eye, eye_cords = landmark_model.predict(cropped_image)
            pose_output = head_pose_model.predict(cropped_image)
            x, y, z = gaze_model.predict(left_eye, right_eye, pose_output, cropped_image, eye_cords)

            mouse_controller.move(x, y)
        except Exception as e:
            print(str(e) + " for frame " + str(frame_counter))
            continue

        image = cv2.resize(frame, (width, height))
        if not len(preview_flags) == 0:
            preview_frame = frame.copy()

            if 'fd' in preview_flags:
                if len(preview_flags) != 1:
                    preview_frame = cropped_image
                    cv2.rectangle(frame, (face_cords[0], face_cords[1]), (face_cords[2], face_cords[3]), (0, 0, 255), 3)

            if 'hp' in preview_flags:
                cv2.putText(
                    frame,
                    "Pose Angles: yaw= {:.2f} , pitch= {:.2f} , roll= {:.2f}".format(
                        pose_output[0], pose_output[1], pose_output[2]),
                    (20, 40),
                    cv2.FONT_HERSHEY_DUPLEX,
                    1, (255, 0, 0), 3)

            if 'ge' in preview_flags:
                cv2.putText(
                    frame,
                    "Gaze vector: x= {:.2f} , y= {:.2f} , z= {:.2f}".format(
                        x, y, z),
                    (15, 100),
                    cv2.FONT_HERSHEY_COMPLEX,
                    1, (0, 255, 0), 3)

            image = np.hstack((cv2.resize(frame, (500, 500)), cv2.resize(preview_frame, (500, 500))))

        cv2.imshow('preview', image)
        out_video.write(image)


        if key == 27:
            break

    inference_time = round(time.time() - start_inf_time, 1)
    fps = int(frame_counter) / inference_time
    logger_object.error("counter {} seconds".format(frame_counter))
    logger_object.error("total inference time {} seconds".format(inference_time))
    logger_object.error("fps {} frame/second".format(fps))
    with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'stats.txt'), 'w') as f:
        f.write('inference time : ' + str(inference_time) + '\n')
        f.write('fps: ' + str(fps) + '\n')
        f.write('Models Loading: '+ str(load_total_time) + '\n')
    logger_object.error('Video stream ended')
    cv2.destroyAllWindows()
    input_feeder.close()
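Example 16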
def main():

    # Grab command line args
    args = build_argparser().parse_args()
    flags = args.models_outputs_flags

    logger = logging.getLogger()
    input_file_path = args.input
    input_feeder = None
    if input_file_path.lower() == "cam":
        input_feeder = InputFeeder("cam")
    else:
        if not os.path.isfile(input_file_path):
            logger.error("Unable to find specified video file")
            exit(1)
        input_feeder = InputFeeder("video", input_file_path)

    model_path_dict = {
        'FaceDetection': args.face_detection_model,
        'FacialLandmarks': args.facial_landmarks_model,
        'GazeEstimation': args.gaze_estimation_model,
        'HeadPoseEstimation': args.head_pose_estimation_model
    }

    for file_name_key in model_path_dict.keys():
        if not os.path.isfile(model_path_dict[file_name_key]):
            logger.error("Unable to find specified " + file_name_key +
                         " xml file")
            exit(1)

    fdm = FaceDetection(model_path_dict['FaceDetection'], args.device,
                        args.cpu_extension)
    flm = FacialLandmarks(model_path_dict['FacialLandmarks'], args.device,
                          args.cpu_extension)
    gem = GazeEstimation(model_path_dict['GazeEstimation'], args.device,
                         args.cpu_extension)
    hpem = HeadPoseEstimation(model_path_dict['HeadPoseEstimation'],
                              args.device, args.cpu_extension)

    mc = MouseController('medium', 'fast')

    input_feeder.load_data()
    fdm.load_model()
    flm.load_model()
    hpem.load_model()
    gem.load_model()

    frame_count = 0
    for ret, frame in input_feeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        cropped_face, face_coords = fdm.predict(frame, args.prob_threshold)
        if type(cropped_face) == int:
            logger.error("Unable to detect any face.")
            if key == 27:
                break
            continue

        hp_output = hpem.predict(cropped_face)

        left_eye_img, right_eye_img, eye_coords = flm.predict(cropped_face)

        new_mouse_coord, gaze_vector = gem.predict(left_eye_img, right_eye_img,
                                                   hp_output)

        if (not len(flags) == 0):
            preview_frame = frame
            if 'fd' in flags:
                preview_frame = cropped_face
            if 'fld' in flags:
                cv2.rectangle(cropped_face,
                              (eye_coords[0][0] - 10, eye_coords[0][1] - 10),
                              (eye_coords[0][2] + 10, eye_coords[0][3] + 10),
                              (0, 255, 0), 3)
                cv2.rectangle(cropped_face,
                              (eye_coords[1][0] - 10, eye_coords[1][1] - 10),
                              (eye_coords[1][2] + 10, eye_coords[1][3] + 10),
                              (0, 255, 0), 3)

            if 'hp' in flags:
                cv2.putText(
                    preview_frame,
                    "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                    format(hp_output[0], hp_output[1], hp_output[2]), (10, 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)
            if 'ge' in flags:
                x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] *
                                                        12), 160
                left_eye = cv2.line(left_eye_img, (x - w, y - w),
                                    (x + w, y + w), (255, 0, 255), 2)
                cv2.line(left_eye, (x - w, y + w), (x + w, y - w),
                         (255, 0, 255), 2)
                right_eye = cv2.line(right_eye_img, (x - w, y - w),
                                     (x + w, y + w), (255, 0, 255), 2)
                cv2.line(right_eye, (x - w, y + w), (x + w, y - w),
                         (255, 0, 255), 2)
                cropped_face[eye_coords[0][1]:eye_coords[0][3],
                             eye_coords[0][0]:eye_coords[0][2]] = left_eye
                cropped_face[eye_coords[1][1]:eye_coords[1][3],
                             eye_coords[1][0]:eye_coords[1][2]] = right_eye

            cv2.imshow("Visualization", cv2.resize(preview_frame, (500, 500)))

        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break
    logger.error("VideoStream ended...")
    cv2.destroyAllWindows()
    input_feeder.close()
Example 17
def infer_on_stream(args, model):
    '''
    Run inference on the input stream.

    :param args: argparser arguments
    :param model: loaded model
    '''

    # get the loaded model instance
    objectDetection = model

    # Handle the input stream
    # Check if the input is a webcam or video or image
    if args.input == 'cam':
        feed = InputFeeder(input_type='cam', flip=1)
        feed.set_camera_properties(args.width, args.height, args.fps)
    elif args.input == 'picam':
        feed = InputFeeder(input_type='picam')
        feed.set_camera_properties(args.width, args.height, args.fps)
    elif args.input.endswith('.jpg') or args.input.endswith(
            '.bmp') or args.input.endswith('.png'):
        feed = InputFeeder(input_type='image', input_file=args.input)
    elif args.input.endswith('.mp4'):
        feed = InputFeeder(input_type='video', input_file=args.input)
    else:
        print(
            "ERROR: Invalid input, it must be CAM, image (.jpg, .bmp or .png) or video (.mp4)!"
        )
        raise NotImplementedError

    feed.load_data()

    # run-time switches
    ui_marking = True
    fps_marking = False
    label_background_color = (125, 175, 75)
    label_text_color = (255, 255, 255)  # white text

    cv2.namedWindow("Frame", cv2.WINDOW_NORMAL)
    cv2.setWindowProperty("Frame", cv2.WND_PROP_FULLSCREEN,
                          cv2.WINDOW_FULLSCREEN)

    # Start recording if output saving is enabled
    if args.save_output:
        now = datetime.datetime.now()
        out = cv2.VideoWriter(now.strftime("out-%Y%m%d-%H%M%S.avi"),
                              cv2.VideoWriter_fourcc(*'MJPG'), 15,
                              (args.width, args.height))

    for batch in feed.next_batch():
        if batch is None:
            continue
        # start measuring overall execution time
        start_processing_time = time.time()
        # 1) First detect objects on the image
        start_object_infer_time = time.time()  # time measurement started
        objects = objectDetection.predict(batch)
        total_object_infer_time = time.time() - start_object_infer_time  # time measurement finished

        # executed only if there are objects on the image
        if len(objects) > 0:

            # if UI marking is turned on draw the vectors, rectangles, etc
            if ui_marking:
                # objects bounding boxes
                for obj in objects:
                    # draw the bounding box
                    cv2.rectangle(batch, (obj['xmin'], obj['ymin']),
                                  (obj['xmax'], obj['ymax']), obj['color'], 2)
                    # prepare the label
                    label_text = f"{obj['class']}: {obj['confidence']*100:.3}%"
                    label_size = cv2.getTextSize(label_text,
                                                 cv2.FONT_HERSHEY_SIMPLEX, 0.8,
                                                 1)[0]
                    label_left = obj['xmin']
                    label_top = obj['ymin'] - label_size[1]
                    if (label_top < 1):
                        label_top = 1
                    label_right = label_left + label_size[0]
                    label_bottom = label_top + label_size[1] - 3
                    cv2.rectangle(batch, (label_left - 1, label_top - 6),
                                  (label_right + 1, label_bottom + 1),
                                  label_background_color, -1)
                    cv2.putText(batch, label_text, (label_left, label_bottom),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.8,
                                label_text_color, 1)

        # Measure overall FPS
        total_processing_time = time.time() - start_processing_time
        if total_processing_time == 0:
            total_processing_time = 0.001  # handle zero division
        total_fps = 1 / (total_processing_time)

        # if FPS marking run time switch is turned on print some details on the image
        if fps_marking:
            label_text = f"FPS: {total_fps:.3}"
            cv2.putText(batch, label_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (0, 0, 255), 1)
            label_text = f"Object detection inference time: {total_object_infer_time*1000:.4}ms"
            cv2.putText(batch, label_text, (10, 50), cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (0, 0, 255), 1)

        # Show the output image and save the output video
        cv2.imshow('Frame', batch)
        if args.save_output:
            out.write(batch)

        # Press q on keyboard to exit
        # Press u on keyboard to toggle UI drawings
        # Press f on keyboard to toggle FPS drawings
        ret = cv2.waitKey(20)
        if ret & 0xFF == ord('q'):
            break
        elif ret & 0xFF == ord('u'):
            ui_marking = not ui_marking
        elif ret & 0xFF == ord('f'):
            fps_marking = not fps_marking

    # close the feed when stopping and finish the video saving
    feed.close()
    if args.save_output:
        out.release()
Example 18
    def infer(self, args):
        # Create instances from the models' classes
        FDM_net = ModelFaceDetection()
        HPE_net = ModelHeadPoseEstimation()
        FLD_net = ModelFacialLandmarksDetection()
        GEM_net = ModelGazeEstimation()
        mouse_controller = MouseController('high', 'fast')

        # Load the models
        start1 = time.time()
        FDM_net.load_model(args.face_detection_model, args.device)
        FDM_load_t = time.time() - start1

        start2 = time.time()
        HPE_net.load_model(args.head_pose_estimation_model, args.device)
        HPE_load_t = time.time() - start2

        start3 = time.time()
        FLD_net.load_model(args.facial_landmarks_detection_model, args.device)
        FLD_load_t = time.time() - start3

        start4 = time.time()
        GEM_net.load_model(args.gaze_estimation_model, args.device)
        GEM_load_t = time.time() - start4

        print('All models are loaded!')

        #Check the inputs
        # To make the mouse moving we need video stream either from camera or video path
        if args.input.lower() == 'cam':
            # Initialise the InputFeeder class
            input_feeder = InputFeeder(input_type='cam', input_file=args.input)
        else:
            if not os.path.isfile(args.input):
                log.error("Please insert valid video path to run the app.")
                exit()
            # Initialise the InputFeeder class
            input_feeder = InputFeeder(input_type='video',
                                       input_file=args.input)

        # Load the video capture
        input_feeder.load_data()

        # Inference time
        inference = time.time()

        # Read from the video capture
        for flag, frame in input_feeder.next_batch():
            if not flag:
                break
            key_pressed = cv2.waitKey(60)

            # Run inference on the models
            start5 = time.time()
            face_coords = FDM_net.predict(frame)
            FDM_infer_t = time.time() - start5

            # Everything depends on the face detection output; if no face was
            # detected, skip this frame before trying to crop it
            if len(face_coords) == 0:
                log.error("No faces detected.")
                continue

            # crop the face from the frame
            cropped_face = frame[face_coords[1]:face_coords[3],
                                 face_coords[0]:face_coords[2]]
            start6 = time.time()
            HP_angles = HPE_net.predict(cropped_face, face_coords)
            HPE_infer_t = time.time() - start6

            # keep a drawable copy of the frame; it is used further down
            # (landmark prediction and imshow) even when display is off
            O_frame = frame.copy()
            if args.display_flag:
                #### display the face
                cv2.rectangle(O_frame, (face_coords[0], face_coords[1]),
                              (face_coords[2], face_coords[3]),
                              (255, 255, 0), 2)

                #### display the pose angles
                # Link for pose estimation output code resource: https://sudonull.com/post/6484-Intel-OpenVINO-on-Raspberry-Pi-2018-harvest
                cos_r = cos(HP_angles[2] * pi / 180)
                sin_r = sin(HP_angles[2] * pi / 180)
                cos_y = cos(HP_angles[0] * pi / 180)
                sin_y = sin(HP_angles[0] * pi / 180)
                cos_p = cos(HP_angles[1] * pi / 180)
                sin_p = sin(HP_angles[1] * pi / 180)

                x = int((face_coords[0] + face_coords[2]) / 2)
                y = int((face_coords[1] + face_coords[3]) / 2)
                cv2.line(O_frame, (x, y),
                         (x + int(65 *
                                  (cos_r * cos_y + sin_y * sin_p * sin_r)),
                          y + int(65 * cos_p * sin_r)), (255, 0, 0),
                         thickness=2)
                cv2.line(O_frame, (x, y),
                         (x + int(65 *
                                  (cos_r * sin_y * sin_p + cos_y * sin_r)),
                          y - int(65 * cos_p * cos_r)), (0, 255, 0),
                         thickness=2)
                cv2.line(O_frame, (x, y),
                         (x + int(65 * sin_y * cos_p), y + int(65 * sin_p)),
                         (0, 0, 255),
                         thickness=2)

            start7 = time.time()
            l_e, r_e, l_e_image, r_e_image, e_center = FLD_net.predict(
                O_frame, cropped_face, face_coords)
            FLD_infer_t = time.time() - start7

            ###display landmarks for both eyes
            if args.display_flag:
                cv2.circle(O_frame,
                           (face_coords[0] + l_e[0], face_coords[1] + l_e[1]),
                           29, (0, 255, 255), 2)
                cv2.circle(O_frame,
                           (face_coords[0] + r_e[0], face_coords[1] + r_e[1]),
                           29, (0, 255, 255), 2)

            start8 = time.time()
            g_vec = GEM_net.predict(l_e_image, r_e_image, HP_angles)
            GEM_infer_t = time.time() - start8

            ###display gaze model output
            if args.display_flag:
                cv2.arrowedLine(O_frame,
                                (int(e_center[0][0]), int(e_center[0][1])),
                                (int(e_center[0][0]) + int(g_vec[0] * 90),
                                 int(e_center[0][1]) + int(-g_vec[1] * 90)),
                                (203, 192, 255), 2)
                cv2.arrowedLine(O_frame,
                                (int(e_center[1][0]), int(e_center[1][1])),
                                (int(e_center[1][0]) + int(g_vec[0] * 90),
                                 int(e_center[1][1]) + int(-g_vec[1] * 90)),
                                (203, 192, 255), 2)

            # change the pointer position according to the estimated gaze direction
            mouse_controller.move(g_vec[0], g_vec[1])

            if key_pressed == 27:
                break

            # Display the resulting frame
            cv2.imshow('Mouse Controller App Results',
                       cv2.resize(O_frame, (750, 550)))

        inference_time = time.time() - inference

        print("Loading time: \n1-Face detection: " + str(FDM_load_t) +
              "\n2- Head pose estimation: " + str(HPE_load_t) +
              "\n3-Facial landmarks model: " + str(FLD_load_t) +
              "\n4-Gaze estimation model:  " + str(GEM_load_t))
        print("Output inference time: \n1-Face detection: " +
              str(FDM_infer_t) + "\n2- Head pose estimation: " +
              str(HPE_infer_t) + "\n3-Facial landmarks model: " +
              str(FLD_infer_t) + "\n4-Gaze estimation model:  " +
              str(GEM_infer_t))

        # close the input feeder and destroy all opened windows
        input_feeder.close()
        cv2.destroyAllWindows()
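The axis drawing in Example 18 hand-rolls the projection of the head's coordinate axes from the yaw/pitch/roll angles. Here is the same trigonometry factored into a standalone helper for readability (a sketch, not part of the original example):

from math import cos, sin, pi

def pose_axis_endpoints(yaw, pitch, roll, cx, cy, scale=65):
    # project the head's X, Y and Z axes onto the image plane
    # around the face centre (cx, cy); angles arrive in degrees
    yw, p, r = (a * pi / 180 for a in (yaw, pitch, roll))
    x_axis = (cx + int(scale * (cos(r) * cos(yw) + sin(yw) * sin(p) * sin(r))),
              cy + int(scale * cos(p) * sin(r)))
    y_axis = (cx + int(scale * (cos(r) * sin(yw) * sin(p) + cos(yw) * sin(r))),
              cy - int(scale * cos(p) * cos(r)))
    z_axis = (cx + int(scale * sin(yw) * cos(p)),
              cy + int(scale * sin(p)))
    return x_axis, y_axis, z_axis

Example 19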
def main(args):

    # getting the arguments
    perf_counts = args.get_perf_counts.lower() == "true"
    precision = args.precision.lower()
    speed = args.speed.lower()
    media_type = args.media_type.lower()
    media_path = args.media_file
    toggle_ui = args.show_video
    batch_size = args.batch_size
    device = args.device
    iterations = 1 if media_type == "cam" else int(args.iterations)

    #initialize the mouse object
    mouse = MouseController(precision, speed)

    # Initialize the input feeder
    feed = InputFeeder(media_type, batch_size, media_path)

    # Initialize and load the inference models
    model = Model(face_detection, facial_landmarks, gaze_estimation,
                  head_pose_estimation, device)
    model.load_models()

    for _ in range(iterations):

        feed.load_data()

        #This will be used as a way to keep track of the average time for the preprocessing and inference of the models
        times = np.zeros((8, ))
        counter_frames = 0

        if media_type != "image":
            width = feed.cap.get(3)
            height = feed.cap.get(4)
        else:
            height, width, _ = feed.cap.shape
        try:
            for frame in feed.next_batch(media_type):
                counter_frames += 1
                #generates the prediction
                x, y, gaze_vector, times = model.predict(
                    frame, width, height, times)
                #generates the movement on the cursor
                mouse.move(x, y)

                if perf_counts:
                    cv2.putText(
                        frame, "Preprocess Face Detection: " +
                        str(times[0] / counter_frames * 1000) + " ms", (0, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (209, 80, 0), 3)
                    cv2.putText(
                        frame, "Inference Face Detection: " +
                        str(times[1] / counter_frames * 1000) + " ms",
                        (0, 100), cv2.FONT_HERSHEY_SIMPLEX, 1, (209, 80, 0), 3)
                    cv2.putText(
                        frame, "Preprocess Facial Landmarks: " +
                        str(times[2] / counter_frames * 1000) + " ms",
                        (0, 150), cv2.FONT_HERSHEY_SIMPLEX, 1, (209, 80, 0), 3)
                    cv2.putText(
                        frame, "Inference Facial Landmarks: " +
                        str(times[3] / counter_frames * 1000) + " ms",
                        (0, 200), cv2.FONT_HERSHEY_SIMPLEX, 1, (209, 80, 0), 3)
                    cv2.putText(
                        frame, "Preprocess Head Pose: " +
                        str(times[4] / counter_frames * 1000) + " ms",
                        (0, 250), cv2.FONT_HERSHEY_SIMPLEX, 1, (209, 80, 0), 3)
                    cv2.putText(
                        frame, "Inference Head Pose: " +
                        str(times[5] / counter_frames * 1000) + " ms",
                        (0, 300), cv2.FONT_HERSHEY_SIMPLEX, 1, (209, 80, 0), 3)
                    cv2.putText(
                        frame, "Preprocess Gaze Estimation: " +
                        str(times[6] / counter_frames * 1000) + " ms",
                        (0, 350), cv2.FONT_HERSHEY_SIMPLEX, 1, (209, 80, 0), 3)
                    cv2.putText(
                        frame, "Inference Gaze Estimation: " +
                        str(times[7] / counter_frames * 1000) + " ms",
                        (0, 400), cv2.FONT_HERSHEY_SIMPLEX, 1, (209, 80, 0), 3)
                    print("Preprocess Face Detection: " +
                          str(times[0] / counter_frames * 1000) + " ms")
                    print("Inference Face Detection: " +
                          str(times[1] / counter_frames * 1000) + " ms")
                    print("Preprocess Facial Landmarks: " +
                          str(times[2] / counter_frames * 1000) + " ms")
                    print("Inference Facial Landmarks: " +
                          str(times[3] / counter_frames * 1000) + " ms")
                    print("Preprocess Head Pose: " +
                          str(times[4] / counter_frames * 1000) + " ms")
                    print("Inference Head Pose: " +
                          str(times[5] / counter_frames * 1000) + " ms")
                    print("Preprocess Gaze Estimation: " +
                          str(times[6] / counter_frames * 1000) + " ms")
                    print("Inference Gaze Estimation: " +
                          str(times[7] / counter_frames * 1000) + " ms")

                if toggle_ui:
                    cv2.imshow("Frame", frame)
                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'):
                    break
                if key == ord('i'):
                    toggle_ui = not toggle_ui

        except Exception:
            print("Video has ended or couldn't continue")
        if perf_counts:
            print("Final average: ")
            print("Preprocess Face Detection: " +
                  str(times[0] / counter_frames * 1000) + " ms")
            print("Inference Face Detection: " +
                  str(times[1] / counter_frames * 1000) + " ms")
            print("Preprocess Facial Landmarks: " +
                  str(times[2] / counter_frames * 1000) + " ms")
            print("Inference Facial Landmarks: " +
                  str(times[3] / counter_frames * 1000) + " ms")
            print("Preprocess Head Pose: " +
                  str(times[4] / counter_frames * 1000) + " ms")
            print("Inference Head Pose: " +
                  str(times[5] / counter_frames * 1000) + " ms")
            print("Preprocess Gaze Estimation: " +
                  str(times[6] / counter_frames * 1000) + " ms")
            print("Inference Gaze Estimation: " +
                  str(times[7] / counter_frames * 1000) + " ms")
        feed.close()
        cv2.destroyAllWindows()
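Example 20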
def main():

    args = get_args().parse_args()
    path_filender = args.input
    four_flags = args.flags_checker
    loger = logging.getLogger()
    feeder_in = None
    out_path = args.out_path

    if path_filender.lower() == "cam":
        feeder_in = InputFeeder("cam")
    else:
        if not os.path.isfile(path_filender):
            loger.error("The video was not found")
            exit(1)
        feeder_in = InputFeeder("video", path_filender)

    model_locations = {
        'FaceDetection': args.face_detection_model,
        'HeadPoseEstimation': args.head_pose_estimation_model,
        'FacialLandmarksDetection': args.facial_landmarks_detection_model,
        'GazeEstimation': args.gaze_estimation_model
    }

    for key_name in model_locations.keys():
        if not os.path.isfile(model_locations[key_name]):
            loger.error("The system cannot find the " + key_name + " xml file")
            exit(1)

    dt = FaceDetection(model_locations['FaceDetection'], args.device,
                       args.cpu_extension)
    pe = HeadPoseEstimation(model_locations['HeadPoseEstimation'], args.device,
                            args.cpu_extension)
    ld = FacialLandmarksDetection(model_locations['FacialLandmarksDetection'],
                                  args.device, args.cpu_extension)
    ge = GazeEstimation(model_locations['GazeEstimation'], args.device,
                        args.cpu_extension)

    cursor = MouseController('medium', 'fast')

    feeder_in.load_data()
    model_load_time_start = time.time()
    dt.load_model()
    pe.load_model()
    ld.load_model()
    ge.load_model()
    total_load_time = time.time() - model_load_time_start

    frame_counter = 0
    inference_time_start = time.time()
    for ret, frame in feeder_in.next_batch():
        if not ret:
            break
        frame_counter = frame_counter + 1
        cv2.imshow('video', cv2.resize(frame, (600, 600)))

        key = cv2.waitKey(60)

        face_detected, coords_face = dt.predict(frame, args.p_th)
        if type(face_detected) == int:
            loger.error("The system cannot detect any face.")
            if key == 27:
                break
            continue

        head_pose_output = pe.predict(face_detected)
        eye_left_detect, eye_right_detect, eye_coordinates_detect = ld.predict(
            face_detected)
        coordi_update_pointer, coordi_gaze = ge.predict(
            eye_left_detect, eye_right_detect, head_pose_output)

        if (not len(four_flags) == 0):
            result_app = frame
            if 'fad' in four_flags:
                result_app = face_detected
            if 'hpe' in four_flags:
                cv2.putText(
                    result_app,
                    "HP Angles: YAW:{:.3f} * PITCH:{:.3f} * ROLL:{:.3f}".
                    format(head_pose_output[0], head_pose_output[1],
                           head_pose_output[2]), (5, 40),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (153, 76, 0), 0)
            if 'fld' in four_flags:
                cv2.rectangle(face_detected,
                              (eye_coordinates_detect[0][0] - 4,
                               eye_coordinates_detect[0][1] - 4),
                              (eye_coordinates_detect[0][2] + 4,
                               eye_coordinates_detect[0][3] + 4),
                              (255, 255, 0), 4)
                cv2.rectangle(face_detected,
                              (eye_coordinates_detect[1][0] - 4,
                               eye_coordinates_detect[1][1] - 4),
                              (eye_coordinates_detect[1][2] + 4,
                               eye_coordinates_detect[1][3] + 4),
                              (255, 255, 0), 4)
            if 'gae' in four_flags:
                x = int(coordi_gaze[0] * 2)
                y = int(coordi_gaze[1] * 2)
                w = 150
                right_E = cv2.line(eye_right_detect, (x - w, y - w),
                                   (x + w, y + w), (51, 255, 153), 1)
                cv2.line(right_E, (x - w, y + w), (x + w, y - w),
                         (51, 255, 253), 1)
                left_E = cv2.line(eye_left_detect, (x - w, y - w),
                                  (x + w, y + w), (51, 255, 153), 1)
                cv2.line(left_E, (x - w, y + w), (x + w, y - w),
                         (51, 255, 253), 1)
                face_detected[
                    eye_coordinates_detect[1][1]:eye_coordinates_detect[1][3],
                    eye_coordinates_detect[1][0]:eye_coordinates_detect[1]
                    [2]] = right_E
                face_detected[
                    eye_coordinates_detect[0][1]:eye_coordinates_detect[0][3],
                    eye_coordinates_detect[0][0]:eye_coordinates_detect[0]
                    [2]] = left_E

            cv2.imshow("Result of the App", cv2.resize(result_app, (600, 600)))

        if frame_counter % 5 == 0:
            cursor.move(coordi_update_pointer[0], coordi_update_pointer[1])
        if key == 27:
            break

    total_time = time.time() - inference_time_start
    total_time_for_inference = round(total_time, 1)
    fps = frame_counter / total_time_for_inference

    with open(os.path.join(out_path, 'stats.txt'), 'w') as f:
        f.write('Inference time: ' + str(total_time_for_inference) + '\n')
        f.write('FPS: ' + str(fps) + '\n')
        f.write('Model load time: ' + str(total_load_time) + '\n')

    loger.error("The video stream is over...")
    cv2.destroyAllWindows()
    feeder_in.close()
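Example 21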
class MoveMouse:
    '''
    Main Class for the Mouse Controller app. 
    This is the class where all the models are stitched together to control the mouse pointer
    '''
    def __init__(self, args):
        '''
        This method initializes instance variables for the MoveMouse class.

        Args:
        args = All arguments parsed by the arguments parser function

        Return:
        None
        '''

        init_start_time = time.time()
        self.output_path = args.output_path
        self.show_output = args.show_output
        self.total_processing_time = 0
        self.count_batch = 0
        self.inference_speed = []
        self.avg_inference_speed = 0

        if args.all_devices != 'CPU':
            args.face_device = args.all_devices
            args.face_landmark_device = args.all_devices
            args.head_pose_device = args.all_devices
            args.gaze_device = args.all_devices

        model_init_start = time.time()
        self.face_model = FaceDetection(args.face_model, args.face_device,
                                        args.face_device_ext,
                                        args.face_prob_threshold)
        self.landmarks_model = FacialLandmarksDetection(
            args.face_landmark_model, args.face_landmark_device,
            args.face_landmark_device_ext, args.face_landmark_prob_threshold)
        self.head_pose_model = HeadPoseEstimation(
            args.head_pose_model, args.head_pose_device,
            args.head_pose_device_ext, args.head_pose_prob_threshold)
        self.gaze_model = GazeEstimation(args.gaze_model, args.gaze_device,
                                         args.gaze_device_ext,
                                         args.gaze_prob_threshold)
        self.model_init_time = time.time() - model_init_start
        log.info('[ Main ] All required models initialized')

        self.mouse_control = MouseController(args.precision, args.speed)
        log.info('[ Main ] Mouse controller successfully initialized')

        self.input_feeder = InputFeeder(args.batch_size, args.input_type,
                                        args.input_file)
        log.info('[ Main ] Initialized input feeder')

        model_load_start = time.time()
        self.face_model.load_model()
        self.landmarks_model.load_model()
        self.head_pose_model.load_model()
        self.gaze_model.load_model()

        self.model_load_time = time.time() - model_load_start
        self.app_init_time = time.time() - init_start_time
        log.info('[ Main ] All models loaded to Inference Engine\n')


    def draw_face_box(self, frame, face_coords):
        '''
        Draws face's bounding box on the input frame
        Args:
        frame = Input frame from video or camera feed. It could also be an input image

        Return:
        frame = Frame with bounding box of faces drawn on it
        '''

        start_point = (face_coords[0][0], face_coords[0][1])
        end_point = (face_coords[0][2], face_coords[0][3])
        thickness = 5
        color = (255, 86, 0)

        frame = cv2.rectangle(frame, start_point, end_point, color, thickness)

        return frame

    def draw_eyes_boxes(self, frame, left_eye_coords, right_eye_coords):
        '''
        Draws the left and right eye bounding boxes on the input frame
        Args:
        frame = Input frame from video or camera feed. It could also be an input image

        Return:
        frame = Frame with bounding box of left and right eyes drawn on it
        '''

        left_eye_start_point = (left_eye_coords[0], left_eye_coords[1])
        left_eye_end_point = (left_eye_coords[2], left_eye_coords[3])
        right_eye_start_point = (right_eye_coords[0], right_eye_coords[1])
        right_eye_end_point = (right_eye_coords[2], right_eye_coords[3])
        thickness = 5
        color = (0, 210, 0)

        frame = cv2.rectangle(frame, left_eye_start_point, left_eye_end_point,
                              color, thickness)
        frame = cv2.rectangle(frame, right_eye_start_point,
                              right_eye_end_point, color, thickness)

        return frame

    def draw_outputs(self, frame):
        '''
        Draws the inference outputs (bounding boxes of the face and both eyes and 
        the 3D head pose directions) of the four models onto the frames.

        Args:
        frame = Input frame from video or camera feed. It could also be an input image

        Return:
        frame = Frame with all inference outputs drawn on it
        '''

        frame = self.draw_face_box(frame, self.face_coords)
        frame = self.draw_eyes_boxes(frame, self.left_eye_coords,
                                     self.right_eye_coords)

        frame_id = f'Batch id = {self.count_batch}'
        avg_inference_speed = f'Avg. inference speed = {self.avg_inference_speed:.3f}fps'
        total_processing_time = f'Total infer. time = {self.total_processing_time:.3f}s'

        cv2.putText(frame, frame_id, (15, 15), cv2.FONT_HERSHEY_COMPLEX, 0.45,
                    (255, 86, 0), 1)
        cv2.putText(frame, avg_inference_speed, (15, 30),
                    cv2.FONT_HERSHEY_COMPLEX, 0.45, (255, 86, 0), 1)
        cv2.putText(frame, total_processing_time, (15, 45),
                    cv2.FONT_HERSHEY_COMPLEX, 0.45, (255, 86, 0), 1)

        return frame

    def run_inference(self, frame):
        '''
        Performs inference on the input video or image by passing it through all four
        models to get the desired coordinates for moving the mouse pointer.

        Args:
        frame = Input image, frame from video or camera feed

        Return:
        None
        '''

        self.input_feeder.load_data()

        for frame in self.input_feeder.next_batch():

            if self.input_feeder.frame_flag:
                log.info('[ Main ] Started processing a new batch')
                start_inference = time.time()
                self.face_coords, self.face_crop = self.face_model.predict(
                    frame)

                if len(self.face_coords) == 0:
                    log.info(
                        '[ Main ] No face detected. Waiting for you to stare at the camera'
                    )

                else:
                    self.head_pose_angles = self.head_pose_model.predict(
                        self.face_crop)
                    self.left_eye_coords, self.left_eye_image, self.right_eye_coords, self.right_eye_image = self.landmarks_model.predict(
                        self.face_crop)
                    self.x, self.y = self.gaze_model.predict(
                        self.left_eye_image, self.right_eye_image,
                        self.head_pose_angles)
                    log.info(
                        f'[ Main ] Relative pointer coordinates: [{self.x:.2f}, {self.y:.2f}]'
                    )

                    batch_process_time = time.time() - start_inference
                    self.total_processing_time += batch_process_time
                    self.count_batch += 1
                    log.info(
                        f'[ Main ] Finished processing batch. Time taken = {batch_process_time}s\n'
                    )

                    self.mouse_control.move(self.x, self.y)

                    if self.show_output:
                        frame = self.draw_outputs(frame)

                    cv2.imshow('Computer Pointer Controller Output', frame)
                    self.inference_speed.append(self.count_batch /
                                                self.total_processing_time)
                    self.avg_inference_speed = sum(self.inference_speed) / len(
                        self.inference_speed)

                with open(os.path.join(self.output_path, 'outputs.txt'),
                          'w+') as f:
                    f.write('INFERENCE STATS\n')
                    f.write(
                        f'Total model initialization time : {self.model_init_time:.2f}s\n'
                    )
                    f.write(
                        f'Total model load time: {self.model_load_time:.2f}s\n'
                    )
                    f.write(
                        f'App initialization time: {self.app_init_time:.2f}s\n'
                    )
                    f.write(
                        f'Total processing time: {self.total_processing_time:.2f}s\n'
                    )
                    f.write(
                        f'Average inference speed: {self.avg_inference_speed:.2f}FPS\n'
                    )
                    f.write(f'Batch count: {self.count_batch}\n\n')

                    f.write('LAST OUTPUTS\n')
                    f.write(f'Face coordinates: {self.face_coords}\n')
                    f.write(f'Left eye coordinates: {self.left_eye_coords}\n')
                    f.write(
                        f'Right eye coordinates: {self.right_eye_coords}\n')
                    f.write(f'Head pose angles: {self.head_pose_angles}\n')
                    f.write(
                        f'Relative pointer coordinates/ Gaze vector: [{self.x:.2f}, {self.y:.2f}]'
                    )

            else:
                self.input_feeder.close()
                cv2.destroyAllWindows()

                log.info(
                    f'[ Main ] All input Batches processed in {self.total_processing_time:.2f}s'
                )
                log.info('[ Main ] Shutting down app...')
                log.info('[ Main ] Mouse controller app has been shut down.')
                break

        return
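
# A hedged sketch of an incremental running mean, which would replace the
# self.inference_speed list the class above accumulates and re-averages on
# every batch; the update below is mathematically equivalent.
avg = 0.0
for n, fps in enumerate([24.0, 25.5, 23.8], start=1):   # sample per-batch rates
    avg += (fps - avg) / n                              # running-mean update
print(f'Avg. inference speed = {avg:.3f} fps')
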
def main():
    args = build_argparser().parse_args()
    logging.basicConfig(filename='../outputs/logging.log', level=logging.DEBUG)

    # get the model objects
    faceDetObj = Model_FaceDetection(model_name=args.facedetecionmodel,
                                     device=args.device,
                                     extensions=args.cpu_extension,
                                     threshold=args.prob_threshold)
    headPoseObj = Model_HeadPoseEstimation(
        model_name=args.headposeestimationmodel,
        device=args.device,
        extensions=args.cpu_extension)
    facialLandmarkObj = Model_FacialLandmarkDetection(
        model_name=args.faciallandmarksdetectionmodel,
        device=args.device,
        extensions=args.cpu_extension)
    gazeEstimationObj = Model_GazeEstimation(
        model_name=args.gazeestimationnmodel,
        device=args.device,
        extensions=args.cpu_extension)

    # load the models
    faceDetObj.load_model()
    headPoseObj.load_model()
    facialLandmarkObj.load_model()
    gazeEstimationObj.load_model()

    # check if we have video or cam stream
    stream = None
    if args.input.upper() == "CAM":
        stream = "cam"
    else:
        stream = "video"

    # get the InputFeeder and MouseController objects
    feedObj = InputFeeder(input_type=stream, input_file=args.input)
    MouseControllerObj = MouseController(precision='high', speed='fast')

    # start processing the video or cam stream frames
    frame_count = 0
    feedObj.load_data()

    for flag, frame in feedObj.next_batch():
        if not flag:
            break
        key_pressed = cv2.waitKey(60)

        frame_count += 1

        coords, cropped_faces = faceDetObj.predict(frame)

        # check if we detected a face; an int sentinel means no face was found
        if isinstance(cropped_faces[0], int):
            logging.info(
                "FaceDetection did not detect any face - skipping frame {}".format(
                    frame_count))
            continue

        # use the first face, i.e. the one with the highest probability
        head_angles = headPoseObj.predict(cropped_faces[0])
        eyes, eyes_coords = facialLandmarkObj.predict(cropped_faces[0])
        gaze_vec = gazeEstimationObj.predict(eyes[0], eyes[1], head_angles)

        # get the mouse pointer coordinates
        mouse_vec = calculate_mouse_vector(head_angles, gaze_vec)

        # visualise the outputs
        if args.visualise.upper() == "FACE":
            cv2.imshow("detected face", cropped_faces[0])
        elif args.visualise.upper() == "EYES":
            pix = 15
            eyes_image = cropped_faces[0].copy()

            # left eye
            x_l = eyes_coords[0][0]
            y_l = eyes_coords[0][1]
            eyes_image = cv2.rectangle(eyes_image, (x_l - pix, y_l - pix),
                                       (x_l + pix, y_l + pix), (0, 55, 255), 1)

            # right eye
            x_r = eyes_coords[1][0]
            y_r = eyes_coords[1][1]
            eyes_image = cv2.rectangle(eyes_image, (x_r - pix, y_r - pix),
                                       (x_r + pix, y_r + pix), (0, 55, 255), 1)

            cv2.imshow("detected eyes", eyes_image)
        elif args.visualise.upper() == "GAZE":
            fin_image = visualise_vector(eyes_coords, cropped_faces[0],
                                         mouse_vec)

        # move mouse pointer
        MouseControllerObj.move(mouse_vec[0], mouse_vec[1])

    cv2.destroyAllWindows()
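
# calculate_mouse_vector() is defined outside this listing. A common approach in
# gaze-pointer projects (an assumption here, not this author's confirmed code)
# rotates the gaze vector by the head-roll angle so cursor motion stays level
# when the head tilts:
import math

def calculate_mouse_vector(head_angles, gaze_vec):
    # assumes head_angles = (yaw, pitch, roll) in degrees
    roll = math.radians(head_angles[2])
    cos_r, sin_r = math.cos(roll), math.sin(roll)
    x = gaze_vec[0] * cos_r + gaze_vec[1] * sin_r
    y = -gaze_vec[0] * sin_r + gaze_vec[1] * cos_r
    return (x, y)
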
def main(args):
    device = args.device
    
    precision, speed = args.mouse_precision, args.mouse_speed
    mouse = MouseController(precision=precision, speed=speed)    
    
    #get paths to the models
    modelF, modelG, modelH, modelL = args.modelF, args.modelG, args.modelH, args.modelL
    
    face = Model_Face(modelF, device)
    gaze = Model_Gaze(modelG, device) 
    headpose = Model_HeadPose(modelH, device)    
    landmarks = Model_Landmarks(modelL, device)
    
    face.load_model()
    gaze.load_model()
    headpose.load_model()
    landmarks.load_model()
    
    input_type, input_file = args.input_type, args.input_file
    feed = InputFeeder(input_type=input_type, input_file=input_file)
    vframe_shape = feed.load_data()

    logging.info("Please wait. Processing inference...")
    # Run inference on four models and get outputs
    for batch in feed.next_batch():
        frame_copy = batch.copy()
        # face: get the face bounding-box coordinates
        frame4infer_f = face.preprocess_input(batch)
        face_output = face.predict(frame4infer_f)
        f_preprocessed_output = face.preprocess_output(face_output, vframe_shape)
        xmin, ymin, xmax, ymax = f_preprocessed_output
        # head pose: get the yaw, pitch and roll angles
        frame4infer_h = headpose.preprocess_input(batch)
        headpose_output = headpose.predict(frame4infer_h)
        h_preprocessed_output = headpose.preprocess_output(headpose_output)
        # landmarks: crop the face ROI and get the landmark coordinates
        roi = batch[ymin:ymax, xmin:xmax]
        frame4infer_l = landmarks.preprocess_input(roi)
        landmarks_output = landmarks.predict(frame4infer_l)
        l_preprocessed_output = landmarks.preprocess_output(landmarks_output, f_preprocessed_output)
        right_eye, left_eye, nose, right_lip_corner, left_lip_corner = l_preprocessed_output
        # gaze: crop both eyes and estimate the gaze vector
        r_eye_crop = batch[right_eye[1]-20:right_eye[1]+20, right_eye[0]-20:right_eye[0]+20]
        l_eye_crop = batch[left_eye[1]-20:left_eye[1]+20, left_eye[0]-20:left_eye[0]+20]
        re_blob4infer_g = gaze.preprocess_input(r_eye_crop, 're')
        le_blob4infer_g = gaze.preprocess_input(l_eye_crop, 'le')
        hp_blob4infer_g = gaze.preprocess_input(np.array(h_preprocessed_output), 'hp')        
        gaze_output = gaze.predict(re_blob4infer_g, le_blob4infer_g, hp_blob4infer_g)
        g_preprocessed_output = gaze.preprocess_output(gaze_output, l_preprocessed_output, vframe_shape)
        
        # Get mouse pointer position
        x, y = g_preprocessed_output
          
        # Move a mouse pointer
        mouse.move(x, y)

        if input_type == "image":
            cv2.imwrite("output_image.jpg", frame_copy)
            logging.info("        ! Got output image!")
        
        if input_type == 'video':
            feed.write(frame_copy)
            
        
    feed.close()
    logging.info("End of the processing.")
def inference(args):

    time_sheet = {
        'face_infr': [],
        'landmark_infr': [],
        'head_infr': [],
        'gaze_infr': [],
        'infr_per_frame': []
    }

    logging.basicConfig(filename='result.log', level=logging.INFO)
    logging.info(
        "================================================================================="
    )
    logging.info("Precision(face,landmark,head,gaze): FP32-INT1,FP{0},FP{1},FP{2}".format(\
            args.landmark_model.split("FP")[1].split("\\")[0],
            args.head_model.split("FP")[1].split("\\")[0],
            args.gaze_model.split("FP")[1].split("\\")[0]))

    model_load_start = time.time()

    face_detection = FaceDetection(args.face_model)
    face_detection.load_model()
    landmark_regression = LandmarkRegression(args.landmark_model)
    landmark_regression.load_model()
    head_pose = HeadPose(args.head_model)
    head_pose.load_model()
    gaze_estimation = GazeEstimation(args.gaze_model)
    gaze_estimation.load_model()

    logging.info("4 models load time: {0:.4f}sec".format(time.time() -
                                                         model_load_start))

    mouse_controller = MouseController('high', 'fast')

    cv2.namedWindow('preview', cv2.WND_PROP_FULLSCREEN)
    cv2.setWindowProperty('preview', cv2.WND_PROP_FULLSCREEN,
                          cv2.WINDOW_FULLSCREEN)

    input_feeder = InputFeeder(args.input_type, args.input_file)
    input_feeder.load_data()

    total_infr_start = time.time()

    for image in input_feeder.next_batch():
        if image is None:
            break
        face_infr_start = time.time()
        face_image = face_detection.predict(image)
        time_sheet['face_infr'].append(time.time() - face_infr_start)

        landmark_infr_start = time.time()
        left_eye_image, right_eye_image = landmark_regression.predict(
            np.copy(face_image))
        time_sheet['landmark_infr'].append(time.time() - landmark_infr_start)

        head_infr_start = time.time()
        head_pose_angles = head_pose.predict(np.copy(face_image))
        time_sheet['head_infr'].append(time.time() - head_infr_start)

        gaze_infr_start = time.time()
        x, y, z = gaze_estimation.predict(left_eye_image, right_eye_image,
                                          head_pose_angles)
        time_sheet['gaze_infr'].append(time.time() - gaze_infr_start)
        time_sheet['infr_per_frame'].append(time.time() - face_infr_start)
        cv2.imshow('preview', image)
        mouse_controller.move(x, y)
        key = cv2.waitKey(20)
        if key == 27:  # exit on ESC
            break

    logging.info("Face model avg inference per frame: {0:.4f}sec".format(
        np.mean(time_sheet['face_infr'])))
    logging.info("Landmark model avg inference per frame: {0:.4f}sec".format(
        np.mean(time_sheet['landmark_infr'])))
    logging.info("Head model avg inference per frame: {0:.4f}sec".format(
        np.mean(time_sheet['head_infr'])))
    logging.info("Gaze model avg inference per frame: {0:.4f}sec".format(
        np.mean(time_sheet['gaze_infr'])))
    logging.info("4 Model avg inference per frame: {0:.4f}sec".format(
        np.mean(time_sheet['infr_per_frame'])))
    logging.info("Total inference time: {0:.4f}sec".format(time.time() -
                                                           total_infr_start))
    logging.info(
        "====================================END==========================================\n"
    )

    input_feeder.close()
    cv2.destroyAllWindows()
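
# The split("FP")/split("\\") parsing above only works for Windows-style model
# paths. A hedged, separator-agnostic sketch, assuming the usual OpenVINO layout
# .../intel/<model>/<PRECISION>/<model>.xml:
from pathlib import Path

def model_precision(model_path):
    for part in Path(model_path).parts:
        if part.upper().startswith('FP'):   # matches FP16, FP32, FP32-INT8, ...
            return part
    return 'unknown'

# model_precision('models/intel/gaze-estimation-adas-0002/FP16/gaze-estimation-adas-0002.xml') -> 'FP16'
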
Example no. 25
0
def main():
    args = build_argparser().parse_args()
    logging.basicConfig(filename=args.output+'/app.log', filemode='w')

    print("Begin: Try not to move mouse with your hands")
    mc = MouseController("low", "fast")
    if args.input == "cam":
        frames = InputFeeder("cam")
    else:
        frames = InputFeeder("video", args.input)
    cap = frames.load_data()

    if args.display:
        initial_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        initial_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        video_len = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        out_video = cv2.VideoWriter(os.path.join(args.output, 'output_video.mp4'), cv2.VideoWriter_fourcc('m','p','4','v'), fps, (initial_w, initial_h))


    face_model = FaceDetectionModel(args.face_model, args.output, args.device)
    pose_model = HeadPoseEstimationModel(args.pose_model, args.output, args.device)
    landmarks_model = FacialLandmarksDetectionModel(args.landmarks_model, args.output, args.device)
    gaze_model = GazeEstimationModel(args.gaze_model, args.output, args.device)
    avg_out = 0
    avg = 0
    tmlt_face_avg = 0
    tinpt_face_avg = 0
    tint_face_avg = 0
    toutt_face_avg = 0

    tmlt_pose_avg = 0
    tinpt_pose_avg = 0
    tint_pose_avg = 0
    toutt_pose_avg = 0

    tmlt_landmarks_avg = 0
    tinpt_landmarks_avg = 0
    tint_landmarks_avg = 0
    toutt_landmarks_avg = 0

    tmlt_gaze_avg = 0
    tinpt_gaze_avg = 0
    tint_gaze_avg = 0
    toutt_gaze_avg = 0
    logging.info("Frames starting")
    for frame in frames.next_batch():
        if frame is None:
            logging.error("Frame: " + frame + "failed")
            continue
        output_image = frame.copy()
        cropped_faces, tmlt_face, tinpt_face, tint_face, toutt_face = face_model.predict(frame)
        try:
            largest_face = cropped_faces[0]
            for face in cropped_faces:
                if largest_face.size < face.size:
                    largest_face = face
            pose, tmlt_pose, tinpt_pose, tint_pose, toutt_pose = pose_model.predict(largest_face)
            landmarks, tmlt_landmarks, tinpt_landmarks, tint_landmarks, toutt_landmarks = landmarks_model.predict(largest_face)
            gaze_vector, tmlt_gaze, tinpt_gaze, tint_gaze, toutt_gaze = gaze_model.predict(largest_face, landmarks, pose)
        except Exception as e:
            logging.error("Model inference failed: " + str(e))
            # print(e)
            continue
        if args.display:
            output_image, xmin, ymin = face_model.draw_crop_outputs(output_image, args.display)
            output_image = gaze_model.display_eye_boxes(output_image, landmarks, xmin, ymin, args.display)
            out_video.write(output_image)
        cv2.imshow("output_image", output_image)
        cv2.waitKey(15)
        face_model.coords = []
        tmlt_face_avg += tmlt_face
        tinpt_face_avg += tinpt_face
        tint_face_avg += tint_face
        toutt_face_avg += toutt_face

        tmlt_pose_avg += tmlt_pose
        tinpt_pose_avg += tinpt_pose
        tint_pose_avg += tint_pose
        toutt_pose_avg += toutt_pose

        tmlt_landmarks_avg += tmlt_landmarks
        tinpt_landmarks_avg+= tinpt_landmarks
        tint_landmarks_avg += tint_landmarks
        toutt_landmarks_avg += toutt_landmarks

        if gaze_vector is None:
            avg_out += 1
            continue
        tmlt_gaze_avg += tmlt_gaze
        tinpt_gaze_avg += tinpt_gaze
        tint_gaze_avg += tint_gaze
        toutt_gaze_avg += toutt_gaze
        avg += 1
        gaze_vector_norm = gaze_vector / np.linalg.norm(gaze_vector)
        try:
            mc.move(gaze_vector_norm[0], gaze_vector_norm[1])
        except Exception as e:
            logging.error("Gaze failed: " + str(e))
            # print(e)
            continue

    file_name = "stats_"+args.precision+".txt"
    save_path = os.path.join(os.getcwd(), args.output)
    f = open(os.path.join(save_path, file_name), "w")
    f.write("Benchmark Start:"+"\n\n")
    f.write("Face Detection Model stats"+"\n")
    f.write("Total model Load Time:"+str(tmlt_face_avg/avg)+"\n")
    f.write("Total Input Time:"+str(tinpt_face_avg/avg)+"\n")
    f.write("Total Inference Time:"+str(tint_face_avg/avg)+"\n")
    f.write("Total Output Time:"+str(toutt_face_avg/avg)+"\n\n")

    f.write("Head Pose Estimation Model stats"+"\n")
    f.write("Total model Load Time:"+str(tmlt_pose_avg/avg)+"\n")
    f.write("Total Input Time:"+str(tinpt_pose_avg/avg)+"\n")
    f.write("Total Inference Time:"+str(tint_pose_avg/avg)+"\n")
    f.write("Total Output Time:"+str(toutt_pose_avg/avg)+"\n\n")

    f.write("Facial Landmarks Detection Model stats"+"\n")
    f.write("Total model Load Time:"+str(tmlt_landmarks_avg/avg)+"\n")
    f.write("Total Input Time:"+str(tinpt_landmarks_avg/avg)+"\n")
    f.write("Total Inference Time:"+str(tint_landmarks_avg/avg)+"\n")
    f.write("Total Output Time:"+str(toutt_landmarks_avg/avg)+"\n\n")

    f.write("Gaze Estimation Model stats"+"\n")
    f.write("Total model Load Time:"+str(tmlt_gaze_avg/(avg-avg_out))+"\n")
    f.write("Total Input Time:"+str(tinpt_gaze_avg/(avg-avg_out))+"\n")
    f.write("Total Inference Time:"+str(tint_gaze_avg/(avg-avg_out))+"\n")
    f.write("Total Output Time:"+str(toutt_gaze_avg/(avg-avg_out))+"\n\n")
    f.write("Benchmark end"+"\n")
    f.close()

    print("Thank you, Goodbye")
    frames.close()
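
# np.linalg.norm() in the loop above returns 0.0 for an all-zero gaze vector,
# so the normalization would produce NaNs. A hedged guard:
import numpy as np

def safe_normalize(vec, eps=1e-8):
    vec = np.asarray(vec, dtype=float)
    norm = np.linalg.norm(vec)
    return vec / norm if norm > eps else np.zeros_like(vec)
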
Example no. 26
0
def benchmark(args):
    print("runing benchmark")
    input_type = args.t
    input_files = args.l

    face_lt_start = time.time()
    face_detect = face_detection(args.fm, args.d, args.p, args.e)
    face_detect.load_model()
    face_lt = time.time() - face_lt_start

    landmark_lt_start = time.time()
    landmarks_model = LandmarksDetection(args.lm, args.d, args.e)
    landmarks_model.load_model()
    landmark_lt = time.time() - landmark_lt_start

    head_pose_lt_start = time.time()
    head_pose = Head_Pose(args.hm, args.d, args.e)
    head_pose.load_model()
    head_pose_lt = time.time() - head_pose_lt_start

    gaze_lt_start = time.time()
    gaze_estimation = Gaze_Estimation(args.gm, args.d, args.e)
    gaze_estimation.load_model()
    gaze_lt = time.time() - gaze_lt_start


    feed=InputFeeder(input_type='video', input_file=input_files)

    feed.load_data()

    for batch in feed.next_batch():
        
        # time each model's inference on a single frame
        face_inf_start = time.time()
        cropped_face = face_detect.predict(batch)
        face_inf_time = time.time() - face_inf_start

        landmark_inf_start = time.time()
        cropped_left_eye, cropped_right_eye = landmarks_model.predict(cropped_face)
        landmark_inf_time = time.time() - landmark_inf_start

        head_pose_inf_start = time.time()
        head_angles = head_pose.predict(cropped_face)
        head_pose_inf_time = time.time() - head_pose_inf_start

        gaze_inf_start = time.time()
        x, y = gaze_estimation.predict(cropped_left_eye, cropped_right_eye, head_angles)
        gaze_inf_time = time.time() - gaze_inf_start

        # plot model load times
        models = ['Face_detect', 'landmark_detect', 'Head_pose_est', 'Gaze est']
        loading_times = [face_lt, landmark_lt, head_pose_lt, gaze_lt]
        plot_loading_time(models, loading_times, args.b)

        # plot inference times
        inference_times = [face_inf_time, landmark_inf_time, head_pose_inf_time, gaze_inf_time]
        plot_inf_time(models, inference_times, args.b)

        logging.info("Benchmarking done!")

        # a single frame is enough for this benchmark
        break
    feed.close()
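
# plot_loading_time() and plot_inf_time() are defined outside this listing; a
# hedged sketch of what such a helper might look like with matplotlib (the
# function name and out_file argument are illustrative):
import matplotlib.pyplot as plt

def plot_times(models, times, out_file, title='Model timings'):
    plt.figure()
    plt.bar(models, times)
    plt.ylabel('seconds')
    plt.title(title)
    plt.savefig(out_file)   # e.g. 'load_times.png'
    plt.close()
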
Example no. 27
0
def main():
    args = build_argparser().parse_args()

    frame_num = 0
    inference_time = 0
    counter = 0

    # Initialize the Inference Engine
    fd = FaceDetection()
    fld = Facial_Landmarks_Detection()
    ge = Gaze_Estimation()
    hp = Head_Pose_Estimation()

    # Load Models
    fd.load_model(args.face_detection_model, args.device, args.cpu_extension)
    fld.load_model(args.facial_landmark_model, args.device, args.cpu_extension)
    ge.load_model(args.gaze_estimation_model, args.device, args.cpu_extension)
    hp.load_model(args.head_pose_model, args.device, args.cpu_extension)

    # Mouse Controller precision and speed
    mc = MouseController('medium', 'fast')

    # feed input from an image, webcam, or video to model
    if args.input == "cam":
        feed = InputFeeder("cam")
    else:
        assert os.path.isfile(args.input), "Specified input file doesn't exist"
        feed = InputFeeder("video", args.input)
    feed.load_data()
    frame_count = 0
    for frame in feed.next_batch():
        frame_count += 1
        inf_start = time.time()
        if frame is not None:
            try:
                key = cv2.waitKey(60)


                # make predictions
                detected_face, face_coords = fd.predict(
                    frame.copy(), args.prob_threshold)
                hp_output = hp.predict(detected_face.copy())
                left_eye, right_eye, eye_coords = fld.predict(
                    detected_face.copy())
                new_mouse_coord, gaze_vector = ge.predict(
                    left_eye, right_eye, hp_output)

                stop_inference = time.time()
                det_time = stop_inference - inf_start
                inference_time = inference_time + det_time
                counter = counter + 1

                # Visualization
                preview = args.visualization
                if preview:
                    preview_frame = frame.copy()
                    face_frame = detected_face.copy()

                    draw_face_bbox(preview_frame, face_coords)
                    display_hp(preview_frame, hp_output, face_coords)
                    draw_landmarks(face_frame, eye_coords)
                    draw_gaze(face_frame, gaze_vector, left_eye.copy(),
                              right_eye.copy(), eye_coords)

                if preview:
                    img = np.hstack((cv2.resize(preview_frame, (500, 500)),
                                     cv2.resize(face_frame, (500, 500))))
                else:
                    img = cv2.resize(frame, (500, 500))

                cv2.imshow('Visualization', img)

                # set speed
                if frame_count % 5 == 0:
                    mc.move(new_mouse_coord[0], new_mouse_coord[1])

                # INFO
                log.info("NUMBER OF FRAMES: {} ".format(frame_num))
                log.info("INFERENCE TIME: {}ms".format(det_time * 1000))

                frame_num += 1

                if key == 27:
                    break
            except Exception as e:
                print('Unsupported image or video file format, or inference error: ' + str(e))
                exit()
    feed.close()
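
# A hedged sketch of a small timing helper that would tidy the repeated
# time.time() bookkeeping above; time.perf_counter() is the better clock for
# measuring short intervals.
import time
from contextlib import contextmanager

@contextmanager
def timed(label, totals):
    start = time.perf_counter()
    yield
    totals[label] = totals.get(label, 0.0) + (time.perf_counter() - start)

# usage sketch:
#   totals = {}
#   with timed('face', totals):
#       detected_face, face_coords = fd.predict(frame.copy(), prob_threshold)
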
Example no. 28
0
def main(args):
    input_type = args.t
    input_files = args.l
    flags = args.f
    
    face_detect=Face_Detection(face_model_path, args.d, args.p, args.e)

    face_detect.load_model()

    landmarks_model=LandmarksDetection(landmarks_model_path, args.d, args.e)

    landmarks_model.load_model()

    head_pose=Head_Pose(hpose_model_path, args.d, args.e)
    head_pose.load_model()

    gaze_estimation=Gaze_Estimation(gaze_model_path, args.d, args.e)
    gaze_estimation.load_model()

    if input_type == 'cam':
        feed = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_files):
            logging.error("Could not find the input file")
            exit(1)
        feed = InputFeeder(input_type='video', input_file=input_files)

    
    try:
        feed.load_data()
    except Exception:
        logging.error("Could not load data from input file", exc_info=True)
        exit(1)

    # create the mouse controller once, outside the frame loop
    mouse = MouseController(precision='low', speed='fast')
    key = -1  # last pressed key; initialized before first use in the loop

    for batch in feed.next_batch():
        
        try:
            cropped_face, coords = face_detect.predict(batch)

            if isinstance(cropped_face, int):
                logging.info("Face not detected")
                if key == 27:
                    break
                continue

            cropped_left_eye, cropped_right_eye, left_eye_cord, right_eye_cord = landmarks_model.predict(cropped_face)
            head_angles = head_pose.predict(cropped_face)
            x, y = gaze_estimation.predict(cropped_left_eye, cropped_right_eye, head_angles)

        except Exception:
            logging.error("An error occurred while running predictions", exc_info=True)
            continue
        
        if flags != 0:
            if flags == 'FD':
                cv2.rectangle(batch, (coords[0], coords[1]), (coords[2], coords[3]), (255, 0, 0), 3)
            if flags =='FL':
                cv2.rectangle(cropped_face, (left_eye_cord[0], left_eye_cord[1]), (left_eye_cord[2], left_eye_cord[3]), (255, 0, 0), 3)
                cv2.rectangle(cropped_face, (right_eye_cord[0], right_eye_cord[1]), (right_eye_cord[2], right_eye_cord[3]), (255, 0, 0), 3)
            if flags =='HP':
                cv2.putText(batch,
                "Head angles: yaw={:.2f} , pitch={:.2f}, roll={:.2f}".format(
                    head_angles[0], head_angles[1], head_angles[2]),
                            (20, 40),
                            cv2.FONT_HERSHEY_COMPLEX,
                            1, (255, 0, 255), 2)
            if flags == 'GE':
                
                left_eye_mid_x= (left_eye_cord[2]-left_eye_cord[0])/2 + left_eye_cord[0]
                left_eye_mid_y=(left_eye_cord[3]-left_eye_cord[1])/2 + left_eye_cord[1]
                
                right_eye_mid_x=(right_eye_cord[2]-right_eye_cord[0])/2 + right_eye_cord[0]
                right_eye_mid_y=(right_eye_cord[3]- right_eye_cord[1])/2 + right_eye_cord[1]
                
                left_eye_new_x=int(left_eye_mid_x + x*160)
                left_eye_new_y=int(left_eye_mid_y + y*160*-1)
                right_eye_new_x=int(right_eye_mid_x + x*160)
                right_eye_new_y=int(right_eye_mid_y + y*160*-1)
                cv2.line(cropped_face, (int(left_eye_mid_x), int(left_eye_mid_y)), (int(left_eye_new_x), int(left_eye_new_y)), (255, 0, 255), 5)
                cv2.line(cropped_face, (int(right_eye_mid_x), int(right_eye_mid_y)), (int(right_eye_new_x), int(right_eye_new_y)), (255, 0, 255), 5)

        mouse.move(x, y)

        batch = imutils.resize(batch, width=500)
        cv2.imshow('frame', batch)
        key = cv2.waitKey(1) & 0xFF
    feed.close()
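
# A hedged sketch condensing the midpoint-plus-gaze-line math above into a
# helper; scale=160 and the negated y follow the example's own constants
# (image y grows downward while gaze y grows upward).
import cv2

def draw_gaze_line(image, eye_box, gx, gy, scale=160):
    cx = (eye_box[0] + eye_box[2]) // 2
    cy = (eye_box[1] + eye_box[3]) // 2
    end = (int(cx + gx * scale), int(cy - gy * scale))
    cv2.line(image, (cx, cy), end, (255, 0, 255), 5)
    return image
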
Example no. 29
0
def main():

    args = build_argparser().parse_args()
    logger = logging.getLogger('main')

    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'FacialLandmarksModel': args.facialLandmarksModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }

    bbox_flag = args.bbox_flag
    input_filename = args.input
    device_name = args.device
    prob_threshold = args.prob_threshold
    output_path = args.output_path

    if input_filename.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)

    for model_path in list(model_path_dict.values()):
        if not os.path.isfile(model_path):
            logger.error("Unable to find specified model file" +
                         str(model_path))
            exit(1)

    face_detection_model = Face_detection(
        model_path_dict['FaceDetectionModel'],
        device_name,
        threshold=prob_threshold)
    facial_landmarks_detection_model = Landmark_Detection(
        model_path_dict['FacialLandmarksModel'],
        device_name,
        threshold=prob_threshold)
    head_pose_estimation_model = Head_pose(
        model_path_dict['HeadPoseEstimationModel'],
        device_name,
        threshold=prob_threshold)
    gaze_estimation_model = Gaze_estimation(
        model_path_dict['GazeEstimationModel'],
        device_name,
        threshold=prob_threshold)

    is_benchmarking = False

    if not is_benchmarking:
        mouse_controller = MouseController('medium', 'fast')

    start_model_load_time = time.time()
    face_detection_model.load_model()
    facial_landmarks_detection_model.load_model()
    head_pose_estimation_model.load_model()
    gaze_estimation_model.load_model()
    total_model_load_time = time.time() - start_model_load_time

    feeder.load_data()

    out_video = cv2.VideoWriter('output_video.mp4',
                                cv2.VideoWriter_fourcc(*'avc1'),
                                int(feeder.get_fps() / 10), (1920, 1080), True)

    frame_count = 0
    start_inference_time = time.time()
    for ret, frame in feeder.next_batch():

        if not ret:
            break

        frame_count += 1

        key = cv2.waitKey(60)

        try:
            face_coords, image_copy = face_detection_model.predict(frame)

            if isinstance(image_copy, int):
                logger.warning("Unable to detect the face")
                if key == 27:
                    break
                continue

            left_eye, right_eye, eye_coords = facial_landmarks_detection_model.predict(
                image_copy)
            hp_output = head_pose_estimation_model.predict(image_copy)
            mouse_coords, gaze_coords = gaze_estimation_model.predict(
                left_eye, right_eye, hp_output)

        except Exception as e:
            logger.warning("Could predict using model" + str(e) +
                           " for frame " + str(frame_count))
            continue

        image = cv2.resize(frame, (500, 500))

        if len(bbox_flag) != 0:
            bbox_frame = draw_bbox(frame, bbox_flag, image_copy, left_eye,
                                   right_eye, face_coords, eye_coords,
                                   hp_output, gaze_coords)
            image = np.hstack(
                (cv2.resize(frame,
                            (500, 500)), cv2.resize(bbox_frame, (500, 500))))

        cv2.imshow('preview', image)
        out_video.write(frame)

        if frame_count % 5 == 0 and not is_benchmarking:
            mouse_controller.move(mouse_coords[0], mouse_coords[1])

        if key == 27:
            break

    total_time = time.time() - start_inference_time
    total_inference_time = round(total_time, 1)
    fps = frame_count / total_inference_time

    os.makedirs(output_path, exist_ok=True)

    with open(os.path.join(output_path, 'stats.txt'), 'w') as f:
        f.write(str(total_inference_time) + '\n')
        f.write(str(fps) + '\n')
        f.write(str(total_model_load_time) + '\n')

    logger.info('Model load time: ' + str(total_model_load_time))
    logger.info('Inference time: ' + str(total_inference_time))
    logger.info('FPS: ' + str(fps))

    logger.info('Video stream ended')
    cv2.destroyAllWindows()
    feeder.close()
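
# The hard-coded (1920, 1080) VideoWriter above silently produces an empty file
# whenever frames arrive in another size, because write() drops frames whose
# shape does not match. A hedged, defensive setup sketch:
import cv2

def make_writer(path, fps, frame_shape):
    h, w = frame_shape[:2]
    fps = fps if fps and fps > 0 else 30   # webcams sometimes report 0 fps
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    return cv2.VideoWriter(path, fourcc, fps, (w, h))
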
Example no. 30
0
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.
    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    try:
        logging.basicConfig(level=logging.INFO,
                            format="%(asctime)s [%(levelname)s] %(message)s",
                            handlers=[
                                logging.FileHandler("gaze-app.log"),
                                logging.StreamHandler()
                            ])

        # Initialise the class
        mc = MouseController("low", "fast")
        fdnet = FaceDetection(args.fdmodel)
        lmnet = FacialLandmarks(args.lmmodel)
        hpnet = HeadPoseEstimation(args.hpmodel)
        genet = GazeEstimation(args.gemodel)

        ### Load the model through ###
        logging.info("============== Models Load time ===============")
        start_time = time.time()
        fdnet.load_model()
        logging.info("Face Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        fdnet.check_model()
        logging.info("Face Detection estimation layers loaded correctly")

        start_time = time.time()
        lmnet.load_model()
        logging.info("Facial Landmarks Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        lmnet.check_model()
        logging.info("Facial Landmarks estimation layers loaded correctly")

        start_time = time.time()
        hpnet.load_model()
        logging.info("Headpose Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        hpnet.check_model()
        logging.info("Head pose estimation layers loaded correctly")

        start_time = time.time()
        genet.load_model()
        logging.info("Gaze Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        genet.check_model()
        logging.info("Gaze estimation layers loaded correctly")
        logging.info("==============  End =====================")
        # Get and open video capture
        feeder = InputFeeder('video', args.input)
        feeder.load_data()
        # FPS = feeder.get_fps()

        # Grab the shape of the input
        # width = feeder.get_width()
        # height = feeder.get_height()

        # init scene variables
        frame_count = 0

        ### Loop until stream is over ###
        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0
        while True:
            # Read the next frame
            try:
                frame = next(feeder.next_batch())
            except StopIteration:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1

            # face detection
            fd_process_time = time.time()
            p_frame = fdnet.preprocess_input(frame)
            start_time = time.time()
            fnoutput = fdnet.predict(p_frame)
            fd_infertime += time.time() - start_time
            out_frame, fboxes = fdnet.preprocess_output(
                fnoutput, frame, args.print)
            logging.info(
                "Face Detection Model processing time : {:.1f}ms".format(
                    1000 * (time.time() - fd_process_time)))

            #for each face
            for fbox in fboxes:

                # fbox = (xmin,ymin,xmax,ymax)
                # get face landmarks
                # crop face from frame
                face = frame[fbox[1]:fbox[3], fbox[0]:fbox[2]]
                lm_process_time = time.time()
                p_frame = lmnet.preprocess_input(face)
                start_time = time.time()
                lmoutput = lmnet.predict(p_frame)
                lm_infertime += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output(
                    lmoutput, fbox, out_frame, args.print)
                logging.info(
                    "Landmarks model processing time : {:.1f}ms".format(
                        1000 * (time.time() - lm_process_time)))

                # get head pose estimation
                hp_process_time = time.time()
                p_frame = hpnet.preprocess_input(face)
                start_time = time.time()
                hpoutput = hpnet.predict(p_frame)
                hp_infertime += time.time() - start_time
                out_frame, headpose_angles = hpnet.preprocess_output(
                    hpoutput, out_frame, face, fbox, args.print)
                logging.info(
                    "Headpose estimation model processing time : {:.1f}ms".
                    format(1000 * (time.time() - hp_process_time)))

                # get gaze  estimation
                gaze_process_time = time.time()
                out_frame, left_eye, right_eye = genet.preprocess_input(
                    out_frame, face, left_eye_point, right_eye_point,
                    args.print)
                start_time = time.time()
                geoutput = genet.predict(left_eye, right_eye, headpose_angles)
                ge_infertime += time.time() - start_time
                out_frame, gazevector = genet.preprocess_output(
                    geoutput, out_frame, fbox, left_eye_point, right_eye_point,
                    args.print)
                logging.info(
                    "Gaze estimation model processing time : {:.1f}ms".format(
                        1000 * (time.time() - gaze_process_time)))

                if (not args.no_video):
                    cv2.imshow('im', out_frame)

                if (not args.no_move):
                    mc.move(gazevector[0], gazevector[1])

                #consider only first detected face in the frame
                break

            # Break if escape key pressed
            if key_pressed == 27:
                break

        #logging inference times
        if (frame_count > 0):
            logging.info(
                "============== Models Inference time ===============")
            logging.info("Face Detection:{:.1f}ms".format(1000 * fd_infertime /
                                                          frame_count))
            logging.info("Facial Landmarks Detection:{:.1f}ms".format(
                1000 * lm_infertime / frame_count))
            logging.info("Headpose Estimation:{:.1f}ms".format(
                1000 * hp_infertime / frame_count))
            logging.info("Gaze Estimation:{:.1f}ms".format(
                1000 * ge_infertime / frame_count))
            logging.info("============== End ===============================")

        # Release the capture and destroy any OpenCV windows
        feeder.close()
        cv2.destroyAllWindows()
    except Exception as ex:
        logging.exception("Error in inference: %s", ex)