Example #1
# Imports assumed by this snippet; the model wrapper classes, InputFeeder
# and MouseController come from the surrounding project and are not shown.
import os
import time
import logging as log

import cv2
import numpy as np


def main():
    args = build_argparser().parse_args()
    device_name = args.device
    prob_threshold = args.prob_threshold
    log.basicConfig(level=log.INFO)
    logger_object = log.getLogger()

    # Initialize variables with the input arguments
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'FacialLandmarkModel': args.facialLandmarksModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }

    # Validate the model files before attempting to load them
    for model_path in model_path_dict.values():
        if not os.path.isfile(model_path):
            logger_object.error("Unable to find specified model file: " + str(model_path))
            exit(1)

    # Instantiate the models
    face_model = FaceDetection(model_path_dict['FaceDetectionModel'], device_name, threshold=prob_threshold)
    landmark_model = FacialLandmarksDetection(model_path_dict['FacialLandmarkModel'], device_name,
                                              threshold=prob_threshold)
    head_pose_model = HeadPoseEstimation(model_path_dict['HeadPoseEstimationModel'], device_name,
                                         threshold=prob_threshold)
    gaze_model = GazeEstimation(model_path_dict['GazeEstimationModel'], device_name, threshold=prob_threshold)
    mouse_controller = MouseController('medium', 'fast')

    # Load the models and measure the load time of each
    start_time = time.time()
    face_model.load_model()
    logger_object.info("Face detection model loaded in {:.3f} ms".format((time.time() - start_time) * 1000))

    first_mark = time.time()
    landmark_model.load_model()
    logger_object.info(
        "Facial landmarks detection model loaded in {:.3f} ms".format((time.time() - first_mark) * 1000))

    second_mark = time.time()
    head_pose_model.load_model()
    logger_object.info("Head pose estimation model loaded in {:.3f} ms".format((time.time() - second_mark) * 1000))

    third_mark = time.time()
    gaze_model.load_model()
    logger_object.info("Gaze estimation model loaded in {:.3f} ms".format((time.time() - third_mark) * 1000))
    load_total_time = time.time() - start_time
    logger_object.info("Total model loading time: {:.3f} ms".format(load_total_time * 1000))
    logger_object.info("All models loaded successfully.")

    # Check the models for unsupported layers
    face_model.check_model()
    landmark_model.check_model()
    head_pose_model.check_model()
    gaze_model.check_model()

    preview_flags = args.previewFlags
    input_filename = args.input
    output_path = args.output_path

    if input_filename.lower() == 'cam':
        input_feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger_object.error("Unable to find specified video file")
            exit(1)
        input_feeder = InputFeeder(input_type='video', input_file=input_filename)

    input_feeder.load_data()
    width = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(input_feeder.cap.get(cv2.CAP_PROP_FPS))
    out_video = cv2.VideoWriter(os.path.join(output_path, 'output_video.mp4'), cv2.VideoWriter_fourcc(*'avc1'), fps,
                                (width, height), True)

    frame_counter = 0
    start_inf_time = time.time()
    for ret, frame in input_feeder.next_batch():
        if not ret:
            break
        frame_counter += 1
        key = cv2.waitKey(60)

        try:
            cropped_image, face_cords = face_model.predict(frame, prob_threshold)

            if isinstance(cropped_image, int):
                logger_object.warning("Unable to detect a face in frame " + str(frame_counter))
                if key == 27:
                    break
                continue

            left_eye, right_eye, eye_cords = landmark_model.predict(cropped_image)
            pose_output = head_pose_model.predict(cropped_image)
            x, y, z = gaze_model.predict(left_eye, right_eye, pose_output, cropped_image, eye_cords)
        except Exception as e:
            logger_object.error(str(e) + " for frame " + str(frame_counter))
            continue

        image = cv2.resize(frame, (width, height))
        if preview_flags:
            preview_frame = frame.copy()

            if 'fd' in preview_flags and len(preview_flags) != 1:
                preview_frame = cropped_image
                cv2.rectangle(frame, (face_cords[0], face_cords[1]), (face_cords[2], face_cords[3]), (0, 0, 255), 3)

            if 'hp' in preview_flags:
                cv2.putText(
                    frame,
                    "Pose Angles: yaw= {:.2f} , pitch= {:.2f} , roll= {:.2f}".format(
                        pose_output[0], pose_output[1], pose_output[2]),
                    (20, 40),
                    cv2.FONT_HERSHEY_DUPLEX,
                    1, (255, 0, 0), 3)

            if 'ge' in preview_flags:
                cv2.putText(
                    frame,
                    "Gaze vector: x= {:.2f} , y= {:.2f} , z= {:.2f}".format(
                        x, y, z),
                    (15, 100),
                    cv2.FONT_HERSHEY_COMPLEX,
                    1, (0, 255, 0), 3)

            image = np.hstack((cv2.resize(frame, (500, 500)), cv2.resize(preview_frame, (500, 500))))

        cv2.imshow('preview', image)
        out_video.write(image)

        if frame_counter % 5 == 0:
            mouse_controller.move(x, y)

        if key == 27:
            break

    inference_time = round(time.time() - start_inf_time, 1)
    fps = frame_counter / inference_time
    logger_object.info("Frames processed: {}".format(frame_counter))
    logger_object.info("Total inference time: {} seconds".format(inference_time))
    logger_object.info("fps: {} frames/second".format(fps))
    with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'stats.txt'), 'w') as f:
        f.write('inference time : ' + str(inference_time) + '\n')
        f.write('fps: ' + str(fps) + '\n')
        f.write('Models Loading: ' + str(load_total_time) + '\n')
    logger_object.info('Video stream ended')
    cv2.destroyAllWindows()
    input_feeder.close()
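
The example above assumes a build_argparser() helper whose parsed attributes match exactly what main() reads: faceDetectionModel, facialLandmarksModel, headPoseEstimationModel, gazeEstimationModel, device, prob_threshold, previewFlags, input and output_path. A minimal sketch of such a parser follows; the short flag names and defaults are illustrative assumptions, only the destination attributes are dictated by the code.

from argparse import ArgumentParser


def build_argparser():
    # Sketch of the parser assumed by Example #1. The flag names and
    # defaults are illustrative; only the dest attributes are fixed.
    parser = ArgumentParser()
    parser.add_argument('-fd', '--faceDetectionModel', required=True,
                        help='Path to the face detection model XML')
    parser.add_argument('-fl', '--facialLandmarksModel', required=True,
                        help='Path to the facial landmarks model XML')
    parser.add_argument('-hp', '--headPoseEstimationModel', required=True,
                        help='Path to the head pose estimation model XML')
    parser.add_argument('-ge', '--gazeEstimationModel', required=True,
                        help='Path to the gaze estimation model XML')
    parser.add_argument('-i', '--input', required=True,
                        help="Path to a video file, or 'cam' for the webcam")
    parser.add_argument('-d', '--device', default='CPU',
                        help='Target device, e.g. CPU, GPU, FPGA or MYRIAD')
    parser.add_argument('-pt', '--prob_threshold', type=float, default=0.6,
                        help='Probability threshold for face detection')
    parser.add_argument('-flags', '--previewFlags', nargs='*', default=[],
                        help='Preview flags, any of: fd hp ge')
    parser.add_argument('-o', '--output_path', default='results/',
                        help='Directory for the output video and stats file')
    return parser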
Example #2
# Imports assumed by this snippet; the model wrapper classes, InputFeeder,
# MouseController and visualize come from the surrounding project.
import os
import time
import logging

import cv2


def main(args):
    # enable logging for the function
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # grab the parsed parameters
    faceModel = args.m_f
    facial_LandmarksModel = args.m_l
    headPoseEstimationModel = args.m_h
    GazeEstimationModel = args.m_g
    device = args.d
    inputFile = args.i
    output_path = args.o_p
    modelArchitecture = args.modelAr
    visualization_flag = args.vf

    # initialize feed
    single_image_format = ['jpg', 'tif', 'png', 'jpeg', 'bmp']
    if inputFile.split(".")[-1].lower() in single_image_format:
        feed = InputFeeder('image', inputFile)
    elif args.i == 'cam':
        feed = InputFeeder('cam')
    else:
        feed = InputFeeder('video', inputFile)

    ## Load model time for face detection
    faceStart_model_load_time = time.time()
    faceDetection = FaceDetection(faceModel, device)
    faceModelView = faceDetection.load_model()
    faceDetection.check_model()
    total_facemodel_load_time = time.time() - faceStart_model_load_time

    ## Load model time for head pose estimation
    headposeStart_model_load_time = time.time()
    headPose = headPoseEstimation(headPoseEstimationModel, device)
    headPoseModelView = headPose.load_model()
    headPose.check_model()
    headposeTotal_model_load_time = time.time() - headposeStart_model_load_time

    ## Load model time for facial landmarks estimation
    face_landmarksStart_model_load_time = time.time()
    face_landmarks = Face_landmarks(facial_LandmarksModel, device)
    faceLandmarksModelView = face_landmarks.load_model()
    face_landmarks.check_model()
    face_landmarksTotal_model_load_time = time.time() - face_landmarksStart_model_load_time

    ## Load model time for gaze estimation
    GazeEstimationStart_model_load_time = time.time()
    GazeEstimation = Gaze_Estimation(GazeEstimationModel, device)
    GazeModelView = GazeEstimation.load_model()
    GazeEstimation.check_model()
    GazeEstimationTotal_model_load_time = time.time() - GazeEstimationStart_model_load_time

    if modelArchitecture == 'yes':
        print("The model architecture of the gaze model is ", GazeModelView)
        print("The model architecture of the landmarks model is ", faceLandmarksModelView)
        print("The model architecture of the head pose model is ", headPoseModelView)
        print("The model architecture of the face model is ", faceModelView)

    # count the number of frames
    frameCount = 0
    mouseVector = MouseController('medium', 'fast')
    w, h = feed.load_data()
    for ret, frame in feed.next_batch():
        if not ret:
            break
        frameCount += 1
        key = cv2.waitKey(60)
        start_imageface_inference_time = time.time()
        imageface = faceDetection.predict(frame, w, h)
        imageface_inference_time = time.time() - start_imageface_inference_time

        if 'm_f' in visualization_flag:
            cv2.imshow('cropped face', imageface)

        if isinstance(imageface, int):
            logger.info("no face detected")
            if key == 27:
                break
            continue

        start_imagePose_inference_time = time.time()
        imageAngles, imagePose = headPose.predict(imageface)
        imagePose_inference_time = time.time() - start_imagePose_inference_time

        if 'm_h' in visualization_flag:
            cv2.imshow('Head Pose Angles', imagePose)

        start_landmarkImage_inference_time = time.time()
        leftEye, rightEye, landmarkImage = face_landmarks.predict(imageface)
        landmarkImage_inference_time = time.time() - start_landmarkImage_inference_time

        if leftEye is None or rightEye is None:
            logger.info(
                "image probably too dark or eyes covered, hence could not detect landmarks"
            )
            continue

        if 'm_l' in visualization_flag:
            cv2.imshow('Face output', landmarkImage)

        start_GazeEstimation_inference_time = time.time()
        x, y = GazeEstimation.predict(leftEye, rightEye, imageAngles)
        GazeEstimation_inference_time = time.time() - start_GazeEstimation_inference_time

        if 'm_g' in visualization_flag:
            cv2.imshow('Gaze Estimation', landmarkImage)

        if frameCount % 5 == 0:
            mouseVector.move(x, y)

        if key == 27:
            break

        if (imageface_inference_time and landmarkImage_inference_time
                and imagePose_inference_time and GazeEstimation_inference_time):

            fps_face = 1 / imageface_inference_time
            fps_landmark = 1 / landmarkImage_inference_time
            fps_headpose = 1 / imagePose_inference_time
            fps_gaze = 1 / GazeEstimation_inference_time

            with open(
                    os.path.join(output_path, device, 'face',
                                 'face_stats.txt'), 'w') as f:
                f.write(str(imageface_inference_time) + '\n')
                f.write(str(fps_face) + '\n')
                f.write(str(total_facemodel_load_time) + '\n')

            with open(
                    os.path.join(output_path, device, 'landmark',
                                 'landmark_stats.txt'), 'w') as f:
                f.write(str(landmarkImage_inference_time) + '\n')
                f.write(str(fps_landmark) + '\n')
                f.write(str(face_landmarksTotal_model_load_time) + '\n')

            with open(
                    os.path.join(output_path, device, 'headpose',
                                 'headpose_stats.txt'), 'w') as f:
                f.write(str(imagePose_inference_time) + '\n')
                f.write(str(fps_headpose) + '\n')
                f.write(str(headposeTotal_model_load_time) + '\n')

            with open(
                    os.path.join(output_path, device, 'gaze',
                                 'gaze_stats.txt'), 'w') as f:
                f.write(str(GazeEstimation_inference_time) + '\n')
                f.write(str(fps_gaze) + '\n')
                f.write(str(GazeEstimationTotal_model_load_time) + '\n')

    logger.info("The End")
    VIS = visualize(output_path, device)
    VIS.visualize1()
    VIS.visualize2()
    VIS.visualize3()
    cv2.destroyAllWindows()
    feed.close()
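
All the examples depend on an InputFeeder abstraction with load_data(), next_batch() and close(), but they disagree on its details: Example #1 iterates over (ret, frame) tuples and reads input_feeder.cap directly, Example #2 expects load_data() to return the frame size, and Example #3 pulls single frames with next(). A minimal sketch matching Example #1's usage, assuming a plain OpenCV capture underneath:

import cv2


class InputFeeder:
    # Sketch matching Example #1's usage of the class; the projects the
    # other examples come from use slightly different variants.
    def __init__(self, input_type, input_file=None):
        self.input_type = input_type
        self.input_file = input_file

    def load_data(self):
        # Index 0 selects the default webcam, otherwise open the file
        source = 0 if self.input_type == 'cam' else self.input_file
        self.cap = cv2.VideoCapture(source)

    def next_batch(self):
        # Yield (ret, frame) tuples; the caller breaks once ret is False
        while self.cap.isOpened():
            ret, frame = self.cap.read()
            yield ret, frame
            if not ret:
                break

    def close(self):
        self.cap.release()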
Example #3
# Imports assumed by this snippet; the model wrapper classes, InputFeeder and
# MouseController come from the surrounding project and are not shown.
import logging
import time

import cv2


def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.
    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    try:
        logging.basicConfig(level=logging.INFO,
                            format="%(asctime)s [%(levelname)s] %(message)s",
                            handlers=[
                                logging.FileHandler("gaze-app.log"),
                                logging.StreamHandler()
                            ])

        # Initialise the mouse controller and the model wrappers
        mc = MouseController("low", "fast")
        fdnet = FaceDetection(args.fdmodel)
        lmnet = FacialLandmarks(args.lmmodel)
        hpnet = HeadPoseEstimation(args.hpmodel)
        genet = GazeEstimation(args.gemodel)

        ### Load the models ###
        logging.info("============== Models Load time ===============")
        start_time = time.time()
        fdnet.load_model()
        logging.info("Face Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        fdnet.check_model()
        logging.info("Face Detection estimation layers loaded correctly")

        start_time = time.time()
        lmnet.load_model()
        logging.info("Facial Landmarks Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        lmnet.check_model()
        logging.info("Facial Landmarks estimation layers loaded correctly")

        start_time = time.time()
        hpnet.load_model()
        logging.info("Headpose Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        hpnet.check_model()
        logging.info("Head pose estimation layers loaded correctly")

        start_time = time.time()
        genet.load_model()
        logging.info("Gaze Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        genet.check_model()
        logging.info("Gaze estimation layers loaded correctly")
        logging.info("==============  End =====================")
        # Get and open the video capture
        feeder = InputFeeder('video', args.input)
        feeder.load_data()

        # init scene variables
        frame_count = 0

        ### Loop until stream is over ###
        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0
        while True:
            # Read the next frame
            try:
                frame = next(feeder.next_batch())
            except StopIteration:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1

            # face detection
            fd_process_time = time.time()
            p_frame = fdnet.preprocess_input(frame)
            start_time = time.time()
            fnoutput = fdnet.predict(p_frame)
            fd_infertime += time.time() - start_time
            out_frame, fboxes = fdnet.preprocess_output(
                fnoutput, frame, args.print)
            logging.info(
                "Face Detection Model processing time : {:.1f}ms".format(
                    1000 * (time.time() - fd_process_time)))

            #for each face
            for fbox in fboxes:

                # fbox = (xmin,ymin,xmax,ymax)
                # get face landmarks
                # crop face from frame
                face = frame[fbox[1]:fbox[3], fbox[0]:fbox[2]]
                lm_process_time = time.time()
                p_frame = lmnet.preprocess_input(face)
                start_time = time.time()
                lmoutput = lmnet.predict(p_frame)
                lm_infertime += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output(
                    lmoutput, fbox, out_frame, args.print)
                logging.info(
                    "Landmarks model processing time : {:.1f}ms".format(
                        1000 * (time.time() - lm_process_time)))

                # get head pose estimation
                hp_process_time = time.time()
                p_frame = hpnet.preprocess_input(face)
                start_time = time.time()
                hpoutput = hpnet.predict(p_frame)
                hp_infertime += time.time() - start_time
                out_frame, headpose_angles = hpnet.preprocess_output(
                    hpoutput, out_frame, face, fbox, args.print)
                logging.info(
                    "Headpose estimation model processing time : {:.1f}ms".
                    format(1000 * (time.time() - hp_process_time)))

                # get gaze estimation
                gaze_process_time = time.time()
                out_frame, left_eye, right_eye = genet.preprocess_input(
                    out_frame, face, left_eye_point, right_eye_point,
                    args.print)
                start_time = time.time()
                geoutput = genet.predict(left_eye, right_eye, headpose_angles)
                ge_infertime += time.time() - start_time
                out_frame, gazevector = genet.preprocess_output(
                    geoutput, out_frame, fbox, left_eye_point, right_eye_point,
                    args.print)
                logging.info(
                    "Gaze estimation model processing time : {:.1f}ms".format(
                        1000 * (time.time() - gaze_process_time)))

                if (not args.no_video):
                    cv2.imshow('im', out_frame)

                if (not args.no_move):
                    mc.move(gazevector[0], gazevector[1])

                #consider only first detected face in the frame
                break

            # Break if escape key pressed
            if key_pressed == 27:
                break

        #logging inference times
        if (frame_count > 0):
            logging.info(
                "============== Models Inference time ===============")
            logging.info("Face Detection:{:.1f}ms".format(1000 * fd_infertime /
                                                          frame_count))
            logging.info("Facial Landmarks Detection:{:.1f}ms".format(
                1000 * lm_infertime / frame_count))
            logging.info("Headpose Estimation:{:.1f}ms".format(
                1000 * hp_infertime / frame_count))
            logging.info("Gaze Estimation:{:.1f}ms".format(
                1000 * ge_infertime / frame_count))
            logging.info("============== End ===============================")

        # Release the capture and destroy any OpenCV windows
        feeder.close()
        cv2.destroyAllWindows()
    except Exception as ex:
        logging.exception("Error in inference: " + str(ex))
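
Example #3 is the only one that splits every model into an explicit preprocess_input -> predict -> preprocess_output pipeline, which is what lets it accumulate pure inference time (fd_infertime and friends) separately from pre- and post-processing. Below is a sketch of the wrapper interface the snippet assumes; the method names are taken from the calls above, while the input shape and the NCHW preprocessing are common OpenVINO conventions rather than details confirmed by the snippet.

import cv2


class ModelBase:
    # Interface sketch for the fdnet/lmnet/hpnet/genet wrappers used in
    # Example #3; load_model(), check_model() and predict() would wrap an
    # OpenVINO network in the real project.
    def __init__(self, model_xml, device='CPU'):
        self.model_xml = model_xml
        self.device = device
        self.input_shape = (1, 3, 300, 300)  # (N, C, H, W), model dependent

    def preprocess_input(self, frame):
        # Resize to the network's input size, then reorder HWC -> NCHW
        _, _, h, w = self.input_shape
        image = cv2.resize(frame, (w, h))
        image = image.transpose((2, 0, 1))
        return image.reshape(1, *image.shape)

    def load_model(self):
        raise NotImplementedError  # read and compile the IR network here

    def check_model(self):
        raise NotImplementedError  # verify all layers are supported

    def predict(self, p_frame):
        raise NotImplementedError  # run inference, return the raw output

    def preprocess_output(self, output, *args):
        raise NotImplementedError  # model-specific decoding and drawing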
Example #4
    # (Excerpt: start_time, logger, input_file and the model objects are
    # defined earlier in the full script.)
    head_start_time = time.time()
    head_pose_model.load_model()
    head_pose_time = (time.time() - head_start_time) * 1000

    facial_landmark_start = time.time()
    facial_landmark_model.load_model()
    facial_landmark_time = (time.time() - facial_landmark_start) * 1000

    gaze_model_start = time.time()
    gaze_estimation_model.load_model()
    gaze_model_time = (time.time() - gaze_model_start) * 1000

    total_loading_time = (time.time() - start_time) * 1000

    face_model.check_model()
    head_pose_model.check_model()
    facial_landmark_model.check_model()
    gaze_estimation_model.check_model()

    if input_file.lower() == 'cam':
        input_feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_file):
            logger.error("Unable to find video file for input")
            exit(1)
        input_feeder = InputFeeder(input_type='video', input_file=input_file)

    input_feeder.load_data()
    width = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
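
All four examples time their load and inference phases with the same time.time() bracketing. That boilerplate factors neatly into a small context manager; the following is a refactoring sketch, not code from any of the snippets above. time.perf_counter() is used here because it is the more appropriate clock for short durations, although the snippets themselves use time.time().

import time
from contextlib import contextmanager


@contextmanager
def timed(label, results):
    # Store the elapsed wall-clock time for the enclosed block, in ms
    start = time.perf_counter()
    yield
    results[label] = (time.perf_counter() - start) * 1000


# Hypothetical usage for the loading phase of Example #4:
# timings = {}
# with timed('head_pose_load', timings):
#     head_pose_model.load_model()
# logger.info("Head pose model loaded in {:.3f} ms".format(timings['head_pose_load']))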