Example #1
def main():
    args = build_argparser().parse_args()

    preview_flags = args.preview_flags

    logger = logging.getLogger()
    input_path = args.input

    if input_path.lower() == 'cam':
        input_feed = InputFeeder('cam')
    else:
        if not os.path.isfile(input_path):
            logger.error('Unable to find specified video file')
            exit(1)
        file_extension = input_path.split(".")[-1]
        if file_extension in ['jpg', 'jpeg', 'bmp']:
            input_feed = InputFeeder('image', input_path)
        elif file_extension in ['avi', 'mp4']:
            input_feed = InputFeeder('video', input_path)
        else:
            logger.error(
                "Unsupported file extension. Allowed: ['jpg', 'jpeg', 'bmp', 'avi', 'mp4']"
            )
            exit(1)

    if sys.platform == "linux" or sys.platform == "linux2":
        #CODEC = 0x00000021
        CODEC = cv2.VideoWriter_fourcc(*"mp4v")
    elif sys.platform == "darwin":
        CODEC = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
    else:
        print("Unsupported OS.")
        exit(1)

    file_flag = False
    if args.output_file.lower() == 'y':
        file_flag = True
        out = cv2.VideoWriter('output.mp4', CODEC, 30,
                              (FRAME_WIDTH, FRAME_HEIGHT))

    modelPathDict = {
        'face_detect': args.face_detection_model,
        'face_landmark_regress': args.facial_landmark_model,
        'head_pose': args.head_pose_model,
        'gaze_estimate': args.gaze_estimation_model
    }

    for pathname in modelPathDict:
        if not os.path.isfile(modelPathDict[pathname]):
            logger.error('Unable to find specified ' + pathname + ' xml file')
            exit(1)

    # Initializing models
    fdm = FaceDetectionModel(modelPathDict['face_detect'], args.device,
                             args.cpu_extension)
    fldm = FacialLandmarksDetectionModel(
        modelPathDict['face_landmark_regress'], args.device,
        args.cpu_extension)
    hpem = HeadPoseEstimationModel(modelPathDict['head_pose'], args.device,
                                   args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['gaze_estimate'], args.device,
                              args.cpu_extension)

    # Initializing mouse controller
    mouse_controller = MouseController('medium', 'fast')

    input_feed.load_data()

    # Checking models
    fdm.check_model()
    fldm.check_model()
    hpem.check_model()
    gem.check_model()

    # Loading models / creating executable network
    fdm.load_model()
    fldm.load_model()
    hpem.load_model()
    gem.load_model()

    frame_count = 0
    for ret, frame in input_feed.next_batch():
        if not ret:
            break

        frame_count += 1

        key = cv2.waitKey(60)
        """
        Sequence of model execution:-
        1. Predict from each model.
        2. Preprocess of outputs from each model.
        3. Send the processed output to the next model.

        Model Sequence:- 
                                -   Head Pose Estimation Model      -
        Face Detection Model <(First Head Pose and Then Facial Landmark)>Gaze Estimation Model 
                                -   Facial Landmark Detection Model -  
        """

        cropped_face, face_coords = fdm.preprocess_output(
            frame.copy(), fdm.predict(frame.copy()), args.prob_threshold)

        if isinstance(cropped_face, int):
            logger.error('Unable to detect the face.')
            if key == 27:
                break
            continue

        hp_out = hpem.preprocess_output(hpem.predict(cropped_face.copy()))

        left_eye, right_eye, eye_coords = fldm.preprocess_output(
            cropped_face.copy(), fldm.predict(cropped_face.copy()))

        new_mouse_coord, gaze_vector = gem.preprocess_output(
            gem.predict(left_eye, right_eye, hp_out), hp_out)

        if preview_flags or file_flag:
            preview_frame = frame.copy()

            if 'fd' in preview_flags:
                preview_frame = cv2.rectangle(preview_frame,
                                              (face_coords[0], face_coords[1]),
                                              (face_coords[2], face_coords[3]),
                                              (0, 0, 255), 3)
                cropped_face = preview_frame[face_coords[1]:face_coords[3],
                                             face_coords[0]:face_coords[2]]

            if 'fld' in preview_flags:
                cropped_face = cv2.rectangle(
                    cropped_face,
                    (eye_coords[0][0] - 10, eye_coords[0][1] - 10),
                    (eye_coords[0][2] + 10, eye_coords[0][3] + 10),
                    (0, 255, 0), 3)
                cropped_face = cv2.rectangle(
                    cropped_face,
                    (eye_coords[1][0] - 10, eye_coords[1][1] - 10),
                    (eye_coords[1][2] + 10, eye_coords[1][3] + 10),
                    (0, 255, 0), 3)

                preview_frame[face_coords[1]:face_coords[3],
                              face_coords[0]:face_coords[2]] = cropped_face

            if 'hp' in preview_flags:
                cv2.putText(
                    preview_frame,
                    'Pose Angles: yaw: {:.2f} | pitch: {:.2f} | roll: {:.2f}'.
                    format(hp_out[0], hp_out[1], hp_out[2]), (20, 40),
                    cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 2)

            if 'ge' in preview_flags:

                x, y = int(gaze_vector[0] * GAZE_ARROW_LENGTH), -int(
                    gaze_vector[1] * GAZE_ARROW_LENGTH)

                le_mid_x = int((eye_coords[0][0] + eye_coords[0][2]) / 2)
                le_mid_y = int((eye_coords[0][1] + eye_coords[0][3]) / 2)
                re_mid_x = int((eye_coords[1][0] + eye_coords[1][2]) / 2)
                re_mid_y = int((eye_coords[1][1] + eye_coords[1][3]) / 2)

                cv2.arrowedLine(cropped_face, (le_mid_x, le_mid_y),
                                ((le_mid_x + x), (le_mid_y + y)), (255, 0, 0),
                                GAZE_ARROW_WIDTH)
                cv2.arrowedLine(cropped_face, (re_mid_x, re_mid_y),
                                ((re_mid_x + x), (re_mid_y + y)), (255, 0, 0),
                                GAZE_ARROW_WIDTH)

                preview_frame[face_coords[1]:face_coords[3],
                              face_coords[0]:face_coords[2]] = cropped_face

            if preview_flags and frame_count % 2 == 0:
                if args.zoomed:
                    cv2.imshow(
                        'Cropped Face',
                        cv2.resize(cropped_face, (FRAME_WIDTH, FRAME_HEIGHT)))
                else:
                    cv2.imshow(
                        'Preview',
                        cv2.resize(preview_frame, (FRAME_WIDTH, FRAME_HEIGHT)))

            if file_flag:
                out.write(
                    cv2.resize(preview_frame, (FRAME_WIDTH, FRAME_HEIGHT)))

        # Move the mouse pointer
        try:
            mouse_controller.move(new_mouse_coord[0], new_mouse_coord[1])
        except pyautogui.FailSafeException:
            pass

        if frame_count % 2 == 0 and not preview_flags:
            cv2.imshow('Video', cv2.resize(frame, (FRAME_WIDTH, FRAME_HEIGHT)))

        if key == 27:
            break

    logger.error('VideoStream ended.')
    if args.output_file.lower() == 'y':
        out.release()
    input_feed.close()
    cv2.destroyAllWindows()
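
Example #1 calls build_argparser() without showing it. Below is a minimal sketch of an argument parser that supplies every attribute this main() actually reads; the short option names, help strings, and defaults are illustrative assumptions, only the destination names come from the code above.

from argparse import ArgumentParser

def build_argparser():
    # Hypothetical parser matching the attributes read by main() in Example #1.
    parser = ArgumentParser(description="Gaze-driven mouse pointer control")
    parser.add_argument('-i', '--input', required=True,
                        help="Path to an image/video file, or 'cam' for webcam")
    parser.add_argument('-fd', '--face_detection_model', required=True,
                        help="Path to the face detection model XML")
    parser.add_argument('-fl', '--facial_landmark_model', required=True,
                        help="Path to the facial landmarks model XML")
    parser.add_argument('-hp', '--head_pose_model', required=True,
                        help="Path to the head pose estimation model XML")
    parser.add_argument('-ge', '--gaze_estimation_model', required=True,
                        help="Path to the gaze estimation model XML")
    parser.add_argument('-d', '--device', default='CPU',
                        help="Target device: CPU, GPU, FPGA or MYRIAD")
    parser.add_argument('-ce', '--cpu_extension', default=None,
                        help="Path to a CPU extension library, if required")
    parser.add_argument('-pt', '--prob_threshold', type=float, default=0.6,
                        help="Confidence threshold for face detection")
    parser.add_argument('-flags', '--preview_flags', nargs='*', default=[],
                        help="Visualizations to draw: any of fd, fld, hp, ge")
    parser.add_argument('-o', '--output_file', default='n',
                        help="Write the preview to output.mp4 ('y'/'n')")
    parser.add_argument('-z', '--zoomed', action='store_true',
                        help="Show the cropped face instead of the full frame")
    return parser
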
Example #2

def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.
    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    try:
        logging.basicConfig(level=logging.DEBUG,
                            format="%(asctime)s [%(levelname)s] %(message)s",
                            handlers=[
                                logging.FileHandler("debug.log"),
                                logging.StreamHandler()
                            ])

        # Initialise the class
        mc = MouseController("low", "fast")
        fdnet = FaceDetectionModel(args.fdmodel)
        lmnet = FacialLandMarksDetectionModel(args.lmmodel)
        hpnet = HeadPoseEstimationModel(args.hpmodel)
        genet = GazeEstimationModel(args.gemodel)

        start_time = time.time()
        fdnet.load_model()
        logging.info(
            f"Face Detection Model: {1000 * (time.time() - start_time):.1f}ms")

        start_time = time.time()
        lmnet.load_model()
        logging.info(
            f"Facial Landmarks Detection Model: {1000 * (time.time() - start_time):.1f}ms"
        )

        start_time = time.time()
        hpnet.load_model()
        logging.info(
            f"Headpose Estimation Model: {1000 * (time.time() - start_time):.1f}ms"
        )

        start_time = time.time()
        genet.load_model()
        logging.info(
            f"Gaze Estimation Model: {1000 * (time.time() - start_time):.1f}ms"
        )

        # Get and open video capture
        feeder = InputFeeder('video', args.input)
        feeder.load_data()

        frame_count = 0

        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0

        while True:
            # Read the next frame
            try:
                frame = next(feeder.next_batch())
            except StopIteration:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1

            # face detection
            p_frame = fdnet.preprocess_input(frame)
            start_time = time.time()
            fd_output = fdnet.predict(p_frame)
            fd_infertime += time.time() - start_time
            out_frame, bboxes = fdnet.preprocess_output(
                fd_output, frame, args.print)

            for bbox in bboxes:

                face = frame[bbox[1]:bbox[3], bbox[0]:bbox[2]]
                p_frame = lmnet.preprocess_input(face)

                start_time = time.time()
                lm_output = lmnet.predict(p_frame)
                lm_infertime += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output(
                    lm_output, bbox, out_frame, args.print)

                # get head pose estimation
                p_frame = hpnet.preprocess_input(face)
                start_time = time.time()
                hp_output = hpnet.predict(p_frame)
                hp_infertime += time.time() - start_time
                out_frame, headpose_angles = hpnet.preprocess_output(
                    hp_output, out_frame, face, bbox, args.print)

                # get gaze estimation
                out_frame, left_eye, right_eye = genet.preprocess_input(
                    out_frame, face, left_eye_point, right_eye_point,
                    args.print)
                start_time = time.time()
                ge_output = genet.predict(left_eye, right_eye, headpose_angles)
                ge_infertime += time.time() - start_time
                out_frame, gaze_vector = genet.preprocess_output(
                    ge_output, out_frame, bbox, left_eye_point,
                    right_eye_point, args.print)

                if not args.no_video:
                    cv2.imshow('image', out_frame)

                if not args.no_move:
                    mc.move(gaze_vector[0], gaze_vector[1])

                # consider only the first detected face in the frame
                break

            if key_pressed == 27:
                break

        if frame_count > 0:
            logging.info(
                f"Face Detection:{1000* fd_infertime/frame_count:.1f}ms")
            logging.info(
                f"Facial Landmarks Detection:{1000* lm_infertime/frame_count:.1f}ms"
            )
            logging.info(
                f"Headpose Estimation:{1000* hp_infertime/frame_count:.1f}ms")
            logging.info(
                f"Gaze Estimation:{1000* ge_infertime/frame_count:.1f}ms")

        feeder.close()
        cv2.destroyAllWindows()
    except Exception as ex:
        logging.exception(f"Error during inference:{str(ex)}")
Example #3
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    try:
        logging.basicConfig(level=logging.INFO,
                            format="%(asctime)s [%(levelname)s] %(message)s",
                            handlers=[
                                logging.FileHandler("gaze-app.log"),
                                logging.StreamHandler()
                            ])

        # Initialise the class
        mc = MouseController("low", "fast")
        #mc.move(100,100)
        fdnet = FaceDetectionModel(args.fdmodel)
        lmnet = FacialLandMarksDetectionModel(args.lmmodel)
        hpnet = HeadPoseEstimationModel(args.hpmodel)
        genet = GazeEstimationModel(args.gemodel)

        ### Load the models ###
        logging.info("============== Models Load time ===============")
        start_time = time.time()
        fdnet.load_model()
        logging.info("Face Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        lmnet.load_model()
        logging.info("Facial Landmarks Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        hpnet.load_model()
        logging.info("Headpose Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        genet.load_model()
        logging.info("Gaze Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        logging.info("==============  End =====================")
        # Get and open video capture
        feeder = InputFeeder('video', args.input)
        feeder.load_data()
        # FPS = feeder.get_fps()

        # Grab the shape of the input
        # width = feeder.get_width()
        # height = feeder.get_height()

        # init scene variables
        frame_count = 0

        ### Loop until stream is over ###
        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0
        while True:
            # Read the next frame
            try:
                frame = next(feeder.next_batch())
            except StopIteration:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1
            #print(int((frame_count) % int(FPS)))

            # face detection
            p_frame = fdnet.preprocess_input(frame)
            start_time = time.time()
            fnoutput = fdnet.predict(p_frame)
            fd_infertime += time.time() - start_time
            out_frame, fboxes = fdnet.preprocess_output(
                fnoutput, frame, args.print)

            # for each face
            for fbox in fboxes:

                # fbox = (xmin,ymin,xmax,ymax)
                # get face landmarks
                # crop face from frame
                face = frame[fbox[1]:fbox[3], fbox[0]:fbox[2]]
                p_frame = lmnet.preprocess_input(face)

                start_time = time.time()
                lmoutput = lmnet.predict(p_frame)
                lm_infertime += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output(
                    lmoutput, fbox, out_frame, args.print)

                # get head pose estimation
                p_frame = hpnet.preprocess_input(face)
                start_time = time.time()
                hpoutput = hpnet.predict(p_frame)
                hp_infertime += time.time() - start_time
                out_frame, headpose_angles = hpnet.preprocess_output(
                    hpoutput, out_frame, face, fbox, args.print)

                # get gaze estimation
                out_frame, left_eye, right_eye = genet.preprocess_input(
                    out_frame, face, left_eye_point, right_eye_point,
                    args.print)
                start_time = time.time()
                geoutput = genet.predict(left_eye, right_eye, headpose_angles)
                ge_infertime += time.time() - start_time
                out_frame, gazevector = genet.preprocess_output(
                    geoutput, out_frame, fbox, left_eye_point, right_eye_point,
                    args.print)

                if not args.no_video:
                    cv2.imshow('im', out_frame)

                if not args.no_move:
                    mc.move(gazevector[0], gazevector[1])

                # consider only the first detected face in the frame
                break

            # Break if escape key pressed
            if key_pressed == 27:
                break

        # Logging inference times
        if frame_count > 0:
            logging.info(
                "============== Models Inference time ===============")
            logging.info("Face Detection:{:.1f}ms".format(1000 * fd_infertime /
                                                          frame_count))
            logging.info("Facial Landmarks Detection:{:.1f}ms".format(
                1000 * lm_infertime / frame_count))
            logging.info("Headpose Estimation:{:.1f}ms".format(
                1000 * hp_infertime / frame_count))
            logging.info("Gaze Estimation:{:.1f}ms".format(
                1000 * ge_infertime / frame_count))
            logging.info("============== End ===============================")

        # Release the capture and destroy any OpenCV windows
        feeder.close()
        cv2.destroyAllWindows()
    except Exception as ex:
        logging.exception("Error in inference:" + str(ex))
Example #4

def infer_on_stream(args):
    face_detection_model_file = args.faceDetectionModel
    facial_landmarks_detection_model_file = args.facialLandmarksModel
    head_pose_estimation_model_file = args.headPoseModel
    gaze_estimation_model_file = args.gazeModel

    video_file = args.input
    device_name = args.device
    cpu_extension = args.cpu_extension
    prob_threshold = args.prob_threshold
    preview_flag = args.preview_flag

    output_path = args.output_path
    if not os.path.exists(output_path):
        os.mkdir(output_path)

    mouse_control = MouseController("low", "fast")

    try:
        logging.info("*********** Model Load Time ***************")
        start_model_load_time = time.time()

        start_time = time.time()
        face_detection_model = FaceDetectionModel(face_detection_model_file,
                                                  device_name, cpu_extension)
        logging.info("Face Detection Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        facial_landmarks_detection_model = FacialLandmarksDetectionModel(
            facial_landmarks_detection_model_file, device_name, cpu_extension)
        logging.info("Facial Landmarks Detection Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        head_pose_estimation_model = HeadPoseEstimationModel(
            head_pose_estimation_model_file, device_name, cpu_extension)
        logging.info("Head Pose Estimation Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        gaze_estimation_model = GazeEstimationModel(gaze_estimation_model_file,
                                                    device_name, cpu_extension)
        logging.info("Gaze Estimation Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        total_model_load_time = time.time() - start_model_load_time
        logging.info("*********** Model Load Completed ***********")
    except Exception as e:
        logging.error("ERROR in model loading: " + str(e))
        sys.exit(1)

    feeder = InputFeeder('video', video_file)
    feeder.load_data()

    out_video = cv2.VideoWriter(os.path.join(output_path, 'output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                int(feeder.fps() / 10), (1920, 1080), True)

    start_inference_time = time.time()
    frame_count = 0
    face_detect_infer_time = 0
    facial_landmarks_infer_time = 0
    head_pose_infer_time = 0
    gaze_infer_time = 0

    while True:
        try:
            frame = next(feeder.next_batch())
        except StopIteration:
            break

        key_pressed = cv2.waitKey(60)
        frame_count += 1

        ## Face Detection Model
        image = face_detection_model.preprocess_input(frame)

        start_time = time.time()
        outputs = face_detection_model.predict(image)
        face_detect_infer_time += (time.time() - start_time)
        out_frame, faces = face_detection_model.preprocess_output(
            outputs, frame, preview_flag, prob_threshold)

        for face in faces:
            crop_image = frame[face[1]:face[3], face[0]:face[2]]

            ## Facial Landmarks Detection Model
            image = facial_landmarks_detection_model.preprocess_input(
                crop_image)

            start_time = time.time()
            outputs = facial_landmarks_detection_model.predict(image)
            facial_landmarks_infer_time += (time.time() - start_time)
            out_frame, left_eye_point, right_eye_point = facial_landmarks_detection_model.preprocess_output(
                outputs, out_frame, face, preview_flag)

            ## Head Pose Estimation Model
            image = head_pose_estimation_model.preprocess_input(crop_image)

            start_time = time.time()
            outputs = head_pose_estimation_model.predict(image)
            head_pose_infer_time += (time.time() - start_time)
            out_frame, headpose_angles_list = head_pose_estimation_model.preprocess_output(
                outputs, out_frame, preview_flag)

            ## Gaze Estimation Model
            out_frame, left_eye, right_eye = gaze_estimation_model.preprocess_input(
                out_frame, crop_image, left_eye_point, right_eye_point)

            start_time = time.time()
            outputs = gaze_estimation_model.predict(left_eye, right_eye,
                                                    headpose_angles_list)
            gaze_infer_time += (time.time() - start_time)
            out_frame, gazevector = gaze_estimation_model.preprocess_output(
                outputs, out_frame, face, left_eye_point, right_eye_point,
                preview_flag)

            cv2.imshow("Computer Pointer Control", out_frame)
            out_video.write(out_frame)
            mouse_control.move(gazevector[0], gazevector[1])

        if key_pressed == 27:
            break

    if frame_count > 0:
        logging.info("*********** Model Inference Time ****************")
        logging.info("Face Detection Model: {:.1f} ms.".format(
            1000 * face_detect_infer_time / frame_count))
        logging.info("Facial Landmarks Detection Model: {:.1f} ms.".format(
            1000 * facial_landmarks_infer_time / frame_count))
        logging.info("Head Pose Detection Model: {:.1f} ms.".format(
            1000 * head_pose_infer_time / frame_count))
        logging.info("Gaze Detection Model: {:.1f} ms.".format(
            1000 * gaze_infer_time / frame_count))
        logging.info("*********** Model Inference Completed ***********")

    total_infer_time = time.time() - start_inference_time
    total_inference_time = round(total_infer_time, 1)
    fps = frame_count / total_inference_time

    with open(os.path.join(output_path, 'stats.txt'), 'w') as f:
        f.write(str(total_inference_time) + '\n')
        f.write(str(fps) + '\n')
        f.write(str(total_model_load_time) + '\n')

    logging.info("*********** Total Summary ****************")
    logging.info(f"Total Model Load Time: {total_model_load_time}")
    logging.info(f"Total Inference Time: {total_inference_time}")
    logging.info(f"FPS: {fps}")
    logging.info("*********** Total Summary ***********")
    logging.info("*********** ************************* ***********")

    feeder.close()
    cv2.destroyAllWindows()
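
Example #4 writes its benchmark figures to stats.txt, one value per line, in the order total inference time, FPS, and total model load time, so runs on different devices or precisions can be compared later. A small reader sketch for that file is shown below; the helper name and return shape are illustrative.

import os

def read_stats(output_path):
    # stats.txt layout (see Example #4): inference time, FPS, model load time
    with open(os.path.join(output_path, 'stats.txt')) as f:
        total_inference_time, fps, total_model_load_time = (
            float(f.readline()) for _ in range(3))
    return {'total_inference_time_s': total_inference_time,
            'fps': fps,
            'total_model_load_time_s': total_model_load_time}
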