def infer(args, logging_enabled):
    """
    Run inference on the input video; display and/or save the output video.
    """
    face_detection = FaceDetection(args.face_detection)
    facial_landmark_detection = FacialLandmarkDetection(args.facial_landmark_detection)
    gaze_estimation = GazeEstimation(args.gaze_estimation)
    head_pose_estimation = HeadPoseEstimation(args.head_pose_estimation)

    # Load the four models, capturing a timestamp before each load
    load_start = now()
    face_detection.load_model()
    fl_start = now()
    facial_landmark_detection.load_model()
    ge_start = now()
    gaze_estimation.load_model()
    hp_start = now()
    head_pose_estimation.load_model()
    log_model_load_times(logging_enabled, load_start, fl_start, ge_start, hp_start)

    feeder = InputFeeder("video", args.input)
    feeder.load_data()

    frame_count, fd_time, fl_time, ge_time, hp_time = [0] * 5
    while True:
        key = cv2.waitKey(20)
        try:
            frame = next(feeder.next_batch())
        except StopIteration:
            break
        frame_count += 1

        # Face detection
        fd_frame = face_detection.preprocess_input(frame)
        inf_start = now()
        fd_output = face_detection.predict(fd_frame)
        fd_time += now() - inf_start
        out_frame, faces = face_detection.preprocess_output(
            fd_output, frame, args.overlay_inference, args.probability_threshold)
        if not faces:
            continue  # skip frames with no detected face
        detected_face = frame[faces[0][1]:faces[0][3], faces[0][0]:faces[0][2]]

        # Facial landmark detection on the cropped face
        fl_frame = facial_landmark_detection.preprocess_input(detected_face)
        fl_start = now()
        fl_output = facial_landmark_detection.predict(fl_frame)
        fl_time += now() - fl_start
        out_frame, l_coord, r_coord = facial_landmark_detection.preprocess_output(
            fl_output, faces[0], out_frame, args.overlay_inference)

        # Head pose estimation on the cropped face
        hp_frame = head_pose_estimation.preprocess_input(detected_face)
        hp_start = now()
        hp_output = head_pose_estimation.predict(hp_frame)
        hp_time += now() - hp_start
        out_frame, head_pose = head_pose_estimation.preprocess_output(
            hp_output, out_frame, detected_face, faces[0], args.overlay_inference)

        # Gaze estimation from the eye crops and head pose angles
        out_frame, l_eye, r_eye = gaze_estimation.preprocess_input(
            out_frame, detected_face, l_coord, r_coord, args.overlay_inference)
        ge_start = now()
        ge_output = gaze_estimation.predict(head_pose, l_eye, r_eye)
        ge_time += now() - ge_start
        out_frame, g_vec = gaze_estimation.preprocess_output(
            ge_output, out_frame, faces[0], l_coord, r_coord, args.overlay_inference)

        if args.video_window:
            cv2.imshow(
                "Computer-Human Interface Peripheral Signal Manipulation "
                "via AI Retina Tracking (CHIPSMART)",
                out_frame,
            )
        if args.mouse_control and frame_count % 6 == 0:
            mouse_control.move(g_vec[0], g_vec[1])
        # Quit if the user presses Esc or Q
        if key in (27, 81):
            user_quit(logging_enabled)
            break

    log_inference_times(logging_enabled, frame_count, fd_time, fl_time, ge_time, hp_time)
    feeder.close()
    cv2.destroyAllWindows()
    quit()
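# `now()`, `log_model_load_times()`, and `log_inference_times()` are referenced
# above but not defined in this file (nor are `mouse_control` and `user_quit`,
# which are assumed to exist at module level). A minimal, hypothetical sketch of
# the timing helpers, assuming `now()` returns a float in seconds so the
# `fd_time += now() - inf_start` accumulators stay plain numbers:

import logging
import time


def now():
    # Assumed implementation: a monotonic float timestamp in seconds.
    return time.perf_counter()


def log_model_load_times(logging_enabled, load_start, fl_start, ge_start, hp_start):
    # Assumed implementation: each model's load time is the gap between the
    # timestamps captured around consecutive load_model() calls in infer().
    if not logging_enabled:
        return
    logging.info("Face detection load time: %.1fms", 1000 * (fl_start - load_start))
    logging.info("Facial landmark load time: %.1fms", 1000 * (ge_start - fl_start))
    logging.info("Gaze estimation load time: %.1fms", 1000 * (hp_start - ge_start))
    logging.info("Head pose load time: %.1fms", 1000 * (now() - hp_start))


def log_inference_times(logging_enabled, frame_count, fd_time, fl_time, ge_time, hp_time):
    # Assumed implementation: average per-frame inference time for each model.
    if not logging_enabled or frame_count == 0:
        return
    for name, total in (("Face detection", fd_time),
                        ("Facial landmarks", fl_time),
                        ("Gaze estimation", ge_time),
                        ("Head pose", hp_time)):
        logging.info("%s average inference: %.1fms", name, 1000 * total / frame_count)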
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to the network,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    try:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s [%(levelname)s] %(message)s",
            handlers=[logging.FileHandler("gaze-app.log"),
                      logging.StreamHandler()])

        # Initialise the mouse controller and the four model classes
        mc = MouseController("low", "fast")
        fdnet = FaceDetection(args.fdmodel)
        lmnet = FacialLandmarks(args.lmmodel)
        hpnet = HeadPoseEstimation(args.hpmodel)
        genet = GazeEstimation(args.gemodel)

        # Load the models, logging load time and verifying the layers
        logging.info("============== Models Load time ===============")
        start_time = time.time()
        fdnet.load_model()
        logging.info("Face Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        fdnet.check_model()
        logging.info("Face Detection estimation layers loaded correctly")

        start_time = time.time()
        lmnet.load_model()
        logging.info("Facial Landmarks Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        lmnet.check_model()
        logging.info("Facial Landmarks estimation layers loaded correctly")

        start_time = time.time()
        hpnet.load_model()
        logging.info("Headpose Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        hpnet.check_model()
        logging.info("Head pose estimation layers loaded correctly")

        start_time = time.time()
        genet.load_model()
        logging.info("Gaze Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        genet.check_model()
        logging.info("Gaze estimation layers loaded correctly")
        logging.info("============== End =====================")

        # Get and open the video capture
        feeder = InputFeeder('video', args.input)
        feeder.load_data()

        # Frame counter and per-model cumulative inference times
        frame_count = 0
        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0

        # Loop until the stream is over
        while True:
            # Read the next frame
            try:
                frame = next(feeder.next_batch())
            except StopIteration:
                break
            key_pressed = cv2.waitKey(60)
            frame_count += 1

            # Face detection
            fd_process_time = time.time()
            p_frame = fdnet.preprocess_input(frame)
            start_time = time.time()
            fdoutput = fdnet.predict(p_frame)
            fd_infertime += time.time() - start_time
            out_frame, fboxes = fdnet.preprocess_output(fdoutput, frame, args.print)
            logging.info("Face Detection Model processing time : {:.1f}ms".format(
                1000 * (time.time() - fd_process_time)))

            # For each detected face box (xmin, ymin, xmax, ymax)
            for fbox in fboxes:
                # Crop the face from the frame and get its landmarks
                face = frame[fbox[1]:fbox[3], fbox[0]:fbox[2]]
                lm_process_time = time.time()
                p_frame = lmnet.preprocess_input(face)
                start_time = time.time()
                lmoutput = lmnet.predict(p_frame)
                lm_infertime += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output(
                    lmoutput, fbox, out_frame, args.print)
                logging.info("Landmarks model processing time : {:.1f}ms".format(
                    1000 * (time.time() - lm_process_time)))

                # Head pose estimation
                hp_process_time = time.time()
                p_frame = hpnet.preprocess_input(face)
                start_time = time.time()
                hpoutput = hpnet.predict(p_frame)
                hp_infertime += time.time() - start_time
                out_frame, headpose_angles = hpnet.preprocess_output(
                    hpoutput, out_frame, face, fbox, args.print)
                logging.info("Headpose estimation model processing time : {:.1f}ms".format(
                    1000 * (time.time() - hp_process_time)))

                # Gaze estimation
                gaze_process_time = time.time()
                out_frame, left_eye, right_eye = genet.preprocess_input(
                    out_frame, face, left_eye_point, right_eye_point, args.print)
                start_time = time.time()
                geoutput = genet.predict(left_eye, right_eye, headpose_angles)
                ge_infertime += time.time() - start_time
                out_frame, gazevector = genet.preprocess_output(
                    geoutput, out_frame, fbox, left_eye_point, right_eye_point, args.print)
                logging.info("Gaze estimation model processing time : {:.1f}ms".format(
                    1000 * (time.time() - gaze_process_time)))

                if not args.no_video:
                    cv2.imshow('im', out_frame)
                if not args.no_move:
                    mc.move(gazevector[0], gazevector[1])

                # Consider only the first detected face in the frame
                break

            # Break if the Esc key is pressed
            if key_pressed == 27:
                break

        # Log average per-frame inference times
        if frame_count > 0:
            logging.info("============== Models Inference time ===============")
            logging.info("Face Detection:{:.1f}ms".format(
                1000 * fd_infertime / frame_count))
            logging.info("Facial Landmarks Detection:{:.1f}ms".format(
                1000 * lm_infertime / frame_count))
            logging.info("Headpose Estimation:{:.1f}ms".format(
                1000 * hp_infertime / frame_count))
            logging.info("Gaze Estimation:{:.1f}ms".format(
                1000 * ge_infertime / frame_count))
            logging.info("============== End ===============================")

        # Release the capture and destroy any OpenCV windows
        feeder.close()
        cv2.destroyAllWindows()
    except Exception as ex:
        logging.exception("Error in inference: %s", ex)
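# All three pipelines read frames with `next(feeder.next_batch())`, building a
# fresh generator on every loop iteration and pulling a single frame from it.
# This works because the generator reads from a persistent cv2.VideoCapture, so
# each new generator resumes from the capture's current position. A minimal
# sketch of such an InputFeeder, assuming the common starter-code design (the
# actual class in this repo may differ in details):

import cv2


class InputFeeder:
    def __init__(self, input_type, input_file=None):
        self.input_type = input_type
        self.input_file = input_file

    def load_data(self):
        # 'cam' opens the default webcam; anything else is treated as a file path.
        source = 0 if self.input_type == 'cam' else self.input_file
        self.cap = cv2.VideoCapture(source)

    def next_batch(self):
        # Generator over frames; state lives in self.cap, not the generator,
        # so next(feeder.next_batch()) advances the stream one frame at a time.
        while True:
            ret, frame = self.cap.read()
            if not ret:
                break
            yield frame

    def close(self):
        self.cap.release()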
def main():
    """
    Load the inference networks, stream video to the networks,
    and output stats and video.

    :return: None
    """
    # Logger init
    logging.basicConfig(level=logging.INFO,
                        format="[%(levelname)s] %(message)s")

    # Get command line args
    args = get_arg()

    # Load preferences from the YAML config file
    with open(args.config_file, "r") as yamlfile:
        cfg = yaml.load(yamlfile, Loader=yaml.FullLoader)

    models = cfg['models']
    input_source = args.input
    video_path = cfg['video_path']

    face_model = FaceDetection(models['face_detection'])
    head_pose_model = HeadPoseEstimation(models['head_pose_estimation'])
    facial_landmarks_model = FacialLandmarksDetection(models['facial_landmarks_detection'])
    gaze_estimation_model = GazeEstimation(models['gaze_estimation'])

    # Initialise the MouseController
    mouse_contr = MouseController("low", "fast")

    # Load the models and log timing
    start_time = time.time()
    face_model.load_model(args.device)
    logging.info("Load Face Detection model: {:.1f}ms".format(
        1000 * (time.time() - start_time)))

    start_time = time.time()
    facial_landmarks_model.load_model(args.device)
    logging.info("Load Facial Landmarks Detection model: {:.1f}ms".format(
        1000 * (time.time() - start_time)))

    start_time = time.time()
    head_pose_model.load_model(args.device)
    logging.info("Load Head Pose Estimation model: {:.1f}ms".format(
        1000 * (time.time() - start_time)))

    start_time = time.time()
    gaze_estimation_model.load_model(args.device)
    logging.info("Load Gaze Estimation model: {:.1f}ms".format(
        1000 * (time.time() - start_time)))

    # Get and open the video or camera capture
    input_feed = InputFeeder(input_type=input_source, input_file=video_path)
    input_feed.load_data()
    if not input_feed.cap.isOpened():
        logging.critical('Error opening input, check --video_path parameter')
        sys.exit(1)

    # Frame counter and per-model cumulative inference times
    frame_count = 0
    facedetect_infer_time = 0
    landmark_infer_time = 0
    headpose_infer_time = 0
    gaze_infer_time = 0

    # Loop until the stream is over
    while True:
        # Read the next frame
        try:
            frame = next(input_feed.next_batch())
        except StopIteration:
            break
        if frame is None:
            break
        key_pressed = cv2.waitKey(60)
        frame_count += 1

        input_height, input_width, _ = frame.shape
        logging.info("frame {count} size {w}, {h}".format(
            count=frame_count, w=input_width, h=input_height))

        # Face detection
        p_frame = face_model.preprocess_input(frame)
        start_time = time.time()
        fdoutput = face_model.predict(p_frame)
        facedetect_infer_time += time.time() - start_time
        out_frame, fboxes = face_model.preprocess_output(
            fdoutput, frame, args.overlay, args.prob_threshold)

        # For each detected face
        for fbox in fboxes:
            face = frame[fbox[1]:fbox[3], fbox[0]:fbox[2]]

            # Facial landmarks
            p_frame = facial_landmarks_model.preprocess_input(face)
            start_time = time.time()
            lmoutput = facial_landmarks_model.predict(p_frame)
            landmark_infer_time += time.time() - start_time
            out_frame, left_eye_point, right_eye_point = facial_landmarks_model.preprocess_output(
                lmoutput, fbox, out_frame, args.overlay, args.prob_threshold)

            # Head pose estimation
            p_frame = head_pose_model.preprocess_input(face)
            start_time = time.time()
            hpoutput = head_pose_model.predict(p_frame)
            headpose_infer_time += time.time() - start_time
            out_frame, headpose_angles = head_pose_model.preprocess_output(
                hpoutput, out_frame, face, fbox, args.overlay, args.prob_threshold)

            # Gaze estimation
            out_frame, left_eye, right_eye = gaze_estimation_model.preprocess_input(
                out_frame, face, left_eye_point, right_eye_point, args.overlay)
            start_time = time.time()
            geoutput = gaze_estimation_model.predict(left_eye, right_eye, headpose_angles)
            gaze_infer_time += time.time() - start_time
            out_frame, gazevector = gaze_estimation_model.preprocess_output(
                geoutput, out_frame, fbox, left_eye_point, right_eye_point,
                args.overlay, args.prob_threshold)

            cv2.imshow('im', out_frame)

            if args.mouse_move:
                logging.info("mouse move vector : x ={}, y={}".format(
                    gazevector[0], gazevector[1]))
                mouse_contr.move(gazevector[0], gazevector[1])

            # Use only the first detected face in the frame
            break

        # Break if the Esc key is pressed
        if key_pressed == 27:
            break

    # Log average per-frame inference times
    if frame_count > 0:
        logging.info("***** Models Inference time *****")
        logging.info("Face Detection:{:.1f}ms".format(
            1000 * facedetect_infer_time / frame_count))
        logging.info("Facial Landmarks Detection:{:.1f}ms".format(
            1000 * landmark_infer_time / frame_count))
        logging.info("Headpose Estimation:{:.1f}ms".format(
            1000 * headpose_infer_time / frame_count))
        logging.info("Gaze Estimation:{:.1f}ms".format(
            1000 * gaze_infer_time / frame_count))

    # Release the capture and destroy any OpenCV windows
    input_feed.close()
    cv2.destroyAllWindows()
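# main() expects the YAML config to provide a `models` map and a `video_path`
# (the only keys it reads). A hypothetical config.yaml is sketched below; the
# model paths are placeholders using standard Open Model Zoo names, not this
# repo's actual file layout:
#
#   video_path: bin/demo.mp4
#   models:
#     face_detection: models/face-detection-adas-binary-0001.xml
#     head_pose_estimation: models/head-pose-estimation-adas-0001.xml
#     facial_landmarks_detection: models/landmarks-regression-retail-0009.xml
#     gaze_estimation: models/gaze-estimation-adas-0002.xml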