def load_all_models(args):
    model_path_dict = {
        'FaceDetectionModel': args.face_detection_model,
        'FacialLandmarksDetectionModel': args.facial_landmarks_model,
        'GazeEstimationModel': args.gaze_estimation_model,
        'HeadPoseEstimationModel': args.head_pose_model
    }
    for fileNameKey in model_path_dict.keys():
        if not os.path.isfile(model_path_dict[fileNameKey] + ".xml"):
            logging.error("Unable to find specified " + fileNameKey + " xml file")
            exit(1)

    fd_model = FaceDetectionModel(model_path_dict['FaceDetectionModel'],
                                  args.threshold, args.device,
                                  args.cpu_extension)
    fld_model = FacialLandmarksDetectionModel(
        model_path_dict['FacialLandmarksDetectionModel'], args.threshold,
        args.device, args.cpu_extension)
    ge_model = GazeEstimationModel(model_path_dict['GazeEstimationModel'],
                                   args.threshold, args.device,
                                   args.cpu_extension)
    hpe_model = HeadPoseEstimationModel(
        model_path_dict['HeadPoseEstimationModel'], args.threshold,
        args.device, args.cpu_extension)

    start_time = time.time()
    fd_model.load_model()
    fld_model.load_model()
    ge_model.load_model()
    hpe_model.load_model()
    total_model_load_time = time.time() - start_time

    return fd_model, fld_model, ge_model, hpe_model, total_model_load_time
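# Hypothetical usage sketch for load_all_models(): it assumes an argparse
# namespace carrying the attributes referenced above (face_detection_model,
# facial_landmarks_model, gaze_estimation_model, head_pose_model, threshold,
# device, cpu_extension), e.g. produced by a build_argparser() helper like the
# ones used by the other entry points in this file.
def _example_load_all_models(args):
    fd_model, fld_model, ge_model, hpe_model, load_time = load_all_models(args)
    logging.info("Total model load time: {:.3f}s".format(load_time))
    return fd_model, fld_model, ge_model, hpe_model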
def __init__(self, device='CPU', mouse_con=False, face_dec=None, fac_land=None,
             head_pose=None, gaze=None, show_video=False, save_video=False):
    '''
    all models should be put in here
    '''
    if face_dec and fac_land and head_pose and gaze:
        self.face_dec, self.fac_land, self.head_pose, self.gaze = (
            FaceDetectionModel(face_dec, device=device),
            FacialLandmarksDetection(fac_land, device=device),
            Head_Pose_Estimation(head_pose, device=device),
            Gaze_Estimation(gaze, device=device))
        self.face_dec.load_model()
        self.fac_land.load_model()
        self.head_pose.load_model()
        self.gaze.load_model()
    else:
        raise ValueError('Missing Arguments')

    if mouse_con:
        self.mouse_con = MouseController("low", "fast")
    self.show_video, self.save_video = show_video, save_video
def init_models(args, logger):
    face_detec = None
    fac_land = None
    head_pose = None
    gaze_est = None
    models_info = {
        'face_detection_model': args.face_detection_model,
        'facial_landmarks_detection_model': args.facial_landmarks_detection_model,
        'head_pose_estimation_model': args.head_pose_estimation_model,
        'gaze_estimation_model': args.gaze_estimation_model
    }
    for model_name in models_info.keys():
        if not os.path.isfile(models_info[model_name]):
            logger.error("Unable to find the model file of " + str(model_name))
            exit(1)

    # Init classes
    face_detec = FaceDetectionModel(models_info['face_detection_model'],
                                    args.device, args.cpu_extension,
                                    args.threshold)
    fac_land = FacialLandmarksDetectionModel(
        models_info['facial_landmarks_detection_model'], args.device,
        args.cpu_extension)
    head_pose = HeadPoseEstimationModel(
        models_info['head_pose_estimation_model'], args.device,
        args.cpu_extension)
    gaze_est = GazeEstimationModel(models_info['gaze_estimation_model'],
                                   args.device, args.cpu_extension)

    return face_detec, fac_land, head_pose, gaze_est
def model_instants(args):
    face_detection_instant = FaceDetectionModel(
        model_name=args.face_detection,
        device=args.device,
        threshold=args.prob_threshold,
        extensions=args.cpu_extension)
    head_pose_estimation_instant = HeadPoseEstimationModel(
        model_name=args.head_pose_estimation,
        device=args.device,
        extensions=args.cpu_extension)
    facial_landmarks_instant = FacialLandmarksDetectionModel(
        model_name=args.facial_landmarks_detection,
        device=args.device,
        extensions=args.cpu_extension)
    gaze_estimation_instant = GazeEstimationModel(
        model_name=args.gaze_estimation,
        device=args.device,
        extensions=args.cpu_extension)
    mouse_controller_instant = MouseController('medium', 'fast')

    return (face_detection_instant, head_pose_estimation_instant,
            facial_landmarks_instant, gaze_estimation_instant,
            mouse_controller_instant)
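# Note that model_instants() above only constructs the wrappers; unlike
# load_all_models(), nothing is read onto the device yet. A small sketch of
# how a caller might load and time them (load_model() is the same method used
# throughout this file; the timing and print formatting are assumptions):
def _example_load_model_instants(args):
    fd, hpe, fld, ge, mouse = model_instants(args)
    start = time.time()
    for model in (fd, hpe, fld, ge):
        model.load_model()
    print("Total model load time: {:.3f}s".format(time.time() - start))
    return fd, hpe, fld, ge, mouse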
def main(args):
    feed = InputFeeder(input_type=args.it, input_file=args.i)

    face_model = FaceDetectionModel(args.fm, args.d, args.c, float(args.p))
    face_model.load_model()
    landmarks_model = LandmarksDetectionModel(args.lm, args.d, args.c)
    landmarks_model.load_model()
    headpose_model = HeadPoseDetectionModel(args.hpm, args.d, args.c)
    headpose_model.load_model()
    gaze_model = GazeEstimationModel(args.gem, args.d, args.c)
    gaze_model.load_model()

    mouse = MouseController("medium", "fast")

    feed.load_data()
    for batch in feed.next_batch():
        # try:
        cropped_face, coords, _ = face_model.predict(batch)
        cv2.rectangle(batch, (coords[0], coords[1]), (coords[2], coords[3]),
                      (255, 0, 0), 2)
        left_eye, right_eye, eyes_coords, _ = landmarks_model.predict(cropped_face)
        head_pose_angles, _ = headpose_model.predict(cropped_face)
        x, y, z, _ = gaze_model.predict(left_eye, right_eye, head_pose_angles,
                                        cropped_face, eyes_coords)
        mouse.move(x, y)
        cv2.imshow("img", batch)
        if cv2.waitKey(25) & 0xFF == ord('q'):
            break
        # except:
        #     print("Frame without prediction. Error: ", sys.exc_info()[0])
        #     log.error(sys.exc_info()[0])
    feed.close()
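# The commented-out try/except in main() above hints at per-frame error
# handling. A minimal sketch of one way to isolate a single frame's pipeline
# so a failed prediction is logged and skipped instead of aborting the loop
# (an assumption about intent, not the author's final design; it reuses the
# model objects and the log/sys names from main() above):
def _predict_single_frame(batch, face_model, landmarks_model, headpose_model,
                          gaze_model):
    try:
        cropped_face, coords, _ = face_model.predict(batch)
        left_eye, right_eye, eyes_coords, _ = landmarks_model.predict(cropped_face)
        head_pose_angles, _ = headpose_model.predict(cropped_face)
        x, y, z, _ = gaze_model.predict(left_eye, right_eye, head_pose_angles,
                                        cropped_face, eyes_coords)
        return x, y
    except Exception:
        # Mirrors the commented-out handler: log and let the caller skip the frame.
        log.error(sys.exc_info()[0])
        return None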
def infer_on_stream(args): """ Initialize the inference network, stream video to network, and output stats and video. :param args: Command line arguments parsed by `build_argparser()` :return: None """ try: logging.basicConfig(level=logging.DEBUG, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[ logging.FileHandler("debug.log"), logging.StreamHandler() ]) # Initialise the class mc = MouseController("low", "fast") fdnet = FaceDetectionModel(args.fdmodel) lmnet = FacialLandMarksDetectionModel(args.lmmodel) hpnet = HeadPoseEstimationModel(args.hpmodel) genet = GazeEstimationModel(args.gemodel) start_time = time.time() fdnet.load_model() logging.info( f"Face Detection Model: {1000 * (time.time() - start_time):.1f}ms") start_time = time.time() lmnet.load_model() logging.info( f"Facial Landmarks Detection Model: {1000 * (time.time() - start_time):.1f}ms" ) start_time = time.time() hpnet.load_model() logging.info( f"Headpose Estimation Model: {1000 * (time.time() - start_time):.1f}ms" ) start_time = time.time() genet.load_model() logging.info( f"Gaze Estimation Model: {1000 * (time.time() - start_time):.1f}ms" ) # Get and open video capture feeder = InputFeeder('video', args.input) feeder.load_data() frame_count = 0 fd_infertime = 0 lm_infertime = 0 hp_infertime = 0 ge_infertime = 0 while True: # Read the next frame try: frame = next(feeder.next_batch()) except StopIteration: break key_pressed = cv2.waitKey(60) frame_count += 1 # face detection p_frame = fdnet.preprocess_input(frame) start_time = time.time() fd_output = fdnet.predict(p_frame) fd_infertime += time.time() - start_time out_frame, bboxes = fdnet.preprocess_output( fd_output, frame, args.print) for bbox in bboxes: face = frame[bbox[1]:bbox[3], bbox[0]:bbox[2]] p_frame = lmnet.preprocess_input(face) start_time = time.time() lm_output = lmnet.predict(p_frame) lm_infertime += time.time() - start_time out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output( lm_output, bbox, out_frame, args.print) # get head pose estimation p_frame = hpnet.preprocess_input(face) start_time = time.time() hp_output = hpnet.predict(p_frame) hp_infertime += time.time() - start_time out_frame, headpose_angles = hpnet.preprocess_output( hp_output, out_frame, face, bbox, args.print) # get gaze estimation out_frame, left_eye, right_eye = genet.preprocess_input( out_frame, face, left_eye_point, right_eye_point, args.print) start_time = time.time() ge_output = genet.predict(left_eye, right_eye, headpose_angles) ge_infertime += time.time() - start_time out_frame, gaze_vector = genet.preprocess_output( ge_output, out_frame, bbox, left_eye_point, right_eye_point, args.print) if not args.no_video: cv2.imshow('image', out_frame) if not args.no_move: mc.move(gaze_vector[0], gaze_vector[1]) break if key_pressed == 27: break if frame_count > 0: logging.info( f"Face Detection:{1000* fd_infertime/frame_count:.1f}ms") logging.info( f"Facial Landmarks Detection:{1000* lm_infertime/frame_count:.1f}ms" ) logging.info( f"Headpose Estimation:{1000* hp_infertime/frame_count:.1f}ms") logging.info( f"Gaze Estimation:{1000* ge_infertime/frame_count:.1f}ms") feeder.close() cv2.destroyAllWindows() except Exception as ex: logging.exception(f"Error during inference:{str(ex)}")
def main(): # Grab command line args args = build_args().parse_args() # Config Logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger() #os.system('clear') print("\n") logger.info("starting app ...") print("\n==========<COMPUTER POINTER CONTROLLER>==========") print("============>(c) Ibrahim Ishaka 2020<============\n") # initialize model object for each class FDModel = FaceDetectionModel(model=args.face_detection_model, device=args.device, extensions=args.extension, threshold=args.prob_threshold) FLDModel = FacialLandmarksDetectionModel(model=args.facial_landmark_model, device=args.device, extensions=args.extension) HPEModel = HeadPoseEstimationModel(model=args.head_pose_model, device=args.device, extensions=args.extension) GEModel = GazeEstimationModel(model=args.gaze_estimation_model, device=args.device, extensions=args.extension) models = {'fd': FDModel, 'fl': FLDModel, 'hp': HPEModel, 'ge': GEModel} models_loading_time = 0 for k in models: # load model logger.info("Loading {} Model".format(models[k].model_name)) model_loading_start = time.time() models[k].load_model() model_loading_finish = (time.time() - model_loading_start) models_loading_time = models_loading_time + model_loading_finish logger.info("time taken to load Model: {:.3f}secs".format( model_loading_finish)) # check if model output visualization is specified in sh arg if k in args.show_output or args.show_output == 'all': models[k].show = True logger.info("show {} outputs: {} \n".format(models[k].model_name, models[k].show)) logger.info("time taken to load All Models: {:.3f}secs\n".format( models_loading_time)) # setting for mouse controller _precision = "medium" _speed = "fast" mouse_controller = MouseController(precision=_precision, speed=_speed) # verify and handle input stream input_source = args.input input_feeder = None input_type = "" if input_source.lower() != "cam": # check if input file exist if os.path.exists(input_source) and os.path.isfile(input_source): image_formats = [".png", ".jpg", ".bmp", ".jpeg"] is_image = [ True for x in image_formats if input_source.endswith(x) ] if is_image: input_type = "image" else: input_type = "video" input_feeder = InputFeeder(input_type=input_type, input_file=input_source) else: logger.error("Input file is not a file, or does't exist") sys.exit(1) elif input_source.lower() == "cam": input_type = "cam" input_feeder = InputFeeder(input_type=input_type) input_feeder.load_data() frame_count = 0 total_inference_time_all = 0 window_closed = False for flag, frame in input_feeder.next_batch(): if flag is False: # no frame to read break frame_count = frame_count + 1 key_pressed = cv2.waitKey(60) if input_source == 'cam': # preprocess frame as webcam is backwards/inverted frame = cv2.flip(frame, 1) face_detection_result = FDModel.predict(frame) # The prediction result should return None, if no face detected if face_detection_result is None: if not window_closed: cv2.imshow(input_type, cv2.resize(frame, (500, 500))) logger.info("NO FACE DETECTED... 
skipping") continue cropped_face = face_detection_result[0] face_coords = face_detection_result[1] hp_result = HPEModel.predict(cropped_face) left_eye, right_eye = FLDModel.predict(cropped_face) new_mouse_coords, gaze_vector = GEModel.predict( left_eye, right_eye, hp_result) total_inference_time = 0 for key in models: total_inference_time = total_inference_time + models[ key].inference_time total_inference_time_all = total_inference_time_all + total_inference_time #uncomment the following line to see the inference time for each frame #logger.info("Inference Time : {:.3f}".format(total_inference_time)) try: x, y = new_mouse_coords except: logger.error( "unable to get mouse coordinates for current frame\nReading Next Frame..." ) continue if GEModel.show == True: GEModel.show_gaze(left_eye, right_eye, gaze_vector) if HPEModel.show == True: frame = HPEModel.show_hp(frame, hp_result) if new_mouse_coords is None: # Error during LR_eyes processing continue ''' wait on before moving mouse again this is recomended to avoid failsafe exception but you change this setting ''' if input_type == "image": cv2.imshow(input_type, cv2.resize(frame, (500, 500))) mouse_controller.move(x, y) break if frame_count % 5 == 0: try: logger.info("changing mouse position... moving") mouse_controller.move(x, y) except pyautogui.FailSafeException: logger.error("safe exception From pyautogui") continue if not window_closed: cv2.imshow(input_type, cv2.resize(frame, (500, 500))) # Break if escape key pressed if key_pressed == 27: break # close the OpenCV window if q key pressed if key_pressed == ord('q'): window_closed = True cv2.destroyWindow(input_type) logger.info(input_type + " window closed... to exit app, press CTRL+Z") if frame_count != 0: # Release the capture and destroy any OpenCV window input_feeder.close() cv2.destroyAllWindows() logger.info("Stream ended !") fps = round(frame_count / total_inference_time_all, 2) print("\n==========SUMMARY===========") print("models loading time : ", round(models_loading_time, 2)) print("frames per seconds : ", fps) print("total inference time : ", round(total_inference_time_all, 2)) print("============================") else: logger.error("Unable to handle Unsupported file ") sys.exit(1)
def infer_on_stream(args): """ Initialize the inference network, stream video to network, and output stats and video. :param args: Command line arguments parsed by `build_argparser()` :return: None """ # --- INPUT --- # Initialize the input_type input_type = None # Check if the input is a webcam if args.input == 'CAM': input_type = 'cam' # Check if the input is an image elif args.input.endswith(('.jpg', '.bmp', '.png')): input_type = 'image' # Check if the input is a video elif args.input.endswith(('.mp4', '.avi')): input_type = 'video' else: sys.exit( f"[ ERRO ] The format of the input file '{args.input.endswith}' is not supported." ) # Initialize the InputFeeder input_feeder = InputFeeder(input_type, args.input) input_feeder.load_data() # --- MODELS --- # Load the Face Detection Model face_detection_model = FaceDetectionModel( model_xml_path=args.model_face_detection, device=args.device, extensions_path=args.cpu_extension, ) face_detection_model.load_model() # Load the Head Pose Estimation Model head_pose_estimation_model = HeadPoseEstimationModel( model_xml_path=args.model_head_pose, device=args.device, extensions_path=args.cpu_extension, ) head_pose_estimation_model.load_model() # Load the Facial Landmarks Detection Model facial_landmarks_detection_model = FacialLandmarksDetectionModel( model_xml_path=args.model_face_landmark, device=args.device, extensions_path=args.cpu_extension, ) facial_landmarks_detection_model.load_model() # Load the Gaze Estimation Model gaze_estimation_model = GazeEstimationModel( model_xml_path=args.model_gaze_estimation, device=args.device, extensions_path=args.cpu_extension, ) gaze_estimation_model.load_model() # --- POINTER CONTROLLER --- pointer_controller = MouseController( precision='medium', speed='medium', ) # --- WINDOW --- # Set the window to fullscreen # cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) # cv2.setWindowProperty(WINDOW_NAME, cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN) # Initialize list to track the inference time list_inference_time = [] #Loop until stream is over for frame in input_feeder.next_batch(): # If there is no frame break the loop if frame is None: break # start the timer start_time = time.time() # Initialize the frame to be displayed display_frame = frame # --- DETECT HEAD --- # Detect the head on the frame list_heads = face_detection_model.predict(frame) # Draw the outputs of the head detection algorithm if args.display_outputs: display_frame = face_detection_model.display_output( frame, list_heads) # --- HEAD POSE ESTIMATION --- # Extract the roi of the head with the highest confidence score head = list_heads[0] head_x_max = head.x + head.w head_y_max = head.y + head.h head_roi = frame[head.y:head_y_max, head.x:head_x_max, :] # Estimate the pose of the best head head_angles = head_pose_estimation_model.predict(head_roi) # Draw the pose of the best head if args.display_outputs: display_head_pose = head_pose_estimation_model.display_output( head_roi, head_angles) display_frame[head.y:head_y_max, head.x:head_x_max, :] = display_head_pose # --- FACIAL LANDMARKS DETECTION --- # Detect the facial landmarks on the head with the highest confidence score face_landmarks = facial_landmarks_detection_model.predict(head_roi) # Draw the facial landmarks of the best head if args.display_outputs: # Set display_name to True to display the name of the landmarks display_facial_landmarks = facial_landmarks_detection_model.display_output( display_head_pose, face_landmarks, display_name=True) display_frame[head.y:head_y_max, 
head.x:head_x_max, :] = display_facial_landmarks # --- GAZE ESTIMATION --- # Calculate the eye ROI size eye_roi_size = int(head_roi.shape[1] / 3) # Extract the roi of the left eyes left_eye_roi, left_eye_bbox = extract_landmark_roi( name='left_eye', landmarks=face_landmarks, roi_size=eye_roi_size, image=frame, origin_x=head.x, origin_y=head.y, ) # Extract the roi of the Rigth eyes right_eye_roi, right_eye_bbox = extract_landmark_roi( name='right_eye', landmarks=face_landmarks, roi_size=eye_roi_size, image=frame, origin_x=head.x, origin_y=head.y, ) # Predict the gaze gaze_vector = gaze_estimation_model.predict( left_eye_image=left_eye_roi, right_eye_image=right_eye_roi, head_angles=head_angles, ) # normalize the gaze vector based on the left eye left_eye_x_center = left_eye_bbox.x + int(left_eye_bbox.w / 2) left_eye_y_center = left_eye_bbox.y + int(left_eye_bbox.h / 2) start_vector = np.array([left_eye_x_center, left_eye_y_center, 0]) end_vector = np.array([ left_eye_x_center + gaze_vector.x, left_eye_y_center - gaze_vector.y, 0 + gaze_vector.z ]) vector = end_vector - start_vector norm_gaze_vector = vector / np.sqrt(np.dot(vector, vector)) # Draw the gaze output and the eyes ROI if args.display_outputs: # draw the bbox around each eyes display_frame = face_detection_model.display_output( display_frame, [left_eye_bbox, right_eye_bbox], color=(255, 255, 255), display_conf=False, ) # draw the gaze from both eyes display_frame = gaze_estimation_model.display_output( display_frame, norm_gaze_vector, [left_eye_bbox, right_eye_bbox], ) # Update position of the Computer Pointer if not args.disable_pointer_controller: pointer_controller.move(gaze_vector.x, gaze_vector.y) # Calculate the inference time stop_time = time.time() list_inference_time.append(stop_time - start_time) # Calculate and print the FPS fps = round(1 / (stop_time - start_time), 2) cv2.rectangle(display_frame, (10, 2), (120, 20), (255, 255, 255), -1) cv2.putText(display_frame, f"{fps} FPS", (15, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0)) # Display the frame cv2.imshow(WINDOW_NAME, display_frame) # Wait for 'ESC' or 'q' to exit the program keyboard = cv2.waitKey(30) if keyboard == 'q' or keyboard == 27: break # Release the input feeder input_feeder.close() # Destroy any OpenCV windows cv2.destroyAllWindows() # Display the average inference time and fps average_fps = round(1 / (mean(list_inference_time)), 2) print( f"[ INFO ] Average inference time was {mean(list_inference_time)}s ({average_fps} FPS)." ) print(f"[ INFO ] Successfully exited the program.")
def main(): # Grab command line args args = build_argparser().parse_args() inputFilePath = args.input inputFeeder = None if args.input == "CAM": inputFeeder = InputFeeder("cam") else: if not os.path.isfile(args.input): log.info("Unable to find specified video file") sys.exit(1) inputFeeder = InputFeeder("video", args.input) modelPathDict = { 'FaceDetectionModel': args.face_detection_model, 'FacialLandmarksDetectionModel': args.facial_landmark_model, 'GazeEstimationModel': args.gaze_estimation_model, 'HeadPoseEstimationModel': args.head_pose_model } for fileNameKey in modelPathDict.keys(): if not os.path.isfile(modelPathDict[fileNameKey]): log.info("Unable to find specified " + fileNameKey + " xml file") sys.exit(1) fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device, args.cpu_extension) fldm = FacialLandmarksDetectionModel( modelPathDict['FacialLandmarksDetectionModel'], args.device, args.cpu_extension) gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'], args.device, args.cpu_extension) hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'], args.device, args.cpu_extension) mc = MouseController('medium', 'fast') inputFeeder.load_data() fdm.load_model() fldm.load_model() hpem.load_model() gem.load_model() frame_count = 0 for ret, frame in inputFeeder.next_batch(): if not ret: break frame_count += 1 if frame_count % 5 == 0: cv2.imshow('video', cv2.resize(frame, (500, 500))) key = cv2.waitKey(60) croppedFace, face_coords = fdm.predict(frame.copy(), args.prob_threshold) if type(croppedFace) == int: log.info("Unable to detect the face.") if key == 27: break continue hp_out = hpem.predict(croppedFace.copy()) left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy()) new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out) ''' if (not len(previewFlags)==0): preview_frame = frame.copy() if 'fd' in previewFlags: #cv2.rectangle(preview_frame, (face_coords[0], face_coords[1]), (face_coords[2], face_coords[3]), (255,0,0), 3) preview_frame = croppedFace if 'fld' in previewFlags: cv2.rectangle(croppedFace, (eye_coords[0][0]-10, eye_coords[0][1]-10), (eye_coords[0][2]+10, eye_coords[0][3]+10), (0,255,0), 3) cv2.rectangle(croppedFace, (eye_coords[1][0]-10, eye_coords[1][1]-10), (eye_coords[1][2]+10, eye_coords[1][3]+10), (0,255,0), 3) #preview_frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = croppedFace if 'hp' in previewFlags: cv2.putText(preview_frame, "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".format(hp_out[0],hp_out[1],hp_out[2]), (10, 20), cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1) if 'ge' in previewFlags: x, y, w = int(gaze_vector[0]*12), int(gaze_vector[1]*12), 160 le =cv2.line(left_eye.copy(), (x-w, y-w), (x+w, y+w), (255,0,255), 2) cv2.line(le, (x-w, y+w), (x+w, y-w), (255,0,255), 2) re = cv2.line(right_eye.copy(), (x-w, y-w), (x+w, y+w), (255,0,255), 2) cv2.line(re, (x-w, y+w), (x+w, y-w), (255,0,255), 2) croppedFace[eye_coords[0][1]:eye_coords[0][3],eye_coords[0][0]:eye_coords[0][2]] = le croppedFace[eye_coords[1][1]:eye_coords[1][3],eye_coords[1][0]:eye_coords[1][2]] = re #preview_frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = croppedFace cv2.imshow("visualization",cv2.resize(preview_frame,(500,500))) ''' if frame_count % 5 == 0: mc.move(new_mouse_coord[0], new_mouse_coord[1]) if key == 27: break log.info("VideoStream ended...") cv2.destroyAllWindows() inputFeeder.close()
def main(args):
    # models
    face_detection_model = args.face_detection_model
    head_pose_estimation_model = args.head_pose_estimation_model
    facial_landmarks_detection_model = args.facial_landmarks_detection_model
    gaze_estimation_model = args.gaze_estimation_model

    # toggles
    toggle_face_detect = int(args.toggle_face_detect)
    toggle_eye_detection = int(args.toggle_eye_detection)
    toggle_head_pose_euler_angles = int(args.toggle_head_pose_euler_angles)
    toggle_gaze_estimation_direction_lines = int(
        args.toggle_gaze_estimation_direction_lines)

    device = args.device
    video_file = args.video
    threshold = args.threshold
    output_path = args.output_path

    # model load times
    fd_start_model_load_time = time.time()
    fd = FaceDetectionModel(face_detection_model, device, threshold)
    fd.load_model()
    fd_total_model_load_time = time.time() - fd_start_model_load_time

    fld_start_model_load_time = time.time()
    fld = FacialLandmarksDetectionModel(facial_landmarks_detection_model, device)
    fld.load_model()
    fld_total_model_load_time = time.time() - fld_start_model_load_time

    hpe_start_model_load_time = time.time()
    hpe = HeadPoseEstimationModel(head_pose_estimation_model, device)
    hpe.load_model()
    hpe_total_model_load_time = time.time() - hpe_start_model_load_time

    ge_start_model_load_time = time.time()
    ge = GazeEstimationModel(gaze_estimation_model, device)
    ge.load_model()
    ge_total_model_load_time = time.time() - ge_start_model_load_time

    # mouse controller
    mouse_controller = MouseController('medium', 'fast')

    # Handle the input stream
    # see https://github.com/anvillasoto/people-counter-edge-application/blob/master/main.py
    if video_file == 'CAM':
        input_stream = 0
        single_image_mode = False
    # Checks for input image
    elif video_file.endswith('.jpg') or video_file.endswith('.bmp'):
        single_image_mode = True
        input_stream = video_file
    # Checks for input video, which must exist on disk
    elif (not video_file.endswith('.jpg')) or (not video_file.endswith('.bmp')):
        single_image_mode = False
        input_stream = video_file
        assert os.path.isfile(video_file), "Input file does not exist"
    else:
        input_stream = video_file
        log.error("The file is unsupported. Please pass a supported file")

    try:
        cap = cv2.VideoCapture(input_stream)
    except Exception as e:
        log.error(f"Something else went wrong with the video file: {e}")

    initial_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    initial_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video_len = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    out_video = cv2.VideoWriter(os.path.join(output_path, 'output.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'), fps,
                                (initial_w, initial_h), True)

    counter = 0
    start_inference_time = time.time()
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            counter += 1

            # detect face
            face_location, image = fd.predict(frame, toggle_face_detect)
            xmin, ymin, xmax, ymax = face_location[0]
            face_image = image[ymin:ymax, xmin:xmax].copy()

            # detect eyes
            eye_locations, eye_images, face_image_drawn = fld.predict(
                face_image, toggle_eye_detection)

            # detect head pose
            head_pose_angles, face_image_drawn = hpe.predict(
                face_image, face_image_drawn, toggle_head_pose_euler_angles)

            # gaze estimation
            gaze_vector, face_image_drawn = ge.predict(
                face_image_drawn, eye_images, head_pose_angles, eye_locations,
                toggle_gaze_estimation_direction_lines)

            # replace face with face image drawn (depending on toggle)
            image[ymin:ymax, xmin:xmax] = face_image_drawn

            x, y, z = gaze_vector
            if toggle_gaze_estimation_direction_lines == 1:
                # frame message to add gaze vector x, y, and z
                frame_message = "Gaze Coordinates: {:.2f}, {:.2f}, {:.2f}".format(
                    x, y, z)
                image = cv2.putText(image, frame_message, (20, 20),
                                    cv2.FONT_HERSHEY_COMPLEX, 1,
                                    COLOR_WHITE_BGR, 2)

            out_video.write(image)

            # move mouse after five frames
            if counter % 5 == 0:
                mouse_controller.move(x, y)

        total_time = time.time() - start_inference_time
        total_inference_time = round(total_time, 1)
        fps = counter / total_inference_time

        with open(os.path.join(output_path, 'stats.txt'), 'w') as f:
            f.write("Total Inference Time for Four Models: " +
                    str(total_inference_time) + '\n')
            f.write("Frames Per Second for Four Models: " + str(fps) + '\n\n')
            f.write("Model Load Time (Face Detection): " +
                    str(fd_total_model_load_time) + '\n')
            f.write("Model Load Time (Facial Landmark Detection): " +
                    str(fld_total_model_load_time) + '\n')
            f.write("Model Load Time (Head Pose Estimation): " +
                    str(hpe_total_model_load_time) + '\n')
            f.write("Model Load Time (Gaze Estimation): " +
                    str(ge_total_model_load_time) + '\n')

        cap.release()
        cv2.destroyAllWindows()
    except Exception as e:
        print("Could not run Inference: ", e)
def main(): """ Initialise the inference network, stream video to network and output stats and video :param args: Command line arguments parsed by build_argsparser() :return: None """ # mouse movement ("low", "medium", "fast") global POSE_CHECKED mouse_movement = MouseController("low", "fast") logging.basicConfig(format="[ %(levelname)s ] %(message)s", level=logging.INFO, stream=sys.stdout) args = args_parser().parse_args() logging_message = logging.getLogger() if args.input == 'cam': input_feed = 0 else: input_feed = args.input assert os.path.isfile( args.input ), "Missing files or Specified input file doesn't exist or entered correctly" # Ref: source code: https://stackoverflow.com/questions/33834708/cant-write-video-by-opencv-in-python/33836463 # Ref: source code: https://knowledge.udacity.com/questions/275173 cap = cv2.VideoCapture(input_feed) width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) duration = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) fps = int(cap.get(cv2.CAP_PROP_FPS)) vout = cv2.VideoWriter(os.path.join(args.out_dir, "vout.mp4"), cv2.VideoWriter_fourcc(*"MP4V"), fps, (width, height), True) if args.save_output == 'yes': vout_fd = cv2.VideoWriter(os.path.join(args.out_dir, "vout_fd.mp4"), cv2.VideoWriter_fourcc(*"MP4V"), fps, (width, height), True) vout_fl = cv2.VideoWriter(os.path.join(args.out_dir, "vout_fl.mp4"), cv2.VideoWriter_fourcc(*"MP4V"), fps, (width, height), True) vout_hp = cv2.VideoWriter(os.path.join(args.out_dir, "vout_hp.mp4"), cv2.VideoWriter_fourcc(*"MP4V"), fps, (width, height), True) vout_ge = cv2.VideoWriter(os.path.join(args.out_dir, "vout_g.mp4"), cv2.VideoWriter_fourcc(*"MP4V"), fps, (width, height), True) box_count = 0 working = 1 infer_time_start = time.time() if input_feed: cap.open(args.input) # Adjust delays to match the number of Frame Per Seconds in the video file if not cap.isOpened(): logging_message.error("ERROR MESSAGE! 
Corrupt video file") return if args.mode == 'sync': async_mode = False else: async_mode = True # Initialising the class variables # ref: https://github.com/gauravshelangia/computer-pointer-controller/blob/master/src/main.py if args.cpu_extension: fd_model = FaceDetectionModel(args.fdmodel, args.threshold, extensions=args.cpu_extension, async_mode=async_mode) hp_model = HeadPoseEstimationModel(args.hpmodel, args.threshold, extensions=args.cpu_extension, async_mode=async_mode) fl_model = FaceLandmarksDetectionModel(args.flmodel, args.threshold, extensions=args.cpu_extension, async_mode=async_mode) ge_model = GazeEstimationModel(args.gemodel, args.threshold, extensions=args.cpu_extension, async_mode=async_mode) else: fd_model = FaceDetectionModel(args.fdmodel, args.threshold, async_mode=async_mode) hp_model = HeadPoseEstimationModel(args.hpmodel, args.threshold, async_mode=async_mode) fl_model = FaceLandmarksDetectionModel(args.flmodel, args.threshold, async_mode=async_mode) ge_model = GazeEstimationModel(args.gemodel, args.threshold, async_mode=async_mode) # Load the model through ## # And infer network logging_message.info( "================ Models loading time ======================") start_time = time.time() fd_model.load_model() logging_message.info("Face Detection Model: {:.1f}ms".format( 1000 * (time.time() - start_time))) start_time = time.time() hp_model.load_model() logging_message.info("Headpose Estimation Model: {:.1f}ms".format( 1000 * (time.time() - start_time))) start_time = time.time() fl_model.load_model() logging_message.info("Facial Landmarks Detection Model: {:.1f}ms".format( 1000 * (time.time() - start_time))) start_time = time.time() ge_model.load_model() logging_message.info("Gaze Estimation Model: {:.1f}ms".format( 1000 * (time.time() - start_time))) logging_message.info( "========================== End ============================") model_load_time = time.time() - infer_time_start logging.info("All models are loaded successfully") while cap.isOpened(): flag, img_frame = cap.read() if not flag: print("checkpoint *UNRECORDED") break box_count += 1 gazing = 0 POSE_CHECKED = False if img_frame is None: logging.error("checkpoint ERROR! EMPTY FRAME") break width = int(cap.get(3)) height = int(cap.get(4)) # Asynchronous Request inf_start_fd = time.time() # Display the results of the output layer of the model network # ref source code: https://knowledge.udacity.com/questions/285095 values, img_frame = fd_model.predict(img_frame) if args.save_output == 'yes': vout_fd.write(img_frame) fd_dur_time = time.time() - inf_start_fd if len(values) > 0: [xmin, ymin, xmax, ymax] = values[0] head_is_moving = img_frame[ymin:ymax, xmin:xmax] inf_start_hp = time.time() person_in_frame, target_gaze = hp_model.predict(head_is_moving) if args.save_output == 'yes': p = "Target Gaze {}, Person in Frame? 
{}".format( target_gaze, person_in_frame) cv2.putText(frame, p, (50, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5, (0, 0, 255), 2) vout_hp.write(img_frame) if person_in_frame: hp_dur_time = time.time() - inf_start_hp POSE_CHECKED = True inf_start_fl = time.time() values, marking = fl_model.predict(head_is_moving) img_frame[ymin:ymax, xmin:xmax] = marking if args.save_output == "yes": vout_fl.write(img_frame) fl_dur_time = time.time() - inf_start_fl [[xlmin, ylmin, xlmax, ylmax], [xrmin, yrmin, xrmax, yrmax]] = values l_eye_img = marking[ylmin:ylmax, xlmin:xlmax] r_eye_img = marking[yrmin:yrmax, xrmin:xrmax] output, gaze_vector = ge_model.predict(l_eye_img, r_eye_img, target_gaze) #ref: source code: https://knowledge.udacity.com/questions/264973 if args.save_output == 'yes': p = "Gaze Vector {}".format(gaze_vector) cv2.putText(frame, p, (50, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5, (255, 0, 0), 1) left_frame = draw_gaze(l_eye_img, gaze_vector) right_frame = draw_gaze(r_eye_img, gaze_vector) marking[ylmin:ylmax, xlmin:xlmax] = left_frame marking[yrmin:yrmax, xrmin:xrmax] = right_frame # cv2.arrowedLine(f, (xlmin, ylmin), (xrmin, yrmin), (0,0,255), 5) vout_ge.write(img_frame) if box_count % 10 == 0: mouse_movement.move(output[0], output[1]) # Drawing and documenting performance stat # ref: https://github.com/gauravshelangia/computer-pointer-controller/blob/master/src/main.py # ref source code: https://knowledge.udacity.com/questions/257795 inf_time_message = "Face Detection Inference time: {:.3f} ms.".format( fd_dur_time * 1000) # if POSE_CHECKED: cv2.putText( frame, "Head Pose Estimation Inference time: {:.3f} ms.".format( hp_dur_time * 1000), (0, 35), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2) cv2.putText(img_frame, inf_time_message, (0, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5, (0, 0, 255), 2) vout.write(img_frame) if box_count % 10 == 0: print("Inference time = ", int(time.time() - infer_time_start)) print('Box count {} and duration {}'.format(box_count, duration)) if args.out_dir: final_infer_time = time.time() - infer_time_start with open(os.path.join(args.out_dir, 'stats.txt'), 'w') as marking: marking.write(str(round(final_infer_time, 1)) + '\n') marking.write(str(box_count) + '\n') if args.out_dir: with open(os.path.join(args.out_dir, 'stats.txt'), 'a') as marking: marking.write(str(round(model_load_time)) + '\n') # Clean all models fd_model.clean() hp_model.clean() fl_model.clean() ge_model.clean() # release cv2 cap cap.release() cv2.destroyAllWindows() # release all resulting ouputs writer vout.release() if args.save_output == 'yes': vout_fd.release() vout_hp.release() vout_fl.release() vout_ge.release()
def main(): # create log file log.basicConfig(filename='logs/cpc.log', level=log.INFO, format='%(asctime)s %(message)s') # Parse the argument args = parse_arguments().parse_args() print('Input arguments:') for key, value in vars(args).items(): print('\t{}: {}'.format(key, value)) print('') # list used to handle model load and inference time m_fd_load_time = [] m_hpe_load_time = [] m_fd_infer_time = [] m_hpe_infer_time = [] m_fld_infer_time = [] m_ge_infer_time = [] m_fld_load_time = [] m_ge_load_time = [] if args.input == 'CAM': input_feeder = InputFeeder("cam") else: # get the input value input_stream = args.input if not os.path.isfile(input_stream): log.error("Provided input video file doesn't exist/video path is wrong!") exit(1) # load the video file input_feeder = InputFeeder("video", input_stream) # get model path head_face_detection_model_name = args.face_detection head_pose_estimation_model_name = args.head_pose_estimation face_landmarks_detection_model_name = args.facial_landmarks_detection gaze_estimation_model_name = args.gaze_estimation # mouse controller mouse_controller = MouseController(precision='medium', speed='fast') # load the required models m_fd_load_start_time = time.time() # create and load face_detection model face_detection = FaceDetectionModel(model_name=head_face_detection_model_name, device=args.device, probs_threshold=args.prob_threshold) face_detection.load_model() m_fd_load_time.append(round(time.time() - m_fd_load_start_time, 5)) log.debug("Time taken to load Face detection model took {} seconds.".format(m_fd_load_time)) # create and load head_pose estimation model m_hpe_load_start_time = time.time() head_pose_estimation = HeadPoseEstimationModel(model_name=head_pose_estimation_model_name, device=args.device) head_pose_estimation.load_model() m_hpe_load_time.append(round(time.time() - m_hpe_load_start_time, 5)) log.debug("Time taken to load head pose estimation model took {} seconds.".format(m_hpe_load_time)) # create and load face landmarks detection model m_fld_load_start_time = time.time() face_landmark_detection = FacialLandmarksDetectionModel(model_name=face_landmarks_detection_model_name, device=args.device) face_landmark_detection.load_model() m_fld_load_time.append(round(time.time() - m_fld_load_start_time, 5)) log.debug("Time taken to load face landmark detection model took {} seconds.".format(m_fld_load_time)) # create and load face landmarks detection model m_ge_load_start_time = time.time() gaze_estimation = GazeEstimationModel(model_name=gaze_estimation_model_name, device=args.device) gaze_estimation.load_model() m_ge_load_time.append(round(time.time() - m_ge_load_start_time, 5)) log.debug("Time taken to load gaze estimation model took {} seconds.".format(m_ge_load_time)) # load the image data input_feeder.load_data() frame_count = 0 threshold_frame = 5 log.info("Video stream to perform gaze estimation is started!.") for flag, frame in input_feeder.next_batch(): if not flag: break # to handle better control with frame processing if frame_count % threshold_frame == 0: key_pressed = cv2.waitKey(60) if cv2.waitKey(1) & 0xFF == ord('q'): break if key_pressed == 27: break # invoke face detection prediction m_fd_infer_start_time = time.time() detected_face_image, detected_box = face_detection.predict_face_detection(frame, args.visualization_fd) m_fd_infer_end_time = time.time() m_fd_infer_time.append(m_fd_infer_end_time - m_fd_infer_start_time) # invoke head pose estimation prediction head_pose_estimation_output, frame = 
head_pose_estimation.predict_head_pose_estimation(frame, detected_face_image, args.visualization_hpe) m_hpe_infer_end_time = time.time() m_hpe_infer_time.append(m_hpe_infer_end_time - m_fd_infer_end_time) # invoke face landmark detection prediction left_eye_image, right_eye_image, = face_landmark_detection.predict_facial_landmarks_detection( detected_face_image, args.visualization_fld) m_fld_infer_end_time = time.time() m_fld_infer_time.append(m_fld_infer_end_time - m_hpe_infer_end_time) # invoke gaze estimation prediction mouse_coordinate, predicted_gaze_output = gaze_estimation.predict_gaze_estimation(left_eye_image, right_eye_image, head_pose_estimation_output) m_ge_infer_end_time = time.time() m_ge_infer_time.append(m_ge_infer_end_time - m_fld_infer_end_time) if args.visualization_ge: # get the output from face landmark detection outputs = face_landmark_detection.get_outputs() # get back the bounding box height = detected_face_image.shape[0] width = detected_face_image.shape[1] left_eye_x = int(outputs[0] * width + detected_box[0]) left_eye_y = int(outputs[1] * height + detected_box[1]) right_eye_x = int(outputs[2] * width + detected_box[0]) right_eye_y = int(outputs[3] * height + detected_box[1]) eye_bounding_box = [left_eye_x, left_eye_y, right_eye_x, right_eye_y] gaze_estimation.draw_gaze_estimation(eye_bounding_box, predicted_gaze_output, frame) # show the results cv2.imshow('ComputerPointer', frame) mouse_controller.move(mouse_coordinate[0], mouse_coordinate[1]) frame_count = frame_count + 1 log.info("Completed gaze estimation for the provided video!.") log.info("Mean time taken to run Face detection inference took {} seconds.".format(statistics.mean(m_fd_infer_time))) log.info( "Mean time taken to run Head pose estimation inference took {} seconds.".format(statistics.mean(m_hpe_infer_time))) log.info("Mean time taken to run Face Landmark detection inference took {} seconds.".format( statistics.mean(m_fld_infer_time))) log.info("Mean time taken to run Gaze estimation inference took {} seconds.".format(statistics.mean(m_ge_infer_time))) # to perform model inference analysis # analyze_model_inference_time(m_fd_infer_time, m_hpe_infer_time, m_fld_infer_time, m_ge_infer_time, "FP32") # clean up resources input_feeder.close() cv2.destroyAllWindows()
def main(): args = build_argparser().parse_args() previewFlags = args.previewFlags logger = logging.getLogger() inputFilePath = args.input inputFeeder = None if inputFilePath.lower() == "cam": inputFeeder = InputFeeder("cam") else: if not os.path.isfile(inputFilePath): logger.error("Unable to find specified video file") exit(1) inputFeeder = InputFeeder("video", inputFilePath) modelPathDict = { 'FaceDetectionModel': args.facedetectionmodel, 'FacialLandmarksDetectionModel': args.faciallandmarkmodel, 'GazeEstimationModel': args.gazeestimationmodel, 'HeadPoseEstimationModel': args.headposemodel } for fileNameKey in modelPathDict.keys(): if not os.path.isfile(modelPathDict[fileNameKey]): logger.error("Unable to find specified " + fileNameKey + " xml file") exit(1) fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device, args.cpu_extension) fldm = FacialLandmarksDetectionModel( modelPathDict['FacialLandmarksDetectionModel'], args.device, args.cpu_extension) gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'], args.device, args.cpu_extension) hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'], args.device, args.cpu_extension) mc = MouseController('medium', 'fast') inputFeeder.load_data() start = time.time() fdm.load_model() fdmload_time = time.time() - start load_time_message = "Loading time for Face Detection Model: {:.3f}ms".format( fdmload_time * 1000) print(load_time_message) start = time.time() fldm.load_model() fldmload_time = time.time() - start load_time_message = "Loading time for Facial Landmark Model: {:.3f}ms".format( fldmload_time * 1000) print(load_time_message) start = time.time() hpem.load_model() hpemload_time = time.time() - start load_time_message = "Loading time for Head Pose Estimation Model: {:.3f}ms".format( hpemload_time * 1000) print(load_time_message) start = time.time() gem.load_model() gemload_time = time.time() - start load_time_message = "Loading time for Gaze Estimation Model: {:.3f}ms".format( gemload_time * 1000) print(load_time_message) total_load_time = gemload_time + fdmload_time + hpemload_time + fldmload_time load_time_message = "Loading time for all the Models: {:.3f}ms".format( total_load_time * 1000) print(load_time_message) frame_count = 0 inf_start = time.time() for ret, frame in inputFeeder.next_batch(): if not ret: break frame_count += 1 if frame_count % 5 == 0: cv2.imshow('video', cv2.resize(frame, (500, 500))) key = cv2.waitKey(60) croppedFace, face_coords = fdm.predict(frame.copy(), args.prob_threshold) if type(croppedFace) == int: logger.error("Unable to detect the face.") if key == 27: break continue hp_out = hpem.predict(croppedFace.copy()) left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy()) total_inf_time = time.time() - inf_start inf_time_message = "Total Inference Time: {:.3f}s".format( total_inf_time) new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out) if (not len(previewFlags) == 0): preview_frame = frame.copy() if 'fd' in previewFlags: #cv2.rectangle(preview_frame, (face_coords[0], face_coords[1]), (face_coords[2], face_coords[3]), (255,0,0), 3) preview_frame = croppedFace if 'fld' in previewFlags: cv2.rectangle(croppedFace, (eye_coords[0][0] - 10, eye_coords[0][1] - 10), (eye_coords[0][2] + 10, eye_coords[0][3] + 10), (0, 255, 0), 3) cv2.rectangle(croppedFace, (eye_coords[1][0] - 10, eye_coords[1][1] - 10), (eye_coords[1][2] + 10, eye_coords[1][3] + 10), (0, 255, 0), 3) #preview_frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = 
croppedFace if 'hp' in previewFlags: cv2.putText( preview_frame, "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}". format(hp_out[0], hp_out[1], hp_out[2]), (10, 20), cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1) if 'ge' in previewFlags: x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] * 12), 160 le = cv2.line(left_eye.copy(), (x - w, y - w), (x + w, y + w), (255, 0, 255), 2) cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2) re = cv2.line(right_eye.copy(), (x - w, y - w), (x + w, y + w), (255, 0, 255), 2) cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2) croppedFace[eye_coords[0][1]:eye_coords[0][3], eye_coords[0][0]:eye_coords[0][2]] = le croppedFace[eye_coords[1][1]:eye_coords[1][3], eye_coords[1][0]:eye_coords[1][2]] = re #preview_frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = croppedFace cv2.imshow("visualization", cv2.resize(preview_frame, (500, 500))) if frame_count % 5 == 0: mc.move(new_mouse_coord[0], new_mouse_coord[1]) if key == 27: break logger.error("VideoStream ended...") cv2.destroyAllWindows() inputFeeder.close() print(inf_time_message) fps = frame_count / total_inf_time fps_message = "Total FPS: {:.3f} fps".format(fps) print(fps_message)
def main():
    args = build_argparser().parse_args()
    logger = logging.getLogger()

    # video file
    inputFilePath = args.input
    inputFeeder = None

    # check to see if the user wants to use a video or camera feed
    if inputFilePath.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFilePath):
            logger.error("Unable to find specified video file")
            exit(1)
        inputFeeder = InputFeeder("video", inputFilePath)

    # get model files from the command line
    modelPathDict = {
        'FaceDetectionModel': args.facedetectionmodel,
        'FacialLandmarksDetectionModel': args.faciallandmarkmodel,
        'GazeEstimationModel': args.gazeestimationmodel,
        'HeadPoseEstimationModel': args.headposemodel
    }

    # check if all files are accessible and correct
    for fileNameKey in modelPathDict.keys():
        if not os.path.isfile(modelPathDict[fileNameKey]):
            logger.error("Unable to find specified " + fileNameKey + " xml file")
            exit(1)

    # initializing the 4 models
    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device,
                             args.cpu_extension)
    fldm = FacialLandmarksDetectionModel(
        modelPathDict['FacialLandmarksDetectionModel'], args.device,
        args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'], args.device,
                              args.cpu_extension)
    hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'],
                                   args.device, args.cpu_extension)
    mc = MouseController('medium', 'fast')

    # loading models
    inputFeeder.load_data()
    fdm.load_model()
    fldm.load_model()
    hpem.load_model()
    gem.load_model()

    # starting frame by frame inference
    frame_count = 0
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)

        # getting the cropped face from the face detection model
        croppedFace, face_coords = fdm.predict(frame.copy(), args.prob_threshold)
        if type(croppedFace) == int:
            logger.error("Unable to detect the face.")
            if key == 27:
                break
            continue

        # getting the head pose estimation output
        hp_out = hpem.predict(croppedFace.copy())

        # getting the facial landmark coordinates, using the cropped face from
        # the face detection model as input
        left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy())

        # getting the mouse coordinates by feeding the landmark and head-pose
        # outputs into the gaze estimation model
        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out)

        # moving the mouse to the new position
        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break

    logger.error("Video ended, exiting...")
    cv2.destroyAllWindows()
    inputFeeder.close()
def start_infer(args): face_model = args.facemodel gaze_model = args.gazemodel head_model = args.headmodel landmarks_model = args.landmarksmodel device = args.device extensions = args.cpu_extension input_type = args.inputtype.lower() input_file = args.inputfile threshold = args.threshold benchmark = args.benchmark preview = args.preview feed = None key_pressed = None if input_type == "cam": feed = InputFeeder(input_type="cam") else: if not os.path.isfile(input_file): log.error("cannot find file {}".format(input_file)) exit(1) feed = InputFeeder(input_type="video", input_file=input_file) face_network = FaceDetectionModel(model_name=face_model, device=device, threshold=threshold, extensions=extensions) head_network = HeadPoseModel(model_name=head_model, device=device, threshold=threshold, extensions=extensions) landmarks_network = FacialLandmarksModel(model_name=landmarks_model, device=device, threshold=threshold, extensions=extensions) gaze_network = GazeModel(model_name=gaze_model, device=device, threshold=threshold, extensions=extensions) mouse_control = MouseController("medium", "fast") face_network.load_model() head_network.load_model() landmarks_network.load_model() gaze_network.load_model() feed.load_data() try: for flag, frame in feed.next_batch(): if not flag: break if not benchmark: key_pressed = cv2.waitKey(60) face_output, cropped_face_frame = face_network.predict([frame]) head_output, cropped_face_frame = head_network.predict( [cropped_face_frame]) landmarks_output, cropped_eyes = landmarks_network.predict( [cropped_face_frame]) mouse_coords, gaze_output = gaze_network.predict( [head_output, cropped_eyes[0], cropped_eyes[1]]) # disable preview and mouse control while benchmarking # to make it more accurate if not benchmark: # Input user from preview argument if preview: nframe = draw_output( cropped_face_frame, head_output, landmarks_output, gaze_output, ) cv2.imshow("preview", nframe) # added pyautogui.FAILSAFE = False to mouse controller class # to prevent PyAutoGUI fail-safe messeges when mouse reaches the screen edge # mouse_control.move(mouse_coords[0], mouse_coords[1]) mouse_control.move(mouse_coords[0], mouse_coords[1]) if key_pressed == 27: break except Exception as e: log.error( "error while predicting input source, more details as below:\n{}". format(e)) # save benchmarks values to output directory if benchmark: face_network.print_benchmark() head_network.print_benchmark() landmarks_network.print_benchmark() gaze_network.print_benchmark() cv2.destroyAllWindows() feed.close()
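# The comment inside start_infer() above mentions setting pyautogui.FAILSAFE
# to False inside the MouseController class to avoid fail-safe exceptions when
# the pointer reaches a screen edge. A minimal, hedged sketch of such a
# controller (an illustration only; the real MouseController used by these
# scripts may map precision/speed to different values):
import pyautogui


class SimpleMouseController:
    def __init__(self, precision='medium', speed='fast'):
        # Prevent the fail-safe exception when the pointer hits a screen corner.
        pyautogui.FAILSAFE = False
        self.precision = {'high': 100, 'medium': 500, 'low': 1000}[precision]
        self.speed = {'fast': 0.1, 'medium': 0.5, 'slow': 1.0}[speed]

    def move(self, x, y):
        # x, y are the first two components of the gaze vector; scale them to
        # pixels and move relative to the current pointer position.
        pyautogui.moveRel(x * self.precision, -1 * y * self.precision,
                          duration=self.speed)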
def main():
    # Grab command line args
    args = build_argparser().parse_args()
    logger = logging.getLogger()
    inputFilePath = args.input
    inputFeeder = None
    inference_time = None
    if inputFilePath.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFilePath):
            logger.error("Unable to find specified video file")
            exit(1)
        inputFeeder = InputFeeder("video", inputFilePath)
    # else:
    #     if not os.path.isfile(inputFilePath):
    #         logger.error("Unable to find specified image file")
    #         exit(1)
    #     inputFeeder = InputFeeder("image", inputFilePath)

    # Initialize variables with the input arguments
    modelPathDict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'FacialLandmarksDetectionModel': args.FacialLandmarksDetectionModel,
        'GazeEstimationModel': args.gazeEstimationModel,
        'HeadPoseEstimationModel': args.HeadPoseEstimationModel
    }
    for fileNameKey in modelPathDict.keys():
        if not os.path.isfile(modelPathDict[fileNameKey]):
            logger.error("Unable to find specified " + fileNameKey + " xml file")
            exit(1)

    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device,
                             args.cpu_extension)
    flm = FacialLandmarksDetectionModel(
        modelPathDict['FacialLandmarksDetectionModel'], args.device,
        args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'], args.device,
                              args.cpu_extension)
    hpe = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'],
                                  args.device, args.cpu_extension)
    mc = MouseController('high', 'fast')

    inputFeeder.load_data()

    # Load models and generate load times
    start_time = time.time()
    fdm.load_model()
    logger.error("Face detection model loaded: time: {:.3f} ms".format(
        (time.time() - start_time) * 1000))
    first_mark = time.time()
    flm.load_model()
    logger.error("Facial landmarks detection model loaded: time: {:.3f} ms".format(
        (time.time() - first_mark) * 1000))
    second_mark = time.time()
    hpe.load_model()
    logger.error("Head pose estimation model loaded: time: {:.3f} ms".format(
        (time.time() - second_mark) * 1000))
    third_mark = time.time()
    gem.load_model()
    logger.error("Gaze estimation model loaded: time: {:.3f} ms".format(
        (time.time() - third_mark) * 1000))
    load_total_time = time.time() - start_time
    logger.error("Total loading time: time: {:.3f} ms".format(
        load_total_time * 1000))
    logger.error("Required models have been loaded..")

    frame_count = 0
    start_inf_time = time.time()
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (600, 800)))
        key = cv2.waitKey(60)
        croppedFace, face_coords = fdm.predict(frame.copy(), args.prob_threshold)
        if type(croppedFace) == int:
            logger.error("Unable to detect the face.")
            if key == 27:
                break
            continue
        hp_out = hpe.predict(croppedFace.copy())
        left_eye, right_eye, eye_coords = flm.predict(croppedFace.copy())
        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out)
        # Move the mouse only once the gaze estimate for this frame is available
        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])

    inference_time = round(time.time() - start_inf_time, 1)
    total_frames = int(frame_count)
    fps = int(frame_count) / inference_time
    logger.error("count {} seconds".format(frame_count))
    logger.error("total inference time {} seconds".format(inference_time))
    logger.error("total frames {} frames".format(frame_count))
    logger.error("fps {} frame/second".format(fps))
    with open(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'RunReport.txt'), 'w') as R:
        R.write('Load Time: ' + str(load_total_time) + '\n')
        R.write('Inference Time: ' + str(inference_time) + '\n')
        R.write('total frames processed: ' + str(total_frames) + '\n')
        R.write('fps: ' + str(fps) + '\n')
    logger.error("VideoStream ended...")
    cv2.destroyAllWindows()
    inputFeeder.close()
    atexit.register(profile.print_stats)
def main(): # Grabing command line args args = build_argparser().parse_args() # Getting Input File Path inputFilePath = args.input # For Visualization visual_flag = args.visualization_flag # Initialize inputfeeder inputFeeder = None # Handle video file or CAM (like webcam) if args.input =="CAM": inputFeeder = InputFeeder("cam") else: if not os.path.isfile(args.input): log.info("Unable to find specified video file") sys.exit(1) inputFeeder = InputFeeder("video",args.input) # Now define model path dictionary for all 04 intel pre trained models modelPathDict = {'FaceDetectionModel':args.face_detection_model, 'FacialLandmarksDetectionModel':args.facial_landmark_model, 'GazeEstimationModel':args.gaze_estimation_model, 'HeadPoseEstimationModel':args.head_pose_model} # Check model XML file for fileNameKey in modelPathDict.keys(): if not os.path.isfile(modelPathDict[fileNameKey]): log.info("Unable to find specified "+fileNameKey+" xml file") sys.exit(1) # Defining Intel Pre Trained Models Objects fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device, args.cpu_extension) fldm = FacialLandmarksDetectionModel(modelPathDict['FacialLandmarksDetectionModel'], args.device, args.cpu_extension) gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'], args.device, args.cpu_extension) hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'], args.device, args.cpu_extension) # Determining Precision and Speed for mouse controller mc = MouseController('medium','fast') # Loading Input Feeder inputFeeder.load_data() # Loading our four pre trained models and calculate the total models loading time # This will help us to find different model time for different models precison like F32,F16 & INT8 start_time_1= time.time() fdm.load_model() fldm.load_model() hpem.load_model() gem.load_model() total_model_load_time= (time.time()-start_time_1) print("Total Model Load Time for All our Intel Pre Trained Models is (in seconds): {:.3f}".format(total_model_load_time)) # Above print statement will give total model load time for our 04 models for different precisions as well frame_count = 0 start_time = time.time() # Start Loop till break through input feeder for ret, frame in inputFeeder.next_batch(): if not ret: break frame_count+=1 if frame_count%5==0: cv2.imshow('video',cv2.resize(frame,(450,450))) key = cv2.waitKey(60) # Extracting face detection features croppedFace, face_coords = fdm.predict(frame.copy(), args.prob_threshold) if type(croppedFace)==int: log.info("Unable to detect the face.") if key==27: break continue # Head position detection hp_out = hpem.predict(croppedFace.copy()) # Landmarks detection (left_eye, right_eye, eyes coordinates) left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy()) # Mouse coordinates and gaze vector Detection new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out) # Creating variables for visualisation # Extracting four face coordinates for rectangle (xmin,ymin,xmax,ymax) x_minimum= face_coords[0] y_minimum=face_coords[1] x_maximum=face_coords[2] y_maximum=face_coords[3] # Take eye surrounding area eye_surrounding_area=10 # Now extracting few features from eye coordinates # Extracting four coordinates of left eye from eye coordinates l_l1= eye_coords[0][0] l_l2=eye_coords[0][1] l_l3=eye_coords[0][2] l_l4=eye_coords[0][3] # Extracting four coordinates of left eye from eye coordinates r_r1=eye_coords[1][0] r_r2=eye_coords[1][1] r_r3=eye_coords[1][2] r_r4=eye_coords[1][3] # Extracting pose angle, pitch and roll from 
head pose output pose_angle= hp_out[0] pitch=hp_out[1] roll=hp_out[2] # Visualizing face, landmarks, head pose and gaze if (not len(visual_flag)==0): preview_frame = frame.copy() if 'fd' in visual_flag: # Drawing a rectangle with our four face coordiantes (xmin,ymin,xmax,ymax) cv2.rectangle(preview_frame, (x_minimum, y_minimum), (x_maximum, y_maximum), (20,20,150), 3) if 'fld' in visual_flag: # Drawing a rectangle for each eyes with the help of eye coordinates and eye surrounding area # Left Eye cv2.rectangle(preview_frame, (l_l1-eye_surrounding_area, l_l2-eye_surrounding_area), (l_l3+eye_surrounding_area, l_l4+eye_surrounding_area), (60,255,0), 2) # Right Eye cv2.rectangle(preview_frame, (r_r1-eye_surrounding_area, r_r2-eye_surrounding_area), (r_r3+eye_surrounding_area, r_r4+eye_surrounding_area), (60,255,0), 2) if 'hp' in visual_flag: # We have extracted pose angle, pitch and roll from head pose output, now we put text on preview_frame cv2.putText(preview_frame, "Pose Angles:{:.2f} | pitch:{:.2f} | roll:{:.2f}".format(pose_angle, pitch, roll), (20, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 60), 1) if 'ge' in visual_flag: # Calculating coordinates for left eye to obtain left eye center le_x= (l_l1 + l_l3)/2 le_y= (l_l2 + l_l4)/2 # Calculating coordinates for right eye to obtain right eye center re_x= (r_r1 + r_r3)/2 re_y= (r_r2 + r_r4)/2 # Calculating left eye center le_center= int(x_minimum + le_x), int(y_minimum + le_y) # Calculating right eye center re_center= int(x_minimum + re_x), int(y_minimum + re_y) # Now put both eyes center in a list eyes_center = [le_center, re_center ] # Extracting left eye x and y coordinates from eyes_center le_center_x = int(eyes_center[0][0]) le_center_y = int(eyes_center[0][1]) # Extracting right eye x and y coordinates from eyes_center re_center_x = int(eyes_center[1][0]) re_center_y = int(eyes_center[1][1]) # Extracting x and y (first and second) value from gaze_vector g_x, g_y = gaze_vector[0:2] # With the help of above parameters, draw arrowed lines for gaze on left and right eyes cv2.arrowedLine(preview_frame, (le_center_x, le_center_y), (le_center_x + int(g_x * 100), le_center_y + int(-g_y * 100)), (0,50,160), 1) cv2.arrowedLine(preview_frame, (re_center_x, re_center_y), (re_center_x + int(g_x * 100), re_center_y + int(-g_y * 100)), (0,50,160), 1) cv2.imshow("visualization",cv2.resize(preview_frame,(450,450))) if frame_count%5==0: mc.move(new_mouse_coord[0],new_mouse_coord[1]) if key==27: break log.info("VideoStream has been ended") cv2.destroyAllWindows() inputFeeder.close() # Calculating Inference time and frame per seconds total_time = time.time() - start_time total_inference_time=total_time fps=frame_count/total_inference_time print("Inference time: {:.3f}".format(total_inference_time)) print("FPS: {}".format(fps))
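# The eye-centre arithmetic and cv2.arrowedLine calls above are repeated for both eyes;
# a minimal reusable sketch of that visualisation step, assuming (as in the code above)
# that eye_coords holds per-eye (xmin, ymin, xmax, ymax) boxes relative to the face crop
# and face_coords is the face box in full-frame coordinates. Hypothetical helper, not
# part of the original project.
import cv2

def draw_gaze_arrows(frame, face_coords, eye_coords, gaze_vector, length=100):
    """Draw a gaze arrow from the centre of each detected eye."""
    x_min, y_min = face_coords[0], face_coords[1]
    g_x, g_y = gaze_vector[0], gaze_vector[1]
    for (ex1, ey1, ex2, ey2) in eye_coords:
        # eye centre mapped back to full-frame coordinates
        c_x = int(x_min + (ex1 + ex2) / 2)
        c_y = int(y_min + (ey1 + ey2) / 2)
        cv2.arrowedLine(frame, (c_x, c_y),
                        (c_x + int(g_x * length), c_y + int(-g_y * length)),
                        (0, 50, 160), 1)
    return frame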
def main(): args = build_argparser().parse_args() inputFilePath = args.input inputFeeder = None if args.input == "CAM": inputFeeder = InputFeeder("cam") else: if not os.path.isfile(args.input): log.info("Unable to find specified video file") sys.exit(1) inputFeeder = InputFeeder("video", args.input) modelPathDict = { 'FaceDetectionModel': args.face_detection_model, 'FacialLandmarksDetectionModel': args.facial_landmark_model, 'GazeEstimationModel': args.gaze_estimation_model, 'HeadPoseEstimationModel': args.head_pose_model } for fileNameKey in modelPathDict.keys(): if not os.path.isfile(modelPathDict[fileNameKey]): log.info("Unable to find specified " + fileNameKey + " xml file") sys.exit(1) fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device, args.cpu_extension) fldm = FacialLandmarksDetectionModel( modelPathDict['FacialLandmarksDetectionModel'], args.device, args.cpu_extension) gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'], args.device, args.cpu_extension) hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'], args.device, args.cpu_extension) mc = MouseController('medium', 'fast') start_time_1 = time.time() inputFeeder.load_data() fdm.load_model() fldm.load_model() hpem.load_model() gem.load_model() total_model_load_time = (time.time() - start_time_1) print("Model Load Time: {:.3f}".format(total_model_load_time)) frame_count = 0 start_time = time.time() for ret, frame in inputFeeder.next_batch(): if not ret: break frame_count += 1 if frame_count % 5 == 0: cv2.imshow('video', cv2.resize(frame, (450, 450))) key = cv2.waitKey(60) croppedFace, face_coords = fdm.predict(frame.copy(), args.prob_threshold) if type(croppedFace) == int: log.info("Unable to detect the face.") if key == 27: break continue hp_out = hpem.predict(croppedFace.copy()) left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy()) new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out) if frame_count % 5 == 0: mc.move(new_mouse_coord[0], new_mouse_coord[1]) if key == 27: break log.info("VideoStream has ended.") cv2.destroyAllWindows() inputFeeder.close() total_time = time.time() - start_time total_inference_time = total_time fps = frame_count / total_inference_time print("Inference Time: {:.3f}".format(total_inference_time)) print("FPS: {}".format(fps))
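# Every variant of main() in this collection runs the same per-frame ordering: face
# detection first, then head pose and facial landmarks on the face crop, then gaze
# estimation from the two eye crops plus the head-pose angles. A minimal sketch of that
# ordering, assuming the predict() signatures used in the snippet above (hypothetical
# helper, not part of the original code).
def process_frame(frame, fdm, fldm, hpem, gem, prob_threshold):
    """Run the four-model pipeline on one frame; returns (mouse_coord, gaze_vector)
    or None when no face is detected."""
    cropped_face, face_coords = fdm.predict(frame.copy(), prob_threshold)
    if type(cropped_face) == int:
        # the face detector above signals "no face" by returning an int
        return None
    hp_out = hpem.predict(cropped_face.copy())
    left_eye, right_eye, eye_coords = fldm.predict(cropped_face.copy())
    new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out)
    return new_mouse_coord, gaze_vector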
class Inferencer: def __init__(self, device='CPU', mouse_con=False, face_dec=None, fac_land=None, head_pose=None, gaze=None, show_video=False, save_video=False): ''' all models should be put in here ''' if face_dec and fac_land and head_pose and gaze: self.face_dec, self.fac_land, self.head_pose, self.gaze = FaceDetectionModel( face_dec, device=device), FacialLandmarksDetection( fac_land, device=device), Head_Pose_Estimation( head_pose, device=device), Gaze_Estimation(gaze, device=device) self.face_dec.load_model() self.fac_land.load_model() self.head_pose.load_model() self.gaze.load_model() else: raise ValueError('Missing Arguments') if mouse_con: self.mouse_con = MouseController("low", "fast") self.show_video, self.save_video = show_video, save_video def __call__( self, input_type=None, input_file=None, ): self.run(input_type=input_type, input_file=input_file) def run( self, input_type=None, input_file=None, ): if input_type and input_file: self.input_ = InputFeeder(input_type, input_file) self.input_.load_data() if self.save_video: out = cv2.VideoWriter( 'output.mp4', 0x00000021, 30, (int(self.input_.cap.get(3)), int(self.input_.cap.get(4)))) try: fc_dec_inf_time = 0 landmark_inf_time = 0 pose_inf_time = 0 gaze_inf_time = 0 frame_counter = 0 while True: # Read the next frame try: frame = next(self.input_.next_batch()) frame_counter += 1 except StopIteration: break key_pressed = cv2.waitKey(60) # face detection start = time.time() out_frame, boxes = self.face_dec.predict(frame, display_output=True) fc_dec_inf_time += (time.time() - start) #for each box for box in boxes: face = out_frame[box[1]:box[3], box[0]:box[2]] start = time.time() out_frame, left_eye_point, right_eye_point = self.fac_land.predict( out_frame, face, box, display_output=True) landmark_inf_time += (time.time() - start) start = time.time() out_frame, headpose_angels = self.head_pose.predict( out_frame, face, box, display_output=True) pose_inf_time += (time.time() - start) start = time.time() out_frame, gazevector = self.gaze.predict( out_frame, face, box, left_eye_point, right_eye_point, headpose_angels, display_output=True) gaze_inf_time += (time.time() - start) if self.show_video: cv2.imshow('im', out_frame) if self.save_video: out.write(out_frame) if self.mouse_con: self.mouse_con.move(gazevector[0], gazevector[1]) time.sleep(1) #consider only first detected face in the frame break # Break if escape key pressed if key_pressed == 27: break if self.save_video: out.release() self.input_.close() cv2.destroyAllWindows() print( 'average inference time for face detection model is :- {:2f}ms' .format((fc_dec_inf_time / frame_counter) * 1000)) print( 'average inference time for facial landmark model is :- {:2f}ms' .format((landmark_inf_time / frame_counter) * 1000)) print( 'average inference time for head pose estimation model is :- {:2f}ms' .format((pose_inf_time / frame_counter) * 1000)) print( 'average inference time for gaze estimation model is :- {:2f}ms' .format((gaze_inf_time / frame_counter) * 1000)) except Exception as ex: logging.exception("Error in inference: " + str(ex))
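# A hedged usage sketch for the Inferencer wrapper above; the model paths and the demo
# video path are placeholders, not paths taken from the original project.
inferencer = Inferencer(
    device='CPU',
    mouse_con=True,
    face_dec='models/face-detection-adas-binary-0001',
    fac_land='models/landmarks-regression-retail-0009',
    head_pose='models/head-pose-estimation-adas-0001',
    gaze='models/gaze-estimation-adas-0002',
    show_video=True,
    save_video=False)
inferencer(input_type='video', input_file='bin/demo.mp4')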
def main(): """ """ # Grab command line args args = build_argparser().parse_args() input_src = args.input device = args.device extension = args.cpu_extension prob_threshold = args.prob_threshold face_detection_model = args.facedetectionmodel head_pose_model = args.headposemodel landmarks_model = args.facelandmarksnmodel gaze_estimation_model = args.gazeestimationmodel # Create log object set for console output and set log level log_obj = log.getLogger() log_obj.setLevel(LOGLEVEL) console_handler = log.StreamHandler() console_handler.setLevel(LOGLEVEL) log_obj.addHandler(console_handler) # Create detection objects face_detection_obj = FaceDetectionModel(face_detection_model, device, extension) head_pose_obj = HeadPoseModel(head_pose_model, device, extension) landmarks_obj = LandmarksModel(landmarks_model, device, extension) gaze_estimation_obj = GazeEstimationModel(gaze_estimation_model, device, extension) # Create mouse controller object mouse_controller = MouseController('medium', 'fast') # Place mouse at the center of the screen mouse_controller.init_position() log_obj.info("[Info]: Place mouse at the center of the screen") # Place holder for total inferencing time total_inference_time = 0 # Load models and get the model loading times start_time = time.time() face_detection_obj.load_model() end_time = time.time() face_detection_loading_time = end_time - start_time start_time = time.time() head_pose_obj.load_model() end_time = time.time() head_pose_loading_time = end_time - start_time start_time = time.time() landmarks_obj.load_model() end_time = time.time() landmarks_detection_loading_time = end_time - start_time start_time = time.time() gaze_estimation_obj.load_model() end_time = time.time() gaze_estimation_loading_time = end_time - start_time # Configure input video source if input_src.lower() == 'cam': input_channel = InputFeeder(input_type='cam') elif not os.path.exists(input_src): log.error("Video file not found! 
Exiting....") exit(1) else: input_channel = InputFeeder(input_type='video', input_file=input_src) log_obj.info("[Info]: Opening video file ...") input_channel.load_data() video_width = int(input_channel.cap.get(cv2.CAP_PROP_FRAME_WIDTH)) video_height = int(input_channel.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) fps = int(input_channel.cap.get(cv2.CAP_PROP_FPS)) frame_counter = 0 total_face_inf_time = 0 total_head_inf_time = 0 total_lanmarks_inf_time = 0 total_gaze_inf_time = 0 frame_processing_time = 0 # Process each frame try: for frame in input_channel.next_batch(): frame_processing_start_time = time.time() frame_counter = frame_counter + 1 key = cv2.waitKey(60) # Use face detection to find cropped face and provide face coordinates cropped_face, face_coords, face_inference_time = face_detection_obj.predict( frame, prob_threshold) total_face_inf_time = total_face_inf_time + face_inference_time # Now use cropped face for head pose detection head_pose_estimate, head_inference_time = head_pose_obj.predict( cropped_face, prob_threshold) total_head_inf_time = total_head_inf_time + head_inference_time # Now use cropped face for landmarks detection cropped_left_eye, cropped_right_eye, eyes_coords, converted_landmarks, landmarks_inference_time = landmarks_obj.predict( cropped_face, prob_threshold) total_lanmarks_inf_time = total_lanmarks_inf_time + landmarks_inference_time # Finally gaze estimation gaze_vector, gaze_estimate_time = gaze_estimation_obj.predict( cropped_left_eye, cropped_right_eye, head_pose_estimate) total_gaze_inf_time = total_gaze_inf_time + gaze_estimate_time # Move the mouse #mouse_controller.move(gaze_vector[0], gaze_vector[1]) # Show size-reduced frame for visual comparison # Check potential visualize flags: 'F', 'H', 'L', 'G' # If flag exist, process image to show inference results if args.visualize is not None: visualize_flag = str(args.visualize) # Draw bounding box around detected face if 'F' in visualize_flag: cv2.rectangle(frame, (face_coords[0][0], face_coords[0][1]), (face_coords[0][2], face_coords[0][3]), (0, 255, 0), 2) # Show head pose parameters if 'H' in visualize_flag: cv2.putText( frame, "Head pose: yaw: {:.3f}, pitch: {:.3f}, roll: {:.3f}". format(head_pose_estimate[0], head_pose_estimate[1], head_pose_estimate[2]), (10, 20), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 5) # Draw dots on detected facial landmarks if 'L' in visualize_flag: cv2.circle(frame, (converted_landmarks[0] + face_coords[0][0], converted_landmarks[1] + face_coords[0][1]), 10, (0, 255, 0), 5) cv2.circle(frame, (converted_landmarks[2] + face_coords[0][0], converted_landmarks[3] + face_coords[0][1]), 10, (0, 255, 0), 5) cv2.circle(frame, (converted_landmarks[4] + face_coords[0][0], converted_landmarks[5] + face_coords[0][1]), 10, (0, 255, 0), 5) cv2.circle(frame, (converted_landmarks[6] + face_coords[0][0], converted_landmarks[7] + face_coords[0][1]), 10, (0, 255, 0), 5) cv2.circle(frame, (converted_landmarks[8] + face_coords[0][0], converted_landmarks[9] + face_coords[0][1]), 10, (0, 255, 0), 5) # Display gaze parameters if 'G' in visualize_flag: cv2.putText( frame, "Gaze estimate: x: {:.3f}, y: {:.3f}, z: {:.3f}". 
format(gaze_vector[0], gaze_vector[1], gaze_vector[2]), (10, 100), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 5) resized_frame = cv2.resize(frame, (640, 360)) cv2.imshow('frame', resized_frame) if frame_counter % 4 == 0: mouse_controller.move(gaze_vector[0], gaze_vector[1]) frame_processing_time = frame_processing_time + ( time.time() - frame_processing_start_time) * 1000 if key == 27: break except Exception as e: #traceback.print_exc() if 'shape' in str(e): log_obj.info("Video feed finished") else: log_obj.error("[ERROR]: " + str(e)) pass # All done, cleaning up cv2.destroyAllWindows() input_channel.close() # Print out statistics log_obj.info("[Info]: Video source FPS: " + str(fps)) log_obj.info("[Info]: Total frame count: " + str(frame_counter)) log_obj.info("") log_obj.info("[Info]: Face detection model loading time: {:.3f} ms".format( face_detection_loading_time * 1000)) log_obj.info("[Info]: Head pose model loading time: {:.3f} ms".format( head_pose_loading_time * 1000)) log_obj.info( "[Info]: Facial landmarks detection model loading time: {:.3f} ms". format(landmarks_detection_loading_time * 1000)) log_obj.info( "[Info]: Gaze estimation model loading time: {:.3f} ms".format( gaze_estimation_loading_time * 1000)) log_obj.info("") log_obj.info( "[Info]: Average per frame total processing time : {:.3f} ms".format( frame_processing_time / frame_counter)) log_obj.info("[Info]: Average face inferencing time: {:.3f} ms".format( total_face_inf_time / frame_counter)) log_obj.info( "[Info]: Average head pose inferencing time: {:.3f} ms".format( total_head_inf_time / frame_counter)) log_obj.info( "[Info]: Average facial landmarks inferencing time: {:.3f} ms".format( total_lanmarks_inf_time / frame_counter)) log_obj.info("[Info]: Average gaze estimate time: {:.3f} ms".format( total_gaze_inf_time / frame_counter))
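# The statistics block above divides each accumulated total by frame_counter; a small
# sketch of the same per-frame averaging with a guard against an empty stream
# (hypothetical helper; totals_ms is assumed to map a model name to its summed
# inference time in milliseconds, matching the units logged above).
def log_average_times(log_obj, frame_counter, totals_ms):
    """Log per-frame average inference times from accumulated totals."""
    if frame_counter == 0:
        log_obj.warning("[Warn]: No frames were processed")
        return
    for name, total in totals_ms.items():
        log_obj.info("[Info]: Average {} inferencing time: {:.3f} ms".format(
            name, total / frame_counter))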
def infer_on_stream(args): """ Initialize the inference network, stream video to network, and output stats and video. :param args: Command line arguments parsed by `build_argparser()` :return: None """ try: logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[ logging.FileHandler("gaze-app.log"), logging.StreamHandler() ]) # Initialise the class mc = MouseController("low", "fast") #mc.move(100,100) fdnet = FaceDetectionModel(args.fdmodel) lmnet = FacialLandMarksDetectionModel(args.lmmodel) hpnet = HeadPoseEstimationModel(args.hpmodel) genet = GazeEstimationModel(args.gemodel) ### Load the model through ### logging.info("============== Models Load time ===============") start_time = time.time() fdnet.load_model() logging.info("Face Detection Model: {:.1f}ms".format( 1000 * (time.time() - start_time))) start_time = time.time() lmnet.load_model() logging.info("Facial Landmarks Detection Model: {:.1f}ms".format( 1000 * (time.time() - start_time))) start_time = time.time() hpnet.load_model() logging.info("Headpose Estimation Model: {:.1f}ms".format( 1000 * (time.time() - start_time))) start_time = time.time() genet.load_model() logging.info("Gaze Estimation Model: {:.1f}ms".format( 1000 * (time.time() - start_time))) logging.info("============== End =====================") # Get and open video capture feeder = InputFeeder('video', args.input) feeder.load_data() # FPS = feeder.get_fps() # Grab the shape of the input # width = feeder.get_width() # height = feeder.get_height() # init scene variables frame_count = 0 ### Loop until stream is over ### fd_infertime = 0 lm_infertime = 0 hp_infertime = 0 ge_infertime = 0 while True: # Read the next frame try: frame = next(feeder.next_batch()) except StopIteration: break key_pressed = cv2.waitKey(60) frame_count += 1 #print(int((frame_count) % int(FPS))) # face detection p_frame = fdnet.preprocess_input(frame) start_time = time.time() fnoutput = fdnet.predict(p_frame) fd_infertime += time.time() - start_time out_frame, fboxes = fdnet.preprocess_output( fnoutput, frame, args.print) #for each face for fbox in fboxes: # fbox = (xmin,ymin,xmax,ymax) # get face landmarks # crop face from frame face = frame[fbox[1]:fbox[3], fbox[0]:fbox[2]] p_frame = lmnet.preprocess_input(face) start_time = time.time() lmoutput = lmnet.predict(p_frame) lm_infertime += time.time() - start_time out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output( lmoutput, fbox, out_frame, args.print) # get head pose estimation p_frame = hpnet.preprocess_input(face) start_time = time.time() hpoutput = hpnet.predict(p_frame) hp_infertime += time.time() - start_time out_frame, headpose_angels = hpnet.preprocess_output( hpoutput, out_frame, face, fbox, args.print) # get gaze estimation out_frame, left_eye, right_eye = genet.preprocess_input( out_frame, face, left_eye_point, right_eye_point, args.print) start_time = time.time() geoutput = genet.predict(left_eye, right_eye, headpose_angels) ge_infertime += time.time() - start_time out_frame, gazevector = genet.preprocess_output( geoutput, out_frame, fbox, left_eye_point, right_eye_point, args.print) if (not args.no_video): cv2.imshow('im', out_frame) if (not args.no_move): mc.move(gazevector[0], gazevector[1]) #consider only first detected face in the frame break # Break if escape key pressed if key_pressed == 27: break #logging inference times if (frame_count > 0): logging.info( "============== Models Inference time ===============") logging.info("Face Detection:{:.1f}ms".format(1000 * 
fd_infertime / frame_count)) logging.info("Facial Landmarks Detection:{:.1f}ms".format( 1000 * lm_infertime / frame_count)) logging.info("Headpose Estimation:{:.1f}ms".format( 1000 * hp_infertime / frame_count)) logging.info("Gaze Estimation:{:.1f}ms".format( 1000 * ge_infertime / frame_count)) logging.info("============== End ===============================") # Release the capture and destroy any OpenCV windows feeder.close() cv2.destroyAllWindows() except Exception as ex: logging.exception("Error in inference:" + str(ex))
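# This variant splits each model into preprocess_input / predict / preprocess_output and
# accumulates only the predict() time; a compact sketch of that wrapping pattern,
# assuming the same three-method interface (hypothetical helper).
import time

def timed_predict(model, preprocessed_input):
    """Return (raw_output, elapsed_seconds) for a single predict() call."""
    start = time.time()
    output = model.predict(preprocessed_input)
    return output, time.time() - start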
def infer_on_stream(args): face_detection_model_file = args.faceDetectionModel facial_landmarks_detection_model_file = args.facialLandmarksModel head_pose_estimation_model_file = args.headPoseModel gaze_estimation_model_file = args.gazeModel video_file = args.input device_name = args.device cpu_extension = args.cpu_extension prob_threshold = args.prob_threshold preview_flag = args.preview_flag output_path = args.output_path if not os.path.exists(output_path): os.mkdir(output_path) mouse_control = MouseController("low", "fast") try: logging.info("*********** Model Load Time ***************") start_model_load_time = time.time() start_time = time.time() face_detection_model = FaceDetectionModel(face_detection_model_file, device_name, cpu_extension) logging.info("Face Detection Model: {:.1f} ms.".format( 1000 * (time.time() - start_time))) start_time = time.time() facial_landmarks_detection_model = FacialLandmarksDetectionModel( facial_landmarks_detection_model_file, device_name, cpu_extension) logging.info("Facial Landmarks Detection Model: {:.1f} ms.".format( 1000 * (time.time() - start_time))) start_time = time.time() head_pose_estimation_model = HeadPoseEstimationModel( head_pose_estimation_model_file, device_name, cpu_extension) logging.info("Head Pose Estimation Model: {:.1f} ms.".format( 1000 * (time.time() - start_time))) start_time = time.time() gaze_estimation_model = GazeEstimationModel(gaze_estimation_model_file, device_name, cpu_extension) logging.info("Gaze Estimation Model: {:.1f} ms.".format( 1000 * (time.time() - start_time))) total_model_load_time = time.time() - start_model_load_time logging.info("*********** Model Load Completed ***********") except Exception as e: logging.error("ERROR in model loading: " + str(e)) sys.exit(1) feeder = InputFeeder('video', video_file) feeder.load_data() out_video = cv2.VideoWriter(os.path.join(output_path, 'output_video.mp4'), cv2.VideoWriter_fourcc(*'avc1'), int(feeder.fps() / 10), (1920, 1080), True) start_inference_time = 0 frame_count = 0 face_detect_infer_time = 0 facial_landmarks_infer_time = 0 head_pose_infer_time = 0 gaze_infer_time = 0 while True: try: frame = next(feeder.next_batch()) except StopIteration: break key_pressed = cv2.waitKey(60) frame_count += 1 ## Face Detecton Model image = face_detection_model.preprocess_input(frame) start_time = time.time() outputs = face_detection_model.predict(image) face_detect_infer_time += (time.time() - start_time) out_frame, faces = face_detection_model.preprocess_output( outputs, frame, preview_flag, prob_threshold) for face in faces: crop_image = frame[face[1]:face[3], face[0]:face[2]] ## Facial Landmarks Detecton Model image = facial_landmarks_detection_model.preprocess_input( crop_image) start_time = time.time() outputs = facial_landmarks_detection_model.predict(image) facial_landmarks_infer_time += (time.time() - start_time) out_frame, left_eye_point, right_eye_point = facial_landmarks_detection_model.preprocess_output( outputs, out_frame, face, preview_flag) ## Head Pose Estimation Model image = head_pose_estimation_model.preprocess_input(crop_image) start_time = time.time() outputs = head_pose_estimation_model.predict(image) head_pose_infer_time += (time.time() - start_time) out_frame, headpose_angels_list = head_pose_estimation_model.preprocess_output( outputs, out_frame, preview_flag) ## Gaze Estimation Model out_frame, left_eye, right_eye = gaze_estimation_model.preprocess_input( out_frame, crop_image, left_eye_point, right_eye_point) start_time = time.time() outputs = 
gaze_estimation_model.predict(left_eye, right_eye, headpose_angels_list) gaze_infer_time += (time.time() - start_time) out_frame, gazevector = gaze_estimation_model.preprocess_output( outputs, out_frame, face, left_eye_point, right_eye_point, preview_flag) cv2.imshow("Computer Pointer Control", out_frame) out_video.write(out_frame) mouse_control.move(gazevector[0], gazevector[1]) if key_pressed == 27: break if frame_count > 0: logging.info("*********** Model Inference Time ****************") logging.info("Face Detection Model: {:.1f} ms.".format( 1000 * face_detect_infer_time / frame_count)) logging.info("Facial Landmarks Detection Model: {:.1f} ms.".format( 1000 * facial_landmarks_infer_time / frame_count)) logging.info("Head Pose Detection Model: {:.1f} ms.".format( 1000 * head_pose_infer_time / frame_count)) logging.info("Gaze Detection Model: {:.1f} ms.".format( 1000 * gaze_infer_time / frame_count)) logging.info("*********** Model Inference Completed ***********") total_infer_time = time.time() - start_inference_time total_inference_time = round(total_infer_time, 1) fps = frame_count / total_inference_time with open(os.path.join(output_path, 'stats.txt'), 'w') as f: f.write(str(total_inference_time) + '\n') f.write(str(fps) + '\n') f.write(str(total_model_load_time) + '\n') logging.info("*********** Total Summary ****************") logging.info(f"Total Model Load Time: {total_model_load_time}") logging.info(f"Total Inference Time: {total_inference_time}") logging.info(f"FPS: {fps}") logging.info("*********** Total Summary ***********") logging.info("*********** ************************* ***********") feeder.close() cv2.destroyAllWindows()
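# The stats.txt written above stores three unlabelled numbers (inference time, FPS,
# model load time), one per line; a labelled variant as a sketch, assuming the same
# three values and output directory.
import os

def write_stats(output_path, total_inference_time, fps, total_model_load_time):
    """Write the benchmark summary with field names instead of bare numbers."""
    with open(os.path.join(output_path, 'stats.txt'), 'w') as f:
        f.write("inference_time_s: {}\n".format(total_inference_time))
        f.write("fps: {:.2f}\n".format(fps))
        f.write("model_load_time_s: {:.3f}\n".format(total_model_load_time))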
def main(): args = build_argparser().parse_args() preview_flags = args.preview_flags logger = logging.getLogger() input_path = args.input if input_path.lower() == 'cam': input_feed = InputFeeder('cam') else: if not os.path.isfile(input_path): logger.error('Unable to find specified video file') exit(1) file_extension = input_path.split(".")[-1] if (file_extension in ['jpg', 'jpeg', 'bmp']): input_feed = InputFeeder('image', input_path) elif (file_extension in ['avi', 'mp4']): input_feed = InputFeeder('video', input_path) else: logger.error( "Unsupported file Extension. Allowed ['jpg', 'jpeg', 'bmp', 'avi', 'mp4']" ) exit(1) if sys.platform == "linux" or sys.platform == "linux2": #CODEC = 0x00000021 CODEC = cv2.VideoWriter_fourcc(*"mp4v") elif sys.platform == "darwin": CODEC = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G') else: print("Unsupported OS.") exit(1) file_flag = False if args.output_file.lower() == 'y': file_flag = True out = cv2.VideoWriter('output.mp4', CODEC, 30, (FRAME_WIDTH, FRAME_HEIGHT)) modelPathDict = { 'face_detect': args.face_detection_model, 'face_landmark_regress': args.facial_landmark_model, 'head_pose': args.head_pose_model, 'gaze_estimate': args.gaze_estimation_model } for pathname in modelPathDict: if not os.path.isfile(modelPathDict[pathname]): logger.error('Unable to find specified ' + pathname + ' xml file') exit(1) #initializing models fdm = FaceDetectionModel(modelPathDict['face_detect'], args.device, args.cpu_extension) fldm = FacialLandmarksDetectionModel( modelPathDict['face_landmark_regress'], args.device, args.cpu_extension) hpem = HeadPoseEstimationModel(modelPathDict['head_pose'], args.device, args.cpu_extension) gem = GazeEstimationModel(modelPathDict['gaze_estimate'], args.device, args.cpu_extension) #initializing mouse controller mouse_controller = MouseController('medium', 'fast') input_feed.load_data() #checking models fdm.check_model() fldm.check_model() hpem.check_model() gem.check_model() #loading models / creating executable network fdm.load_model() fldm.load_model() hpem.load_model() gem.load_model() frame_count = 0 for ret, frame in input_feed.next_batch(): if not ret: break frame_count += 1 key = cv2.waitKey(60) """ Sequence of model execution:- 1. Predict from each model. 2. Preprocess of outputs from each model. 3. Send the processed output to the next model. 
Model Sequence:- - Head Pose Estimation Model - Face Detection Model <(First Head Pose and Then Facial Landmark)>Gaze Estimation Model - Facial Landmark Detection Model - """ cropped_face, face_coords = fdm.preprocess_output( frame.copy(), fdm.predict(frame.copy()), args.prob_threshold) if type(cropped_face) == int: logger.error('Unable to detect the face.') if key == 27: break continue hp_out = hpem.preprocess_output(hpem.predict(cropped_face.copy())) left_eye, right_eye, eye_coords = fldm.preprocess_output( cropped_face.copy(), fldm.predict(cropped_face.copy())) new_mouse_coord, gaze_vector = gem.preprocess_output( gem.predict(left_eye, right_eye, hp_out), hp_out) if (not len(preview_flags) == 0) or file_flag: preview_frame = frame.copy() if 'fd' in preview_flags: preview_frame = cv2.rectangle(preview_frame, (face_coords[0], face_coords[1]), (face_coords[2], face_coords[3]), (0, 0, 255), 3) cropped_face = preview_frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] if 'fld' in preview_flags: cropped_face = cv2.rectangle( cropped_face, (eye_coords[0][0] - 10, eye_coords[0][1] - 10), (eye_coords[0][2] + 10, eye_coords[0][3] + 10), (0, 255, 0), 3) cropped_face = cv2.rectangle( cropped_face, (eye_coords[1][0] - 10, eye_coords[1][1] - 10), (eye_coords[1][2] + 10, eye_coords[1][3] + 10), (0, 255, 0), 3) preview_frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = cropped_face if 'hp' in preview_flags: cv2.putText( preview_frame, 'Pose Angles: yaw: {:.2f} | pitch: {:.2f} | roll: {:.2f}'. format(hp_out[0], hp_out[1], hp_out[2]), (20, 40), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 2) if 'ge' in preview_flags: x, y = int(gaze_vector[0] * GAZE_ARROW_LENGTH), -int( gaze_vector[1] * GAZE_ARROW_LENGTH) le_mid_x = int((eye_coords[0][0] + eye_coords[0][2]) / 2) le_mid_y = int((eye_coords[0][1] + eye_coords[0][3]) / 2) re_mid_x = int((eye_coords[1][0] + eye_coords[1][2]) / 2) re_mid_y = int((eye_coords[1][1] + eye_coords[1][3]) / 2) cv2.arrowedLine(cropped_face, (le_mid_x, le_mid_y), ((le_mid_x + x), (le_mid_y + y)), (255, 0, 0), GAZE_ARROW_WIDTH) cv2.arrowedLine(cropped_face, (re_mid_x, re_mid_y), ((re_mid_x + x), (re_mid_y + y)), (255, 0, 0), GAZE_ARROW_WIDTH) preview_frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = cropped_face if (not len(preview_flags) == 0) and frame_count % 2 == 0: if args.zoomed: cv2.imshow( 'Cropped Face', cv2.resize(cropped_face, (FRAME_WIDTH, FRAME_HEIGHT))) else: cv2.imshow( 'Preview', cv2.resize(preview_frame, (FRAME_WIDTH, FRAME_HEIGHT))) if file_flag: out.write( cv2.resize(preview_frame, (FRAME_WIDTH, FRAME_HEIGHT))) #move the mouse pointer try: mouse_controller.move(new_mouse_coord[0], new_mouse_coord[1]) except pyautogui.FailSafeException: pass if frame_count % 2 == 0 and len(preview_flags) == 0: cv2.imshow('Video', cv2.resize(frame, (FRAME_WIDTH, FRAME_HEIGHT))) if key == 27: break logger.error('VideoStream ended.') if args.output_file.lower() == 'y': out.release() input_feed.close() cv2.destroyAllWindows()
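# pyautogui raises FailSafeException when the pointer reaches a screen corner, which is
# why the snippet above wraps mouse_controller.move() in a try/except. An alternative is
# to disable the fail-safe once at start-up; pyautogui.FAILSAFE is a real module flag,
# but turning it off removes the emergency stop, so treat this as an opt-in sketch.
import pyautogui

pyautogui.FAILSAFE = False  # pointer can now be driven into screen corners without aborting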
def main(): args = build_argparser().parse_args() logger = logging.getLogger() if args.input_type == 'video' or args.input_type == 'image': extension = str(args.input).split('.')[1] feeder = InputFeeder(args.input_type, args.input) elif args.input_type == 'cam': feeder = InputFeeder(args.input_type) mc = MouseController("medium", "fast") feeder.load_data() face_model = FaceDetectionModel(args.facedetectionmodel, args.device, args.cpu_extension) face_model.check_model() landmark_model = Landmark_Model(args.facelandmarkmodel, args.device, args.cpu_extension) landmark_model.check_model() gaze_model = Gaze_Estimation_Model(args.gazeestimationmodel, args.device, args.cpu_extension) gaze_model.check_model() head_model = Head_Pose_Model(args.headposemodel, args.device, args.cpu_extension) head_model.check_model() face_model.load_model() logger.info("Face Detection Model Loaded...") landmark_model.load_model() logger.info("Landmark Detection Model Loaded...") head_model.load_model() logger.info("Head Pose Detection Model Loaded...") gaze_model.load_model() logger.info("Gaze Estimation Model Loaded...") logger.info('All Models are loaded\n\n') out = cv2.VideoWriter('output_video.mp4', 0x00000021, 30, (500, 500)) frame_count = 0 for ret, frame in feeder.next_batch(): if not ret: break frame_count += 1 if frame_count % 5 == 0: cv2.imshow('video', cv2.resize(frame, (500, 500))) key = cv2.waitKey(60) faceROI = None if True: faceROI, box = face_model.predict(frame.copy(), args.prob_threshold) if faceROI is None: logger.error("Unable to detect the face.") if key == 27: break continue (lefteye_x, lefteye_y), ( righteye_x, righteye_y ), eye_coords, left_eye, right_eye = landmark_model.predict( faceROI.copy(), EYE_ROI=10) head_position = head_model.predict(faceROI.copy()) new_mouse_coord, gaze_vector = gaze_model.predict( left_eye.copy(), right_eye.copy(), head_position) if (not len(previewFlags) == 0): preview_frame = frame.copy() if 'fd' in previewFlags: #cv2.rectangle(preview_frame, (face_coords[0], face_coords[1]), (face_coords[2], face_coords[3]), (255,0,0), 3) preview_frame = faceROI if 'fld' in previewFlags: cv2.rectangle( faceROI, (eye_coords[0][0] - 10, eye_coords[0][1] - 10), (eye_coords[0][2] + 10, eye_coords[0][3] + 10), (0, 255, 0), 3) cv2.rectangle( faceROI, (eye_coords[1][0] - 10, eye_coords[1][1] - 10), (eye_coords[1][2] + 10, eye_coords[1][3] + 10), (0, 255, 0), 3) #preview_frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = faceROI if 'hp' in previewFlags: cv2.putText( preview_frame, "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}". 
format(head_position[0], head_position[1], head_position[2]), (10, 20), cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1) if 'ge' in previewFlags: x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] * 12), 160 le = cv2.line(left_eye.copy(), (x - w, y - w), (x + w, y + w), (255, 0, 255), 2) cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2) re = cv2.line(right_eye.copy(), (x - w, y - w), (x + w, y + w), (255, 0, 255), 2) cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2) faceROI[eye_coords[0][1]:eye_coords[0][3], eye_coords[0][0]:eye_coords[0][2]] = le faceROI[eye_coords[1][1]:eye_coords[1][3], eye_coords[1][0]:eye_coords[1][2]] = re #preview_frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = faceROI #cv2.imshow("visualization",cv2.resize(preview_frame,(500,500))) out.write(frame) if frame_count % 5 == 0: mc.move(new_mouse_coord[0], new_mouse_coord[1]) if key == 27: break logger.error("VideoStream ended...") out.release() cv2.destroyAllWindows() feeder.close()
def main(): args = build_argparser().parse_args() Flags = args.Flags logger = logging.getLogger() inputFilePath = args.input inputFeeder = None if inputFilePath.lower() == "cam": inputFeeder = InputFeeder("cam") else: if not os.path.isfile(inputFilePath): logger.error("Unable to find specified video file") exit(1) inputFeeder = InputFeeder("video", inputFilePath) Dir = { 'FaceDetectionModel': args.facedetectionmodel, 'FacialLandmarksDetectionModel': args.faciallandmarkmodel, 'GazeEstimationModel': args.gazeestimationmodel, 'HeadPoseEstimationModel': args.headposemodel } for fileKey in Dir.keys(): if not os.path.isfile(Dir[fileKey]): logger.error("Unable to find specified " + fileKey + " xml file") exit(1) Fd = FaceDetectionModel(Dir['FaceDetectionModel'], args.device, args.cpu_extension) Fl = FacialLandmarksDetectionModel(Dir['FacialLandmarksDetectionModel'], args.device, args.cpu_extension) Ge = GazeEstimationModel(Dir['GazeEstimationModel'], args.device, args.cpu_extension) Hp = HeadPoseEstimationModel(Dir['HeadPoseEstimationModel'], args.device, args.cpu_extension) Mc = MouseController('medium', 'fast') ## Loading part starts here #start_model_load_time=time.time() inputFeeder.load_data() Fd.load_model() Fl.load_model() Hp.load_model() Ge.load_model() #total_model_load_time = time.time() - start_model_load_time count = 0 #start_inference_time=time.time() for ret, frame in inputFeeder.next_batch(): if not ret: break count += 1 if count % 5 == 0: cv2.imshow('video', cv2.resize(frame, (500, 500))) key = cv2.waitKey(60) croppedFace, face_coords = Fd.predict(frame.copy(), args.prob_threshold) if type(croppedFace) == int: logger.error("Unable to detect the face.") if key == 27: break continue hp_out = Hp.predict(croppedFace.copy()) l_eye, r_eye, eye_coords = Fl.predict( croppedFace.copy()) # Main funcn's doing all our task new_coord, gaze_vector = Ge.predict(l_eye, r_eye, hp_out) #total_time=time.time()-start_inference_time #total_inference_time=round(total_time, 1) #fps=count/total_inference_time ## Now comes the importance of all the flags if (not len(Flags) == 0): new_frame = frame.copy() if 'fd' in Flags: new_frame = croppedFace if 'fl' in Flags: cv2.rectangle(croppedFace, (eye_coords[0][0] - 10, eye_coords[0][1] - 10), (eye_coords[0][2] + 10, eye_coords[0][3] + 10), (0, 255, 0), 3) cv2.rectangle(croppedFace, (eye_coords[1][0] - 10, eye_coords[1][1] - 10), (eye_coords[1][2] + 10, eye_coords[1][3] + 10), (0, 255, 0), 3) if 'hp' in Flags: cv2.putText( new_frame, "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}". format(hp_out[0], hp_out[1], hp_out[2]), (10, 20), cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1) if 'ge' in Flags: x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] * 12), 160 le = cv2.line(l_eye.copy(), (x - w, y - w), (x + w, y + w), (255, 0, 255), 2) cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2) re = cv2.line(r_eye.copy(), (x - w, y - w), (x + w, y + w), (255, 0, 255), 2) cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2) croppedFace[eye_coords[0][1]:eye_coords[0][3], eye_coords[0][0]:eye_coords[0][2]] = le croppedFace[eye_coords[1][1]:eye_coords[1][3], eye_coords[1][0]:eye_coords[1][2]] = re cv2.imshow("visualization", cv2.resize(new_frame, (500, 500))) if count % 5 == 0: Mc.move(new_coord[0], new_coord[1]) if key == 27: break logger.error("Video Done...") # print(total_inference_time) # print(fps) #print(total_model_load_time) cv2.destroyAllWindows() inputFeeder.close()
def main_benchmark(args): feed = InputFeeder(input_type=args.it, input_file=args.i) face_model = FaceDetectionModel(args.fm, args.d, args.c, float(args.p)) start_time = time.time() face_model.load_model() face_load_model_time = time.time() - start_time landmarks_model = LandmarksDetectionModel(args.lm, args.d, args.c) start_time = time.time() landmarks_model.load_model() landmarks_model_time = time.time() - start_time headpose_model = HeadPoseDetectionModel(args.hpm, args.d, args.c) start_time = time.time() headpose_model.load_model() headpose_model_time = time.time() - start_time gaze_model = GazeEstimationModel(args.gem, args.d, args.c) start_time = time.time() gaze_model.load_model() gaze_model_time = time.time() - start_time feed.load_data() for batch in feed.next_batch(): try: start_time = time.time() cropped_face, coords, face_time_prediction = face_model.predict( batch) cv2.rectangle(batch, (coords[0], coords[1]), (coords[2], coords[3]), (255, 0, 0), 2) io_face_model_time = time.time() - start_time start_time = time.time() left_eye, right_eye, eyes_coords, landmarks_time_prediction = landmarks_model.predict( cropped_face) io_landmarks_model_time = time.time() - start_time start_time = time.time() head_pose_angles, headpose_time_prediction = headpose_model.predict( cropped_face) io_head_pose_model_time = time.time() - start_time start_time = time.time() x, y, z, gaze_time_prediction = gaze_model.predict( left_eye, right_eye, head_pose_angles, cropped_face, eyes_coords) io_gaze_model_time = time.time() - start_time print("Graphing loading time...") graph_loading_time(face_load_model_time, landmarks_model_time, headpose_model_time, gaze_model_time, args.bm) print("Graphing io processing time...") graph_io_processing_time(io_face_model_time, io_landmarks_model_time, io_head_pose_model_time, io_gaze_model_time, args.bm) print("Graphing inference time...") graph_model_inference_time(face_time_prediction, landmarks_time_prediction, headpose_time_prediction, gaze_time_prediction, args.bm) print("Done") break except: print("Frame without prediction. Error: ", sys.exc_info()[0]) log.error(sys.exc_info()[0]) feed.close()
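# graph_loading_time, graph_io_processing_time and graph_model_inference_time are called
# above but not shown; a minimal sketch of what such a helper could look like, assuming
# matplotlib is available and that the final argument (args.bm) is an output directory
# for the benchmark plots (both assumptions).
import os
import matplotlib
matplotlib.use('Agg')  # render to file, no display needed
import matplotlib.pyplot as plt

def graph_loading_time(face_t, landmarks_t, headpose_t, gaze_t, out_dir):
    """Save a bar chart of the four model load times (seconds) to out_dir."""
    names = ['face', 'landmarks', 'head pose', 'gaze']
    plt.figure()
    plt.bar(names, [face_t, landmarks_t, headpose_t, gaze_t])
    plt.ylabel('load time (s)')
    plt.title('Model loading time')
    plt.savefig(os.path.join(out_dir, 'loading_time.png'))
    plt.close()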
def main(): args = build_argparser().parse_args() logging.basicConfig(filename=args.output+'/app.log', filemode='w') print("Begin: Try not to move mouse with your hands") mc = MouseController("low", "fast") if args.input == "cam": frames = InputFeeder("cam") else: frames = InputFeeder("video", args.input) cap = frames.load_data() if args.display: initial_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) initial_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) video_len = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) fps = int(cap.get(cv2.CAP_PROP_FPS)) out_video = cv2.VideoWriter(os.path.join(args.output, 'output_video.mp4'), cv2.VideoWriter_fourcc('m','p','4','v'), fps, (initial_w, initial_h)) face_model = FaceDetectionModel(args.face_model, args.output, args.device) pose_model = HeadPoseEstimationModel(args.pose_model, args.output, args.device) landmarks_model = FacialLandmarksDetectionModel(args.landmarks_model, args.output, args.device) gaze_model = GazeEstimationModel(args.gaze_model, args.output, args.device) avg_out = 0 avg = 0 tmlt_face_avg = 0 tinpt_face_avg = 0 tint_face_avg = 0 toutt_face_avg = 0 tmlt_pose_avg = 0 tinpt_pose_avg = 0 tint_pose_avg = 0 toutt_pose_avg = 0 tmlt_landmarks_avg = 0 tinpt_landmarks_avg = 0 tint_landmarks_avg = 0 toutt_landmarks_avg = 0 tmlt_gaze_avg = 0 tinpt_gaze_avg = 0 tint_gaze_avg = 0 toutt_gaze_avg = 0 logging.info("Frames starting") for frame in frames.next_batch(): if frame is None: logging.error("Frame: " + frame + "failed") continue output_image = frame.copy() cropped_faces, tmlt_face, tinpt_face, tint_face, toutt_face = face_model.predict(frame) try: largest_face = cropped_faces[0] for face in cropped_faces: if largest_face.size < face.size: largest_face = face pose, tmlt_pose, tinpt_pose, tint_pose, toutt_pose = pose_model.predict(largest_face) landmarks, tmlt_landmarks, tinpt_landmarks, tint_landmarks, toutt_landmarks = landmarks_model.predict(largest_face) gaze_vector, tmlt_gaze, tinpt_gaze, tint_gaze, toutt_gaze = gaze_model.predict(largest_face, landmarks, pose) except Exception as e: logging.error("Model inference failed: " + str(e)) # print(e) continue if args.display: output_image, xmin, ymin = face_model.draw_crop_outputs(output_image, args.display) output_image = gaze_model.display_eye_boxes(output_image, landmarks, xmin, ymin, args.display) out_video.write(output_image) cv2.imshow("output_image", output_image) cv2.waitKey(15) face_model.coords = [] tmlt_face_avg += tmlt_face tinpt_face_avg += tinpt_face tint_face_avg += tint_face toutt_face_avg += toutt_face tmlt_pose_avg += tmlt_pose tinpt_pose_avg += tinpt_pose tint_pose_avg += tint_pose toutt_pose_avg += toutt_pose tmlt_landmarks_avg += tmlt_landmarks tinpt_landmarks_avg+= tinpt_landmarks tint_landmarks_avg += tint_landmarks toutt_landmarks_avg += toutt_landmarks if gaze_vector is None: avg_out += 1 continue tmlt_gaze_avg += tmlt_gaze tinpt_gaze_avg += tinpt_gaze tint_gaze_avg += tint_gaze toutt_gaze_avg += toutt_gaze avg += 1 gaze_vector_norm = gaze_vector / np.linalg.norm(gaze_vector) try: mc.move(gaze_vector_norm[0], gaze_vector_norm[1]) except Exception as e: logging.error("Gaze failed: " + str(e)) # print(e) continue file_name = "stats_"+args.precision+".txt" save_path = os.path.join(os.getcwd(), args.output) f = open(os.path.join(save_path, file_name), "w") f.write("Benchmark Start:"+"\n\n") f.write("Face Detection Model stats"+"\n") f.write("Total model Load Time:"+str(tmlt_face_avg/avg)+"\n") f.write("Total Input Time:"+str(tinpt_face_avg/avg)+"\n") f.write("Total Inference 
Time:"+str(tint_face_avg/avg)+"\n") f.write("Total Output Time:"+str(toutt_face_avg/avg)+"\n\n") f.write("Head Pose Estimation Model stats"+"\n") f.write("Total model Load Time:"+str(tmlt_pose_avg/avg)+"\n") f.write("Total Input Time:"+str(tinpt_pose_avg/avg)+"\n") f.write("Total Inference Time:"+str(tint_pose_avg/avg)+"\n") f.write("Total Output Time:"+str(toutt_pose_avg/avg)+"\n\n") f.write("Facial Landmarks Detection Model stats"+"\n") f.write("Total model Load Time:"+str(tmlt_landmarks_avg/avg)+"\n") f.write("Total Input Time:"+str(tinpt_landmarks_avg/avg)+"\n") f.write("Total Inference Time:"+str(tint_landmarks_avg/avg)+"\n") f.write("Total Output Time:"+str(toutt_landmarks_avg/avg)+"\n\n") f.write("Gaze Estimation Model stats"+"\n") f.write("Total model Load Time:"+str(tmlt_gaze_avg/(avg-avg_out))+"\n") f.write("Total Input Time:"+str(tinpt_gaze_avg/(avg-avg_out))+"\n") f.write("Total Inference Time:"+str(tint_gaze_avg/(avg-avg_out))+"\n") f.write("Total Output Time:"+str(toutt_gaze_avg/(avg-avg_out))+"\n\n") f.write("Benchmark end"+"\n") f.close() print("Thank you, Goodbye") frames.close()
def main(): # get command line args args = build_argparser().parse_args() logger = log.getLogger() type_input = args.input if type_input == "CAM": inputFeeder = InputFeeder("cam") else: inputFeeder = InputFeeder("video", args.input) inputFeeder.load_data() mc = MouseController("medium", "fast") fdm = FaceDetectionModel(model_name=args.face_dectection_model, device=args.device, extensions=args.cpu_extension, threshold=args.prob_threshold) fldm = FacialLandmarksModel(model_name=args.face_landmarks_model, device=args.device, extensions=args.cpu_extension) gem = GazeEstimationModel(model_name=args.gaze_estimation_model, device=args.device, extensions=args.cpu_extension) hpem = HeadPoseEstimationModel(model_name=args.head_pose_model, device=args.device, extensions=args.cpu_extension) data_capture = {} start_time = time.time() fdm.load_model() fdm_load_time = time.time() fldm.load_model() fldm_load_time = time.time() hpem.load_model() hpem_load_time = time.time() gem.load_model() gem_load_time = time.time() data_capture['FaceDetectionModel_loadtime'] = round( (fdm_load_time - start_time) * 1000, 3) data_capture['FacialLandmarksModel_loadtime'] = round( (fldm_load_time - fdm_load_time) * 1000, 3) data_capture['HeadPoseEstimationModel_loadtime'] = round( (hpem_load_time - fldm_load_time) * 1000, 3) data_capture['GazeEstimationModel_loadtime'] = round( (gem_load_time - hpem_load_time) * 1000, 3) for flag, frame in inputFeeder.next_batch(): if not flag: break pressedKey = cv2.waitKey(60) start_infer_time = time.time() # time to start inference face_coords, face_img = fdm.predict(frame) fdm_infertime = time.time() if face_coords == 0: # if face not detected continue hpem_out = hpem.predict(face_img) hpem_infertime = time.time() left_eye, right_eye, eye_coord = fldm.predict(face_img) fldm_infertime = time.time() if left_eye.all() == 0 or right_eye.all( ) == 0: # if eye are not detected continue mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hpem_out) gem_infertime = time.time() if args.preview: output_boxes(frame, (face_coords[0], face_coords[1]), (face_coords[2], face_coords[3])) bound_boxes(frame, eye_coord, 45, 25, face_coords[0], face_coords[1]) text = "Yaw: {:.2f}, Pitch: {:+.2f}, Roll: {:.2f}".format( hpem_out[0], hpem_out[1], hpem_out[2]) output_text(frame, text, (100, 100)) h = frame.shape[0] w = frame.shape[1] center_of_face = (h / 2, w / 2, 0) draw_axes(frame, center_of_face, hpem_out[0], hpem_out[1], hpem_out[2], scale=50, focal_length=950) cv2.imshow('video', cv2.resize(frame, (500, 500))) cv2.imshow('video', cv2.resize(frame, (500, 500))) mc.move(mouse_coord[0], mouse_coord[1]) if pressedKey == 27: break data_capture['FaceDetectionModel_Inferencetime'] = round( (fdm_infertime - start_infer_time) * 1000, 3) data_capture['HeadPoseEstimationModel_Inferencetime'] = round( (hpem_infertime - fdm_infertime) * 1000, 3) data_capture['FacialLandmarksModel_Inferencetime'] = round( (fldm_infertime - hpem_infertime) * 1000, 3) data_capture['GazeEstimationModel_Inferencetime'] = round( (gem_infertime - fldm_infertime) * 1000, 3) total_time = round((time.time() - start_infer_time) * 1000, 3) data_capture['Total_time'] = total_time df = pd.DataFrame.from_dict(data_capture, orient='index', columns=['time(msecs)']) df.to_csv("results.csv") logger.error("Video has ended...") cv2.destroyAllWindows() inputFeeder.close()
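# draw_axes() is called above to render the head-pose axes but its body is not shown; a
# hedged sketch of one plausible implementation, assuming the yaw/pitch/roll angles are
# in degrees and that center_of_face provides the (x, y) pixel position of the face
# centre. Not the original helper.
import cv2
import numpy as np

def draw_axes(frame, center_of_face, yaw, pitch, roll, scale=50, focal_length=950):
    """Project three head-pose axes onto the frame with a simple pinhole model."""
    yaw, pitch, roll = np.radians([yaw, pitch, roll])
    rx = np.array([[1, 0, 0],
                   [0, np.cos(pitch), -np.sin(pitch)],
                   [0, np.sin(pitch), np.cos(pitch)]])
    ry = np.array([[np.cos(yaw), 0, -np.sin(yaw)],
                   [0, 1, 0],
                   [np.sin(yaw), 0, np.cos(yaw)]])
    rz = np.array([[np.cos(roll), -np.sin(roll), 0],
                   [np.sin(roll), np.cos(roll), 0],
                   [0, 0, 1]])
    r = rz @ ry @ rx
    cx, cy = int(center_of_face[0]), int(center_of_face[1])
    # axis end points in camera space: x (red), y (green), z (blue)
    axes = scale * np.array([[1, 0, 0], [0, -1, 0], [0, 0, -1]], dtype=float).T
    offset = np.array([[0], [0], [focal_length]], dtype=float)
    for endpoint, colour in zip((r @ axes + offset).T,
                                [(0, 0, 255), (0, 255, 0), (255, 0, 0)]):
        px = int(endpoint[0] * focal_length / endpoint[2]) + cx
        py = int(endpoint[1] * focal_length / endpoint[2]) + cy
        cv2.line(frame, (cx, cy), (px, py), colour, 2)
    return frame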