def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    The pipeline per frame is: face detection -> head pose estimation ->
    landmark regression -> gaze estimation -> mouse move.  Frames in which
    no face is detected are skipped.

    :param args: Command line arguments parsed by `build_argparser()`.
        Reads ``input_type``, ``input_file``, ``model1`` .. ``model4``,
        ``device``, ``prob_threshold`` and ``output_intermediate_model``.
    :return: None
    """
    output_intermediate_model = args.output_intermediate_model
    # The flag arrives as the string 'true', not as a boolean.
    write_intermediate = output_intermediate_model == 'true'

    # Handle the input stream
    feed = InputFeeder(input_type=args.input_type, input_file=args.input_file)
    cap = feed.load_data()
    # Property ids 3/4/5 are read as width/height/fps
    # (presumably `cap` is a cv2.VideoCapture - confirm against InputFeeder).
    width = int(cap.get(3))
    height = int(cap.get(4))
    fps = int(cap.get(5))

    # Initialise the model wrappers
    try:
        face_net = BasePointer()
        head_pose_net = BasePointer()
        landmarks_net = BasePointer()
        gaze_net = GazeEstimation()
    except Exception:
        # logging.exception keeps the traceback so the real cause is visible.
        logging.exception("Error in initializing models")
        exit(1)

    def _timed_load(network, model_path):
        """Load ``model_path`` into ``network``; return the seconds taken."""
        started = time.time()
        network.load_model(model_path, args.device)
        return time.time() - started

    # Load the four models, timing each one
    try:
        load_time_face = _timed_load(face_net, args.model1)
        load_time_head_pose = _timed_load(head_pose_net, args.model2)
        load_time_landmarks = _timed_load(landmarks_net, args.model3)
        load_time_gaze = _timed_load(gaze_net, args.model4)
    except Exception:
        logging.exception("Error in loading the models")
        exit(1)

    logging.debug(
        "Loading times for facial detection : {} , landmark detection : {} , head pose detection : {} , gaze estimation : {} "
        .format(load_time_face, load_time_landmarks, load_time_head_pose,
                load_time_gaze))

    if write_intermediate:
        out = cv2.VideoWriter('out.mp4', CODEC, fps, (width, height))

    total_time_face = 0
    total_time_landmarks = 0
    total_time_head_pose = 0
    total_time_gaze = 0

    # Built once: the controller configuration does not change per frame.
    mouse_controller = MouseController("high", "fast")
    green = (0, 255, 0)
    red = (0, 0, 255)
    eye_half_size = 30  # half the side of the square drawn around each eye
    arrow = 100         # scale applied to the gaze vector when drawing

    # Loop until stream is over
    for batch in feed.next_batch():
        flag, frame = batch
        if not flag:
            break
        key_pressed = cv2.waitKey(60)

        # --- face detection ---
        started = time.time()
        outputs_face_detection = face_net.predict(frame)
        time_face = time.time() - started
        coords, frame = face_net.preprocess_output_face_detection(
            outputs_face_detection, width, height, args.prob_threshold, frame)
        if write_intermediate:
            out.write(frame)

        # Everything below indexes coords[0]; without a face there is
        # nothing to process, so skip the frame instead of crashing.
        if coords is None or len(coords) == 0:
            logging.warning("No face detected; skipping frame")
            if key_pressed == 27:
                break
            continue
        face_box = coords[0]

        frame_crop_face = crop_face(coords, frame, output_intermediate_model)

        # --- head pose estimation ---
        started = time.time()
        outputs_head_pose = head_pose_net.predict(frame_crop_face)
        time_head_pose = time.time() - started
        yaw, pitch, roll = head_pose_net.preprocess_output_head_pose_estimation(
            outputs_head_pose, frame_crop_face)
        head_pose_angles = [yaw, pitch, roll]
        if write_intermediate:
            cv2.putText(frame, ("Yaw: " + str(int(yaw))), (100, 100),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, green, 1)
            cv2.putText(frame, ("Pitch: " + str(int(pitch))), (100, 140),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, green, 1)
            cv2.putText(frame, ("Roll: " + str(int(roll))), (100, 180),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, green, 1)

        # --- landmark regression ---
        height_crop_face = face_box[3] - face_box[1]
        width_crop_face = face_box[2] - face_box[0]
        started = time.time()
        outputs_landmarks = landmarks_net.predict(frame_crop_face)
        time_landmarks = time.time() - started
        landmarks = landmarks_net.preprocess_output_landmarks_regression_retail(
            outputs_landmarks, width_crop_face, height_crop_face,
            args.prob_threshold, frame)

        # Landmarks are offsets inside the face crop; shift them by the
        # face box origin to get positions in the full frame.
        center_left_eye = (face_box[0] + landmarks[0],
                           face_box[1] + landmarks[1])
        center_right_eye = (face_box[0] + landmarks[2],
                            face_box[1] + landmarks[3])

        frame_landmarks = cv2.circle(frame, center_left_eye, 2, green,
                                     thickness=3)
        frame_landmarks = cv2.circle(frame, center_right_eye, 2, green,
                                     thickness=3)
        # NOTE(review): cv2.rectangle returns the annotated image, so these
        # two values are whole frames rather than eye crops - confirm this
        # is what GazeEstimation.predict expects.
        box_left_eye = cv2.rectangle(
            frame,
            (center_left_eye[0] - eye_half_size,
             center_left_eye[1] - eye_half_size),
            (center_left_eye[0] + eye_half_size,
             center_left_eye[1] + eye_half_size),
            green, 3)
        box_right_eye = cv2.rectangle(
            frame,
            (center_right_eye[0] - eye_half_size,
             center_right_eye[1] - eye_half_size),
            (center_right_eye[0] + eye_half_size,
             center_right_eye[1] + eye_half_size),
            green, 3)
        if write_intermediate:
            out.write(frame_landmarks)

        # --- gaze estimation ---
        started = time.time()
        outputs_gaze = gaze_net.predict(box_left_eye, box_right_eye,
                                        head_pose_angles)
        time_gaze = time.time() - started

        total_time_face += time_face
        total_time_landmarks += time_landmarks
        total_time_head_pose += time_head_pose
        total_time_gaze += time_gaze

        # Draw the gaze direction from each eye; y is negated before drawing.
        g_x = int(outputs_gaze[0] * arrow)
        g_y = int(-(outputs_gaze[1]) * arrow)
        frame = cv2.arrowedLine(
            frame, center_left_eye,
            (center_left_eye[0] + g_x, center_left_eye[1] + g_y), red, 3)
        frame = cv2.arrowedLine(
            frame, center_right_eye,
            (center_right_eye[0] + g_x, center_right_eye[1] + g_y), red, 3)
        if write_intermediate:
            out.write(frame)

        mouse_controller.move(outputs_gaze[0], outputs_gaze[1])

        if key_pressed == 27:  # Esc
            break

    feed.close()
    logging.debug(
        "total inference times for facial detection : {} , landmark detection : {} , head pose detection : {} , gaze estimation : {} "
        .format(total_time_face, total_time_landmarks, total_time_head_pose,
                total_time_gaze))
    if write_intermediate:
        out.release()
    cv2.destroyAllWindows()
def run(self, args):
    '''
    Runs inference on specified input

    Reads frames from the input feeder, chains face detection, facial
    landmarks, head pose and gaze estimation, and hands the gaze result to
    the mouse controller through a one-process pool.  Optionally shows debug
    overlays and writes the last annotated frame to ``args.output``.

    Args:
        args (Namespace): application arguments; reads ``debug``,
            ``silent``, ``input``, ``precision``, ``device`` and ``output``.
    '''
    listener = keyboard.Listener(on_release=self.on_release)
    listener.start()

    # Compare by value: `is ''` tests object identity and is not reliable.
    debug_requested = args.debug != ''
    if debug_requested:
        self.debug = True
    if not args.silent and not debug_requested:
        print('Press \'esc\' to exit')

    mouseController = MouseController('high', 'fast')
    mouseController.center()

    inputFeeder = InputFeeder(args.input)

    logging.info('Loading models')
    start_loading = time.time()
    # ----- Models Load ------------------------------------------------------
    faceDetection = ModelFaceDetection()
    facialLanmarksDetection = ModelFacialLandmarksDetection(
        precision=args.precision)
    headPoseEstimation = ModelHeadPoseEstimation(device=args.device,
                                                 precision=args.precision)
    gazeEstimation = ModelGazeEstimation(precision=args.precision)
    # ------------------------------------------------------------------------
    loading_time = time.time() - start_loading

    pool = Pool(processes=1)  # Must be called after Models Load

    logging.info('Starting inference')
    image = None
    inference_time = 0
    counter = 0

    while self.execute:
        try:
            frame = next(inputFeeder.next_batch())
        except StopIteration:
            logging.error('Failed to obtain input stream.')
            break
        if frame is None:
            break

        start_inference = time.time()
        # ----- Inference ----------------------------------------------------
        faceDetection.inputs(frame)  # GFlops 0.611
        faceDetection.wait()
        outputs = faceDetection.outputs()
        if len(outputs) == 0:
            logging.warning('No face detected')
            continue
        if len(outputs) > 1:
            logging.warning('More then one face detected')
        face = outputs[0]
        if face.shape[0] == 0 or face.shape[1] == 0 or face.shape[2] < 3:
            logging.warning('Image too small')
            continue

        # Head pose is started first and collected after the landmarks.
        headPoseEstimation.inputs(face)  # GFlops 0.105
        facialLanmarksDetection.inputs(face)  # GFlops 0.021
        facialLanmarksDetection.wait()
        outputs = facialLanmarksDetection.outputs()
        # Both of the first two landmark outputs must be at least 60x60
        # with 3 channels.
        if any(crop.shape[0] < 60 or crop.shape[1] < 60 or crop.shape[2] < 3
               for crop in (outputs[0], outputs[1])):
            logging.warning('Image too small')
            continue

        headPoseEstimation.wait()
        outputs.append(headPoseEstimation.outputs())
        gazeEstimation.inputs(outputs)  # GFlops 0.139
        gazeEstimation.wait()
        outputs = gazeEstimation.outputs()
        # --------------------------------------------------------------------
        stop_inference = time.time()

        # Move the pointer off the inference path so the loop is not blocked.
        pool.apply_async(mouseController.move, [outputs[0], outputs[1]])

        inference_time += stop_inference - start_inference
        counter += 1

        # Overlays 2-4 are drawn on the image selected by flag '1'.
        if '1' in args.debug:
            image = faceDetection.debug[0]
            if '2' in args.debug:
                self.axises(image, headPoseEstimation.debug)
            if '3' in args.debug:
                self.points(image, facialLanmarksDetection.debug)
            if '4' in args.debug:
                self.lines(image, gazeEstimation.debug)
            cv2.imshow('Debug Mode (Press \'esc\' to exit)', image)
            cv2.waitKey(50)

        if args.output is not None:
            image = faceDetection.debug[0]
            self.axises(image, headPoseEstimation.debug)
            self.points(image, facialLanmarksDetection.debug)
            self.lines(image, gazeEstimation.debug)

    inputFeeder.close()

    # Only write an output file if at least one frame produced an image.
    if args.output is not None:
        if image is not None:
            cv2.imwrite(args.output, image)
        else:
            logging.warning('No frame was processed; output not written.')

    if not args.silent:
        print('Total loading time of the models: ' + str(loading_time) + ' s')
        # Guard the averages: no completed frame means nothing to divide by.
        if counter > 0 and inference_time > 0:
            print('Average inference time: ' +
                  str(inference_time / counter) + ' s')
            print('Frames per second: ' + str(counter / inference_time))
        else:
            print('No frames were processed.')
def infer_on_stream(args):
    """
    Run the gaze-controlled pointer pipeline on the configured input.

    Loads the four models, then for every frame runs face detection,
    facial landmarks, head pose and gaze estimation, shows a preview window
    and moves the mouse on every fifth frame.  Frames without a detected
    face are skipped.  Pressing Esc stops the loop.

    :param args: parsed command line arguments; reads ``prob_threshold``,
        ``preview_flags``, the four ``*_model`` paths, ``device``,
        ``cpu_extension``, ``type``, ``input``, ``mouse_speed`` and
        ``mouse_prec``.
    :return: None
    """
    # Set Probability threshold for detections
    prob_threshold = args.prob_threshold
    intermediatePreview = args.preview_flags
    device = args.device
    extension = args.cpu_extension
    input_type = args.type.lower()
    input_file = args.input
    speed = args.mouse_speed
    precision = args.mouse_prec

    # model classes initializing
    face_detector = FaceDetectionModel(model_name=args.face_detector_model,
                                       device=device, extensions=extension)
    face_landmark_detector = FacialLandmardDetectionModel(
        model_name=args.facial_landmark_model, device=device,
        extensions=extension)
    head_pose_estimation = HeadPoseEstimationModel(
        model_name=args.head_pose_estimation_model, device=device,
        extensions=extension)
    gaze_estimation = GazeEstimationModel(
        model_name=args.gaze_estimation_model, device=device,
        extensions=extension)

    log.info("Model loading...")
    model_loading = time.time()
    for model in (face_detector, face_landmark_detector,
                  head_pose_estimation, gaze_estimation):
        model.load_model()
    log.info("Models are loaded")
    log.info("Modal Loading Time: {:.3f}ms".format(
        (time.time() - model_loading) * 1000))

    # visual pipeline
    try:
        input_feeder = InputFeeder(input_type, input_file)
        input_feeder.load_data()
    except Exception:
        log.exception("Something went wrong with loading camera/mouse")
        # Non-zero status: this is a failure, not a clean exit.
        exit(1)

    mouse = MouseController(precision, speed)
    show_landmarks = 'fl' in intermediatePreview
    show_head_pose = 'hp' in intermediatePreview
    show_gaze = 'ge' in intermediatePreview

    frames = 0
    for ret, frame in input_feeder.next_batch():
        if not ret:
            break
        frames += 1
        key = cv2.waitKey(60)
        if key == 27:  # Esc
            break

        inf_start = time.time()
        face_coords, face_cropped_image = face_detector.predict(
            frame, prob_threshold)
        # Without a face there are no eyes, no gaze and no mouse target.
        if not face_coords:
            continue

        if show_landmarks:
            eye_coords, left_eye, right_eye, preview_image = \
                face_landmark_detector.predict(face_cropped_image, True)
        else:
            eye_coords, left_eye, right_eye, preview_image = \
                face_landmark_detector.predict(face_cropped_image)

        if show_head_pose:
            head_pose_angles, preview_image = head_pose_estimation.predict(
                face_cropped_image, preview_image)
        else:
            head_pose_angles = head_pose_estimation.predict(
                face_cropped_image)

        if show_gaze:
            mouse_coord, gaze_coord, preview_image = gaze_estimation.predict(
                left_eye, right_eye, head_pose_angles, preview_image)
        else:
            mouse_coord, gaze_coord = gaze_estimation.predict(
                left_eye, right_eye, head_pose_angles)

        if show_gaze:
            # Arrow anchors: each eye box origin shifted by 20 px on both
            # axes; y of the gaze vector is negated before drawing.
            left_anchor = (eye_coords[0][0] + 20, eye_coords[0][1] + 20)
            right_anchor = (eye_coords[1][0] + 20, eye_coords[1][1] + 20)
            gaze_x = int(gaze_coord[0] * 250)
            gaze_y = int(-gaze_coord[1] * 250)
            cv2.arrowedLine(preview_image, left_anchor,
                            (left_anchor[0] + gaze_x,
                             left_anchor[1] + gaze_y), (0, 255, 0), 3)
            cv2.arrowedLine(preview_image, right_anchor,
                            (right_anchor[0] + gaze_x,
                             right_anchor[1] + gaze_y), (0, 255, 0), 3)

        inference_time = time.time() - inf_start
        inf_time_message = "Inf Time Per Frame: {:.3f}ms"\
            .format(inference_time * 1000)
        cv2.putText(preview_image, inf_time_message, (10, 10),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)
        cv2.imshow('frame', cv2.resize(preview_image, (400, 400)))

        # Throttle pointer updates to every fifth frame.
        if frames % 5 == 0:
            mouse.move(mouse_coord[0], mouse_coord[1])

    input_feeder.close()
    cv2.destroyAllWindows()
cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA) cv2.imshow('HP', preview_window3) if 'ge' in args.displayFlags: preview_window4 = cropped_face.copy() x, y, w = int(gaze_vec[0] * 12), int(gaze_vec[1] * 12), 160 left_c = cv2.line(le.copy(), (x - w, y - w), (x + w, y + w), (255, 0, 255), 2) cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2) right_c = cv2.line(re.copy(), (x - w, y - w), (x + w, y + w), (255, 0, 255), 2) cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2) preview_window4[eye_coords[0][1]:eye_coords[0][3], eye_coords[0][0]:eye_coords[0][2]] = left_c preview_window4[eye_coords[1][1]:eye_coords[1][3], eye_coords[1][0]:eye_coords[1][2]] = right_c cv2.imshow('GE', preview_window4) fps = frame_count / inference_time logger.debug("Video ended.") print("Loading time: " + str(model_loading_time) + " s") print("Average inference time: " + str(inference_time / frame_count) + " s") print("FPS : ", format(fps / 5)) cv2.destroyAllWindows() inputFeeder.close()
def main():
    """
    Entry point: load the four models, run them on the input and move the
    mouse from the estimated gaze.

    Shows a side-by-side 'Visualization' window, stops on Esc or at the end
    of the stream, and writes total inference time, fps and model loading
    time (one value per line) to ``stats.txt`` next to this file.
    """
    # command line args
    args = build_argparser().parse_args()
    input_file_path = args.input
    log_object = log.getLogger()
    preview_flags = args.visualization_flag

    # Initialise the classes
    fd_object = FaceDetection(model_name=args.face_detection_model,
                              device=args.device,
                              threshold=args.prob_threshold,
                              extensions=args.cpu_extension)
    fl_object = FacialLandmarkDetection(
        model_name=args.facial_landmarks_model, device=args.device,
        extensions=args.cpu_extension)
    hp_object = HeadPoseEstimation(model_name=args.head_pose_model,
                                   device=args.device,
                                   extensions=args.cpu_extension)
    ge_object = GazeEstimation(model_name=args.gaze_estimation_model,
                               device=args.device,
                               extensions=args.cpu_extension)
    mouse_controller_object = MouseController('low', 'fast')

    # --- Loading the models ---
    # NOTE(review): progress and stats are logged at error level throughout;
    # presumably so they show up without configuring the logger - confirm.
    log_object.error(
        "=================== Models Load Time ====================")
    start_time = time.time()
    fd_object.load_model()
    log_object.error("Face detection model loaded in {:.3f} ms".format(
        (time.time() - start_time) * 1000))

    fl_start = time.time()
    fl_object.load_model()
    log_object.error(
        "Facial landmarks detection model loaded in {:.3f} ms".format(
            (time.time() - fl_start) * 1000))

    hp_start = time.time()
    hp_object.load_model()
    log_object.error("Head pose estimation model loaded in {:.3f} ms".format(
        (time.time() - hp_start) * 1000))

    ge_start = time.time()
    ge_object.load_model()
    log_object.error("Gaze estimation model loaded in {:.3f} ms".format(
        (time.time() - ge_start) * 1000))

    total_time = time.time() - start_time
    log_object.error(
        "=================== Models loaded successfully ===================")
    log_object.error("Total loading time is {:.3f} ms".format(total_time *
                                                              1000))

    counter = 0
    infer_start = time.time()
    log_object.error(
        "=================== Start inferencing on input video ===================="
    )

    if input_file_path == "CAM":
        input_feeder = InputFeeder("cam")
    else:
        if not os.path.isfile(input_file_path):
            # Say why we are quitting instead of exiting silently.
            log_object.error(
                "Unable to find specified video file: {}".format(
                    input_file_path))
            exit(1)
        input_feeder = InputFeeder("video", input_file_path)
    log_object.error("Input feeders are loaded")
    input_feeder.load_data()

    for frame in input_feeder.next_batch():
        # A missing frame means the stream is exhausted.
        if frame is None:
            break
        pressed_key = cv2.waitKey(60)
        counter += 1

        face_coordinates, face_image = fd_object.predict(frame.copy())
        # The detector signals "no face" with 0.
        if face_coordinates == 0:
            continue

        hp_output = hp_object.predict(face_image)
        left_eye_image, right_eye_image, eye_coord = fl_object.predict(
            face_image)
        mouse_coordinate, gaze_vector = ge_object.predict(
            left_eye_image, right_eye_image, hp_output)

        if len(preview_flags) != 0:
            preview_window = frame.copy()

            if 'fd' in preview_flags:
                if len(preview_flags) != 1:
                    # Other overlays are drawn on the face crop.
                    preview_window = face_image
                else:
                    cv2.rectangle(preview_window,
                                  (face_coordinates[0], face_coordinates[1]),
                                  (face_coordinates[2], face_coordinates[3]),
                                  (0, 150, 0), 3)

            if 'fl' in preview_flags:
                if 'fd' not in preview_flags:
                    preview_window = face_image.copy()
                for eye in (eye_coord[0], eye_coord[1]):
                    cv2.rectangle(preview_window, (eye[0], eye[1]),
                                  (eye[2], eye[3]), (150, 0, 150))

            if 'hp' in preview_flags:
                cv2.putText(
                    preview_window,
                    "yaw:{:.1f} | pitch:{:.1f} | roll:{:.1f}".format(
                        hp_output[0], hp_output[1], hp_output[2]), (20, 20),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0, 0, 0), 1)

            if 'ge' in preview_flags:
                yaw = hp_output[0]
                pitch = hp_output[1]
                roll = hp_output[2]
                focal_length = 950.0
                scale = 50
                center_of_face = (face_image.shape[1] / 2,
                                  face_image.shape[0] / 2, 0)
                if 'fd' in preview_flags or 'fl' in preview_flags:
                    draw_axes(preview_window, center_of_face, yaw, pitch,
                              roll, scale, focal_length)
                else:
                    draw_axes(frame, center_of_face, yaw, pitch, roll, scale,
                              focal_length)

            img_hor = np.hstack((cv2.resize(frame, (500, 500)),
                                 cv2.resize(preview_window, (500, 500))))
        else:
            img_hor = cv2.resize(frame, (500, 500))

        cv2.imshow('Visualization', img_hor)
        mouse_controller_object.move(mouse_coordinate[0], mouse_coordinate[1])

        if pressed_key == 27:
            log_object.error("exit key is pressed..")
            break

    infer_time = round(time.time() - infer_start, 1)
    # Very short runs round to 0.0 s; avoid dividing by zero.
    fps = int(counter) / infer_time if infer_time > 0 else 0.0
    log_object.error("counter {} frames".format(counter))
    log_object.error("total inference time {} seconds".format(infer_time))
    log_object.error("fps {} frame/second".format(fps))
    log_object.error("Video session has ended")

    stats_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              'stats.txt')
    with open(stats_path, 'w') as f:
        f.write(str(infer_time) + '\n')
        f.write(str(fps) + '\n')
        f.write(str(total_time) + '\n')

    input_feeder.close()
    cv2.destroyAllWindows()
def main():
    """
    Entry point: validate the inputs, load the four models and drive the
    mouse from the estimated gaze.

    Every fifth frame is shown in a 'video' window and used to move the
    pointer.  The loop stops at the end of the stream or when Esc is
    pressed.  Exits with status 1 if the video file or any model file is
    missing.
    """
    # Grab command line args
    args = build_argparser().parse_args()

    if args.input == "CAM":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(args.input):
            log.error("Unable to find specified video file")
            sys.exit(1)
        inputFeeder = InputFeeder("video", args.input)

    modelPathDict = {
        'FaceDetectionModel': args.face_detection_model,
        'FacialLandmarksDetectionModel': args.facial_landmark_model,
        'GazeEstimationModel': args.gaze_estimation_model,
        'HeadPoseEstimationModel': args.head_pose_model
    }
    for fileNameKey, modelPath in modelPathDict.items():
        if not os.path.isfile(modelPath):
            log.error("Unable to find specified " + fileNameKey +
                      " xml file")
            sys.exit(1)

    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'],
                             args.device, args.cpu_extension)
    fldm = FacialLandmarksDetectionModel(
        modelPathDict['FacialLandmarksDetectionModel'], args.device,
        args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'],
                              args.device, args.cpu_extension)
    hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'],
                                   args.device, args.cpu_extension)
    mc = MouseController('medium', 'fast')

    inputFeeder.load_data()
    fdm.load_model()
    fldm.load_model()
    hpem.load_model()
    gem.load_model()

    frame_count = 0
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        update_due = frame_count % 5 == 0
        if update_due:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        croppedFace, face_coords = fdm.predict(frame.copy(),
                                               args.prob_threshold)
        # The detector returns an int in place of the crop when no face
        # is found.
        if isinstance(croppedFace, int):
            log.info("Unable to detect the face.")
            if key == 27:
                break
            continue

        hp_out = hpem.predict(croppedFace.copy())
        left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy())
        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye,
                                                   hp_out)

        # Throttle pointer updates to every fifth frame.
        if update_due:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:  # Esc
            break

    log.info("VideoStream ended...")
    cv2.destroyAllWindows()
    inputFeeder.close()
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    Only the first detected face of each frame is processed.  Average
    per-frame inference times are logged when the stream ends.  Any error
    is logged with its traceback; the feeder and the OpenCV windows are
    released in every case.

    :param args: Command line arguments parsed by `build_argparser()`.
        Reads ``fdmodel``, ``lmmodel``, ``hpmodel``, ``gemodel``, ``input``,
        ``print``, ``no_video`` and ``no_move``.
    :return: None
    """
    feeder = None
    try:
        logging.basicConfig(level=logging.DEBUG,
                            format="%(asctime)s [%(levelname)s] %(message)s",
                            handlers=[
                                logging.FileHandler("debug.log"),
                                logging.StreamHandler()
                            ])

        # Initialise the classes
        mc = MouseController("low", "fast")
        fdnet = FaceDetectionModel(args.fdmodel)
        lmnet = FacialLandMarksDetectionModel(args.lmmodel)
        hpnet = HeadPoseEstimationModel(args.hpmodel)
        genet = GazeEstimationModel(args.gemodel)

        # Load each model and log how long it took
        start_time = time.time()
        fdnet.load_model()
        logging.info(
            f"Face Detection Model: {1000 * (time.time() - start_time):.1f}ms")

        start_time = time.time()
        lmnet.load_model()
        logging.info(
            f"Facial Landmarks Detection Model: {1000 * (time.time() - start_time):.1f}ms"
        )

        start_time = time.time()
        hpnet.load_model()
        logging.info(
            f"Headpose Estimation Model: {1000 * (time.time() - start_time):.1f}ms"
        )

        start_time = time.time()
        genet.load_model()
        logging.info(
            f"Gaze Estimation Model: {1000 * (time.time() - start_time):.1f}ms"
        )

        # Get and open video capture
        feeder = InputFeeder('video', args.input)
        feeder.load_data()

        frame_count = 0
        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0

        while True:
            # Read the next frame
            try:
                frame = next(feeder.next_batch())
            except StopIteration:
                break
            # A missing frame means the stream is exhausted.
            if frame is None:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1

            # face detection
            p_frame = fdnet.preprocess_input(frame)
            start_time = time.time()
            fd_output = fdnet.predict(p_frame)
            fd_infertime += time.time() - start_time
            out_frame, bboxes = fdnet.preprocess_output(
                fd_output, frame, args.print)

            for bbox in bboxes:
                face = frame[bbox[1]:bbox[3], bbox[0]:bbox[2]]

                # get facial landmarks
                p_frame = lmnet.preprocess_input(face)
                start_time = time.time()
                lm_output = lmnet.predict(p_frame)
                lm_infertime += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = \
                    lmnet.preprocess_output(lm_output, bbox, out_frame,
                                            args.print)

                # get head pose estimation
                p_frame = hpnet.preprocess_input(face)
                start_time = time.time()
                hp_output = hpnet.predict(p_frame)
                hp_infertime += time.time() - start_time
                out_frame, headpose_angles = hpnet.preprocess_output(
                    hp_output, out_frame, face, bbox, args.print)

                # get gaze estimation
                out_frame, left_eye, right_eye = genet.preprocess_input(
                    out_frame, face, left_eye_point, right_eye_point,
                    args.print)
                start_time = time.time()
                ge_output = genet.predict(left_eye, right_eye,
                                          headpose_angles)
                ge_infertime += time.time() - start_time
                out_frame, gaze_vector = genet.preprocess_output(
                    ge_output, out_frame, bbox, left_eye_point,
                    right_eye_point, args.print)

                if not args.no_video:
                    cv2.imshow('image', out_frame)
                if not args.no_move:
                    mc.move(gaze_vector[0], gaze_vector[1])
                # Only the first face drives the pointer.
                break

            if key_pressed == 27:  # Esc
                break

        if frame_count > 0:
            logging.info(
                f"Face Detection:{1000* fd_infertime/frame_count:.1f}ms")
            logging.info(
                f"Facial Landmarks Detection:{1000* lm_infertime/frame_count:.1f}ms"
            )
            logging.info(
                f"Headpose Estimation:{1000* hp_infertime/frame_count:.1f}ms")
            logging.info(
                f"Gaze Estimation:{1000* ge_infertime/frame_count:.1f}ms")
    except Exception as ex:
        logging.exception(f"Error during inference:{str(ex)}")
    finally:
        # Release the capture and windows even when inference failed.
        if feeder is not None:
            feeder.close()
        cv2.destroyAllWindows()
def main():
    """
    Entry point: validate inputs, run the gaze pipeline on every frame,
    show a preview, record the frames to ``output_video.mp4`` and move the
    mouse on every fifth frame.

    The model objects (``face_model``, ``landmark_model``,
    ``head_pose_model``, ``gaze_model``) and ``mouse_controller`` are read
    from module scope; presumably ``init_model(args)`` sets them up -
    confirm against its definition.
    """
    args = build_argparser().parse_args()
    logger = logging.getLogger('main')
    logging.basicConfig(filename='example.log', level=logging.ERROR)

    init_model(args)

    # Initialize variables with the input arguments for easy access
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'LandmarkRegressionModel': args.landmarkRegressionModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }
    preview_flags = args.previewFlags
    input_filename = args.input
    prob_threshold = args.prob_threshold

    if input_filename.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)

    for model_path in model_path_dict.values():
        if not os.path.isfile(model_path):
            logger.error("Unable to find specified model file" +
                         str(model_path))
            exit(1)

    feeder.load_data()

    frame_width = int(feeder.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(feeder.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(feeder.cap.get(cv2.CAP_PROP_FPS))
    # NOTE(review): the writer path ignores args.output_path - confirm
    # whether the video should be written there instead.
    out_video = cv2.VideoWriter(os.path.join('output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'), fps,
                                (frame_width, frame_height), True)

    frame_count = 0
    try:
        for ret, frame in feeder.next_batch():
            if not ret:
                break
            frame_count += 1
            key = cv2.waitKey(60)

            try:
                cropped_image, face_cords = face_model.predict(
                    frame, prob_threshold)
                # The detector returns an int in place of the crop when
                # no face is found.
                if isinstance(cropped_image, int):
                    print("Unable to detect the face")
                    if key == 27:
                        break
                    continue
                left_eye, right_eye, eye_cords = landmark_model.predict(
                    cropped_image)
                pose_output = head_pose_model.predict(cropped_image)
                mouse_cord, gaze_vector = gaze_model.predict(
                    left_eye, right_eye, pose_output)
            except Exception as e:
                # Best effort: report the failing frame and keep going.
                print(str(e) + " for frame " + str(frame_count))
                continue

            image = cv2.resize(frame, (frame_width, frame_height))

            if not len(preview_flags) == 0:
                preview_frame = frame.copy()
                const = 10  # padding added around each eye box

                if 'ff' in preview_flags:
                    if len(preview_flags) != 1:
                        preview_frame = cropped_image
                    cv2.rectangle(frame, (face_cords[0], face_cords[1]),
                                  (face_cords[2], face_cords[3]),
                                  (255, 0, 0), 3)

                if 'fl' in preview_flags:
                    for eye in (eye_cords[0], eye_cords[1]):
                        cv2.rectangle(cropped_image,
                                      (eye[0] - const, eye[1] - const),
                                      (eye[2] + const, eye[3] + const),
                                      (0, 255, 0), 2)

                if 'fh' in preview_flags:
                    cv2.putText(
                        frame,
                        "Pose Angles: yaw= {:.2f} , pitch= {:.2f} , roll= {:.2f}".
                        format(pose_output[0], pose_output[1],
                               pose_output[2]), (20, 40),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 255), 2)

                if 'fg' in preview_flags:
                    # `span` must not reuse the frame-width name: rebinding
                    # it here would corrupt the resize size on later frames.
                    x = int(gaze_vector[0] * 12)
                    y = int(gaze_vector[1] * 12)
                    span = 160
                    le = cv2.line(left_eye.copy(), (x - span, y - span),
                                  (x + span, y + span), (255, 0, 255), 2)
                    cv2.arrowedLine(le, (x - span, y + span),
                                    (x + span, y - span), (255, 0, 255), 2)
                    re = cv2.line(right_eye.copy(), (x - span, y - span),
                                  (x + span, y + span), (255, 0, 255), 2)
                    cv2.arrowedLine(re, (x - span, y + span),
                                    (x + span, y - span), (255, 0, 255), 2)
                    preview_frame[eye_cords[0][1]:eye_cords[0][3],
                                  eye_cords[0][0]:eye_cords[0][2]] = le
                    preview_frame[eye_cords[1][1]:eye_cords[1][3],
                                  eye_cords[1][0]:eye_cords[1][2]] = re

                image = np.hstack((cv2.resize(frame, (500, 500)),
                                   cv2.resize(preview_frame, (500, 500))))

            cv2.imshow('preview', image)
            out_video.write(frame)

            # Throttle pointer updates to every fifth frame.
            if frame_count % 5 == 0:
                mouse_controller.move(mouse_cord[0], mouse_cord[1])

            if key == 27:  # Esc
                break

        logger.info('Video stream ended')
    finally:
        # Release the writer so the output file is finalised, then the
        # windows and the capture.
        out_video.release()
        cv2.destroyAllWindows()
        feeder.close()
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    The input type is derived from ``args.input`` ('CAM', an image file or a
    video file). For every frame the head with index 0 in the detector output
    is used for head-pose, facial-landmark and gaze estimation, the pointer is
    moved unless ``args.disable_pointer_controller`` is set, and the frame is
    displayed. Pressing ESC or 'q' stops the loop.

    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    # --- INPUT ---
    if args.input == 'CAM':
        input_type = 'cam'
    elif args.input.endswith(('.jpg', '.bmp', '.png')):
        input_type = 'image'
    elif args.input.endswith(('.mp4', '.avi')):
        input_type = 'video'
    else:
        sys.exit(
            f"[ ERRO ] The format of the input file '{args.input}' is not supported."
        )

    # Initialize the InputFeeder
    input_feeder = InputFeeder(input_type, args.input)
    input_feeder.load_data()

    # --- MODELS ---
    # Load the Face Detection Model
    face_detection_model = FaceDetectionModel(
        model_xml_path=args.model_face_detection,
        device=args.device,
        extensions_path=args.cpu_extension,
    )
    face_detection_model.load_model()

    # Load the Head Pose Estimation Model
    head_pose_estimation_model = HeadPoseEstimationModel(
        model_xml_path=args.model_head_pose,
        device=args.device,
        extensions_path=args.cpu_extension,
    )
    head_pose_estimation_model.load_model()

    # Load the Facial Landmarks Detection Model
    facial_landmarks_detection_model = FacialLandmarksDetectionModel(
        model_xml_path=args.model_face_landmark,
        device=args.device,
        extensions_path=args.cpu_extension,
    )
    facial_landmarks_detection_model.load_model()

    # Load the Gaze Estimation Model
    gaze_estimation_model = GazeEstimationModel(
        model_xml_path=args.model_gaze_estimation,
        device=args.device,
        extensions_path=args.cpu_extension,
    )
    gaze_estimation_model.load_model()

    # --- POINTER CONTROLLER ---
    pointer_controller = MouseController(
        precision='medium',
        speed='medium',
    )

    # --- WINDOW ---
    # Set the window to fullscreen
    # cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
    # cv2.setWindowProperty(WINDOW_NAME, cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)

    # cv2.waitKey returns an int key code, so compare against codes.
    exit_keys = (ord('q'), 27)

    # Initialize list to track the inference time
    list_inference_time = []

    try:
        # Loop until stream is over
        for frame in input_feeder.next_batch():
            # If there is no frame break the loop
            if frame is None:
                break

            # start the timer
            start_time = time.time()

            # Initialize the frame to be displayed
            display_frame = frame

            # --- DETECT HEAD ---
            list_heads = face_detection_model.predict(frame)

            # Without a detected head there is nothing to estimate; show the
            # raw frame and keep honouring the exit keys.
            if len(list_heads) == 0:
                cv2.imshow(WINDOW_NAME, display_frame)
                if cv2.waitKey(30) & 0xFF in exit_keys:
                    break
                continue

            # Draw the outputs of the head detection algorithm
            if args.display_outputs:
                display_frame = face_detection_model.display_output(
                    frame, list_heads)

            # --- HEAD POSE ESTIMATION ---
            # NOTE(review): the original comment says index 0 is the head
            # with the highest confidence score -- confirm in the detector.
            head = list_heads[0]
            head_x_max = head.x + head.w
            head_y_max = head.y + head.h
            head_roi = frame[head.y:head_y_max, head.x:head_x_max, :]

            # Estimate the pose of the best head
            head_angles = head_pose_estimation_model.predict(head_roi)

            # Draw the pose of the best head
            if args.display_outputs:
                display_head_pose = head_pose_estimation_model.display_output(
                    head_roi, head_angles)
                display_frame[head.y:head_y_max,
                              head.x:head_x_max, :] = display_head_pose

            # --- FACIAL LANDMARKS DETECTION ---
            face_landmarks = facial_landmarks_detection_model.predict(
                head_roi)

            # Draw the facial landmarks of the best head
            if args.display_outputs:
                # Set display_name to True to display the name of the landmarks
                display_facial_landmarks = facial_landmarks_detection_model.display_output(
                    display_head_pose, face_landmarks, display_name=True)
                display_frame[head.y:head_y_max,
                              head.x:head_x_max, :] = display_facial_landmarks

            # --- GAZE ESTIMATION ---
            # The eye ROI is a third of the head ROI width.
            eye_roi_size = int(head_roi.shape[1] / 3)

            # Extract the roi of the left eye
            left_eye_roi, left_eye_bbox = extract_landmark_roi(
                name='left_eye',
                landmarks=face_landmarks,
                roi_size=eye_roi_size,
                image=frame,
                origin_x=head.x,
                origin_y=head.y,
            )

            # Extract the roi of the right eye
            right_eye_roi, right_eye_bbox = extract_landmark_roi(
                name='right_eye',
                landmarks=face_landmarks,
                roi_size=eye_roi_size,
                image=frame,
                origin_x=head.x,
                origin_y=head.y,
            )

            # Predict the gaze
            gaze_vector = gaze_estimation_model.predict(
                left_eye_image=left_eye_roi,
                right_eye_image=right_eye_roi,
                head_angles=head_angles,
            )

            # normalize the gaze vector based on the left eye
            left_eye_x_center = left_eye_bbox.x + int(left_eye_bbox.w / 2)
            left_eye_y_center = left_eye_bbox.y + int(left_eye_bbox.h / 2)
            start_vector = np.array([left_eye_x_center, left_eye_y_center, 0])
            end_vector = np.array([
                left_eye_x_center + gaze_vector.x,
                left_eye_y_center - gaze_vector.y, 0 + gaze_vector.z
            ])
            vector = end_vector - start_vector
            norm_gaze_vector = vector / np.sqrt(np.dot(vector, vector))

            # Draw the gaze output and the eyes ROI
            if args.display_outputs:
                # draw the bbox around each eye
                display_frame = face_detection_model.display_output(
                    display_frame,
                    [left_eye_bbox, right_eye_bbox],
                    color=(255, 255, 255),
                    display_conf=False,
                )
                # draw the gaze from both eyes
                display_frame = gaze_estimation_model.display_output(
                    display_frame,
                    norm_gaze_vector,
                    [left_eye_bbox, right_eye_bbox],
                )

            # Update position of the Computer Pointer
            if not args.disable_pointer_controller:
                pointer_controller.move(gaze_vector.x, gaze_vector.y)

            # Calculate the inference time
            stop_time = time.time()
            elapsed = stop_time - start_time
            list_inference_time.append(elapsed)

            # Calculate and print the FPS (guard against a zero-length
            # interval on coarse clocks)
            fps = round(1 / elapsed, 2) if elapsed > 0 else 0.0
            cv2.rectangle(display_frame, (10, 2), (120, 20), (255, 255, 255),
                          -1)
            cv2.putText(display_frame, f"{fps} FPS", (15, 15),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))

            # Display the frame
            cv2.imshow(WINDOW_NAME, display_frame)

            # Wait for 'ESC' or 'q' to exit the program
            if cv2.waitKey(30) & 0xFF in exit_keys:
                break
    finally:
        # Release the input feeder and destroy any OpenCV windows
        input_feeder.close()
        cv2.destroyAllWindows()

    # Display the average inference time and fps; skipped when no frame was
    # processed, since there is nothing to average.
    if list_inference_time:
        average_time = mean(list_inference_time)
        if average_time > 0:
            average_fps = round(1 / average_time, 2)
        else:
            average_fps = 0.0
        print(
            f"[ INFO ] Average inference time was {average_time}s ({average_fps} FPS)."
        )
    print("[ INFO ] Successfully exited the program.")
def main(args):
    """Run face, landmark and head-pose models on the selected input feed.

    The feeder is chosen from ``args.input_type`` (video/image file, IP
    camera URL starting with ``http://``, or webcam). For each frame the face
    is cropped and shown, then landmarks and head pose are predicted. The
    gaze stage and the debug overlays below it are currently switched off:
    no gaze model is created, so every frame stops after head-pose
    estimation. Press 'q' to quit.

    :param args: parsed command line arguments
    :return: None
    """
    logger = logging.getLogger()
    feeder = None
    if args.input_type in (constants.VIDEO, constants.IMAGE):
        # Extension and file-existence validation is disabled here; the
        # feeder is created without checking the path.
        feeder = InputFeeder(args.input_type, args.input)
    elif args.input_type == constants.IP_CAMERA:
        if not str(args.input).startswith('http://'):
            logger.error('Please provide ip of server with http://')
            exit(1)
        feeder = InputFeeder(args.input_type, args.input)
    elif args.input_type == constants.WEBCAM:
        feeder = InputFeeder(args.input_type)

    if feeder is None:
        # Fail with a clear message instead of an AttributeError further down.
        logger.error('Unsupported input type: ' + str(args.input_type))
        exit(1)

    # Only used by the (currently commented-out) cursor update below.
    mc = MouseController("medium", "fast")

    feeder.load_data()

    face_model = Face_Model(args.face, args.device, args.cpu_extension)
    face_model.check_model()

    landmark_model = Landmark_Model(args.landmarks, args.device,
                                    args.cpu_extension)
    landmark_model.check_model()

    # gaze_model = Gaze_Estimation_Model(args.gazeestimation, args.device, args.cpu_extension)
    # gaze_model.check_model()

    head_model = Head_Pose_Model(args.headpose, args.device,
                                 args.cpu_extension)
    head_model.check_model()

    face_model.load_model()
    logger.info("Face Detection Model Loaded...")
    landmark_model.load_model()
    logger.info("Landmark Detection Model Loaded...")
    # gaze_model.load_model()
    # logger.info("Gaze Estimation Model Loaded...")
    head_model.load_model()
    logger.info("Head Pose Detection Model Loaded...")
    print('Loaded')

    # The gaze model is not created above, so the gaze stage and the debug
    # rendering that depends on it must stay disabled.
    gaze_stage_enabled = False

    try:
        frame_count = 0
        for ret, frame in feeder.next_batch():
            if not ret:
                break
            if frame is None:
                continue

            frame_count += 1

            crop_face, box = face_model.predict(frame.copy())
            if crop_face is None:
                logger.error("Unable to detect the face.")
                continue
            imshow('frame', crop_face, width=400)

            (lefteye_x, lefteye_y), (
                righteye_x, righteye_y
            ), eye_coords, left_eye, right_eye = landmark_model.predict(
                crop_face.copy(), eye_surrounding_area=15)
            # imshow("left_eye", left_eye, width=100)
            # imshow("right_eye", right_eye, width=100)
            # TODO dlib is better to crop eye with perfection

            head_position = head_model.predict(crop_face.copy())

            if not gaze_stage_enabled:
                if cv2.waitKey(20) & 0xFF == ord('q'):
                    break
                continue

            # ---- everything below only runs once the gaze stage is enabled ----
            gaze, (mousex, mousey) = gaze_model.predict(
                left_eye.copy(), right_eye.copy(), head_position)

            if len(args.debug) > 0:
                debuFrame = frame.copy()
                if crop_face is None:
                    continue

                thickness = 2
                radius = 2
                color = (0, 0, 255)
                [[le_xmin, le_ymin, le_xmax, le_ymax],
                 [re_xmin, re_ymin, re_xmax, re_ymax]] = eye_coords

                if 'face' in args.debug:
                    cv2.rectangle(debuFrame, (box[0], box[1]),
                                  (box[2], box[3]), (255, 255, 255), 2)
                    cv2.rectangle(crop_face, (re_xmin, re_ymin),
                                  (re_xmax, re_ymax), (100, 255, 100), 2)
                    cv2.rectangle(crop_face, (le_xmin, le_ymin),
                                  (le_xmax, le_ymax), (100, 255, 100), 2)
                    # LandMark
                    cv2.circle(crop_face, (lefteye_x, lefteye_y), radius,
                               color, thickness)
                    cv2.circle(crop_face, (righteye_x, righteye_y), radius,
                               color, thickness)
                    debuFrame[box[1]:box[3], box[0]:box[2]] = crop_face

                if 'headpose' in args.debug:
                    yaw = head_position[0]
                    pitch = head_position[1]
                    roll = head_position[2]

                    # The trigonometric terms feed the axis drawing that is
                    # commented out below.
                    sinY = math.sin(yaw * math.pi / 180.0)
                    sinP = math.sin(pitch * math.pi / 180.0)
                    sinR = math.sin(roll * math.pi / 180.0)

                    cosY = math.cos(yaw * math.pi / 180.0)
                    cosP = math.cos(pitch * math.pi / 180.0)
                    cosR = math.cos(roll * math.pi / 180.0)

                    cH, cW = crop_face.shape[:2]
                    arrowLength = 0.4 * cH * cW

                    xCenter = int(cW / 2)
                    yCenter = int(cH / 2)

                    # center to right
                    # cv2.line(crop_face, (xCenter, yCenter),
                    #          (int((xCenter + arrowLength * (cosR * cosY + sinY * sinP * sinR))),
                    #           int((yCenter + arrowLength * cosP * sinR))), (186, 204, 2), 1)
                    #
                    # # center to top
                    # cv2.line(crop_face, (xCenter, yCenter),
                    #          (int(((xCenter + arrowLength * (cosR * sinY * sinP + cosY * sinR)))),
                    #           int((yCenter - arrowLength * cosP * cosR))), (186, 204, 2), 1)
                    #
                    # # center to forward
                    # cv2.line(crop_face, (xCenter, yCenter),
                    #          (int(((xCenter + arrowLength * sinY * cosP))),
                    #           int((yCenter + arrowLength * sinP))), (186, 204, 2), 1)

                    cv2.putText(
                        crop_face,
                        'head pose: (y={:.2f}, p={:.2f}, r={:.2f})'.format(
                            yaw, pitch, roll), (0, 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.35, (255, 255, 255), 1)

                if 'gaze' in args.debug:
                    cH, cW = crop_face.shape[:2]
                    arrowLength = 0.6 * cH

                    gazeArrowX = gaze[0] * arrowLength
                    gazeArrowY = -gaze[1] * arrowLength

                    debuFrame[box[1]:box[3], box[0]:box[2]] = crop_face

                    cv2.arrowedLine(crop_face, (lefteye_x, lefteye_y),
                                    (int(lefteye_x + gazeArrowX),
                                     int(lefteye_y + gazeArrowY)),
                                    (184, 113, 57), 2)
                    cv2.arrowedLine(crop_face, (righteye_x, righteye_y),
                                    (int(righteye_x + gazeArrowX),
                                     int(righteye_y + gazeArrowY)),
                                    (184, 113, 57), 2)
                    cv2.putText(crop_face,
                                'gaze angles: h={}, v={}'.format("!", "2"),
                                (0, 10), cv2.FONT_HERSHEY_SIMPLEX, 0.35,
                                (255, 255, 255), 1)

                    debuFrame[box[1]:box[3], box[0]:box[2]] = crop_face

                # imshow("face", crop_face, width=400)
                # cv2.moveWindow("face", 0, 0)
                # imshow("debug", debuFrame, width=400)
                # cv2.moveWindow("debug", cW * 2, cH)
                # try:
                #     if frame_count % 5 == 0:
                #         mc.move(mousex, mousey)
                # except Exception as err:
                #     logger.error("Moving cursor outside the PC not supported yet !!")
                # key = cv2.waitKey(60)
                imshow('frame', debuFrame, width=1210)

            if cv2.waitKey(20) & 0xFF == ord('q'):
                break
    except Exception as err:
        # Top-level boundary: report the failure, then clean up below.
        logger.error(err)
    finally:
        cv2.destroyAllWindows()
        feeder.close()
def main_benchmark(args):
    """Time model loading and one full prediction pass, then plot the results.

    Each of the four models is loaded and its load time recorded. Frames are
    then pulled from the feeder until one passes through all four models
    without raising; the load, per-stage wall-clock and model-reported
    inference times for that frame are passed to the three graphing helpers
    and the loop stops.

    :param args: parsed command line arguments (``it``, ``i``, ``fm``,
        ``lm``, ``hpm``, ``gem``, ``d``, ``c``, ``p``, ``bm``)
    :return: None
    """
    feed = InputFeeder(input_type=args.it, input_file=args.i)

    face_model = FaceDetectionModel(args.fm, args.d, args.c, float(args.p))
    start_time = time.time()
    face_model.load_model()
    face_load_model_time = time.time() - start_time

    landmarks_model = LandmarksDetectionModel(args.lm, args.d, args.c)
    start_time = time.time()
    landmarks_model.load_model()
    landmarks_model_time = time.time() - start_time

    headpose_model = HeadPoseDetectionModel(args.hpm, args.d, args.c)
    start_time = time.time()
    headpose_model.load_model()
    headpose_model_time = time.time() - start_time

    gaze_model = GazeEstimationModel(args.gem, args.d, args.c)
    start_time = time.time()
    gaze_model.load_model()
    gaze_model_time = time.time() - start_time

    feed.load_data()
    try:
        for batch in feed.next_batch():
            try:
                start_time = time.time()
                cropped_face, coords, face_time_prediction = face_model.predict(
                    batch)
                cv2.rectangle(batch, (coords[0], coords[1]),
                              (coords[2], coords[3]), (255, 0, 0), 2)
                io_face_model_time = time.time() - start_time

                start_time = time.time()
                left_eye, right_eye, eyes_coords, landmarks_time_prediction = landmarks_model.predict(
                    cropped_face)
                io_landmarks_model_time = time.time() - start_time

                start_time = time.time()
                head_pose_angles, headpose_time_prediction = headpose_model.predict(
                    cropped_face)
                io_head_pose_model_time = time.time() - start_time

                start_time = time.time()
                x, y, z, gaze_time_prediction = gaze_model.predict(
                    left_eye, right_eye, head_pose_angles, cropped_face,
                    eyes_coords)
                io_gaze_model_time = time.time() - start_time

                print("Graphing loading time...")
                graph_loading_time(face_load_model_time, landmarks_model_time,
                                   headpose_model_time, gaze_model_time,
                                   args.bm)
                print("Graphing io processing time...")
                graph_io_processing_time(io_face_model_time,
                                         io_landmarks_model_time,
                                         io_head_pose_model_time,
                                         io_gaze_model_time, args.bm)
                print("Graphing inference time...")
                graph_model_inference_time(face_time_prediction,
                                           landmarks_time_prediction,
                                           headpose_time_prediction,
                                           gaze_time_prediction, args.bm)
                print("Done")
                # One successfully measured frame is all that is needed.
                break
            except Exception as err:
                # Try the next frame, but let Ctrl-C / SystemExit propagate.
                print("Frame without prediction. Error: ", type(err))
                log.error(type(err))
    finally:
        feed.close()
def main():
    """Run the gaze pointer pipeline and write timing statistics to disk.

    ``--device`` is a comma-separated list read in the order face detection,
    facial landmarks, gaze estimation, head pose. When fewer than four
    devices are given, the last one is reused for the remaining models.
    After the stream ends (or ESC is pressed), total inference time, FPS and
    model load times are written to ``statstotal.txt`` and
    ``statsmodels.txt`` under ``args.output_path``.
    """
    # command line arguments
    args = build_argparser().parse_args()
    input_filename = args.input
    log_object = log.getLogger()
    visual_flags = args.visualization_flag
    print("Visual flags:", visual_flags)
    device_models = args.device
    print("deviceModels:", device_models)
    device_list = device_models.split(",")
    print("deviceList:", device_list)
    print("deviceFirst:", device_list[0])
    # One device per model; pad with the last given device so a single value
    # such as "CPU" applies to every model.
    device_list += [device_list[-1]] * (4 - len(device_list))
    str_cam = "cam"
    output_path = args.output_path

    if input_filename.lower() == str_cam:
        input_feeder = InputFeeder(str_cam)
    else:
        if not os.path.isfile(input_filename):
            log_object.error("Error: Can not find the video o image file.")
            exit(1)
        input_feeder = InputFeeder("video", input_filename)

    obj_face_detection = Face_detection_model(
        model_name=args.face_detection_model,
        device=device_list[0],
        threshold=args.prob_threshold,
        extensions=args.cpu_extension)
    obj_facial_landmarks = Facial_landmarks_detection_model(
        model_name=args.facial_landmarks_model,
        device=device_list[1],
        extensions=args.cpu_extension)
    obj_gaze_estimation = Gaze_estimation_model(
        model_name=args.gaze_estimation_model,
        device=device_list[2],
        extensions=args.cpu_extension)
    obj_head_pose_estimation = Head_pose_estimation_model(
        model_name=args.head_pose_model,
        device=device_list[3],
        extensions=args.cpu_extension)
    mouse_controller_object = MouseController('medium', 'fast')

    start_time = time.time()
    obj_face_detection.load_model()
    model_load_time_face = time.time() - start_time

    start_landmark_time = time.time()
    obj_facial_landmarks.load_model()
    model_load_time_landmarks = time.time() - start_landmark_time

    start_headpose_time = time.time()
    obj_head_pose_estimation.load_model()
    model_load_time_headpose = time.time() - start_headpose_time

    start_gaze_time = time.time()
    obj_gaze_estimation.load_model()
    model_load_time_gaze = time.time() - start_gaze_time

    models_load_time = time.time() - start_time
    log_object.info(
        "Info:Models loading time(face, landmark, gaze, head_pose): {:.3f} ms".
        format(models_load_time * 1000))

    input_feeder.load_data()
    counter = 0
    start_inference_time = time.time()
    log_object.info("Info:Start inferencing ")

    try:
        for ret, frame in input_feeder.next_batch():
            if not ret:
                break
            pressed_key = cv2.waitKey(60)
            counter += 1
            print("counter:", counter)

            # All "inference_*_time" values are cumulative seconds since
            # start_inference_time, rounded to 0.1 s.
            first_coords, image_change = obj_face_detection.predict(frame)
            inference_face_time = round(time.time() - start_inference_time, 1)
            print("Inference face time:", inference_face_time)

            # The face model returns 0 when no face was found; skip the frame
            # before handing the missing crop to the other models.
            if first_coords == 0:
                if pressed_key == 27:
                    log_object.error("exit key is pressed..")
                    break
                continue

            left_eye_img, right_eye_img, eye_coord = obj_facial_landmarks.predict(
                image_change)
            inference_landmark_time = round(
                time.time() - start_inference_time, 1)
            print("Inference landmark time:", inference_landmark_time)

            output_head_pose_estimation = obj_head_pose_estimation.predict(
                image_change)
            inference_head_time = round(time.time() - start_inference_time, 1)
            print("Inference inference_head_time:", inference_head_time)

            mouse_coordinate, gaze_vector = obj_gaze_estimation.predict(
                left_eye_img, right_eye_img, output_head_pose_estimation)
            inference_gaze_time = round(time.time() - start_inference_time, 1)
            print("Inference inference_gaze_time:", inference_gaze_time)

            frame_image = frame.copy()
            if len(visual_flags) != 0:
                preview = process_visual_flags(frame_image, visual_flags,
                                               frame, image_change,
                                               first_coords, eye_coord,
                                               output_head_pose_estimation)
            else:
                preview = frame_image

            # The rounded elapsed time can be 0.0 on the first frames.
            if inference_gaze_time > 0:
                fps_face = counter / inference_gaze_time
            else:
                fps_face = 0.0
            color = (0, 255, 0)
            cv2.putText(frame_image,
                        "Inference: = {:.2f}".format(inference_gaze_time),
                        (20, 180), cv2.FONT_HERSHEY_COMPLEX, 1, color, 2)
            cv2.putText(frame_image, "FPS: = {:.2f}".format(fps_face),
                        (20, 220), cv2.FONT_HERSHEY_COMPLEX, 1, color, 2)

            image_new = cv2.resize(preview, (700, 700))
            cv2.imshow('Visualization', image_new)

            # Move the pointer once per frame.
            mouse_controller_object.move(mouse_coordinate[0],
                                         mouse_coordinate[1])

            if pressed_key == 27:
                log_object.error("exit key is pressed..")
                break

        # Overall statistics; per-model inference splits are not computed.
        inference_total_time = round(time.time() - start_inference_time, 1)
        print("Inference inference_total_time:", inference_total_time)
        if inference_total_time > 0:
            fps_total = counter / inference_total_time
        else:
            fps_total = 0.0
        print("fps_total:", fps_total)

        with open(os.path.join(output_path, 'statstotal.txt'), 'w') as f:
            f.write(str(inference_total_time) + '\n')
            f.write(str(fps_total) + '\n')
            f.write(str(models_load_time) + '\n')

        with open(os.path.join(output_path, 'statsmodels.txt'), 'w') as f:
            f.write(str(model_load_time_face) + '\n')
            f.write(str(model_load_time_landmarks) + '\n')
            f.write(str(model_load_time_headpose) + '\n')
            f.write(str(model_load_time_gaze) + '\n')

        log_object.info("Info:Finishing Video")
    finally:
        input_feeder.close()
        cv2.destroyAllWindows()
def main():
    """Run the four-model gaze pipeline and move the pointer every 5th frame.

    Loads the face, landmark, gaze and head-pose models, opens the webcam
    (``--input cam``) or a video file, and for every frame predicts the gaze
    vector, shows a 500x500 visualization (side by side with the face crop
    when ``args.visualization`` is set) and prints the per-frame inference
    time. The loop stops on the first missing frame or when ESC is pressed.

    :raises FileNotFoundError: if the input is not "cam" and the file does
        not exist.
    """
    args = build_argparser().parse_args()

    # Initialize the Inference Engine
    fd = FaceDetection()
    ld = Facial_Landmarks_Detection()
    ge = gazeEstimation()
    hp = headPose()

    modelStart = time.time()
    # Load Models
    fd.loadModel(args.faceDetectionModel, args.device)
    ld.loadModel(args.faceLandmarkModel, args.device)
    ge.loadModel(args.gazeEstimationModel, args.device)
    hp.loadModel(args.headPoseModel, args.device)
    print("Model Load timing:", (time.time() - modelStart) * 1000, "ms")

    # Get the input feeder
    if args.input == "cam":
        feed = InputFeeder("cam")
    else:
        # Explicit check: an assert would be stripped under `python -O`.
        if not os.path.isfile(args.input):
            raise FileNotFoundError("Specified input file doesn't exist")
        feed = InputFeeder("video", args.input)
    feed.load_data()

    # Mouse Controller precision and speed
    mc = MouseController('medium', 'fast')

    preview = args.visualization
    frameCount = 0  # frames pulled from the feeder; paces pointer updates
    frameNum = 0  # frames fully processed; printed for the user

    try:
        for frame in feed.next_batch():
            frameCount += 1
            # A missing frame means the stream has nothing more to give;
            # stop instead of spinning on empty frames.
            if frame is None:
                break

            key = cv2.waitKey(60)

            inferenceStart = time.time()
            # make predictions
            detected_face, faceCoords = fd.predict(frame.copy(),
                                                   args.prob_threshold)
            hpOutput = hp.predict(detected_face.copy())
            leftEye, rightEye, eyeCoords = ld.predict(detected_face.copy())
            new_mouse_coord, gazeVector = ge.predict(leftEye, rightEye,
                                                     hpOutput)
            inferenceTime = time.time() - inferenceStart

            # Visualization
            if preview:
                preview_frame = frame.copy()
                faceFrame = detected_face.copy()

                drawFaceBoundingBox(preview_frame, faceCoords)
                displayHp(preview_frame, hpOutput, faceCoords)
                draw_landmarks(faceFrame, eyeCoords)
                draw_gaze(faceFrame, gazeVector, leftEye.copy(),
                          rightEye.copy(), eyeCoords)

                img = np.hstack((cv2.resize(preview_frame, (500, 500)),
                                 cv2.resize(faceFrame, (500, 500))))
            else:
                img = cv2.resize(frame, (500, 500))

            cv2.imshow('Visualization', img)

            # set speed
            if frameCount % 5 == 0:
                mc.move(new_mouse_coord[0], new_mouse_coord[1])

            print("Frame Number:", frameNum)
            print("Inference Time:", inferenceTime * 1000)

            frameNum += 1

            if key == 27:
                break
    finally:
        feed.close()
        cv2.destroyAllWindows()
def main():
    """Control the mouse pointer from gaze, with optional visual overlays.

    ``args.input_source`` selects 'cam' or 'video' (the latter needs an
    existing ``args.input_path``). For each frame the face, head pose,
    facial landmarks and gaze are predicted; the overlays named in
    ``args.visual_flag`` ('fd', 'hp', 'fl', 'ga') are rendered into the
    'Visualization' window; the raw frame is shown and the pointer moved on
    every fifth frame. ESC ends the loop.
    """
    args = build_argparser().parse_args()
    visual = args.visual_flag
    log = logging.getLogger()
    input_source = args.input_source
    # The path argument may be absent when the webcam is used.
    video_path = getattr(args, 'input_path', None)

    feed = None
    if input_source.lower() == 'cam':
        feed = InputFeeder('cam')
    elif (input_source.lower() == 'video' and video_path is not None
          and os.path.isfile(video_path)):
        feed = InputFeeder('video', video_path)
    else:
        log.error('Wrong input feed. (check the video path).')
        exit(1)

    fd = Model_Face(args.face_detection_model, args.device, args.extension)
    hp = Model_HeadPose(args.head_pose_model, args.device, args.extension)
    fl = Model_Faciallandmark(args.facial_landmarks_model, args.device,
                              args.extension)
    ga = Model_Gaze(args.gaze_model, args.device, args.extension)

    ### You can specify the value of precision and speed directly.
    ##  OR
    ##  'high'(100),'low'(1000),'medium','low-med' - precision
    ##  'fast'(1), 'slow'(10), 'medium', 'slow-med' - speed
    # mouse = MouseController('low-med', 'slow-med')
    mouse = MouseController(500, 4)

    feed.load_data()

    # load models
    fd.load_model()
    hp.load_model()
    fl.load_model()
    ga.load_model()

    count = 0
    try:
        for ret, frame in feed.next_batch():
            if not ret:
                break
            count += 1
            if count % 5 == 0:
                cv2.imshow('video', cv2.resize(frame, (500, 500)))

            key = cv2.waitKey(60)
            frame_cp = frame.copy()
            face, face_position = fd.predict(frame_cp, args.threshold)

            # The face model signals "no face" by returning an int.
            if isinstance(face, int):
                log.error('Prediction Error: Cant find face.')
                if key == 27:
                    break
                continue

            face_cp = face.copy()
            hp_output = hp.predict(face_cp)
            left_eye, right_eye, facial = fl.predict(face_cp)
            # print('left',left_eye,'\n','right',right_eye,'\n')
            mouse_coord, gaze_vector = ga.predict(left_eye, right_eye,
                                                  hp_output)

            if len(visual) != 0:
                visual_frame = frame.copy()
                ### Visual FLAGS
                # face detection
                if 'fd' in visual:
                    visual_frame = face

                # Head pose
                if 'hp' in visual:
                    cv2.putText(
                        visual_frame,
                        "Yaw: {:.2f} Pitch: {:.2f} Roll: {:.2f}".format(
                            hp_output[0], hp_output[1], hp_output[2]),
                        (10, 20), cv2.FONT_HERSHEY_COMPLEX, 0.3, (0, 255, 50),
                        1)

                # Facial landmarks
                if 'fl' in visual:
                    cv2.rectangle(face,
                                  (facial[0][0] - 10, facial[0][1] - 10),
                                  (facial[0][2] + 10, facial[0][3] + 10),
                                  (255, 0, 0), 3)
                    cv2.rectangle(face,
                                  (facial[1][0] - 10, facial[1][1] - 10),
                                  (facial[1][2] + 10, facial[1][3] + 10),
                                  (255, 0, 0), 3)

                # Gaze estimation
                if 'ga' in visual:
                    x, y, w = int(gaze_vector[0] * 12), int(
                        gaze_vector[1] * 12), 160
                    le = cv2.line(left_eye.copy(), (x - w, y - w),
                                  (x + w, y + w), (255, 255, 0), 2)
                    cv2.line(le, (x - w, y + w), (x + w, y - w),
                             (255, 50, 150), 2)
                    re = cv2.line(right_eye.copy(), (x - w, y - w),
                                  (x + w, y + w), (255, 255, 0), 2)
                    cv2.line(re, (x - w, y + w), (x + w, y - w),
                             (255, 50, 150), 2)
                    face[facial[0][1]:facial[0][3],
                         facial[0][0]:facial[0][2]] = le
                    face[facial[1][1]:facial[1][3],
                         facial[1][0]:facial[1][2]] = re

                cv2.namedWindow('Visualization', cv2.WINDOW_AUTOSIZE)
                cv2.moveWindow('Visualization', 900, 900)
                cv2.imshow('Visualization',
                           cv2.resize(visual_frame, (500, 500)))
                # if count%10==0:
                #     cv2.imwrite(str(count)+'_visual.jpg',visual_frame)

            if count % 5 == 0:
                mouse.move(mouse_coord[0], mouse_coord[1])

            if key == 27:
                break

        # Logged at error level, as before, so it shows under the default
        # logging configuration.
        log.error('INFO: Ended!')
    finally:
        cv2.destroyAllWindows()
        feed.close()
class Pipeline:
    """Gaze-controlled pointer pipeline: face -> head pose -> landmarks -> gaze.

    Construction opens the input feed and the output video writer and loads
    the four models; :meth:`run` processes the stream; :meth:`close` releases
    the feed and the writer.
    """

    def __init__(self, args):
        """Open the feed and writer, resolve model paths and load the models.

        Note that ``args.head_pose_model``, ``args.landmarks_model`` and
        ``args.gaze_model`` are rewritten in place to
        ``<model dir>/<precision>/<model name>``.

        :param args: parsed command line arguments
        :raises Exception: if the input feed cannot be loaded
        """
        self.log_level = "INFO" if os.environ.get(
            "LOGLEVEL") == "INFO" or args.verbose_stage else "WARNING"
        log.basicConfig(level=self.log_level)

        input_type = 'cam' if args.cam else 'video'
        self.feed = InputFeeder(input_type, args.video)
        if not self.feed.load_data():
            raise Exception('Input valid image or video file')

        fps, w, h = self.feed.get_props()
        self.out_video = cv2.VideoWriter(args.out,
                                         cv2.VideoWriter_fourcc(*'MJPG'), fps,
                                         (w, h), True)

        args.head_pose_model = os.path.join(
            args.head_pose_model, args.precision,
            os.path.basename(args.head_pose_model))
        args.landmarks_model = os.path.join(
            args.landmarks_model, args.precision,
            os.path.basename(args.landmarks_model))
        args.gaze_model = os.path.join(args.gaze_model, args.precision,
                                       os.path.basename(args.gaze_model))

        self.fd = FaceDetect(args.face_model, args.device, args.extension,
                             args.threshold)
        self.fd.load_model()
        self.fd.set_out_size(w, h)

        self.hp = HeadPoseEstimate(args.head_pose_model, args.device,
                                   args.extension, args.threshold)
        self.hp.load_model()

        self.fl = FacialLandMarkDetect(args.landmarks_model, args.device,
                                       args.extension, args.threshold)
        self.fl.load_model()

        self.gz = GazeEstimate(args.gaze_model, args.device, args.extension,
                               args.threshold)
        self.gz.load_model()

        self.mc = MouseController()
        self.verbose_stage = args.verbose_stage

    def get_bounding_rect(self, x, y, width=40, height=20):
        """Return ``(x1, y1, x2, y2)`` of a box centred on ``(x, y)``.

        :param x: centre x coordinate
        :param y: centre y coordinate
        :param width: box width in pixels (default 40)
        :param height: box height in pixels (default 20)
        :return: top-left and bottom-right corners; values may be negative
            when the centre lies close to the origin
        """
        half_w = int(width / 2)
        half_h = int(height / 2)
        return x - half_w, y - half_h, x + half_w, y + half_h

    def _crop_eye(self, face_frame, x, y):
        """Crop the eye box centred on ``(x, y)`` out of ``face_frame``.

        Corner coordinates are clamped at 0: a negative slice start would
        wrap around to the far edge of the image and yield an empty crop for
        eyes close to the top or left edge of the face.
        """
        x1, y1, x2, y2 = self.get_bounding_rect(x, y)
        return face_frame[max(y1, 0):max(y2, 0), max(x1, 0):max(x2, 0)]

    def verbose_stage_draw(self, frame, face_coord, eye_coord,
                           head_pose_angles, mouse_coord):
        """Draw face box, eye boxes, head-pose text and gaze point on ``frame``.

        :param frame: full frame, drawn on in place
        :param face_coord: ``(x1, y1, x2, y2)`` of the face in the frame
        :param eye_coord: ``(x1, y1, x2, y2)`` holding the two eye centres,
            relative to the face box
        :param head_pose_angles: yaw, pitch and roll
        :param mouse_coord: point at which the gaze circle is drawn
        """
        f_x1, f_y1, f_x2, f_y2 = face_coord
        self.fd.draw_rect(frame, (f_x1, f_y1), (f_x2, f_y2))

        e_x1, e_y1, e_x2, e_y2 = eye_coord
        # Eye coordinates are face-relative; shift by the face origin.
        left_x, left_y, right_x, right_y = self.get_bounding_rect(e_x1, e_y1)
        self.fl.draw_rect(frame, (f_x1 + left_x, f_y1 + left_y),
                          (f_x1 + right_x, f_y1 + right_y))
        left_x, left_y, right_x, right_y = self.get_bounding_rect(e_x2, e_y2)
        self.fl.draw_rect(frame, (f_x1 + left_x, f_y1 + left_y),
                          (f_x1 + right_x, f_y1 + right_y))

        text = "Yaw: {:+.0f}, Pitch: {:+.0f}, Roll: {:+.0f}".format(
            *head_pose_angles)
        self.hp.draw_text(frame, text, (100, 100))

        self.gz.draw_circle(frame, mouse_coord, 10)

    def run(self):
        """Process every frame of the feed and move the pointer by the gaze.

        Frames without a detectable face or eyes are skipped. With
        ``verbose_stage`` set, the intermediate results are drawn on the
        frame before it is written to the output video.
        """
        abs_mouse_x = abs_mouse_y = 0
        for frame in self.feed.next_batch():
            f_x1, f_y1, f_x2, f_y2 = self.fd.predict(frame)
            face_frame = frame[f_y1:f_y2, f_x1:f_x2]
            if not face_frame.size:
                # skip if face not detected
                continue

            head_pose_angles = self.hp.predict(face_frame)

            self.fl.set_out_size(f_x2 - f_x1, f_y2 - f_y1)
            e_x1, e_y1, e_x2, e_y2 = self.fl.predict(face_frame)

            left_eye_frame = self._crop_eye(face_frame, e_x1, e_y1)
            right_eye_frame = self._crop_eye(face_frame, e_x2, e_y2)
            if not left_eye_frame.size or not right_eye_frame.size:
                # skip if eyes not detected
                continue

            g_x, g_y, _ = self.gz.predict(left_eye_frame, right_eye_frame,
                                          [[*head_pose_angles]])
            self.mc.move(g_x, g_y)

            if self.verbose_stage:
                _, w, h = self.feed.get_props()
                if abs_mouse_x == 0 and abs_mouse_y == 0:
                    # First detection: start the gaze marker between the eyes.
                    abs_mouse_x = int(f_x1 + (e_x1 + e_x2) / 2)
                    abs_mouse_y = int(f_y1 + (e_y1 + e_y2) / 2)
                else:
                    abs_mouse_x += int(g_x * w / 250)
                    abs_mouse_y -= int(g_y * h / 250)
                self.verbose_stage_draw(frame, (f_x1, f_y1, f_x2, f_y2),
                                        (e_x1, e_y1, e_x2, e_y2),
                                        head_pose_angles,
                                        (abs_mouse_x, abs_mouse_y))

            self.out_video.write(frame)

    def close(self):
        """Release the input feed and the output video writer."""
        self.feed.close()
        self.out_video.release()
def main(args):
    """Run the gaze pipeline on an image or stream and log timing statistics.

    Inference times of all four models are appended to
    ``output/stats_<device>_<hpe><fld><ge>`` for every processed frame. In
    image mode the intermediate results are also saved as PNG files under
    ``output/`` and shown in windows, load/pre-/post-processing times are
    logged, and the program waits for a key press. In stream mode the
    pointer is moved by the estimated gaze and 'q' stops the loop.

    :param args: parsed command line arguments (``device``, ``hpe``, ``fld``,
        ``ge``, ``v``, ``input_type``, ``input_file``)
    :return: None
    """
    print("Main script running...")

    log_name = 'stats_' + args.device + '_' + args.hpe + args.fld + args.ge

    os.makedirs('output', exist_ok=True)

    print(f"Logging to: output/{log_name}")
    # The context manager closes the stats file even if a model call raises.
    with open('output/' + log_name, 'w+') as log:

        print("Initializing models...")
        fd = FaceDetector(
            model_name=
            'models/intel/face-detection-adas-binary-0001/FP32-INT1/face-detection-adas-binary-0001',
            device=args.device,
            extensions=None)
        fd.load_model()
        if args.v:
            print(f"Face Detection Load Time: {fd.load_time}")

        hpe = HeadPoseEstimator(
            model_name=
            f'models/intel/head-pose-estimation-adas-0001/{args.hpe}/head-pose-estimation-adas-0001',
            device=args.device,
            extensions=None)
        hpe.load_model()
        if args.v:
            print(f"Head Pose Estimation Load Time: {hpe.load_time}")

        fld = FacialLandmarkDetector(
            model_name=
            f'models/intel/landmarks-regression-retail-0009/{args.fld}/landmarks-regression-retail-0009',
            device=args.device,
            extensions=None)
        fld.load_model()
        if args.v:
            print(f"Facial Landmarks Detection Load Time: {fld.load_time}")

        ge = GazeEstimator(
            model_name=
            f'models/intel/gaze-estimation-adas-0002/{args.ge}/gaze-estimation-adas-0002',
            device=args.device,
            extensions=None)
        ge.load_model()
        if args.v:
            print(f"Gaze Estimation Load Time: {ge.load_time}")

        print("Initializing source feed...")
        feed = InputFeeder(input_type=args.input_type,
                           input_file=args.input_file)
        image = args.input_type == 'image'

        # One controller for the whole stream instead of one per frame.
        mouse = None
        if not image:
            mouse = MouseController(precision='medium', speed='medium')

        feed.load_data()
        try:
            for batch in feed.next_batch():
                if args.v:
                    print()
                cv2.imshow('Batch', batch)
                if image:
                    cv2.imwrite('output/Batch.png', batch)

                coords, bounding_face = fd.predict(batch)
                if not coords:
                    print("No face")
                    continue
                if image:
                    cv2.imwrite('output/Face.png', bounding_face)
                box = coords[0]
                face = bounding_face[box[1]:box[3], box[0]:box[2]]

                if args.v:
                    print(f"Face Time: {fd.infer_time}")
                log.write("FD_infer: " + str(fd.infer_time) + "\n")

                if image:
                    cv2.imshow('Cropped Face', face)

                # Landmark Detection
                coords, landmark_detection, landmark_points = fld.predict(
                    face)
                if image:
                    cv2.imwrite('output/Landmarks.png', landmark_detection)
                    cv2.imshow('Landmark Detection', landmark_detection)

                if args.v:
                    print(f"Landmark Time: {fld.infer_time}")
                log.write("FLD_infer: " + str(fld.infer_time) + "\n")

                right_box, left_box = coords[0:2]
                if args.v:
                    print(f"Eye Coords: {coords}")
                if left_box is None or right_box is None:
                    print("No eyes")
                    continue

                left_eye = face[left_box[1]:left_box[3],
                                left_box[0]:left_box[2]]
                cv2.putText(face, 'L', (left_box[0], left_box[3]),
                            cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 2)
                right_eye = face[right_box[1]:right_box[3],
                                 right_box[0]:right_box[2]]
                cv2.putText(face, 'R', (right_box[0], right_box[3]),
                            cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 2)

                if args.v:
                    print(
                        f"Eye Shape: {left_eye.shape} :: {right_eye.shape}")

                # Head Pose Estimation
                head_yaw, head_pitch, head_roll = hpe.predict(face)
                if args.v:
                    print(f"Head Pose Time: {hpe.infer_time}")
                log.write("HPE_infer: " + str(hpe.infer_time) + "\n")

                head_angles = [
                    head_yaw[0][0], head_pitch[0][0], head_roll[0][0]
                ]

                # Gaze Estimation
                # expects pose as (yaw, pitch, and roll)
                gaze = ge.predict(left_eye, right_eye, head_angles)
                if args.v:
                    print(f"Gaze Time: {ge.infer_time}")
                log.write("GE_infer: " + str(ge.infer_time) + "\n")

                # Scale the gaze vector to pixels; the y offset is subtracted
                # when the arrows are drawn.
                gaze_point = (int(gaze[0][0] * 50), int(gaze[0][1] * 50))

                arrows = cv2.arrowedLine(
                    face, landmark_points[0],
                    (landmark_points[0][0] + gaze_point[0],
                     landmark_points[0][1] - gaze_point[1]), (0, 0, 255), 2)
                arrows = cv2.arrowedLine(
                    face, landmark_points[1],
                    (landmark_points[1][0] + gaze_point[0],
                     landmark_points[1][1] - gaze_point[1]), (0, 0, 255), 2)

                if image:
                    cv2.imwrite('output/Gaze.png', arrows)
                    cv2.imshow('Arrows', arrows)

                    log.write("FD_LoadTime: " + str(fd.load_time) + "\n")
                    log.write("FD_PreprocessTime: " +
                              str(fd.preprocess_input_time) + "\n")
                    log.write("FD_PostrocessTime: " +
                              str(fd.preprocess_output_time) + "\n")

                    log.write("FLD_LoadTime: " + str(fld.load_time) + "\n")
                    log.write("FLD_PreprocessTime: " +
                              str(fld.preprocess_input_time) + "\n")
                    log.write("FLD_PostprocessTime: " +
                              str(fld.preprocess_output_time) + "\n")

                    log.write("HPE_LoadTime: " + str(hpe.load_time) + "\n")
                    log.write("HPE_PreprocessTime: " +
                              str(hpe.preprocess_input_time) + "\n")

                    log.write("GE_LoadTime: " + str(ge.load_time) + "\n")
                    log.write("GE_PreprocessTime: " +
                              str(ge.preprocess_input_time) + "\n")

                    # Keep the result windows open until a key is pressed.
                    cv2.waitKey(0)
                else:
                    mouse.move(gaze[0][0], gaze[0][1])
                    if cv2.waitKey(15) & 0xFF == ord('q'):
                        break
        finally:
            feed.close()
            # Must be called; the bare attribute reference did nothing.
            cv2.destroyAllWindows()
def infer_on_stream(args):
    """Run the gaze-driven mouse pipeline over every frame of the input.

    Resolves the model set for the requested precision, loads the face
    detection, head pose, facial landmark and gaze estimation models, and
    then, for each frame produced by the input feeder, estimates the gaze
    and moves the mouse cursor.

    :param args: Parsed command line arguments. Reads ``precision``,
        ``input_type``, ``input_file``, ``device``, ``cpu_extension`` and
        ``visualize``.
    :raises ValueError: If ``args.precision`` contains none of FP32, FP16
        or INT8 (previously this surfaced later as a ``TypeError`` from
        indexing ``None``).
    :return: None
    """
    supported_precisions = ("FP32", "FP16", "INT8")
    if not any(tag in args.precision for tag in supported_precisions):
        raise ValueError(
            "Unsupported precision {!r}; expected one of {}".format(
                args.precision, ", ".join(supported_precisions)))
    # Every supported precision resolves through the same helper call.
    models = select_precision(args.precision)

    # Get input
    input_feeder = InputFeeder(args.input_type, args.input_file)
    input_feeder.load_data()

    show_window = "True" in args.visualize

    try:
        # Load face detection model
        face = FaceDetection(model_name=models[0],
                             device=args.device,
                             extensions=args.cpu_extension)
        face.load_model()

        # Load head pose model
        head = HeadPoseEstimation(model_name=models[1],
                                  device=args.device,
                                  extensions=args.cpu_extension)
        head.load_model()

        # Load facial landmark model
        landmark = FacialLandmarkDetection(model_name=models[2],
                                           device=args.device,
                                           extensions=args.cpu_extension)
        landmark.load_model()

        # Load gaze estimation model
        gaze = GazeEstimation(model_name=models[3],
                              device=args.device,
                              extensions=args.cpu_extension)
        gaze.load_model()

        # Initialize mouse controller
        mouse = MouseController('high', 'fast')

        for frame in input_feeder.next_batch():
            # The feeder signals the end of the stream with None.
            if frame is None:
                break

            # Estimate face region
            output_frame, cropped_face, box_coord = face.predict(frame)

            # Estimate head pose position
            head_pose = np.array(head.predict(cropped_face))

            # Estimate eye landmark coordinates (relative to the face crop)
            lr_eyes = landmark.predict(cropped_face)

            eyes = []
            for coord in lr_eyes:
                # Shift the landmark by the face box origin so it can be
                # drawn on / cropped from the full output frame.
                x = int(coord[0] + box_coord[0])
                y = int(coord[1] + box_coord[1])
                cv2.circle(output_frame, (x, y), 5, (255, 0, 0), -1)
                eye_box, cropped_eye = eyes_crop(output_frame, x, y, 40)
                cv2.rectangle(output_frame, eye_box[0], eye_box[1],
                              (255, 0, 0), 1)
                eyes.append(cropped_eye)

            # Estimate gaze direction
            gaze_coords = gaze.predict(eyes[0], eyes[1], head_pose)

            # Move the mouse cursor
            mouse.move(gaze_coords[0], gaze_coords[1])

            if show_window:
                cv2.imshow('Capture', output_frame)

            # Pressing 'q' stops the loop.
            if cv2.waitKey(30) & 0xFF == ord('q'):
                break
    finally:
        # Release the input and any window even when a frame fails.
        input_feeder.close()
        if show_window:
            cv2.destroyAllWindows()
def main(args):
    """Run face -> head pose -> landmarks -> gaze inference and move the mouse.

    Loads the four models, opens the input (video file or camera), then for
    each frame runs the pipeline, optionally writes the annotated frame to
    ``output_video.mp4`` under ``args.output_path``, optionally shows every
    fifth frame, and moves the mouse every tenth frame.

    :param args: Parsed command line arguments. Reads ``model_face``,
        ``model_landmark``, ``model_pose``, ``model_gaze``, ``device``,
        ``extensions``, ``video``, ``output_path``,
        ``threshold_face_detection``, ``mouse_precision``, ``mouse_speed``,
        ``show_frame`` and ``debug``.
    :return: None. Calls ``exit(1)`` on an unsupported input format or an
        invalid mouse precision/speed.
    """
    # get all arguments
    model_face = args.model_face
    model_landmark = args.model_landmark
    model_pose = args.model_pose
    model_gaze = args.model_gaze
    device = args.device
    extensions = args.extensions
    video_file = args.video
    output_path = args.output_path
    face_confidence = args.threshold_face_detection
    precision = args.mouse_precision
    speed = args.mouse_speed
    show_frame = args.show_frame
    show_log = args.debug

    # set up logging
    if show_log:
        logging.basicConfig(
            format='%(asctime)s %(levelname)s:%(name)s:%(message)s',
            level=logging.DEBUG)
    else:
        logging.basicConfig(
            format='%(asctime)s %(levelname)s:%(message)s',
            level=logging.ERROR)
    logger = logging.getLogger('Main')

    def elapsed_ms(since):
        # time.time() is in seconds; the messages below report milliseconds.
        return (time.time() - since) * 1000

    # initialize models
    print('Initializing models')
    start = time.time()
    face_detector = ModelFaceDetection(model_name=model_face,
                                       device=device,
                                       extensions=extensions,
                                       threshold=face_confidence)
    face_detector.load_model()
    print('...Successfully loading face detection model in {:.2f} ms'.format(
        elapsed_ms(start)))

    start = time.time()
    landmark_detector = ModelLandmarksDetection(model_name=model_landmark)
    landmark_detector.load_model()
    print('...Successfully loading landmarks detection model in {:.2f} ms'.format(
        elapsed_ms(start)))

    start = time.time()
    pose_estimator = ModelHeadPoseEstimation(model_name=model_pose)
    pose_estimator.load_model()
    print('...Successfully loading head pose estimation model in {:.2f} ms'.format(
        elapsed_ms(start)))

    start = time.time()
    gaze_estimator = ModelGazeEstimation(model_name=model_gaze)
    gaze_estimator.load_model()
    print('...Successfully loading gaze estimation model in {:.2f} ms'.format(
        elapsed_ms(start)))

    # get input
    print('Getting input data')
    input_type = 'video'
    if video_file == 'cam':
        input_type = 'cam'
        logger.info('Using camera')
    elif not support_video_format(video_file):
        print('Unsupported input format! Please use only video file or cam as input')
        exit(1)

    # The value needs a %s placeholder; without it logging reports a
    # formatting error instead of emitting the message.
    logger.info('Using video input from %s', video_file)
    feed = InputFeeder(input_type=input_type, input_file=video_file)
    feed.load_data()
    capture = feed.getCap()
    initial_w = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    initial_h = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(capture.get(cv2.CAP_PROP_FPS))
    if output_path:
        out_video = cv2.VideoWriter(
            os.path.join(output_path, 'output_video.mp4'),
            cv2.VideoWriter_fourcc(*'avc1'), fps, (initial_w, initial_h), True)
    print('...Video size hxw= {}x{}'.format(initial_h, initial_w))

    # mouse controller
    print('Initializing mouse controller')
    if precision in ['high', 'low', 'medium'] and speed in ['fast', 'slow', 'medium']:
        center = (initial_w / 2, initial_h / 2)
        mouse_controller = MouseController(precision, speed, center)
    else:
        print('Please setup mouse precision and speed correctly!')
        exit(1)

    count = 0
    print('Looping through all the frame and doing inference')
    for batch in feed.next_batch():
        count = count + 1
        logger.info('Frame#{}'.format(count))

        logger.info('Detecting face')
        face, coord, image = face_detector.predict(batch)
        if face is None:
            print('...There might be no face or more than 1 face detected. Skip this frame')
            continue
        logger.info('Successfully detecting 1 face')

        logger.info('Estimating head pose')
        pose, image = pose_estimator.predict(face.copy(), image)

        logger.info('Detecting facial landmarks')
        eyes, eyes_center, image = landmark_detector.predict(
            face.copy(), coord, image)

        logger.info('Estimating gaze')
        gaze, image = gaze_estimator.predict(eyes[0], eyes[1], pose,
                                             eyes_center, image)
        logger.info('Gaze vector (x,y,z)= ({},{},{})'.format(
            gaze[0][0], gaze[0][1], gaze[0][2]))

        if output_path:
            logger.info('Writing output frame to file')
            out_video.write(image)

        # show intermediate result every 5 frames
        if show_frame and (count % 5 == 0):
            cv2.imshow('frame', image)
            # Press Q on keyboard to stop
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

        # TODO comment the following to deactivate mouse movement
        # if want to focus more on the intermediate result!
        # pyautogui.moveRel blocking 0.1s -> blocking inference
        # -> move only every 10 frames
        if count % 10 == 0:
            mouse_controller.move(gaze[0][0], gaze[0][1])

    if output_path:
        print('Finished inference and successfully stored output to ',
              os.path.join(output_path, 'output_video.mp4'))
    else:
        print('Finished inference')

    running_time_report(face_detector.get_time(),
                        landmark_detector.get_time(),
                        pose_estimator.get_time(),
                        gaze_estimator.get_time())

    print('Releasing resources')
    if output_path:
        out_video.release()
    feed.close()
    cv2.destroyAllWindows()
def infer_on_stream(args):
    """Stream frames through the four models and drive the mouse pointer.

    Builds and loads the face detection, head pose, facial landmark and
    gaze estimation networks, feeds every frame of the input through them,
    moves the mouse with the first two gaze components, shows the annotated
    frame, and finally prints the total loading and inference durations.

    :param args: Parsed command line arguments. Reads
        ``face_detection_model``, ``head_pose_model``,
        ``facial_landmarks_model``, ``gaze_estimation_model``, ``device``,
        ``mouse_precision``, ``mouse_speed``, ``input``, ``prob_threshold``
        and ``display``.
    :return: None
    """
    network_fd = Face_Detection(args.face_detection_model, args.device)
    network_hp = Head_Pose_Estimation(args.head_pose_model, args.device)
    network_fl = Facial_Landmarks_Detection(args.facial_landmarks_model,
                                            args.device)
    network_ge = Gaze_Estimation(args.gaze_estimation_model, args.device)

    mouse_cont = MouseController(args.mouse_precision, args.mouse_speed)

    starting_loading = time.time()
    for network in (network_fd, network_hp, network_fl, network_ge):
        network.load_model()
    duration_loading = time.time() - starting_loading

    input_type = handle_input(args.input)
    feed = InputFeeder(input_type=input_type, input_file=args.input)
    feed.load_data()

    starting_inference = time.time()

    for flag, frame in feed.next_batch():
        if not flag:
            break

        # Check Esc (27) first so the user can quit even on frames where
        # no face is found and the rest of the loop body is skipped.
        key_pressed = cv2.waitKey(60)
        if key_pressed == 27:
            break

        out_frame, face, face_coords = network_fd.predict(
            frame, args.prob_threshold, args.display)

        if len(face_coords) == 0:
            log.error("There is no face in the stream!")
            continue

        out_frame, head_angle = network_hp.predict(out_frame, face,
                                                   face_coords, args.display)
        out_frame, eye_left, eye_right, eye_center = network_fl.predict(
            out_frame, face, face_coords, args.display)
        out_frame, gaze = network_ge.predict(out_frame, eye_left, eye_right,
                                             eye_center, head_angle,
                                             args.display)

        mouse_cont.move(gaze[0], gaze[1])

        cv2.imshow('Visualization', cv2.resize(out_frame, (600, 400)))

    duration_inference = time.time() - starting_inference

    print("Total loading time is: {}\nTotal inference time is: {} ".format(
        duration_loading, duration_inference))

    feed.close()
    # Must be called: the bare attribute reference was a no-op.
    cv2.destroyAllWindows()
def main():
    """Run the gaze pipeline, record the frames and dump gaze vectors to CSV.

    Validates the input and model paths, loads the four models, then for
    each frame runs face detection, landmark detection, head pose estimation
    and gaze estimation. Every processed frame is shown (optionally next to
    a preview built by ``draw_preview``) and written to ``output_video.mp4``.
    After the stream ends the collected gaze vectors are saved to
    ``gaze_vectors_excercise_video.csv`` and load time, inference time and
    FPS are logged.

    :return: None. Calls ``exit(1)`` when the video file or a model file
        cannot be found.
    """
    args = build_argparser().parse_args()
    logger = logging.getLogger('main')

    # initialize variables with the input arguments for easy access
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'LandmarkRegressionModel': args.landmarkRegressionModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }
    preview_flags = args.previewFlags
    input_filename = args.input
    device_name = args.device
    prob_threshold = args.prob_threshold

    if input_filename.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)

    for model_path in model_path_dict.values():
        if not os.path.isfile(model_path):
            logger.error("Unable to find specified model file" + str(model_path))
            exit(1)

    # instantiate model
    face_detection_model = FaceDetectionModel(
        model_path_dict['FaceDetectionModel'],
        device_name,
        threshold=prob_threshold)
    landmark_detection_model = LandmarkDetectionModel(
        model_path_dict['LandmarkRegressionModel'],
        device_name,
        threshold=prob_threshold)
    head_pose_estimation_model = HeadPoseEstimationModel(
        model_path_dict['HeadPoseEstimationModel'],
        device_name,
        threshold=prob_threshold)
    gaze_estimation_model = GazeEstimationModel(
        model_path_dict['GazeEstimationModel'],
        device_name,
        threshold=prob_threshold)

    # load Models
    start_model_load_time = time.time()
    face_detection_model.load_model()
    landmark_detection_model.load_model()
    head_pose_estimation_model.load_model()
    gaze_estimation_model.load_model()
    total_model_load_time = time.time() - start_model_load_time

    feeder.load_data()

    # NOTE(review): the writer is fixed at 1920x1080 and a tenth of the
    # source FPS; presumably tuned for one specific input video - confirm.
    out_video = cv2.VideoWriter(os.path.join('output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                int(feeder.get_fps() / 10), (1920, 1080), True)

    frame_count = 0
    gaze_vectors = []
    start_inference_time = time.time()

    try:
        for ret, frame in feeder.next_batch():
            if not ret:
                break

            frame_count += 1
            key = cv2.waitKey(60)

            try:
                face_cords, cropped_image = face_detection_model.predict(frame)

                # The face detector hands back an int in place of the
                # cropped image when no face was found.
                if isinstance(cropped_image, int):
                    logger.warning("Unable to detect the face")
                    if key == 27:
                        break
                    continue

                left_eye_image, right_eye_image, eye_cords = \
                    landmark_detection_model.predict(cropped_image)
                pose_output = head_pose_estimation_model.predict(cropped_image)
                mouse_cord, gaze_vector = gaze_estimation_model.predict(
                    left_eye_image, right_eye_image, pose_output)
                gaze_vectors.append(gaze_vector)
            except Exception as e:
                # Best effort: a failing frame is reported and skipped.
                logger.warning("Could not predict using model" + str(e) +
                               " for frame " + str(frame_count))
                continue

            image = cv2.resize(frame, (500, 500))

            if not len(preview_flags) == 0:
                preview_frame = draw_preview(frame, preview_flags,
                                             cropped_image, left_eye_image,
                                             right_eye_image, face_cords,
                                             eye_cords, pose_output,
                                             gaze_vector)
                image = np.hstack((cv2.resize(frame, (500, 500)),
                                   cv2.resize(preview_frame, (500, 500))))

            cv2.imshow('preview', image)
            out_video.write(frame)

            # Esc (27) stops the stream.
            if key == 27:
                break
    finally:
        # Release the writer so the output file is finalized, and free the
        # windows and the input even if a frame raised.
        out_video.release()
        cv2.destroyAllWindows()
        feeder.close()

    total_time = time.time() - start_inference_time
    total_inference_time = round(total_time, 1)
    # The rounded duration is 0.0 for very short runs; avoid dividing by it.
    fps = frame_count / total_inference_time if total_inference_time else 0.0

    gaze_df = pd.DataFrame(gaze_vectors,
                           columns=['vector_x', 'vector_y', 'vector_z'])
    gaze_df.to_csv("gaze_vectors_excercise_video.csv", index=False)

    logger.info('Model load time: ' + str(total_model_load_time))
    logger.info('Inference time: ' + str(total_inference_time))
    logger.info('FPS: ' + str(fps))
    logger.info('Video stream ended')
def main():
    """Drive the mouse pointer from gaze estimated on a video or camera feed.

    Parses the command line, checks that the input and the four model files
    exist, loads the models and then processes the feed frame by frame:
    face detection, head pose estimation, facial landmark detection and gaze
    estimation. Optional overlays selected through ``previewFlags`` are
    drawn on the face image and shown in a "Detections" window; the mouse is
    moved every tenth frame. Esc (27) ends the loop.
    """
    # Building the arguments
    args = build_parser().parse_args()
    preview_flags = args.previewFlags
    log = logging.getLogger()
    source = args.input

    if source.lower() == 'cam':
        feeder = InputFeeder('cam')
    elif os.path.isfile(source):
        feeder = InputFeeder('video', source)
    else:
        log.error("Unable to find the input file specified.")
        exit(1)
        feeder = InputFeeder('video', source)

    # Creating model paths
    model_files = {
        'FaceDetectionModel': args.facedetectionmodel,
        'FacialLandmarksDetectionModel': args.faciallandmarkmodel,
        'GazeEstimationModel': args.gazeestimationmodel,
        'HeadPoseEstimationModel': args.headposemodel
    }
    for label, path in model_files.items():
        if os.path.isfile(path):
            continue
        log.error('Unable to find the specified ' + label +
                  'binary file(.xml)')
        exit(1)

    # Creating model instances
    face_net = FaceDetection(model_files['FaceDetectionModel'], args.device,
                             args.cpu_extension)
    landmark_net = FacialLandmarkDetection(
        model_files['FacialLandmarksDetectionModel'], args.device,
        args.cpu_extension)
    gaze_net = GazeEstimation(model_files['GazeEstimationModel'], args.device,
                              args.cpu_extension)
    pose_net = Head_Pose_estimation(model_files['HeadPoseEstimationModel'],
                                    args.device, args.cpu_extension)

    pointer = MouseController('medium', 'fast')

    # Loading data
    feeder.load_data()
    face_net.load_model()
    landmark_net.load_model()
    pose_net.load_model()
    gaze_net.load_model()

    cross_colour = (100, 0, 255)

    def draw_cross(eye_img, cx, cy, half):
        # Two diagonal strokes through (cx, cy); returns the drawn image.
        marked = cv2.line(eye_img, (cx - half, cy - half),
                          (cx + half, cy + half), cross_colour, 1)
        cv2.line(marked, (cx - half, cy + half), (cx + half, cy - half),
                 cross_colour, 1)
        return marked

    frames_seen = 0
    for ok, frame in feeder.next_batch():
        if not ok:
            break
        frames_seen += 1
        every_tenth = frames_seen % 10 == 0

        if every_tenth:
            cv2.imshow('Original Video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        esc_pressed = key == 27

        coords, img = face_net.predict(frame, args.prob_threshold)
        # An int in place of the image marks "no face in this frame".
        if type(img) is int:
            log.error("No face detected")
            if esc_pressed:
                break
            continue

        pose = pose_net.predict(img)
        left_eye, right_eye, eye_coord = landmark_net.predict(img)
        mouse_coord, gaze_vec = gaze_net.predict(left_eye, right_eye, pose)

        if len(preview_flags) != 0:
            # The preview is the face image itself, so the 'fd' flag needs
            # no extra drawing.
            preview_img = img
            left_box, right_box = eye_coord[0], eye_coord[1]

            if 'fld' in preview_flags:
                for box in (left_box, right_box):
                    top_left = (box[0] - 10, box[1] - 10)
                    bottom_right = (box[2] + 10, box[3] + 10)
                    cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 2)

            if 'hp' in preview_flags:
                caption = "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".format(
                    pose[0], pose[1], pose[2])
                cv2.putText(preview_img, caption, (10, 20),
                            cv2.FONT_HERSHEY_COMPLEX, 0.25, (255, 255, 255), 1)

            if 'ge' in preview_flags:
                gx = int(gaze_vec[0] * 12)
                gy = int(gaze_vec[1] * 12)
                reach = 160
                marked_left = draw_cross(left_eye, gx, gy, reach)
                marked_right = draw_cross(right_eye, gx, gy, reach)
                # Paste the marked eye crops back into the face image.
                img[left_box[1]:left_box[3],
                    left_box[0]:left_box[2]] = marked_left
                img[right_box[1]:right_box[3],
                    right_box[0]:right_box[2]] = marked_right

            cv2.imshow("Detections", cv2.resize(preview_img, (500, 500)))

        if every_tenth:
            pointer.move(mouse_coord[0], mouse_coord[1])
        if esc_pressed:
            break

    log.error("Videostream Completed")
    cv2.destroyAllWindows()
    feeder.close()
def main():
    """Control the mouse pointer with the gaze estimated from the input feed.

    Opens the camera or the given video file, obtains the four models from
    ``model_assigner``, loads them and processes the feed frame by frame.
    Every fifth frame the raw frame is shown and the mouse is moved. When
    any visualization flag is set, the selected overlays are drawn and the
    result is shown in a "visualization" window. Esc (27) ends the loop.

    :return: None. Calls ``exit(1)`` when the video file cannot be found.
    """
    args = build_argparser().parse_args()
    Flags_ = args.Flags
    logger = logging.getLogger()
    inputFilePath = args.input_model

    if inputFilePath.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFilePath):
            logger.error("Unable to find specified video file")
            exit(1)
        inputFeeder = InputFeeder("video", inputFilePath)

    FDM, FLDM, GEM, HPEM = model_assigner(args, logger)

    mc = MouseController('medium', 'fast')

    inputFeeder.load_data()
    FDM.load_model()
    FLDM.load_model()
    HPEM.load_model()
    GEM.load_model()

    frame_count = 0
    logger.info(inputFeeder)
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        croppedFace, face_coords = FDM.predict(frame.copy(),
                                               args.prob_threshold)
        # The face detector returns an int in place of the crop when no
        # face was found.
        if isinstance(croppedFace, int):
            logger.error("Unable to detect the face.")
            if key == 27:
                break
            continue

        hp_out = HPEM.predict(croppedFace.copy())
        left_eye, right_eye, eye_coords = FLDM.predict(croppedFace.copy())
        new_mouse_coord, gaze_vector = GEM.predict(left_eye, right_eye, hp_out)

        if len(Flags_) != 0:
            preview_frame = frame.copy()

            if 'fd' in Flags_:
                # Show the face crop instead of the whole frame.
                preview_frame = croppedFace

            if 'fld' in Flags_:
                for eye_box in (eye_coords[0], eye_coords[1]):
                    cv2.rectangle(croppedFace,
                                  (eye_box[0] - 10, eye_box[1] - 10),
                                  (eye_box[2] + 10, eye_box[3] + 10),
                                  (0, 255, 0), 3)

            if 'hp' in Flags_:
                cv2.putText(
                    preview_frame,
                    "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".format(
                        hp_out[0], hp_out[1], hp_out[2]),
                    (10, 20), cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)

            if 'ge' in Flags_:
                # Draw on the preview image: the raw frame was already shown
                # (and copied) above, so text put on it is never displayed.
                cv2.putText(
                    preview_frame,
                    "Gaze Cords: x= {:.2f} , y= {:.2f} , z= {:.2f}".format(
                        gaze_vector[0], gaze_vector[1], gaze_vector[2]),
                    (20, 80), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2)

                x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] * 12), 160

                # Cross each eye crop with two diagonal lines through (x, y).
                le = cv2.line(left_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                re = cv2.line(right_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)

                # Paste the marked eyes back into the face crop.
                croppedFace[eye_coords[0][1]:eye_coords[0][3],
                            eye_coords[0][0]:eye_coords[0][2]] = le
                croppedFace[eye_coords[1][1]:eye_coords[1][3],
                            eye_coords[1][0]:eye_coords[1][2]] = re

            cv2.imshow("visualization", cv2.resize(preview_frame, (500, 500)))

        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break

    logger.error("VideoStream ended...")
    cv2.destroyAllWindows()
    inputFeeder.close()
def main():
    """Run the gaze-controlled mouse application and print timing figures.

    Validates the input and the four model files, loads the models, then
    processes the feed frame by frame: face detection, head pose
    estimation, facial landmark detection and gaze estimation. Optional
    overlays selected through ``args.Flags`` are shown in a "visualization"
    window and the mouse is moved every fifth frame. After the stream ends
    the total inference time, the frames per second and the model load time
    are printed.

    :return: None. Calls ``exit(1)`` when the video file or a model file
        cannot be found.
    """
    args = build_argparser().parse_args()
    Flags = args.Flags
    logger = logging.getLogger()
    inputFilePath = args.input

    if inputFilePath.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFilePath):
            logger.error("Unable to find video file")
            exit(1)
        inputFeeder = InputFeeder("video", inputFilePath)

    Dir = {
        'facedetection': args.facedetectionmodel,
        'facelandmarksdetection': args.faciallandmarkmodel,
        'Gaze': args.gazeestimationmodel,
        'head_pose': args.headposemodel
    }

    for fileKey, filePath in Dir.items():
        if not os.path.isfile(filePath):
            logger.error("Unable to find " + fileKey + " xml file")
            exit(1)

    Fd = facedetection(Dir['facedetection'], args.device, args.cpu_extension)
    Fl = facelandmarksdetection(Dir['facelandmarksdetection'], args.device,
                                args.cpu_extension)
    Ge = Gaze(Dir['Gaze'], args.device, args.cpu_extension)
    Hp = head_pose(Dir['head_pose'], args.device, args.cpu_extension)

    Mc = MouseController('medium', 'fast')

    # loading
    start_model_load_time = time.time()
    inputFeeder.load_data()
    Fd.load_model()
    Fl.load_model()
    Hp.load_model()
    Ge.load_model()
    total_model_load_time = time.time() - start_model_load_time

    count = 0
    start_inference_time = time.time()
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        count += 1
        if count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        croppedFace, face_coords = Fd.predict(frame.copy(),
                                              args.prob_threshold)
        # The face detector returns an int in place of the crop when no
        # face was found.
        if isinstance(croppedFace, int):
            logger.error("unsupported layers, could not detect face")
            if key == 27:
                break
            continue

        hp_out = Hp.predict(croppedFace.copy())
        l_coords, r_coords, coords = Fl.predict(croppedFace.copy())
        new_coord, Gaze_vec = Ge.predict(l_coords, r_coords, hp_out)

        if len(Flags) != 0:
            new_frame = frame.copy()

            if 'fd' in Flags:
                # Show the face crop instead of the whole frame.
                new_frame = croppedFace

            if 'fl' in Flags:
                for eye_box in (coords[0], coords[1]):
                    cv2.rectangle(croppedFace,
                                  (eye_box[0] - 10, eye_box[1] - 10),
                                  (eye_box[2] + 10, eye_box[3] + 10),
                                  (0, 255, 0), 3)

            if 'hp' in Flags:
                cv2.putText(
                    new_frame,
                    "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".format(
                        hp_out[0], hp_out[1], hp_out[2]),
                    (10, 20), cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)

            if 'ge' in Flags:
                x, y, w = int(Gaze_vec[0] * 12), int(Gaze_vec[1] * 12), 160

                # Cross each eye crop with two diagonal lines through (x, y).
                le = cv2.line(l_coords.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                re = cv2.line(r_coords.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)

                # Paste the marked eyes back into the face crop.
                croppedFace[coords[0][1]:coords[0][3],
                            coords[0][0]:coords[0][2]] = le
                croppedFace[coords[1][1]:coords[1][3],
                            coords[1][0]:coords[1][2]] = re

            cv2.imshow("visualization", cv2.resize(new_frame, (500, 500)))

        if count % 5 == 0:
            Mc.move(new_coord[0], new_coord[1])
        if key == 27:
            break

    # Computed once after the loop so the figures exist even when no frame
    # produced a detection (they used to be assigned only inside the loop).
    total_time = time.time() - start_inference_time
    total_inference_time = round(total_time, 1)
    fps = count / total_inference_time if total_inference_time else 0.0

    logger.error("Video Done...")
    print(total_inference_time)
    print(fps)
    print(total_model_load_time)

    cv2.destroyAllWindows()
    inputFeeder.close()
def main():
    """Run the gaze-controlled mouse loop and log average inference times.

    Uses the video file given by ``args.input`` or, when it is empty, the
    webcam. Loads the face detection, facial landmark, gaze estimation and
    head pose models, then times each model's ``predict`` call per frame.
    The mouse is moved every ``args.mouse_update_interval`` frames and the
    optional visualizations selected by the ``show_*`` arguments are drawn.
    Esc (27) ends the loop. Per-model and total average inference times
    (per frame read) are logged at the end.

    :return: None. Calls ``exit(1)`` when the input file cannot be found.
    """
    arg_parser = ArgParser()
    args = arg_parser.get_args()

    input_file = args.input

    # If input file defined then use it else use the webcam
    if input_file:
        if not os.path.isfile(input_file):
            log.error("Input file cannot be found")
            # Non-zero status: a missing input is a failure, not a success.
            exit(1)
        input_feeder = InputFeeder("video", input_file)
    else:
        input_feeder = InputFeeder("cam")

    face_detection_model = FaceDetection(args.face_detection_model,
                                         args.device, args.extensions)
    face_detection_model.load_model()

    facial_landmarks_model = FacialLandmarksDetection(
        args.facial_landmark_detection_model, args.device, args.extensions)
    facial_landmarks_model.load_model()

    gaze_model = GazeEstimation(args.gaze_estimation_model, args.device,
                                args.extensions)
    gaze_model.load_model()

    head_pose_model = HeadPoseEstimation(args.head_pose_estimation_model,
                                         args.device, args.extensions)
    head_pose_model.load_model()

    mouse_controller = MouseController('medium', 'fast')

    input_feeder.load_data()

    def timed(predict, *predict_args):
        # Call predict and return (result, elapsed seconds).
        started = time.time()
        result = predict(*predict_args)
        return result, time.time() - started

    frame_count = 0
    total_face_detection_inference_time = 0
    total_facial_landmark_inference_time = 0
    total_head_pose_inference_time = 0
    total_gaze_estimation_inference_time = 0
    total_inference_time = 0

    for ret, frame in input_feeder.next_batch():
        if not ret:
            log.error("ret variable not found")
            break

        frame_count += 1
        move_now = frame_count % args.mouse_update_interval == 0

        if move_now:
            cv2.imshow('Input', frame)

        key_pressed = cv2.waitKey(60)

        # Run inference on the face detection model
        (cropped_face, face_coordinates), elapsed = timed(
            face_detection_model.predict, frame.copy(),
            args.probability_threshold)
        total_face_detection_inference_time += elapsed
        total_inference_time += elapsed

        # If no face detected get the next frame
        if len(face_coordinates) == 0:
            continue

        # Run inference on the facial landmark detection model
        results, elapsed = timed(facial_landmarks_model.predict,
                                 cropped_face.copy())
        left_eye_coordinates = results[0]
        right_eye_coordinates = results[1]
        left_eye_image = results[2]
        right_eye_image = results[3]
        left_eye_crop_coordinates = results[4]
        right_eye_crop_coordinates = results[5]
        total_facial_landmark_inference_time += elapsed
        total_inference_time += elapsed

        # Run inference on the head pose estimation model
        head_pose, elapsed = timed(head_pose_model.predict,
                                   cropped_face.copy())
        total_head_pose_inference_time += elapsed
        total_inference_time += elapsed

        # Run inference on the gaze estimation model
        gaze_result, elapsed = timed(gaze_model.predict, left_eye_image,
                                     right_eye_image, head_pose)
        new_mouse_x_coordinate, new_mouse_y_coordinate, gaze_vector = gaze_result
        total_gaze_estimation_inference_time += elapsed
        total_inference_time += elapsed

        if move_now:
            log.info("Mouse controller new coordinates: x = {}, y = {}".format(
                new_mouse_x_coordinate, new_mouse_y_coordinate))
            mouse_controller.move(new_mouse_x_coordinate,
                                  new_mouse_y_coordinate)

        # Optional visualization configuration:
        if args.show_detected_face:
            showDetectedFace(frame, face_coordinates)

        if args.show_head_pose:
            showHeadPose(frame, head_pose)

        if args.show_facial_landmarks:
            showFacialLandmarks(cropped_face, left_eye_crop_coordinates,
                                right_eye_crop_coordinates)

        if args.show_gaze_estimation:
            showGazeEstimation(frame, right_eye_coordinates,
                               left_eye_coordinates, gaze_vector,
                               cropped_face, face_coordinates)

        # Break if escape key pressed
        if key_pressed == 27:
            log.warning("Keyboard interrupt triggered")
            break

    # Release the capture and destroy any OpenCV windows
    cv2.destroyAllWindows()
    input_feeder.close()

    # With no frame read there is nothing to average (and dividing by the
    # frame count would raise ZeroDivisionError).
    if frame_count == 0:
        log.warning("No frames were processed; skipping inference statistics")
        return

    log.info("Average face detection inference time: {} seconds".format(
        total_face_detection_inference_time / frame_count))
    log.info(
        "Average facial landmark detection inference time: {} seconds".format(
            total_facial_landmark_inference_time / frame_count))
    log.info("Average head pose estimation inference time: {} seconds".format(
        total_head_pose_inference_time / frame_count))
    log.info("Average gaze estimation inference time: {} seconds".format(
        total_gaze_estimation_inference_time / frame_count))
    log.info("Average total inference time: {} seconds".format(
        total_inference_time / frame_count))
def main():
    """Entry point of the computer pointer controller application.

    Loads the four models (timing each load and enabling their ``show``
    flag according to ``args.show_output``), opens the input (image, video
    file or webcam), and for each frame runs face detection, head pose
    estimation, facial landmark detection and gaze estimation. The mouse is
    moved every fifth frame (immediately for a single image). Esc (27) ends
    the loop; 'q' only closes the preview window. A summary with load time,
    frames per second and total inference time is printed at the end.

    :return: None. Calls ``sys.exit(1)`` when the input file does not exist
        or when no frame could be read from the input.
    """
    # Grab command line args
    args = build_args().parse_args()

    # Config Logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger()

    print("\n")
    logger.info("starting app ...")
    print("\n==========<COMPUTER POINTER CONTROLLER>==========")
    print("============>(c) Ibrahim Ishaka 2020<============\n")

    # initialize model object for each class
    FDModel = FaceDetectionModel(model=args.face_detection_model,
                                 device=args.device,
                                 extensions=args.extension,
                                 threshold=args.prob_threshold)
    FLDModel = FacialLandmarksDetectionModel(model=args.facial_landmark_model,
                                             device=args.device,
                                             extensions=args.extension)
    HPEModel = HeadPoseEstimationModel(model=args.head_pose_model,
                                       device=args.device,
                                       extensions=args.extension)
    GEModel = GazeEstimationModel(model=args.gaze_estimation_model,
                                  device=args.device,
                                  extensions=args.extension)

    models = {'fd': FDModel, 'fl': FLDModel, 'hp': HPEModel, 'ge': GEModel}

    models_loading_time = 0
    for k, model in models.items():
        # load model
        logger.info("Loading {} Model".format(model.model_name))
        model_loading_start = time.time()
        model.load_model()
        model_loading_finish = (time.time() - model_loading_start)
        models_loading_time = models_loading_time + model_loading_finish
        logger.info("time taken to load Model: {:.3f}secs".format(
            model_loading_finish))

        # check if model output visualization is specified in sh arg
        if k in args.show_output or args.show_output == 'all':
            model.show = True
        logger.info("show {} outputs: {} \n".format(model.model_name,
                                                    model.show))

    logger.info("time taken to load All Models: {:.3f}secs\n".format(
        models_loading_time))

    # setting for mouse controller
    _precision = "medium"
    _speed = "fast"
    mouse_controller = MouseController(precision=_precision, speed=_speed)

    # verify and handle input stream
    input_source = args.input
    if input_source.lower() == "cam":
        input_type = "cam"
        input_feeder = InputFeeder(input_type=input_type)
    elif os.path.exists(input_source) and os.path.isfile(input_source):
        image_formats = (".png", ".jpg", ".bmp", ".jpeg")
        input_type = "image" if input_source.endswith(image_formats) else "video"
        input_feeder = InputFeeder(input_type=input_type,
                                   input_file=input_source)
    else:
        logger.error("Input file is not a file, or does't exist")
        sys.exit(1)

    input_feeder.load_data()

    frame_count = 0
    total_inference_time_all = 0
    window_closed = False
    for flag, frame in input_feeder.next_batch():

        if flag is False:
            # no frame to read
            break

        frame_count = frame_count + 1
        key_pressed = cv2.waitKey(60)

        # Use the resolved input type so "CAM"/"Cam" are mirrored too,
        # consistent with the case-insensitive detection above.
        if input_type == "cam":
            # preprocess frame as webcam is backwards/inverted
            frame = cv2.flip(frame, 1)

        face_detection_result = FDModel.predict(frame)

        # The prediction result should return None, if no face detected
        if face_detection_result is None:
            if not window_closed:
                cv2.imshow(input_type, cv2.resize(frame, (500, 500)))
            logger.info("NO FACE DETECTED... skipping")
            continue

        cropped_face = face_detection_result[0]
        face_coords = face_detection_result[1]

        hp_result = HPEModel.predict(cropped_face)
        left_eye, right_eye = FLDModel.predict(cropped_face)
        new_mouse_coords, gaze_vector = GEModel.predict(
            left_eye, right_eye, hp_result)

        total_inference_time = 0
        for key in models:
            total_inference_time = total_inference_time + models[
                key].inference_time
        total_inference_time_all = total_inference_time_all + total_inference_time

        # uncomment the following line to see the inference time for each frame
        # logger.info("Inference Time : {:.3f}".format(total_inference_time))

        # Unpacking fails with TypeError when the gaze model returned None
        # and with ValueError when it did not return exactly two values.
        try:
            x, y = new_mouse_coords
        except (TypeError, ValueError):
            logger.error(
                "unable to get mouse coordinates for current frame\nReading Next Frame..."
            )
            continue

        if GEModel.show == True:
            GEModel.show_gaze(left_eye, right_eye, gaze_vector)

        if HPEModel.show == True:
            frame = HPEModel.show_hp(frame, hp_result)

        # wait on before moving mouse again
        # this is recomended to avoid failsafe exception
        # but you change this setting
        if input_type == "image":
            cv2.imshow(input_type, cv2.resize(frame, (500, 500)))
            mouse_controller.move(x, y)
            break

        if frame_count % 5 == 0:
            try:
                logger.info("changing mouse position... moving")
                mouse_controller.move(x, y)
            except pyautogui.FailSafeException:
                logger.error("safe exception From pyautogui")
                continue

        if not window_closed:
            cv2.imshow(input_type, cv2.resize(frame, (500, 500)))

        # Break if escape key pressed
        if key_pressed == 27:
            break

        # close the OpenCV window if q key pressed
        if key_pressed == ord('q'):
            window_closed = True
            cv2.destroyWindow(input_type)
            logger.info(input_type +
                        " window closed... to exit app, press CTRL+Z")

    if frame_count != 0:
        # Release the capture and destroy any OpenCV window
        input_feeder.close()
        cv2.destroyAllWindows()
        logger.info("Stream ended !")

        # No inference time is accumulated when no face was ever detected;
        # report 0 FPS instead of dividing by zero.
        if total_inference_time_all > 0:
            fps = round(frame_count / total_inference_time_all, 2)
        else:
            fps = 0.0

        print("\n==========SUMMARY===========")
        print("models loading time  : ", round(models_loading_time, 2))
        print("frames per seconds   : ", fps)
        print("total inference time : ", round(total_inference_time_all, 2))
        print("============================")

    else:
        logger.error("Unable to handle Unsupported file ")
        sys.exit(1)
def infer_on_stream(args):
    """
    Run the gaze-controlled mouse pipeline on a camera or video stream.

    Loads the four models, reads frames from the input feeder, moves the
    mouse pointer from the estimated gaze and, when ``args.visual`` is
    "yes", draws the model outputs on the frame and appends it to
    ``../output.mp4``. Timing statistics are logged once the stream stops,
    whether it ran out of frames or the user pressed ESC.

    :param args: Command line arguments; this function reads ``face_model``,
        ``head_model``, ``landmarks_model``, ``gaze_model``, ``input``
        ("cam" or a video path) and ``visual``.
    :return: None
    """
    start_model_load_time = time.time()

    # Initiate and load models.
    face_det_net = Face_Detection_Model(args.face_model)
    face_det_net.load_model()
    head_pose_net = Head_Pose_Model(args.head_model)
    head_pose_net.load_model()
    facial_landmarks_net = Facial_Landmarks_Model(args.landmarks_model)
    facial_landmarks_net.load_model()
    gaze_est_net = Gaze_Estimation_Model(args.gaze_model)
    gaze_est_net.load_model()
    total_model_load_time = time.time() - start_model_load_time

    # Initiate stream.
    counter = 0
    start_inference_time = time.time()
    if args.input.lower() == "cam":
        frame_feeder = InputFeeder(input_type='cam')
    else:
        frame_feeder = InputFeeder(input_type='video', input_file=args.input)
    frame_feeder.load_data()
    fps = frame_feeder.get_fps()
    log.info('Video started')

    # Initiate mouse controller.
    mouse_controller = MouseController('medium', 'fast')

    # NOTE: the original author marked this writer call as the Windows
    # variant; a Linux variant (same codec, no trailing colour flag) was
    # left commented out next to it.
    out_video = cv2.VideoWriter('../output.mp4',
                                cv2.VideoWriter_fourcc(*'avc1'),
                                fps, (frame_feeder.get_size()), True)

    try:
        show_outputs = args.visual.lower() == "yes"
        for flag, frame in frame_feeder.next_batch():
            if not flag:
                log.info('Video ended')
                break

            key = cv2.waitKey(60)
            counter += 1

            coords, image, face = face_det_net.predict(frame)
            pose = head_pose_net.predict(face)
            _, left_eye_image, right_eye_image, eye_coords = \
                facial_landmarks_net.predict(face)

            # Gaze is only estimated for a 40x40 colour eye crop. Everything
            # that consumes the gaze result stays inside this branch so a
            # frame never reuses the previous frame's values (or undefined
            # ones on the very first frame).
            if left_eye_image.shape == (40, 40, 3):
                mouse_coords, gaze = gaze_est_net.predict(
                    left_eye_image, right_eye_image, pose)
                mouse_controller.move(mouse_coords[0], mouse_coords[1])

                if show_outputs:
                    frame = draw_outputs(coords, eye_coords, pose, gaze,
                                         mouse_coords[0], mouse_coords[1],
                                         image)
                    out_video.write(frame)

            cv2.imshow('video', frame)

            # ESC stops the stream.
            if key == 27:
                break

        total_time = time.time() - start_inference_time
    finally:
        # Always finalise the output file and free the capture/windows,
        # including when the loop is left through ESC or an exception.
        out_video.release()
        cv2.destroyAllWindows()
        frame_feeder.close()

    total_inference_time = round(total_time, 1)
    # A very short run rounds to 0.0; report 0 fps instead of dividing by it.
    f_ps = counter / total_inference_time if total_inference_time else 0.0
    log.info("Models load time {:.2f}.".format(total_model_load_time))
    log.info("Total inference time {:.2f}.".format(total_inference_time))
    log.info("Inference frames pre second {:.2f}.".format(f_ps))
def main(): args = build_argparser().parse_args() logger = logging.getLogger('main') is_benchmarking = False total_score = 0 # initialize variables with the input arguments for easy access model_path_dict = { 'FaceDetectionModel': args.faceDetectionModel, 'LandmarkRegressionModel': args.landmarkRegressionModel, 'HeadPoseEstimationModel': args.headPoseEstimationModel, 'GazeEstimationModel': args.gazeEstimationModel } preview_flags = args.previewFlags input_filename = args.input device_name = args.device prob_threshold = args.prob_threshold output_path = args.output_path # add path for exercise video data exercise_video_path = '../bin/demo.mp4' exercise_gaze_path = '../bin/demo.csv' exercise_gaze_df = pd.read_csv(exercise_gaze_path) if input_filename.lower() == 'cam': feeder = InputFeeder(input_type='cam') else: if not os.path.isfile(input_filename): logger.error("Unable to find specified video file") exit(1) feeder = InputFeeder(input_type='video', input_file=input_filename) exercise_feeder = InputFeeder(input_type='video', input_file=exercise_video_path) for model_path in list(model_path_dict.values()): if not os.path.isfile(model_path): logger.error("Unable to find specified model file" + str(model_path)) exit(1) # instantiate model face_detection_model = FaceDetectionModel( model_path_dict['FaceDetectionModel'], device_name, threshold=prob_threshold) landmark_detection_model = LandmarkDetectionModel( model_path_dict['LandmarkRegressionModel'], device_name, threshold=prob_threshold) head_pose_estimation_model = HeadPoseEstimationModel( model_path_dict['HeadPoseEstimationModel'], device_name, threshold=prob_threshold) gaze_estimation_model = GazeEstimationModel( model_path_dict['GazeEstimationModel'], device_name, threshold=prob_threshold) # load Models start_model_load_time = time.time() face_detection_model.load_model() landmark_detection_model.load_model() head_pose_estimation_model.load_model() gaze_estimation_model.load_model() total_model_load_time = 
time.time() - start_model_load_time feeder.load_data() exercise_feeder.load_data() out_video = cv2.VideoWriter(os.path.join('output_video.mp4'), cv2.VideoWriter_fourcc(*'avc1'), int(feeder.get_fps() / 10), (1000, 500), True) frame_count = 0 gaze_vectors = [] start_inference_time = time.time() for ret, frame in feeder.next_batch(): # flip the image to make it similar to video image frame = np.flip(frame, 1) ex_ret, ex_frame = next(exercise_feeder.next_batch()) if not ret: break # This will stop the cam when exercise video is over if len(exercise_gaze_df) <= len(gaze_vectors): break frame_count += 1 key = cv2.waitKey(60) try: face_cords, cropped_image = face_detection_model.predict(frame) if type(cropped_image) == int: logger.warning("Unable to detect the face") if key == 27: break continue left_eye_image, right_eye_image, eye_cords = landmark_detection_model.predict( cropped_image) pose_output = head_pose_estimation_model.predict(cropped_image) mouse_cord, gaze_vector = gaze_estimation_model.predict( left_eye_image, right_eye_image, pose_output) gaze_vectors.append(gaze_vector) except Exception as e: logger.warning("Could predict using model" + str(e) + " for frame " + str(frame_count)) continue if not len(preview_flags) == 0: preview_frame = draw_preview(frame, 'ff', cropped_image, left_eye_image, right_eye_image, face_cords, eye_cords, pose_output, gaze_vector) cropped_image = np.hstack((cv2.resize(ex_frame, (500, 500)), cv2.resize(preview_frame, (500, 500)))) instructor_gaze_vector = exercise_gaze_df.iloc[frame_count - 1].values score = cosine(instructor_gaze_vector, gaze_vector) if score > 0.1: total_score += 1 # show score on output video cv2.putText( ex_frame, "Instructor Gaze Vector: {} ".format(instructor_gaze_vector), (40, 60), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2) cv2.putText(ex_frame, "User Gaze Vector: {}".format(gaze_vector), (40, 100), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2) cv2.putText(ex_frame, "Gaze Match Score : {}".format(total_score), 
(40, 145), cv2.FONT_HERSHEY_COMPLEX, 1.5, (0, 0, 0), 2) ex_frame = cv2.rectangle(ex_frame, (20, 20), (1200, 160), (0, 0, 0), 2) image = np.hstack( (cv2.resize(ex_frame, (500, 500)), cv2.resize(cropped_image, (500, 500)))) cv2.imshow('preview', image) out_video.write(image) if key == 0: break total_time = time.time() - start_inference_time total_inference_time = round(total_time, 1) fps = frame_count / total_inference_time if input_filename == "cam": filename = "cam.csv" else: filename = input_filename.split("/")[-1].split(".")[0] + ".csv" gaze_df = pd.DataFrame(gaze_vectors, columns=['vector_x', 'vector_y', 'vector_z']) gaze_df.to_csv(filename, index=False) logger.info('Model load time: ' + str(total_model_load_time)) logger.info('Inference time: ' + str(total_inference_time)) logger.info('FPS: ' + str(fps)) logger.info('Video stream ended') cv2.destroyAllWindows() feeder.close() """
def main():
    """
    Control the mouse pointer with the gaze estimated from the input stream.

    The input is chosen from ``args.input`` ("cam", "image" or a video
    path). Each frame goes through face detection, head-pose estimation,
    landmark detection and gaze estimation. The overlays named in
    ``args.visualization`` ('face', 'facel', 'head', 'gaze') are drawn on a
    preview shown next to the frame in the 'Visuals' window, and the mouse
    is moved on every fifth frame. ESC ends the session.

    :return: None
    """

    def model_files(name, precision):
        # Build the (.xml, .bin) pair of an Intel model from its name and
        # precision folder.
        stem = "../models/intel/{0}/{1}/{0}".format(name, precision)
        return stem + ".xml", stem + ".bin"

    args = arg_parser().parse_args()
    input_file = args.input
    visual = args.visualization

    if input_file == "cam":
        input_feeder = InputFeeder("cam")
    elif input_file == "image":
        input_feeder = InputFeeder("image", input_file)
    elif not input_file:
        log.error("Input file not found")
        exit(1)
    else:
        input_feeder = InputFeeder("video", input_file)

    face_d = Face_Detector(
        *model_files("face-detection-adas-binary-0001", "FP32-INT1"),
        args.device, args.extension)
    face_l = Face_Landmark_Detector(
        *model_files("landmarks-regression-retail-0009", "FP32"),
        args.device, args.extension)
    gaze = Gaze_Estimator(
        *model_files("gaze-estimation-adas-0002", "FP32"),
        args.device, args.extension)
    head = Head_Pose_Estimator(
        *model_files("head-pose-estimation-adas-0001", "FP32"),
        args.device, args.extension)

    mouse_control = MouseController('medium', 'fast')

    input_feeder.load_data()
    for model in (face_d, face_l, gaze, head):
        model.load_model()

    # count / inf_time are kept for parity; nothing in this function
    # reads them.
    count = 0
    f_count = 0
    inf_time = 0

    green = (0, 255, 0)
    magenta = (255, 0, 255)

    for has_frame, frame in input_feeder.next_batch():
        if not has_frame:
            break
        if frame is None:
            continue

        f_count += 1
        every_fifth = f_count % 5 == 0
        if every_fifth:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))
        key = cv2.waitKey(60)

        crop_face, face_coords = face_d.predict(frame, 0.5)
        # An int in place of the crop signals that no face was found.
        if isinstance(crop_face, int):
            log.info("No face in frame")
            if key == 27:
                break
            continue

        head_pose = head.predict(crop_face)
        le_eye, ri_eye, eye_coords = face_l.predict(crop_face)
        new_mouse_coord, gaze_vector = gaze.predict(le_eye, ri_eye, head_pose)
        count += 1

        small_frame_size = (500, 500)
        if len(visual) != 0:
            preview_window = frame.copy()
            face_requested = 'face' in visual
            eye_boxes = (eye_coords[0], eye_coords[1])

            if face_requested:
                if len(visual) != 1:
                    # Other overlays are drawn in face-crop coordinates.
                    preview_window = crop_face
                else:
                    top_left = (face_coords[0], face_coords[1])
                    bottom_right = (face_coords[2], face_coords[3])
                    cv2.rectangle(preview_window, top_left, bottom_right,
                                  (0, 150, 0), 3)

            if 'facel' in visual:
                if not face_requested:
                    preview_window = crop_face.copy()
                for box in eye_boxes:
                    cv2.rectangle(preview_window,
                                  (box[0] - 10, box[1] - 10),
                                  (box[2] + 10, box[3] + 10),
                                  green, 3)

            if 'head' in visual:
                pose_text = (
                    "Pose Angles: pitch:{:.2f} , roll:{:.2f} , yaw:{:.2f}"
                    .format(head_pose[0], head_pose[1], head_pose[2]))
                cv2.putText(preview_window, pose_text, (50, 50),
                            cv2.FONT_HERSHEY_COMPLEX, 1, green, 1,
                            cv2.LINE_AA)

            if 'gaze' in visual:
                if not face_requested:
                    preview_window = crop_face.copy()
                x = int(gaze_vector[0] * 12)
                y = int(gaze_vector[1] * 12)
                w = 160
                # Draw both diagonals on each eye crop first ...
                marked_eyes = []
                for eye in (le_eye, ri_eye):
                    marked = cv2.line(eye, (x - w, y - w), (x + w, y + w),
                                      magenta, 2)
                    cv2.line(marked, (x - w, y + w), (x + w, y - w),
                             magenta, 2)
                    marked_eyes.append(marked)
                # ... then paste the marked crops back, left eye first.
                for box, marked in zip(eye_boxes, marked_eyes):
                    preview_window[box[1]:box[3], box[0]:box[2]] = marked

            img_h = np.hstack((cv2.resize(frame, small_frame_size),
                               cv2.resize(preview_window, small_frame_size)))
        else:
            img_h = cv2.resize(frame, small_frame_size)

        cv2.imshow('Visuals', img_h)

        if every_fifth:
            mouse_control.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break

    log.info("End of session.")
    cv2.destroyAllWindows()
    input_feeder.close()
class Computer_Pointer_Controller:
    """
    Move the mouse pointer according to the gaze estimated on an input feed.

    ``__init__`` builds and loads the four models, validates and opens the
    input feed and creates the mouse controller; ``run`` processes the feed
    frame by frame and prints average inference times at the end.
    """

    def __init__(self, args):
        """
        Load the models, open the input feed and create the mouse controller.

        Exits the process with status 1 when a non-camera input file does
        not exist or has an unsupported extension.

        :param args: Parsed command line arguments; reads the four
            ``*_model`` paths, ``device``, ``extensions``, ``perf_counts``,
            ``input_type`` and ``input_file`` here, and ``show_face`` in
            ``run``.
        """
        # run() needs the arguments and a logger as well; in this class both
        # were only local to __init__, so keep them on the instance.
        self.args = args
        self.logger = logging.getLogger()

        # load the objects corresponding to the models
        self.face_detection = Face_Detection(args.face_detection_model,
                                             args.device, args.extensions,
                                             args.perf_counts)
        self.gaze_estimation = Gaze_Estimation(args.gaze_estimation_model,
                                               args.device, args.extensions,
                                               args.perf_counts)
        self.head_pose_estimation = Head_Pose_Estimation(
            args.head_pose_estimation_model, args.device, args.extensions,
            args.perf_counts)
        self.facial_landmarks_detection = Facial_Landmarks_Detection(
            args.facial_landmarks_detection_model, args.device,
            args.extensions, args.perf_counts)

        start_models_load_time = time.time()
        self.face_detection.load_model()
        self.gaze_estimation.load_model()
        self.head_pose_estimation.load_model()
        self.facial_landmarks_detection.load_model()
        # Measured right after loading so that opening the input below is
        # not counted as model loading time.
        models_load_time = time.time() - start_models_load_time

        input_T = args.input_type
        input_F = args.input_file

        # Only file inputs need validating; the camera is opened as is.
        if input_T.lower() != 'cam':
            if not os.path.isfile(input_F):
                self.logger.error('Unable to find specified video file')
                exit(1)

            # Compare case-insensitively so e.g. "clip.MP4" is accepted.
            file_extension = input_F.split(".")[-1].lower()
            if file_extension not in ('jpg', 'jpeg', 'bmp', 'avi', 'mp4'):
                self.logger.error(
                    "Unsupported file Extension. Allowed ['jpg', 'jpeg', 'bmp', 'avi', 'mp4']"
                )
                exit(1)

        # open the feed (the feeder is built the same way for every input)
        self.feed = InputFeeder(args.input_type, args.input_file)
        self.feed.load_data()

        print("Models total loading time :", models_load_time)

        # init mouse controller
        self.mouse_controller = MouseController('low', 'fast')

    def run(self):
        """
        Process the feed until it ends and move the mouse for every frame
        on which a gaze vector could be estimated.

        Frames without a detected face, eye crops or head pose are skipped.
        The feed and any OpenCV window are released even if a model raises.
        Average timings are printed only for stages that produced at least
        one measurement.

        :return: None
        """
        inferences_times = []
        face_detections_times = []
        show_face = self.args.show_face == "True"

        try:
            for batch in self.feed.next_batch():
                if batch is None:
                    break

                # as we want the webcam to act as a mirror, flip the frame
                batch = cv2.flip(batch, 1)

                inference_time = time.time()
                face = self.face_detection.predict(batch)
                if face is None:
                    self.logger.error('Unable to detect the face.')
                    continue
                face_detections_times.append(time.time() - inference_time)

                left_eye_image, right_eye_image = \
                    self.facial_landmarks_detection.predict(face)
                if left_eye_image is None or right_eye_image is None:
                    continue

                head_pose_angles = self.head_pose_estimation.predict(face)
                if head_pose_angles is None:
                    continue

                vector = self.gaze_estimation.predict(left_eye_image,
                                                      right_eye_image,
                                                      head_pose_angles)
                inferences_times.append(time.time() - inference_time)

                if show_face:
                    cv2.imshow("Detected face", face)
                    cv2.waitKey(1)

                self.mouse_controller.move(vector[0], vector[1])
        finally:
            self.feed.close()
            cv2.destroyAllWindows()

        # Guard the averages: a feed with no detected face would otherwise
        # end in a ZeroDivisionError instead of a report.
        if face_detections_times:
            print("Average face detection inference time:",
                  sum(face_detections_times) / len(face_detections_times))
        else:
            self.logger.warning('No face was detected on any frame.')

        if inferences_times:
            print("Average total inferences time:",
                  sum(inferences_times) / len(inferences_times))
        else:
            self.logger.warning('No frame completed the full pipeline.')
def main(args):
    """
    Run the gaze pipeline on a video file and move the mouse accordingly.

    Loads the four models, reads the video at ``args.input_path`` and, for
    every frame, estimates the gaze, draws a gaze line from each eye centre
    on the face crop, shows the frame in the 'output' window, saves it to
    ``output.jpg``, appends it to ``output_video.mp4`` and moves the mouse.
    Load time, inference time and frames per second are printed at the end.

    :param args: Parsed command line arguments; reads
        ``model_face_detection``, ``model_head_pose_estimation``,
        ``model_facial_landmarks_detection``, ``model_gaze_estimation``,
        ``device``, ``threshold`` and ``input_path``.
    :return: None
    """
    start_model_load_time = time.time()

    # load model
    class_face_detection = ModelFaceDetection(args.model_face_detection,
                                              args.device, args.threshold)
    class_face_detection.load_model()
    class_head_pose_estimation = ModelHeadPoseEstimation(
        args.model_head_pose_estimation, args.device)
    class_head_pose_estimation.load_model()
    class_facial_landmarks_detection = ModelFacialLandmarksDetection(
        args.model_facial_landmarks_detection, args.device)
    class_facial_landmarks_detection.load_model()
    class_gaze_estimation = ModelGazeEstimation(args.model_gaze_estimation,
                                                args.device)
    class_gaze_estimation.load_model()
    total_model_load_time = time.time() - start_model_load_time

    # input image
    feed = InputFeeder(input_type='video', input_file=args.input_path)
    feed.load_data()

    # output
    initial_w, initial_h, initial_fps = feed.get_info()
    counter = 0
    start_inference_time = time.time()

    # The writer uses a fixed 10 fps; the input's own rate (initial_fps) is
    # read above but deliberately not used here.
    out_video = cv2.VideoWriter('output_video.mp4',
                                cv2.VideoWriter_fourcc(*'mp4v'), 10,
                                (initial_w, initial_h), True)

    try:
        class_face_detection.initial_size(initial_w, initial_h)
        mc = MouseController(precision='high', speed='fast')

        for flag, batch in feed.next_batch():
            if not flag:
                break
            counter += 1

            # face_detection
            cropped_face = class_face_detection.predict(batch)

            # head_pose_estimation
            head_pose_angles = class_head_pose_estimation.predict(
                cropped_face)

            # facial_landmarks_detection
            (left_eye_image, right_eye_image,
             left_eye_center, right_eye_center) = \
                class_facial_landmarks_detection.predict(cropped_face)

            # gaze_estimation
            x, y, gaze_vector = class_gaze_estimation.predict(
                left_eye_image, right_eye_image, head_pose_angles)

            # Gaze line from each eye centre. The y component is subtracted
            # because image rows grow downwards.
            # NOTE(review): the lines are drawn on cropped_face while batch
            # is what gets displayed and saved - presumably the crop is a
            # view into batch; confirm against the face detection model.
            for eye_center in (left_eye_center, right_eye_center):
                gaze_end = (int(eye_center[0] + gaze_vector[0] * 100),
                            int(eye_center[1] - gaze_vector[1] * 100))
                cv2.line(cropped_face, eye_center, gaze_end,
                         (255, 255, 255), 2)

            # output
            cv2.imshow('output', batch)
            cv2.waitKey(30)
            cv2.imwrite('output.jpg', batch)
            out_video.write(batch)

            # MouseController
            mc.move(x, y)

        total_time = time.time() - start_inference_time
    finally:
        # Finalise the video file and free the capture and windows even
        # when a model or the mouse controller raises.
        out_video.release()
        feed.close()
        cv2.destroyAllWindows()

    total_inference_time = round(total_time, 1)
    # A very short run rounds to 0.0; report 0 fps instead of dividing by it.
    fps = counter / total_inference_time if total_inference_time else 0.0
    print("total_model_load_time:{}, total_inference_time:{}, fps:{}".format(
        total_model_load_time, total_inference_time, fps))