def run(self, input_type=None, input_file=None):
    """Run the gaze pipeline over the input feed and report average timings.

    Every frame goes through face detection; the first detected face is
    then passed to the landmark, head-pose and gaze models. Depending on
    ``self.show_video``, ``self.save_video`` and ``self.mouse_con`` the
    annotated frame is shown, written to ``output.mp4`` and used to move
    the mouse pointer.

    Args:
        input_type: Input kind forwarded to ``InputFeeder``. A new feeder
            is created only when both ``input_type`` and ``input_file``
            are truthy; otherwise the existing ``self.input_`` is used.
        input_file: Input path forwarded to ``InputFeeder``.

    Exceptions raised while processing are logged and not re-raised.
    The video writer, the feeder and the OpenCV windows are released even
    when a frame fails.
    """
    if input_type and input_file:
        self.input_ = InputFeeder(input_type, input_file)
        self.input_.load_data()

    out = None
    if self.save_video:
        # NOTE(review): properties 3 and 4 are presumably frame width and
        # height of a cv2.VideoCapture - confirm against InputFeeder.cap.
        out = cv2.VideoWriter(
            'output.mp4', 0x00000021, 30,
            (int(self.input_.cap.get(3)), int(self.input_.cap.get(4))))

    try:
        fc_dec_inf_time = 0
        landmark_inf_time = 0
        pose_inf_time = 0
        gaze_inf_time = 0
        frame_counter = 0

        try:
            while True:
                # Read the next frame.
                # NOTE(review): a fresh generator is created per frame; this
                # relies on next_batch() reading from shared capture state.
                try:
                    frame = next(self.input_.next_batch())
                    frame_counter += 1
                except StopIteration:
                    break

                key_pressed = cv2.waitKey(60)

                # Face detection
                start = time.time()
                out_frame, boxes = self.face_dec.predict(
                    frame, display_output=True)
                fc_dec_inf_time += time.time() - start

                for box in boxes:
                    face = out_frame[box[1]:box[3], box[0]:box[2]]

                    start = time.time()
                    out_frame, left_eye_point, right_eye_point = self.fac_land.predict(
                        out_frame, face, box, display_output=True)
                    landmark_inf_time += time.time() - start

                    start = time.time()
                    out_frame, headpose_angels = self.head_pose.predict(
                        out_frame, face, box, display_output=True)
                    pose_inf_time += time.time() - start

                    start = time.time()
                    out_frame, gazevector = self.gaze.predict(
                        out_frame, face, box, left_eye_point,
                        right_eye_point, headpose_angels,
                        display_output=True)
                    gaze_inf_time += time.time() - start

                    if self.show_video:
                        cv2.imshow('im', out_frame)
                    if out is not None:
                        out.write(out_frame)
                    if self.mouse_con:
                        self.mouse_con.move(gazevector[0], gazevector[1])
                        time.sleep(1)

                    # Consider only the first detected face in the frame.
                    break

                # Stop when the escape key was pressed.
                if key_pressed == 27:
                    break
        finally:
            # Release resources whether or not the loop finished cleanly.
            if out is not None:
                out.release()
            self.input_.close()
            cv2.destroyAllWindows()

        if frame_counter == 0:
            # Avoid a ZeroDivisionError when the feed produced no frames.
            logging.warning(
                "No frames were processed; skipping inference statistics")
            return

        timings = (
            ('face detection', fc_dec_inf_time),
            ('facial landmark', landmark_inf_time),
            ('head pose estimation', pose_inf_time),
            ('gaze estimation', gaze_inf_time),
        )
        for label, total in timings:
            # '{:.2f}' (precision) replaces the original '{:2f}', which
            # only set a minimum width and printed six decimals.
            print('average inference time for {} model is :- {:.2f}ms'
                  .format(label, (total / frame_counter) * 1000))
    except Exception as ex:
        logging.exception("Error in inference: " + str(ex))
def main(args):
    """Load the four models, run them over the input feed and log timings.

    Args:
        args: Parsed command-line namespace. This function reads
            ``face``, ``landmarks``, ``head``, ``gaze`` (model paths),
            ``device``, ``display``, ``input_type`` and ``input_file``.

    For each frame only the first detected face is processed. The loop
    ends when the feed is exhausted or the ``k`` key is pressed. The feed
    and the OpenCV windows are released even if a frame raises.
    """
    device = args.device
    display_flag = args.display

    def load_timed(model, label):
        """Call model.load_model() and log how long it took under *label*."""
        start = time.time()
        model.load_model()
        logger.info("{}: {:.1f}ms".format(label,
                                          1000 * (time.time() - start)))

    # Init and load models
    fd = FaceDetection(args.face, device)
    logger.info("######## Model loading Time #######")
    load_timed(fd, "Face Detection Model")
    flmd = FacialLandMarksDetection(args.landmarks, device)
    load_timed(flmd, "Facial Landmarks Detection Model")
    hpe = HeadPoseEstimation(args.head, device)
    load_timed(hpe, "HeadPose Estimation Model")
    ge = GazeEstimation(args.gaze, device)
    load_timed(ge, "Gaze Estimation Model")

    # Mouse controller
    mc = MouseController("low", "fast")

    feed = InputFeeder(input_type=args.input_type, input_file=args.input_file)
    feed.load_data()

    frame_count = 0
    fd_inference_time = 0
    lm_inference_time = 0
    hp_inference_time = 0
    ge_inference_time = 0
    # NOTE(review): the flag reads inverted (the mouse moves while it is
    # False); kept as in the original so behaviour is unchanged.
    move_mouse = False

    try:
        for batch in feed.next_batch():
            frame_count += 1

            # Preprocessed output from face detection
            face_boxes, image, fd_time = fd.predict(batch, display_flag)
            fd_inference_time += fd_time

            for face in face_boxes:
                cropped_face = batch[face[1]:face[3], face[0]:face[2]]

                # Preprocessed result from landmarks detection
                image, left_eye, right_eye, lm_time = flmd.predict(
                    image, cropped_face, face, display_flag)
                lm_inference_time += lm_time

                # Preprocessed result from pose estimation
                image, headpose_angels, hp_time = hpe.predict(
                    image, cropped_face, face, display_flag)
                hp_inference_time += hp_time

                # Preprocessed result from the gaze estimation model
                image, gazevector, ge_time = ge.predict(
                    image, cropped_face, face, left_eye, right_eye,
                    headpose_angels, display_flag)
                ge_inference_time += ge_time

                cv2.imshow('img', image)
                if not move_mouse:
                    mc.move(gazevector[0], gazevector[1])
                # Only the first detected face is used.
                break

            if cv2.waitKey(1) & 0xFF == ord("k"):
                break
    finally:
        # Previously skipped on error; windows were never destroyed.
        feed.close()
        cv2.destroyAllWindows()

    if frame_count > 0:
        logger.info("###### Models Inference time ######")
        logger.info(
            f"Face Detection inference time = {(fd_inference_time*1000)/frame_count} ms"
        )
        logger.info(
            f"Facial Landmarks Detection inference time = {(lm_inference_time*1000)/frame_count} ms"
        )
        logger.info(
            f"Headpose Estimation inference time = {(hp_inference_time*1000)/frame_count} ms"
        )
        logger.info(
            f"Gaze estimation inference time = {(ge_inference_time*1000)/frame_count} ms"
        )
def main(args):
    """Run gaze estimation on a video file, record the output and move the mouse.

    Args:
        args: Parsed command-line namespace. This function reads
            ``model_face_detection``, ``model_head_pose_estimation``,
            ``model_facial_landmarks_detection``, ``model_gaze_estimation``,
            ``device``, ``threshold`` and ``input_path``.

    Side effects: writes ``output_video.mp4`` and ``output.jpg`` in the
    working directory, shows an ``output`` window and prints a one-line
    summary (model load time, inference time, fps).
    """
    start_model_load_time = time.time()

    # Load models
    class_face_detection = ModelFaceDetection(
        args.model_face_detection, args.device, args.threshold)
    class_face_detection.load_model()
    class_head_pose_estimation = ModelHeadPoseEstimation(
        args.model_head_pose_estimation, args.device)
    class_head_pose_estimation.load_model()
    class_facial_landmarks_detection = ModelFacialLandmarksDetection(
        args.model_facial_landmarks_detection, args.device)
    class_facial_landmarks_detection.load_model()
    class_gaze_estimation = ModelGazeEstimation(
        args.model_gaze_estimation, args.device)
    class_gaze_estimation.load_model()
    total_model_load_time = time.time() - start_model_load_time

    # Input
    feed = InputFeeder(input_type='video', input_file=args.input_path)
    feed.load_data()

    # Output; the source fps is read but the writer uses a fixed 10 fps.
    initial_w, initial_h, _initial_fps = feed.get_info()
    counter = 0
    start_inference_time = time.time()
    out_video = cv2.VideoWriter(
        'output_video.mp4', cv2.VideoWriter_fourcc(*'mp4v'), 10,
        (initial_w, initial_h), True)

    try:
        class_face_detection.initial_size(initial_w, initial_h)
        mc = MouseController(precision='high', speed='fast')

        for flag, batch in feed.next_batch():
            if not flag:
                break
            counter += 1

            cropped_face = class_face_detection.predict(batch)
            head_pose_angles = class_head_pose_estimation.predict(cropped_face)
            (left_eye_image, right_eye_image, left_eye_center,
             right_eye_center) = class_facial_landmarks_detection.predict(
                 cropped_face)
            x, y, gaze_vector = class_gaze_estimation.predict(
                left_eye_image, right_eye_image, head_pose_angles)

            # Draw the gaze direction from each eye centre.
            for eye_center in (left_eye_center, right_eye_center):
                cv2.line(
                    cropped_face, eye_center,
                    (int(eye_center[0] + gaze_vector[0] * 100),
                     int(eye_center[1] - gaze_vector[1] * 100)),
                    (255, 255, 255), 2)

            # Output
            cv2.imshow('output', batch)
            cv2.waitKey(30)
            cv2.imwrite('output.jpg', batch)
            out_video.write(batch)

            # MouseController
            mc.move(x, y)

        total_time = time.time() - start_inference_time
        total_inference_time = round(total_time, 1)
        # Rounding to one decimal can give 0.0 for very short runs.
        fps = counter / total_inference_time if total_inference_time else 0.0
        print("total_model_load_time:{}, total_inference_time:{}, fps:{}".format(
            total_model_load_time, total_inference_time, fps))
    finally:
        # The writer was never released before, which can leave the
        # output file unfinalised.
        out_video.release()
        feed.close()
        cv2.destroyAllWindows()
def initialize_feed(self):
    """Create the input feeder from ``self.args`` and load its data.

    The feeder is built from ``self.args.input_type`` and
    ``self.args.input`` and stored on ``self.feed`` before
    ``load_data()`` is called on it.
    """
    feeder = InputFeeder(self.args.input_type, self.args.input)
    self.feed = feeder
    feeder.load_data()
def main():
    """Run the gaze-controlled mouse demo from command-line arguments.

    Reads ``input``, ``previewFlags``, the four model paths, ``device``,
    ``cpu_extension`` and ``prob_threshold`` from ``build_argparser()``.
    ``input`` is either ``cam`` (case-insensitive) or a path to a video
    file; the process exits with status 1 when the file does not exist.

    Every fifth frame is shown and used to move the mouse. Preview flags
    ``fd``, ``fld``, ``hp`` and ``ge`` add overlays to a separate
    ``visualization`` window. The loop ends at end of stream or on ESC.
    """
    args = build_argparser().parse_args()
    previewFlags = args.previewFlags
    logger = logging.getLogger()
    inputFile = args.input

    if inputFile.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFile):
            logger.error("Unable to find input file")
            exit(1)
        inputFeeder = InputFeeder("video", inputFile)

    start_loading = time.time()
    mfd = Model_Face_Detection(args.facedetectionmodel, args.device,
                               args.cpu_extension)
    mfld = Model_Facial_Landmarks_Detection(args.faciallandmarkmodel,
                                            args.device, args.cpu_extension)
    mge = Model_Gaze_Estimation(args.gazeestimationmodel, args.device,
                                args.cpu_extension)
    mhpe = Model_Head_Pose_Estimation(args.headposemodel, args.device,
                                      args.cpu_extension)
    mc = MouseController('medium', 'fast')
    inputFeeder.load_data()
    mfd.load_model()
    mfld.load_model()
    mge.load_model()
    mhpe.load_model()
    model_loading_time = time.time() - start_loading

    frame_count = 0
    inference_time = 0

    try:
        for ret, frame in inputFeeder.next_batch():
            if not ret:
                break
            if frame is None:
                # Nothing to process; frame.copy() below would fail.
                continue

            frame_count += 1
            if frame_count % 5 == 0:
                cv2.imshow('video', cv2.resize(frame, (500, 500)))
            key = cv2.waitKey(60)

            start_inference = time.time()
            croppedFace, _face_coords = mfd.predict(frame.copy(),
                                                    args.prob_threshold)
            # The detector signals "no face" by returning an int.
            if isinstance(croppedFace, int):
                logger.error("No face detected.")
                if key == 27:
                    break
                continue

            hp_out = mhpe.predict(croppedFace.copy())
            left_eye, right_eye, eye_coords = mfld.predict(croppedFace.copy())
            new_mouse_coord, gaze_vector = mge.predict(left_eye, right_eye,
                                                       hp_out)
            inference_time += time.time() - start_inference

            if previewFlags:
                preview_window = frame.copy()
                if 'fd' in previewFlags:
                    preview_window = croppedFace
                if 'fld' in previewFlags:
                    # Box each eye with a 10 px margin.
                    for eye in (eye_coords[0], eye_coords[1]):
                        cv2.rectangle(croppedFace,
                                      (eye[0] - 10, eye[1] - 10),
                                      (eye[2] + 10, eye[3] + 10),
                                      (0, 255, 0), 3)
                if 'hp' in previewFlags:
                    cv2.putText(
                        preview_window,
                        "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                        format(hp_out[0], hp_out[1], hp_out[2]), (50, 50),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 1)
                if 'ge' in previewFlags:
                    x, y, w = (int(gaze_vector[0] * 12),
                               int(gaze_vector[1] * 12), 160)
                    # Draw a cross on a copy of each eye crop and paste it
                    # back into the face crop.
                    le = cv2.line(left_eye.copy(), (x - w, y - w),
                                  (x + w, y + w), (255, 0, 255), 2)
                    cv2.line(le, (x - w, y + w), (x + w, y - w),
                             (255, 0, 255), 2)
                    re = cv2.line(right_eye.copy(), (x - w, y - w),
                                  (x + w, y + w), (255, 0, 255), 2)
                    cv2.line(re, (x - w, y + w), (x + w, y - w),
                             (255, 0, 255), 2)
                    croppedFace[eye_coords[0][1]:eye_coords[0][3],
                                eye_coords[0][0]:eye_coords[0][2]] = le
                    croppedFace[eye_coords[1][1]:eye_coords[1][3],
                                eye_coords[1][0]:eye_coords[1][2]] = re
                cv2.imshow("visualization",
                           cv2.resize(preview_window, (500, 500)))

            if frame_count % 5 == 0:
                mc.move(new_mouse_coord[0], new_mouse_coord[1])
            if key == 27:
                break
    finally:
        cv2.destroyAllWindows()
        inputFeeder.close()

    # Statistics are logged at ERROR level, as in the original, so they
    # stay visible with the default logging configuration.
    logger.error("Total loading time: " + str(model_loading_time) + " seconds")
    logger.error("total inference time {} seconds".format(inference_time))
    if frame_count and inference_time:
        fps = frame_count / inference_time
        logger.error("Average inference time: " +
                     str(inference_time / frame_count) + " seconds")
        logger.error("{} fps".format(fps / 5))
    else:
        # Previously a ZeroDivisionError when no frame was processed.
        logger.error("No frames were processed; no timing statistics")
def main(args):
    """Run the gaze pipeline on the configured input and optionally move the mouse.

    Args:
        args: Parsed command-line namespace. This function reads
            ``face_detection``, ``landmarks``, ``head_pose_estimation``,
            ``gaze_estimation`` (model paths), ``device``, ``input_feed``,
            ``path_feed``, ``prob_threshold``, ``display_visual`` and
            ``mouse_move`` (the last two are compared to the string
            ``"True"``).

    Frames whose face-detection confidence is below the threshold are
    skipped. Processing stops at end of stream or when ``q`` is pressed.
    """
    # Init mouse controller class
    mouse_controller = MouseController('low', 'medium')

    log.debug('Init model classes')
    face_Detect = FaceDetection(args.face_detection, args.device)
    land_Marks = FaceLandmarks(args.landmarks, args.device)
    head_PoseEstimat = headPoseEstimation(args.head_pose_estimation,
                                          args.device)
    gaze_Estimation = gazeEstimation(args.gaze_estimation, args.device)

    # Init input feeder class
    feed = InputFeeder(input_type=args.input_feed, input_file=args.path_feed)
    log.info('load input source ...')
    cap = feed.load_data()
    # Load video save parameters
    feed.load_video_save_params(name_export_video='output_video.mp4')
    # Height and width of the input source
    initial_w, initial_h = feed.get_input_size()
    # Face detection probability threshold from args
    THRESHOLD = args.prob_threshold

    log.info('Run models inferences ...')
    # The original wrapped the frame loop in `while cap.isOpened()`, so
    # `break` on 'q' or end of stream only left the inner loop and the
    # outer loop restarted it. A single guarded loop terminates properly.
    if cap.isOpened():
        for ret, frame in feed.next_batch():
            if not ret:
                break

            frame = utils.flip_image_vertical(frame)
            # Keep an unmodified copy for the non-visual output path.
            original_frame = np.copy(frame)

            face_Detect.set_params(frame, THRESHOLD, initial_w, initial_h)
            confidence, face_points = face_Detect.get_inference_outputs()

            if confidence >= THRESHOLD:
                # Crop the face using the detection coordinates.
                cropped_frame, cropped_h, cropped_w = utils.crop_frame(
                    frame, face_points[1], face_points[3], face_points[0],
                    face_points[2])

                land_Marks.set_params(cropped_frame, cropped_h, cropped_w)
                (left_eye_center_points, right_eye_center_points, data_l_eye,
                 data_r_eye,
                 data_points_marks) = land_Marks.get_inference_outputs()

                # Eye boxes are relative to the face crop; offset them by
                # the face origin to crop from the full frame.
                xomin = face_points[0]
                yomin = face_points[1]
                img_left_eye, _, _ = utils.crop_frame(
                    frame, data_l_eye[1] + yomin, data_l_eye[3] + yomin,
                    data_l_eye[0] + xomin, data_l_eye[2] + xomin)
                img_right_eye, _, _ = utils.crop_frame(
                    frame, data_r_eye[1] + yomin, data_r_eye[3] + yomin,
                    data_r_eye[0] + xomin, data_r_eye[2] + xomin)

                # Head pose estimation on the face crop
                head_PoseEstimat.set_params(cropped_frame, cropped_w,
                                            cropped_h)
                head_pose_angles = head_PoseEstimat.get_inference_outputs()

                # Gaze estimation from both eye crops and the head pose
                gaze_Estimation.set_params(img_left_eye, img_right_eye,
                                           head_pose_angles)
                gaze_vector_output = gaze_Estimation.get_inference_outputs()

                # Display visualisation according to user CLI arguments
                if args.display_visual == "True":
                    frame = utils.draw_visualisation(
                        frame, face_points, data_points_marks,
                        head_pose_angles, data_l_eye, data_r_eye,
                        gaze_vector_output)
                else:
                    frame = original_frame

                # Show the frame in realtime
                cv2.imshow('frame', frame)

                if args.input_feed == 'image':
                    cv2.imwrite("../bin/output.jpg", frame)
                # Was `== 'video' or 'cam'`, which is always true.
                if args.input_feed in ('video', 'cam'):
                    feed.save_to_video(frame)

                if args.mouse_move == "True":
                    mouse_controller.move(*gaze_vector_output[:2])

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    # Release everything once the job is finished
    feed.close()
    log.info('End inferences ...')
def infer_on_stream(args):
    """Run the four-model gaze pipeline on a video and write timing statistics.

    Args:
        args: Parsed command-line namespace. This function reads
            ``faceDetectionModel``, ``facialLandmarksModel``,
            ``headPoseModel``, ``gazeModel``, ``input``, ``device``,
            ``cpu_extension``, ``prob_threshold``, ``preview_flag`` and
            ``output_path``.

    Side effects: creates ``output_path`` if missing, writes
    ``output_video.mp4`` and ``stats.txt`` (total inference time, fps,
    total model load time) into it, shows a preview window and moves the
    mouse. Exits with status 1 when a model fails to construct.
    """
    device_name = args.device
    cpu_extension = args.cpu_extension
    prob_threshold = args.prob_threshold
    preview_flag = args.preview_flag
    output_path = args.output_path

    # makedirs also handles nested output paths, unlike os.mkdir.
    os.makedirs(output_path, exist_ok=True)

    mouse_control = MouseController("low", "fast")

    try:
        logging.info("*********** Model Load Time ***************")
        start_model_load_time = time.time()

        start_time = time.time()
        face_detection_model = FaceDetectionModel(
            args.faceDetectionModel, device_name, cpu_extension)
        logging.info("Face Detection Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        facial_landmarks_detection_model = FacialLandmarksDetectionModel(
            args.facialLandmarksModel, device_name, cpu_extension)
        logging.info("Facial Landmarks Detection Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        head_pose_estimation_model = HeadPoseEstimationModel(
            args.headPoseModel, device_name, cpu_extension)
        logging.info("Head Pose Estimation Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        gaze_estimation_model = GazeEstimationModel(
            args.gazeModel, device_name, cpu_extension)
        logging.info("Gaze Estimation Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        total_model_load_time = time.time() - start_model_load_time
        logging.info("*********** Model Load Completed ***********")
    except Exception as e:
        logging.error("ERROR in model loading: " + str(e))
        sys.exit(1)

    feeder = InputFeeder('video', args.input)
    feeder.load_data()

    # NOTE(review): the frame size is hard-coded to 1920x1080; confirm the
    # input always matches, otherwise the written video may be empty.
    out_video = cv2.VideoWriter(os.path.join(output_path, 'output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                int(feeder.fps() / 10), (1920, 1080), True)

    frame_count = 0
    face_detect_infer_time = 0
    facial_landmarks_infer_time = 0
    head_pose_infer_time = 0
    gaze_infer_time = 0
    # Was initialised to 0, which made the reported total inference time
    # equal to the Unix epoch time.
    start_inference_time = time.time()

    try:
        while True:
            try:
                frame = next(feeder.next_batch())
            except StopIteration:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1

            # Face detection model
            image = face_detection_model.preprocess_input(frame)
            start_time = time.time()
            outputs = face_detection_model.predict(image)
            face_detect_infer_time += time.time() - start_time
            out_frame, faces = face_detection_model.preprocess_output(
                outputs, frame, preview_flag, prob_threshold)

            for face in faces:
                crop_image = frame[face[1]:face[3], face[0]:face[2]]

                # Facial landmarks detection model
                image = facial_landmarks_detection_model.preprocess_input(
                    crop_image)
                start_time = time.time()
                outputs = facial_landmarks_detection_model.predict(image)
                facial_landmarks_infer_time += time.time() - start_time
                (out_frame, left_eye_point, right_eye_point
                 ) = facial_landmarks_detection_model.preprocess_output(
                     outputs, out_frame, face, preview_flag)

                # Head pose estimation model
                image = head_pose_estimation_model.preprocess_input(
                    crop_image)
                start_time = time.time()
                outputs = head_pose_estimation_model.predict(image)
                head_pose_infer_time += time.time() - start_time
                (out_frame, headpose_angels_list
                 ) = head_pose_estimation_model.preprocess_output(
                     outputs, out_frame, preview_flag)

                # Gaze estimation model
                (out_frame, left_eye,
                 right_eye) = gaze_estimation_model.preprocess_input(
                     out_frame, crop_image, left_eye_point, right_eye_point)
                start_time = time.time()
                outputs = gaze_estimation_model.predict(
                    left_eye, right_eye, headpose_angels_list)
                gaze_infer_time += time.time() - start_time
                out_frame, gazevector = gaze_estimation_model.preprocess_output(
                    outputs, out_frame, face, left_eye_point,
                    right_eye_point, preview_flag)

                cv2.imshow("Computer Pointer Control", out_frame)
                out_video.write(out_frame)
                mouse_control.move(gazevector[0], gazevector[1])

            if key_pressed == 27:
                break
    finally:
        # The writer was never released before; release everything even
        # when a frame fails.
        out_video.release()
        feeder.close()
        cv2.destroyAllWindows()

    if frame_count > 0:
        logging.info("*********** Model Inference Time ****************")
        logging.info("Face Detection Model: {:.1f} ms.".format(
            1000 * face_detect_infer_time / frame_count))
        logging.info("Facial Landmarks Detection Model: {:.1f} ms.".format(
            1000 * facial_landmarks_infer_time / frame_count))
        logging.info("Head Pose Detection Model: {:.1f} ms.".format(
            1000 * head_pose_infer_time / frame_count))
        logging.info("Gaze Detection Model: {:.1f} ms.".format(
            1000 * gaze_infer_time / frame_count))
        logging.info("*********** Model Inference Completed ***********")

    total_infer_time = time.time() - start_inference_time
    total_inference_time = round(total_infer_time, 1)
    # Rounding to one decimal can give 0.0 for very short runs.
    fps = frame_count / total_inference_time if total_inference_time else 0.0

    with open(os.path.join(output_path, 'stats.txt'), 'w') as f:
        f.write(str(total_inference_time) + '\n')
        f.write(str(fps) + '\n')
        f.write(str(total_model_load_time) + '\n')

    logging.info("*********** Total Summary ****************")
    logging.info(f"Total Model Load Time: {total_model_load_time}")
    logging.info(f"Total Inference Time: {total_inference_time}")
    logging.info(f"FPS: {fps}")
    logging.info("*********** Total Summary ***********")
    logging.info("*********** ************************* ***********")
# Tail of a model-setup / dispatch routine; its beginning (including the
# definition of `head_pose_estimation_model` and `args`) is not visible here.

# Build the head pose estimation model, then load it, check it and query
# its input name.
hd = head_pose_estimation.head_pose_estimation(head_pose_estimation_model, args.device)
hd.load_model()
hd.check_model()
hd.get_input_name()
# Same sequence for the gaze estimation model.
gaze_estimation_model = args.gaze_estimation_model
ge = gaze_estimation.gaze_estimation(gaze_estimation_model, args.device)
ge.load_model()
ge.check_model()
ge.get_input_name()
# Initialize the mouse controller with the precision and speed from the CLI.
mouse_controller = MouseController(args.mouse_precision, args.mouse_speed)
# Dispatch on the requested input type.
if (args.inputType == 'image'):
    input_image = args.input_path
    feed = InputFeeder(input_type='image', input_file=input_image)
    feed.load_data()
    # For image input the loaded data on `feed.cap` is used as the frame.
    frame = feed.cap
    _, output_img = process_frame(frame, args.visualize)
    # NOTE(review): no cv2.waitKey() follows, so the preview window may
    # never be rendered - confirm whether that is intended.
    cv2.imshow("Preview", output_img)
    cv2.imwrite(args.output_path, output_img)
elif (args.inputType == 'video'):
    process_video(args.input_path, args.output_path, args.visualize)
elif (args.inputType == 'cam'):
    # The camera case passes None as the input path.
    process_video(None, args.output_path, args.visualize)
else:
    print("Invalid input type")
def main():
    """Run the gaze-controlled mouse demo and print load/inference timings.

    Reads ``input``, the four model paths, ``device``, ``cpu_extension``
    and ``prob_threshold`` from ``build_argparser()``. ``input`` is either
    ``CAM`` (now matched case-insensitively) or a path to a video file.
    The process exits with status 1 when the video file or any model file
    is missing.

    Every fifth frame is shown and used to move the mouse; the loop ends
    at end of stream or on ESC.
    """
    args = build_argparser().parse_args()

    if args.input.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(args.input):
            log.info("Unable to find specified video file")
            sys.exit(1)
        inputFeeder = InputFeeder("video", args.input)

    modelPathDict = {
        'FaceDetectionModel': args.face_detection_model,
        'FacialLandmarksDetectionModel': args.facial_landmark_model,
        'GazeEstimationModel': args.gaze_estimation_model,
        'HeadPoseEstimationModel': args.head_pose_model
    }
    for fileNameKey, modelPath in modelPathDict.items():
        if not os.path.isfile(modelPath):
            log.info("Unable to find specified " + fileNameKey + " xml file")
            sys.exit(1)

    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device,
                             args.cpu_extension)
    fldm = FacialLandmarksDetectionModel(
        modelPathDict['FacialLandmarksDetectionModel'], args.device,
        args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'],
                              args.device, args.cpu_extension)
    hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'],
                                   args.device, args.cpu_extension)
    mc = MouseController('medium', 'fast')

    start_time_1 = time.time()
    inputFeeder.load_data()
    fdm.load_model()
    fldm.load_model()
    hpem.load_model()
    gem.load_model()
    total_model_load_time = time.time() - start_time_1
    print("Model Load Time: {:.3f}".format(total_model_load_time))

    frame_count = 0
    start_time = time.time()
    try:
        for ret, frame in inputFeeder.next_batch():
            if not ret:
                break
            frame_count += 1
            if frame_count % 5 == 0:
                cv2.imshow('video', cv2.resize(frame, (450, 450)))
            key = cv2.waitKey(60)

            croppedFace, _face_coords = fdm.predict(frame.copy(),
                                                    args.prob_threshold)
            # The detector signals "no face" by returning an int.
            if isinstance(croppedFace, int):
                log.info("Unable to detect the face.")
                if key == 27:
                    break
                continue

            hp_out = hpem.predict(croppedFace.copy())
            left_eye, right_eye, _eye_coords = fldm.predict(
                croppedFace.copy())
            new_mouse_coord, _gaze_vector = gem.predict(
                left_eye, right_eye, hp_out)

            if frame_count % 5 == 0:
                mc.move(new_mouse_coord[0], new_mouse_coord[1])
            if key == 27:
                break
        log.info("VideoStream has ended.")
    finally:
        # Release the windows and the feed even when a frame fails.
        cv2.destroyAllWindows()
        inputFeeder.close()

    total_inference_time = time.time() - start_time
    fps = frame_count / total_inference_time if total_inference_time else 0.0
    print("Inference Time: {:.3f}".format(total_inference_time))
    print("FPS: {}".format(fps))
def main():
    """Run the gaze-controlled mouse demo with optional statistics and overlays.

    Reads ``input``, ``device``, ``cpu_extension``, the four model paths,
    ``stats`` and ``visual_flag`` from ``get_args()``. ``input`` is either
    ``cam`` (case-insensitive) or a path to a video file; the process
    exits with status 1 when the file does not exist. Logging goes to
    ``example.log``.

    With ``stats == 1`` the per-model load times and the average
    per-frame inference time are reported. With ``visual_flag == 1``
    bounding boxes, landmarks, gaze and head-pose axes are drawn on the
    preview. The mouse is moved on every fifth frame. The loop ends at
    end of stream or when ESC is pressed.
    """
    args = get_args()
    log.basicConfig(filename='example.log', level=log.DEBUG)
    inputFile = args.input
    mouse = MouseController("high", "fast")
    frame_count = 0
    focal_length = 950.0
    scale = 50
    collect_stats = args.stats == 1

    if inputFile.lower() == "cam":
        feed = InputFeeder('cam')
    else:
        if not os.path.isfile(inputFile):
            log.error("Unable to find file: " + inputFile)
            exit(1)
        feed = InputFeeder("video", inputFile)
    log.info("Video source: " + str(inputFile))
    log.info("InputFeeder initialized")
    log.info("Device: " + str(args.device))
    log.info("Face detection model: " + str(args.facedetectionmodel))
    log.info("Facial landmarks model: " + str(args.faciallandmarksmodel))
    log.info("Head pose estimation model: " + str(args.headposemodel))
    log.info("Gaze estimation model: " + str(args.gazeestimationmodel))

    if collect_stats:
        print("Running statistics...")
    inference_times = []

    def load_and_check(model):
        """Load and check *model*; return the load_model() time in seconds."""
        started = time.time()
        model.load_model()
        elapsed = time.time() - started
        model.check_model()
        return elapsed

    # The original reused one `start_time` for every model, so the total
    # duration only covered the last model. Use a dedicated start mark.
    loading_start = time.time()

    # Create instances of the different models
    fdm = FaceDetector(args.facedetectionmodel, args.device,
                       args.cpu_extension)
    fdm_load_time = load_and_check(fdm)
    hpm = HeadPoseEstimator(args.headposemodel, args.device,
                            args.cpu_extension)
    hpm_load_time = load_and_check(hpm)
    flm = FacialLandmarksDetector(args.faciallandmarksmodel, args.device,
                                  args.cpu_extension)
    flm_load_time = load_and_check(flm)
    gem = GazeEstimator(args.gazeestimationmodel, args.device,
                        args.cpu_extension)
    gem_load_time = load_and_check(gem)

    if collect_stats:
        duration_loading = time.time() - loading_start
        print(
            f"Duration for loading and checking the models: {duration_loading}"
        )
        log.info(
            f"Duration for loading and checking the models: {duration_loading}"
        )

    cv2.namedWindow('preview', cv2.WINDOW_NORMAL)
    cv2.resizeWindow('preview', 600, 600)
    feed.load_data()

    try:
        for ret, frame in feed.next_batch():
            if not ret:
                break
            if frame is None:
                continue
            frame_count += 1

            key = cv2.waitKey(60)
            # ESC was previously honoured only on frames without a face.
            if key == 27:
                break

            if collect_stats:
                start_time = time.time()

            # Run face detection
            face_crop, face_coords = fdm.predict(frame.copy())
            # Check for "no face" before touching the result; the original
            # unpacked it first and crashed on the int sentinel.
            if isinstance(face_coords, int):
                print("Unable to detect face")
                continue
            print("Face crop shape: " + str(face_crop.shape))
            xmin, ymin, _xmax, _ymax = face_coords

            try:
                # Facial landmark detection
                (left_eye_crop, right_eye_crop, landmarks,
                 crop_coords) = flm.predict(face_crop.copy())

                # Landmark positions relative to the complete frame.
                # NOTE: `landmarks_viz` aliases `landmarks`, so the offsets
                # are applied in place, as in the original.
                landmarks_viz = landmarks
                landmarks_viz[0] = landmarks_viz[0] + xmin
                landmarks_viz[1] = landmarks_viz[1] + ymin
                landmarks_viz[2] = landmarks_viz[2] + xmin
                landmarks_viz[3] = landmarks_viz[3] + ymin
                # Eight values alternating x, y: offset by xmin / ymin.
                crop_coords_viz = tuple(
                    crop_coords[i] + (xmin, ymin)[i % 2] for i in range(8))
                left_eye_viz = (landmarks_viz[0], landmarks_viz[1])
                right_eye_viz = (landmarks_viz[2], landmarks_viz[3])
                # Midpoint between the eyes, used as the axes origin.
                third_eye_viz_x = (landmarks_viz[2] -
                                   landmarks_viz[0]) / 2 + landmarks_viz[0]
                third_eye_viz_y = (landmarks_viz[3] -
                                   landmarks_viz[1]) / 2 + landmarks_viz[1]
                third_eye_viz = (third_eye_viz_x, third_eye_viz_y)

                # Head pose estimation
                head_pose = hpm.predict(face_crop.copy())
                print("Head pose: " + str(head_pose))
                (yaw, pitch, roll) = head_pose
                frame = display_head_pose(frame, pitch, roll, yaw)

                # Send inputs to GazeEstimator
                gaze_vector = gem.predict(head_pose, left_eye_crop,
                                          right_eye_crop)
                if collect_stats:
                    inference_times.append(time.time() - start_time)
                print(gaze_vector)
                frame = display_gaze(frame, gaze_vector)

                # Control the mouse
                if frame_count % 5 == 0:
                    mouse_x, mouse_y = get_mouse_vector(gaze_vector, roll)
                    print("Mouse vector:" + str(mouse_x) + " - " +
                          str(mouse_y))
                    mouse.move(mouse_x, mouse_y)
                    currentMouseX, currentMouseY = pyautogui.position()
                    print("Mouse coordinates: " + str(currentMouseX) + ", " +
                          str(currentMouseY))

                if args.visual_flag == 1:
                    frame = draw_bounding_box(frame, face_coords)
                    left_eye_frame = crop_coords_viz[0:4]
                    right_eye_frame = crop_coords_viz[4:]
                    frame = draw_bounding_box(frame, left_eye_frame)
                    frame = draw_bounding_box(frame, right_eye_frame)
                    frame = visualize_landmark(frame, left_eye_viz)
                    frame = visualize_landmark(frame, right_eye_viz,
                                               color=(0, 0, 255))
                    frame = visualize_gaze(frame, gaze_vector, landmarks_viz)
                    # Visualize the axes of the HeadPoseEstimator results
                    frame = hpm.draw_axes(frame.copy(), third_eye_viz, yaw,
                                          pitch, roll, scale, focal_length)

                cv2.imshow('preview', frame)
                cv2.imshow('left eye', left_eye_crop)
                cv2.imshow('right eye', right_eye_crop)
            except Exception as e:
                # Best effort: report the failing frame and keep going.
                print("Unable to predict using model" + str(e) +
                      " for frame " + str(frame_count))
                log.error("Unable to predict using model" + str(e) +
                          " for frame " + str(frame_count))
                continue
    finally:
        cv2.destroyAllWindows()
        feed.close()

    if collect_stats:
        if inference_times:
            avg_inference_time = sum(inference_times) / len(inference_times)
            print("Average inference time: " + str(avg_inference_time))
            log.info("Average inference time: " + str(avg_inference_time))
        log.info("Load time for face detection model: " + str(fdm_load_time))
        log.info("Load time for facial landmarks model: " +
                 str(flm_load_time))
        log.info("Load time for head pose detection model: " +
                 str(hpm_load_time))
        log.info("Load time for gaze estimation model: " + str(gem_load_time))
def main(args):
    """Run gaze estimation on a video, record the frames and move the mouse.

    Args:
        args: Parsed command-line namespace. This function reads
            ``log_level`` (``debug``/``info``/``warning``/``error``;
            anything else falls back to ERROR), ``face_detection``,
            ``face_landmark``, ``head_pose``, ``gaze_estimation`` (model
            paths), ``device``, ``input``, ``output_dir``, ``visualize``
            and ``show_result``.

    Side effects: writes ``demo_output11.mp4`` into ``args.output_dir``
    and opens a ``gaze`` window. The feed, the writer and the windows are
    released even when a frame raises.
    """
    # Set log level
    levels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR
    }
    log_level = levels.get(args.log_level, logging.ERROR)
    logging.basicConfig(level=log_level)

    mouse_control = MouseController('high', 'fast')

    logging.info("Model Loading Please Wait ..")
    face_det = FaceDetection(args.face_detection, args.device)
    facial_det = FaceLandmark(args.face_landmark, args.device)
    head_pose_est = HeadPoseEstimation(args.head_pose, args.device)
    gaze_est = GazeEstimation(args.gaze_estimation, args.device)

    inp = InputFeeder(input_type='video', input_file=args.input)
    inp.load_data()

    face_det.load_model()
    facial_det.load_model()
    head_pose_est.load_model()
    gaze_est.load_model()
    # Logged after load_model(); the original reported success before any
    # model had been loaded.
    logging.info("Model loading successfully")

    # NOTE(review): the frame size is hard-coded to 1920x1080; confirm the
    # input always matches.
    video_writer = cv2.VideoWriter(args.output_dir + '/demo_output11.mp4',
                                   cv2.VideoWriter_fourcc(*'MPEG'), 15,
                                   (1920, 1080), True)
    cv2.namedWindow('gaze')

    try:
        for frame in inp.next_batch():
            # A value without `.shape` (e.g. None) marks the end of the
            # stream; this replaces the original try/except probe.
            if not hasattr(frame, 'shape'):
                break

            crop_face, _crop_coords = face_det.predict(
                frame, visualize=args.visualize)
            (left_eye, right_eye, left_eye_crop,
             right_eye_crop) = facial_det.predict(crop_face,
                                                  visualize=args.visualize)
            head_pose = head_pose_est.predict(crop_face,
                                              visualize=args.visualize)
            (new_x, new_y), gaze_vector = gaze_est.predict(
                left_eye_crop, right_eye_crop, head_pose)

            # Arrow from each eye in the gaze direction, drawn on the crop.
            left_eye_gaze = (int(left_eye[0] + gaze_vector[0] * 100),
                             int(left_eye[1] - gaze_vector[1] * 100))
            right_eye_gaze = (int(right_eye[0] + gaze_vector[0] * 100),
                              int(right_eye[1] - gaze_vector[1] * 100))
            cv2.arrowedLine(crop_face, left_eye, left_eye_gaze,
                            (0, 0, 255), 2)
            cv2.arrowedLine(crop_face, right_eye, right_eye_gaze,
                            (0, 0, 255), 2)

            video_writer.write(frame)
            mouse_control.move(new_x, new_y)

            if args.show_result:
                cv2.imshow('gaze', frame)
                cv2.waitKey(1)
    finally:
        inp.close()
        video_writer.release()
        cv2.destroyAllWindows()
def main():
    """Run the gaze-controlled mouse pipeline and log timing statistics.

    Parses the command line with ``build_argparser()``, loads the four
    models (recording each loading time), processes every frame delivered by
    the input feeder, optionally draws the overlays selected by
    ``args.visualize`` ('F', 'H', 'L', 'G'), moves the mouse on every fourth
    frame and finally logs loading times and per-frame averages.

    The process exits with status 1 when the input is neither ``cam`` nor an
    existing path.
    """

    def _timed_load(model):
        """Load *model* and return the elapsed wall-clock time in seconds."""
        started = time.time()
        model.load_model()
        return time.time() - started

    # Grab command line args
    args = build_argparser().parse_args()
    input_src = args.input
    device = args.device
    extension = args.cpu_extension
    prob_threshold = args.prob_threshold

    # Create log object set for console output and set log level
    log_obj = log.getLogger()
    log_obj.setLevel(LOGLEVEL)
    console_handler = log.StreamHandler()
    console_handler.setLevel(LOGLEVEL)
    log_obj.addHandler(console_handler)

    # Create detection objects
    face_detection_obj = FaceDetectionModel(args.facedetectionmodel, device,
                                            extension)
    head_pose_obj = HeadPoseModel(args.headposemodel, device, extension)
    landmarks_obj = LandmarksModel(args.facelandmarksnmodel, device,
                                   extension)
    gaze_estimation_obj = GazeEstimationModel(args.gazeestimationmodel,
                                              device, extension)

    # Create mouse controller object and place the pointer at screen centre
    mouse_controller = MouseController('medium', 'fast')
    mouse_controller.init_position()
    log_obj.info("[Info]: Place mouse at the center of the screen")

    # Load models and get the model loading times (seconds)
    face_detection_loading_time = _timed_load(face_detection_obj)
    head_pose_loading_time = _timed_load(head_pose_obj)
    landmarks_detection_loading_time = _timed_load(landmarks_obj)
    gaze_estimation_loading_time = _timed_load(gaze_estimation_obj)

    # Configure input video source
    if input_src.lower() == 'cam':
        input_channel = InputFeeder(input_type='cam')
    elif not os.path.exists(input_src):
        log.error("Video file not found! Exiting....")
        exit(1)
    else:
        input_channel = InputFeeder(input_type='video', input_file=input_src)
        log_obj.info("[Info]: Opening video file ...")
    input_channel.load_data()
    fps = int(input_channel.cap.get(cv2.CAP_PROP_FPS))

    frame_counter = 0
    total_face_inf_time = 0
    total_head_inf_time = 0
    total_lanmarks_inf_time = 0
    total_gaze_inf_time = 0
    frame_processing_time = 0  # accumulated in milliseconds

    green = (0, 255, 0)

    # Process each frame
    try:
        for frame in input_channel.next_batch():
            frame_processing_start_time = time.time()
            frame_counter += 1
            key = cv2.waitKey(60)

            # Face detection: cropped face plus its coordinates
            cropped_face, face_coords, face_inference_time = \
                face_detection_obj.predict(frame, prob_threshold)
            total_face_inf_time += face_inference_time

            # Head pose from the cropped face
            head_pose_estimate, head_inference_time = head_pose_obj.predict(
                cropped_face, prob_threshold)
            total_head_inf_time += head_inference_time

            # Landmarks from the cropped face
            (cropped_left_eye, cropped_right_eye, eyes_coords,
             converted_landmarks, landmarks_inference_time) = \
                landmarks_obj.predict(cropped_face, prob_threshold)
            total_lanmarks_inf_time += landmarks_inference_time

            # Finally gaze estimation
            gaze_vector, gaze_estimate_time = gaze_estimation_obj.predict(
                cropped_left_eye, cropped_right_eye, head_pose_estimate)
            total_gaze_inf_time += gaze_estimate_time

            # Visualize flags: 'F', 'H', 'L', 'G'. When any flag is given the
            # annotated frame is shown at reduced size.
            if args.visualize is not None:
                visualize_flag = str(args.visualize)
                face_x, face_y = face_coords[0][0], face_coords[0][1]

                # Draw bounding box around detected face
                if 'F' in visualize_flag:
                    cv2.rectangle(frame, (face_x, face_y),
                                  (face_coords[0][2], face_coords[0][3]),
                                  green, 2)

                # Show head pose parameters
                if 'H' in visualize_flag:
                    cv2.putText(
                        frame,
                        "Head pose: yaw: {:.3f}, pitch: {:.3f}, roll: {:.3f}".
                        format(head_pose_estimate[0], head_pose_estimate[1],
                               head_pose_estimate[2]), (10, 20),
                        cv2.FONT_HERSHEY_COMPLEX, 1, green, 5)

                # Draw dots on the five (x, y) landmark pairs; the landmark
                # values are offset by the face origin before drawing.
                if 'L' in visualize_flag:
                    for i in range(0, 10, 2):
                        cv2.circle(frame,
                                   (converted_landmarks[i] + face_x,
                                    converted_landmarks[i + 1] + face_y),
                                   10, green, 5)

                # Display gaze parameters
                if 'G' in visualize_flag:
                    cv2.putText(
                        frame,
                        "Gaze estimate: x: {:.3f}, y: {:.3f}, z: {:.3f}".
                        format(gaze_vector[0], gaze_vector[1],
                               gaze_vector[2]), (10, 100),
                        cv2.FONT_HERSHEY_COMPLEX, 1, green, 5)

                resized_frame = cv2.resize(frame, (640, 360))
                cv2.imshow('frame', resized_frame)

            # Move the mouse only on every fourth frame
            if frame_counter % 4 == 0:
                mouse_controller.move(gaze_vector[0], gaze_vector[1])

            frame_processing_time += (
                time.time() - frame_processing_start_time) * 1000

            if key == 27:  # ESC
                break
    except Exception as e:
        # NOTE(review): an error mentioning 'shape' is treated as the normal
        # end of the feed (presumably a None frame) -- confirm.
        if 'shape' in str(e):
            log_obj.info("Video feed finished")
        else:
            log_obj.error("[ERROR]: " + str(e))

    # All done, cleaning up
    cv2.destroyAllWindows()
    input_channel.close()

    # Avoid ZeroDivisionError in the averages when no frame was processed.
    frames = max(frame_counter, 1)

    # Print out statistics
    log_obj.info("[Info]: Video source FPS: " + str(fps))
    log_obj.info("[Info]: Total frame count: " + str(frame_counter))
    log_obj.info("")
    log_obj.info("[Info]: Face detection model loading time: {:.3f} ms".format(
        face_detection_loading_time * 1000))
    log_obj.info("[Info]: Head pose model loading time: {:.3f} ms".format(
        head_pose_loading_time * 1000))
    log_obj.info(
        "[Info]: Facial landmarks detection model loading time: {:.3f} ms".
        format(landmarks_detection_loading_time * 1000))
    log_obj.info(
        "[Info]: Gaze estimation model loading time: {:.3f} ms".format(
            gaze_estimation_loading_time * 1000))
    log_obj.info("")
    log_obj.info(
        "[Info]: Average per frame total processing time : {:.3f} ms".format(
            frame_processing_time / frames))
    log_obj.info("[Info]: Average face inferencing time: {:.3f} ms".format(
        total_face_inf_time / frames))
    log_obj.info(
        "[Info]: Average head pose inferencing time: {:.3f} ms".format(
            total_head_inf_time / frames))
    log_obj.info(
        "[Info]: Average facial landmarks inferencing time: {:.3f} ms".format(
            total_lanmarks_inf_time / frames))
    log_obj.info("[Info]: Average gaze estimate time: {:.3f} ms".format(
        total_gaze_inf_time / frames))
def run_controller(args):
    """Drive the mouse pointer from the gaze estimated on ``args.input``.

    ``args.input`` is either the literal ``"cam"``, a ``.jpg``/``.bmp`` image
    path or (anything else) a video path. A missing file terminates the
    process with status 1. Every frame is passed through the face, pose,
    landmark and gaze models; every fifth frame is displayed and used to move
    the mouse. ESC (key code 27) stops the loop.
    """
    source = args.input
    if source == "cam":
        feeder = InputFeeder("cam")
    else:
        # Images are recognised by extension; everything else is a video.
        kind = "image" if source.endswith(('.jpg', '.bmp')) else "video"
        if not os.path.isfile(source):
            log.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder(kind, source, args.save)
    feeder.load_data()

    mc = MouseController('medium', 'fast')

    model_face = Face_Detector()
    model_face.load_model(args.model_fd, args.device, args.extension)
    model_pose = Pose_Estimator()
    model_pose.load_model(args.model_pe, args.device, args.extension)
    model_landmark = Facial_Landmarks()
    model_landmark.load_model(args.model_fl, args.device, args.extension)
    model_gaze = Gaze_Estimator()
    model_gaze.load_model(args.model_ge, args.device, args.extension)

    escape = 27
    for frame_count, (b, frame) in enumerate(feeder.next_batch(), start=1):
        # The models draw on ``preview``; ``frame`` stays untouched.
        preview = np.copy(frame)
        crop_face, face_count, points = model_face.predict(
            preview, args.thres_fd)
        key_pressed = cv2.waitKey(30)

        if face_count == 0:
            # NOTE(review): ``b`` looks like an end-of-stream flag from the
            # feeder -- confirm against InputFeeder.next_batch.
            if b or key_pressed == escape:
                break
            log.error('no face is detected')
            feeder.save_file(preview)
            continue

        angles = model_pose.predict(preview, crop_face)
        left_eye, right_eye, eye_points = model_landmark.predict(
            preview, crop_face, points)
        mx, my = model_gaze.predict(preview, left_eye, right_eye, angles,
                                    eye_points)
        feeder.save_file(preview)

        if key_pressed == escape:
            break

        if frame_count % 5 != 0:
            continue
        shown = preview if args.draw_lines else frame
        cv2.imshow('video', cv2.resize(shown, (500, 500)))
        mc.move(mx, my)

    feeder.close()
    cv2.destroyAllWindows()
def _draw_debug_overlay(frame, face_coords, eye_coords, yaw, pitch, roll,
                        gaze_vector):
    """Draw face box, eye boxes, head-pose axes and gaze line on *frame*."""
    if not 0 < len(face_coords):
        cv2.putText(frame, 'Face not detected', (10, 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 1)
        return

    # face rect
    fxmin, fymin, fxmax, fymax = (face_coords[0][0], face_coords[0][1],
                                  face_coords[0][2], face_coords[0][3])
    cv2.rectangle(frame, (fxmin, fymin), (fxmax, fymax), (200, 0, 0), 2)

    # eye rects: eye coordinates are offset by the face origin
    for eye in (eye_coords[0], eye_coords[1]):
        cv2.rectangle(frame, (fxmin + eye[0], fymin + eye[1]),
                      (fxmin + eye[2], fymin + eye[3]), (0, 200, 0), 2)

    # Head pose axes, anchored at the top-left corner of the face box.
    length = 100
    yaw_r = math.radians(yaw)
    pitch_r = math.radians(-pitch)
    roll_r = math.radians(roll)
    x1 = int(length * (math.cos(yaw_r) * math.cos(roll_r)))
    y1 = int(length * (math.cos(pitch_r) * math.sin(roll_r) +
                       math.cos(roll_r) * math.sin(pitch_r) *
                       math.sin(yaw_r)))
    x2 = int(length * (-math.cos(yaw_r) * math.sin(roll_r)))
    y2 = int(length * (math.cos(pitch_r) * math.cos(roll_r) +
                       math.sin(pitch_r) * math.sin(yaw_r) *
                       math.sin(roll_r)))
    x3 = int(length * (math.sin(yaw_r)))
    y3 = int(length * (-math.cos(yaw_r) * math.sin(pitch_r)))
    cv2.line(frame, (fxmin, fymin), (fxmin + x1, fymin + y1), (0, 255, 0), 2)
    cv2.line(frame, (fxmin, fymin), (fxmin + x2, fymin + y2), (255, 0, 0), 2)
    cv2.line(frame, (fxmin, fymin), (fxmin + x3, fymin + y3), (0, 0, 255), 2)

    # gaze line, anchored at the bottom-right corner of the face box
    x = int(length * gaze_vector[0])
    y = -int(length * gaze_vector[1])
    cv2.line(frame, (fxmax, fymax), (fxmax + x, fymax + y), (0, 255, 255), 5)


def main():
    """Run the four-model gaze pipeline on ``args.input`` and move the mouse.

    Each model is loaded with its loading time logged at debug level. For
    every frame the face, head-pose, landmark and gaze models are run and
    timed; the mouse is moved on every second frame; when ``args.show`` is
    set an annotated frame is displayed at one third of its size. ESC stops
    the loop. Average inference times are logged at the end.
    """
    args = build_argparser().parse_args()
    log.debug(args)

    # Load face detection model
    faceDetection = ModelFaceDetection(args.face_detection_model,
                                       args.prob_threshold, args.device,
                                       args.cpu_extension)
    start_model_load_time = time.time()
    faceDetection.load_model()
    facedetection_model_load_time = time.time() - start_model_load_time
    log.debug('Facedetection model load time. {}'.format(
        facedetection_model_load_time))

    # Load head pose estimation model
    headPoseEstimation = ModelHeadPoseEstimation(
        args.headpose_estimation_model, args.prob_threshold, args.device,
        args.cpu_extension)
    start_model_load_time = time.time()
    headPoseEstimation.load_model()
    headposeestimation_model_load_time = time.time() - start_model_load_time
    log.debug('Head pose estimation model load time. {}'.format(
        headposeestimation_model_load_time))

    # Facial landmark model
    facialLandmarkDetection = ModelFacialLandmarkDetection(
        args.landmarks_regression_model, args.prob_threshold, args.device,
        args.cpu_extension)
    start_model_load_time = time.time()
    facialLandmarkDetection.load_model()
    facialLandmarkDetection_model_load_time = (time.time() -
                                               start_model_load_time)
    log.debug('Facial landmarks detection model load time. {}'.format(
        facialLandmarkDetection_model_load_time))

    # Gaze estimation model
    gazeEstimation = ModelGazeEstimation(args.gaze_estimation_model,
                                         args.prob_threshold, args.device,
                                         args.cpu_extension)
    start_model_load_time = time.time()
    gazeEstimation.load_model()
    gazeEstimation_model_load_time = time.time() - start_model_load_time
    log.debug('Gaze estimation model load time. {}'.format(
        gazeEstimation_model_load_time))

    # Feeder
    feeder = InputFeeder(args.input)
    feeder.load_data()

    # One controller for the whole run instead of a new one per move.
    mouse = MouseController('high', 'fast')

    counter = 0
    window_name = 'frame'
    window_shown = False
    facedetection_inference_time_sum = 0
    headpose_inference_time_sum = 0
    faciallandmark_inference_time_sum = 0
    gazeestimation_inference_time_sum = 0

    # Process frames
    for frame in feeder.next_batch():
        if frame is None:
            break

        key_pressed = cv2.waitKey(1)
        if key_pressed == 27:  # ESC
            break

        # Face detection
        start_inference_time = time.time()
        face_image, face_coords = faceDetection.predict(frame)
        facedetection_inference_time_sum += time.time() - start_inference_time

        # Head pose estimation
        start_inference_time = time.time()
        yaw, pitch, roll = headPoseEstimation.predict(face_image)
        headpose_inference_time_sum += time.time() - start_inference_time

        # Facial landmarks detection
        start_inference_time = time.time()
        left_eye_image, right_eye_image, eye_coords = \
            facialLandmarkDetection.predict(face_image)
        faciallandmark_inference_time_sum += (time.time() -
                                              start_inference_time)

        # Gaze estimation
        start_inference_time = time.time()
        gaze_vector = gazeEstimation.predict(left_eye_image, right_eye_image,
                                             [yaw, pitch, roll])
        gazeestimation_inference_time_sum += (time.time() -
                                              start_inference_time)

        # Mouse: move only on every second frame
        if counter % 2 == 0:
            mouse.move(gaze_vector[0], gaze_vector[1])

        # Display frame
        if args.show:
            _draw_debug_overlay(frame, face_coords, eye_coords, yaw, pitch,
                                roll, gaze_vector)
            cv2.imshow(
                window_name,
                cv2.resize(frame, (int(frame.shape[1] / 3),
                                   int(frame.shape[0] / 3))))
            window_shown = True

        counter += 1

    # Guard the averages against an empty feed (counter == 0).
    frames = max(counter, 1)
    log.debug("Face detection inference time average {}".format(
        facedetection_inference_time_sum / frames))
    log.debug("Headpose inference time average {}".format(
        headpose_inference_time_sum / frames))
    log.debug("Faciallandmark inference time average {}".format(
        faciallandmark_inference_time_sum / frames))
    log.debug("Gazeestimation inference time average {}".format(
        gazeestimation_inference_time_sum / frames))

    # Only destroy the window if it was actually created.
    if args.show and window_shown:
        cv2.destroyWindow(window_name)
def main():
    """Entry point: estimate gaze on a video or camera feed and move the mouse.

    The input (``args.input``) is ``cam`` or a video file; a missing video or
    model file logs an error and exits with status 1. Every frame goes
    through face detection, head pose, landmarks and gaze estimation.
    ``args.previewFlags`` ('fd', 'fld', 'hp', 'ge') selects the overlays shown
    in the "visualization" window. The mouse is moved on every fifth frame
    and ESC ends the loop.
    """
    args = build_argparser().parse_args()
    previewFlags = args.previewFlags
    logger = logging.getLogger()

    # --- input source ---------------------------------------------------
    inputFilePath = args.input
    if inputFilePath.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    elif os.path.isfile(inputFilePath):
        inputFeeder = InputFeeder("video", inputFilePath)
    else:
        logger.error("Unable to find specified video file")
        exit(1)

    # --- model files must exist before anything is constructed ----------
    modelPathDict = {
        'FaceDetectionModel': args.facedetectionmodel,
        'FacialLandmarksDetectionModel': args.faciallandmarkmodel,
        'GazeEstimationModel': args.gazeestimationmodel,
        'HeadPoseEstimationModel': args.headposemodel
    }
    for model_name, model_path in modelPathDict.items():
        if os.path.isfile(model_path):
            continue
        logger.error("Unable to find specified " + model_name + " xml file")
        exit(1)

    device, ext = args.device, args.cpu_extension
    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], device, ext)
    fldm = FacialLandmarksDetectionModel(
        modelPathDict['FacialLandmarksDetectionModel'], device, ext)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'], device,
                              ext)
    hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'],
                                   device, ext)

    mc = MouseController('medium', 'fast')

    inputFeeder.load_data()
    for model in (fdm, fldm, hpem, gem):
        model.load_model()

    green = (0, 255, 0)
    magenta = (255, 0, 255)
    frame_count = 0
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        every_fifth = frame_count % 5 == 0
        if every_fifth:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        croppedFace, face_coords = fdm.predict(frame.copy(),
                                               args.prob_threshold)
        # The face detector hands back an int instead of an image when no
        # face was found.
        if type(croppedFace) is int:
            logger.error("Unable to detect the face.")
            if key == 27:
                break
            continue

        hp_out = hpem.predict(croppedFace.copy())
        left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy())
        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye,
                                                   hp_out)

        if len(previewFlags) != 0:
            preview_frame = frame.copy()

            if 'fd' in previewFlags:
                # Show the cropped face instead of the full frame.
                preview_frame = croppedFace

            if 'fld' in previewFlags:
                # Box each eye with a 10 px margin on the cropped face.
                for eye in (eye_coords[0], eye_coords[1]):
                    cv2.rectangle(croppedFace, (eye[0] - 10, eye[1] - 10),
                                  (eye[2] + 10, eye[3] + 10), green, 3)

            if 'hp' in previewFlags:
                pose_text = (
                    "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                    format(hp_out[0], hp_out[1], hp_out[2]))
                cv2.putText(preview_frame, pose_text, (10, 20),
                            cv2.FONT_HERSHEY_COMPLEX, 0.25, green, 1)

            if 'ge' in previewFlags:
                # Draw a cross on copies of both eye crops, then paste them
                # back into the cropped face.
                x = int(gaze_vector[0] * 12)
                y = int(gaze_vector[1] * 12)
                w = 160
                le = cv2.line(left_eye.copy(), (x - w, y - w),
                              (x + w, y + w), magenta, 2)
                cv2.line(le, (x - w, y + w), (x + w, y - w), magenta, 2)
                re = cv2.line(right_eye.copy(), (x - w, y - w),
                              (x + w, y + w), magenta, 2)
                cv2.line(re, (x - w, y + w), (x + w, y - w), magenta, 2)
                left_box, right_box = eye_coords[0], eye_coords[1]
                croppedFace[left_box[1]:left_box[3],
                            left_box[0]:left_box[2]] = le
                croppedFace[right_box[1]:right_box[3],
                            right_box[0]:right_box[2]] = re

            cv2.imshow("visualization", cv2.resize(preview_frame, (500, 500)))

        if every_fifth:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break

    logger.error("VideoStream ended...")
    cv2.destroyAllWindows()
    inputFeeder.close()
def main():
    """Estimate gaze on a video or camera feed, move the mouse, report FPS.

    ``args.input`` is ``"CAM"`` or a video file; a missing video or model
    file is logged and the process exits with status 1. The total loading
    time of the four models is printed. Every frame goes through face
    detection, head pose, landmarks and gaze estimation;
    ``args.visualization_flag`` ('fd', 'fld', 'hp', 'ge') selects the
    overlays drawn on the "visualization" window. The mouse is moved on every
    fifth frame and ESC ends the loop. Total loop time and frames per second
    are printed at the end.
    """
    # Grabbing command line args
    args = build_argparser().parse_args()
    visual_flag = args.visualization_flag

    # Handle video file or CAM (like webcam)
    if args.input == "CAM":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(args.input):
            log.info("Unable to find specified video file")
            sys.exit(1)
        inputFeeder = InputFeeder("video", args.input)

    # Model path dictionary for the four pre-trained models
    modelPathDict = {
        'FaceDetectionModel': args.face_detection_model,
        'FacialLandmarksDetectionModel': args.facial_landmark_model,
        'GazeEstimationModel': args.gaze_estimation_model,
        'HeadPoseEstimationModel': args.head_pose_model
    }

    # Check that every model XML file exists
    for fileNameKey, model_path in modelPathDict.items():
        if not os.path.isfile(model_path):
            log.info("Unable to find specified " + fileNameKey + " xml file")
            sys.exit(1)

    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'],
                             args.device, args.cpu_extension)
    fldm = FacialLandmarksDetectionModel(
        modelPathDict['FacialLandmarksDetectionModel'], args.device,
        args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'],
                              args.device, args.cpu_extension)
    hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'],
                                   args.device, args.cpu_extension)

    mc = MouseController('medium', 'fast')

    inputFeeder.load_data()

    # Load the four models and measure the combined loading time, which is
    # useful for comparing model precisions.
    start_time_1 = time.time()
    fdm.load_model()
    fldm.load_model()
    hpem.load_model()
    gem.load_model()
    total_model_load_time = time.time() - start_time_1
    print("Total Model Load Time for All our Intel Pre Trained Models is (in seconds): {:.3f}".format(total_model_load_time))

    frame_count = 0
    start_time = time.time()

    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (450, 450)))

        key = cv2.waitKey(60)

        croppedFace, face_coords = fdm.predict(frame.copy(),
                                               args.prob_threshold)
        # The face detector hands back an int instead of an image when no
        # face was found.
        if type(croppedFace) == int:
            log.info("Unable to detect the face.")
            if key == 27:
                break
            continue

        hp_out = hpem.predict(croppedFace.copy())
        left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy())
        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye,
                                                   hp_out)

        if len(visual_flag) != 0:
            preview_frame = frame.copy()

            # Face box (xmin, ymin, xmax, ymax) in full-frame coordinates.
            x_minimum, y_minimum, x_maximum, y_maximum = face_coords[0:4]
            # Eye boxes come from the landmark model, which was fed the
            # cropped face, so they are relative to the face box origin.
            l_l1, l_l2, l_l3, l_l4 = eye_coords[0][0:4]
            r_r1, r_r2, r_r3, r_r4 = eye_coords[1][0:4]
            eye_surrounding_area = 10

            if 'fd' in visual_flag:
                cv2.rectangle(preview_frame, (x_minimum, y_minimum),
                              (x_maximum, y_maximum), (20, 20, 150), 3)

            if 'fld' in visual_flag:
                # Shift the face-relative eye boxes by the face origin so
                # they land on the eyes in the full-frame preview (the 'ge'
                # branch below applies the same offset).
                for ex1, ey1, ex2, ey2 in ((l_l1, l_l2, l_l3, l_l4),
                                           (r_r1, r_r2, r_r3, r_r4)):
                    cv2.rectangle(
                        preview_frame,
                        (x_minimum + ex1 - eye_surrounding_area,
                         y_minimum + ey1 - eye_surrounding_area),
                        (x_minimum + ex2 + eye_surrounding_area,
                         y_minimum + ey2 + eye_surrounding_area),
                        (60, 255, 0), 2)

            if 'hp' in visual_flag:
                cv2.putText(
                    preview_frame,
                    "Pose Angles:{:.2f} | pitch:{:.2f} | roll:{:.2f}".format(
                        hp_out[0], hp_out[1], hp_out[2]), (20, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 60), 1)

            if 'ge' in visual_flag:
                # Eye centres in full-frame coordinates.
                le_center_x = int(x_minimum + (l_l1 + l_l3) / 2)
                le_center_y = int(y_minimum + (l_l2 + l_l4) / 2)
                re_center_x = int(x_minimum + (r_r1 + r_r3) / 2)
                re_center_y = int(y_minimum + (r_r2 + r_r4) / 2)

                g_x, g_y = gaze_vector[0:2]
                # Image y grows downwards, hence the negated gaze y.
                shift_x = int(g_x * 100)
                shift_y = int(-g_y * 100)
                cv2.arrowedLine(preview_frame, (le_center_x, le_center_y),
                                (le_center_x + shift_x,
                                 le_center_y + shift_y), (0, 50, 160), 1)
                cv2.arrowedLine(preview_frame, (re_center_x, re_center_y),
                                (re_center_x + shift_x,
                                 re_center_y + shift_y), (0, 50, 160), 1)

            cv2.imshow("visualization",
                       cv2.resize(preview_frame, (450, 450)))

        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break

    log.info("VideoStream has been ended")
    cv2.destroyAllWindows()
    inputFeeder.close()

    # Total loop time and frames per second; guard against a zero elapsed
    # time on coarse clocks.
    total_inference_time = time.time() - start_time
    fps = frame_count / total_inference_time if total_inference_time else 0.0
    print("Inference time: {:.3f}".format(total_inference_time))
    print("FPS: {}".format(fps))
def model_pipelines(args):
    """Feed frames through the face, landmark, head-pose and gaze pipelines.

    Args:
        args: parsed arguments. ``inp`` is the input path or ``'cam'``;
            ``vf`` holds the visualisation flags ('mfd', 'mld', 'mhp',
            'mgd'); ``mfd``, ``mld``, ``mhp`` and ``mgd`` are the four model
            arguments.

    Per-model construction times and per-frame inference times are passed to
    ``results(args, inference_time, load_time)``. The mouse is moved on every
    fifth frame. ESC ends the loop; ``x`` reports and exits the process.
    """
    inputFile = args.inp
    visual_flag = args.vf

    start_time = time.time()

    # Logging is enabled
    log = logging.getLogger(__name__)
    log.info('----------THE BEGINNING----------')
    log.info('Start Time: {0}'. format(str(start_time)))

    # The feed is initialised: image by extension, camera, otherwise video
    single_image = ['jpg', 'tif', 'png', 'jpeg', 'bmp']
    if inputFile.split(".")[-1].lower() in single_image:
        input_feed = InputFeeder('image', inputFile)
    elif args.inp == 'cam':
        input_feed = InputFeeder('cam')
    else:
        input_feed = InputFeeder('video', inputFile)

    # Feed data is loaded
    log.info('Loading data...')
    input_feed.load_data()
    log.info('Data Loaded. Beginning inference...')

    # Each pipeline object is timed on its own, so every entry of
    # ``load_time`` reflects only its own model.
    tick = time.time()
    ppl_fd = Face_Detection(args.mfd)
    face_model_load_time = time.time() - tick

    tick = time.time()
    ppl_fl = Facial_Landmarks_Detection(args.mld)
    landmark_model_load_time = time.time() - tick

    tick = time.time()
    ppl_hd = Head_Pose_Estimation(args.mhp)
    headpose_model_load_time = time.time() - tick

    tick = time.time()
    ppl_ge = Gaze_Estimation(args.mgd)
    gaze_model_load_time = time.time() - tick

    log.info('Face Detection object initialized')
    log.info('Facial Landmarks object initialized')
    log.info('Head Pose object initialized')
    log.info('Gaze object initialized')
    log.info('All models loaded and checked')

    load_time = [face_model_load_time, landmark_model_load_time,
                 headpose_model_load_time, gaze_model_load_time]

    # One controller for the whole run instead of a new one per frame.
    mCoord = MouseController('medium', 'fast')

    # count the number of frames
    frameCount = 0

    # collate frames from the feeder and feed into the detection pipelines
    for _, frame in input_feed.next_batch():
        if not _:
            break
        frameCount += 1

        if frameCount % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(100)

        # Face detection
        face_inference_start_time = time.time()
        face_crop = ppl_fd.predict(frame)
        face_inference_time = time.time() - face_inference_start_time

        # The detector hands back an int instead of an image when no face
        # was found; check this before trying to display the crop.
        if type(face_crop) == int:
            log.info("No face can be detected")
            if key == 27:
                break
            continue

        if 'mfd' in visual_flag:
            cv2.imshow('The cropped face', face_crop)

        # Landmark detection
        landmark_inference_start_time = time.time()
        eye_image_left, eye_image_right, face_landmarked = ppl_fl.predict(
            face_crop.copy())
        landmark_inference_time = time.time() - landmark_inference_start_time

        if 'mld' in visual_flag:
            cv2.imshow('Face output', face_landmarked)

        # NOTE(review): assumes the landmark pipeline returns None for an
        # eye it could not extract -- confirm.
        if eye_image_left is None or eye_image_right is None:
            log.info("Landmarks could not be detected, check that the eyes are visible and the image is bright")
            continue

        # Head pose estimation
        headpose_inference_start_time = time.time()
        head_pose_angles, head_pose_image = ppl_hd.predict(face_crop.copy())
        headpose_inference_time = time.time() - headpose_inference_start_time

        if 'mhp' in visual_flag:
            cv2.imshow('Head Pose Angles', head_pose_image)

        # Gaze estimation
        gaze_inference_start_time = time.time()
        coord_x, coord_y = ppl_ge.predict(eye_image_left, eye_image_right,
                                          head_pose_angles)
        gaze_inference_time = time.time() - gaze_inference_start_time

        if 'mgd' in visual_flag:
            cv2.putText(face_landmarked,
                        "Estimated x:{:.2f} | Estimated y:{:.2f}".format(
                            coord_x, coord_y), (10, 20),
                        cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)
            cv2.imshow('Gaze Estimation', face_landmarked)

        # Move the mouse based on the coordinates received
        if frameCount % 5 == 0:
            mCoord.move(coord_x, coord_y)

        if key == 27:
            break

        inference_time = [face_inference_time, landmark_inference_time,
                          headpose_inference_time, gaze_inference_time]
        results(args, inference_time, load_time)

        if key == ord('x'):
            log.warning('KeyboardInterrupt: `X` was pressed')
            results(args, inference_time, load_time)
            sys.exit()

    log.info('End Time: {0}'. format(str(time.time() - start_time)))
    log.info('----------THE END----------')
    cv2.destroyAllWindows()
    input_feed.close()
def main():
    """Run the gaze pipeline, move the mouse and write ``RunReport.txt``.

    ``args.input`` is ``cam`` or a video file; a missing video or model file
    is logged and the process exits with status 1. Model loading times are
    logged individually and in total. Every frame goes through face
    detection, head pose, landmarks and gaze estimation; the mouse is moved
    on every fifth frame and ESC ends the loop. Loading time, loop time,
    frame count and frames per second are logged and written to
    ``RunReport.txt`` next to this file.
    """
    # Grab command line args
    args = build_argparser().parse_args()
    logger = logging.getLogger()

    inputFilePath = args.input
    if inputFilePath.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFilePath):
            logger.error("Unable to find specified video file")
            exit(1)
        inputFeeder = InputFeeder("video", inputFilePath)

    # Initialize variables with the input arguments
    modelPathDict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'FacialLandmarksDetectionModel': args.FacialLandmarksDetectionModel,
        'GazeEstimationModel': args.gazeEstimationModel,
        'HeadPoseEstimationModel': args.HeadPoseEstimationModel
    }

    for fileNameKey, model_path in modelPathDict.items():
        if not os.path.isfile(model_path):
            logger.error("Unable to find specified " + fileNameKey +
                         " xml file")
            exit(1)

    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'],
                             args.device, args.cpu_extension)
    flm = FacialLandmarksDetectionModel(
        modelPathDict['FacialLandmarksDetectionModel'], args.device,
        args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'],
                              args.device, args.cpu_extension)
    hpe = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'],
                                  args.device, args.cpu_extension)

    mc = MouseController('high', 'fast')

    inputFeeder.load_data()

    # Load models and generate load times.
    # NOTE(review): these informational messages are emitted at ERROR level
    # by the original code; kept as-is -- confirm whether that is intended.
    start_time = time.time()
    fdm.load_model()
    logger.error("Face detection model loaded: time: {:.3f} ms".format(
        (time.time() - start_time) * 1000))

    first_mark = time.time()
    flm.load_model()
    logger.error(
        "Facial landmarks detection model loaded: time: {:.3f} ms".format(
            (time.time() - first_mark) * 1000))

    second_mark = time.time()
    hpe.load_model()
    logger.error("Head pose estimation model loaded: time: {:.3f} ms".format(
        (time.time() - second_mark) * 1000))

    third_mark = time.time()
    gem.load_model()
    logger.error("Gaze estimation model loaded: time: {:.3f} ms".format(
        (time.time() - third_mark) * 1000))

    load_total_time = time.time() - start_time
    logger.error("Total loading time: time: {:.3f} ms".format(
        load_total_time * 1000))
    logger.error("Required models have been loaded..")

    frame_count = 0
    start_inf_time = time.time()
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (600, 800)))

        key = cv2.waitKey(60)

        croppedFace, face_coords = fdm.predict(frame.copy(),
                                               args.prob_threshold)
        # The face detector hands back an int instead of an image when no
        # face was found.
        if type(croppedFace) == int:
            logger.error("Unable to detect the face.")
            if key == 27:
                break
            continue

        hp_out = hpe.predict(croppedFace.copy())
        left_eye, right_eye, eye_coords = flm.predict(croppedFace.copy())
        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye,
                                                   hp_out)

        # Move only after the gaze for THIS frame is known; moving before
        # the prediction used a stale value, or an unbound name when no
        # earlier frame had produced one.
        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])

        # Honour ESC on regular frames too, not only on no-face frames.
        if key == 27:
            break

    inference_time = round(time.time() - start_inf_time, 1)
    total_frames = int(frame_count)
    # The rounded time can be 0.0 for very short runs; avoid dividing by it.
    fps = int(frame_count) / inference_time if inference_time else 0.0
    logger.error("count {} seconds".format(frame_count))
    logger.error("total inference time {} seconds".format(inference_time))
    logger.error("total frames {} frames".format(frame_count))
    logger.error("fps {} frame/second".format(fps))

    with open(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'RunReport.txt'), 'w') as R:
        R.write('Load Time: ' + str(load_total_time) + '\n')
        R.write('Inference Time :' + str(inference_time) + '\n')
        R.write('total frames processed' + str(total_frames) + '\n')
        R.write('fps: ' + str(fps) + '\n')

    logger.error("VideoStream ended...")
    cv2.destroyAllWindows()
    inputFeeder.close()

    # Print the profiler statistics when the interpreter exits.
    atexit.register(profile.print_stats)
facial_landmark_model.load_model() facial_landmark_time = (time.time() - facial_landmark_start) * 1000 gaze_model_start = time.time() gaze_estimation_model.load_model() gaze_model_time = (time.time() - gaze_model_start) * 1000 total_loading_time = (time.time() - start_time) * 1000 face_model.check_model() head_pose_model.check_model() facial_landmark_model.check_model() gaze_estimation_model.check_model() if input_file.lower() == 'cam': input_feeder = InputFeeder(input_type='cam') else: if not os.path.isfile(input_file): logger.error("Unable to find video file for input") exit(1) input_feeder = InputFeeder(input_type='video', input_file=input_file) input_feeder.load_data() width = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_WIDTH)) height = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) fps = int(input_feeder.cap.get(cv2.CAP_PROP_FPS)) writer = None green_color = (0, 255, 0) blue_color = (255, 0, 0) red_color = (0, 0, 255)
def test_run(args):
    """Run the gaze-driven mouse pipeline over the input and optionally report timings.

    Builds the input feeder from ``args.input_type`` / ``args.input``, loads the
    four models (timing each load), then for every frame runs face detection,
    head-pose estimation, facial-landmark detection and gaze estimation while
    accumulating the time spent in each ``predict`` call.

    ``args.flag`` selects the side effects:
      "0" - print the load / mean-inference report once the stream ends
      "1" - show the frame with the detected face outlined
      "2" - show the landmark model's image with the eye box outlined
      "3" - move the mouse every 10th frame and log the mouse coordinates
      "4" - move the mouse every 10th frame

    Exits the process with status 1 when ``args.input_type`` is not one of
    'video', 'image' or 'webcam'.
    """
    logging.getLogger().setLevel(logging.INFO)
    activate_frame_count = 10
    logging.warning("Running default value activate frame count = 10")

    if args.input_type in ('video', 'image'):
        feeder = InputFeeder(args.input_type, args.input)
        if args.input == '../bin/demo.mp4':
            logging.warning("Running default setting and input")
    elif args.input_type == 'webcam':
        feeder = InputFeeder(args.input_type, args.input)
    else:
        logging.error("Input not found")
        exit(1)

    mouse_controller = MouseController(args.precision, args.speed)
    feeder.load_data()

    start_time = time.time()
    face_model = FaceDetection(args.face, args.device, args.cpu_extension)
    face_model.load_model()
    face_model_load_time = time.time() - start_time
    logging.info("Face Detection Model Loaded...")

    start_time = time.time()
    head_pose_estimation = HeadPoseEstimation(args.headpose, args.device,
                                              args.cpu_extension)
    head_pose_estimation.load_model()
    head_pose_estimation_load_time = time.time() - start_time
    logging.info("Head Pose Detection Model Loaded...")

    start_time = time.time()
    facial_landmarks_detection = FacialLandmarksDetection(
        args.landmarks, args.device, args.cpu_extension)
    facial_landmarks_detection.load_model()
    facial_landmarks_detection_load_time = time.time() - start_time
    logging.info("Facial Landmark Detection Model Loaded...")

    start_time = time.time()
    gaze_model = GazeEstimation(args.gazeestimation, args.device,
                                args.cpu_extension)
    gaze_model.load_model()
    gaze_model_load_time = time.time() - start_time
    logging.info("Gaze Estimation Model Loaded...")

    frame_count = 0
    total_face_model_inference_time = 0
    total_head_pose_estimation_inference_time = 0
    total_facial_landmarks_detection_inference_time = 0
    total_gaze_model_inference_time = 0

    try:
        for frame in feeder.next_batch():
            if frame is None:
                break
            frame_count += 1
            key = cv2.waitKey(60)

            start_time = time.time()
            first_face_box, first_face = face_model.predict(frame.copy())
            total_face_model_inference_time += time.time() - start_time

            start_time = time.time()
            head_pose_output = head_pose_estimation.predict(first_face_box.copy())
            total_head_pose_estimation_inference_time += time.time() - start_time

            start_time = time.time()
            left_eye, right_eye, eye_coords = facial_landmarks_detection.predict(
                first_face_box.copy())
            total_facial_landmarks_detection_inference_time += time.time() - start_time

            start_time = time.time()
            move_to_coors_mouse = gaze_model.predict(left_eye, right_eye,
                                                     head_pose_output)
            total_gaze_model_inference_time += time.time() - start_time

            # Moving the pointer is slow, so it is only done every Nth frame.
            if frame_count % activate_frame_count == 0 and args.flag in ("3", "4"):
                mouse_controller.move(move_to_coors_mouse[0],
                                      move_to_coors_mouse[1])
                cv2.imshow('video', frame)
                key = cv2.waitKey(60)

            if key == 27:  # Esc
                break

            if args.flag == "1":
                cv2.rectangle(frame, (first_face[0], first_face[1]),
                              (first_face[2], first_face[3]), (255, 0, 0))
                cv2.imshow('video', frame)
                key = cv2.waitKey(60)
            elif args.flag == "2":
                cv2.rectangle(facial_landmarks_detection.image,
                              (eye_coords[0], eye_coords[1]),
                              (eye_coords[2], eye_coords[3]), (255, 0, 0))
                cv2.imshow('video', facial_landmarks_detection.image)
                key = cv2.waitKey(60)
            elif args.flag == "3":
                if frame_count == 1:
                    logging.info("Printing mouse coors: ")
                logging.info(move_to_coors_mouse)
    finally:
        # Release the capture and any preview windows even if a model call fails.
        feeder.close()
        cv2.destroyAllWindows()

    # Print Report
    if args.flag == "0":
        if frame_count == 0:
            # Averages are undefined without frames; avoid ZeroDivisionError.
            logging.warning("No frames were processed; skipping report")
            return

        avg_inference_face_model = total_face_model_inference_time / frame_count
        avg_inference_headpose = total_head_pose_estimation_inference_time / frame_count
        avg_inference_facial_landmark = (
            total_facial_landmarks_detection_inference_time / frame_count)
        avg_inference_gaze_model = total_gaze_model_inference_time / frame_count

        print('------------- BEGIN REPORT -------------')
        # Each section: model path, load time (s), mean inference time (s/frame).
        print("Face Detection Model: ", args.face)
        print("Loading time: ", face_model_load_time)
        print("Inference time: ", avg_inference_face_model)
        print("Head Pose Detection Model: ", args.headpose)
        print("Loading time: ", head_pose_estimation_load_time)
        print("Inference time:", avg_inference_headpose)
        print("Facial Landmark Detection Model: ", args.landmarks)
        print("Loading time: ", facial_landmarks_detection_load_time)
        print("Inference time:", avg_inference_facial_landmark)
        print("Gaze Estimation Model: ", args.gazeestimation)
        print("Loading time: ", gaze_model_load_time)
        print("Inference time:", avg_inference_gaze_model)
        print('------------- END REPORT -------------')
class Application:
    """Command-line application that moves the mouse pointer from estimated gaze.

    ``run()`` parses the arguments, loads the four models, opens the input feed
    and processes it frame by frame.  Model load times and accumulated
    per-model inference times (both in seconds) are kept on the instance and
    logged once the feed is exhausted.
    """

    def __init__(self):
        self.args = None
        self.feed = None
        self.face_detection_model = None
        self.facial_landmark_detection_model = None
        self.gaze_estimation_model = None
        self.head_pose_estimation_model = None
        self.frame = None
        self.width = None
        self.Height = None
        self.mc = MouseController("high", "fast")
        # Seconds spent in each model's load_model().
        self.face_detection_load_time = 0
        self.facial_landmark_detection_load_time = 0
        self.gaze_estimation_load_time = 0
        self.head_pose_estimation_load_time = 0
        # Seconds accumulated in each model's predict() over all frames.
        self.face_detection_infer_time = 0
        self.facial_landmark_detection_infer_time = 0
        self.gaze_estimation_infer_time = 0
        self.head_pose_estimation_infer_time = 0
        self.frames = 0

    def initialize_argparser(self):
        """
        Parse command line arguments and store them in ``self.args``.
        """
        parser = ArgumentParser()
        parser.add_argument("-t", "--input-type", required=True, type=str,
                            help="Type of input (video or cam)")
        parser.add_argument("-i", "--input", required=True, type=str,
                            help="Input file")
        parser.add_argument("-o", "--out", type=str, default=None,
                            help="Output file with the processed content")
        parser.add_argument("-p", "--preview", action='store_true',
                            default=False,
                            help="Should preview face and eyes")
        parser.add_argument("--notmove", action='store_true', default=False,
                            help="Should not move mouse")
        parser.add_argument(
            "-m", "--model", type=str, default="FP32",
            help="Model precision to use. One of FP32, FP16 or FP16-INT8")
        parser.add_argument(
            "-d", "--device", type=str, default="CPU",
            help="Device used to process model. One of CPU or GPU")
        parser.add_argument("-v", "--verbose", action='store_true',
                            default=False, help="Enable DEBUG messages")
        self.args = parser.parse_args()

    def initialize_logging(self):
        """Configure root logging: DEBUG when --verbose was given, else ERROR."""
        if self.args.verbose:
            log.basicConfig(level=log.DEBUG)
        else:
            log.basicConfig(level=log.ERROR)

    def initialize_feed(self):
        """Create the input feeder from the parsed arguments and load it."""
        self.feed = InputFeeder(self.args.input_type, self.args.input)
        self.feed.load_data()

    def initialize_window(self):
        """Create the main preview window, plus detail windows when --preview is set."""
        if self.args.preview:
            cv2.namedWindow('preview')
            cv2.namedWindow('face')
            cv2.namedWindow('left eye')
            cv2.namedWindow('right eye')
            cv2.namedWindow('gaze')

    def show_main_frame(self):
        """Display the current frame in the 'preview' window."""
        cv2.imshow('preview', self.frame)

    def esc_key_pressed(self):
        """Poll the keyboard for 1 ms and return True when Esc (27) was pressed."""
        return cv2.waitKey(1) == 27

    def infer_face(self):
        """Run face detection on the current frame and return the model's result."""
        start = time.time()
        face_frame = self.face_detection_model.predict(self.frame)
        self.face_detection_infer_time += time.time() - start
        return face_frame

    def infer_eyes(self, face_frame, show=False):
        """Run landmark detection on ``face_frame`` and return the two eye crops.

        When ``show`` is true the eye positions are circled on a copy of the
        face and the face and both eye crops are displayed.
        """
        start = time.time()
        left_eye_pos, right_eye_pos, left_eye, right_eye = \
            self.facial_landmark_detection_model.predict(face_frame)
        self.facial_landmark_detection_infer_time += time.time() - start
        if show:
            # Draw on a copy so the face passed to the other models stays clean.
            tmp_face = face_frame.copy()
            cv2.circle(tmp_face, (left_eye_pos[0], left_eye_pos[1]), 5,
                       (0, 255, 0))
            cv2.circle(tmp_face, (right_eye_pos[0], right_eye_pos[1]), 5,
                       (0, 255, 0))
            cv2.imshow('face', tmp_face)
            cv2.imshow('left eye', left_eye)
            cv2.imshow('right eye', right_eye)
        return left_eye, right_eye

    def infer_pose(self, face_frame, show=False):
        """Run head-pose estimation and return ``(yaw, pitch, roll)``.

        ``show`` is accepted for symmetry with the other infer_* methods and
        is currently unused.
        """
        start = time.time()
        yaw, pitch, roll = self.head_pose_estimation_model.predict(face_frame)
        self.head_pose_estimation_infer_time += time.time() - start
        return yaw, pitch, roll

    def infer_gaze(self, cropped_left_eye, cropped_right_eye, yaw, pitch,
                   roll, show=False):
        """Run gaze estimation and return the model's gaze result.

        When ``show`` is true the first two gaze components are drawn as an
        arrow in the 'gaze' window.
        """
        start = time.time()
        gaze = self.gaze_estimation_model.predict(cropped_left_eye,
                                                  cropped_right_eye, yaw,
                                                  pitch, roll)
        self.gaze_estimation_infer_time += time.time() - start
        if show:
            img = np.zeros([100, 100, 3], dtype=np.uint8)
            img.fill(255)
            cv2.circle(img, (50, 50), 50, (0, 255, 0))
            # Image y grows downwards, hence the negated second component.
            cv2.arrowedLine(img, (50, 50),
                            (50 + int(gaze[0] * 70), 50 + int(-gaze[1] * 70)),
                            (255, 0, 0), 2)
            cv2.imshow('gaze', img)
        return gaze

    def infer_frame(self):
        """Process ``self.frame`` through the whole pipeline.

        Returns False when Esc was pressed (the caller should stop), True
        otherwise.
        """
        self.show_main_frame()
        if self.esc_key_pressed():
            return False
        self.frames += 1
        face_frame = self.infer_face()
        if face_frame is not None:
            cropped_left_eye, cropped_right_eye = self.infer_eyes(
                face_frame, self.args.preview)
            yaw, pitch, roll = self.infer_pose(face_frame, self.args.preview)
            gaze = self.infer_gaze(cropped_left_eye, cropped_right_eye, yaw,
                                   pitch, roll, self.args.preview)
            if not self.args.notmove:
                self.mc.move(gaze[0], gaze[1])
        return True

    def _log_timings(self):
        """Log model load times and, if any frame was processed, mean inference times."""
        log.info("Face detection model load time: {:.2f}ms".format(
            1000 * self.face_detection_load_time))
        log.info(
            "Facial landmark detection model load time: {:.2f}ms".format(
                1000 * self.facial_landmark_detection_load_time))
        log.info("Head Pose estimation model load: {:.2f}ms".format(
            1000 * self.head_pose_estimation_load_time))
        log.info("Gaze estimation model load time: {:.2f}ms".format(
            1000 * self.gaze_estimation_load_time))

        if self.frames == 0:
            # A mean over zero frames is undefined; avoid ZeroDivisionError.
            log.info("No frames were processed; no inference times to report")
            return

        log.info(
            "Face detection model inference mean time: {:.2f}ms".format(
                1000 * self.face_detection_infer_time / self.frames))
        log.info(
            "Facial landmark detection model inference mean time: {:.2f}ms"
            .format(1000 * self.facial_landmark_detection_infer_time /
                    self.frames))
        log.info(
            "Head Pose estimation model inference mean time: {:.2f}ms".
            format(1000 * self.head_pose_estimation_infer_time / self.frames))
        log.info(
            "Gaze estimation model inference mean time: {:.2f}ms".format(
                1000 * self.gaze_estimation_infer_time / self.frames))

    def process_feed(self):
        """Run the pipeline over every frame of the feed, then log the timings.

        Stops at the first ``None`` frame or when ``infer_frame`` reports that
        Esc was pressed.  Any exception is logged rather than propagated.
        """
        try:
            for batch in self.feed.next_batch():
                self.frame = batch
                if batch is None or self.infer_frame() is False:
                    break
            self._log_timings()
        except Exception as err:
            # Use a %s placeholder: logging formats args lazily and reports a
            # formatting error if the message has no slot for them.
            log.error("Could not infer. Cause: %s", err)

    def initialize_models(self):
        """Instantiate and load the four models, recording each load time.

        The face detector always uses its FP32-INT1 files; the other models
        use the precision given by ``--model`` and the device given by
        ``--device``.  Failures are logged, not raised.
        """
        try:
            model_precision = self.args.model.upper()
            device = self.args.device.upper()

            self.face_detection_model = Model_Face_Detection(
                "models/intel/face-detection-adas-binary-0001/FP32-INT1/face-detection-adas-binary-0001"
            )
            start = time.time()
            self.face_detection_model.load_model()
            self.face_detection_load_time = time.time() - start

            self.facial_landmark_detection_model = Model_Facial_Landmark_Detection(
                f"models/intel/landmarks-regression-retail-0009/{model_precision}/landmarks-regression-retail-0009",
                device)
            start = time.time()
            self.facial_landmark_detection_model.load_model()
            self.facial_landmark_detection_load_time = time.time() - start

            self.head_pose_estimation_model = Model_Head_Pose_estimation(
                f"models/intel/head-pose-estimation-adas-0001/{model_precision}/head-pose-estimation-adas-0001",
                device)
            start = time.time()
            self.head_pose_estimation_model.load_model()
            self.head_pose_estimation_load_time = time.time() - start

            self.gaze_estimation_model = Model_Gaze_Estimation(
                f"models/intel/gaze-estimation-adas-0002/{model_precision}/gaze-estimation-adas-0002",
                device)
            start = time.time()
            self.gaze_estimation_model.load_model()
            self.gaze_estimation_load_time = time.time() - start
        except Exception as err:
            log.error("Could not load model. Cause: %s", err)

    def run(self):
        """Entry point: parse arguments, set everything up and process the feed."""
        self.initialize_argparser()
        self.initialize_logging()
        self.initialize_models()
        self.initialize_feed()
        self.initialize_window()
        try:
            self.process_feed()
        finally:
            # Always release the capture and windows, e.g. on KeyboardInterrupt.
            self.feed.close()
            cv2.destroyAllWindows()
def main():
    """Run the gaze-controlled pointer demo configured from the command line.

    Reads frames from the camera (``--input CAM``) or a video file, runs face
    detection, landmark detection, head-pose estimation and gaze estimation
    on each frame, draws the overlays requested in ``args.visualization``
    ("fd", "fl", "hp", "ge"), shows the frame and moves the mouse.  Esc stops
    the loop.  Exits with status 1 when the input path is not a file.
    """
    args = build_argparser().parse_args()
    input_file = args.input
    logger = log.getLogger()

    if input_file == "CAM":
        input_feeder = InputFeeder("cam")
    else:
        if not os.path.isfile(input_file):
            logger.error("Path should be file")
            exit(1)
        input_feeder = InputFeeder("video", input_file)

    model_options = dict(
        device=args.device,
        threshold=args.threshold,
        extensions=args.extensions,
    )
    face_detector = FaceDetector(args.face_detection_model, **model_options)
    face_landmark_detector = FaceLandmarkDetector(
        args.face_landmark_model, **model_options)
    head_pose_estimator = HeadPoseEstimator(args.head_pose_model, **model_options)
    gaze_estimator = GazeEstimator(args.gaze_estimation_model, **model_options)
    mouse_controller = MouseController("medium", "fast")

    face_detector.load_model()
    face_landmark_detector.load_model()
    head_pose_estimator.load_model()
    gaze_estimator.load_model()
    input_feeder.load_data()

    # Display size: fixed width, 16:9 aspect ratio.
    width = 1000
    height = int(width * 9 / 16)
    overlay_color = (36, 255, 12)

    try:
        for flag, frame in input_feeder.next_batch():
            if not flag:
                break

            # Check Esc before any `continue`, otherwise the key is ignored
            # for as long as no face is detected.
            pressed_key = cv2.waitKey(60)
            if pressed_key == 27:
                logger.error("exit key is pressed..")
                break

            face_detected = face_detector.predict(frame)
            if not face_detected:
                continue
            face_coordinates, face_image = face_detected
            if not face_coordinates:
                continue

            if "fd" in args.visualization:
                cv2.rectangle(
                    frame,
                    (face_coordinates[0], face_coordinates[1]),
                    (face_coordinates[2], face_coordinates[3]),
                    overlay_color,
                    2,
                )
                cv2.putText(
                    frame,
                    "Face Detected",
                    (face_coordinates[0], face_coordinates[1] - 10),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.9,
                    overlay_color,
                    2,
                )

            left_eye_img, righ_eye_img, eye_coords = face_landmark_detector.predict(
                face_image
            )
            if "fl" in args.visualization:
                # Eye boxes are relative to the face crop; shift them into
                # frame coordinates and convert to plain ints for OpenCV.
                face_origin = np.array(face_coordinates)[:2]
                eye_boxes = np.array(eye_coords)
                corners_min = (eye_boxes[:, :2] + face_origin).astype(int).tolist()
                corners_max = (eye_boxes[:, 2:] + face_origin).astype(int).tolist()
                for eye in (0, 1):
                    cv2.rectangle(
                        frame,
                        (corners_min[eye][0], corners_min[eye][1]),
                        (corners_max[eye][0], corners_max[eye][1]),
                        overlay_color,
                        2,
                    )

            head_pose_estimate = head_pose_estimator.predict(face_image)
            if "hp" in args.visualization:
                cv2.putText(
                    frame,
                    "yaw:{:.1f}|pitch:{:.1f}|roll:{:.1f}".format(*head_pose_estimate),
                    (20, 35),
                    cv2.FONT_HERSHEY_COMPLEX,
                    1.2,
                    overlay_color,
                    3,
                )

            mouse_coordinate, gaze_vector = gaze_estimator.predict(
                left_eye_img, righ_eye_img, head_pose_estimate
            )
            if "ge" in args.visualization:
                # Draw the head's forward axis: rotate a -z vector by the
                # pose angles and project it with a pinhole model.
                yaw, pitch, roll = np.array(head_pose_estimate) * np.pi / 180.0
                focal_length = 950
                scale = 100
                origin = (
                    int(face_coordinates[0]
                        + (face_coordinates[2] - face_coordinates[0]) / 2),
                    int(face_coordinates[1]
                        + (face_coordinates[3] - face_coordinates[1]) / 2),
                )
                r_x = np.array(
                    [
                        [1, 0, 0],
                        [0, math.cos(pitch), -math.sin(pitch)],
                        [0, math.sin(pitch), math.cos(pitch)],
                    ]
                )
                r_y = np.array(
                    [
                        [math.cos(yaw), 0, -math.sin(yaw)],
                        [0, 1, 0],
                        [math.sin(yaw), 0, math.cos(yaw)],
                    ]
                )
                r_z = np.array(
                    [
                        [math.cos(roll), -math.sin(roll), 0],
                        [math.sin(roll), math.cos(roll), 0],
                        [0, 0, 1],
                    ]
                )
                r = r_z @ r_y @ r_x
                zaxis = np.array(([0, 0, -1 * scale]), dtype="float32")
                offset = np.array(([0, 0, focal_length]), dtype="float32")
                zaxis = np.dot(r, zaxis) + offset
                tip = (
                    int(zaxis[0] / zaxis[2] * focal_length) + origin[0],
                    int(zaxis[1] / zaxis[2] * focal_length) + origin[1],
                )
                cv2.arrowedLine(frame, origin, tip, (0, 0, 255), 3, tipLength=0.3)

            cv2.imshow("frame", cv2.resize(frame, (width, height)))
            mouse_controller.move(mouse_coordinate[0], mouse_coordinate[1])
    finally:
        # Release the capture device and close the preview window.
        input_feeder.close()
        cv2.destroyAllWindows()
def main(args):
    """Load the four models and drive the mouse from gaze for each input frame.

    ``args`` supplies ``device``, ``video``, ``input_type``, ``model`` (path
    handed to ``parse_models_file``) and two string switches: ``toggle``
    ('true' disables the video window and prints performance counters per
    frame instead) and ``stats`` ('true' prints performance counters after
    each processed face).
    """
    device = args.device
    video_file = args.video
    input_type = args.input_type
    model = args.model
    # The switches arrive as strings; only the literal 'true' enables them.
    stats = args.stats == 'true'
    toggle = args.toggle == 'true'

    # Start Model Loading
    start_model_load_time = time.time()
    print('[INFO] Started Model Loading...........')
    face_model = FaceDetection(
        parse_models_file(label='face_detection', path=model), device)
    face_model.load_model()

    landmark_model = LandMarksDetection(
        parse_models_file(label='facial_landmarks_detection', path=model),
        device)
    landmark_model.load_model()

    pose_estimation_model = HeadPoseEstimation(
        parse_models_file(label='head_pose_estimation', path=model), device)
    pose_estimation_model.load_model()

    gaze_estimation_model = GazeEstimation(
        parse_models_file(label='gaze_estimation', path=model), device)
    gaze_estimation_model.load_model()

    total_model_load_time = time.time() - start_model_load_time
    # time.time() differences are seconds; convert to match the "ms" label.
    print('[TOTAL] Loaded in {:.3f} ms'.format(total_model_load_time * 1000))
    # End Model Loading

    mouse = MouseController('high', 'fast')
    all_models = (face_model, pose_estimation_model, landmark_model,
                  gaze_estimation_model)

    def print_performance():
        for loaded_model in all_models:
            performance_counts(loaded_model.performance_counter(0))

    if not toggle:
        cv2.namedWindow(MAIN_WINDOW_NAME, cv2.WINDOW_AUTOSIZE)

    feed = None
    try:
        feed = InputFeeder(input_type=input_type, input_file=video_file)
        feed.load_data()
        initial_w = int(feed.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        initial_h = int(feed.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # NOTE(review): the feeder is unpacked as (frame, flag) here — confirm
        # the order against InputFeeder.next_batch().
        for frame, has_frame in feed.next_batch():
            if not has_frame:
                break
            try:
                # Start Inferences
                coord = face_model.predict(frame, (initial_w, initial_h))
                for xmin, ymin, xmax, ymax in coord:
                    cropped_image = frame[ymin:ymax, xmin:xmax]

                    # Landmark Inference
                    cropped_left, cropped_right = landmark_model.predict(
                        cropped_image)
                    # Stop processing faces once an eye crop is under 60 px
                    # in either dimension.
                    if min(*cropped_left.shape[:2],
                           *cropped_right.shape[:2]) < 60:
                        break

                    # Pose Estimation Inference
                    poses = pose_estimation_model.predict(cropped_image)
                    # Gaze Estimation Inference
                    gz = gaze_estimation_model.predict(poses, cropped_left,
                                                       cropped_right)
                    # Mouse Controller
                    mouse.move(gz[0][0], gz[0][1])

                    # If user pass statistics argument to true
                    if stats:
                        print_performance()

                if not toggle:
                    # Output Camera or Video
                    cv2.imshow(MAIN_WINDOW_NAME, frame)
                else:
                    # Print Statistics only no camera or video
                    print_performance()
                cv2.waitKey(1)
            except Exception as e:
                # Best effort: a failing frame must not stop the stream.
                print('Could not run Inference', e)
    except Exception as e:
        print("Could not run Inference: ", e)
    finally:
        # Release the capture even when setup or the loop failed.
        if feed is not None:
            feed.close()
        cv2.destroyAllWindows()
default=[], help="Optional model visualization flags." "fd = Face Detection, fld = Facial Landmark Detection, hp for Head Pose Estimation, ge for Gaze Estimation" "Flags should be separated by space." ) return parser args = build_argparser().parse_args() visualizationFlags = args.visualizationFlags inputFilePath = args.input inputFeeder = None if inputFilePath.lower()=="cam": inputFeeder = InputFeeder("cam") else: if not os.path.isfile(inputFilePath): print("Unable to find specified video file") exit(1) inputFeeder = InputFeeder("video",inputFilePath) modelPathDict = {'FaceDetectionModel':args.facedetectionmodel, 'FacialLandmarksDetectionModel':args.faciallandmarkmodel, 'GazeEstimationModel':args.gazeestimationmodel, 'HeadPoseEstimationModel':args.headposemodel} for fileNameKey in modelPathDict.keys(): if not os.path.isfile(modelPathDict[fileNameKey]): print("Unable to find specified "+fileNameKey+" xml file") exit(1) face_detection = Model_FaceDetection(modelPathDict['FaceDetectionModel'], args.device, args.cpu_extension)
def main():
    """Run the gaze pointer pipeline and record the input to 'output_video.mp4'.

    Builds the feeder from ``args.input_type`` ('video', 'image' or 'cam'),
    checks and loads the four models, then for every frame detects the face,
    eyes, head pose and gaze, optionally annotates a preview, writes the frame
    to the output video and moves the mouse every 5th frame.  Esc stops the
    loop.  Exits with status 1 for an unsupported input type.
    """
    args = build_argparser().parse_args()
    logger = logging.getLogger()

    if args.input_type in ('video', 'image'):
        feeder = InputFeeder(args.input_type, args.input)
    elif args.input_type == 'cam':
        feeder = InputFeeder(args.input_type)
    else:
        logger.error("Unsupported input type: %s", args.input_type)
        exit(1)

    mc = MouseController("medium", "fast")
    feeder.load_data()

    face_model = FaceDetectionModel(args.facedetectionmodel, args.device,
                                    args.cpu_extension)
    face_model.check_model()
    landmark_model = Landmark_Model(args.facelandmarkmodel, args.device,
                                    args.cpu_extension)
    landmark_model.check_model()
    gaze_model = Gaze_Estimation_Model(args.gazeestimationmodel, args.device,
                                       args.cpu_extension)
    gaze_model.check_model()
    head_model = Head_Pose_Model(args.headposemodel, args.device,
                                 args.cpu_extension)
    head_model.check_model()

    face_model.load_model()
    logger.info("Face Detection Model Loaded...")
    landmark_model.load_model()
    logger.info("Landmark Detection Model Loaded...")
    head_model.load_model()
    logger.info("Head Pose Detection Model Loaded...")
    gaze_model.load_model()
    logger.info("Gaze Estimation Model Loaded...")
    logger.info('All Models are loaded\n\n')

    output_size = (500, 500)
    out = cv2.VideoWriter('output_video.mp4', 0x00000021, 30, output_size)

    frame_count = 0
    try:
        for ret, frame in feeder.next_batch():
            if not ret:
                break
            frame_count += 1
            if frame_count % 5 == 0:
                cv2.imshow('video', cv2.resize(frame, output_size))
            key = cv2.waitKey(60)

            # Call predict on the loaded instances, not on the classes.
            faceROI, _box = face_model.predict(frame.copy(), args.prob_threshold)
            if faceROI is None:
                logger.error("Unable to detect the face.")
                if key == 27:
                    break
                continue

            _left_center, _right_center, eye_coords, left_eye, right_eye = \
                landmark_model.predict(faceROI.copy(), EYE_ROI=10)
            head_position = head_model.predict(faceROI.copy())
            new_mouse_coord, gaze_vector = gaze_model.predict(
                left_eye.copy(), right_eye.copy(), head_position)

            # NOTE(review): `previewFlags` is not defined in this function; it
            # must exist at module level (or be read from args) — confirm.
            if len(previewFlags) != 0:
                # NOTE(review): the preview is built but never displayed or
                # written; the raw frame goes to the output video below.
                preview_frame = frame.copy()
                if 'fd' in previewFlags:
                    preview_frame = faceROI
                if 'fld' in previewFlags:
                    for eye in (0, 1):
                        cv2.rectangle(
                            faceROI,
                            (eye_coords[eye][0] - 10, eye_coords[eye][1] - 10),
                            (eye_coords[eye][2] + 10, eye_coords[eye][3] + 10),
                            (0, 255, 0), 3)
                if 'hp' in previewFlags:
                    cv2.putText(
                        preview_frame,
                        "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                        format(head_position[0], head_position[1],
                               head_position[2]), (10, 20),
                        cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)
                if 'ge' in previewFlags:
                    # Draw a cross on each eye crop, offset by the gaze vector.
                    x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] * 12), 160
                    le = cv2.line(left_eye.copy(), (x - w, y - w),
                                  (x + w, y + w), (255, 0, 255), 2)
                    cv2.line(le, (x - w, y + w), (x + w, y - w),
                             (255, 0, 255), 2)
                    re = cv2.line(right_eye.copy(), (x - w, y - w),
                                  (x + w, y + w), (255, 0, 255), 2)
                    cv2.line(re, (x - w, y + w), (x + w, y - w),
                             (255, 0, 255), 2)
                    # Paste the annotated eyes back into the face crop.
                    faceROI[eye_coords[0][1]:eye_coords[0][3],
                            eye_coords[0][0]:eye_coords[0][2]] = le
                    faceROI[eye_coords[1][1]:eye_coords[1][3],
                            eye_coords[1][0]:eye_coords[1][2]] = re

            # VideoWriter expects frames of exactly the size it was opened with.
            out.write(cv2.resize(frame, output_size))

            if frame_count % 5 == 0:
                mc.move(new_mouse_coord[0], new_mouse_coord[1])
            if key == 27:
                break
        logger.error("VideoStream ended...")
    finally:
        # Finalise the video file and release the capture and windows.
        out.release()
        cv2.destroyAllWindows()
        feeder.close()
def main(args):
    """Move the mouse according to the gaze estimated from a camera or video.

    ``args.input`` is either "cam" (case-insensitive) or a path to a video
    file; the process exits with status 1 when the path is not a file.  For
    each frame the first detected face is cropped and fed to the head-pose,
    landmark and gaze models.  With ``args.visual`` the face and eye boxes are
    drawn and the frame is shown.  Esc stops the loop.
    """
    mouse_controller = MouseController('medium', 'fast')
    device = args.device
    extension = args.cpu_extension
    input_path = args.input
    prob = args.prob_threshold

    if input_path.lower() == "cam":
        input_image = InputFeeder("cam")
    elif os.path.isfile(input_path):
        input_image = InputFeeder("video", input_path)
    else:
        print("Invalid path to file used: {}".format(input_path))
        exit(1)

    fd_model = Model_Face_Detection()
    fl_model = Model_Landmark()
    g_model = Model_Gaze()
    p_model = Model_Head_Pose()
    fd_model.load_model(args.face_detection, extension, device)
    fl_model.load_model(args.face_landmark, extension, device)
    g_model.load_model(args.gaze_detection, extension, device)
    p_model.load_model(args.pose_detection, extension, device)
    input_image.load_data()

    eye_shape = (20, 20, 3)  # expected shape of a valid eye crop
    frame_count = 0
    try:
        for flag, frame in input_image.next_batch():
            if not flag:
                break
            frame_count += 1

            # Honour Esc on every frame, not only when no face was found.
            pressed_key = cv2.waitKey(60)
            if pressed_key == 27:
                break

            # Get image crop of image from face detection
            fd_coords = fd_model.predict(frame, prob)
            if len(fd_coords) == 0:
                print("No face found...")
                continue

            # Get first face available
            fd_coords = fd_coords[0]
            # Crop image [ymin:ymax, xmin:xmax]
            cropped_image = frame[fd_coords[1]:fd_coords[3],
                                  fd_coords[0]:fd_coords[2]]

            yaw, pitch, roll = p_model.predict(cropped_image)
            left_eye, right_eye = fl_model.predict(cropped_image)
            left_eye_img = cropped_image[left_eye[1]:left_eye[3],
                                         left_eye[0]:left_eye[2]]
            right_eye_img = cropped_image[right_eye[1]:right_eye[3],
                                          right_eye[0]:right_eye[2]]

            # A crop with the wrong shape means that eye was not found; fall
            # back to the other eye, or skip the frame when both are missing.
            left_ok = left_eye_img.shape == eye_shape
            right_ok = right_eye_img.shape == eye_shape
            if not left_ok and not right_ok:
                print("Could not find eyes...")
                continue
            if not left_ok:
                print("Could not find left eye..")
                left_eye_img = right_eye_img
            elif not right_ok:
                print("Could not find right eye..")
                right_eye_img = left_eye_img

            # Estimate gaze
            mouse_x, mouse_y = g_model.predict(left_eye_img, right_eye_img,
                                               [yaw, pitch, roll])

            if args.visual:
                # Face Outline
                cv2.rectangle(frame, (fd_coords[0], fd_coords[1]),
                              (fd_coords[2], fd_coords[3]), (0, 255, 100))
                # Eye Outlines: shift eye corners from crop to frame coordinates.
                size = 20
                for eye in (left_eye, right_eye):
                    corner_x = eye[0] + fd_coords[0]
                    corner_y = eye[1] + fd_coords[1]
                    cv2.rectangle(frame, (corner_x, corner_y),
                                  (corner_x + size, corner_y + size),
                                  (0, 10, 200), thickness=4)
                cv2.imshow("Image", frame)

            # Moving the pointer is slow, so only do it every 5th frame.
            # NOTE(review): movement happens when `args.no_move` is truthy —
            # confirm how the argument parser defines this flag.
            if frame_count % 5 == 0 and args.no_move:
                mouse_controller.move(mouse_x, mouse_y)
    finally:
        # Release the capture device and close any preview window.
        input_image.close()
        cv2.destroyAllWindows()
def main(args):
    """Load the models, then drive the mouse from gaze over the whole input.

    ``args.input`` is "CAM" or a path to a video file; the process exits with
    status 1 when the path is not a file.  ``args.mode_visualization`` is a
    collection of overlay flags ('fd', 'fl', 'hp', 'ge'); when it is non-empty
    an annotated view is shown in the "visual for client" window.  Esc stops
    the loop.  Load times and overall throughput are printed.
    """
    import sys

    input_file = args.input
    mode_visualization = args.mode_visualization

    # Validate the input outside the try blocks so the exit is not swallowed.
    if input_file != "CAM" and not os.path.isfile(input_file):
        log.error("ERROR: INPUT PATH IS NOT VALID")
        sys.exit(1)

    ## loading models
    try:
        if input_file == "CAM":
            input_feeder = InputFeeder("cam")
        else:
            input_feeder = InputFeeder("video", input_file)

        face_detection_class = Face_Detection(
            model=args.face_detection,
            device=args.device,
            extensions=args.cpu_extension)
        face_landmarks_class = Landmarks_Detection(
            model=args.face_landmark,
            device=args.device,
            extensions=args.cpu_extension)
        head_pose_class = Head_Pose(model=args.head_pose,
                                    device=args.device,
                                    extensions=args.cpu_extension)
        gaze_estimation_class = Gaze_Estimation(
            model=args.gaze_estimation,
            device=args.device,
            extensions=args.cpu_extension)
        mouse_control = MouseController('medium', 'fast')

        start_time = time.time()

        ## Load the models one by one and all necessary info
        face_det_time = time.time()
        face_detection_class.load_model()
        print("Face Detection Load Time: time: {:.3f} ms".format(
            (time.time() - face_det_time) * 1000))

        face_land_time = time.time()
        face_landmarks_class.load_model()
        print("Facial landmarks load Time: time: {:.3f} ms".format(
            (time.time() - face_land_time) * 1000))

        head_po_time = time.time()
        head_pose_class.load_model()
        print("Head pose load time: time: {:.3f} ms".format(
            (time.time() - head_po_time) * 1000))

        gaze_est_time = time.time()
        gaze_estimation_class.load_model()
        print("Gaze estimation load time: time: {:.3f} ms".format(
            (time.time() - gaze_est_time) * 1000))

        total_time = time.time() - start_time
        print("Total loading time taken: time: {:.3f} ms".format(
            total_time * 1000))
        print("All models are loaded successfully..")

        input_feeder.load_data()
        print("Feeder is loaded")
    except Exception:
        log.exception('Error occured on loading models in app')
        print('Error occured on loading models in app')
        # Nothing below can work without the models and the feeder.
        return

    ## performing inferences
    try:
        start_inference_time = time.time()
        frame_count = 0
        for flag, frame in input_feeder.next_batch():
            if not flag:
                break
            frame_count += 1
            key = cv2.waitKey(60)

            crop_face, face_coords = face_detection_class.predict(
                frame.copy(), args.conf_threshold)
            # The detector signals "no face" by returning an int for the crop.
            if isinstance(crop_face, int):
                log.error("Unable to detect the face.")
                if key == 27:
                    break
                continue

            ## perform inference
            head_angle = head_pose_class.predict(crop_face.copy())
            left_eye, right_eye, eye_coords = face_landmarks_class.predict(
                crop_face.copy())
            mouse_position, gaze_vector = gaze_estimation_class.predict(
                left_eye, right_eye, head_angle)

            ## checking for extra flags
            if len(mode_visualization) != 0:
                p_frame = frame.copy()
                if 'fd' in mode_visualization:
                    p_frame = crop_face
                if 'fl' in mode_visualization:
                    for eye in (0, 1):
                        cv2.rectangle(
                            crop_face,
                            (eye_coords[eye][0] - 10, eye_coords[eye][1] - 10),
                            (eye_coords[eye][2] + 10, eye_coords[eye][3] + 10),
                            (0, 255, 0), 1)
                if 'hp' in mode_visualization:
                    cv2.putText(
                        p_frame,
                        "Head Positions: :{:.2f} :{:.2f} :{:.2f}".format(
                            head_angle[0], head_angle[1], head_angle[2]),
                        (10, 20), cv2.FONT_HERSHEY_COMPLEX, 0.25,
                        (0, 255, 0), 1)
                if 'ge' in mode_visualization:
                    # Draw a cross on each eye crop, offset by the gaze vector.
                    i, j, k = int(gaze_vector[0] * 12), int(
                        gaze_vector[1] * 12), 160
                    l_eye = cv2.line(left_eye.copy(), (i - k, j - k),
                                     (i + k, j + k), (0, 255, 255), 2)
                    cv2.line(l_eye, (i - k, j + k), (i + k, j - k),
                             (255, 0, 255), 2)
                    r_eye = cv2.line(right_eye.copy(), (i - k, j - k),
                                     (i + k, j + k), (0, 255, 255), 2)
                    cv2.line(r_eye, (i - k, j + k), (i + k, j - k),
                             (0, 255, 255), 2)
                    # Paste the annotated eyes into the face crop so they are
                    # actually visible in the preview.
                    crop_face[eye_coords[0][1]:eye_coords[0][3],
                              eye_coords[0][0]:eye_coords[0][2]] = l_eye
                    crop_face[eye_coords[1][1]:eye_coords[1][3],
                              eye_coords[1][0]:eye_coords[1][2]] = r_eye
                cv2.imshow("visual for client", cv2.resize(p_frame, (700, 700)))

            mouse_control.move(mouse_position[0], mouse_position[1])
            if key == 27:
                break

        ## working on inference time and frames per second
        total_infer_time = time.time() - start_inference_time
        frames_per_sec = int(frame_count) / total_infer_time
        print("Frames processed: {}".format(frame_count))
        print("Total inference time: {:.3f} seconds".format(total_infer_time))
        print("FPs: {:.3f} fps ".format(frames_per_sec))
    except Exception:
        log.exception('Error on performing inference in app file')
        print('Error on performing inference in app file')
    finally:
        print("All Done...")
        cv2.destroyAllWindows()
        input_feeder.close()
def _render_visualization(frame, face, face_coordinates, eye_coordinates,
                          head_pose_angles, gaze_vector, visualize):
    """Return an annotated copy of the frame (or of the face crop).

    ``visualize`` holds the requested overlay keys: ``"fd"`` (face box),
    ``"fld"`` (eye circles), ``"hp"`` (head-pose text) and ``"ge"`` (gaze
    arrows). Neither ``frame`` nor ``face`` is modified.
    """
    canvas = frame.copy()

    if "fd" in visualize:
        if len(visualize) == 1:
            cv2.rectangle(canvas,
                          (face_coordinates[0], face_coordinates[1]),
                          (face_coordinates[2], face_coordinates[3]),
                          (255, 0, 255), 2)
        else:
            # With additional overlays the face crop becomes the canvas.
            # NOTE(review): presumably because the eye coordinates are
            # relative to the face crop -- confirm against FLD.predict.
            canvas = face.copy()

    if "fld" in visualize:
        if "fd" not in visualize:
            canvas = face.copy()
        for eye in ('left_eye', 'right_eye'):
            cv2.circle(canvas,
                       (eye_coordinates[eye][0], eye_coordinates[eye][1]),
                       25, (0, 0, 255), 2)

    if "hp" in visualize:
        cv2.putText(
            canvas,
            "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".format(
                head_pose_angles[0], head_pose_angles[1],
                head_pose_angles[2]),
            (10, 20), cv2.FONT_HERSHEY_COMPLEX, 0.255, (0, 255, 0), 1)

    if "ge" in visualize:
        # Arrow length scales with the face height; y is negated before
        # being added to the image row coordinate.
        arrow = face.shape[0] * 0.7
        arrow_x = gaze_vector[0] * arrow
        arrow_y = -gaze_vector[1] * arrow
        for eye in ('left_eye', 'right_eye'):
            eye_x, eye_y = eye_coordinates[eye][0], eye_coordinates[eye][1]
            cv2.arrowedLine(canvas, (eye_x, eye_y),
                            (int(eye_x + arrow_x), int(eye_y + arrow_y)),
                            (255, 0, 0), 2)

    return canvas


def main():
    """Run the gaze-controlled pointer pipeline from the command line.

    Reads the arguments returned by ``build_argparser()`` (``i``, ``flags``,
    ``d``, ``prob``, ``l``, ``f``, ``fl``, ``hp``, ``g`` and ``it``), loads
    the four models, then for every frame runs face detection, facial
    landmark detection, head-pose estimation and gaze estimation. Every
    fifth frame is displayed and used to move the mouse. Model load times,
    average per-model inference times and the overall FPS are logged.

    Pressing Escape (key code 27) stops the loop. The display windows and
    the input feed are released even if processing a frame raises.
    """
    log_format = "%(asctime)s [%(levelname)s] %(message)s"
    try:
        logging.basicConfig(
            level=logging.INFO,
            format=log_format,
            handlers=[
                logging.FileHandler("Computer_Pointer_Controller.log"),
                logging.StreamHandler()
            ])
    except OSError:
        # The log file could not be opened. Fall back to console-only
        # logging so the INFO-level timing report below is not lost.
        print("File cannot be created")
        logging.basicConfig(level=logging.INFO, format=log_format)

    args = build_argparser()
    video_path = args.i
    # Treat a missing flags value as "no overlays requested".
    visualize = args.flags or []

    count = 0
    fd_inference_time = 0
    fld_inference_time = 0
    hp_inference_time = 0
    ge_inference_time = 0

    mouse_controller = MouseController('medium', 'fast')

    # ---- Model loading (each timing includes construction + load) ----
    logging.info("############## Model Load Time #############")
    start_time = time.time()

    face_detector = Face_Detection(device=args.d,
                                   threshold=args.prob,
                                   extensions=args.l)
    face_detector.load_model(model_path=args.f)
    logging.info("Face Detection Model: {:.3f}ms".format(
        1000 * (time.time() - start_time)))

    second_model_time = time.time()
    landmark_detector = Facial_Landmarks_Detection(device=args.d,
                                                   extensions=args.l)
    landmark_detector.load_model(model_path=args.fl)
    logging.info("Facial Landmarks Detection Model: {:.3f}ms".format(
        1000 * (time.time() - second_model_time)))

    third_model_time = time.time()
    head_pose_estimator = Head_Pose_Estimation(device=args.d,
                                               extensions=args.l)
    head_pose_estimator.load_model(model_path=args.hp)
    logging.info("Head Pose Estimation Model: {:.3f}ms".format(
        1000 * (time.time() - third_model_time)))

    fourth_model_time = time.time()
    gaze_estimator = Gaze_Estimation(device=args.d, extensions=args.l)
    gaze_estimator.load_model(model_path=args.g)
    logging.info("Gaze Estimation Model: {:.3f}ms".format(
        1000 * (time.time() - fourth_model_time)))
    logging.info("############## End ######################### ")

    total_model_load_time = 1000 * (time.time() - start_time)

    # ---- Input source ----
    if video_path == "cam":
        feed = InputFeeder("cam")
    else:
        feed = InputFeeder("video", video_path)
    feed.load_data()

    # ---- Inference loop ----
    start_inf_time = time.time()
    try:
        for flag, frame in feed.next_batch():
            if not flag:
                break

            if count % 5 == 0:
                cv2.imshow('frame', cv2.resize(frame, (500, 500)))
            key = cv2.waitKey(60)
            count += 1

            started = time.time()
            face, face_coordinates = face_detector.predict(frame, args.it)
            fd_inference_time += time.time() - started

            started = time.time()
            left_eye_image, right_eye_image, eye_coordinates = (
                landmark_detector.predict(face, args.it))
            fld_inference_time += time.time() - started

            started = time.time()
            head_pose_angles = head_pose_estimator.predict(face, args.it)
            hp_inference_time += time.time() - started

            started = time.time()
            mouse_coordinates, gaze_vector = gaze_estimator.predict(
                left_eye_image, right_eye_image, head_pose_angles, args.it)
            ge_inference_time += time.time() - started

            if len(visualize) != 0:
                frame_visualize = _render_visualization(
                    frame, face, face_coordinates, eye_coordinates,
                    head_pose_angles, gaze_vector, visualize)
                if count % 5 == 0:
                    cv2.imshow('Visualization',
                               cv2.resize(frame_visualize, (500, 500)))

            # Moving the pointer on every frame would be too jumpy/slow.
            if count % 5 == 0:
                mouse_controller.move(mouse_coordinates[0],
                                      mouse_coordinates[1])

            if key == 27:  # Escape
                break

        total_inference_time = time.time() - start_inf_time
    finally:
        # Always release the GUI windows and the capture device.
        cv2.destroyAllWindows()
        feed.close()

    # ---- Report ----
    if count > 0:
        logging.info("############## Models Inference time #######")
        logging.info("Face Detection:{:.3f}ms".format(
            1000 * fd_inference_time / count))
        logging.info("Facial Landmarks Detection:{:.3f}ms".format(
            1000 * fld_inference_time / count))
        logging.info("Headpose Estimation:{:.3f}ms".format(
            1000 * hp_inference_time / count))
        logging.info("Gaze Estimation:{:.3f}ms".format(
            1000 * ge_inference_time / count))
        logging.info("############## End #########################")

    logging.info("############## Summarized Results ##########")
    logging.info(
        "Total Model Load Time: {:.3f}ms".format(total_model_load_time))
    logging.info("Total Inference Time: {:.3f}s".format(total_inference_time))
    # Guard against a zero elapsed time on an empty stream.
    frames_per_second = (count / total_inference_time
                         if total_inference_time > 0 else 0.0)
    logging.info("FPS:{}".format(frames_per_second))
    logging.info("############ End ###########################")
def _crop_eye(frame, center, half_size=30):
    """Return a copy of the square patch of ``frame`` centred on ``center``.

    The patch is ``2 * half_size`` pixels wide and is clipped to the frame
    borders, so the result may be smaller (or empty) when the eye lies near
    or outside the edge of the image.
    """
    frame_height, frame_width = frame.shape[:2]
    x_min = min(max(center[0] - half_size, 0), frame_width)
    x_max = min(max(center[0] + half_size, 0), frame_width)
    y_min = min(max(center[1] - half_size, 0), frame_height)
    y_max = min(max(center[1] + half_size, 0), frame_height)
    return frame[y_min:y_max, x_min:x_max].copy()


def _load_pointer_models(args):
    """Create and load the four networks, logging each load time.

    Returns ``(face_detection, head_pose, landmarks, gaze)``. Terminates the
    process with exit status 1 if construction or loading fails.
    """
    try:
        face_net = BasePointer()
        head_pose_net = BasePointer()
        landmarks_net = BasePointer()
        gaze_net = GazeEstimation()
    except Exception:
        logging.exception("Error in initializing models")
        raise SystemExit(1)

    load_times = []
    try:
        for network, model_path in ((face_net, args.model1),
                                    (head_pose_net, args.model2),
                                    (landmarks_net, args.model3),
                                    (gaze_net, args.model4)):
            started = time.time()
            network.load_model(model_path, args.device)
            load_times.append(time.time() - started)
    except Exception:
        logging.exception("Error in loading the models")
        raise SystemExit(1)

    face_time, head_pose_time, landmarks_time, gaze_time = load_times
    logging.debug(
        "Loading times for facial detection : {} , landmark detection : {} , head pose detection : {} , gaze estimation : {} "
        .format(face_time, landmarks_time, head_pose_time, gaze_time))
    return face_net, head_pose_net, landmarks_net, gaze_net


def infer_on_stream(args):
    """
    Initialize the inference networks, stream video through them, move the
    mouse pointer according to the estimated gaze and optionally write the
    annotated frames to ``out.mp4``.

    Reads ``input_type``, ``input_file``, ``model1`` .. ``model4``,
    ``device``, ``prob_threshold`` and ``output_intermediate_model`` from
    ``args``. Annotated frames are written only when
    ``output_intermediate_model`` is the string ``'true'``.

    Frames in which no face is found, or whose eye patches fall entirely
    outside the image, are skipped. The input feed, the video writer and
    the OpenCV windows are released even if a frame raises.

    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    output_intermediate_model = args.output_intermediate_model
    save_output = output_intermediate_model == 'true'

    feed = InputFeeder(input_type=args.input_type, input_file=args.input_file)
    cap = feed.load_data()
    out = None

    total_face_time = 0
    total_landmarks_time = 0
    total_head_pose_time = 0
    total_gaze_time = 0

    try:
        # Capture properties 3, 4 and 5 are frame width, height and FPS.
        width = int(cap.get(3))
        height = int(cap.get(4))
        fps = int(cap.get(5))

        face_net, head_pose_net, landmarks_net, gaze_net = (
            _load_pointer_models(args))

        if save_output:
            out = cv2.VideoWriter('out.mp4', CODEC, fps, (width, height))

        # Built once; the controller settings do not change between frames.
        mouse_controller = MouseController("high", "fast")

        for flag, frame in feed.next_batch():
            if not flag:
                break
            key_pressed = cv2.waitKey(60)

            # ---- Face detection ----
            started = time.time()
            outputs_face_detection = face_net.predict(frame)
            total_face_time += time.time() - started
            coords, frame = face_net.preprocess_output_face_detection(
                outputs_face_detection, width, height, args.prob_threshold,
                frame)
            # NOTE: the frame is written after each pipeline stage, so the
            # output video holds up to three images per input frame.
            if save_output:
                out.write(frame)

            if coords is None or len(coords) == 0:
                logging.warning("No face detected in frame; skipping it")
                if key_pressed == 27:
                    break
                continue

            # Only the first detected face is used from here on.
            face_box = coords[0]
            frame_crop_face = crop_face(coords, frame,
                                        output_intermediate_model)

            # ---- Head pose ----
            started = time.time()
            outputs_head_pose = head_pose_net.predict(frame_crop_face)
            total_head_pose_time += time.time() - started
            yaw, pitch, roll = (
                head_pose_net.preprocess_output_head_pose_estimation(
                    outputs_head_pose, frame_crop_face))
            head_pose_angles = [yaw, pitch, roll]
            if save_output:
                cv2.putText(frame, ("Yaw: " + str(int(yaw))), (100, 100),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)
                cv2.putText(frame, ("Pitch: " + str(int(pitch))), (100, 140),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)
                cv2.putText(frame, ("Roll: " + str(int(roll))), (100, 180),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)

            # ---- Facial landmarks ----
            height_crop_face = face_box[3] - face_box[1]
            width_crop_face = face_box[2] - face_box[0]
            started = time.time()
            outputs_landmarks = landmarks_net.predict(frame_crop_face)
            total_landmarks_time += time.time() - started
            eye_offsets = (
                landmarks_net.preprocess_output_landmarks_regression_retail(
                    outputs_landmarks, width_crop_face, height_crop_face,
                    args.prob_threshold, frame))
            # Landmark offsets are added to the face box origin to obtain
            # frame coordinates.
            center_left_eye = (face_box[0] + eye_offsets[0],
                               face_box[1] + eye_offsets[1])
            center_right_eye = (face_box[0] + eye_offsets[2],
                                face_box[1] + eye_offsets[3])

            # Crop the eye patches BEFORE drawing the markers. cv2.rectangle
            # returns the whole annotated frame, so using its return value
            # as the "eye image" fed the full frame to the gaze network.
            # NOTE(review): assumes GazeEstimation.predict expects the two
            # eye image patches -- confirm against its implementation.
            left_eye_image = _crop_eye(frame, center_left_eye)
            right_eye_image = _crop_eye(frame, center_right_eye)

            for eye_center in (center_left_eye, center_right_eye):
                cv2.circle(frame, eye_center, 2, (0, 255, 0), thickness=3)
                cv2.rectangle(frame,
                              (eye_center[0] - 30, eye_center[1] - 30),
                              (eye_center[0] + 30, eye_center[1] + 30),
                              (0, 255, 0), 3)
            if save_output:
                out.write(frame)

            if left_eye_image.size == 0 or right_eye_image.size == 0:
                logging.warning("Eye region outside the frame; skipping it")
                if key_pressed == 27:
                    break
                continue

            # ---- Gaze estimation ----
            started = time.time()
            outputs_gaze_estimation = gaze_net.predict(
                left_eye_image, right_eye_image, head_pose_angles)
            total_gaze_time += time.time() - started

            # Draw the gaze direction from each eye; y is negated before
            # being added to the image row coordinate.
            arrow = 100
            g_x = int(outputs_gaze_estimation[0] * arrow)
            g_y = int(-(outputs_gaze_estimation[1]) * arrow)
            frame = cv2.arrowedLine(
                frame, center_left_eye,
                (center_left_eye[0] + g_x, center_left_eye[1] + g_y),
                (0, 0, 255), 3)
            frame = cv2.arrowedLine(
                frame, center_right_eye,
                (center_right_eye[0] + g_x, center_right_eye[1] + g_y),
                (0, 0, 255), 3)
            if save_output:
                out.write(frame)

            mouse_controller.move(outputs_gaze_estimation[0],
                                  outputs_gaze_estimation[1])

            if key_pressed == 27:  # Escape
                break
    finally:
        feed.close()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()

    logging.debug(
        "total inference times for facial detection : {} , landmark detection : {} , head pose detection : {} , gaze estimation : {} "
        .format(total_face_time, total_landmarks_time, total_head_pose_time,
                total_gaze_time))
class Inferencer:
    """Gaze-estimation pipeline: face -> landmarks -> head pose -> gaze.

    Builds and loads the four models on construction. ``run`` (or calling
    the instance) streams frames from an ``InputFeeder``, runs the pipeline
    on the first detected face of each frame and can display the annotated
    frame, save it to ``output.mp4`` and move the mouse pointer.
    """

    def __init__(self,
                 device='CPU',
                 mouse_con=False,
                 face_dec=None,
                 fac_land=None,
                 head_pose=None,
                 gaze=None,
                 show_video=False,
                 save_video=False):
        """Create and load all four models.

        :param device: device name passed to every model.
        :param mouse_con: when truthy, a ``MouseController`` is created and
            the pointer follows the gaze.
        :param face_dec: face detection model argument (required).
        :param fac_land: facial landmarks model argument (required).
        :param head_pose: head pose model argument (required).
        :param gaze: gaze estimation model argument (required).
        :param show_video: display each annotated frame in a window.
        :param save_video: write each annotated frame to ``output.mp4``.
        :raises ValueError: if any of the four model arguments is missing.
        """
        if not (face_dec and fac_land and head_pose and gaze):
            raise ValueError('Missing Arguments')

        self.face_dec = FaceDetectionModel(face_dec, device=device)
        self.fac_land = FacialLandmarksDetection(fac_land, device=device)
        self.head_pose = Head_Pose_Estimation(head_pose, device=device)
        self.gaze = Gaze_Estimation(gaze, device=device)
        for model in (self.face_dec, self.fac_land, self.head_pose,
                      self.gaze):
            model.load_model()

        # Always define the attribute: run() tests ``self.mouse_con`` and
        # would otherwise raise AttributeError when mouse control is off.
        self.mouse_con = MouseController("low", "fast") if mouse_con else None
        self.show_video, self.save_video = show_video, save_video

    def __call__(self, input_type=None, input_file=None):
        """Shortcut for :meth:`run` with the same arguments."""
        self.run(input_type=input_type, input_file=input_file)

    def run(self, input_type=None, input_file=None):
        """Stream frames through the pipeline and print average timings.

        :param input_type: input kind handed to ``InputFeeder``. When given,
            a new feeder is created and loaded; otherwise a feeder already
            stored on ``self.input_`` is reused.
        :param input_file: path handed to ``InputFeeder``; may be ``None``
            for sources that need no file.

        Errors raised while processing frames are logged, not propagated.
        The video writer, the feeder and the OpenCV windows are released in
        every case. Pressing Escape (key code 27) stops the loop.
        """
        # A camera source has no file, so only the type is required.
        if input_type:
            self.input_ = InputFeeder(input_type, input_file)
            self.input_.load_data()

        feed = getattr(self, 'input_', None)
        if feed is None:
            logging.error("Error in inference: no input source configured")
            return

        writer = None
        if self.save_video:
            # The fourcc is given as a raw integer.
            # NOTE(review): presumably selects an MP4-compatible codec --
            # confirm on the target platform.
            writer = cv2.VideoWriter(
                'output.mp4', 0x00000021, 30,
                (int(feed.cap.get(3)), int(feed.cap.get(4))))

        fc_dec_inf_time = 0
        landmark_inf_time = 0
        pose_inf_time = 0
        gaze_inf_time = 0
        frame_counter = 0

        try:
            # One generator for the whole stream instead of a fresh one for
            # every frame.
            for frame in feed.next_batch():
                # NOTE(review): assumes the feeder may yield None once the
                # stream is exhausted -- confirm against InputFeeder.
                if frame is None:
                    break
                frame_counter += 1
                key_pressed = cv2.waitKey(60)

                started = time.time()
                out_frame, boxes = self.face_dec.predict(
                    frame, display_output=True)
                fc_dec_inf_time += time.time() - started

                # Consider only the first detected face in the frame.
                box = next(iter(boxes), None)
                if box is not None:
                    face = out_frame[box[1]:box[3], box[0]:box[2]]

                    started = time.time()
                    out_frame, left_eye_point, right_eye_point = (
                        self.fac_land.predict(out_frame, face, box,
                                              display_output=True))
                    landmark_inf_time += time.time() - started

                    started = time.time()
                    out_frame, headpose_angels = self.head_pose.predict(
                        out_frame, face, box, display_output=True)
                    pose_inf_time += time.time() - started

                    started = time.time()
                    out_frame, gazevector = self.gaze.predict(
                        out_frame, face, box, left_eye_point,
                        right_eye_point, headpose_angels,
                        display_output=True)
                    gaze_inf_time += time.time() - started

                    if self.show_video:
                        cv2.imshow('im', out_frame)
                    if writer is not None:
                        writer.write(out_frame)
                    if self.mouse_con:
                        self.mouse_con.move(gazevector[0], gazevector[1])
                        # Pause after each pointer move.
                        time.sleep(1)

                if key_pressed == 27:  # Escape
                    break
        except Exception as ex:
            logging.exception("Error in inference: " + str(ex))
        else:
            if frame_counter == 0:
                logging.warning("No frames were processed; "
                                "no inference statistics to report")
            else:
                # Averages are over all frames read, in milliseconds.
                for label, total in (
                        ('face detection', fc_dec_inf_time),
                        ('facial landmark', landmark_inf_time),
                        ('head pose estimation', pose_inf_time),
                        ('gaze estimation', gaze_inf_time)):
                    print('average inference time for {} model is :- {:.2f}ms'
                          .format(label, (total / frame_counter) * 1000))
        finally:
            if writer is not None:
                writer.release()
            feed.close()
            cv2.destroyAllWindows()