def process_video(video_path):
    """Track faces in a video file or camera stream, count eye blinks from the
    tracker's eye-openness outputs, stream tracking data over UDP, and return
    per-frame blink information."""
    output_logfile = None
    if args.log_output != "":
        output_logfile = open(args.log_output, "w")
    sys.stdout = OutputLog(output_logfile, sys.stdout)
    sys.stderr = OutputLog(output_logfile, sys.stderr)

    if os.name == 'nt':
        import dshowcapture
        if args.blackmagic == 1:
            dshowcapture.set_bm_enabled(True)
        if args.blackmagic_options is not None:
            dshowcapture.set_options(args.blackmagic_options)
        if args.priority is not None:
            import psutil
            classes = [psutil.IDLE_PRIORITY_CLASS,
                       psutil.BELOW_NORMAL_PRIORITY_CLASS,
                       psutil.NORMAL_PRIORITY_CLASS,
                       psutil.ABOVE_NORMAL_PRIORITY_CLASS,
                       psutil.HIGH_PRIORITY_CLASS,
                       psutil.REALTIME_PRIORITY_CLASS]
            p = psutil.Process(os.getpid())
            p.nice(classes[args.priority])

    if os.name == 'nt' and (args.list_cameras > 0 or args.list_dcaps is not None):
        cap = dshowcapture.DShowCapture()
        info = cap.get_info()
        unit = 10000000.
        if args.list_dcaps is not None:
            formats = {0: "Any", 1: "Unknown", 100: "ARGB", 101: "XRGB",
                       200: "I420", 201: "NV12", 202: "YV12", 203: "Y800",
                       300: "YVYU", 301: "YUY2", 302: "UYVY", 303: "HDYC (Unsupported)",
                       400: "MJPEG", 401: "H264"}
            for cam in info:
                if args.list_dcaps == -1:
                    type = ""
                    if cam['type'] == "Blackmagic":
                        type = "Blackmagic: "
                    print(f"{cam['index']}: {type}{cam['name']}")
                if args.list_dcaps != -1 and args.list_dcaps != cam['index']:
                    continue
                for caps in cam['caps']:
                    format = caps['format']
                    if caps['format'] in formats:
                        format = formats[caps['format']]
                    if caps['minCX'] == caps['maxCX'] and caps['minCY'] == caps['maxCY']:
                        print(f"    {caps['id']}: Resolution: {caps['minCX']}x{caps['minCY']} FPS: {unit/caps['maxInterval']:.3f}-{unit/caps['minInterval']:.3f} Format: {format}")
                    else:
                        print(f"    {caps['id']}: Resolution: {caps['minCX']}x{caps['minCY']}-{caps['maxCX']}x{caps['maxCY']} FPS: {unit/caps['maxInterval']:.3f}-{unit/caps['minInterval']:.3f} Format: {format}")
        else:
            if args.list_cameras == 1:
                print("Available cameras:")
            for cam in info:
                type = ""
                if cam['type'] == "Blackmagic":
                    type = "Blackmagic: "
                if args.list_cameras == 1:
                    print(f"{cam['index']}: {type}{cam['name']}")
                else:
                    print(f"{type}{cam['name']}")
        cap.destroy_capture()
        sys.exit(0)

    # Heavier imports are deferred until after the quick-exit listing paths above.
    import numpy as np
    import time
    import cv2
    import socket
    import struct
    import json
    from input_reader import InputReader, VideoReader, DShowCaptureReader, try_int
    from tracker import Tracker, get_model_base_path
    from tqdm import tqdm

    if args.benchmark > 0:
        model_base_path = get_model_base_path(args.model_dir)
        im = cv2.imread(os.path.join(model_base_path, "benchmark.bin"), cv2.IMREAD_COLOR)
        results = []
        for model_type in [3, 2, 1, 0, -1, -2, -3]:
            tracker = Tracker(224, 224, threshold=0.1, max_threads=args.max_threads,
                              max_faces=1, discard_after=0, scan_every=0, silent=True,
                              model_type=model_type, model_dir=args.model_dir,
                              no_gaze=(model_type == -1), detection_threshold=0.1,
                              use_retinaface=0, max_feature_updates=900,
                              static_model=True if args.no_3d_adapt == 1 else False)
            tracker.detected = 1
            tracker.faces = [(0, 0, 224, 224)]
            total = 0.0
            for i in range(100):
                start = time.perf_counter()
                r = tracker.predict(im)
                total += time.perf_counter() - start
            print(1. / (total / 100.))  # average FPS for this model type
        sys.exit(0)

    target_ip = args.ip
    target_port = args.port

    if args.faces >= 40:
        print("Transmission of tracking data over network is not supported with 40 or more faces.")

    fps = 24
    dcap = None
    use_dshowcapture_flag = False
    if os.name == 'nt':
        fps = args.fps
        dcap = args.dcap
        use_dshowcapture_flag = True if args.use_dshowcapture == 1 else False
        input_reader = InputReader(video_path, args.raw_rgb, args.width, args.height, fps, use_dshowcapture=use_dshowcapture_flag, dcap=dcap)
        if args.dcap == -1 and type(input_reader) == DShowCaptureReader:
            fps = min(fps, input_reader.device.get_fps())
    else:
        input_reader = InputReader(video_path, args.raw_rgb, args.width, args.height, fps, use_dshowcapture=use_dshowcapture_flag)
    # if type(input_reader.reader) == VideoReader:
    #     fps = 0.0

    log = None
    out = None
    first = True
    height = 0
    width = 0
    tracker = None
    sock = None
    total_tracking_time = 0.0
    tracking_time = 0.0
    tracking_frames = 0
    framecount = 0
    eye_blink_frames = 0
    eye_blink_lst = []
    eye_blink_temp = []
    video_info_dict = {}  # filled once frames are read; initialized so the final return cannot raise NameError

    # Blink detection state
    COUNTER = 0
    TOTAL = 0
    current_frame = 1
    blink_start = 0
    blink_end = 0
    closeness = 0
    output_closeness = []
    output_blinks = []
    blink_info = (0, 0)
    processed_frame = []
    frame_info_list = []
    lStart = 42
    lEnd = 48
    rStart = 36
    rEnd = 42
    ear_th = 0.18
    consec_th = 3
    up_to = None
    array_blink_threshold = list()
    ear_list = list()
    col = ["F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8", "F9", "F10", "F11", "F12", "F13"]
    features = ["eye_l", "eye_r", "eyebrow_steepness_l", "eyebrow_updown_l", "eyebrow_quirk_l",
                "eyebrow_steepness_r", "eyebrow_updown_r", "eyebrow_quirk_r",
                "mouth_corner_updown_l", "mouth_corner_inout_l", "mouth_corner_updown_r",
                "mouth_corner_inout_r", "mouth_open", "mouth_wide"]

    if args.log_data != "":
        log = open(args.log_data, "w")
        log.write("Frame,Time,Width,Height,FPS,Face,FaceID,RightOpen,LeftOpen,AverageConfidence,Success3D,PnPError,RotationQuat.X,RotationQuat.Y,RotationQuat.Z,RotationQuat.W,Euler.X,Euler.Y,Euler.Z,RVec.X,RVec.Y,RVec.Z,TVec.X,TVec.Y,TVec.Z")
        for i in range(66):
            log.write(f",Landmark[{i}].X,Landmark[{i}].Y,Landmark[{i}].Confidence")
        for i in range(66):
            log.write(f",Point3D[{i}].X,Point3D[{i}].Y,Point3D[{i}].Z")
        for feature in features:
            log.write(f",{feature}")
        log.write("\r\n")
        log.flush()

    is_camera = video_path == str(try_int(video_path))

    try:
        attempt = 0
        frame_time = time.perf_counter()
        target_duration = 0
        if fps > 0:
            target_duration = 1. / float(fps)
        repeat = args.repeat_video != 0 and type(input_reader.reader) == VideoReader
        need_reinit = 0
        failures = 0
        source_name = input_reader.name
        blink_count = 0
        blink_count_origin = 0
        while repeat or input_reader.is_open():
            if not input_reader.is_open() or need_reinit == 1:
                input_reader = InputReader(video_path, args.raw_rgb, args.width, args.height, fps, use_dshowcapture=use_dshowcapture_flag, dcap=dcap)
                if input_reader.name != source_name:
                    print(f"Failed to reinitialize camera and got {input_reader.name} instead of {source_name}.")
                    sys.exit(1)
                need_reinit = 2
                time.sleep(0.02)
                continue
            if not input_reader.is_ready():
                time.sleep(0.02)
                continue

            ret, frame = input_reader.read()
            fps = input_reader.get_fps()
            frame_count = int(input_reader.get_frame())
            duration = frame_count / fps
            video_info_dict = {
                'fps': fps,
                'frame_count': frame_count,
                'duration(s)': duration
            }
            # frame = cv2.flip(frame, 1)
            # 2 -50 - 0.5 -20,-50
            # frame = cv2.convertScaleAbs(frame, -1, 0.5, -20)

            if not ret:
                if repeat:
                    if need_reinit == 0:
                        need_reinit = 1
                    continue
                elif is_camera:
                    attempt += 1
                    if attempt > 30:
                        break
                    else:
                        time.sleep(0.02)
                        if attempt == 3:
                            need_reinit = 1
                        continue
                else:
                    break

            attempt = 0
            need_reinit = 0
            # frame_count += 1
            now = time.time()

            if first:
                first = False
                height, width, channels = frame.shape
                sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                tracker = Tracker(width, height, threshold=args.threshold, max_threads=args.max_threads,
                                  max_faces=args.faces, discard_after=args.discard_after,
                                  scan_every=args.scan_every,
                                  silent=False if args.silent == 0 else True,
                                  model_type=args.model, model_dir=args.model_dir,
                                  no_gaze=False if args.gaze_tracking != 0 and args.model != -1 else True,
                                  detection_threshold=args.detection_threshold,
                                  use_retinaface=args.scan_retinaface,
                                  max_feature_updates=args.max_feature_updates,
                                  static_model=True if args.no_3d_adapt == 1 else False,
                                  try_hard=args.try_hard == 1)
                if args.video_out is not None:
                    out = cv2.VideoWriter(args.video_out, cv2.VideoWriter_fourcc('F', 'F', 'V', '1'), args.video_fps, (width * args.video_scale, height * args.video_scale))

            try:
                inference_start = time.perf_counter()
                faces = tracker.predict(frame)
                if len(faces) > 0:
                    inference_time = (time.perf_counter() - inference_start)
                    total_tracking_time += inference_time
                    tracking_time += inference_time / len(faces)
                    tracking_frames += 1
                # else:
                #     ear_list.append(np.nan)
                #     array_blink_threshold.append(np.nan)
                packet = bytearray()
                detected = False
                r_eye_roi_resize = []
                l_eye_roi_resize = []
                head_list = []
                for face_num, f in enumerate(faces):
                    f = copy.copy(f)
                    f.id += args.face_id_offset
                    if f.eye_blink is None:
                        f.eye_blink = [1, 1]
                    right_state = "O" if f.eye_blink[0] > 0.30 else "-"
                    left_state = "O" if f.eye_blink[1] > 0.30 else "-"
                    if f.eye_blink[0] < 0.7 or f.eye_blink[1] < 0.7:
                        eye_blink_frames += 1
                    if args.silent == 0:
                        print(f"Confidence[{f.id}]: {f.conf:.4f} / 3D fitting error: {f.pnp_error:.4f} / Eyes: {left_state}, {right_state}")
                    detected = True
                    if not f.success:
                        pts_3d = np.zeros((70, 3), np.float32)
                    packet.extend(bytearray(struct.pack("d", now)))
                    packet.extend(bytearray(struct.pack("i", f.id)))
                    packet.extend(bytearray(struct.pack("f", width)))
                    packet.extend(bytearray(struct.pack("f", height)))
                    packet.extend(bytearray(struct.pack("f", f.eye_blink[0])))
                    packet.extend(bytearray(struct.pack("f", f.eye_blink[1])))
                    packet.extend(bytearray(struct.pack("B", 1 if f.success else 0)))
                    packet.extend(bytearray(struct.pack("f", f.pnp_error)))
                    packet.extend(bytearray(struct.pack("f", f.quaternion[0])))
                    packet.extend(bytearray(struct.pack("f", f.quaternion[1])))
                    packet.extend(bytearray(struct.pack("f", f.quaternion[2])))
                    packet.extend(bytearray(struct.pack("f", f.quaternion[3])))
                    packet.extend(bytearray(struct.pack("f", f.euler[0])))
                    packet.extend(bytearray(struct.pack("f", f.euler[1])))
                    packet.extend(bytearray(struct.pack("f", f.euler[2])))
                    packet.extend(bytearray(struct.pack("f", f.translation[0])))
                    packet.extend(bytearray(struct.pack("f", f.translation[1])))
                    packet.extend(bytearray(struct.pack("f", f.translation[2])))

                    # Landmarks 36-41 form the right eye contour, 42-47 the left (x/y swapped to row/col order).
                    r_eye = [[f.lms[i][1], f.lms[i][0]] for i in range(36, 42)]
                    l_eye = [[f.lms[i][1], f.lms[i][0]] for i in range(42, 48)]
                    l_eye_ratio = eye_aspect_ratio(l_eye)
                    r_eye_ratio = eye_aspect_ratio(r_eye)
                    ear = (l_eye_ratio + r_eye_ratio) / 2.0
                    ear_model = (f.eye_blink[0] + f.eye_blink[1]) / 2.0

                    if ear_model < 0.7:  # 0.21
                        COUNTER += 1
                        closeness = 1
                        output_closeness.append(closeness)
                    else:
                        if COUNTER >= consec_th:
                            TOTAL += 1
                            blink_start = current_frame - COUNTER
                            blink_end = current_frame - 1
                            blink_info = (blink_start, blink_end)
                            output_blinks.append(blink_info)
                        COUNTER = 0
                        closeness = 0
                        output_closeness.append(closeness)

                    frame_info = {
                        'frame_no': current_frame,
                        'face_detected': 1,
                        'face_coordinates': 0,
                        'left_eye_coor': 0,
                        'right_eye_coor': 0,
                        'left_ear': l_eye_ratio,
                        'right_ear': r_eye_ratio,
                        'avg_ear': ear,
                        'avg_ear_model': ear_model,
                        'closeness': closeness,
                        'blink_no': TOTAL,
                        'blink_start_frame': blink_start,
                        'blink_end_frame': blink_end,
                        'reserved_for_calibration': False
                    }
                    frame_info_list.append(frame_info)
                    processed_frame.append(frame)
                    current_frame += 1
                    # frame_info_df = pd.DataFrame(frame_info_list)  # debug

                    framecount += 1
                    if log is not None:
                        log.write(f"{framecount},{now},{width},{height},{args.fps},{face_num},{f.id},{f.eye_blink[0]},{f.eye_blink[1]},{f.conf},{f.success},{f.pnp_error},{f.quaternion[0]},{f.quaternion[1]},{f.quaternion[2]},{f.quaternion[3]},{f.euler[0]},{f.euler[1]},{f.euler[2]},{f.rotation[0]},{f.rotation[1]},{f.rotation[2]},{f.translation[0]},{f.translation[1]},{f.translation[2]}")
                    for (x, y, c) in f.lms:
                        packet.extend(bytearray(struct.pack("f", c)))
                    if args.visualize > 1:
                        frame = cv2.putText(frame, str(f.id), (int(f.bbox[0]), int(f.bbox[1])), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 0, 0))
                        frame = cv2.putText(frame, "FPS : %0.1f" % fps, (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.55, (0, 255, 0), 1, cv2.LINE_AA)
                    if args.visualize > 2:
                        frame = cv2.putText(frame, f"{f.conf:.4f}", (int(f.bbox[0] + 18), int(f.bbox[1] - 6)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255))
                    # cv2.imwrite('frame.jpg', frame)
                    for pt_num, (x, y, c) in enumerate(f.lms):
                        packet.extend(bytearray(struct.pack("f", y)))
                        packet.extend(bytearray(struct.pack("f", x)))
                        if log is not None:
                            log.write(f",{y},{x},{c}")
                        if pt_num == 66 and (f.eye_blink[0] < 0.30 or c < 0.30):
                            continue
                        if pt_num == 67 and (f.eye_blink[1] < 0.30 or c < 0.30):
                            continue
                        x = int(x + 0.5)
                        y = int(y + 0.5)
                        if args.visualize != 0 or out is not None:
                            if args.visualize > 3:
                                frame = cv2.putText(frame, str(pt_num), (int(y), int(x)), cv2.FONT_HERSHEY_SIMPLEX, 0.25, (255, 255, 0))
                            color = (0, 255, 0)
                            if pt_num >= 66:
                                color = (255, 255, 0)
                            if not (x < 0 or y < 0 or x >= height or y >= width):
                                frame[int(x), int(y)] = color
                            x += 1
                            if not (x < 0 or y < 0 or x >= height or y >= width):
                                frame[int(x), int(y)] = color
                            y += 1
                            if not (x < 0 or y < 0 or x >= height or y >= width):
                                frame[int(x), int(y)] = color
                            x -= 1
                            if not (x < 0 or y < 0 or x >= height or y >= width):
                                frame[int(x), int(y)] = color
                    if args.pnp_points != 0 and (args.visualize != 0 or out is not None) and f.rotation is not None:
                        if args.pnp_points > 1:
                            projected = cv2.projectPoints(f.face_3d[0:66], f.rotation, f.translation, tracker.camera, tracker.dist_coeffs)
                        else:
                            projected = cv2.projectPoints(f.contour, f.rotation, f.translation, tracker.camera, tracker.dist_coeffs)
                        for [(x, y)] in projected[0]:
                            x = int(x + 0.5)
                            y = int(y + 0.5)
                            if not (x < 0 or y < 0 or x >= height or y >= width):
                                frame[int(x), int(y)] = (0, 255, 255)
                            x += 1
                            if not (x < 0 or y < 0 or x >= height or y >= width):
                                frame[int(x), int(y)] = (0, 255, 255)
                            y += 1
                            if not (x < 0 or y < 0 or x >= height or y >= width):
                                frame[int(x), int(y)] = (0, 255, 255)
                            x -= 1
                            if not (x < 0 or y < 0 or x >= height or y >= width):
                                frame[int(x), int(y)] = (0, 255, 255)
                    for (x, y, z) in f.pts_3d:
                        packet.extend(bytearray(struct.pack("f", x)))
                        packet.extend(bytearray(struct.pack("f", -y)))
                        packet.extend(bytearray(struct.pack("f", -z)))
                        if log is not None:
                            log.write(f",{x},{-y},{-z}")
                    if f.current_features is None:
                        f.current_features = {}
                    for feature in features:
                        if feature not in f.current_features:
                            f.current_features[feature] = 0
                        packet.extend(bytearray(struct.pack("f", f.current_features[feature])))
                        if log is not None:
                            log.write(f",{f.current_features[feature]}")
                    if log is not None:
                        log.write("\r\n")
                        log.flush()

                if detected and len(faces) < 40:
                    sock.sendto(packet, (target_ip, target_port))

                if out is not None:
                    video_frame = frame
                    if args.video_scale != 1:
                        video_frame = cv2.resize(frame, (width * args.video_scale, height * args.video_scale), interpolation=cv2.INTER_NEAREST)
                    out.write(video_frame)
                    if args.video_scale != 1:
                        del video_frame

                failures = 0
            except Exception as e:
                if e.__class__ == KeyboardInterrupt:
                    if args.silent == 0:
                        print("Quitting")
                    break
                traceback.print_exc()
                failures += 1
                if failures > 30:
                    break

            collected = False
            del frame

            # Pace the loop to the target frame duration, garbage collecting once per frame.
            duration = time.perf_counter() - frame_time
            while duration < target_duration:
                if not collected:
                    gc.collect()
                    collected = True
                duration = time.perf_counter() - frame_time
                sleep_time = target_duration - duration
                if sleep_time > 0:
                    time.sleep(sleep_time)
                duration = time.perf_counter() - frame_time
            frame_time = time.perf_counter()
    except KeyboardInterrupt:
        if args.silent == 0:
            print("Quitting")

    input_reader.close()
    if out is not None:
        out.release()
    cv2.destroyAllWindows()

    if args.silent == 0 and tracking_frames > 0:
        average_tracking_time = 1000 * tracking_time / tracking_frames
        print(f"Average tracking time per detected face: {average_tracking_time:.2f} ms")
        print(f"Tracking time: {total_tracking_time:.3f} s\nFrames: {tracking_frames}")

    frame_info_df = pd.DataFrame(frame_info_list)
    frame_info_df['output_closeness'] = output_closeness
    file_name = os.path.basename(video_path)
    output_str = f"Processing {file_name} is done.\n\n"
    return frame_info_df, output_closeness, output_blinks, processed_frame, video_info_dict, output_str
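

# --- Illustrative sketches below: labeled assumptions, not part of the tracker ---

# The eye_aspect_ratio() helper called in process_video() is defined elsewhere
# in this project. As a minimal sketch, the standard EAR formula from
# Soukupová & Čech (2016) over six eye-contour points (dlib's 68-landmark order:
# p0/p3 are the horizontal corners, p1/p2 the top, p4/p5 the bottom) would look
# like the hypothetical helper below; it is named differently on purpose so it
# cannot shadow the real implementation.
def example_eye_aspect_ratio(eye):
    import numpy as np  # deferred import, matching this file's style
    a = np.linalg.norm(np.subtract(eye[1], eye[5]))  # vertical distance p1-p5
    b = np.linalg.norm(np.subtract(eye[2], eye[4]))  # vertical distance p2-p4
    c = np.linalg.norm(np.subtract(eye[0], eye[3]))  # horizontal distance p0-p3
    return (a + b) / (2.0 * c)  # large when the eye is open, near zero when closed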
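
# For reference, the COUNTER/consec_th logic in the main loop implements the
# following hysteresis: a blink is registered when a run of at least consec_th
# consecutive "closed" frames ends. A standalone, behavior-equivalent sketch
# (like the main loop, a run still open at the end of the sequence is not counted):
def example_count_blinks(closeness_seq, consec_th=3):
    """Return (start_frame, end_frame) tuples, 1-indexed like current_frame above."""
    blinks = []
    run = 0
    for frame_no, closed in enumerate(closeness_seq, start=1):
        if closed:
            run += 1
        else:
            if run >= consec_th:
                blinks.append((frame_no - run, frame_no - 1))
            run = 0
    return blinks

# Example: frames 2-4 closed yields one blink spanning frames 2 through 4:
# example_count_blinks([0, 1, 1, 1, 0]) == [(2, 4)]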
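
# Each face record sent over UDP above starts with a fixed-size header: a double
# timestamp, an int face id, four floats, a success byte, a pnp_error float,
# then the quaternion, Euler angles, and translation. The fields are packed one
# at a time with struct.pack's native byte order, so on common platforms a
# single "=" format string matches. A hypothetical receiver for just this
# header could look like the sketch below (the variable-length landmark,
# 3D-point, and feature payload that follows is omitted); the function and
# field names are illustrative, not part of this project.
def example_parse_face_header(data, offset=0):
    import struct  # deferred import, matching this file's style
    header = struct.Struct("=d i f f f f B f 4f 3f 3f")  # 73 bytes, native byte order assumed
    (now, face_id, width, height,
     right_open, left_open, success, pnp_error,
     qx, qy, qz, qw, ex, ey, ez, tx, ty, tz) = header.unpack_from(data, offset)
    return {"time": now, "id": face_id, "size": (width, height),
            "eye_blink": (right_open, left_open), "success": bool(success),
            "pnp_error": pnp_error, "quaternion": (qx, qy, qz, qw),
            "euler": (ex, ey, ez), "translation": (tx, ty, tz)}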
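
# Hypothetical usage sketch showing how process_video's return values fit
# together. It assumes the module-level argparse `args` have already been
# populated by the CLI entry point elsewhere in this file; the helper name is
# illustrative only.
def example_blink_summary(video_path):
    frame_info_df, closeness, blinks, frames, video_info, msg = process_video(video_path)
    print(msg.strip())
    print(f"{len(blinks)} blinks over {video_info['frame_count']} frames "
          f"({video_info['duration(s)']:.1f} s)")
    return frame_info_df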