import copy
import gc
import json
import os
import socket
import struct
import sys
import time
import traceback

import cv2
import numpy as np
import pandas as pd

from input_reader import InputReader, VideoReader, DShowCaptureReader, try_int
from tracker import Tracker, get_model_base_path

# `args` (the parsed command-line options) is expected to be provided by this
# script's argument parser, which is not part of this excerpt.
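# OutputLog and eye_aspect_ratio are used below but are not defined in this
# excerpt. The sketches here are minimal stand-ins: OutputLog is assumed to be
# a tee that mirrors writes to an optional logfile, and eye_aspect_ratio is
# assumed to implement the standard Soukupova & Cech EAR formula. The
# repository's own definitions may differ.
class OutputLog(object):
    def __init__(self, fh, output):
        self.fh = fh          # optional logfile handle (may be None)
        self.output = output  # original stream (sys.stdout / sys.stderr)
    def write(self, buf):
        if self.fh is not None:
            self.fh.write(buf)
        self.output.write(buf)
        self.flush()
    def flush(self):
        if self.fh is not None:
            self.fh.flush()
        self.output.flush()

def eye_aspect_ratio(eye):
    # EAR = (|p2-p6| + |p3-p5|) / (2 * |p1-p4|) over six eye landmarks,
    # so the value drops toward 0 as the eyelid closes.
    eye = np.asarray(eye, dtype=np.float64)
    a = np.linalg.norm(eye[1] - eye[5])
    b = np.linalg.norm(eye[2] - eye[4])
    c = np.linalg.norm(eye[0] - eye[3])
    return (a + b) / (2.0 * c)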

def process_video(video_path):
    output_logfile = None
    if args.log_output != "":
        output_logfile = open(args.log_output, "w")
    sys.stdout = OutputLog(output_logfile, sys.stdout)
    sys.stderr = OutputLog(output_logfile, sys.stderr)

    if os.name == 'nt':
        import dshowcapture
        if args.blackmagic == 1:
            dshowcapture.set_bm_enabled(True)
        if args.blackmagic_options is not None:
            dshowcapture.set_options(args.blackmagic_options)
        if args.priority is not None:
            import psutil
            classes = [psutil.IDLE_PRIORITY_CLASS, psutil.BELOW_NORMAL_PRIORITY_CLASS, psutil.NORMAL_PRIORITY_CLASS, psutil.ABOVE_NORMAL_PRIORITY_CLASS, psutil.HIGH_PRIORITY_CLASS, psutil.REALTIME_PRIORITY_CLASS]
            p = psutil.Process(os.getpid())
            p.nice(classes[args.priority])

    if os.name == 'nt' and (args.list_cameras > 0 or args.list_dcaps is not None):
        cap = dshowcapture.DShowCapture()
        info = cap.get_info()
        unit = 10000000.0  # DirectShow frame intervals are in 100 ns units, so unit/interval = FPS
        if args.list_dcaps is not None:
            formats = {0: "Any", 1: "Unknown", 100: "ARGB", 101: "XRGB", 200: "I420", 201: "NV12", 202: "YV12", 203: "Y800", 300: "YVYU", 301: "YUY2", 302: "UYVY", 303: "HDYC (Unsupported)", 400: "MJPEG", 401: "H264" }
            for cam in info:
                if args.list_dcaps == -1:
                    type = ""
                    if cam['type'] == "Blackmagic":
                        type = "Blackmagic: "
                    print(f"{cam['index']}: {type}{cam['name']}")
                if args.list_dcaps != -1 and args.list_dcaps != cam['index']:
                    continue
                for caps in cam['caps']:
                    format = caps['format']
                    if caps['format'] in formats:
                        format = formats[caps['format']]
                    if caps['minCX'] == caps['maxCX'] and caps['minCY'] == caps['maxCY']:
                        print(f"    {caps['id']}: Resolution: {caps['minCX']}x{caps['minCY']} FPS: {unit/caps['maxInterval']:.3f}-{unit/caps['minInterval']:.3f} Format: {format}")
                    else:
                        print(f"    {caps['id']}: Resolution: {caps['minCX']}x{caps['minCY']}-{caps['maxCX']}x{caps['maxCY']} FPS: {unit/caps['maxInterval']:.3f}-{unit/caps['minInterval']:.3f} Format: {format}")
        else:
            if args.list_cameras == 1:
                print("Available cameras:")
            for cam in info:
                type = ""
                if cam['type'] == "Blackmagic":
                    type = "Blackmagic: "
                if args.list_cameras == 1:
                    print(f"{cam['index']}: {type}{cam['name']}")
                else:
                    print(f"{type}{cam['name']}")
        cap.destroy_capture()
        sys.exit(0)

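    # Benchmark mode: run each model type on a fixed 224x224 crop 100 times
    # with detection pre-seeded, then print the average FPS per model.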
    if args.benchmark > 0:
        model_base_path = get_model_base_path(args.model_dir)
        im = cv2.imread(os.path.join(model_base_path, "benchmark.bin"), cv2.IMREAD_COLOR)
        results = []
        for model_type in [3, 2, 1, 0, -1, -2, -3]:
            tracker = Tracker(224, 224, threshold=0.1, max_threads=args.max_threads, max_faces=1,
                              discard_after=0, scan_every=0, silent=True, model_type=model_type,
                              model_dir=args.model_dir, no_gaze=(model_type == -1),
                              detection_threshold=0.1, use_retinaface=0, max_feature_updates=900,
                              static_model=(args.no_3d_adapt == 1))
            tracker.detected = 1
            tracker.faces = [(0, 0, 224, 224)]
            total = 0.0
            for i in range(100):
                start = time.perf_counter()
                r = tracker.predict(im)
                total += time.perf_counter() - start
            # Average FPS over the 100 runs for this model type.
            print(1. / (total / 100.))
        sys.exit(0)

    target_ip = args.ip
    target_port = args.port

    if args.faces >= 40:
        print("Transmission of tracking data over network is not supported with 40 or more faces.")

    fps = 24
    dcap = None
    use_dshowcapture_flag = False
    if os.name == 'nt':
        fps = args.fps
        dcap = args.dcap
        use_dshowcapture_flag = args.use_dshowcapture == 1
        input_reader = InputReader(video_path, args.raw_rgb, args.width, args.height, fps, use_dshowcapture=use_dshowcapture_flag, dcap=dcap)
        if args.dcap == -1 and type(input_reader) == DShowCaptureReader:
            fps = min(fps, input_reader.device.get_fps())
    else:
        input_reader = InputReader(video_path, args.raw_rgb, args.width, args.height, fps, use_dshowcapture=use_dshowcapture_flag)

    # if type(input_reader.reader) == VideoReader:
    #     fps = 0.0

    log = None
    out = None
    first = True
    height = 0
    width = 0
    tracker = None
    sock = None
    total_tracking_time = 0.0
    tracking_time = 0.0
    tracking_frames = 0
    framecount = 0
    eye_blink_frames = 0
    eye_blink_lst = []
    eye_blink_temp = []

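    # Blink-detection state: COUNTER counts consecutive closed-eye frames and a
    # blink is recorded once the eyes reopen after at least consec_th such
    # frames; TOTAL is the number of completed blinks.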
    COUNTER = 0
    TOTAL = 0
    current_frame = 1
    blink_start = 0
    blink_end = 0
    closeness = 0
    output_closeness = []
    output_blinks = []
    blink_info = (0,0)
    processed_frame = []
    frame_info_list = []
    lStart = 42
    lEnd = 48
    rStart = 36
    rEnd = 42
    ear_th = 0.18
    consec_th = 3
    up_to = None


    array_blink_threshold = []
    ear_list = []
    col = ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F11', 'F12', 'F13']

    features = ["eye_l", "eye_r", "eyebrow_steepness_l", "eyebrow_updown_l", "eyebrow_quirk_l", "eyebrow_steepness_r", "eyebrow_updown_r", "eyebrow_quirk_r", "mouth_corner_updown_l", "mouth_corner_inout_l", "mouth_corner_updown_r", "mouth_corner_inout_r", "mouth_open", "mouth_wide"]

    if args.log_data != "":
        log = open(args.log_data, "w")
        log.write("Frame,Time,Width,Height,FPS,Face,FaceID,RightOpen,LeftOpen,AverageConfidence,Success3D,PnPError,RotationQuat.X,RotationQuat.Y,RotationQuat.Z,RotationQuat.W,Euler.X,Euler.Y,Euler.Z,RVec.X,RVec.Y,RVec.Z,TVec.X,TVec.Y,TVec.Z")
        for i in range(66):
            log.write(f",Landmark[{i}].X,Landmark[{i}].Y,Landmark[{i}].Confidence")
        for i in range(66):
            log.write(f",Point3D[{i}].X,Point3D[{i}].Y,Point3D[{i}].Z")
        for feature in features:
            log.write(f",{feature}")
        log.write("\r\n")
        log.flush()

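    # A purely numeric video_path (e.g. "0") is interpreted as a camera index.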
    is_camera = video_path == str(try_int(video_path))

    try:
        attempt = 0
        frame_time = time.perf_counter()
        target_duration = 0
        if fps > 0:
            target_duration = 1. / float(fps)
        repeat = args.repeat_video != 0 and type(input_reader.reader) == VideoReader
        need_reinit = 0
        failures = 0
        source_name = input_reader.name
        blink_count = 0
        blink_count_origin = 0
    
        while repeat or input_reader.is_open():
            if not input_reader.is_open() or need_reinit == 1:
                input_reader = InputReader(video_path, args.raw_rgb, args.width, args.height, fps, use_dshowcapture=use_dshowcapture_flag, dcap=dcap)
                if input_reader.name != source_name:
                    print(f"Failed to reinitialize camera and got {input_reader.name} instead of {source_name}.")
                    sys.exit(1)
                need_reinit = 2
                time.sleep(0.02)
                continue
            if not input_reader.is_ready():
                time.sleep(0.02)
                continue
            
            ret, frame = input_reader.read()

            fps = input_reader.get_fps()
            frame_count = int(input_reader.get_frame())
            duration = frame_count / fps if fps > 0 else 0.0
            video_info_dict = {
                'fps': fps,
                'frame_count': frame_count,
                'duration(s)': duration
            }

            # frame = cv2.flip(frame,1)
            #2 -50 - 0.5 -20,-50
            # frame = cv2.convertScaleAbs(frame, -1, 0.5, -20)

            if not ret:
                if repeat:
                    if need_reinit == 0:
                        need_reinit = 1
                    continue
                elif is_camera:
                    attempt += 1
                    if attempt > 30:
                        break
                    else:
                        time.sleep(0.02)
                        if attempt == 3:
                            need_reinit = 1
                        continue
                else:
                    break

            attempt = 0
            need_reinit = 0
            # frame_count += 1
            now = time.time()

            if first:
                first = False
                height, width, channels = frame.shape
                sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                tracker = Tracker(width, height, threshold=args.threshold, max_threads=args.max_threads,
                                  max_faces=args.faces, discard_after=args.discard_after,
                                  scan_every=args.scan_every, silent=args.silent != 0,
                                  model_type=args.model, model_dir=args.model_dir,
                                  no_gaze=not (args.gaze_tracking != 0 and args.model != -1),
                                  detection_threshold=args.detection_threshold,
                                  use_retinaface=args.scan_retinaface,
                                  max_feature_updates=args.max_feature_updates,
                                  static_model=(args.no_3d_adapt == 1), try_hard=(args.try_hard == 1))
                if args.video_out is not None:
                    out = cv2.VideoWriter(args.video_out, cv2.VideoWriter_fourcc('F','F','V','1'), args.video_fps, (width * args.video_scale, height * args.video_scale))

            try:
                inference_start = time.perf_counter()
                faces = tracker.predict(frame)
                if len(faces) > 0:
                    inference_time = (time.perf_counter() - inference_start)
                    total_tracking_time += inference_time
                    tracking_time += inference_time / len(faces)
                    tracking_frames += 1
                # else:
                #     ear_list.append(np.nan)
                #     array_blink_threshold.append(np.nan)
                packet = bytearray()
                detected = False
                r_eye_roi_resize = []
                l_eye_roi_resize = []
                head_list = []


                for face_num, f in enumerate(faces):
                    f = copy.copy(f)
                    f.id += args.face_id_offset
                    if f.eye_blink is None:
                        f.eye_blink = [1, 1]

                    right_state = "O" if f.eye_blink[0] > 0.30 else "-"
                    left_state = "O" if f.eye_blink[1] > 0.30 else "-"
                    if f.eye_blink[0] < 0.7 or f.eye_blink[1] < 0.7:
                        eye_blink_frames += 1

                    if args.silent == 0:
                        print(f"Confidence[{f.id}]: {f.conf:.4f} / 3D fitting error: {f.pnp_error:.4f} / Eyes: {left_state}, {right_state}")
                    
                    detected = True
                    if not f.success:
                        pts_3d = np.zeros((70, 3), np.float32)
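                    # Serialize this face into the UDP packet: timestamp (double),
                    # face id (int), frame size, eye-open values, success flag,
                    # PnP error, rotation quaternion, Euler angles, and translation;
                    # landmark confidences, 2D landmarks, 3D points, and features follow below.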
                    packet.extend(bytearray(struct.pack("d", now)))
                    packet.extend(bytearray(struct.pack("i", f.id)))
                    packet.extend(bytearray(struct.pack("f", width)))
                    packet.extend(bytearray(struct.pack("f", height)))
                    packet.extend(bytearray(struct.pack("f", f.eye_blink[0])))
                    packet.extend(bytearray(struct.pack("f", f.eye_blink[1])))
                    packet.extend(bytearray(struct.pack("B", 1 if f.success else 0)))
                    packet.extend(bytearray(struct.pack("f", f.pnp_error)))
                    packet.extend(bytearray(struct.pack("f", f.quaternion[0])))
                    packet.extend(bytearray(struct.pack("f", f.quaternion[1])))
                    packet.extend(bytearray(struct.pack("f", f.quaternion[2])))
                    packet.extend(bytearray(struct.pack("f", f.quaternion[3])))
                    packet.extend(bytearray(struct.pack("f", f.euler[0])))
                    packet.extend(bytearray(struct.pack("f", f.euler[1])))
                    packet.extend(bytearray(struct.pack("f", f.euler[2])))
                    packet.extend(bytearray(struct.pack("f", f.translation[0])))
                    packet.extend(bytearray(struct.pack("f", f.translation[1])))
                    packet.extend(bytearray(struct.pack("f", f.translation[2])))

                    # Six landmarks per eye, in (y, x) order: 36-41 right, 42-47 left.
                    r_eye = [[f.lms[i][1], f.lms[i][0]] for i in range(rStart, rEnd)]
                    l_eye = [[f.lms[i][1], f.lms[i][0]] for i in range(lStart, lEnd)]
                    l_eye_ratio = eye_aspect_ratio(l_eye)
                    r_eye_ratio = eye_aspect_ratio(r_eye)
                    ear = (l_eye_ratio + r_eye_ratio) / 2.0
                    ear_model = (f.eye_blink[0] + f.eye_blink[1]) / 2.0

                    if ear_model < 0.7:  # model eye-openness threshold (0.21 when using the geometric EAR instead)
                        COUNTER += 1
                        closeness = 1
                        output_closeness.append(closeness)
                    else:
                        if COUNTER >= consec_th:
                            TOTAL += 1
                            blink_start = current_frame - COUNTER
                            blink_end = current_frame - 1
                            blink_info = (blink_start, blink_end)
                            output_blinks.append(blink_info)
                        COUNTER = 0
                        closeness = 0
                        output_closeness.append(closeness)
                    
                    frame_info = {
                        'frame_no': current_frame,
                        'face_detected': 1,
                        'face_coordinates': 0,
                        'left_eye_coor': 0,
                        'right_eye_coor': 0,
                        'left_ear': l_eye_ratio,
                        'right_ear': r_eye_ratio,
                        'avg_ear': ear,
                        'avg_ear_model': ear_model,
                        'closeness': closeness,
                        'blink_no': TOTAL,
                        'blink_start_frame': blink_start,
                        'blink_end_frame': blink_end,
                        'reserved_for_calibration': False
                    }
                    frame_info_list.append(frame_info)
                    processed_frame.append(frame)
                    current_frame += 1
                    # frame_info_df = pd.DataFrame(frame_info_list) # debug
                    
                    framecount += 1
                    if log is not None:
                        log.write(f"{framecount},{now},{width},{height},{args.fps},{face_num},{f.id},{f.eye_blink[0]},{f.eye_blink[1]},{f.conf},{f.success},{f.pnp_error},{f.quaternion[0]},{f.quaternion[1]},{f.quaternion[2]},{f.quaternion[3]},{f.euler[0]},{f.euler[1]},{f.euler[2]},{f.rotation[0]},{f.rotation[1]},{f.rotation[2]},{f.translation[0]},{f.translation[1]},{f.translation[2]}")
                    for (x,y,c) in f.lms:
                        packet.extend(bytearray(struct.pack("f", c)))
                    if args.visualize > 1:
                        frame = cv2.putText(frame, str(f.id), (int(f.bbox[0]), int(f.bbox[1])), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255,0,0))
                        frame = cv2.putText(frame, "FPS : %0.1f" % fps, (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.55, (0,255,0), 1, cv2.LINE_AA)

                    if args.visualize > 2:
                        frame = cv2.putText(frame, f"{f.conf:.4f}", (int(f.bbox[0] + 18), int(f.bbox[1] - 6)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255))
                    
                    # cv2.imwrite('frame.jpg', frame)
                    for pt_num, (x,y,c) in enumerate(f.lms):
                        packet.extend(bytearray(struct.pack("f", y)))
                        packet.extend(bytearray(struct.pack("f", x)))
                        if log is not None:
                            log.write(f",{y},{x},{c}")
                        if pt_num == 66 and (f.eye_blink[0] < 0.30 or c < 0.30):
                            continue
                        if pt_num == 67 and (f.eye_blink[1] < 0.30 or c < 0.30):
                            continue
                        x = int(x + 0.5)
                        y = int(y + 0.5)
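                        # Draw the landmark as a 2x2 block of pixels, re-checking
                        # image bounds after each one-pixel offset.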
                        if args.visualize != 0 or out is not None:
                            if args.visualize > 3:
                                frame = cv2.putText(frame, str(pt_num), (int(y), int(x)), cv2.FONT_HERSHEY_SIMPLEX, 0.25, (255,255,0))
                            color = (0, 255, 0)
                            if pt_num >= 66:
                                color = (255, 255, 0)
                            if not (x < 0 or y < 0 or x >= height or y >= width):
                                frame[int(x), int(y)] = color
                            x += 1
                            if not (x < 0 or y < 0 or x >= height or y >= width):
                                frame[int(x), int(y)] = color
                            y += 1
                            if not (x < 0 or y < 0 or x >= height or y >= width):
                                frame[int(x), int(y)] = color
                            x -= 1
                            if not (x < 0 or y < 0 or x >= height or y >= width):
                                frame[int(x), int(y)] = color
                    if args.pnp_points != 0 and (args.visualize != 0 or out is not None) and f.rotation is not None:
                        if args.pnp_points > 1:
                            projected = cv2.projectPoints(f.face_3d[0:66], f.rotation, f.translation, tracker.camera, tracker.dist_coeffs)
                        else:
                            projected = cv2.projectPoints(f.contour, f.rotation, f.translation, tracker.camera, tracker.dist_coeffs)
                        for [(x,y)] in projected[0]:
                            x = int(x + 0.5)
                            y = int(y + 0.5)
                            if not (x < 0 or y < 0 or x >= height or y >= width):
                                frame[int(x), int(y)] = (0, 255, 255)
                            x += 1
                            if not (x < 0 or y < 0 or x >= height or y >= width):
                                frame[int(x), int(y)] = (0, 255, 255)
                            y += 1
                            if not (x < 0 or y < 0 or x >= height or y >= width):
                                frame[int(x), int(y)] = (0, 255, 255)
                            x -= 1
                            if not (x < 0 or y < 0 or x >= height or y >= width):
                                frame[int(x), int(y)] = (0, 255, 255)
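                    # 3D points are sent with Y and Z negated, presumably to match
                    # the coordinate convention expected by the receiving application.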
                    for (x,y,z) in f.pts_3d:
                        packet.extend(bytearray(struct.pack("f", x)))
                        packet.extend(bytearray(struct.pack("f", -y)))
                        packet.extend(bytearray(struct.pack("f", -z)))
                        if log is not None:
                            log.write(f",{x},{-y},{-z}")
                    if f.current_features is None:
                        f.current_features = {}

                    for feature in features:
                        if feature not in f.current_features:
                            f.current_features[feature] = 0
                        packet.extend(bytearray(struct.pack("f", f.current_features[feature])))
                        if log is not None:
                            log.write(f",{f.current_features[feature]}")
                    if log is not None:
                        log.write("\r\n")
                        log.flush()
                

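                # One UDP datagram carries every face detected in this frame; with
                # 40 or more faces the send is skipped (see the warning above),
                # likely because it would exceed a safe UDP datagram size.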
                if detected and len(faces) < 40:
                    sock.sendto(packet, (target_ip, target_port))

                if out is not None:
                    video_frame = frame
                    if args.video_scale != 1:
                        video_frame = cv2.resize(frame, (width * args.video_scale, height * args.video_scale), interpolation=cv2.INTER_NEAREST)
                    out.write(video_frame)
                    if args.video_scale != 1:
                        del video_frame

                failures = 0
            except Exception as e:
                if isinstance(e, KeyboardInterrupt):
                    if args.silent == 0:
                        print("Quitting")
                    break
                traceback.print_exc()
                failures += 1
                if failures > 30:
                    break


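            # Pace the loop to the target frame duration: garbage-collect once
            # while waiting, then sleep off the remaining frame budget.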
            collected = False
            del frame

            duration = time.perf_counter() - frame_time
            while duration < target_duration:
                if not collected:
                    gc.collect()
                    collected = True
                duration = time.perf_counter() - frame_time
                sleep_time = target_duration - duration
                if sleep_time > 0:
                    time.sleep(sleep_time)
                duration = time.perf_counter() - frame_time
            frame_time = time.perf_counter()
            
    except KeyboardInterrupt:
        if args.silent == 0:
            print("Quitting")

    input_reader.close()
    if out is not None:
        out.release()
    cv2.destroyAllWindows()

    if args.silent == 0 and tracking_frames > 0:
        average_tracking_time = 1000 * tracking_time / tracking_frames
        print(f"Average tracking time per detected face: {average_tracking_time:.2f} ms")
        print(f"Tracking time: {total_tracking_time:.3f} s\nFrames: {tracking_frames}")

    frame_info_df = pd.DataFrame(frame_info_list)
    frame_info_df['output_closeness'] = output_closeness

    file_name = os.path.basename(video_path)
    output_str = 'Finished processing {}.\n\n'.format(file_name)
    return frame_info_df, output_closeness, output_blinks, processed_frame, video_info_dict, output_str
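
# Example usage (hypothetical; assumes `args` has already been populated by the
# argument parser, and that the path may also be a numeric camera index string):
# frame_info_df, closeness, blinks, frames, video_info, msg = process_video("input.mp4")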