def main():

    args = get_args()

    image_size = FaceEmbeddingPredictor.IMG_SIZE

    # read images
    img1 = cv2.imread(args.input1.strip())
    img2 = cv2.imread(args.input2.strip())

    print("Image 1: {}".format(args.input1))
    print("Image 2: {}".format(args.input2))
    print("Verifying image 1 and 2")

    face_detector = RetinaFacePredictor(
        threshold=0.8,
        device=args.device.strip(),
        model=RetinaFacePredictor.get_model('resnet50'))

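    # Each detection is laid out as [x1, y1, x2, y2, score, lm1_x, lm1_y, ..., lm5_x, lm5_y];
    # only the first detected face in each image is used for verification.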
    det1 = face_detector(img1, rgb=False)[0].astype(int)
    bbox1 = det1[0:4]  # bounding box 1
    ldm1 = det1[5:].reshape((5, 2))  # landmarks 1

    det2 = face_detector(img2, rgb=False)[0].astype(int)
    bbox2 = det2[0:4]  # bounding box 2
    ldm2 = det2[5:].reshape((5, 2))  # landmarks 2

    if bool(args.align_face):
        # crop the faces and align them using landmarks
        print('Faces will be aligned')
        face1 = norm_crop(img1, ldm1, image_size=image_size[0])
        face2 = norm_crop(img2, ldm2, image_size=image_size[0])
    else:
        print('Faces will not be aligned')
        # crop the faces, but do not align them
        face1 = crop_face(img1, bbox1, extend=0.2, target_size=image_size)
        face2 = crop_face(img2, bbox2, extend=0.2, target_size=image_size)

    print('Loading embedding predictor with backbone: {}'.format(
        args.backbone))
    embedding_predictor = FaceEmbeddingPredictor(
        backbone=args.backbone,
        project_to_space=args.project_to_space,
        model_path=args.model_path,
        device=args.device)

    if args.project_to_space is not None:
        print("Projecting images into space: {} ".format(
            args.project_to_space))
    print('Extracting face embeddings ...')
    start_time = time.time()
    embeddings1 = embedding_predictor(face1,
                                      bgr=True,
                                      flip=bool(args.flip),
                                      normalize_embedding=True)
    embeddings2 = embedding_predictor(face2,
                                      bgr=True,
                                      flip=bool(args.flip),
                                      normalize_embedding=True)

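    # Squared Euclidean (L2) distance between the two normalised embeddings;
    # a smaller distance indicates more similar faces.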
    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff))

    end_time = time.time()
    print("Verification time: {:.3f} sec".format(end_time - start_time))

    if dist < args.threshold:
        print("Embedding distance ({:.2f}) < Threshold ({:.2f})".format(
            dist, args.threshold))
        print("Image 1 and 2 are from the same person\n")
    else:
        print("Embedding distance ({:.2f}) > Threshold ({:.2f})".format(
            dist, args.threshold))
        print("Image 1 and 2 are from different persons\n")
def main() -> None:
    # Parse command-line arguments
    parser = ArgumentParser()
    parser.add_argument('--input',
                        '-i',
                        help='Input video path or webcam index',
                        default=0)
    parser.add_argument('--output',
                        '-o',
                        help='Output file path',
                        default=None)
    parser.add_argument('--benchmark',
                        '-b',
                        help='Enable benchmark mode for CUDNN',
                        action='store_true',
                        default=False)
    parser.add_argument('--no-display',
                        '-n',
                        help='No display if processing a video file',
                        action='store_true',
                        default=False)

    parser.add_argument(
        '--detection-threshold',
        '-dt',
        type=float,
        default=0.8,
        help='Confidence threshold for face detection (default=0.8)')
    parser.add_argument(
        '--detection-method',
        '-dm',
        default='retinaface',
        help=
        'Face detection method, can be either RetinaFace or S3FD (default=RetinaFace)'
    )
    parser.add_argument(
        '--detection-weights',
        '-dw',
        default=None,
        help='Weights to be loaded for face detection, ' +
        'can be either resnet50 or mobilenet0.25 when using RetinaFace')
    parser.add_argument(
        '--detection-device',
        '-dd',
        default='cuda:0',
        help='Device to be used for face detection (default=cuda:0)')

    parser.add_argument(
        '--alignment-threshold',
        '-at',
        type=float,
        default=0.2,
        help=
        'Score threshold used when visualising detected landmarks (default=0.2)'
    )
    parser.add_argument('--alignment-method',
                        '-am',
                        default='fan',
                        help='Face alignment method, must be set to FAN')
    parser.add_argument(
        '--alignment-weights',
        '-aw',
        default=None,
        help=
        'Weights to be loaded for face alignment, can be either 2DFAN2 or 2DFAN4'
    )
    parser.add_argument(
        '--alignment-device',
        '-ad',
        default='cuda:0',
        help='Device to be used for face alignment (default=cuda:0)')

    parser.add_argument(
        '--tddfa-weights',
        '-tw',
        default=None,
        help='Weights to be loaded by 3DDFA, must be set to mobilenet1')
    parser.add_argument('--tddfa-device',
                        '-td',
                        default='cuda:0',
                        help='Device to be used by 3DDFA.')
    args = parser.parse_args()

    # Set benchmark mode flag for CUDNN
    torch.backends.cudnn.benchmark = args.benchmark

    vid = None
    out_vid = None
    has_window = False
    try:
        # Create the face detector
        args.detection_method = args.detection_method.lower()
        if args.detection_method == 'retinaface':
            face_detector = RetinaFacePredictor(
                threshold=args.detection_threshold,
                device=args.detection_device,
                model=(RetinaFacePredictor.get_model(args.detection_weights)
                       if args.detection_weights else None))
            print('Face detector created using RetinaFace.')
        elif args.detection_method == 's3fd':
            face_detector = S3FDPredictor(
                threshold=args.detection_threshold,
                device=args.detection_device,
                model=(S3FDPredictor.get_model(args.detection_weights)
                       if args.detection_weights else None))
            print('Face detector created using S3FD.')
        else:
            raise ValueError(
                'detector-method must be set to either RetinaFace or S3FD')

        # Create the landmark detector
        args.alignment_method = args.alignment_method.lower()
        if args.alignment_method == 'fan':
            landmark_detector = FANPredictor(
                device=args.alignment_device,
                model=(FANPredictor.get_model(args.alignment_weights)
                       if args.alignment_weights else None))
            print('Landmark detector created using FAN.')
        else:
            raise ValueError('alignment-method must be set to FAN')

        # Instantiate 3DDFA
        tddfa = TDDFAPredictor(
            device=args.tddfa_device,
            model=(TDDFAPredictor.get_model(args.tddfa_weights)
                   if args.tddfa_weights else None))
        print('3DDFA initialised.')

        # Open the input video
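        # (an input that does not exist as a file path is treated as a webcam index)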
        using_webcam = not os.path.exists(args.input)
        vid = cv2.VideoCapture(int(args.input) if using_webcam else args.input)
        assert vid.isOpened()
        if using_webcam:
            print(f'Webcam #{int(args.input)} opened.')
        else:
            print(f'Input video "{args.input}" opened.')

        # Open the output video (if a path is given)
        if args.output is not None:
            out_vid = cv2.VideoWriter(
                args.output,
                apiPreference=cv2.CAP_FFMPEG,
                fps=vid.get(cv2.CAP_PROP_FPS),
                frameSize=(int(vid.get(cv2.CAP_PROP_FRAME_WIDTH)),
                           int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))),
                fourcc=cv2.VideoWriter_fourcc('m', 'p', '4', 'v'))

        # Process the frames
        frame_number = 0
        window_title = os.path.splitext(os.path.basename(__file__))[0]
        print('Processing started, press \'Q\' to quit.')
        while True:
            # Get a new frame
            _, frame = vid.read()
            if frame is None:
                break
            else:
                # Detect faces
                start_time = time.time()
                faces = face_detector(frame, rgb=False)
                current_time = time.time()
                elapsed_time = current_time - start_time

                # Face alignment
                start_time = current_time
                landmarks, scores = landmark_detector(frame, faces, rgb=False)
                current_time = time.time()
                elapsed_time2 = current_time - start_time

                # Fit a 3D face model to each face with 3DDFA
                start_time = time.time()
                tddfa_results = TDDFAPredictor.decode(
                    tddfa(frame, landmarks, rgb=False, two_steps=True))
                elapsed_time3 = time.time() - start_time

                # Textual output
                print(
                    f'Frame #{frame_number} processed in {elapsed_time * 1000.0:.04f} + '
                    + f'{elapsed_time2 * 1000.0:.04f} + {elapsed_time3 * 1000.0:.04f} ms: '
                    + f'{len(faces)} faces analysed.')

                # Rendering
                for face, tddfa_result in zip(faces, tddfa_results):
                    bbox = face[:4].astype(int)
                    cv2.rectangle(frame, (bbox[0], bbox[1]),
                                  (bbox[2], bbox[3]),
                                  color=(0, 0, 255),
                                  thickness=2)
                    lm = tddfa.project_vertex(tddfa_result, False)
                    plot_landmarks(frame, lm[:, :2])
                    if len(face) > 5:
                        plot_landmarks(frame,
                                       face[5:].reshape((-1, 2)),
                                       pts_radius=3)

                # Write the frame to output video (if recording)
                if out_vid is not None:
                    out_vid.write(frame)

                # Display the frame
                if using_webcam or not args.no_display:
                    has_window = True
                    cv2.imshow(window_title, frame)
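                    # Keep only the low 16 bits of the key code returned by waitKey()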
                    key = cv2.waitKey(1) % 2**16
                    if key == ord('q') or key == ord('Q'):
                        print('\'Q\' pressed, we are done here.')
                        break
                frame_number += 1
    finally:
        if has_window:
            cv2.destroyAllWindows()
        if out_vid is not None:
            out_vid.release()
        if vid is not None:
            vid.release()
        print('All done.')
def main() -> None:
    # Parse command-line arguments
    parser = ArgumentParser()
    parser.add_argument('--input',
                        '-i',
                        help='Input video path or webcam index (default=0)',
                        default=0)
    parser.add_argument('--benchmark',
                        '-b',
                        help='Enable benchmark mode for CUDNN',
                        action='store_true',
                        default=False)
    parser.add_argument(
        '--weights',
        '-w',
        default=None,
        help='Weights to be loaded by 3DDFA, must be set to mobilenet1')
    parser.add_argument('--device',
                        '-d',
                        default='cuda:0',
                        help='Device to be used by all models (default=cuda:0)')

    parser.add_argument(
        '--alignment-weights',
        '-aw',
        default='2dfan2_alt',
        help=
        'Weights to be loaded for face alignment, can be either 2DFAN2, 2DFAN4, '
        + 'or 2DFAN2_ALT (default=2DFAN2_ALT)')
    parser.add_argument(
        '--alignment-alternative-pth',
        '-ap',
        default=None,
        help='Alternative pth file to be loaded for face alignment')
    parser.add_argument('--alignment-alternative-landmarks',
                        '-al',
                        default=None,
                        help='Alternative number of landmarks to detect')
    args = parser.parse_args()

    # Set benchmark mode flag for CUDNN
    torch.backends.cudnn.benchmark = args.benchmark

    vid = None
    try:
        # Create the face detector
        face_detector = RetinaFacePredictor(device=args.device)
        print('Face detector created.')

        # Create the landmark detector
        if args.alignment_weights is None:
            fa_model = FANPredictor.get_model()
        else:
            fa_model = FANPredictor.get_model(args.alignment_weights)
        if args.alignment_alternative_pth is not None:
            fa_model.weights = args.alignment_alternative_pth
        if args.alignment_alternative_landmarks is not None:
            fa_model.config.num_landmarks = int(
                args.alignment_alternative_landmarks)
        landmark_detector = FANPredictor(device=args.device, model=fa_model)
        print('Landmark detector created.')

        # Instantiate 3DDFA
        tddfa = TDDFAPredictor(device=args.device,
                               model=(TDDFAPredictor.get_model(args.weights)
                                      if args.weights else None))
        print('3DDFA initialised.')

        # Create the face pose augmentor
        augmentor = FacePoseAugmentor()
        print('Face pose augmentor created.')

        # Open the input video
        using_webcam = not os.path.exists(args.input)
        vid = cv2.VideoCapture(int(args.input) if using_webcam else args.input)
        assert vid.isOpened()
        if using_webcam:
            print(f'Webcam #{int(args.input)} opened.')
        else:
            print(f'Input video "{args.input}" opened.')

        # The main processing loop
        landmark_style_index = 4
        window_title = os.path.splitext(os.path.basename(__file__))[0]
        while True:
            frame, landmarks = face_detection_loop(vid, face_detector,
                                                   landmark_detector,
                                                   window_title)
            cv2.destroyAllWindows()
            if frame is None or landmarks is None:
                break
            else:
                landmark_style_index = face_pose_augmentation_loop(
                    tddfa, augmentor, frame, landmarks, landmark_style_index,
                    window_title)
                if landmark_style_index < 0:
                    break
    finally:
        cv2.destroyAllWindows()
        if vid is not None:
            vid.release()
        print('All done.')
def main() -> None:
    # Parse command-line arguments
    parser = ArgumentParser()
    parser.add_argument('--input',
                        '-i',
                        help='Input video path or webcam index (default=0)',
                        default=0)
    parser.add_argument('--output',
                        '-o',
                        help='Output file path',
                        default=None)
    parser.add_argument('--fourcc',
                        '-f',
                        help='FourCC of the output video (default=mp4v)',
                        type=str,
                        default='mp4v')
    parser.add_argument('--benchmark',
                        '-b',
                        help='Enable benchmark mode for CUDNN',
                        action='store_true',
                        default=False)
    parser.add_argument('--no-display',
                        '-n',
                        help='No display if processing a video file',
                        action='store_true',
                        default=False)
    parser.add_argument('--threshold',
                        '-t',
                        help='Confidence threshold (default=0.8)',
                        type=float,
                        default=0.8)
    parser.add_argument(
        '--method',
        '-m',
        help=
        'Method to use, can be either RetinaFace or S3FD (default=RetinaFace)',
        default='retinaface')
    parser.add_argument(
        '--weights',
        '-w',
        help=
        'Weights to load, can be either resnet50 or mobilenet0.25 when using RetinaFace',
        default=None)
    parser.add_argument('--device',
                        '-d',
                        help='Device to be used by the model (default=cuda:0)',
                        default='cuda:0')
    parser.add_argument(
        '--iou-threshold',
        '-iou',
        help='IOU threshold used by the simple face tracker (default=0.4)',
        type=float,
        default=0.4)
    parser.add_argument(
        '--minimum-face-size',
        '-min',
        help='Minimum face size used by the simple face tracker (default=0.0)',
        type=float,
        default=0.0)
    parser.add_argument('--head-pose-preference',
                        '-hp',
                        help='Head pose output preference (default=0)',
                        type=int,
                        default=0)
    args = parser.parse_args()

    # Set benchmark mode flag for CUDNN
    torch.backends.cudnn.benchmark = args.benchmark

    vid = None
    out_vid = None
    has_window = False
    try:
        # Create the face detector
        args.method = args.method.lower().strip()
        if args.method == 'retinaface':
            face_detector = RetinaFacePredictor(
                threshold=args.threshold,
                device=args.device,
                model=(RetinaFacePredictor.get_model(args.weights)
                       if args.weights else None))
            print('Face detector created using RetinaFace.')
        elif args.method == 's3fd':
            face_detector = S3FDPredictor(
                threshold=args.threshold,
                device=args.device,
                model=(S3FDPredictor.get_model(args.weights)
                       if args.weights else None))
            print('Face detector created using S3FD.')
        else:
            raise ValueError('method must be set to either RetinaFace or S3FD')

        # Create the simple face tracker
        face_tracker = SimpleFaceTracker(
            iou_threshold=args.iou_threshold,
            minimum_face_size=args.minimum_face_size)
        print('Simple face tracker created.')

        # Create the head pose estimator
        head_pose_estimator = HeadPoseEstimator()
        print('Head pose estimator created.')

        # Open the input video
        using_webcam = not os.path.exists(args.input)
        vid = cv2.VideoCapture(int(args.input) if using_webcam else args.input)
        assert vid.isOpened()
        if using_webcam:
            print(f'Webcam #{int(args.input)} opened.')
        else:
            print(f'Input video "{args.input}" opened.')

        # Open the output video (if a path is given)
        if args.output is not None:
            out_vid = cv2.VideoWriter(
                args.output,
                fps=vid.get(cv2.CAP_PROP_FPS),
                frameSize=(int(vid.get(cv2.CAP_PROP_FRAME_WIDTH)),
                           int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))),
                fourcc=cv2.VideoWriter_fourcc(*args.fourcc))
            assert out_vid.isOpened()

        # Process the frames
        frame_number = 0
        window_title = os.path.splitext(os.path.basename(__file__))[0]
        colours = [(0, 0, 255), (0, 255, 0), (255, 0, 0), (0, 255, 255),
                   (255, 0, 255), (255, 255, 0), (0, 128, 255), (128, 255, 0),
                   (255, 0, 128), (128, 0, 255), (0, 255, 128), (255, 128, 0)]
        print(
            'Processing started, press \'Q\' to quit or \'R\' to reset the tracker.'
        )
        while True:
            # Get a new frame
            _, frame = vid.read()
            if frame is None:
                break
            else:
                # Detect and track faces, also estimate head pose if landmarks are available
                start_time = time.time()
                faces = face_detector(frame, rgb=False)
                tids = face_tracker(faces)
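                # Detections with landmarks have at least 15 columns:
                # 4 bbox coordinates, 1 confidence score, and 5 landmark (x, y) pairs;
                # the pose estimator gets those 5 points plus the frame width and height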
                if faces.shape[1] >= 15:
                    head_poses = [
                        head_pose_estimator(
                            face[5:15].reshape((-1, 2)),
                            *frame.shape[1::-1],
                            output_preference=args.head_pose_preference)
                        for face in faces
                    ]
                else:
                    head_poses = [None] * len(faces)
                elapsed_time = time.time() - start_time

                # Textual output
                print(
                    f'Frame #{frame_number} processed in {elapsed_time * 1000.0:.04f} ms: '
                    + f'{len(faces)} faces detected.')

                # Rendering
                for face, tid, head_pose in zip(faces, tids, head_poses):
                    bbox = face[:4].astype(int)
                    if tid is None:
                        colour = (128, 128, 128)
                    else:
                        colour = colours[(tid - 1) % len(colours)]
                    cv2.rectangle(frame, (bbox[0], bbox[1]),
                                  (bbox[2], bbox[3]),
                                  color=colour,
                                  thickness=2)
                    if len(face) > 5:
                        for pts in face[5:].reshape((-1, 2)):
                            cv2.circle(frame, tuple(pts.astype(int).tolist()),
                                       3, colour, -1)
                    if tid is not None:
                        cv2.putText(frame,
                                    f'Face {tid}', (bbox[0], bbox[1] - 10),
                                    cv2.FONT_HERSHEY_DUPLEX,
                                    0.6,
                                    colour,
                                    lineType=cv2.LINE_AA)
                    if head_pose is not None:
                        pitch, yaw, roll = head_pose
                        cv2.putText(frame,
                                    f'Pitch: {pitch:.1f}',
                                    (bbox[2] + 5, bbox[1] + 10),
                                    cv2.FONT_HERSHEY_DUPLEX,
                                    0.5,
                                    colour,
                                    lineType=cv2.LINE_AA)
                        cv2.putText(frame,
                                    f'Yaw: {yaw:.1f}',
                                    (bbox[2] + 5, bbox[1] + 30),
                                    cv2.FONT_HERSHEY_DUPLEX,
                                    0.5,
                                    colour,
                                    lineType=cv2.LINE_AA)
                        cv2.putText(frame,
                                    f'Roll: {roll:.1f}',
                                    (bbox[2] + 5, bbox[1] + 50),
                                    cv2.FONT_HERSHEY_DUPLEX,
                                    0.5,
                                    colour,
                                    lineType=cv2.LINE_AA)

                # Write the frame to output video (if recording)
                if out_vid is not None:
                    out_vid.write(frame)

                # Display the frame
                if using_webcam or not args.no_display:
                    has_window = True
                    cv2.imshow(window_title, frame)
                    key = cv2.waitKey(1) % 2**16
                    if key == ord('q') or key == ord('Q'):
                        print('\'Q\' pressed, we are done here.')
                        break
                    elif key == ord('r') or key == ord('R'):
                        print('\'R\' pressed, reset the tracker.')
                        face_tracker.reset()
                frame_number += 1
    finally:
        if has_window:
            cv2.destroyAllWindows()
        if out_vid is not None:
            out_vid.release()
        if vid is not None:
            vid.release()
        print('All done.')
def main() -> None:
    parser = ArgumentParser()
    parser.add_argument('--video', '-v', help='Video source')
    parser.add_argument('--width',
                        '-x',
                        help='Width of the warped image (default=256)',
                        type=int,
                        default=256)
    parser.add_argument('--height',
                        '-y',
                        help='Height of the warped image (default=256)',
                        type=int,
                        default=256)
    parser.add_argument('--offset',
                        '-o',
                        help='Angular offset, only used when polar>0',
                        type=float,
                        default=0.0)
    parser.add_argument('--restore',
                        '-r',
                        help='Show restored frames',
                        action='store_true',
                        default=False)
    parser.add_argument('--compare',
                        '-c',
                        help='Compare with reference implementation',
                        action='store_true',
                        default=False)
    parser.add_argument('--compare-direct',
                        '-t',
                        help='Compare with directly warped frames',
                        action='store_true',
                        default=False)
    parser.add_argument('--square',
                        '-s',
                        help='Use square-shaped detection box',
                        action='store_true',
                        default=False)
    parser.add_argument(
        '--keep-aspect-ratio',
        '-k',
        help='Keep aspect ratio in tanh-polar or tanh-circular warping',
        action='store_true',
        default=False)
    parser.add_argument('--reverse',
                        '-i',
                        help='Perform computation in the reverse direction',
                        action='store_true',
                        default=False)
    parser.add_argument('--device',
                        '-d',
                        help='Device to be used (default=cuda:0)',
                        default='cuda:0')
    parser.add_argument('--benchmark',
                        '-b',
                        help='Enable benchmark mode for CUDNN',
                        action='store_true',
                        default=False)
    args = parser.parse_args()

    # Make the models run a bit faster
    torch.backends.cudnn.benchmark = args.benchmark

    # Create face detector
    detector = RetinaFacePredictor(
        device=args.device,
        model=RetinaFacePredictor.get_model('mobilenet0.25'))
    print('RetinaFace detector created using mobilenet0.25 backbone.')

    # Open webcam
    if os.path.exists(args.video):
        vid = cv2.VideoCapture(args.video)
        print('Video file opened: %s.' % args.video)
    else:
        vid = cv2.VideoCapture(int(args.video))
        print('Webcam #%d opened.' % int(args.video))

    # Detect objects in the frames
    try:
        frame_number = 0
        script_name = os.path.splitext(os.path.basename(__file__))[0]
        print('Face detection started, press \'Q\' to quit.')
        while True:
            _, frame = vid.read()
            if frame is None:
                break
            else:
                # Face detection
                face_boxes = detector(frame, rgb=False)
                if len(face_boxes) > 0:
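                    # Pick the largest detected face by bounding-box area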
                    biggest_face_idx = int(
                        np.argmax([(bbox[3] - bbox[1]) * (bbox[2] - bbox[0])
                                   for bbox in face_boxes]))

                    # Test the warping functions
                    start_time = time.time()
                    roi_tanh_polar_frame, roi_tanh_frame, restored_frame, diff_ref, diff_direct = test_pytorch_impl(
                        args.device, frame, face_boxes[biggest_face_idx],
                        args.width, args.height, args.offset / 180.0 * np.pi,
                        args.restore, args.compare, args.compare_direct,
                        args.square, args.keep_aspect_ratio, args.reverse)
                    elapsed_time = time.time() - start_time
                    print(
                        f'Frame #{frame_number}: Warped and processed in {elapsed_time * 1000.0:.1f} ms.'
                    )

                    # Rendering
                    for idx, bbox in enumerate(face_boxes):
                        if idx == biggest_face_idx:
                            border_colour = (0, 0, 255)
                        else:
                            border_colour = (128, 128, 128)
                        cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                                      (int(bbox[2]), int(bbox[3])),
                                      color=border_colour,
                                      thickness=2)
                else:
                    roi_tanh_polar_frame = None
                    roi_tanh_frame = None
                    restored_frame = None
                    diff_ref = None
                    diff_direct = None
                    print(f'Frame #{frame_number}: No face detected.')

                # Show the result
                cv2.imshow(script_name, frame)
                if args.reverse:
                    if roi_tanh_frame is None:
                        cv2.destroyWindow('ROI-Tanh')
                    else:
                        cv2.imshow('ROI-Tanh', roi_tanh_frame)
                    if roi_tanh_polar_frame is None:
                        cv2.destroyWindow('ROI-Tanh-Polar')
                    else:
                        cv2.imshow('ROI-Tanh-Polar', roi_tanh_polar_frame)
                else:
                    if roi_tanh_polar_frame is None:
                        cv2.destroyWindow('ROI-Tanh-Polar')
                    else:
                        cv2.imshow('ROI-Tanh-Polar', roi_tanh_polar_frame)
                    if roi_tanh_frame is None:
                        cv2.destroyWindow('ROI-Tanh')
                    else:
                        cv2.imshow('ROI-Tanh', roi_tanh_frame)
                if args.restore:
                    if restored_frame is None:
                        cv2.destroyWindow('Restored')
                    else:
                        cv2.imshow('Restored', restored_frame)
                if args.compare_direct:
                    if diff_direct is None:
                        cv2.destroyWindow('Diff-w-Direct')
                    else:
                        cv2.imshow('Diff-w-Direct', diff_direct)
                if args.compare:
                    if diff_ref is None:
                        cv2.destroyWindow('Diff-w-Ref')
                    else:
                        cv2.imshow('Diff-w-Ref', diff_ref)
                key = cv2.waitKey(1) % 2**16
                if key == ord('q') or key == ord('Q'):
                    print("\'Q\' pressed, we are done here.")
                    break
                else:
                    frame_number += 1
    finally:
        cv2.destroyAllWindows()
        vid.release()
        print('We are done here.')
def main():
    # Parse command-line arguments
    parser = ArgumentParser()
    parser.add_argument('--input',
                        '-i',
                        help='Input video path or webcam index',
                        default=0)
    parser.add_argument('--output',
                        '-o',
                        help='Output file path',
                        default=None)
    parser.add_argument('--benchmark',
                        '-b',
                        help='Enable benchmark mode for CUDNN',
                        action='store_true',
                        default=False)
    parser.add_argument('--no-display',
                        '-n',
                        help='No display if processing a video file',
                        action='store_true',
                        default=False)
    parser.add_argument('--threshold',
                        '-t',
                        help='Confidence threshold (default=0.8)',
                        type=float,
                        default=0.8)
    parser.add_argument(
        '--method',
        '-m',
        help=
        'Method to use, can be either RetinaFace or S3FD (default=RetinaFace)',
        default='retinaface')
    parser.add_argument(
        '--weights',
        '-w',
        help=
        'Weights to load, can be either resnet50 or mobilenet0.25 when using RetinaFace',
        default=None)
    parser.add_argument('--device',
                        '-d',
                        help='Device to be used by the model (default=cuda:0)',
                        default='cuda:0')
    args = parser.parse_args()

    # Set benchmark mode flag for CUDNN
    torch.backends.cudnn.benchmark = args.benchmark

    vid = None
    out_vid = None
    has_window = False
    try:
        # Create the face detector
        args.method = args.method.lower()
        if args.method == 'retinaface':
            face_detector = RetinaFacePredictor(
                threshold=args.threshold,
                device=args.device,
                model=(RetinaFacePredictor.get_model(args.weights)
                       if args.weights else None))
            print('Face detector created using RetinaFace.')
        elif args.method == 's3fd':
            face_detector = S3FDPredictor(
                threshold=args.threshold,
                device=args.device,
                model=(S3FDPredictor.get_model(args.weights)
                       if args.weights else None))
            print('Face detector created using S3FD.')
        else:
            raise ValueError('method must be set to either RetinaFace or S3FD')

        # Open the input video
        using_webcam = not os.path.exists(args.input)
        vid = cv2.VideoCapture(int(args.input) if using_webcam else args.input)
        assert vid.isOpened()
        if using_webcam:
            print(f'Webcam #{int(args.input)} opened.')
        else:
            print(f'Input video "{args.input}" opened.')

        # Open the output video (if a path is given)
        if args.output is not None:
            out_vid = cv2.VideoWriter(
                args.output,
                apiPreference=cv2.CAP_FFMPEG,
                fps=vid.get(cv2.CAP_PROP_FPS),
                frameSize=(int(vid.get(cv2.CAP_PROP_FRAME_WIDTH)),
                           int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))),
                fourcc=cv2.VideoWriter_fourcc('m', 'p', '4', 'v'))

        # Process the frames
        frame_number = 0
        window_title = os.path.splitext(os.path.basename(__file__))[0]
        print('Processing started, press \'Q\' to quit.')
        while True:
            # Get a new frame
            _, frame = vid.read()
            if frame is None:
                break
            else:
                # Detect faces
                start_time = time.time()
                faces = face_detector(frame, rgb=False)
                elapsed_time = time.time() - start_time

                # Textual output
                print(
                    f'Frame #{frame_number} processed in {elapsed_time * 1000.0:.04f} ms: '
                    + f'{len(faces)} faces detected.')

                # Rendering
                for face in faces:
                    bbox = face[:4].astype(int)
                    cv2.rectangle(frame, (bbox[0], bbox[1]),
                                  (bbox[2], bbox[3]),
                                  color=(0, 0, 255),
                                  thickness=2)
                    if len(face) > 5:
                        for pts in face[5:].reshape((-1, 2)):
                            cv2.circle(frame, tuple(pts.astype(int).tolist()),
                                       3, (0, 0, 255), -1)

                # Write the frame to output video (if recording)
                if out_vid is not None:
                    out_vid.write(frame)

                # Display the frame
                if using_webcam or not args.no_display:
                    has_window = True
                    cv2.imshow(window_title, frame)
                    key = cv2.waitKey(1) % 2**16
                    if key == ord('q') or key == ord('Q'):
                        print('\'Q\' pressed, we are done here.')
                        break
                frame_number += 1
    finally:
        if has_window:
            cv2.destroyAllWindows()
        if out_vid is not None:
            out_vid.release()
        if vid is not None:
            vid.release()
        print('All done.')
def main() -> None:
    # Parse command-line arguments
    parser = ArgumentParser()
    parser.add_argument('--input',
                        '-i',
                        help='Input video path or webcam index (default=0)',
                        default=0)
    parser.add_argument('--output',
                        '-o',
                        help='Output file path',
                        default=None)
    parser.add_argument('--fourcc',
                        '-f',
                        help='FourCC of the output video (default=mp4v)',
                        type=str,
                        default='mp4v')
    parser.add_argument('--benchmark',
                        '-b',
                        help='Enable benchmark mode for CUDNN',
                        action='store_true',
                        default=False)
    parser.add_argument('--no-display',
                        help='No display if processing a video file',
                        action='store_true',
                        default=False)
    parser.add_argument('--threshold',
                        '-t',
                        help='Detection threshold (default=0.8)',
                        type=float,
                        default=0.8)
    parser.add_argument(
        '--encoder',
        '-e',
        help=
        'Encoder to use, can be either rtnet50 or rtnet101 (default=rtnet50)',
        default='rtnet50')  # choices=['rtnet50', 'rtnet101', 'resnet50'])

    parser.add_argument(
        '--decoder',
        help=
        'Decoder to use, can be either fcn or deeplabv3plus (default=fcn)',
        default='fcn',
        choices=['fcn', 'deeplabv3plus'])
    parser.add_argument('-n',
                        '--num-classes',
                        help='Face parsing classes (default=11)',
                        type=int,
                        default=11)
    parser.add_argument('--max-num-faces',
                        help='Max number of faces (default=50)',
                        type=int,
                        default=50)
    parser.add_argument(
        '--weights',
        '-w',
        help=
        'Weights (checkpoint file) to be loaded by the RTNet face parser',
        default=None)
    parser.add_argument('--device',
                        '-d',
                        help='Device to be used by the model (default=cuda:0)',
                        default='cuda:0')
    args = parser.parse_args()

    # Set benchmark mode flag for CUDNN
    torch.backends.cudnn.benchmark = args.benchmark
    vid = None
    out_vid = None
    has_window = False
    face_detector = RetinaFacePredictor(
        threshold=args.threshold,
        device=args.device,
        model=(RetinaFacePredictor.get_model('mobilenet0.25')))
    face_parser = RTNetPredictor(device=args.device,
                                 ckpt=args.weights,
                                 encoder=args.encoder,
                                 decoder=args.decoder,
                                 num_classes=args.num_classes)

    colormap = label_colormap(args.num_classes)
    print('Face detector created using RetinaFace.')
    try:
        # Open the input video
        using_webcam = not os.path.exists(args.input)
        vid = cv2.VideoCapture(int(args.input) if using_webcam else args.input)
        assert vid.isOpened()
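        # Per-face overlay opacities: faces later in the detection list get a lighter overlay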
        alphas = np.linspace(0.75, 0.25, num=args.max_num_faces)
        if using_webcam:
            print(f'Webcam #{int(args.input)} opened.')
        else:
            print(f'Input video "{args.input}" opened.')

        # Open the output video (if a path is given)
        if args.output is not None:
            out_vid = cv2.VideoWriter(
                args.output,
                fps=vid.get(cv2.CAP_PROP_FPS),
                frameSize=(int(vid.get(cv2.CAP_PROP_FRAME_WIDTH)),
                           int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))),
                fourcc=cv2.VideoWriter_fourcc(*args.fourcc))
            assert out_vid.isOpened()

        # Process the frames
        frame_number = 0
        window_title = os.path.splitext(os.path.basename(__file__))[0]
        print('Processing started, press \'Q\' to quit.')
        while True:
            # Get a new frame
            _, frame = vid.read()
            if frame is None:
                break
            else:
                # Detect faces
                start_time = time.time()
                faces = face_detector(frame, rgb=False)
                elapsed_time = time.time() - start_time

                # Textual output
                print(
                    f'Frame #{frame_number} processed in {elapsed_time * 1000.0:.04f} ms: '
                    + f'{len(faces)} faces detected.')

                if len(faces) == 0:
                    continue
                # Parse faces
                start_time = time.time()
                masks = face_parser.predict_img(frame, faces, rgb=False)
                elapsed_time = time.time() - start_time

                # Textual output
                print(
                    f'Frame #{frame_number} processed in {elapsed_time * 1000.0:.04f} ms: '
                    + f'{len(masks)} faces parsed.')

                # Rendering
                dst = frame
                for i, (face, mask) in enumerate(zip(faces, masks)):
                    bbox = face[:4].astype(int)
                    cv2.rectangle(frame, (bbox[0], bbox[1]),
                                  (bbox[2], bbox[3]),
                                  color=(0, 0, 255),
                                  thickness=2)
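                    # Alpha-blend the colour-mapped parsing mask over the frame where the mask is non-zero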
                    alpha = alphas[i]
                    index = mask > 0
                    res = colormap[mask]
                    dst[index] = (1 - alpha) * frame[index].astype(float) + \
                        alpha * res[index].astype(float)
                dst = np.clip(dst.round(), 0, 255).astype(np.uint8)
                frame = dst
                # Write the frame to output video (if recording)
                if out_vid is not None:
                    out_vid.write(frame)

                # Display the frame
                if using_webcam or not args.no_display:
                    has_window = True
                    cv2.imshow(window_title, frame)
                    key = cv2.waitKey(1) % 2**16
                    if key == ord('q') or key == ord('Q'):
                        print('\'Q\' pressed, we are done here.')
                        break
                frame_number += 1
    finally:
        if has_window:
            cv2.destroyAllWindows()
        if out_vid is not None:
            out_vid.release()
        if vid is not None:
            vid.release()
        print('All done.')