Example 1
def pred_to_outputs(cfg, output, inp_shape, iou_th=0.4, score_th=0.02):
    """Decode raw model outputs into NMS-filtered detections (batch size 1)."""
    bbox_regressions, landm_regressions, classifications = output

    # only for batch size 1
    preds = tf.concat(  # [bboxes, landms, landms_valid, conf]
        [
            bbox_regressions[0], landm_regressions[0],
            tf.ones_like(classifications[0, :, 0][..., tf.newaxis]),
            classifications[0, :, 1][..., tf.newaxis]
        ], 1)
    priors = prior_box_tf((inp_shape[1], inp_shape[2]), cfg['min_sizes'],
                          cfg['steps'], cfg['clip'])
    decode_preds = decode_tf(preds, priors, cfg['variances'])

    selected_indices = tf.image.non_max_suppression(
        boxes=decode_preds[:, :4],
        scores=decode_preds[:, -1],
        max_output_size=tf.shape(decode_preds)[0],
        iou_threshold=iou_th,
        score_threshold=score_th)

    out = tf.gather(decode_preds, selected_indices)

    return out
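A sketch of how this helper might be called, assuming the raw-output (training-mode) model variant seen in the later examples; the config path and image path are placeholders, not from the original code:

cfg = load_yaml('./configs/retinaface_mbv2.yaml')    # path is an assumption
model = RetinaFaceModel(cfg, training=True)          # returns the raw 3-tuple

img = np.float32(cv2.imread('face.jpg'))             # sample image, assumption
img = cv2.resize(img, (cfg['input_size'], cfg['input_size']))
batch = img[np.newaxis, ...]                         # helper assumes batch size 1

dets = pred_to_outputs(cfg, model(batch), batch.shape)
# each row: [x1, y1, x2, y2, 10 landmark coords, landm_valid, conf]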
Example 2
def RetinaFaceModel(cfg, training=False, iou_th=0.4, score_th=0.02, name="RetinaFaceModel"):
    """Retina Face Model"""
    input_size = cfg["input_size"] if training else None
    wd = cfg["weights_decay"]
    out_ch = cfg["out_channel"]
    num_anchor = len(cfg["min_sizes"][0])
    backbone_type = cfg["backbone_type"]

    # define model
    x = inputs = Input([input_size, input_size, 3], name="input_image")

    x = Backbone(backbone_type=backbone_type)(x)

    fpn = FPN(out_ch=out_ch, wd=wd)(x)

    features = [SSH(out_ch=out_ch, wd=wd, name=f"SSH_{i}")(f) for i, f in enumerate(fpn)]

    bbox_regressions = tf.concat(
        [BboxHead(num_anchor, wd=wd, name=f"BboxHead_{i}")(f) for i, f in enumerate(features)], axis=1
    )
    landm_regressions = tf.concat(
        [LandmarkHead(num_anchor, wd=wd, name=f"LandmarkHead_{i}")(f) for i, f in enumerate(features)], axis=1
    )
    classifications = tf.concat(
        [ClassHead(num_anchor, wd=wd, name=f"ClassHead_{i}")(f) for i, f in enumerate(features)], axis=1
    )

    classifications = tf.keras.layers.Softmax(axis=-1)(classifications)

    if training:
        out = (bbox_regressions, landm_regressions, classifications)
    else:
        # only for batch size 1
        preds = tf.concat(  # [bboxes, landms, landms_valid, conf]
            [
                bbox_regressions[0],
                landm_regressions[0],
                tf.ones_like(classifications[0, :, 0][..., tf.newaxis]),
                classifications[0, :, 1][..., tf.newaxis],
            ],
            1,
        )
        priors = prior_box_tf(
            (tf.shape(inputs)[1], tf.shape(inputs)[2]), cfg["min_sizes"], cfg["steps"], cfg["clip"]
        )
        decode_preds = decode_tf(preds, priors, cfg["variances"])

        selected_indices = tf.image.non_max_suppression(
            boxes=decode_preds[:, :4],
            scores=decode_preds[:, -1],
            max_output_size=tf.shape(decode_preds)[0],
            iou_threshold=iou_th,
            score_threshold=score_th,
        )

        out = tf.gather(decode_preds, selected_indices)

    return Model(inputs, out, name=name)
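In inference mode (training=False) the returned Model maps a single preprocessed image directly to NMS-filtered detections. A minimal usage sketch assembled from Examples 3 and 6 below (the config path is an assumption; the checkpoint layout follows those examples):

cfg = load_yaml('./configs/retinaface_mbv2.yaml')   # path is an assumption
model = RetinaFaceModel(cfg, training=False, iou_th=0.4, score_th=0.02)
tf.train.Checkpoint(model=model).restore(
    tf.train.latest_checkpoint('./checkpoints/' + cfg['sub_name']))

img = np.float32(cv2.imread('face.jpg'))            # sample image, assumption
img, pad_params = pad_input_image(img, max_steps=max(cfg['steps']))
outputs = model(img[np.newaxis, ...]).numpy()       # rows: [bbox, landms, valid, conf]
outputs = recover_pad_output(outputs, pad_params)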
Example 3
def main(_argv):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)
    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    model = RetinaFaceModel(cfg,
                            training=False,
                            iou_th=FLAGS.iou_th,
                            score_th=FLAGS.score_th)

    # load model from weights.h5
    # model.load_weights('./model/mbv2_weights.h5', by_name=True, skip_mismatch=True)

    # load checkpoint
    checkpoint_dir = './checkpoints/' + cfg['sub_name']
    checkpoint = tf.train.Checkpoint(model=model)
    if tf.train.latest_checkpoint(checkpoint_dir):
        checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
        print("[*] load ckpt from {}.".format(
            tf.train.latest_checkpoint(checkpoint_dir)))
    else:
        print("[*] Cannot find ckpt from {}.".format(checkpoint_dir))
        exit()

    if not FLAGS.webcam:
        file_path = '/Users/lichaochao/Downloads/images_UMU/'
        for file_name in os.listdir(file_path + 'source_images/'):
            image_path = file_path + 'source_images/' + file_name
            if not os.path.exists(image_path):
                print(f"cannot find image path from {image_path}")
                continue

            img_raw = cv2.imread(image_path)
            img = np.float32(img_raw.copy())

            # img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            # pad input image to avoid unmatched shape problem
            img, pad_params = pad_input_image(img, max_steps=max(cfg['steps']))
            img_height, img_width, _ch = img.shape

            # run model
            outputs = model(img[np.newaxis, ...])

            preds = tf.concat([
                outputs[0][0], outputs[1][0, :, 1][..., tf.newaxis],
                outputs[2][0, :, 1][..., tf.newaxis]
            ], -1)

            priors = prior_box_tf((img_height, img_width), cfg['min_sizes'],
                                  cfg['steps'], cfg['clip'])
            decode_preds = decode_tf(preds, priors, cfg['variances'])

            selected_indices = tf.image.non_max_suppression(
                boxes=decode_preds[:, :4],
                scores=decode_preds[:, -1],
                max_output_size=tf.shape(decode_preds)[0],
                iou_threshold=FLAGS.iou_th,
                score_threshold=FLAGS.score_th)

            outputs = tf.gather(decode_preds, selected_indices).numpy()

            # recover padding effect
            outputs = recover_pad_output(outputs, pad_params)
            has_face = False
            is_smile = False
            for prior_index in range(len(outputs)):
                ann = outputs[prior_index]
                if ann[-1] >= 0.5:
                    has_face = True
                    x1, y1 = int(ann[0] * img_width), int(ann[1] * img_height)
                    x2, y2 = int(ann[2] * img_width), int(ann[3] * img_height)

                    text = "face: {:.2f}".format(ann[-1] * 100)
                    cv2.putText(img, text, (x1 + 5, y1 - 10),
                                cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255))

                    if ann[-2] >= 0.5:
                        is_smile = True
                        smile_text = "smile: {:.2f}".format(ann[-2] * 100)
                        cv2.putText(img, smile_text, (x1 + 5, y1 + 30),
                                    cv2.FONT_HERSHEY_DUPLEX, 0.5,
                                    (255, 255, 255))
                        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2)
                    else:
                        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
            if is_smile:
                dst_file_path = file_path + 'smile_face/' + file_name
            elif has_face:
                dst_file_path = file_path + 'face/' + file_name
            else:
                dst_file_path = file_path + 'no_face/' + file_name
            cv2.imwrite(dst_file_path, img)
            print(dst_file_path)

    else:
        cam = cv2.VideoCapture('./data/linda_umu.mp4')
        # cam.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
        # cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
        resize = FLAGS.down_scale_factor
        frame_height = cam.get(cv2.CAP_PROP_FRAME_HEIGHT) * resize
        frame_width = cam.get(cv2.CAP_PROP_FRAME_WIDTH) * resize

        max_steps = max(cfg['steps'])
        img_pad_h = max_steps - frame_height % max_steps if frame_height % max_steps > 0 else 0
        img_pad_w = max_steps - frame_width % max_steps if frame_width % max_steps > 0 else 0
        priors = prior_box_tf(
            (frame_height + img_pad_h, frame_width + img_pad_w),
            cfg['min_sizes'], cfg['steps'], cfg['clip'])

        frame_index = 0
        outputs = []
        start_time = time.time()
        while cam.isOpened():
            _, frame = cam.read()
            if frame is None:
                print('no cam')
                break
            if frame_index < 5:
                frame_index += 1
                # continue
            else:
                frame_index = 0

                img = np.float32(frame.copy())
                if resize < 1:
                    img = cv2.resize(img, (0, 0),
                                     fx=resize,
                                     fy=resize,
                                     interpolation=cv2.INTER_LINEAR)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

                # pad input image to avoid unmatched shape problem
                img, pad_params = pad_input_image(img, max_steps=max_steps)

                # run model
                outputs = model(img[np.newaxis, ...])

                preds = tf.concat([
                    outputs[0][0], outputs[1][0, :, 1][..., tf.newaxis],
                    outputs[2][0, :, 1][..., tf.newaxis]
                ], -1)

                decode_preds = decode_tf(preds, priors, cfg['variances'])

                selected_indices = tf.image.non_max_suppression(
                    boxes=decode_preds[:, :4],
                    scores=decode_preds[:, -1],
                    max_output_size=tf.shape(decode_preds)[0],
                    iou_threshold=FLAGS.iou_th,
                    score_threshold=FLAGS.score_th)

                outputs = tf.gather(decode_preds, selected_indices).numpy()

                # recover padding effect
                outputs = recover_pad_output(outputs,
                                             pad_params,
                                             resize=resize)

                # calculate fps
                fps_str = "FPS: %.2f" % (1 / (time.time() - start_time))
                start_time = time.time()
                cv2.putText(frame, fps_str, (25, 50), cv2.FONT_HERSHEY_DUPLEX,
                            0.75, (0, 0, 255), 2)

            # draw results
            for prior_index in range(len(outputs)):
                draw_bbox_landm(frame, outputs[prior_index], frame_height,
                                frame_width)

            # calculate fps
            # fps_str = "FPS: %.2f" % (1 / (time.time() - start_time))
            # start_time = time.time()
            # cv2.putText(frame, fps_str, (25, 25),
            #             cv2.FONT_HERSHEY_DUPLEX, 0.75, (0, 255, 0), 2)

            # show frame
            cv2.imshow('frame', frame)
            if cv2.waitKey(1) == ord('q'):
                exit()
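pad_input_image and recover_pad_output are repo helpers used throughout these examples; a minimal sketch of the behaviour the calls above rely on, assuming bottom/right zero-padding to the next multiple of max_steps and normalized box coordinates (the _sketch names are hypothetical):

def pad_input_image_sketch(img, max_steps):
    # Pad bottom/right so height and width become multiples of max_steps.
    img_h, img_w, _ = img.shape
    pad_h = (max_steps - img_h % max_steps) % max_steps
    pad_w = (max_steps - img_w % max_steps) % max_steps
    img = cv2.copyMakeBorder(img, 0, pad_h, 0, pad_w,
                             cv2.BORDER_CONSTANT, value=0)
    return img, (img_h, img_w, pad_h, pad_w)

def recover_pad_output_sketch(outputs, pad_params):
    # Rescale normalized coords from the padded frame back to the original.
    img_h, img_w, pad_h, pad_w = pad_params
    scale = [(img_w + pad_w) / img_w, (img_h + pad_h) / img_h] * 2
    outputs = outputs.copy()
    outputs[:, :4] *= np.array(scale, dtype=outputs.dtype)  # box columns only
    return outputs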
Example 4
# snippet opens mid-call; head restored from the identical call in Example 7
train_dataset = load_tfrecord_dataset(
    tfrecord_name, batch_size, img_dim=img_dim, using_bin=using_bin,
    using_flip=True, using_distort=False, using_encoding=using_encoding,
    priors=priors, match_thresh=match_thresh, ignore_thresh=ignore_thresh,
    variances=variances, shuffle=False)

start_time = time.time()
for idx, (inputs, labels) in enumerate(train_dataset.take(num_samples)):
    print("{} inputs:".format(idx), inputs.shape, "labels:", labels.shape)

    if not visualization:
        continue

    img = np.clip(inputs.numpy()[0], 0, 255).astype(np.uint8)
    if not using_encoding:
        # labels includes loc, landm, landm_valid.
        targets = labels.numpy()[0]
        for target in targets:
            draw_bbox_landm(img, target, img_dim, img_dim)
    else:
        # labels includes loc, landm, landm_valid, conf.
        targets = decode_tf(labels[0], priors, variances=variances).numpy()
        for prior_index in range(len(targets)):
            if targets[prior_index][-1] == 1:
                draw_bbox_landm(img, targets[prior_index], img_dim, img_dim)
                draw_anchor(img, priors[prior_index], img_dim, img_dim)

    cv2.imshow('img', cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
    if cv2.waitKey(0) == ord('q'):
        exit()

print("data fps: {:.2f}".format(num_samples / (time.time() - start_time)))
Example 5
def main(_):

    min_sizes = [[16, 32], [64, 128], [256, 512]]
    steps = [8, 16, 32]
    clip = False

    img_dim = 640
    priors = prior_box((img_dim, img_dim), min_sizes, steps, clip)

    variances = [0.1, 0.2]
    match_thresh = 0.45
    ignore_thresh = 0.3
    batch_size = 1
    shuffle = True
    using_flip = True
    using_distort = True
    using_bin = True
    buffer_size = 4000
    number_cycles = 2
    threads = 2

    check_dataset = load_tfrecord_dataset(dataset_root=FLAGS.dataset_path,
                                          split=FLAGS.split,
                                          threads=threads,
                                          number_cycles=number_cycles,
                                          batch_size=batch_size,
                                          hvd=[],
                                          img_dim=img_dim,
                                          using_bin=using_bin,
                                          using_flip=using_flip,
                                          using_distort=using_distort,
                                          using_encoding=FLAGS.using_encoding,
                                          priors=priors,
                                          match_thresh=match_thresh,
                                          ignore_thresh=ignore_thresh,
                                          variances=variances,
                                          shuffle=shuffle,
                                          buffer_size=buffer_size)

    for idx, (inputs, labels, _) in enumerate(check_dataset):
        print("{} inputs:".format(idx), inputs.shape, "labels:", labels.shape)

        if not FLAGS.visualization:
            continue

        img = np.clip(inputs.numpy()[0], 0, 255).astype(np.uint8)
        if not FLAGS.using_encoding:
            # labels includes loc, landm, landm_valid.
            targets = labels.numpy()[0]
            for target in targets:
                draw_bbox_landm(img, target, img_dim, img_dim)
        else:
            # labels includes loc, landm, landm_valid, conf.
            targets = decode_tf(labels[0], priors, variances=variances).numpy()
            for prior_index in range(len(targets)):
                if targets[prior_index][-1] != 1:
                    continue

                draw_bbox_landm(img, targets[prior_index], img_dim, img_dim)
                draw_anchor(img, priors[prior_index], img_dim, img_dim)

        cv2.imwrite('{}/{}.png'.format(FLAGS.output_path, str(idx)),
                    img[:, :, ::-1])
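decode_tf follows the standard SSD-style decoding with the [0.1, 0.2] variances used above; a minimal sketch of the box part, assuming priors come as (cx, cy, w, h) — the landmark columns decode the same way, point by point:

def decode_boxes_sketch(loc, priors, variances):
    # Offsets -> centers/sizes via the two variances, then corner coordinates.
    centers = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
    sides = priors[:, 2:] * tf.exp(loc[:, 2:] * variances[1])
    return tf.concat([centers - sides / 2, centers + sides / 2], axis=1)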
Example 6
def main(_argv):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)
    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    model = RetinaFaceModel(cfg,
                            training=False,
                            iou_th=FLAGS.iou_th,
                            score_th=FLAGS.score_th)

    # load model from weights.h5
    # model.load_weights('./model/mbv2_weights.h5', by_name=True, skip_mismatch=True)

    # load checkpoint
    checkpoint_dir = './checkpoints/' + cfg['sub_name']
    checkpoint = tf.train.Checkpoint(model=model)
    if tf.train.latest_checkpoint(checkpoint_dir):
        checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
        print("[*] load ckpt from {}.".format(
            tf.train.latest_checkpoint(checkpoint_dir)))
    else:
        print("[*] Cannot find ckpt from {}.".format(checkpoint_dir))
        exit()

    if not FLAGS.webcam:
        if not os.path.exists(FLAGS.img_path):
            print(f"cannot find image path from {FLAGS.img_path}")
            exit()

        print("[*] Processing on single image {}".format(FLAGS.img_path))

        img_raw = cv2.imread(FLAGS.img_path)
        img = np.float32(img_raw.copy())

        # testing scale
        target_size = 320
        img_size_max = np.max(img.shape[0:2])
        resize = float(target_size) / float(img_size_max)
        img = cv2.resize(img,
                         None,
                         None,
                         fx=resize,
                         fy=resize,
                         interpolation=cv2.INTER_LINEAR)

        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # pad input image to avoid unmatched shape problem
        img, pad_params = pad_input_image(img, max_steps=max(cfg['steps']))

        # run model
        outputs = model(img[np.newaxis, ...]).numpy()

        # recover padding effect
        outputs = recover_pad_output(outputs, pad_params)

        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

        # draw and save results
        save_img_path = 'out_' + os.path.basename(FLAGS.img_path)
        for prior_index in range(len(outputs)):
            draw_bbox_landm(img, outputs[prior_index], target_size,
                            target_size)
        cv2.imwrite(save_img_path, img)
        print(f"[*] save result at {save_img_path}")

    else:
        cam = cv2.VideoCapture('./data/lichaochao.mp4')
        # cam = cv2.VideoCapture(0)
        frame_height = int(cam.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frame_width = int(cam.get(cv2.CAP_PROP_FRAME_WIDTH))

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # mp4-compatible codec
        fps = cam.get(cv2.CAP_PROP_FPS)
        out = cv2.VideoWriter('chaochao1.mp4',
                              fourcc,
                              fps=fps,
                              frameSize=(frame_width, frame_height))  # (w, h) order

        resize = FLAGS.down_scale_factor
        frame_height *= resize
        frame_width *= resize

        max_steps = max(cfg['steps'])
        img_pad_h = max_steps - frame_height % max_steps if frame_height % max_steps > 0 else 0
        img_pad_w = max_steps - frame_width % max_steps if frame_width % max_steps > 0 else 0
        priors = prior_box_tf(
            (frame_height + img_pad_h, frame_width + img_pad_w),
            cfg['min_sizes'], cfg['steps'], cfg['clip'])

        frame_index = 0
        outputs = []
        start_time = time.time()
        while cam.isOpened():
            _, frame = cam.read()
            if frame is None:
                print('no cam')
                break
            if frame_index < 5:
                frame_index += 1
                # continue
            else:
                frame_index = 0

                img = np.float32(frame.copy())
                if resize < 1:
                    img = cv2.resize(img, (0, 0),
                                     fx=resize,
                                     fy=resize,
                                     interpolation=cv2.INTER_LINEAR)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

                # pad input image to avoid unmatched shape problem
                img, pad_params = pad_input_image(img, max_steps=max_steps)

                # run model
                outputs = model(img[np.newaxis, ...])

                preds = tf.concat([
                    outputs[0][0], outputs[1][0, :, 1][..., tf.newaxis],
                    outputs[2][0, :, 1][..., tf.newaxis]
                ], -1)

                decode_preds = decode_tf(preds, priors, cfg['variances'])

                selected_indices = tf.image.non_max_suppression(
                    boxes=decode_preds[:, :4],
                    scores=decode_preds[:, -1],
                    max_output_size=tf.shape(decode_preds)[0],
                    iou_threshold=FLAGS.iou_th,
                    score_threshold=FLAGS.score_th)

                outputs = tf.gather(decode_preds, selected_indices).numpy()

                # recover padding effect
                outputs = recover_pad_output(outputs,
                                             pad_params,
                                             resize=resize)

                # calculate fps
                # fps_str = "FPS: %.2f" % (1 / (time.time() - start_time))
                # start_time = time.time()
                # cv2.putText(frame, fps_str, (25, 50),
                #             cv2.FONT_HERSHEY_DUPLEX, 0.75, (0, 0, 255), 2)

            # draw results
            for prior_index in range(len(outputs)):
                draw_bbox_landm(frame, outputs[prior_index], frame_height,
                                frame_width)

            # calculate fps
            # fps_str = "FPS: %.2f" % (1 / (time.time() - start_time))
            # start_time = time.time()
            # cv2.putText(frame, fps_str, (25, 25),
            #             cv2.FONT_HERSHEY_DUPLEX, 0.75, (0, 255, 0), 2)

            # show frame
            out.write(frame)
            cv2.imshow('frame', frame)
            if cv2.waitKey(1) == ord('q'):
                exit()
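prior_box_tf builds the anchor grid for the (padded) frame size passed in; a minimal NumPy sketch of the usual RetinaFace scheme, assuming one anchor per min_size per cell, normalized to [0, 1] (the _sketch name is hypothetical):

def prior_box_sketch(image_size, min_sizes, steps, clip=False):
    # One (cx, cy, w, h) anchor per cell and per min_size, in relative coords.
    img_h, img_w = image_size
    anchors = []
    for k, step in enumerate(steps):
        fm_h = int(np.ceil(img_h / step))
        fm_w = int(np.ceil(img_w / step))
        for i in range(fm_h):
            for j in range(fm_w):
                for min_size in min_sizes[k]:
                    anchors.append([(j + 0.5) * step / img_w,
                                    (i + 0.5) * step / img_h,
                                    min_size / img_w,
                                    min_size / img_h])
    anchors = np.asarray(anchors, dtype=np.float32)
    return np.clip(anchors, 0.0, 1.0) if clip else anchors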
Example 7
def RetinaFaceModel(cfg,
                    training=False,
                    iou_th=0.4,
                    score_th=0.02,
                    name='RetinaFaceModel'):
    """Retina Face Model"""
    input_size = cfg['input_size'] if training else None

    wd = cfg['weights_decay']
    out_ch = cfg['out_channel']
    num_anchor = len(cfg['min_sizes'][0])
    backbone_type = cfg['backbone_type']

    # define model
    x = inputs = Input([input_size, input_size, 3], name='input_image')

    x = Backbone(backbone_type=backbone_type)(x)

    fpn = FPN(out_ch=out_ch, wd=wd)(x)

    # each head gets a 256-channel output after the SSH context-module concat
    features = [
        SSH(out_ch=out_ch, wd=wd, name=f'SSH_{i}')(f)
        for i, f in enumerate(fpn)
    ]

    bbox_regressions = tf.concat(
        [BboxHead(num_anchor, wd=wd, name=f'BboxHead_{i}')(f)
         for i, f in enumerate(features)], axis=1)
    landm_regressions = tf.concat(
        [LandmarkHead(num_anchor, wd=wd, name=f'LandmarkHead_{i}')(f)
         for i, f in enumerate(features)], axis=1)
    classifications = tf.concat(
        [ClassHead(num_anchor, wd=wd, name=f'ClassHead_{i}')(f)
         for i, f in enumerate(features)], axis=1)

    classifications = tf.keras.layers.Softmax(axis=-1)(classifications)

    if training:
        out = (bbox_regressions, landm_regressions, classifications)
    else:
        # only for batch size 1
        preds = tf.concat(  # [bboxes, landms, landms_valid, conf]
            [
                bbox_regressions[0], landm_regressions[0],
                tf.ones_like(classifications[0, :, 0][..., tf.newaxis]),
                classifications[0, :, 1][..., tf.newaxis]
            ], 1)
        priors = prior_box_tf((tf.shape(inputs)[1], tf.shape(inputs)[2]),
                              cfg['min_sizes'], cfg['steps'], cfg['clip'])
        decode_preds = decode_tf(preds, priors, cfg['variances'])

        selected_indices = tf.image.non_max_suppression(
            boxes=decode_preds[:, :4],
            scores=decode_preds[:, -1],
            max_output_size=tf.shape(decode_preds)[0],
            iou_threshold=iou_th,
            score_threshold=score_th)

        out = tf.gather(decode_preds, selected_indices)

    return Model(inputs, out, name=name)


def main(_):
    min_sizes = [[16, 32], [64, 128], [256, 512]]
    steps = [8, 16, 32]
    clip = False

    img_dim = 640
    priors = prior_box((img_dim, img_dim), min_sizes, steps, clip)

    variances = [0.1, 0.2]
    match_thresh = 0.45
    ignore_thresh = 0.3
    num_samples = 100

    if FLAGS.using_encoding:
        assert FLAGS.batch_size == 1

    if FLAGS.using_bin:
        tfrecord_name = './data/widerface_train_bin.tfrecord'
    else:
        tfrecord_name = './data/widerface_train.tfrecord'

    train_dataset = load_tfrecord_dataset(tfrecord_name,
                                          FLAGS.batch_size,
                                          img_dim=640,
                                          using_bin=FLAGS.using_bin,
                                          using_flip=True,
                                          using_distort=False,
                                          using_encoding=FLAGS.using_encoding,
                                          priors=priors,
                                          match_thresh=match_thresh,
                                          ignore_thresh=ignore_thresh,
                                          variances=variances,
                                          shuffle=False)

    start_time = time.time()
    for idx, (inputs, labels) in enumerate(train_dataset.take(num_samples)):
        print("{} inputs:".format(idx), inputs.shape, "labels:", labels.shape)

        if not FLAGS.visualization:
            continue

        img = np.clip(inputs.numpy()[0], 0, 255).astype(np.uint8)
        if not FLAGS.using_encoding:
            # labels includes loc, landm, landm_valid.
            targets = labels.numpy()[0]
            for target in targets:
                draw_bbox_landm(img, target, img_dim, img_dim)
        else:
            # labels includes loc, landm, landm_valid, conf.
            targets = decode_tf(labels[0], priors, variances=variances).numpy()
            for prior_index in range(len(targets)):
                if targets[prior_index][-1] != 1:
                    continue

                draw_bbox_landm(img, targets[prior_index], img_dim, img_dim)
                draw_anchor(img, priors[prior_index], img_dim, img_dim)

        cv2.imshow('img', cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
        if cv2.waitKey(0) == ord('q'):
            exit()

    print("data fps: {:.2f}".format(num_samples / (time.time() - start_time)))