def pred_to_outputs(cfg, output, inp_shape, iou_th=0.4, score_th=0.02):
    """Decode raw head outputs into NMS-filtered detections (batch size 1)."""
    bbox_regressions, landm_regressions, classifications = output

    # only for batch size 1
    preds = tf.concat(  # [bboxes, landms, landms_valid, conf]
        [bbox_regressions[0], landm_regressions[0],
         tf.ones_like(classifications[0, :, 0][..., tf.newaxis]),
         classifications[0, :, 1][..., tf.newaxis]], 1)

    priors = prior_box_tf((inp_shape[1], inp_shape[2]), cfg['min_sizes'],
                          cfg['steps'], cfg['clip'])
    decode_preds = decode_tf(preds, priors, cfg['variances'])

    selected_indices = tf.image.non_max_suppression(
        boxes=decode_preds[:, :4],
        scores=decode_preds[:, -1],
        max_output_size=tf.shape(decode_preds)[0],
        iou_threshold=iou_th,
        score_threshold=score_th)

    out = tf.gather(decode_preds, selected_indices)

    return out
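
# A minimal usage sketch for pred_to_outputs (an illustration, not one of the
# repo's scripts): feed it the raw three-head output of a model built with
# training=True. `model` and `image` are hypothetical placeholders; `cfg` must
# carry the same keys read above.
def demo_pred_to_outputs(model, cfg, image):
    # image: float32 HxWx3, already padded to a multiple of max(cfg['steps'])
    batched = image[np.newaxis, ...]
    raw_output = model(batched)  # (bbox_regressions, landm_regressions, classifications)
    return pred_to_outputs(cfg, raw_output, batched.shape)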
def RetinaFaceModel(cfg, training=False, iou_th=0.4, score_th=0.02,
                    name="RetinaFaceModel"):
    """Retina Face Model"""
    input_size = cfg["input_size"] if training else None
    wd = cfg["weights_decay"]
    out_ch = cfg["out_channel"]
    num_anchor = len(cfg["min_sizes"][0])
    backbone_type = cfg["backbone_type"]

    # define model
    x = inputs = Input([input_size, input_size, 3], name="input_image")

    x = Backbone(backbone_type=backbone_type)(x)

    fpn = FPN(out_ch=out_ch, wd=wd)(x)

    features = [SSH(out_ch=out_ch, wd=wd, name=f"SSH_{i}")(f)
                for i, f in enumerate(fpn)]

    bbox_regressions = tf.concat(
        [BboxHead(num_anchor, wd=wd, name=f"BboxHead_{i}")(f)
         for i, f in enumerate(features)], axis=1)
    landm_regressions = tf.concat(
        [LandmarkHead(num_anchor, wd=wd, name=f"LandmarkHead_{i}")(f)
         for i, f in enumerate(features)], axis=1)
    classifications = tf.concat(
        [ClassHead(num_anchor, wd=wd, name=f"ClassHead_{i}")(f)
         for i, f in enumerate(features)], axis=1)

    classifications = tf.keras.layers.Softmax(axis=-1)(classifications)

    if training:
        out = (bbox_regressions, landm_regressions, classifications)
    else:
        # only for batch size 1
        preds = tf.concat(  # [bboxes, landms, landms_valid, conf]
            [bbox_regressions[0], landm_regressions[0],
             tf.ones_like(classifications[0, :, 0][..., tf.newaxis]),
             classifications[0, :, 1][..., tf.newaxis]], 1)
        priors = prior_box_tf((tf.shape(inputs)[1], tf.shape(inputs)[2]),
                              cfg["min_sizes"], cfg["steps"], cfg["clip"])
        decode_preds = decode_tf(preds, priors, cfg["variances"])
        selected_indices = tf.image.non_max_suppression(
            boxes=decode_preds[:, :4],
            scores=decode_preds[:, -1],
            max_output_size=tf.shape(decode_preds)[0],
            iou_threshold=iou_th,
            score_threshold=score_th)
        out = tf.gather(decode_preds, selected_indices)

    return Model(inputs, out, name=name)
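
# A minimal end-to-end inference sketch (an assumption, not one of the repo's
# entry points): build the graph in inference mode and run it on one padded
# image. `load_yaml`, `pad_input_image` and `recover_pad_output` are the
# helpers already used by the test scripts; checkpoint restoring is omitted.
def demo_retinaface_inference(cfg_path, image_bgr):
    cfg = load_yaml(cfg_path)
    model = RetinaFaceModel(cfg, training=False)
    img = np.float32(image_bgr)
    img, pad_params = pad_input_image(img, max_steps=max(cfg["steps"]))
    outputs = model(img[np.newaxis, ...]).numpy()  # already decoded + NMS'd
    return recover_pad_output(outputs, pad_params)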
def main(_argv):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)
    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    model = RetinaFaceModel(cfg, training=False, iou_th=FLAGS.iou_th,
                            score_th=FLAGS.score_th)

    # load model from weights.h5
    # model.load_weights('./model/mbv2_weights.h5', by_name=True,
    #                    skip_mismatch=True)

    # load checkpoint
    checkpoint_dir = './checkpoints/' + cfg['sub_name']
    checkpoint = tf.train.Checkpoint(model=model)
    if tf.train.latest_checkpoint(checkpoint_dir):
        checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
        print("[*] load ckpt from {}.".format(
            tf.train.latest_checkpoint(checkpoint_dir)))
    else:
        print("[*] Cannot find ckpt from {}.".format(checkpoint_dir))
        exit()

    if not FLAGS.webcam:
        file_path = '/Users/lichaochao/Downloads/images_UMU/'
        for file_name in os.listdir(file_path + 'source_images/'):
            image_path = file_path + 'source_images/' + file_name
            if not os.path.exists(image_path):
                print(f"cannot find image path from {image_path}")
                continue

            img_raw = cv2.imread(image_path)
            img = np.float32(img_raw.copy())
            # img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            # pad input image to avoid unmatched shape problem
            img, pad_params = pad_input_image(img, max_steps=max(cfg['steps']))
            img_height, img_width, _ch = img.shape

            # run model
            outputs = model(img[np.newaxis, ...])

            preds = tf.concat([
                outputs[0][0],
                outputs[1][0, :, 1][..., tf.newaxis],
                outputs[2][0, :, 1][..., tf.newaxis]], -1)

            priors = prior_box_tf((img_height, img_width), cfg['min_sizes'],
                                  cfg['steps'], cfg['clip'])
            decode_preds = decode_tf(preds, priors, cfg['variances'])

            selected_indices = tf.image.non_max_suppression(
                boxes=decode_preds[:, :4],
                scores=decode_preds[:, -1],
                max_output_size=tf.shape(decode_preds)[0],
                iou_threshold=FLAGS.iou_th,
                score_threshold=FLAGS.score_th)

            outputs = tf.gather(decode_preds, selected_indices).numpy()

            # recover padding effect
            outputs = recover_pad_output(outputs, pad_params)

            has_face = False
            is_smile = False
            for prior_index in range(len(outputs)):
                ann = outputs[prior_index]
                if ann[-1] >= 0.5:
                    has_face = True
                    x1, y1 = int(ann[0] * img_width), int(ann[1] * img_height)
                    x2, y2 = int(ann[2] * img_width), int(ann[3] * img_height)
                    text = "face: {:.2f}".format(ann[-1] * 100)
                    cv2.putText(img, text, (x1 + 5, y1 - 10),
                                cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255))

                    if ann[-2] >= 0.5:
                        is_smile = True
                        smile_text = "smile: {:.2f}".format(ann[-2] * 100)
                        cv2.putText(img, smile_text, (x1 + 5, y1 + 30),
                                    cv2.FONT_HERSHEY_DUPLEX, 0.5,
                                    (255, 255, 255))
                        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2)
                    else:
                        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)

            if is_smile:
                dst_file_path = file_path + 'smile_face/' + file_name
            elif has_face:
                dst_file_path = file_path + 'face/' + file_name
            else:
                dst_file_path = file_path + 'no_face/' + file_name
            cv2.imwrite(dst_file_path, img)
            print(dst_file_path)
    else:
        cam = cv2.VideoCapture('./data/linda_umu.mp4')
        # cam.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
        # cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

        resize = FLAGS.down_scale_factor
        frame_height = cam.get(cv2.CAP_PROP_FRAME_HEIGHT) * resize
        frame_width = cam.get(cv2.CAP_PROP_FRAME_WIDTH) * resize

        # priors can be computed once, since all frames share the same
        # (padded) shape
        max_steps = max(cfg['steps'])
        img_pad_h = (max_steps - frame_height % max_steps
                     if frame_height % max_steps > 0 else 0)
        img_pad_w = (max_steps - frame_width % max_steps
                     if frame_width % max_steps > 0 else 0)
        priors = prior_box_tf(
            (frame_height + img_pad_h, frame_width + img_pad_w),
            cfg['min_sizes'], cfg['steps'], cfg['clip'])

        frame_index = 0
        outputs = []
        start_time = time.time()
        while cam.isOpened():
            _, frame = cam.read()
            if frame is None:
                print('no cam')
                break

            # run the detector only every fifth frame; in between, the
            # previous outputs are redrawn
            if frame_index < 5:
                frame_index += 1
                # continue
            else:
                frame_index = 0
                img = np.float32(frame.copy())
                if resize < 1:
                    img = cv2.resize(img, (0, 0), fx=resize, fy=resize,
                                     interpolation=cv2.INTER_LINEAR)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

                # pad input image to avoid unmatched shape problem
                img, pad_params = pad_input_image(img, max_steps=max_steps)

                # run model
                outputs = model(img[np.newaxis, ...])

                preds = tf.concat([
                    outputs[0][0],
                    outputs[1][0, :, 1][..., tf.newaxis],
                    outputs[2][0, :, 1][..., tf.newaxis]], -1)
                decode_preds = decode_tf(preds, priors, cfg['variances'])

                selected_indices = tf.image.non_max_suppression(
                    boxes=decode_preds[:, :4],
                    scores=decode_preds[:, -1],
                    max_output_size=tf.shape(decode_preds)[0],
                    iou_threshold=FLAGS.iou_th,
                    score_threshold=FLAGS.score_th)

                outputs = tf.gather(decode_preds, selected_indices).numpy()

                # recover padding effect
                outputs = recover_pad_output(outputs, pad_params,
                                             resize=resize)

            # calculate fps
            fps_str = "FPS: %.2f" % (1 / (time.time() - start_time))
            start_time = time.time()
            cv2.putText(frame, fps_str, (25, 50),
                        cv2.FONT_HERSHEY_DUPLEX, 0.75, (0, 0, 255), 2)

            # draw results
            for prior_index in range(len(outputs)):
                draw_bbox_landm(frame, outputs[prior_index],
                                frame_height, frame_width)

            # show frame
            cv2.imshow('frame', frame)
            if cv2.waitKey(1) == ord('q'):
                exit()
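
# The pad/unpad pair used above keeps the input shape divisible by the largest
# FPN stride. A minimal sketch of that contract, assuming pad_input_image pads
# bottom/right with a constant border and records the original shape (the real
# helper lives in this repo's utils; this version is illustrative only):
def pad_input_image_sketch(img, max_steps):
    img_h, img_w, _ = img.shape
    img_pad_h = (max_steps - img_h % max_steps) % max_steps
    img_pad_w = (max_steps - img_w % max_steps) % max_steps
    padded = cv2.copyMakeBorder(img, 0, img_pad_h, 0, img_pad_w,
                                cv2.BORDER_CONSTANT)
    pad_params = (img_h, img_w, img_pad_h, img_pad_w)  # consumed by recover_pad_output
    return padded, pad_params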
# NOTE: the leading arguments of this call are assumed (reconstructed from the
# FLAGS-based variant of this script below); only the trailing keyword
# arguments were present in this snippet.
train_dataset = load_tfrecord_dataset(tfrecord_name, batch_size,
                                      img_dim=img_dim,
                                      using_bin=using_bin,
                                      using_flip=True,
                                      using_distort=False,
                                      using_encoding=using_encoding,
                                      priors=priors,
                                      match_thresh=match_thresh,
                                      ignore_thresh=ignore_thresh,
                                      variances=variances,
                                      shuffle=False)

start_time = time.time()
for idx, (inputs, labels) in enumerate(train_dataset.take(num_samples)):
    print("{} inputs:".format(idx), inputs.shape, "labels:", labels.shape)

    if not visualization:
        continue

    img = np.clip(inputs.numpy()[0], 0, 255).astype(np.uint8)
    if not using_encoding:
        # labels includes loc, landm, landm_valid.
        targets = labels.numpy()[0]
        for target in targets:
            draw_bbox_landm(img, target, img_dim, img_dim)
    else:
        # labels includes loc, landm, landm_valid, conf.
        targets = decode_tf(labels[0], priors, variances=variances).numpy()
        for prior_index in range(len(targets)):
            if targets[prior_index][-1] == 1:
                draw_bbox_landm(img, targets[prior_index], img_dim, img_dim)
                draw_anchor(img, priors[prior_index], img_dim, img_dim)

    cv2.imshow('img', cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
    if cv2.waitKey(0) == ord('q'):
        exit()

print("data fps: {:.2f}".format(num_samples / (time.time() - start_time)))
def main(_):
    min_sizes = [[16, 32], [64, 128], [256, 512]]
    steps = [8, 16, 32]
    clip = False
    img_dim = 640
    priors = prior_box((img_dim, img_dim), min_sizes, steps, clip)
    variances = [0.1, 0.2]
    match_thresh = 0.45
    ignore_thresh = 0.3
    batch_size = 1
    shuffle = True
    using_flip = True
    using_distort = True
    using_bin = True
    buffer_size = 4000
    number_cycles = 2
    threads = 2

    check_dataset = load_tfrecord_dataset(dataset_root=FLAGS.dataset_path,
                                          split=FLAGS.split,
                                          threads=threads,
                                          number_cycles=number_cycles,
                                          batch_size=batch_size,
                                          hvd=[],
                                          img_dim=img_dim,
                                          using_bin=using_bin,
                                          using_flip=using_flip,
                                          using_distort=using_distort,
                                          using_encoding=FLAGS.using_encoding,
                                          priors=priors,
                                          match_thresh=match_thresh,
                                          ignore_thresh=ignore_thresh,
                                          variances=variances,
                                          shuffle=shuffle,
                                          buffer_size=buffer_size)

    for idx, (inputs, labels, _) in enumerate(check_dataset):
        print("{} inputs:".format(idx), inputs.shape, "labels:", labels.shape)

        if not FLAGS.visualization:
            continue

        img = np.clip(inputs.numpy()[0], 0, 255).astype(np.uint8)
        if not FLAGS.using_encoding:
            # labels includes loc, landm, landm_valid.
            targets = labels.numpy()[0]
            for target in targets:
                draw_bbox_landm(img, target, img_dim, img_dim)
        else:
            # labels includes loc, landm, landm_valid, conf.
            targets = decode_tf(labels[0], priors, variances=variances).numpy()
            for prior_index in range(len(targets)):
                if targets[prior_index][-1] != 1:
                    continue
                draw_bbox_landm(img, targets[prior_index], img_dim, img_dim)
                draw_anchor(img, priors[prior_index], img_dim, img_dim)

        cv2.imwrite('{}/{}.png'.format(FLAGS.output_path, str(idx)),
                    img[:, :, ::-1])
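
# Quick sanity arithmetic for the anchor grid configured above (a sketch, not
# repo code): a 640x640 input with strides [8, 16, 32] yields 80x80, 40x40 and
# 20x20 feature maps, and two min_sizes per level mean two anchors per cell:
#   80*80*2 + 40*40*2 + 20*20*2 = 12800 + 3200 + 800 = 16800 priors.
def expected_num_priors(img_dim=640, steps=(8, 16, 32), anchors_per_cell=2):
    return sum((img_dim // s) ** 2 * anchors_per_cell for s in steps)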
def main(_argv):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)
    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    model = RetinaFaceModel(cfg, training=False, iou_th=FLAGS.iou_th,
                            score_th=FLAGS.score_th)

    # load model from weights.h5
    # model.load_weights('./model/mbv2_weights.h5', by_name=True,
    #                    skip_mismatch=True)

    # load checkpoint
    checkpoint_dir = './checkpoints/' + cfg['sub_name']
    checkpoint = tf.train.Checkpoint(model=model)
    if tf.train.latest_checkpoint(checkpoint_dir):
        checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
        print("[*] load ckpt from {}.".format(
            tf.train.latest_checkpoint(checkpoint_dir)))
    else:
        print("[*] Cannot find ckpt from {}.".format(checkpoint_dir))
        exit()

    if not FLAGS.webcam:
        if not os.path.exists(FLAGS.img_path):
            print(f"cannot find image path from {FLAGS.img_path}")
            exit()

        print("[*] Processing on single image {}".format(FLAGS.img_path))

        img_raw = cv2.imread(FLAGS.img_path)
        img = np.float32(img_raw.copy())

        # testing scale
        target_size = 320
        img_size_max = np.max(img.shape[0:2])
        resize = float(target_size) / float(img_size_max)
        img = cv2.resize(img, None, None, fx=resize, fy=resize,
                         interpolation=cv2.INTER_LINEAR)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # pad input image to avoid unmatched shape problem
        img, pad_params = pad_input_image(img, max_steps=max(cfg['steps']))

        # run model
        outputs = model(img[np.newaxis, ...]).numpy()

        # recover padding effect
        outputs = recover_pad_output(outputs, pad_params)

        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

        # draw and save results
        save_img_path = 'out_' + os.path.basename(FLAGS.img_path)
        for prior_index in range(len(outputs)):
            draw_bbox_landm(img, outputs[prior_index], target_size,
                            target_size)
        cv2.imwrite(save_img_path, img)
        print(f"[*] save result at {save_img_path}")
    else:
        cam = cv2.VideoCapture('./data/lichaochao.mp4')
        # cam = cv2.VideoCapture(0)
        frame_height = int(cam.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frame_width = int(cam.get(cv2.CAP_PROP_FRAME_WIDTH))
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        fps = cam.get(cv2.CAP_PROP_FPS)
        # VideoWriter expects frameSize as (width, height)
        out = cv2.VideoWriter('chaochao1.mp4', fourcc, fps=fps,
                              frameSize=(frame_width, frame_height))

        resize = FLAGS.down_scale_factor
        frame_height *= resize
        frame_width *= resize

        # priors can be computed once, since all frames share the same
        # (padded) shape
        max_steps = max(cfg['steps'])
        img_pad_h = (max_steps - frame_height % max_steps
                     if frame_height % max_steps > 0 else 0)
        img_pad_w = (max_steps - frame_width % max_steps
                     if frame_width % max_steps > 0 else 0)
        priors = prior_box_tf(
            (frame_height + img_pad_h, frame_width + img_pad_w),
            cfg['min_sizes'], cfg['steps'], cfg['clip'])

        frame_index = 0
        outputs = []
        start_time = time.time()
        while cam.isOpened():
            _, frame = cam.read()
            if frame is None:
                print('no cam')
                break

            # run the detector only every fifth frame; in between, the
            # previous outputs are redrawn
            if frame_index < 5:
                frame_index += 1
                # continue
            else:
                frame_index = 0
                img = np.float32(frame.copy())
                if resize < 1:
                    img = cv2.resize(img, (0, 0), fx=resize, fy=resize,
                                     interpolation=cv2.INTER_LINEAR)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

                # pad input image to avoid unmatched shape problem
                img, pad_params = pad_input_image(img, max_steps=max_steps)

                # run model
                outputs = model(img[np.newaxis, ...])

                preds = tf.concat([
                    outputs[0][0],
                    outputs[1][0, :, 1][..., tf.newaxis],
                    outputs[2][0, :, 1][..., tf.newaxis]], -1)
                decode_preds = decode_tf(preds, priors, cfg['variances'])

                selected_indices = tf.image.non_max_suppression(
                    boxes=decode_preds[:, :4],
                    scores=decode_preds[:, -1],
                    max_output_size=tf.shape(decode_preds)[0],
                    iou_threshold=FLAGS.iou_th,
                    score_threshold=FLAGS.score_th)

                outputs = tf.gather(decode_preds, selected_indices).numpy()

                # recover padding effect
                outputs = recover_pad_output(outputs, pad_params,
                                             resize=resize)

            # draw results
            for prior_index in range(len(outputs)):
                draw_bbox_landm(frame, outputs[prior_index],
                                frame_height, frame_width)

            # write and show frame
            out.write(frame)
            cv2.imshow('frame', frame)
            if cv2.waitKey(1) == ord('q'):
                exit()
def RetinaFaceModel(cfg, training=False, iou_th=0.4, score_th=0.02,
                    name='RetinaFaceModel'):
    """Retina Face Model"""
    input_size = cfg['input_size'] if training else None
    wd = cfg['weights_decay']
    out_ch = cfg['out_channel']
    num_anchor = len(cfg['min_sizes'][0])
    backbone_type = cfg['backbone_type']

    # define model
    x = inputs = Input([input_size, input_size, 3], name='input_image')

    x = Backbone(backbone_type=backbone_type)(x)

    fpn = FPN(out_ch=out_ch, wd=wd)(x)

    features = [
        # 256-channel outputs after the concat inside the context module (SSH)
        SSH(out_ch=out_ch, wd=wd, name=f'SSH_{i}')(f)
        for i, f in enumerate(fpn)
    ]

    bbox_regressions = tf.concat([
        BboxHead(num_anchor, wd=wd, name=f'BboxHead_{i}')(f)
        for i, f in enumerate(features)
    ], axis=1)
    landm_regressions = tf.concat([
        LandmarkHead(num_anchor, wd=wd, name=f'LandmarkHead_{i}')(f)
        for i, f in enumerate(features)
    ], axis=1)
    classifications = tf.concat([
        ClassHead(num_anchor, wd=wd, name=f'ClassHead_{i}')(f)
        for i, f in enumerate(features)
    ], axis=1)

    classifications = tf.keras.layers.Softmax(axis=-1)(classifications)

    if training:
        out = (bbox_regressions, landm_regressions, classifications)
    else:
        # only for batch size 1
        preds = tf.concat(  # [bboxes, landms, landms_valid, conf]
            [bbox_regressions[0], landm_regressions[0],
             tf.ones_like(classifications[0, :, 0][..., tf.newaxis]),
             classifications[0, :, 1][..., tf.newaxis]], 1)
        priors = prior_box_tf((tf.shape(inputs)[1], tf.shape(inputs)[2]),
                              cfg['min_sizes'], cfg['steps'], cfg['clip'])
        decode_preds = decode_tf(preds, priors, cfg['variances'])
        selected_indices = tf.image.non_max_suppression(
            boxes=decode_preds[:, :4],
            scores=decode_preds[:, -1],
            max_output_size=tf.shape(decode_preds)[0],
            iou_threshold=iou_th,
            score_threshold=score_th)
        out = tf.gather(decode_preds, selected_indices)

    return Model(inputs, out, name=name)
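
# decode_tf above applies the standard SSD-style decoding with the variances
# [0.1, 0.2] from the config. A minimal sketch of the box part, assuming
# priors are (cx, cy, w, h) in normalized coordinates (the landmark channels
# follow the same center-offset rule); illustrative, not the repo's helper:
def decode_bbox_sketch(pred_loc, priors, variances):
    centers = priors[:, :2] + pred_loc[:, :2] * variances[0] * priors[:, 2:]
    sides = priors[:, 2:] * tf.exp(pred_loc[:, 2:4] * variances[1])
    # convert center-size form back to corner form [x1, y1, x2, y2]
    return tf.concat([centers - sides / 2, centers + sides / 2], axis=1)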
def main(_):
    min_sizes = [[16, 32], [64, 128], [256, 512]]
    steps = [8, 16, 32]
    clip = False
    img_dim = 640
    priors = prior_box((img_dim, img_dim), min_sizes, steps, clip)
    variances = [0.1, 0.2]
    match_thresh = 0.45
    ignore_thresh = 0.3
    num_samples = 100

    if FLAGS.using_encoding:
        assert FLAGS.batch_size == 1

    if FLAGS.using_bin:
        tfrecord_name = './data/widerface_train_bin.tfrecord'
    else:
        tfrecord_name = './data/widerface_train.tfrecord'

    train_dataset = load_tfrecord_dataset(tfrecord_name, FLAGS.batch_size,
                                          img_dim=640,
                                          using_bin=FLAGS.using_bin,
                                          using_flip=True,
                                          using_distort=False,
                                          using_encoding=FLAGS.using_encoding,
                                          priors=priors,
                                          match_thresh=match_thresh,
                                          ignore_thresh=ignore_thresh,
                                          variances=variances,
                                          shuffle=False)

    start_time = time.time()
    for idx, (inputs, labels) in enumerate(train_dataset.take(num_samples)):
        print("{} inputs:".format(idx), inputs.shape, "labels:", labels.shape)

        if not FLAGS.visualization:
            continue

        img = np.clip(inputs.numpy()[0], 0, 255).astype(np.uint8)
        if not FLAGS.using_encoding:
            # labels includes loc, landm, landm_valid.
            targets = labels.numpy()[0]
            for target in targets:
                draw_bbox_landm(img, target, img_dim, img_dim)
        else:
            # labels includes loc, landm, landm_valid, conf.
            targets = decode_tf(labels[0], priors, variances=variances).numpy()
            for prior_index in range(len(targets)):
                if targets[prior_index][-1] != 1:
                    continue
                draw_bbox_landm(img, targets[prior_index], img_dim, img_dim)
                draw_anchor(img, priors[prior_index], img_dim, img_dim)

        cv2.imshow('img', cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
        if cv2.waitKey(0) == ord('q'):
            exit()

    print("data fps: {:.2f}".format(num_samples / (time.time() - start_time)))
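
# The per-prior label layout implied by the comments above (an assumption
# based on the slicing used here and the usual 5-point RetinaFace landmarks):
# 4 box values, 10 landmark values, one landm_valid flag, one conf flag.
ENCODED_LABEL_CHANNELS = 4 + 10 + 1 + 1  # = 16 channels per prior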