def main():
    """
    Starting point of the application
    """
    hvd.init()
    params = parse_args(PARSER.parse_args())
    set_flags(params)
    model_dir = prepare_model_dir(params)
    params.model_dir = model_dir
    logger = get_logger(params)

    model = Unet()

    dataset = Dataset(data_dir=params.data_dir,
                      batch_size=params.batch_size,
                      fold=params.crossvalidation_idx,
                      augment=params.augment,
                      gpu_id=hvd.rank(),
                      num_gpus=hvd.size(),
                      seed=params.seed)
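    # Passing hvd.rank()/hvd.size() lets each Horovod worker read its own shard of the data.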

    if 'train' in params.exec_mode:
        train(params, model, dataset, logger)

    if 'evaluate' in params.exec_mode:
        if hvd.rank() == 0:
            evaluate(params, model, dataset, logger)

    if 'predict' in params.exec_mode:
        if hvd.rank() == 0:
            predict(params, model, dataset, logger)
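
# A hypothetical multi-GPU launch for this script (assuming the entry point is
# main.py; flag names follow the parameters used above, adjust paths and GPU
# count to your setup):
#
#   horovodrun -np 8 python main.py --exec_mode train_and_evaluate \
#       --data_dir /data --model_dir /results --batch_size 8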
Example #2
def main(_):

    # get e2e training time
    begin = time.time()
    logging.info("Training started at: {}".format(time.asctime()))

    hvd.init()

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs
    if FLAGS.lr:
        config.learning_rate = FLAGS.lr
    if FLAGS.warmup_value:
        config.lr_warmup_init = FLAGS.warmup_value
    if FLAGS.warmup_epochs:
        config.lr_warmup_epoch = FLAGS.warmup_epochs
    config.backbone_init = FLAGS.backbone_init
    config.mixed_precision = FLAGS.amp
    config.image_size = model_utils.parse_image_size(config.image_size)

    # get eval config
    eval_config = hparams_config.get_detection_config(FLAGS.model_name)
    eval_config.override(FLAGS.hparams)
    eval_config.val_json_file = FLAGS.val_json_file
    eval_config.val_file_pattern = FLAGS.val_file_pattern
    eval_config.nms_configs.max_nms_inputs = anchors.MAX_DETECTION_POINTS
    eval_config.drop_remainder = False  # evaluate all examples, do not drop the remainder
    eval_config.image_size = model_utils.parse_image_size(
        eval_config['image_size'])

    # setup
    setup.set_flags(FLAGS, config, training=True)

    if FLAGS.debug:
        tf.config.experimental_run_functions_eagerly(True)
        tf.debugging.set_log_device_placement(True)
        tf.random.set_seed(111111)
        logging.set_verbosity(logging.DEBUG)

    # Check data path
    if FLAGS.training_file_pattern is None or FLAGS.val_file_pattern is None or FLAGS.val_json_file is None:
        raise RuntimeError(
            'You must specify --training_file_pattern, --val_file_pattern and --val_json_file for training.'
        )

    steps_per_epoch = (FLAGS.num_examples_per_epoch +
                       (FLAGS.batch_size * get_world_size()) -
                       1) // (FLAGS.batch_size * get_world_size())
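    # The expression above is integer ceiling division, so a trailing partial
    # batch still counts as a full training step.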
    if FLAGS.benchmark:
        # For CI perf training runs, run a fixed number of iterations per epoch.
        steps_per_epoch = FLAGS.benchmark_steps
    params = dict(config.as_dict(),
                  model_name=FLAGS.model_name,
                  model_dir=FLAGS.model_dir,
                  steps_per_epoch=steps_per_epoch,
                  checkpoint_period=FLAGS.checkpoint_period,
                  batch_size=FLAGS.batch_size,
                  num_shards=get_world_size(),
                  val_json_file=FLAGS.val_json_file,
                  testdev_dir=FLAGS.testdev_dir,
                  mode='train')
    logging.info('Training params: {}'.format(params))

    # make output dir if it does not exist
    tf.io.gfile.makedirs(FLAGS.model_dir)

    # dllogger setup
    backends = []
    if is_main_process():
        log_path = os.path.join(FLAGS.model_dir, FLAGS.log_filename)
        backends += [
            JSONStreamBackend(verbosity=Verbosity.VERBOSE, filename=log_path),
            StdOutBackend(verbosity=Verbosity.DEFAULT)
        ]

    DLLogger.init(backends=backends)

    def get_dataset(is_training, params):
        file_pattern = (FLAGS.training_file_pattern
                        if is_training else FLAGS.val_file_pattern)
        if not file_pattern:
            raise ValueError('No matching files.')

        return dataloader.InputReader(
            file_pattern,
            is_training=is_training,
            use_fake_data=FLAGS.use_fake_data,
            max_instances_per_image=config.max_instances_per_image,
            enable_map_parallelization=FLAGS.enable_map_parallelization)(
                params)

    num_samples = (FLAGS.eval_samples + get_world_size() -
                   1) // get_world_size()
    num_samples = (num_samples + FLAGS.eval_batch_size -
                   1) // FLAGS.eval_batch_size
    eval_config.num_samples = num_samples
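    # After the two ceiling divisions above, num_samples is the number of eval
    # batches processed per rank (samples are split across ranks, then batched).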

    def get_eval_dataset(eval_config):
        dataset = dataloader.InputReader(
            FLAGS.val_file_pattern,
            is_training=False,
            max_instances_per_image=eval_config.max_instances_per_image)(
                eval_config, batch_size=FLAGS.eval_batch_size)
        dataset = dataset.shard(get_world_size(), get_rank())
        dataset = dataset.take(num_samples)
        return dataset

    eval_dataset = get_eval_dataset(eval_config)

    # pick focal loss implementation
    focal_loss = train_lib.StableFocalLoss(
        params['alpha'],
        params['gamma'],
        label_smoothing=params['label_smoothing'],
        reduction=tf.keras.losses.Reduction.NONE)

    model = train_lib.EfficientDetNetTrain(params['model_name'], config)
    model.build((None, *config.image_size, 3))
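    # Compile with one loss per detection head: box regression ('box_loss'),
    # box IoU ('box_iou_loss'), focal classification ('class_loss'), and
    # segmentation ('seg_loss').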
    model.compile(
        optimizer=optimizer_builder.get_optimizer(params),
        loss={
            'box_loss':
            train_lib.BoxLoss(params['delta'],
                              reduction=tf.keras.losses.Reduction.NONE),
            'box_iou_loss':
            train_lib.BoxIouLoss(params['iou_loss_type'],
                                 params['min_level'],
                                 params['max_level'],
                                 params['num_scales'],
                                 params['aspect_ratios'],
                                 params['anchor_scale'],
                                 params['image_size'],
                                 reduction=tf.keras.losses.Reduction.NONE),
            'class_loss':
            focal_loss,
            'seg_loss':
            tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
        })
    train_from_epoch = util_keras.restore_ckpt(model,
                                               params['model_dir'],
                                               config.moving_average_decay,
                                               steps_per_epoch=steps_per_epoch)
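    # restore_ckpt returns the epoch to resume from (0 when starting from
    # scratch); it is passed to model.fit below as initial_epoch.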

    print("training_mode: {}".format(FLAGS.training_mode))
    callbacks = callback_builder.get_callbacks(params, FLAGS.training_mode,
                                               eval_config, eval_dataset,
                                               DLLogger, FLAGS.time_history,
                                               FLAGS.log_steps, FLAGS.lr_tb,
                                               FLAGS.benchmark)

    history = model.fit(
        get_dataset(True, params=params),
        epochs=params['num_epochs'],
        steps_per_epoch=steps_per_epoch,
        initial_epoch=train_from_epoch,
        callbacks=callbacks,
        verbose=1 if is_main_process() else 0,
        validation_data=get_dataset(False, params=params)
        if FLAGS.validate else None,
        validation_steps=(FLAGS.eval_samples //
                          FLAGS.eval_batch_size) if FLAGS.validate else None)

    if is_main_process():
        model.save_weights(os.path.join(FLAGS.model_dir, 'ckpt-final'))

    # log final stats
    stats = {}
    for callback in callbacks:
        if isinstance(callback, callback_builder.TimeHistory):
            if callback.epoch_runtime_log:
                stats['avg_fps_training'] = callback.average_examples_per_second
                stats['avg_fps_training_per_GPU'] = (
                    callback.average_examples_per_second / get_world_size())
                stats['avg_latency_training'] = callback.average_time_per_iteration

    if history and history.history:
        train_hist = history.history
        # Average the final training loss across ranks.
        stats['training_loss'] = float(
            hvd.allreduce(tf.constant(train_hist['loss'][-1], dtype=tf.float32),
                          average=True))

    ema_dir = os.path.join(FLAGS.model_dir, 'ema_weights')
    if os.path.exists(ema_dir):
        # Pick the newest EMA checkpoint by epoch (files are named 'emackpt-<epoch>').
        ckpt_epoch = "%02d" % max(
            int(f.rsplit('.')[0].rsplit('-')[1])
            for f in os.listdir(ema_dir)
            if 'emackpt' in f)
        ckpt = os.path.join(ema_dir, 'emackpt-' + str(ckpt_epoch))
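        # Swap in the exponential-moving-average weights so the final saved
        # model and the evaluation below use the EMA parameters.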
        util_keras.restore_ckpt(model,
                                ckpt,
                                eval_config.moving_average_decay,
                                steps_per_epoch=0,
                                skip_mismatch=False,
                                expect_partial=True)
        if is_main_process():
            model.save(os.path.join(FLAGS.model_dir, 'emackpt-final'))
    else:
        ckpt_epoch = 'final'
        ckpt = os.path.join(FLAGS.model_dir, 'ckpt-' + ckpt_epoch)
        if is_main_process():
            model.save(os.path.join(FLAGS.model_dir, 'ckpt-' + ckpt_epoch))

    # Start evaluation of final ema checkpoint
    logging.set_verbosity(logging.WARNING)

    @tf.function
    def model_fn(images, labels):
        cls_outputs, box_outputs = model(images, training=False)
        detections = postprocess.generate_detections(eval_config, cls_outputs,
                                                     box_outputs,
                                                     labels['image_scales'],
                                                     labels['source_ids'])

        tf.numpy_function(evaluator.update_state, [
            labels['groundtruth_data'],
            postprocess.transform_detections(detections)
        ], [])
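        # tf.numpy_function lets the Python-side COCO evaluator accumulate
        # per-image detections from inside this compiled tf.function.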

    if not FLAGS.benchmark and FLAGS.training_mode == 'train':

        # Evaluator for AP calculation.
        label_map = label_util.get_label_map(eval_config.label_map)
        evaluator = coco_metric.EvaluationMetric(
            filename=eval_config.val_json_file, label_map=label_map)

        evaluator.reset_states()

        # evaluate all images.
        pbar = tf.keras.utils.Progbar(num_samples)
        for i, (images, labels) in enumerate(eval_dataset):
            model_fn(images, labels)
            if is_main_process():
                pbar.update(i)

        # gather detections from all ranks
        evaluator.gather()

        if is_main_process():
            # compute the final eval results.
            metrics = evaluator.result()
            metric_dict = {}
            for i, name in enumerate(evaluator.metric_names):
                metric_dict[name] = metrics[i]

            if label_map:
                for i, cid in enumerate(sorted(label_map.keys())):
                    name = 'AP_/%s' % label_map[cid]
                    metric_dict[name] = metrics[i +
                                                len(evaluator.metric_names)]

            # csv format
            csv_metrics = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl']
            csv_format = ",".join(
                [str(ckpt_epoch)] +
                [str(round(metric_dict[key] * 100, 2)) for key in csv_metrics])
            print(FLAGS.model_name, metric_dict, "csv format:", csv_format)

        MPI.COMM_WORLD.Barrier()

    if is_main_process():
        stats['e2e_training_time'] = time.time() - begin
        DLLogger.log(step=(), data=stats)
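
# A hypothetical multi-GPU launch for this training script (assuming the file
# is named train.py; flag names follow the FLAGS referenced above, values are
# placeholders):
#
#   horovodrun -np 8 python train.py --model_name efficientdet-d0 \
#       --training_file_pattern /data/train-* --val_file_pattern /data/val-* \
#       --val_json_file /data/annotations/instances_val2017.json \
#       --model_dir /results --batch_size 8 --num_epochs 300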
Example #3
def main(_):
  model_config = hparams_config.get_detection_config(FLAGS.model_name)
  model_config.override(FLAGS.hparams)  # Add custom overrides
  model_config.is_training_bn = False
  model_config.image_size = model_utils.parse_image_size(model_config.image_size)

  # A hack to keep the flags consistent with the NMS configs.
  if FLAGS.min_score_thresh:
    model_config.nms_configs.score_thresh = FLAGS.min_score_thresh
  if FLAGS.nms_method:
    model_config.nms_configs.method = FLAGS.nms_method
  if FLAGS.max_boxes_to_draw:
    model_config.nms_configs.max_output_size = FLAGS.max_boxes_to_draw
  model_config.mixed_precision = FLAGS.amp

  setup.set_flags(FLAGS, model_config, training=False)
  model_params = model_config.as_dict()
  ckpt_path_or_file = FLAGS.ckpt_path
  if tf.io.gfile.isdir(ckpt_path_or_file):
    ckpt_path_or_file = tf.train.latest_checkpoint(ckpt_path_or_file)
  driver = inference.ServingDriver(FLAGS.model_name, ckpt_path_or_file,
                                   FLAGS.batch_size or None,
                                   FLAGS.min_score_thresh,
                                   FLAGS.max_boxes_to_draw, model_params)
  # dllogger setup
  backends = []
  backends+=[
    JSONStreamBackend(verbosity=Verbosity.VERBOSE, filename=FLAGS.dllogger_path),
    StdOutBackend(verbosity=Verbosity.DEFAULT)]
  DLLogger.init(backends=backends)

  if FLAGS.mode == 'export':
    if tf.io.gfile.exists(FLAGS.saved_model_dir):
      tf.io.gfile.rmtree(FLAGS.saved_model_dir)
    driver.export(FLAGS.saved_model_dir, FLAGS.tflite_path, FLAGS.tensorrt)
  elif FLAGS.mode == 'benchmark':
    if FLAGS.saved_model_dir:
      driver.load(FLAGS.saved_model_dir)

    batch_size = FLAGS.batch_size or 1
    if FLAGS.input_image:
      image_file = tf.io.read_file(FLAGS.input_image)
      image_arrays = tf.image.decode_image(image_file)
      image_arrays.set_shape((None, None, 3))
      image_arrays = tf.expand_dims(image_arrays, 0)
      if batch_size > 1:
        image_arrays = tf.tile(image_arrays, [batch_size, 1, 1, 1])
    else:
      # use synthetic data if no image is provided.
      image_arrays = tf.ones((batch_size, *model_config.image_size, 3),
                             dtype=tf.uint8)
    driver.benchmark(image_arrays, FLAGS.bm_runs, FLAGS.trace_filename)
  elif FLAGS.mode == 'dry':
    # transfer to tf2 format ckpt
    driver.build()
    if FLAGS.export_ckpt:
      driver.model.save_weights(FLAGS.export_ckpt)
  elif FLAGS.mode == 'video':
    import cv2  # pylint: disable=g-import-not-at-top
    if tf.saved_model.contains_saved_model(FLAGS.saved_model_dir):
      driver.load(FLAGS.saved_model_dir)
    cap = cv2.VideoCapture(FLAGS.input_video)
    if not cap.isOpened():
      print('Error opening input video: {}'.format(FLAGS.input_video))

    out_ptr = None
    if FLAGS.output_video:
      frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
      frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
      out_ptr = cv2.VideoWriter(FLAGS.output_video,
                                cv2.VideoWriter_fourcc('m', 'p', '4', 'v'), 25,
                                (frame_width, frame_height))

    while cap.isOpened():
      # Capture frame-by-frame
      ret, frame = cap.read()
      if not ret:
        break

      raw_frames = np.array([frame])
      detections_bs = driver.serve(raw_frames)
      boxes, scores, classes, _ = tf.nest.map_structure(np.array, detections_bs)
      new_frame = driver.visualize(
          raw_frames[0],
          boxes[0],
          scores[0],
          classes[0],
          min_score_thresh=model_config.nms_configs.score_thresh,
          max_boxes_to_draw=model_config.nms_configs.max_output_size)

      if out_ptr:
        # write frame into output file.
        out_ptr.write(new_frame)
      else:
        # Show the frame on screen, mainly used for real-time speed tests.
        cv2.imshow('Frame', new_frame)
        # Press Q on the keyboard to exit.
        if cv2.waitKey(1) & 0xFF == ord('q'):
          break
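
    # Release the capture and writer once the stream ends.
    cap.release()
    if out_ptr:
      out_ptr.release()
    cv2.destroyAllWindows()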
Example #4
import os
import warnings

warnings.simplefilter("ignore")

import tensorflow as tf
import horovod.tensorflow as hvd
from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
import dllogger as DLLogger
from utils import hvd_utils

from utils.setup import set_flags
from runtime import Runner
from utils.cmdline_helper import parse_cmdline

if __name__ == "__main__":

    hvd.init()
    FLAGS = parse_cmdline()
    set_flags(FLAGS)

    backends = []
    if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
        # Prepare Model Dir
        log_path = os.path.join(FLAGS.model_dir, FLAGS.log_filename)
        os.makedirs(FLAGS.model_dir, exist_ok=True)
        # Setup dlLogger
        backends+=[
            JSONStreamBackend(verbosity=Verbosity.VERBOSE, filename=log_path),
            StdOutBackend(verbosity=Verbosity.DEFAULT)
        ]
    DLLogger.init(backends=backends)
    DLLogger.log(data=vars(FLAGS), step='PARAMETER')

    runner = Runner(FLAGS, DLLogger)
Example #5
    # get model hyperparameters from the user-provided model config
    model_config = import_module(get_module_path(FLAGS.cfg))
    model_config = Config(model_config.config)

    # Override model hyperparameters with those provided by the user on the command line
    model_config.override(FLAGS.mparams)
    config.mparams = model_config

    # Make sure the number of classes in the model config matches the data loader config
    config.num_classes = config.mparams.num_classes

    #========== Horovod initialization
    hvd.init()

    #========== set up env variables, TF flags, and seeds
    set_flags(config)

    #========== set up the loggers and log dir
    backends = []
    if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
        # Prepare Model Dir
        os.makedirs(config.model_dir, exist_ok=True)

        # Setup dlLogger
        backends += [
            JSONStreamBackend(verbosity=Verbosity.VERBOSE,
                              filename=config.log_filename),
            StdOutBackend(verbosity=Verbosity.DEFAULT)
        ]
    DLLogger.init(backends=backends)
    DLLogger.log(data=vars(config), step='PARAMETER')


def main(_):
    """
    Starting point of the application
    """
    hvd.init()
    set_flags()
    params = parse_args(PARSER.parse_args())
    model_dir = prepare_model_dir(params)
    logger = get_logger(params)

    estimator = build_estimator(params, model_dir)

    dataset = Dataset(data_dir=params.data_dir,
                      batch_size=params.batch_size,
                      fold=params.crossvalidation_idx,
                      augment=params.augment,
                      gpu_id=hvd.rank(),
                      num_gpus=hvd.size(),
                      seed=params.seed)

    if 'train' in params.exec_mode:
        max_steps = params.max_steps // (1 if params.benchmark else hvd.size())
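        # The total step budget is divided across Horovod workers unless
        # benchmarking, in which case every rank runs the full step count.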
        hooks = [hvd.BroadcastGlobalVariablesHook(0),
                 TrainingHook(logger,
                              max_steps=max_steps,
                              log_every=params.log_every)]

        if params.benchmark and hvd.rank() == 0:
            hooks.append(ProfilingHook(logger,
                                       batch_size=params.batch_size,
                                       log_every=params.log_every,
                                       warmup_steps=params.warmup_steps,
                                       mode='train'))

        estimator.train(
            input_fn=dataset.train_fn,
            steps=max_steps,
            hooks=hooks)

    if 'evaluate' in params.exec_mode:
        if hvd.rank() == 0:
            results = estimator.evaluate(input_fn=dataset.eval_fn, steps=dataset.eval_size)
            logger.log(step=(),
                       data={"eval_ce_loss": float(results["eval_ce_loss"]),
                             "eval_dice_loss": float(results["eval_dice_loss"]),
                             "eval_total_loss": float(results["eval_total_loss"]),
                             "eval_dice_score": float(results["eval_dice_score"])})

    if 'predict' in params.exec_mode:
        if hvd.rank() == 0:
            predict_steps = dataset.test_size
            hooks = None
            if params.benchmark:
                hooks = [ProfilingHook(logger,
                                       batch_size=params.batch_size,
                                       log_every=params.log_every,
                                       warmup_steps=params.warmup_steps,
                                       mode="test")]
                predict_steps = params.warmup_steps * 2 * params.batch_size

            predictions = estimator.predict(
                input_fn=lambda: dataset.test_fn(count=math.ceil(predict_steps / dataset.test_size)),
                hooks=hooks)
            binary_masks = [np.argmax(p['logits'], axis=-1).astype(np.uint8) * 255 for p in predictions]

            if not params.benchmark:
                multipage_tif = [Image.fromarray(mask).resize(size=(512, 512), resample=Image.BILINEAR)
                                 for mask in binary_masks]

                output_dir = os.path.join(params.model_dir, 'pred')

                os.makedirs(output_dir, exist_ok=True)

                multipage_tif[0].save(os.path.join(output_dir, 'test-masks.tif'),
                                      compression="tiff_deflate",
                                      save_all=True,
                                      append_images=multipage_tif[1:])