Example #1
def main(_):
    common.initialize_preloading()
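    # Horovod and tf.distribute strategies are mutually exclusive scale-out paths; reject mixed or unsupported configurations.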
    if flags.FLAGS.use_horovod and flags.FLAGS.distribution_strategy != "off":
        raise RuntimeError(
            "Horovod and distribution strategy cannot be used together. Please select one of the scaleout methods."
        )
    if flags.FLAGS.distribution_strategy not in ["off", "hpu"]:
        raise RuntimeError(
            "Currently HPU supports only HPUStrategy, please set --distribution_strategy=hpu or use horovod"
        )
    if flags.FLAGS.use_horovod:
        if flags.FLAGS.horovod_hierarchical_allreduce:
            os.environ['HOROVOD_HIERARCHICAL_ALLREDUCE'] = "1"
        hvd_init()
    else:
        synapse_logger_init()
    load_habana_module()

    if flags.FLAGS.global_seed:
        tf.random.set_seed(flags.FLAGS.global_seed)

    with dump_callback():
        model_helpers.apply_clean(flags.FLAGS)
        with logger.benchmark_context(flags.FLAGS):
            stats = run(flags.FLAGS)
        logging.info('Run stats:\n%s', stats)
Example #2
def main():
    if not args.no_hpu:
        # Load Habana module in order to do inference on HPU (Gaudi)
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()

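    # Restore the trained T5 checkpoint and the tokenizer prepared next to the dataset.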
    checkpoint_path = os.path.join(args.model_dir, 'checkpoints')
    model = T5.from_pretrained(checkpoint_path)
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        os.path.join(args.data_dir, 't5_base', 'tokenizer'))

    print('\nProvide context and ask model a question, for example:')
    context = (
        'In 2019 Habana Labs announced its first AI accelerator. Gaudi, '
        'named after famous Catalan architect, was designed to accelerate '
        'training of deep neural networks in data centers.')
    question = 'What is the name of the chip?'
    print('Context:', context)
    print('Question:', question)
    print("Answer: ", answer(model, tokenizer, context, question))

    while True:
        print(
            '\nProvide context and ask model a question (to exit use Ctrl+C)')
        context = input('Context: ')
        question = input('Question: ')
        print("Answer: ", answer(model, tokenizer, context, question))
Example #3
    def _horovod_init(framework):
        size = comm_size()
        rank = comm_rank()

        hcl_config = get_hcl_config()
        hcl_type = get_hcl_type(hcl_config)

        if hcl_type != "HLS1-H":
            # All env variables should be set before load_habana_module is called
            if is_hierarchical():
                os.environ["HLS1_MODULE_ID"] = str(comm_local_rank())
                os.environ["ID"] = str(comm_local_rank())
            else:
                if size > 1:
                    os.environ["HLS1_MODULE_ID"] = str(get_hw_module_id(rank))
                    os.environ["ID"] = str(get_hw_module_id(rank))

        # Make sure every rank logs to a different file.
        # Only matters for ranks on the same machine - so pretty much every scenario.
        if size > 1:
            rank_prefix = "rank_{}_".format(rank)
            HorovodHelpers._set_env_prefix("TF_RANK_PREFIX", rank_prefix,
                                           False)
            HorovodHelpers._set_env_prefix("HBN_TF_GRAPH_PREFIX", rank_prefix,
                                           False)
            HorovodHelpers._set_env_prefix("TF_DUMP_GRAPH_PREFIX", rank_prefix,
                                           True)
            HorovodHelpers._hvd_rank_prefix = rank_prefix

        # Init synapse logger (if required)
        synapse_logger_init()
        # Init TF Module (for CPU Allocator)
        load_habana_module()
        # Temporary workaround to support both paths: with and without the habana_frameworks package installed
        try:
            from habana_frameworks.tensorflow.lib_utils import libraries_location
            tf.load_library(
                os.path.join(libraries_location,
                             "libsynapse_helpers.so." + tf.__version__))
        except Exception:
            logging.warning(
                "Can't import habana_frameworks, trying to run anyway")
        if framework == Framework.TENSORFLOW:
            import horovod.tensorflow as hvd
        elif framework == Framework.KERAS:
            import horovod.tensorflow.keras as hvd
        else:
            raise Exception(
                "Specified framework: {} is not supported by horovod_helpers".
                format(framework))

        hvd.init()
        assert rank == hvd.rank(), "There is possible rank mismatch between mpi and horovod"
        HorovodHelpers._hvd = hvd
Example #4
def main(_):
    common.initialize_preloading()
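    # Initialize Horovod for multi-card runs before the Habana module is loaded; otherwise just enable the synapse logger.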
    if flags.FLAGS.use_horovod:
        hvd_init()
    else:
        synapse_logger_init()
    load_habana_module()

    with dump_callback():
        model_helpers.apply_clean(flags.FLAGS)
        with logger.benchmark_context(flags.FLAGS):
            stats = run(flags.FLAGS)
        logging.info('Run stats:\n%s', stats)
Example #5
def main(_):
  tf.disable_v2_behavior()
  tf.enable_resource_variables()
  tf.logging.set_verbosity(tf.logging.INFO)
  trainer_lib.set_random_seed(FLAGS.random_seed)
  usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)

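  # HPU-specific setup: optional bf16 conversion, dynamic-shape workaround, Habana module loading and recipe cache.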
  if FLAGS.use_hpu:
    if FLAGS.use_bf16:
      if not is_workaround_enabled('FORCE_FP32'):
        os.environ['TF_BF16_CONVERSION'] = FLAGS.bf16_config_path
      else:
        print("Warning! BF16 precision is not supported in inference mode. Switching back to fp32...")
    if is_workaround_enabled('DISABLE_DYNAMIC_SHAPES'):
      os.environ['TF_ENABLE_DYNAMIC_SHAPES'] = 'false'
    from habana_frameworks.tensorflow import load_habana_module
    load_habana_module()
    prepare_recipe_cache()

  if FLAGS.score_file:
    filename = os.path.expanduser(FLAGS.score_file)
    if not tf.gfile.Exists(filename):
      raise ValueError("The file to score doesn't exist: %s" % filename)
    results = score_file(filename)
    if not FLAGS.decode_to_file:
      raise ValueError("To score a file, specify --decode_to_file for results.")
    write_file = tf.gfile.Open(os.path.expanduser(FLAGS.decode_to_file), "w")
    for score in results:
      write_file.write("%.6f\n" % score)
    write_file.close()
    return

  hp = create_hparams()
  hp.add_hparam("use_hpu", FLAGS.use_hpu)
  decode_hp = create_decode_hparams()
  run_config = trainer.create_run_config(hp)
  if FLAGS.disable_grappler_optimizations:
    run_config.session_config.graph_options.rewrite_options.disable_meta_optimizer = True

  # summary-hook in tf.estimator.EstimatorSpec requires
  # hparams.model_dir to be set.
  hp.add_hparam("model_dir", run_config.model_dir)

  estimator = trainer_lib.create_estimator(
      FLAGS.model,
      hp,
      run_config,
      decode_hparams=decode_hp,
      use_tpu=FLAGS.use_tpu)

  decode(estimator, hp, decode_hp)
Example #6
def set_flags(params):
    if params.tf_verbosity:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = params.tf_verbosity

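    # The GPU path tunes CUDA/cuDNN environment variables; the HPU path loads the Habana module and optionally enables bf16.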
    if params.no_hpu:
        os.environ['CUDA_CACHE_DISABLE'] = '1'
        os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
        os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
        os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '0'
        os.environ['TF_ADJUST_HUE_FUSED'] = '1'
        os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
        os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
        os.environ['TF_SYNC_ON_FINISH'] = '0'
    else:
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()
        if params.dtype == 'bf16':
            os.environ['TF_BF16_CONVERSION'] = params.bf16_config_path
Example #7
def run_imagenet(flags_obj):
    """Run ResNet ImageNet training and eval loop.

  Args:
    flags_obj: An object containing parsed flag values.

  Returns:
    Dict of results of the run.  Contains the keys `eval_results` and
      `train_hooks`. `eval_results` contains accuracy (top_1) and
      accuracy_top_5. `train_hooks` is a list the instances of hooks used during
      training.
  """
    input_function = (flags_obj.use_synthetic_data and get_synth_input_fn(
        flags_core.get_tf_dtype(flags_obj)) or input_fn)

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    if flags.FLAGS.dtype == 'bf16':
        os.environ['TF_BF16_CONVERSION'] = flags.FLAGS.bf16_config_path

    # Disabling dynamic shapes is a workaround; dynamic shape support for ResNeXt still needs to be investigated.
    os.environ["TF_ENABLE_DYNAMIC_SHAPES"] = "false"

    os.environ.setdefault("TF_DISABLE_MKL", "1")
    os.environ.setdefault("TF_ALLOW_CONTROL_EDGES_IN_HABANA_OPS", "1")

    if flags_obj.use_horovod:
        assert not flags_obj.no_hpu, "Horovod without HPU is not supported in helpers."
        hvd_init()
    else:
        synapse_logger_init()

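    # Register the Habana (HPU) device unless it was explicitly disabled.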
    if not flags_obj.no_hpu:
        load_habana_module()

    result = resnet_run_loop.resnet_main(
        flags_obj,
        imagenet_model_fn,
        input_function,
        DATASET_NAME,
        shape=[DEFAULT_IMAGE_SIZE, DEFAULT_IMAGE_SIZE, NUM_CHANNELS])

    return result
Example #8
def set_flags(params):
    if params.tf_verbosity:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(params.tf_verbosity)

    if not params.no_hpu:
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()
        if params.dtype == 'bf16':
            os.environ['TF_BF16_CONVERSION'] = params.bf16_config_path

    np.random.seed(params.seed)
    tf.random.set_seed(params.seed)

    if params.use_xla:
        tf.config.optimizer.set_jit(True)

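    # Reserve one CPU thread per HPU and give the remaining cores to TF inter-op parallelism.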
    per_hpu_thread_count = 1
    num_hpus = hvd_size() if horovod_enabled() else 1
    cpu_count = multiprocessing.cpu_count()
    total_hpu_thread_count = per_hpu_thread_count * num_hpus

    tf.config.threading.set_intra_op_parallelism_threads(0)
    tf.config.threading.set_inter_op_parallelism_threads(cpu_count - total_hpu_thread_count)
Example #9
def main(argv):
    del argv  # Unused.

    # ============================ Configure parameters ============================ #
    RUN_CONFIG = mask_rcnn_params.default_config()

    temp_config = FLAGS.flag_values_dict()
    if temp_config['device'] == 'HPU':
        if not MPI_is_distributed():
            from habana_frameworks.tensorflow import load_habana_module
            load_habana_module()

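    # Expand the relative decay levels into absolute learning-rate levels and integer step boundaries.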
    temp_config['learning_rate_decay_levels'] = [
        float(decay) for decay in temp_config['learning_rate_decay_levels']
    ]
    temp_config['learning_rate_levels'] = [
        decay * temp_config['init_learning_rate']
        for decay in temp_config['learning_rate_decay_levels']
    ]
    temp_config['learning_rate_steps'] = [
        int(step) for step in temp_config['learning_rate_steps']
    ]

    RUN_CONFIG = params_io.override_hparams(RUN_CONFIG, temp_config)

    if temp_config['deterministic']:
        tf.config.threading.set_inter_op_parallelism_threads(1)
        tf.config.threading.set_intra_op_parallelism_threads(1)
        if temp_config['seed']:
            os.environ['TF_DETERMINISTIC_OPS'] = '1'
            tf.compat.v1.reset_default_graph()
            tf.random.set_seed(temp_config['seed'])
            if temp_config['device'] == "GPU":
                os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
        else:
            raise RuntimeError("Set seed to run in deterministic mode")
    # ============================ Configure parameters ============================ #

    if RUN_CONFIG.use_tf_distributed and MPI_is_distributed():
        raise RuntimeError(
            "Incompatible Runtime. Impossible to use `--use_tf_distributed` with MPIRun Horovod"
        )

    if RUN_CONFIG.mode in ('train', 'train_and_eval'
                           ) and not RUN_CONFIG.training_file_pattern:
        raise RuntimeError(
            'You must specify `training_file_pattern` for training.')

    if RUN_CONFIG.mode in ('eval', 'train_and_eval'):
        if not RUN_CONFIG.validation_file_pattern:
            raise RuntimeError(
                'You must specify `validation_file_pattern` for evaluation.')

        if RUN_CONFIG.val_json_file == "" and not RUN_CONFIG.include_groundtruth_in_features:
            raise RuntimeError(
                'You must specify `val_json_file` or include_groundtruth_in_features=True for evaluation.'
            )

        if not RUN_CONFIG.include_groundtruth_in_features and not os.path.isfile(
                RUN_CONFIG.val_json_file):
            raise FileNotFoundError("Validation JSON File not found: %s" %
                                    RUN_CONFIG.val_json_file)

    dllogger.init(backends=[
        dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   filename=RUN_CONFIG.log_path)
    ])

    if RUN_CONFIG.mode in ('train', 'train_and_eval'):

        train_input_fn = dataloader.InputReader(
            file_pattern=RUN_CONFIG.training_file_pattern,
            mode=tf.estimator.ModeKeys.TRAIN,
            num_examples=None,
            use_fake_data=RUN_CONFIG.use_fake_data,
            use_instance_mask=RUN_CONFIG.include_mask,
            seed=RUN_CONFIG.seed)

    else:
        train_input_fn = None

    if RUN_CONFIG.mode in ('eval', 'train_and_eval') or (
            RUN_CONFIG.mode == 'train' and RUN_CONFIG.eval_after_training):

        eval_input_fn = dataloader.InputReader(
            file_pattern=RUN_CONFIG.validation_file_pattern,
            mode=tf.estimator.ModeKeys.PREDICT,
            num_examples=RUN_CONFIG.eval_samples,
            use_fake_data=False,
            use_instance_mask=RUN_CONFIG.include_mask,
            seed=RUN_CONFIG.seed)

    else:
        eval_input_fn = None

    with tf.profiler.experimental.Profile(
            RUN_CONFIG.model_dir) if RUN_CONFIG.profile else suppress():
        run_executer(RUN_CONFIG, train_input_fn, eval_input_fn)
Example #10
def main():
    parser = DenseNetArgumentParser(
        description=(
            "train.py is the main training/evaluation script for DenseNet. "
            "In order to run training on multiple Gaudi cards, use demo_densenet.py or run "
            "train.py with mpirun."))
    args, _ = parser.parse_known_args()

    strategy = None
    verbose = 1

    os.environ['ENABLE_EXPERIMENTAL_FLAGS'] = 'true'
    os.environ['RUN_TPC_FUSER'] = '******'

    if args.deterministic:
        if args.inputs is None:
            raise ValueError("Must provide inputs for deterministic mode")
        if args.resume_from_checkpoint_path is None:
            raise ValueError("Must provide checkpoint for deterministic mode")

    if args.dtype == 'bf16':
        os.environ['TF_BF16_CONVERSION'] = '1'

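    # On HPU, optionally build an HPUStrategy cluster spec from MULTI_HLS_IPS; otherwise fall back to MultiWorkerMirroredStrategy.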
    if args.run_on_hpu:
        load_habana_module()
        if args.use_hpu_strategy:
            hls_addresses = str(os.environ.get(
                "MULTI_HLS_IPS", "127.0.0.1")).split(",")
            TF_BASE_PORT = 2410
            mpi_rank = comm_rank()
            mpi_size = comm_size()
            if mpi_rank > 0:
                verbose = 0
            # worker_hosts: comma-separated list of worker ip:port pairs.
            worker_list = []
            for address in hls_addresses:
                for rank in range(mpi_size // len(hls_addresses)):
                    worker_list.append(address + ':' + str(TF_BASE_PORT + rank))
            worker_hosts = ",".join(worker_list)
            task_index = mpi_rank

            # Configures cluster spec for distribution strategy.
            _ = distribution_utils.configure_cluster(worker_hosts, task_index)
            strategy = HPUStrategy()
            print('Number of devices: {}'.format(
                strategy.num_replicas_in_sync))
    else:
        strategy = tf.distribute.MultiWorkerMirroredStrategy()
        print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

    if args.seed is not None:
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        random.seed(args.seed)
        np.random.seed(args.seed)
        tf.random.set_seed(args.seed)

    img_rows, img_cols = 224, 224  # Resolution of inputs
    channel = 3
    num_classes = 1000
    batch_size = args.batch_size
    nb_epoch = args.epochs
    dataset_dir = args.dataset_dir
    resume_from_checkpoint_path = args.resume_from_checkpoint_path
    resume_from_epoch = args.resume_from_epoch
    dropout_rate = args.dropout_rate
    weight_decay = args.weight_decay
    optim_name = args.optimizer
    initial_lr = args.initial_lr
    model_name = args.model
    save_summary_steps = args.save_summary_steps

    if model_name == "densenet121":
        growth_rate = 32
        nb_filter = 64
        nb_layers = [6, 12, 24, 16]

    elif model_name == "densenet161":
        growth_rate = 48
        nb_filter = 96
        nb_layers = [6, 12, 36, 24]

    elif model_name == "densenet169":
        growth_rate = 32
        nb_filter = 64
        nb_layers = [6, 12, 32, 32]

    else:
        print("model is not supported")
        exit(1)

    # Load our model
    if strategy:
        with strategy.scope():
            model = densenet_model(img_rows=img_rows, img_cols=img_cols, color_type=channel,
                                   dropout_rate=dropout_rate, weight_decay=weight_decay, num_classes=num_classes,
                                   growth_rate=growth_rate, nb_filter=nb_filter, nb_layers=nb_layers)
            optimizer = get_optimizer(
                model_name, optim_name, initial_lr, epsilon=1e-2)
            model.compile(optimizer=optimizer,
                          loss='categorical_crossentropy', metrics=['accuracy'])
    else:
        model = densenet_model(img_rows=img_rows, img_cols=img_cols, color_type=channel,
                               dropout_rate=dropout_rate, weight_decay=weight_decay, num_classes=num_classes,
                               growth_rate=growth_rate, nb_filter=nb_filter, nb_layers=nb_layers)
        optimizer = get_optimizer(
            model_name, optim_name, initial_lr, epsilon=1e-2)
        model.compile(optimizer=optimizer,
                      loss='categorical_crossentropy', metrics=['accuracy'])

    # Start training
    steps_per_epoch = 1281167 // batch_size
    if args.steps_per_epoch is not None:
        steps_per_epoch = args.steps_per_epoch
    validation_steps = 50000 // batch_size
    if args.validation_steps is not None:
        validation_steps = args.validation_steps
    warmup_steps = args.warmup_epochs * steps_per_epoch
    lr_sched = {0: 1, 30: 0.1, 60: 0.01, 80: 0.001}
    lr_sched_steps = {
        epoch * steps_per_epoch: multiplier for (epoch, multiplier) in lr_sched.items()}

    lrate = StepLearningRateScheduleWithWarmup(initial_lr=initial_lr,
                                               initial_global_step=0,
                                               warmup_steps=warmup_steps,
                                               decay_schedule=lr_sched_steps,
                                               verbose=0)

    save_name = model_name if not model_name.endswith('.h5') else \
        os.path.split(model_name)[-1].split('.')[0].split('-')[0]

    model_ckpt = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(args.model_dir, config.SAVE_DIR,
                     save_name) + '-ckpt-{epoch:03d}.h5',
        monitor='train_loss')

    callbacks = [lrate, model_ckpt]

    if save_summary_steps is not None and save_summary_steps > 0:
        log_dir = os.path.join(args.model_dir, config.LOG_DIR)
        local_batch_size = batch_size
        
        if args.use_hpu_strategy:
            log_dir = os.path.join(log_dir, 'worker_' + str(comm_rank()))
            local_batch_size = batch_size // strategy.num_replicas_in_sync

        callbacks += [
            TensorBoardWithHParamsV2(
                args.__dict__, log_dir=log_dir,
                update_freq=save_summary_steps, profile_batch=0),
            ExamplesPerSecondKerasHookV2(
                save_summary_steps, output_dir=log_dir,
                batch_size=local_batch_size),
        ]

    if args.evaluate_checkpoint_path is not None:
        model.load_weights(args.evaluate_checkpoint_path)
        # Build the validation dataset here; it is otherwise only created in the training branch below.
        ds_valid = get_dataset(dataset_dir, args.val_subset, batch_size)
        results = model.evaluate(x=ds_valid, steps=validation_steps)
        print("Test loss, Test acc:", results)
        exit()

    if ((resume_from_epoch is not None) and (resume_from_checkpoint_path is not None)):
        model.load_weights(resume_from_checkpoint_path)

    if args.deterministic:
        set_deterministic()
        if not os.path.isfile(args.dump_config):
            raise FileNotFoundError("wrong dump config path")

        import pickle
        x_path = os.path.join(args.inputs, "input")
        y_path = os.path.join(args.inputs, "target")
        x = pickle.load(open(x_path, 'rb'))
        y = pickle.load(open(y_path, 'rb'))

        with dump_callback(args.dump_config):
          model.fit(x=x, y=y,
                  steps_per_epoch=steps_per_epoch,
                  callbacks=callbacks,
                  initial_epoch=resume_from_epoch,
                  epochs=nb_epoch,
                  shuffle=False,
                  verbose=verbose,
                  validation_data=None,
                  validation_steps=0,
                  )
    else:
      ds_train = get_dataset(dataset_dir, args.train_subset, batch_size)
      ds_valid = get_dataset(dataset_dir, args.val_subset, batch_size)

      model.fit(x=ds_train, y=None,
                steps_per_epoch=steps_per_epoch,
                callbacks=callbacks,
                initial_epoch=resume_from_epoch,
                epochs=nb_epoch,
                shuffle=True,
                verbose=verbose,
                validation_data=(ds_valid, None),
                validation_steps=validation_steps,
                validation_freq=1,
                )
Example #11
def main(argv):
  tf.disable_v2_behavior()
  tf.enable_resource_variables()

  if FLAGS.use_hpu and FLAGS.recipe_cache:
    prepare_recipe_cache()

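  # Two Horovod paths: HPU goes through the horovod_helpers wrapper, GPU uses plain horovod.tensorflow with one visible device per local rank.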
  if FLAGS.use_horovod:
    if FLAGS.use_hpu:
      from TensorFlow.common.horovod_helpers import hvd_init, horovod_enabled, hvd
      hvd_init()
      assert horovod_enabled()
      if FLAGS.recipe_cache:
        # Other ranks should wait for recipe cache to be removed.
        # This operation can't be done before hvd_init.
        from mpi4py import MPI
        MPI.COMM_WORLD.Barrier()
    else:
      import horovod.tensorflow as hvd
      hvd.init()
      assert hvd.size() > 1
      os.environ['CUDA_VISIBLE_DEVICES'] = str(hvd.local_rank())

  if FLAGS.use_hpu:
    if FLAGS.use_bf16:
      os.environ['TF_BF16_CONVERSION'] = FLAGS.bf16_config_path

    dyn_shapes_flag = 'TF_ENABLE_DYNAMIC_SHAPES'
    if dyn_shapes_flag not in os.environ:
        os.environ[dyn_shapes_flag] = 'false'

    from habana_frameworks.tensorflow import load_habana_module  # noqa
    load_habana_module()

  usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)

  # If we just have to print the registry, do that and exit early.
  maybe_log_registry_and_exit()

  # Create HParams.
  if argv:
    set_hparams_from_args(argv[1:])
  if FLAGS.schedule != "run_std_server":
    hparams = create_hparams()
  if FLAGS.gpu_automatic_mixed_precision:
    setattr(hparams, "gpu_automatic_mixed_precision", True)
  if FLAGS.deterministic_dataset:
    hparams.add_hparam("deterministic_dataset", True)

  hparams.add_hparam("use_horovod", FLAGS.use_horovod)
  hparams.add_hparam("use_hpu", FLAGS.use_hpu)
  if FLAGS.use_horovod:
    hparams.add_hparam("hvd_worker_id", hvd.rank())
    hparams.add_hparam("hvd_size", hvd.size())

  if FLAGS.schedule == "run_std_server":
    run_std_server()
  trainer_lib.set_random_seed(FLAGS.random_seed)

  if FLAGS.generate_data:
    generate_data()

  exp_fn = create_experiment_fn()
  exp = exp_fn(create_run_config(hparams), hparams)
  if is_chief():
    save_metadata(hparams)

  with dump_callback():
    execute_schedule(exp)
Example #12
def train(model,
          train_images,
          train_annotations,
          input_height=None,
          input_width=None,
          n_classes=None,
          verify_dataset=True,
          checkpoints_path=None,
          epochs=5,
          batch_size=2,
          validate=False,
          val_images=None,
          val_annotations=None,
          auto_resume_checkpoint=False,
          load_weights=None,
          steps_per_epoch=None,
          val_steps_per_epoch=None,
          gen_use_multiprocessing=False,
          ignore_zero_class=False,
          optimizer_name='adam',
          do_augment=False,
          augmentation_name="aug_all",
          data_type='fp32',
          tb_location=None,
          deterministic=False,
          model_dir=None,
          dump_config=None,
          distributed=False,
          use_upsampling=False,
          loss_type=0,
          train_engine='hpu',
          not_cached=False):

    if train_engine == 'hpu':
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()
        print("Loaded HPU modules")
        from TensorFlow.common.debug import dump_callback
        # For Habana Model runner hooks
        from TensorFlow.common.tb_utils import (TensorBoardWithHParamsV2,
                                                ExamplesPerSecondKerasHookV2)
    else:

        class dump_callback(object):
            def __init__(self, file_name):
                pass

            def __enter__(self):
                pass

            def __exit__(self, type, value, traceback):
                pass

    if data_type == 'bf16' and train_engine == 'hpu':
        bf16_json = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 '../bf16_segnet.json')
        os.environ['TF_BF16_CONVERSION'] = os.environ.get(
            'TF_BF16_CONVERSION', bf16_json)
        print("Setting BF16:", os.getenv('TF_BF16_CONVERSION'))

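    # Default to single-card training; Horovod overrides the shard count and id when distributed training is enabled.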
    shard_id = 0
    num_shards = 1

    if distributed:
        import horovod.tensorflow.keras as hvd
        print("hvd init")
        hvd.init()
        if train_engine == 'gpu':
            gpus = tf.config.experimental.list_physical_devices('GPU')
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            if gpus:
                tf.config.experimental.set_visible_devices(
                    gpus[hvd.local_rank()], 'GPU')
            print("Set memory growth for GPUS")

        shard_id = hvd.rank()
        num_shards = hvd.size()
        if num_shards == 1:
            print(
                "Distributed training was requested but Horovod initialization did not succeed")
            exit()

    print("num_shards: " + str(num_shards) + " shard_id: " + str(shard_id))

    from keras_segmentation.models.all_models import model_from_name
    # check if user gives model name instead of the model object
    if isinstance(model, six.string_types):
        # create the model from the name
        assert (n_classes is not None), "Please provide the n_classes"
        if (input_height is not None) and (input_width is not None):
            model = model_from_name[model](n_classes,
                                           input_height=input_height,
                                           input_width=input_width,
                                           batch_size=batch_size,
                                           use_upsampling=use_upsampling,
                                           loss_type=loss_type)
        else:
            model = model_from_name[model](n_classes,
                                           batch_size=batch_size,
                                           use_upsampling=use_upsampling,
                                           loss_type=loss_type)

    #model.save('my_segnet_model.h5')
    n_classes = model.n_classes
    input_height = model.input_height
    input_width = model.input_width
    output_height = model.output_height
    output_width = model.output_width

    if steps_per_epoch is None:
        steps_per_epoch = len(
            os.listdir(train_images)) // (batch_size * num_shards)
    if val_steps_per_epoch is None:
        val_steps_per_epoch = len(os.listdir(val_images)) // batch_size

    print("Steps per epoch: " + str(steps_per_epoch))

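    # Softmax cross-entropy with a hand-written gradient; per-pixel reductions over classes are expressed as 1x1 conv2d ops.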
    def optimized_xent_loss_custom_grad(ytrue, ypred):
        @tf.custom_gradient
        def loss_without_mean(ytrue, ypred):
            with tf.name_scope("softmax_cross_entropy"):
                logits_t = tf.transpose(ypred,
                                        perm=(0, 1, 3, 2),
                                        name="logits_t")  # BS H N W
                reduce_max = tf.reduce_max(logits_t, 2,
                                           name="reduce_max")  # BS H W
                max_logits = tf.expand_dims(reduce_max, 3)  # BS H W 1
                shifted_logits = tf.subtract(ypred,
                                             max_logits,
                                             name="shifted_logits")  # BS H W N
                exp_shifted_logits = tf.math.exp(
                    shifted_logits, name="exp_shifted_logits")  # BS H W N
                reduce_sum_filter = tf.fill([1, 1, n_classes, 1], 1.0)
                sum_exp = tf.nn.conv2d(exp_shifted_logits,
                                       reduce_sum_filter,
                                       strides=1,
                                       padding="VALID",
                                       name="sum_exp")  # BS H W 1
                log_sum_exp = tf.math.log(sum_exp,
                                          name="log_sum_exp")  # BS H W 1
                shifted_logits2 = tf.nn.conv2d(
                    shifted_logits * ytrue,
                    reduce_sum_filter,
                    strides=1,
                    padding="VALID",
                    name="shifted_logits2")  # BS H W 1
                loss = tf.subtract(log_sum_exp,
                                   shifted_logits2,
                                   name="loss/sub")  # BS H W 1

                def custom_grad(dy):  # dy is BS H W 1
                    with tf.name_scope("gradients/softmax_cross_entropy"):
                        div = tf.math.truediv(exp_shifted_logits,
                                              sum_exp,
                                              name="div")  # BS H W N
                        sub = tf.math.subtract(div, ytrue,
                                               name="sub")  # BS H W N
                        ret = tf.math.multiply(sub, dy, name="mul")
                    return -dy * shifted_logits, ret

                return loss, custom_grad

        return tf.math.reduce_mean(loss_without_mean(ytrue, ypred))

    if validate:
        assert val_images is not None
        assert val_annotations is not None

    if optimizer_name is not None:

        if ignore_zero_class:
            loss_k = masked_categorical_crossentropy
        elif loss_type == 1:
            loss_k = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True)
        elif loss_type == 2:
            loss_k = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        else:
            loss_k = optimized_xent_loss_custom_grad

        print(optimizer_name)
        if num_shards > 1:
            optimizer = Adam(lr=LearningRate)
            optimizer_name = hvd.DistributedOptimizer(optimizer)

        model.compile(loss=loss_k,
                      optimizer=optimizer_name,
                      metrics=['accuracy'])

    if checkpoints_path is not None:
        with open(checkpoints_path + "_config.json", "w") as f:
            json.dump(
                {
                    "model_class": model.model_name,
                    "n_classes": n_classes,
                    "input_height": input_height,
                    "input_width": input_width,
                    "output_height": output_height,
                    "output_width": output_width
                }, f)

    if load_weights is not None and len(load_weights) > 0:
        print("Loading weights from ", load_weights)
        status = model.load_weights(load_weights)
        print(status)

    if auto_resume_checkpoint and (checkpoints_path is not None):
        latest_checkpoint = find_latest_checkpoint(checkpoints_path)
        if latest_checkpoint is not None:
            print("Loading the weights from latest checkpoint ",
                  latest_checkpoint)
            model.load_weights(latest_checkpoint)

    if verify_dataset:
        print("Verifying training dataset")
        verified = verify_segmentation_dataset(train_images, train_annotations,
                                               n_classes, deterministic)
        assert verified
        if validate:
            print("Verifying validation dataset")
            verified = verify_segmentation_dataset(val_images, val_annotations,
                                                   n_classes, deterministic)
            assert verified

    if not_cached:
        train_gen = image_segmentation_generator(
            train_images,
            train_annotations,
            batch_size,
            n_classes,
            input_height,
            input_width,
            output_height,
            output_width,
            deterministic,
            do_augment=do_augment,
            augmentation_name=augmentation_name,
            num_shards=num_shards,
            shard_id=shard_id,
            loss_type=loss_type)
    else:
        train_gen = image_segmentation_generator(
            train_images,
            train_annotations,
            1,
            n_classes,
            input_height,
            input_width,
            output_height,
            output_width,
            deterministic,
            do_augment=do_augment,
            augmentation_name=augmentation_name,
            num_shards=num_shards,
            shard_id=shard_id,
            loss_type=loss_type)

        train_gen = cached_image_generator(train_gen, num_shards, shard_id,
                                           batch_size,
                                           len(os.listdir(train_images)),
                                           deterministic)

    callbacks = []

    if num_shards > 1:
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        callbacks.append(hvd.callbacks.MetricAverageCallback())

    callbacks.append(CheckpointsCallback(checkpoints_path))
    #if shard_id == 0:
    #    callbacks.append(ModelCheckpoint( self.checkpoints_path, monitor='loss', verbose=2, mode='min', save_best_only=True, save_weights_only=True))

    if model_dir is not None:
        hparams = {
            "model_name": model,
            "optimizer": optimizer_name,
            "batch_size": batch_size
        }

        if train_engine == 'hpu':
            callbacks += [
                TensorBoardWithHParamsV2(hparams,
                                         log_dir=model_dir,
                                         update_freq=5),
                ExamplesPerSecondKerasHookV2(5,
                                             batch_size=batch_size,
                                             output_dir=model_dir)
            ]

    if tb_location:
        tensorboard_callback = TensorBoard(log_dir=tb_location,
                                           histogram_freq=1)
        callbacks.append(tensorboard_callback)
        print("TB:", tb_location)

    if not validate:
        with dump_callback(dump_config):
            start_compilation = time.time()
            model.fit(train_gen, steps_per_epoch=1, epochs=1)
            stop_compilation = time.time()
            history = model.fit(train_gen,
                                steps_per_epoch=steps_per_epoch,
                                epochs=epochs,
                                callbacks=callbacks,
                                verbose=1 if shard_id == 0 else 0)
            stop_training = time.time()
        with open('./trainHistoryDict_' + str(shard_id), 'wb') as file_pi:
            pickle.dump(history.history, file_pi)
        avg_time_per_batch = (stop_training -
                              stop_compilation) / (steps_per_epoch * epochs)
        print('Compile time in seconds:',
              (stop_compilation - start_compilation))
        print('Average time per batch in seconds (leaving out compilation):',
              avg_time_per_batch)
        print('Average time per image in seconds (leaving out compilation)',
              avg_time_per_batch / batch_size)
        print('Average images per sec (leaving out compilation):',
              batch_size / avg_time_per_batch)

        if loss_type == 1:
            print('Eval for LOSS_FUNC_TYPE=1 is WIP')
            exit()

        if shard_id == 0:
            if not_cached:
                val_gen = image_segmentation_generator(val_images,
                                                       val_annotations,
                                                       batch_size,
                                                       n_classes,
                                                       input_height,
                                                       input_width,
                                                       output_height,
                                                       output_width,
                                                       deterministic,
                                                       num_shards=1,
                                                       shard_id=shard_id,
                                                       loss_type=loss_type)
            else:
                val_gen = image_segmentation_generator(val_images,
                                                       val_annotations,
                                                       1,
                                                       n_classes,
                                                       input_height,
                                                       input_width,
                                                       output_height,
                                                       output_width,
                                                       deterministic,
                                                       num_shards=1,
                                                       shard_id=shard_id,
                                                       loss_type=loss_type)
                val_gen = cached_image_generator(val_gen, 1, 0, batch_size,
                                                 len(os.listdir(val_images)))
            f1_metric = FBetaScore(num_classes=n_classes)
            model.compile(loss=model.loss,
                          metrics=[
                              tf.keras.metrics.CategoricalAccuracy(
                                  name="categorical_accuracy", dtype=None),
                              f1_metric
                          ])
            test_loss, test_acc, test_f1 = model.evaluate(
                val_gen, steps=(len(os.listdir(val_images)) // batch_size))
            train_loss, train_acc, train_f1 = model.evaluate(
                train_gen, steps=(len(os.listdir(train_images)) // batch_size))
            print(
                f'test loss : {test_loss}, test accuracy : {test_acc}, test f1 : {test_f1}'
            )
            print(
                f'train loss : {train_loss}, train accuracy : {train_acc}, train f1 : {train_f1}'
            )

    else:
        assert num_shards == 1, "Training with validation is only supported in a single-HPU setup"
        if not_cached:
            val_gen = image_segmentation_generator(val_images,
                                                   val_annotations,
                                                   batch_size,
                                                   n_classes,
                                                   input_height,
                                                   input_width,
                                                   output_height,
                                                   output_width,
                                                   deterministic,
                                                   num_shards=num_shards,
                                                   shard_id=shard_id,
                                                   loss_type=loss_type)
        else:
            val_gen = image_segmentation_generator(val_images,
                                                   val_annotations,
                                                   1,
                                                   n_classes,
                                                   input_height,
                                                   input_width,
                                                   output_height,
                                                   output_width,
                                                   deterministic,
                                                   num_shards=num_shards,
                                                   shard_id=shard_id,
                                                   loss_type=loss_type)
            val_gen = cached_image_generator(val_gen, num_shards, shard_id,
                                             batch_size,
                                             len(os.listdir(val_images)),
                                             deterministic)

        start_compilation = time.time()
        model.fit(train_gen, steps_per_epoch=1, epochs=1)
        stop_compilation = time.time()
        model.fit(train_gen,
                  steps_per_epoch=steps_per_epoch,
                  validation_data=val_gen,
                  validation_steps=val_steps_per_epoch,
                  epochs=epochs,
                  callbacks=callbacks,
                  use_multiprocessing=gen_use_multiprocessing,
                  verbose=1 if shard_id == 0 else 0)
        stop_training = time.time()
        avg_time_per_batch = (stop_training -
                              stop_compilation) / (steps_per_epoch * epochs)
        print('Compile time in seconds:',
              (stop_compilation - start_compilation))
        print('Average time per batch in seconds (leaving out compilation):',
              avg_time_per_batch)
        print('Average time per image in seconds (leaving out compilation)',
              avg_time_per_batch / batch_size)
Example #13
                    action='store_true',
                    help='disables evaluation')
parser.add_argument('--dump_config',
                    type=str,
                    default=None,
                    help='Side-by-side config file. Internal, do not use.')
params = parser.parse_args()

print(
    f"Using TF {tf.__version__}, datasets {datasets.__version__}, transformers {transformers.__version__}"
)

# Load Habana module in order to train on HPU (Gaudi)
if not params.no_hpu:
    from habana_frameworks.tensorflow import load_habana_module
    load_habana_module()

# Load dataset
assert os.path.exists(params.data_dir), (
    f'"{params.data_dir}" does not exist! Use "prepare_data.py" to create required data.'
)

train_ds = datasets.load_from_disk(
    os.path.join(params.data_dir, 'squad', 'train'))
valid_ds = datasets.load_from_disk(
    os.path.join(params.data_dir, 'squad', 'valid'))

print("Example data from the mapped dataset: \n", next(iter(train_ds)))

tf_train_ds = dataset.to_tf_dataset(train_ds)
tf_valid_ds = dataset.to_tf_dataset(valid_ds)
Example #14
def train_mnist(use_hpu: bool, batch_size: int, use_bfloat: bool, num_epochs: int):
    """ Train the distributed model on MNIST Dataset.
    """
    # Set TF_CONFIG.
    set_tf_config()

    # Instantiate the distributed strategy class.
    if use_hpu:
        # Optionally enable automatic bfloat16 operations conversion.
        if use_bfloat:
            os.environ["TF_BF16_CONVERSION"] = "full"
            print(
                f"TF_BF16_CONVERSION = {os.environ['TF_BF16_CONVERSION']}")

        # Load Habana device support.
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()

        # Use HPUStrategy (instead of MultiWorkerMirroredStrategy).
        from habana_frameworks.tensorflow.distribute import HPUStrategy
        strategy = HPUStrategy()
    else:
        strategy = tf.distribute.MultiWorkerMirroredStrategy()

    # Determine the total training batch size.
    batch_size_per_replica = batch_size
    total_batch_size = batch_size_per_replica * strategy.num_replicas_in_sync
    print(
        f"total_batch_size = {batch_size_per_replica} * {strategy.num_replicas_in_sync} workers = {total_batch_size}")

    # Load and preprocess the MNIST Dataset.
    # As tfds.load() may download the dataset if not cached, let the first worker do it first.
    for dataload_turn in range(2):
        if (dataload_turn == 0) == (worker_index == 0):
            print("Loading MNIST dataset...")
            datasets, info = tfds.load(
                name="mnist", with_info=True, as_supervised=True)
        MPI.COMM_WORLD.barrier()

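    # Scale pixel values to [0, 1] and cast labels to int32.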
    def preprocess(image, label):
        image = tf.cast(image, tf.float32) / 255.0
        label = tf.cast(label, tf.int32)
        return image, label

    train_dataset = datasets["train"]
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA
    train_dataset = train_dataset.with_options(options)
    train_dataset = train_dataset.map(
        preprocess).cache().shuffle(SHUFFLE_BUFFER_SIZE).batch(total_batch_size)

    test_dataset = datasets["test"]
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
    test_dataset = test_dataset.with_options(options)
    test_dataset = test_dataset.map(
        preprocess).batch(total_batch_size)

    # Create and compile the distributed CNN model.
    with strategy.scope():
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(
                32, 3, activation="relu", input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(10)
        ])

        model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                      optimizer=tf.keras.optimizers.Adam(),
                      metrics=["accuracy"])

    # Train the model.
    print("Calling model.fit()...")
    model.fit(train_dataset, epochs=num_epochs, verbose=2)
    print("Calling model.evaluate()...")
    eval_results = model.evaluate(test_dataset, verbose=2)
    print(f"Evaluation results: {eval_results}")
Example #15
def main(_):
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    params = train_utils.parse_configuration(FLAGS)

    if params.runtime.num_hpus > 0:
        import os
        #TODO: remove when SW-49334 is fixed [SW-49404]
        os.environ["TF_DISABLE_EAGER_TO_FUNC_REWRITER"] = "1"
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()

    if params.task.train_data.deterministic or params.task.validation_data.deterministic:
        import os
        os.environ['PYTHONHASHSEED'] = '0'
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        import numpy
        numpy.random.seed(0)
        import tensorflow as tf
        tf.random.set_seed(0)
        tf.compat.v1.set_random_seed(0)
        import random
        random.seed(0)

    if FLAGS.dtype == "bf16":
        print("Using bf16 config list {}".format(FLAGS.bf16_config_path))
        os.environ['TF_BF16_CONVERSION'] = FLAGS.bf16_config_path

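    # Gather the HLS host addresses and this process's MPI rank/size to configure the cluster.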
    hls_addresses = str(os.environ.get("MULTI_HLS_IPS",
                                       "127.0.0.1")).split(",")
    TF_BASE_PORT = 2410
    mpi_rank = comm_rank()
    mpi_size = comm_size()

    if params.runtime.num_hpus > 1:
        model_dir = os.path.join(FLAGS.model_dir, "worker_" + str(mpi_rank))
    else:
        model_dir = FLAGS.model_dir

    # Prepare a comma-separated list of device addresses.
    worker_list = []
    for address in hls_addresses:
        for rank in range(mpi_size // len(hls_addresses)):
            worker_list.append(address + ':' + str(TF_BASE_PORT + rank))
    worker_hosts = ",".join(worker_list)
    task_index = mpi_rank

    # Configures cluster spec for distribution strategy.
    distribution_utils.configure_cluster(worker_hosts, task_index)
    if 'train' in FLAGS.mode:
        # Pure eval modes do not output yaml files. Otherwise continuous eval job
        # may race against the train job for writing the same file.
        train_utils.serialize_config(params, model_dir)

    # Sets the mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
    # can have a significant impact on model speed by utilizing float16 on GPUs
    # and bfloat16 on TPUs. loss_scale takes effect only when dtype is float16.
    if params.runtime.mixed_precision_dtype:
        performance.set_mixed_precision_policy(
            params.runtime.mixed_precision_dtype)

    distribution_strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=params.runtime.distribution_strategy,
        all_reduce_alg=params.runtime.all_reduce_alg,
        num_gpus=params.runtime.num_gpus,
        num_hpus=params.runtime.num_hpus,
        tpu_address=params.runtime.tpu)

    with distribution_strategy.scope():
        task = task_factory.get_task(params.task, logging_dir=model_dir)

    train_lib.run_experiment(distribution_strategy=distribution_strategy,
                             task=task,
                             mode=FLAGS.mode,
                             params=params,
                             model_dir=model_dir)

    train_utils.save_gin_config(FLAGS.mode, model_dir)
Example #16
def main():
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument('--dataset', '--dataset_dir', metavar='PATH',
                        default=config.DEFAULT_DATASET_DIR, help='Dataset directory.')
    parser.add_argument('--optimizer', default='sgd',
                        choices=['sgd', 'adam', 'rmsprop'], help='Optimizer.')
    parser.add_argument('-d', '--dtype', default='fp32',
                        choices=['fp32', 'bf16'], help='Data type.')
    parser.add_argument('--batch_size', type=int,
                        default=32, help='Global batch size.')
    parser.add_argument('--lr_sched', default='WarmupCosine', choices=[
                        'linear', 'exp', 'steps', 'constant', 'WarmupCosine'], help='Learning rate scheduler.')
    parser.add_argument('--initial_lr', type=float,
                        default=6e-2, help='Initial learning rate.')
    parser.add_argument('--final_lr', type=float,
                        default=1e-5, help='Final learning rate.')
    parser.add_argument('--warmup_steps', type=int,
                        default=4000, help='Warmup steps.')
    parser.add_argument('--epochs', type=int, default=10,
                        help='Total number of epochs for training.')
    parser.add_argument('--steps_per_epoch', type=int,
                        help='Number of steps for training per epoch, overrides default value.')
    parser.add_argument('--validation_steps', type=int,
                        help='Number of steps for validation, overrides default value.')
    parser.add_argument('--model', default='ViT-B_16',
                        choices=['ViT-B_16', 'ViT-L_16', 'ViT-B_32', 'ViT-L_32'], help='Model.')
    parser.add_argument('--train_subset', default='train',
                        help='Pattern to detect train subset in dataset directory.')
    parser.add_argument('--val_subset', default='validation',
                        help='Pattern to detect validation subset in dataset directory.')
    parser.add_argument('--grad_accum_steps', type=int,
                        default=8, help='Gradient accumulation steps.')
    parser.add_argument('--resume_from_checkpoint_path',
                        metavar='PATH', help='Path to checkpoint to start from.')
    parser.add_argument('--resume_from_epoch', metavar='EPOCH_INDEX',
                        type=int, default=0, help='Initial epoch index.')
    parser.add_argument('--evaluate_checkpoint_path', metavar='PATH',
                        help='Checkpoint path for evaluating the model on --val_subset')
    parser.add_argument('--weights_path', metavar='PATH',
                        help='Path to weights cache directory. ~/.keras is used if not set.')
    parser.add_argument('--deterministic', action='store_true', default=False,
                        help='Enable deterministic behavior, this will also disable data augmentation. --seed must be set.')
    parser.add_argument('--seed', type=int,
                        help='Seed to be used by random functions.')
    parser.add_argument('--device', default='HPU',
                        choices=['CPU', 'HPU'], help='Device type.')
    parser.add_argument('--distributed', action='store_true',
                        default=False, help='Enable distributed training.')
    parser.add_argument('--base_tf_server_port', type=int,
                        default=7850, help='Rank 0 port used by tf.distribute.')
    parser.add_argument('--save_summary_steps', type=int, default=0,
                        help='Steps between saving summaries to TensorBoard.')
    parser.add_argument('--recipe_cache', default='/tmp/vit_recipe_cache',
                        help='Path to recipe cache directory. Set to empty to disable recipe cache. Externally set \'TF_RECIPE_CACHE_PATH\' will override this setting.')
    parser.add_argument(
        '--dump_config', help='Side-by-side config file. Internal, do not use.')
    args = parser.parse_args()

    if args.weights_path is not None:
        config.WEIGHTS_DIR = args.weights_path

    if args.dtype == 'bf16':
        tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')

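    # HPU setup: load the Habana module, substitute HabanaLayerNormalization, and manage the recipe cache.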
    if args.device == 'HPU':
        if args.distributed:
            os.environ['TF_HCCL_MEMORY_ALLOWANCE_MB'] = '500'
        from habana_frameworks.tensorflow import load_habana_module
        from habana_frameworks.tensorflow.ops.layer_norm import HabanaLayerNormalization
        load_habana_module()
        tf.keras.layers.LayerNormalization = HabanaLayerNormalization

        # Handle recipe caching.
        recipe_cache = args.recipe_cache
        if 'TF_RECIPE_CACHE_PATH' not in os.environ.keys() and recipe_cache:
            os.environ['TF_RECIPE_CACHE_PATH'] = recipe_cache

        # Clear previous recipe cache.
        if not args.distributed or comm_rank() == 0:
            if os.path.exists(recipe_cache) and os.path.isdir(recipe_cache):
                import shutil
                shutil.rmtree(recipe_cache)
        # Wait for rank 0 to remove cache.
        if args.distributed:
            from mpi4py import MPI
            MPI.COMM_WORLD.Barrier()

    # Handle determinism.
    config.DETERMINISTIC = args.deterministic
    config.SEED = args.seed
    if args.deterministic:
        assert args.seed is not None, "Deterministic behavior requires a seed to be set."
        tf.config.threading.set_inter_op_parallelism_threads(1)
        tf.config.threading.set_intra_op_parallelism_threads(1)
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        config.DATA_AUGMENTATION = False
    if args.seed is not None:
        random.seed(args.seed)
        np.random.seed(args.seed)
        tf.random.set_seed(args.seed)

    # Handle distribution strategy.
    if args.distributed:
        tf_distribute_config(args.base_tf_server_port)
        if args.device == 'HPU':
            os.environ['HBN_TF_REGISTER_DATASETOPS'] = '1'
            from habana_frameworks.tensorflow.distribute import HPUStrategy
            strategy = HPUStrategy()
        else:
            strategy = tf.distribute.MultiWorkerMirroredStrategy()
    else:
        strategy = tf.distribute.OneDeviceStrategy(f'device:{args.device}:0')

    if not args.distributed or comm_rank() == 0:
        print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

    num_classes = 1000
    batch_size = args.batch_size
    nb_epoch = args.epochs
    dataset = args.dataset
    resume_from_checkpoint_path = args.resume_from_checkpoint_path
    resume_from_epoch = args.resume_from_epoch
    optim_name = args.optimizer
    initial_lr = args.initial_lr
    final_lr = args.final_lr
    lr_sched = args.lr_sched
    warmup_steps = args.warmup_steps
    model_name = args.model
    grad_accum_steps = args.grad_accum_steps

    ds_train = get_dataset(dataset, args.train_subset, batch_size,
                           is_training=True, distributed=args.distributed)
    ds_valid = get_dataset(dataset, args.val_subset,
                           batch_size, False, distributed=args.distributed)

    if args.dump_config is not None:
        vit.CONFIG_B['dropout'] = 0.0
        vit.CONFIG_L['dropout'] = 0.0

    # Load our model
    with strategy.scope():
        image_size = 384
        model_builders = {
            'ViT-B_16': vit.vit_b16,
            'ViT-L_16': vit.vit_l16,
            'ViT-B_32': vit.vit_b32,
            'ViT-L_32': vit.vit_l32,
        }
        if model_name not in model_builders:
            print("Model is not supported, please use one of: "
                  "ViT-B_16, ViT-L_16, ViT-B_32, ViT-L_32")
            exit(1)
        model = model_builders[model_name](
            image_size=image_size,
            activation='softmax',
            pretrained=True,
            include_top=True,
            pretrained_top=False,
            classes=num_classes,
            weights="imagenet21k")

        optimizer = get_optimizer(
            optim_name, initial_lr, accumulation_steps=grad_accum_steps, epsilon=1e-2)
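        # get_optimizer is a local helper; judging by its accumulation_steps argument,
        # grad_accum_steps > 1 presumably accumulates gradients over several
        # micro-batches before each update, emulating a larger effective batch size.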
        model.compile(optimizer=optimizer, loss='categorical_crossentropy',
                      metrics=['accuracy'], run_eagerly=False)

        # Start training

        steps_per_epoch = 1281167 // batch_size
        if args.steps_per_epoch is not None:
            steps_per_epoch = args.steps_per_epoch
        validation_steps = 50000 // batch_size
        if args.validation_steps is not None:
            validation_steps = args.validation_steps
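        # 1281167 and 50000 are the ImageNet-1k train / validation image counts,
        # so the defaults cover the full dataset once per epoch at this batch size.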

        total_steps = nb_epoch * steps_per_epoch
        resume_step = resume_from_epoch * steps_per_epoch

        lrate = get_lr_func(nb_epoch, lr_sched, initial_lr,
                            final_lr, warmup_steps, resume_step, total_steps)

        save_name = model_name if not model_name.endswith('.h5') else \
            os.path.split(model_name)[-1].split('.')[0].split('-')[0]
        model_ckpt = tf.keras.callbacks.ModelCheckpoint(
            os.path.join(config.SAVE_DIR, save_name) + '-ckpt-{epoch:03d}.h5',
            monitor='train_loss')

        callbacks = [lrate, model_ckpt]
        if args.save_summary_steps > 0:
            callbacks += [TensorBoardWithHParamsV2(
                vars(args), log_dir=config.LOG_DIR, update_freq=args.save_summary_steps)]
            callbacks += [ExamplesPerSecondKerasHookV2(
                output_dir=config.LOG_DIR, every_n_steps=args.save_summary_steps, batch_size=args.batch_size)]

        if (args.evaluate_checkpoint_path is not None):
            model.load_weights(args.evaluate_checkpoint_path)
            results = model.evaluate(x=ds_valid, steps=validation_steps)
            print("Test loss, Test acc:", results)
            exit()

        if ((resume_from_epoch is not None) and (resume_from_checkpoint_path is not None)):
            model.load_weights(resume_from_checkpoint_path)

        with dump_callback(args.dump_config):
            model.fit(x=ds_train, y=None,
                      steps_per_epoch=steps_per_epoch,
                      callbacks=callbacks,
                      initial_epoch=resume_from_epoch,
                      epochs=nb_epoch,
                      shuffle=not args.deterministic,
                      verbose=1 if not args.distributed else comm_rank() == 0,
                      validation_data=(ds_valid, None),
                      validation_steps=validation_steps,
                      )

        if not args.distributed or comm_rank() == 0:
            model.save(f'{config.SAVE_DIR}/{save_name}-model-final.h5')
Exemple #17
0
def main(argv):
    del argv  # Unused.

    # If given an EfficientDet ckpt, don't use the default backbone ckpt.
    if FLAGS.backbone_ckpt == BACKBONE_CKPT_DEFAULT_DIR and FLAGS.ckpt is not None:
        print("Using ckpt flag: {}, ignoring default backbone_ckpt: {}".format(
            FLAGS.ckpt, FLAGS.backbone_ckpt))
        FLAGS.backbone_ckpt = None

    if FLAGS.use_horovod is not None:
        if FLAGS.dump_all_ranks:
            FLAGS.model_dir += "/worker_" + str(hvd.rank())
        if not 'HOROVOD_CYCLE_TIME' in os.environ:
            os.environ['HOROVOD_CYCLE_TIME'] = '0.5'
        if not 'HABANA_HCCL_COMM_API' in os.environ:
            os.environ['HABANA_HCCL_COMM_API'] = '0'
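        # HOROVOD_CYCLE_TIME is the Horovod background-loop cycle time in
        # milliseconds; both variables are only set here if they are not already
        # present in the environment.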
        hvd_init()

    if not FLAGS.no_hpu:
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()

        if FLAGS.use_horovod:
            assert (horovod_enabled())

    set_env(use_amp=FLAGS.use_amp)

    # deterministic setting
    if FLAGS.sbs_test or FLAGS.deterministic:
        set_deterministic()

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')
        if not FLAGS.val_json_file and not FLAGS.testdev_dir:
            raise RuntimeError(
                'You must specify --val_json_file or --testdev_dir for evaluation.'
            )

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)

    # The following is for spatial partitioning. `features` has one tensor, while
    # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`; see the partition logic below.
    # In the TPUEstimator context, `shard` and `replica` mean the same thing;
    # following the API, both terms are used interchangeably here.
    if FLAGS.use_spatial_partition:
        # Checks input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            raise RuntimeError(
                '--num_cores_per_replica must be a product of array '
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
        }
        # The Input Partition Logic: We partition only the partition-able tensors.
        # Spatial partition requires that the to-be-partitioned tensors must have a
        # dimension that is a multiple of `partition_dims`. Depending on the
        # `partition_dims` and the `image_size` and the `max_level` in config, some
        # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
        # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
        # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
        # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
        # case, the level-8 and level-9 target tensors are not partition-able, and
        # the highest partition-able level is 7.
        image_size = config.get('image_size')
        for level in range(config.get('min_level'),
                           config.get('max_level') + 1):

            def _can_partition(spatial_dim):
                partitionable_index = np.where(
                    spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
                return len(partitionable_index[0]) == len(
                    FLAGS.input_partition_dims)

            spatial_dim = image_size // (2**level)
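            # Worked example from the note above: with partition dims [1, 4, 2, 1],
            # image_size 1536 and level 8, spatial_dim = 1536 // 2**8 = 6, and
            # 6 % 4 != 0, so levels 8 and 9 fall back to unpartitioned labels.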
            if _can_partition(spatial_dim):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None
        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        if horovod_enabled():
            num_shards = hvd.size()
        else:
            num_shards = 1

    params = build_estimator_params('train', config, num_shards)
    # For the side-by-side test, disable input data scaling/flip manipulations.
    if FLAGS.sbs_test:
        sbs_params = dict(input_rand_hflip=False,
                          train_scale_min=1,
                          train_scale_max=1,
                          dropout_rate=0.0)
        params.update(sbs_params)

    tf_random_seed = 0 if FLAGS.deterministic else None
    run_config = build_estimator_config('train', config, num_shards,
                                        num_cores_per_replica,
                                        input_partition_dims)
    write_hparams_v1(FLAGS.model_dir, {
        'batch_size': FLAGS.train_batch_size,
        **FLAGS.flag_values_dict()
    })

    model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)

    # TPU Estimator
    logging.info(params)

    if FLAGS.mode == 'train':
        train_estimator = HorovodEstimator(model_fn=model_fn_instance,
                                           model_dir=FLAGS.model_dir,
                                           config=run_config,
                                           params=params)

        # For deterministic input, pass is_training=False so the dataloader does not randomly manipulate the input data.
        is_training = not FLAGS.deterministic
        use_fake_data = FLAGS.use_fake_data or FLAGS.deterministic

        input_fn = dataloader.InputReader(FLAGS.training_file_pattern,
                                          is_training=is_training,
                                          params=params,
                                          use_fake_data=use_fake_data,
                                          is_deterministic=FLAGS.deterministic)
        max_steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                        (FLAGS.train_batch_size * num_shards)) + 1
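        # Example with made-up numbers: num_epochs=1, num_examples_per_epoch=100,
        # train_batch_size=8, num_shards=1 gives int(100 / 8) + 1 = 13 steps.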

        # For the side-by-side (SBS) test, train under the SBS dump callback.
        if FLAGS.sbs_test:
            from TensorFlow.common.debug import dump_callback
            SBS_TEST_CONFIG = os.path.join(
                os.environ['TF_TESTS_ROOT'],
                "tests/tf_training_tests/side_by_side/topologies/efficientdet/dump_config.json"
            )
            with dump_callback(SBS_TEST_CONFIG):
                train_estimator.train(input_fn=input_fn, max_steps=max_steps)
        else:
            if FLAGS.ckpt is not None:
                train_estimator.train(input_fn=input_fn, steps=max_steps)
            else:
                train_estimator.train(input_fn=input_fn, max_steps=max_steps)

    elif FLAGS.mode == 'eval':
        eval_params = build_estimator_params('eval', config, num_shards)
        eval_config = build_estimator_config('eval', config, num_shards,
                                             num_cores_per_replica,
                                             input_partition_dims)

        # Eval only runs on CPU or GPU host with batch_size = 1.
        # Override the default options: disable randomization in the input pipeline
        # and don't run on the TPU.
        # Also, disable use_bfloat16 for eval on CPU/GPU.

        eval_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=False,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=eval_config,
            params=eval_params)

        def terminate_eval():
            logging.info('Terminating eval after %d seconds of no checkpoints',
                         FLAGS.eval_timeout)
            return True

        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):

            logging.info('Starting to evaluate.')
            try:
                eval_results = eval_estimator.evaluate(
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
                logging.info('Eval results: %s', eval_results)

                # Terminate eval job when final checkpoint is reached.
                try:
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                except IndexError:
                    logging.info('%s has no global step info: stop!', ckpt)
                    break

                write_summary(eval_results, ckpt, current_step)

                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    logging.info('Evaluation finished after training step %d',
                                 current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

    elif FLAGS.mode == 'train_and_eval':
        train_params = build_estimator_params('train', config, num_shards)
        train_config = build_estimator_config('train', config, num_shards,
                                              num_cores_per_replica,
                                              input_partition_dims)
        train_estimator = HorovodEstimator(model_fn=model_fn_instance,
                                           model_dir=FLAGS.model_dir,
                                           config=train_config,
                                           params=train_params)

        eval_estimator = None

        for cycle in range(FLAGS.num_epochs):
            logging.info('Starting training cycle, epoch: %d.', cycle)

            train_estimator.train(
                input_fn=dataloader.InputReader(
                    FLAGS.training_file_pattern,
                    is_training=True,
                    use_fake_data=FLAGS.use_fake_data),
                max_steps=(cycle + 1) *
                int(FLAGS.num_examples_per_epoch / FLAGS.train_batch_size))
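            # max_steps is cumulative: the estimator resumes from the latest
            # checkpoint, so each cycle trains exactly one additional epoch.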

            # synchronization point for all ranks
            if horovod_enabled():
                hvd.allreduce(tf.constant(0))

            logging.info('Starting evaluation cycle, epoch: %d.', cycle)
            # Run evaluation after every epoch.

            if eval_estimator is None:
                eval_params = build_estimator_params('eval', config,
                                                     num_shards)
                eval_config = build_estimator_config('eval', config,
                                                     num_shards,
                                                     num_cores_per_replica,
                                                     input_partition_dims)
                eval_estimator = tf.estimator.tpu.TPUEstimator(
                    model_fn=model_fn_instance,
                    use_tpu=False,
                    train_batch_size=FLAGS.train_batch_size,
                    eval_batch_size=FLAGS.eval_batch_size,
                    config=eval_config,
                    params=eval_params)

            if is_rank0():
                eval_results = eval_estimator.evaluate(
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    steps=FLAGS.eval_samples // FLAGS.eval_batch_size)

                checkpoint_path = Path(FLAGS.model_dir)
                last_ckpt = tf.train.latest_checkpoint(str(checkpoint_path),
                                                       latest_filename=None)
                current_step = int(os.path.basename(last_ckpt).split('-')[1])
                write_summary(eval_results, FLAGS.model_dir, current_step)
                logging.info('Evaluation results: %s', eval_results)

                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

    else:
        logging.info('Mode %s not found.', FLAGS.mode)
Exemple #18
0
def setup_module():
    htf.load_habana_module()
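    # pytest invokes setup_module() once per test module, so the Habana device is
    # registered before any test in this file runs.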
Exemple #19
0
def main():
    parser = CycleGANArgParser(is_demo=False)
    args = parser.parse_args()
    if not args.no_hpu:
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()
        if args.habana_instance_norm:
            tfa.layers.InstanceNormalization = HabanaInstanceNormalization
        if args.data_type == 'bf16':
            tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
    if args.run_deterministic:
        tf.random.set_seed(12345)
    input_image_shape = (args.crop, args.crop, 3)
    input_transformation = TrasformInputs(orig_img_size=(
        args.resize, args.resize), input_img_size=(args.crop, args.crop))

    horovod = None
    if args.use_horovod:
        from TensorFlow.common.horovod_helpers import hvd as horovod
        horovod.init()
        if args.log_all_workers:
            args.logdir = os.path.join(args.logdir, f"worker_{horovod.rank()}")

    tfds.disable_progress_bar()
    # Load the horse-zebra dataset using tensorflow-datasets.
    if is_local_master(args.use_horovod, horovod):
        dataset, _ = tfds.load("cycle_gan/horse2zebra", data_dir=args.dataset_dir,
                               with_info=True, as_supervised=True, download=True)
        if args.use_horovod:
            horovod.broadcast(0, 0)  # node synchronization barrier
    else:
        if args.use_horovod:
            horovod.broadcast(0, 0)
        dataset, _ = tfds.load(
            "cycle_gan/horse2zebra", data_dir=args.dataset_dir, with_info=True, as_supervised=True)

    train_horses, train_zebras = dataset["trainA"], dataset["trainB"]
    test_horses, test_zebras = dataset["testA"], dataset["testB"]

    # Apply the preprocessing operations to the training data
    train_horses = (
        train_horses.map(
            input_transformation.preprocess_train_image, num_parallel_calls=1 if args.run_deterministic else autotune)
        .cache()
        .shuffle(args.buffer)
        .batch(args.batch_size, drop_remainder=True)
    )
    train_zebras = (
        train_zebras.map(
            input_transformation.preprocess_train_image, num_parallel_calls=1 if args.run_deterministic else autotune)
        .cache()
        .shuffle(args.buffer)
        .batch(args.batch_size, drop_remainder=True)
    )
    train_ds = tf.data.Dataset.zip((train_horses, train_zebras))
    test_ds = test_horses, test_zebras
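    # CycleGAN trains on unpaired data: Dataset.zip pairs a horse batch with a
    # zebra batch positionally, and the independent per-epoch shuffles make the
    # pairing effectively arbitrary.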

    disc_X = get_discriminator(input_image_shape, name="discriminator_X")
    disc_Y = get_discriminator(input_image_shape, name="discriminator_Y")
    gen_X = get_resnet_generator(input_image_shape, name="generator_X")
    gen_Y = get_resnet_generator(input_image_shape, name="generator_Y")

    # Create cycle gan model
    cycle_gan_model = CycleGan(
        generator_X=gen_X, generator_Y=gen_Y, discriminator_X=disc_X, discriminator_Y=disc_Y
    )

    latest = None
    if args.restore:
        print(f"Trying to restore checkpoint from {args.logdir}")
        latest = tf.train.latest_checkpoint(args.logdir)

    if args.train:
        train(args, cycle_gan_model, train_ds, test_ds, latest, horovod)

    if args.test and is_master(args.use_horovod, horovod):
        eval(args, cycle_gan_model, test_ds, input_transformation, latest)