コード例 #1
0
def main(argv):
    parser = MNISTArgParser()
    flags = parser.parse_args(args=argv[1:])

    model_function = my_model

    if flags.multi_gpu:
        validate_batch_size_for_multi_gpu(flags.batch_size)

        # There are two steps required if using multi-GPU: (1) wrap the model_fn,
        # and (2) wrap the optimizer. The first happens here, and (2) happens
        # in the model_fn itself when the optimizer is defined.
        model_function = tf.contrib.estimator.replicate_model_fn(
            model_fn, loss_reduction=tf.losses.Reduction.MEAN)

    data_format = flags.data_format
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    mnist_classifier = tf.estimator.Estimator(model_fn=model_function,
                                              model_dir=flags.model_dir,
                                              params={
                                                  'data_format': data_format,
                                                  'multi_gpu': flags.multi_gpu
                                              })

    # Set up training and evaluation input functions.
    def train_input_fn():
        """Prepare data for training."""

        # When choosing shuffle buffer sizes, larger sizes result in better
        # randomness, while smaller sizes use less memory. MNIST is a small
        # enough dataset that we can easily shuffle the full epoch.
        ds = dataset.train(flags.data_dir)
        ds = ds.cache().shuffle(buffer_size=50000).batch(flags.batch_size)

        # Iterate through the dataset a set number (`epochs_between_evals`) of times
        # during each training session.
        ds = ds.repeat(flags.epochs_between_evals)
        return ds

    def eval_input_fn():
        return dataset.test(flags.data_dir).batch(
            flags.batch_size).make_one_shot_iterator().get_next()

    # Set up hook that outputs training logs every 100 steps.
    train_hooks = hooks_helper.get_train_hooks(flags.hooks,
                                               batch_size=flags.batch_size)

    # Train and evaluate model.
    for _ in range(flags.train_epochs // flags.epochs_between_evals):
        mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
        eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
        print('\nEvaluation results:\n\t%s\n' % eval_results)

    # Export the model
    if flags.export_dir is not None:
        image = tf.placeholder(tf.float32, [None, 28, 28])
        input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
            'image':
            image,
        })
        mnist_classifier.export_savedmodel(flags.export_dir, input_fn)
def resnet_main(flags, model_function, input_function, shape=None):
    """Shared main loop for ResNet Models.

    Args:
      flags: FLAGS object that contains the params for running. See
        ResnetArgParser for created flags.
      model_function: the function that instantiates the Model and builds the
        ops for train/eval. This will be passed directly into the estimator.
      input_function: the function that processes the dataset and returns a
        dataset that the estimator can train on. This will be wrapped with
        all the relevant flags for running and passed to estimator.
      shape: list of ints representing the shape of the images used for training.
        This is only used if flags.export_dir is passed.
    """

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    if flags.multi_gpu:
        validate_batch_size_for_multi_gpu(flags.batch_size)

        # There are two steps required if using multi-GPU: (1) wrap the model_fn,
        # and (2) wrap the optimizer. The first happens here, and (2) happens
        # in the model_fn itself when the optimizer is defined.
        model_function = tf.contrib.estimator.replicate_model_fn(
            model_function,
            loss_reduction=tf.losses.Reduction.MEAN)

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
        allow_soft_placement=True)

    # Set up a RunConfig to save checkpoint and set session config.
    run_config = tf.estimator.RunConfig().replace(save_checkpoints_secs=1e9,
                                                  session_config=session_config)
    classifier = tf.estimator.Estimator(
        model_fn=model_function, model_dir=flags.model_dir, config=run_config,
        params={
            'resnet_size': flags.resnet_size,
            'data_format': flags.data_format,
            'batch_size': flags.batch_size,
            'multi_gpu': flags.multi_gpu,
            'version': flags.version,
        })

    if flags.benchmark_log_dir is not None:
        benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
        benchmark_logger.log_run_info("resnet")
    else:
        benchmark_logger = None

    for _ in range(flags.train_epochs // flags.epochs_between_evals):
        train_hooks = hooks_helper.get_train_hooks(
            flags.hooks,
            batch_size=flags.batch_size,
            benchmark_log_dir=flags.benchmark_log_dir)

        print('Starting a training cycle.')

        def input_fn_train():
            return input_function(True, flags.no_lmk, flags.data_dir, flags.batch_size,
                                  flags.epochs_between_evals,
                                  flags.num_parallel_calls, flags.multi_gpu)

        classifier.train(input_fn=input_fn_train, hooks=train_hooks,
                         max_steps=flags.max_train_steps)

        print('Starting to evaluate.')
        # Evaluate the model and print results

        def input_fn_eval():
            return input_function(False, flags.no_lmk, flags.data_dir, flags.batch_size,
                                  1, flags.num_parallel_calls, flags.multi_gpu)

        # flags.max_train_steps is generally associated with testing and profiling.
        # As a result it is frequently called with synthetic data, which will
        # iterate forever. Passing steps=flags.max_train_steps allows the eval
        # (which is generally unimportant in those circumstances) to terminate.
        # Note that eval will run for max_train_steps each loop, regardless of the
        # global_step count.
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=flags.max_train_steps)
        print(eval_results)

        if benchmark_logger:
            benchmark_logger.log_estimator_evaluation_result(eval_results)

    if flags.export_dir is not None:
        warn_on_multi_gpu_export(flags.multi_gpu)

        # Exports a saved model for the given classifier.
        input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
            shape, batch_size=flags.batch_size)
        classifier.export_savedmodel(flags.export_dir, input_receiver_fn)
コード例 #3
0
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  if not params["use_tpu"]:
    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards
  )

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
  )
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Train and evaluate transformer model
  estimator = construct_estimator(flags_obj, params, schedule_manager)
  run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)

  if flags_obj.export_dir and not params["use_tpu"]:
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export saved model, and save the vocab file as an extra asset. The vocab
    # file is saved to allow consistent input encoding and output decoding.
    # (See the "Export trained model" section in the README for an example of
    # how to use the vocab file.)
    # Since the model itself does not use the vocab file, this file is saved as
    # an extra asset rather than a core asset.
    estimator.export_savedmodel(
        flags_obj.export_dir, serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file})
コード例 #4
0
def resnet_main(flags_obj,
                model_function,
                input_function,
                dataset_name,
                shape=None):
    """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.
  """

    model_helpers.apply_clean(flags.FLAGS)

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
        allow_soft_placement=True)

    distribution_strategy = None
    if flags_obj.distribution_strategy == 'ps':
        print('==> Using ParameterServerStrategy')
        distribution_strategy = tf.contrib.distribute.ParameterServerStrategy(
            num_gpus_per_worker=flags_core.get_num_gpus(flags_obj))
    elif flags_obj.distribution_strategy == 'allreduce':
        print('==> Using CollectiveAllReduceStrategy')
        distribution_strategy = tf.contrib.distribute.CollectiveAllReduceStrategy(
            num_gpus_per_worker=flags_core.get_num_gpus(flags_obj))
    elif flags_obj.distribution_strategy == 'mirror':
        print('==> Using MirroredStrategy')
        distribution_strategy = tf.contrib.distribute.MirroredStrategy(
            num_gpus_per_worker=flags_core.get_num_gpus(flags_obj))
    else:
        print("==> Distribution Strategy {} is not valid".format(
            flags_obj.distribution_strategy))

    run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy,
                                        session_config=session_config,
                                        protocol="grpc+verbs",
                                        log_step_count_steps=1000)

    tf.logging.info("num_worker={}, batch_size={}, train_epochs={}".format(
        run_config.num_worker_replicas, flags_obj.batch_size,
        flags_obj.train_epochs))
    # initialize our model with all but the dense layer from pretrained resnet
    if flags_obj.pretrained_model_checkpoint_path is not None:
        warm_start_settings = tf.estimator.WarmStartSettings(
            flags_obj.pretrained_model_checkpoint_path,
            vars_to_warm_start='^(?!.*dense)')
    else:
        warm_start_settings = None

    classifier = tf.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags_obj.model_dir,
        config=run_config,
        warm_start_from=warm_start_settings,
        params={
            'resnet_size': int(flags_obj.resnet_size),
            'data_format': flags_obj.data_format,
            'batch_size': flags_obj.batch_size,
            'resnet_version': int(flags_obj.resnet_version),
            'loss_scale': flags_core.get_loss_scale(flags_obj),
            'dtype': flags_core.get_tf_dtype(flags_obj),
            'fine_tune': flags_obj.fine_tune,
            'num_workers': run_config.num_worker_replicas
        })

    run_params = {
        'batch_size': flags_obj.batch_size * run_config.num_worker_replicas,
        'dtype': flags_core.get_tf_dtype(flags_obj),
        'resnet_size': flags_obj.resnet_size,
        'resnet_version': flags_obj.resnet_version,
        'synthetic_data': flags_obj.use_synthetic_data,
        'train_epochs': flags_obj.train_epochs,
    }
    if flags_obj.use_synthetic_data:
        dataset_name = dataset_name + '-synthetic'

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info('resnet',
                                  dataset_name,
                                  run_params,
                                  test_id=flags_obj.benchmark_test_id)

    train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                               model_dir=flags_obj.model_dir,
                                               batch_size=flags_obj.batch_size)

    def input_fn_train():
        return input_function(
            is_training=True,
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_device_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=flags_obj.train_epochs,
            num_gpus=flags_core.get_num_gpus(flags_obj),
            dtype=flags_core.get_tf_dtype(flags_obj))

    def input_fn_eval():
        return input_function(
            is_training=False,
            data_dir=flags_obj.data_dir,
            batch_size=distribution_utils.per_device_batch_size(
                flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
            num_epochs=1,
            dtype=flags_core.get_tf_dtype(flags_obj))

    train_spec = tf.estimator.TrainSpec(input_fn=input_fn_train,
                                        hooks=train_hooks)
    eval_spec = tf.estimator.EvalSpec(input_fn=input_fn_eval,
                                      throttle_secs=1800,
                                      steps=None,
                                      start_delay_secs=10)
    if flags_obj.eval == 0:
        tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
    else:
        while True:
            eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                               steps=50000 //
                                               flags_obj.batch_size)
            time.sleep(flags_obj.eval)
    '''
  if flags_obj.eval_only or not flags_obj.train_epochs:
    # If --eval_only is set, perform a single loop with zero train epochs.
    schedule, n_loops = [0], 1
  else:
    # Compute the number of times to loop while training. All but the last
    # pass will train for `epochs_between_evals` epochs, while the last will
    # train for the number needed to reach `training_epochs`. For instance if
    #   train_epochs = 25 and epochs_between_evals = 10
    # schedule will be set to [10, 10, 5]. That is to say, the loop will:
    #   Train for 10 epochs and then evaluate.
    #   Train for another 10 epochs and then evaluate.
    #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
    n_loops = math.ceil(flags_obj.train_epochs / flags_obj.epochs_between_evals)
    schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))]
    schedule[-1] = flags_obj.train_epochs - sum(schedule[:-1])  # over counting.

  for cycle_index, num_train_epochs in enumerate(schedule):
    tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops))

    if num_train_epochs:
      classifier.train(input_fn=lambda: input_fn_train(num_train_epochs),
                       hooks=train_hooks, max_steps=flags_obj.max_train_steps)

    tf.logging.info('Starting to evaluate.')

    # flags_obj.max_train_steps is generally associated with testing and
    # profiling. As a result it is frequently called with synthetic data, which
    # will iterate forever. Passing steps=flags_obj.max_train_steps allows the
    # eval (which is generally unimportant in those circumstances) to terminate.
    # Note that eval will run for max_train_steps each loop, regardless of the
    # global_step count.
    eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                       steps=flags_obj.max_train_steps)

    benchmark_logger.log_evaluation_result(eval_results)

    if model_helpers.past_stop_threshold(
        flags_obj.stop_threshold, eval_results['accuracy']):
      break

  '''
    if flags_obj.export_dir is not None:
        # Exports a saved model for the given classifier.
        if flags_obj.image_bytes_as_serving_input:
            input_receiver_fn = functools.partial(image_bytes_serving_input_fn,
                                                  shape)
        else:
            input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
                shape, batch_size=flags_obj.batch_size)
        classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)