コード例 #1
0
ファイル: training_v2.py プロジェクト: hooman67/tensorflow
  def fit(
      self, model, x=None, y=None, batch_size=None, epochs=1, verbose=1,
      callbacks=None, validation_split=0., validation_data=None, shuffle=True,
      class_weight=None, sample_weight=None, initial_epoch=0,
      steps_per_epoch=None, validation_steps=None, validation_freq=1,
      max_queue_size=10, workers=1, use_multiprocessing=False, **kwargs):
    batch_size = model._validate_or_infer_batch_size(
        batch_size, steps_per_epoch, x)

    strategy = _get_distribution_strategy(model)
    batch_size, steps_per_epoch = dist_utils.process_batch_and_step_size(
        strategy,
        x,
        batch_size,
        steps_per_epoch,
        ModeKeys.TRAIN,
        validation_split=validation_split)
    dist_utils.validate_callbacks(input_callbacks=callbacks,
                                  optimizer=model.optimizer)
    # Enter tf.distribute.Strategy scope.
    with strategy.scope():
      training_data_adapter, validation_adapter = _process_training_inputs(
          model,
          x,
          y,
          batch_size=batch_size,
          epochs=epochs,
          sample_weights=sample_weight,
          class_weights=class_weight,
          validation_split=validation_split,
          steps_per_epoch=steps_per_epoch,
          shuffle=shuffle,
          validation_data=validation_data,
          validation_steps=validation_steps,
          distribution_strategy=strategy,
          max_queue_size=max_queue_size,
          workers=workers,
          use_multiprocessing=use_multiprocessing)

      total_samples = _get_total_number_of_samples(training_data_adapter)
      use_sample = total_samples is not None
      do_validation = (validation_adapter is not None)

      recreate_training_iterator = (
          training_data_adapter.should_recreate_iterator(steps_per_epoch))
      if not steps_per_epoch:
        # TODO(b/139762795): Add step inference for when steps is None to
        # prevent end of sequence warning message.
        steps_per_epoch = training_data_adapter.get_size()

      # tf.print('{} on {} steps.'.format(ModeKeys.TRAIN, steps_per_epoch))
      training_context = TrainingContext()

      training_dataset = training_data_adapter.get_dataset()
      # Raise an error if steps_per_epoch isn't specified but the dataset
      # is infinite.
      # TODO(scottzhu): This check should probably happen in the adapter
      inferred_steps = training_utils.infer_steps_for_dataset(
          model,
          training_dataset,
          steps_per_epoch,
          steps_name='steps_per_epoch',
          epochs=0)

      steps_per_epoch = (
          inferred_steps if steps_per_epoch is None else steps_per_epoch)

      training_dataset = strategy.experimental_distribute_dataset(
          training_dataset)

      training_function = training_v2_utils._get_or_make_execution_function(
          model, ModeKeys.TRAIN)

      training_data_iter = None
      if do_validation:
        validation_dataset = validation_adapter.get_dataset()
        if not validation_steps:
          # Raise an error if validation_steps isn't specified but the
          # validation dataset is infinite.
          validation_steps = (
              validation_adapter.get_size() or
              training_utils.infer_steps_for_dataset(
                  model,
                  validation_dataset,
                  validation_steps,
                  steps_name='validation_steps'))
        eval_function = training_v2_utils._get_or_make_execution_function(
            model, ModeKeys.TEST)
        eval_data_iter = None
        validation_dataset = strategy.experimental_distribute_dataset(
            validation_dataset)
        val_total_samples = _get_total_number_of_samples(validation_adapter)
      else:
        val_total_samples = None

      if verbose and (total_samples or steps_per_epoch):
        _print_train_info(total_samples, steps_per_epoch, val_total_samples,
                          validation_steps)

      training_callbacks = cbks.configure_callbacks(
          callbacks,
          model,
          do_validation=do_validation,
          batch_size=batch_size,
          epochs=epochs,
          steps_per_epoch=steps_per_epoch,
          samples=total_samples or steps_per_epoch,
          count_mode='samples' if use_sample else 'steps',
          verbose=0,  # Handle ProgBarLogger separately in this loop.
          mode=ModeKeys.TRAIN)

      with training_context.on_start(model, training_callbacks, use_sample,
                                     verbose, ModeKeys.TRAIN):

        initial_epoch = model._maybe_load_initial_epoch_from_ckpt(
            initial_epoch, ModeKeys.TRAIN)

        for epoch in range(initial_epoch, epochs):
          if training_context.callbacks.model.stop_training:
            break

          # Training
          with training_context.on_epoch(epoch, ModeKeys.TRAIN) as epoch_logs:
            model.reset_metrics()
            if training_data_iter is None or recreate_training_iterator:
              if (training_data_iter is not None and
                  distribution_strategy_context.has_strategy()):
                # TODO(kaftan): remove this when MultiDeviceIterator is a
                ## compositetensor (unless this is more efficient)
                training_data_iter._initializer  # pylint: disable=pointless-statement
              else:
                training_data_iter = iter(training_dataset)

            training_result = run_one_epoch(
                model,
                training_data_iter,
                training_function,
                dataset_size=training_data_adapter.get_size(),
                batch_size=training_data_adapter.batch_size(),
                strategy=strategy,
                steps_per_epoch=steps_per_epoch,
                num_samples=total_samples,
                mode=ModeKeys.TRAIN,
                training_context=training_context,
                total_epochs=epochs)
            cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)

            # In the case of steps_per_epoch = None, the final cardinality will
            # be determined when the inputs are fully consumed (eg dataset or
            # generator). Update the steps_per_epoch to the new value.
            if (steps_per_epoch is None
                and training_context.progbar.progbar.target is not None):
              steps_per_epoch = training_context.progbar.progbar.target

            # Evaluation
            if (do_validation and
                training_utils.should_run_validation(validation_freq, epoch) and
                not training_callbacks.model.stop_training):
              if (eval_data_iter is not None and
                  distribution_strategy_context.has_strategy()):
                # TODO(kaftan): remove this when MultiDeviceIterator is a
                ## compositetensor (unless this is more efficient)
                eval_data_iter._initializer  # pylint: disable=pointless-statement
              else:
                eval_data_iter = iter(validation_dataset)

              validation_callbacks = cbks.configure_callbacks(
                  training_callbacks,
                  model,
                  batch_size=batch_size,
                  epochs=1,
                  steps_per_epoch=validation_steps,
                  samples=val_total_samples or validation_steps,
                  count_mode='samples' if use_sample else 'steps',
                  verbose=0,  # Handle ProgBarLogger separately in this loop.
                  mode=ModeKeys.TEST)

              eval_context = TrainingContext()
              with eval_context.on_start(
                  model,
                  validation_callbacks,
                  use_sample,
                  verbose=0,
                  mode=ModeKeys.TEST):
                with eval_context.on_epoch(epoch, ModeKeys.TEST):
                  model.reset_metrics()
                  eval_result = run_one_epoch(
                      model,
                      eval_data_iter,
                      eval_function,
                      dataset_size=validation_adapter.get_size(),
                      batch_size=validation_adapter.batch_size(),
                      strategy=strategy,
                      steps_per_epoch=validation_steps,
                      num_samples=val_total_samples,
                      mode=ModeKeys.TEST,
                      training_context=eval_context,
                      total_epochs=1)
                  cbks.make_logs(model, epoch_logs, eval_result, ModeKeys.TEST,
                                 prefix='val_')

    return model.history
コード例 #2
0
def model_iteration(model,
                    data,
                    steps_per_epoch=None,
                    epochs=1,
                    verbose=1,
                    callbacks=None,
                    validation_data=None,
                    validation_steps=None,
                    validation_freq=1,
                    class_weight=None,
                    max_queue_size=10,
                    workers=1,
                    use_multiprocessing=False,
                    shuffle=False,
                    initial_epoch=0,
                    mode=ModeKeys.TRAIN,
                    batch_size=None,
                    steps_name='steps',
                    **kwargs):
  """Loop function for arrays of data with modes TRAIN/TEST/PREDICT.

  Arguments:
      model: Keras Model instance.
      data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or `(x, y)` or
        `(x, y, sample_weights)`) or a generator or
        `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset.
      steps_per_epoch: Total number of steps (batches of samples) before
        declaring one epoch finished and starting the next epoch. Ignored with
        the default value of `None`.
      epochs: Number of times to iterate over the data.
      verbose: Verbosity mode, 0, 1 or 2.
      callbacks: List of callbacks to be called during training.
      validation_data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or
        `(x, y)` or `(x, y, sample_weights)`) or a generator or
        `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset.
      validation_steps: Total number of steps (batches of samples) before
        declaring validation finished.
      validation_freq: Only relevant if validation data is provided. Integer or
        `collections.Container` instance (e.g. list, tuple, etc.). If an
        integer, specifies how many training epochs to run before a new
        validation run is performed, e.g. `validation_freq=2` runs
        validation every 2 epochs. If a Container, specifies the epochs on
        which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
        validation at the end of the 1st, 2nd, and 10th epochs.
      class_weight: Dictionary mapping class indices to a weight for the class.
      max_queue_size: Integer. Maximum size for the generator queue. If
        unspecified, `max_queue_size` will default to 10.
      workers: Integer. Maximum number of processes to spin up when using
        process-based threading. If unspecified, `workers` will default to 1. If
        0, will execute the generator on the main thread.
      use_multiprocessing: Boolean. If `True`, use process-based threading. If
        unspecified, `use_multiprocessing` will default to `False`. Note that
        because this implementation relies on multiprocessing, you should not
        pass non-picklable arguments to the generator as they can't be passed
        easily to children processes.
      shuffle: Boolean. Whether to shuffle the order of the batches at the
        beginning of each epoch. Only used with instances of `Sequence`
        (`keras.utils.Sequence`). Has no effect when `steps_per_epoch` is not
        `None`.
      initial_epoch: Epoch at which to start training (useful for resuming a
        previous training run).
      mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
      batch_size: Integer batch size or None if unknown. Will only be used if
        `data` is in NumPy/Tensor format.
      steps_name: The string name of the steps argument, either `steps`,
        `validation_steps`, or `steps_per_epoch`. Only used for error message
        formatting.
      **kwargs: Additional arguments for backwards compatibility. `steps` is
        accepted as an alias for `steps_per_epoch`.

  Returns:
      - In TRAIN mode: `History` object.
      - In TEST mode: Evaluation metrics.
      - In PREDICT mode: Outputs of the Model called on inputs.

  Raises:
      ValueError: in case of invalid arguments.
  """
  if 'steps' in kwargs:
    steps_per_epoch = kwargs['steps']

  # Determine the number of steps per epoch and whether we should reset the
  # dataset at the end of each epoch.
  reset_dataset_after_each_epoch = False
  original_dataset = None
  is_dataset = isinstance(data, (dataset_ops.DatasetV2, dataset_ops.DatasetV1))
  if is_dataset:
    original_dataset = data
    if steps_per_epoch is None:
      reset_dataset_after_each_epoch = True
      steps_per_epoch = training_utils.infer_steps_for_dataset(
          data, steps_per_epoch, epochs=epochs, steps_name=steps_name)

  # Convert to a format that supports `next(generator)`.
  generator, steps_per_epoch = convert_to_generator_like(
      data,
      steps_per_epoch=steps_per_epoch,
      batch_size=batch_size,
      epochs=epochs - initial_epoch,
      shuffle=shuffle)

  do_validation = validation_data is not None
  is_sequence = isinstance(generator, data_utils.Sequence)
  _validate_arguments(is_sequence, is_dataset, use_multiprocessing, workers,
                      steps_per_epoch, validation_data, validation_steps, mode,
                      kwargs)

  batch_function = _make_execution_function(
      model, mode, class_weight=class_weight)

  # Create the queue for the generator.
  enqueuer = None
  if not is_dataset:
    generator, enqueuer = _make_enqueued_generator(
        generator,
        workers=workers,
        use_multiprocessing=use_multiprocessing,
        max_queue_size=max_queue_size,
        shuffle=shuffle)

  num_samples_or_steps, use_steps = _get_num_samples_or_steps(
      data, steps_per_epoch)

  count_mode = 'steps' if use_steps else 'samples'
  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      batch_size=batch_size,
      samples=num_samples_or_steps,
      verbose=0,  # Handle ProgBar as part of Callbacks once hooks are ready.
      mode=mode)
  # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready.
  progbar = training_utils.get_progbar(model, count_mode)
  progbar.params = callbacks.params
  progbar.params['verbose'] = verbose

  if mode == ModeKeys.PREDICT:
    aggregator = training_utils.OutputsAggregator(True, steps_per_epoch)
  else:
    aggregator = training_utils.MetricsAggregator(True, steps_per_epoch)

  should_set_learning_phase = context.executing_eagerly() and model.run_eagerly
  if should_set_learning_phase:
    old_learning_phase = backend.learning_phase()
    backend.set_eager_learning_phase(1 if mode == ModeKeys.TRAIN else 0)

  callbacks.model.stop_training = False
  callbacks._call_begin_hook(mode)
  progbar.on_train_begin()

  initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode)

  for epoch in range(initial_epoch, epochs):
    if callbacks.model.stop_training:
      break

    # Setup work for each epoch.
    model.reset_metrics()
    epoch_logs = {}
    if mode == ModeKeys.TRAIN:
      callbacks.on_epoch_begin(epoch, epoch_logs)
    progbar.on_epoch_begin(epoch, epoch_logs)

    if steps_per_epoch is None:
      # Loop over dataset until `OutOfRangeError` is raised.
      target_steps = np.inf
    else:
      # Loop over dataset for the specified number of steps.
      target_steps = steps_per_epoch

    step = 0
    while step < target_steps:
      batch_data = _get_next_batch(generator, mode)
      if batch_data is None:
        if is_dataset:
          # The dataset passed by the user ran out of batches.
          # Now we know the cardinality of the dataset.
          # If steps_per_epoch was specified, then running out of data is
          # unexpected, so we stop training and inform the user.
          if steps_per_epoch:
            callbacks.model.stop_training = True
            logging.warning(
                'Your dataset ran out of data; interrupting training. '
                'Make sure that your dataset can generate at least '
                '`%s * epochs` batches (in this case, %d batches). '
                'You may need to use the repeat() function when '
                'building your dataset.'
                % (steps_name, steps_per_epoch * epochs))
          elif step > 0:
            steps_per_epoch = step
            aggregator.num_samples_or_steps = steps_per_epoch
            if mode == ModeKeys.TRAIN:
              progbar.params['steps'] = steps_per_epoch
              progbar.progbar.target = steps_per_epoch
        else:
          # We ran out of batches while the user passed an iterator (legacy).
          callbacks.model.stop_training = True
          logging.warning(
              'Your dataset iterator ran out of data; '
              'interrupting training. Make sure that your iterator '
              'can generate at least `%s * epochs` '
              'batches (in this case, %d batches). You may need to'
              'use the repeat() function when building your '
              'dataset.' % (steps_name, steps_per_epoch * epochs))
        break

      # `batch_size` used for validation data if validation
      # data is NumPy/EagerTensors.
      batch_size = int(nest.flatten(batch_data)[0].shape[0])

      # Callbacks batch begin.
      batch_logs = {'batch': step, 'size': batch_size}
      callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
      progbar.on_batch_begin(step, batch_logs)

      is_deferred = not model._is_compiled
      batch_outs = batch_function(*batch_data)
      if not isinstance(batch_outs, list):
        batch_outs = [batch_outs]

      if step == 0:
        aggregator.create(batch_outs)

        if is_deferred:
          # Set callbacks params. We do this here when model is compiled only
          # in the first iteration of this loop (deferred build scenario).
          cbks.set_callback_parameters(
              callbacks,
              model,
              do_validation=do_validation,
              batch_size=batch_size,
              epochs=epochs,
              steps_per_epoch=steps_per_epoch,
              samples=num_samples_or_steps,
              verbose=verbose,
              mode=mode)

          progbar.params = callbacks.params
          progbar.params['verbose'] = verbose

      # Aggregate results.
      aggregator.aggregate(batch_outs)

      # Callbacks batch end.
      batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
      callbacks._call_batch_hook(mode, 'end', step, batch_logs)
      progbar.on_batch_end(step, batch_logs)
      step += 1

      if callbacks.model.stop_training:
        break

    aggregator.finalize()
    results = aggregator.results
    epoch_logs = cbks.make_logs(model, epoch_logs, results, mode)
    if len(results) == 1:
      results = results[0]

    # Run the test loop every epoch during training.
    if (do_validation and
        training_utils.should_run_validation(validation_freq, epoch) and
        not callbacks.model.stop_training):
      val_results = model_iteration(
          model,
          validation_data,
          steps_per_epoch=validation_steps,
          batch_size=batch_size,
          class_weight=class_weight,
          workers=workers,
          use_multiprocessing=use_multiprocessing,
          max_queue_size=max_queue_size,
          callbacks=callbacks,
          verbose=0,
          mode=ModeKeys.TEST,
          steps_name='validation_steps')

      if not isinstance(val_results, list):
        val_results = [val_results]
      epoch_logs = cbks.make_logs(
          model, epoch_logs, val_results, mode, prefix='val_')

    if mode == ModeKeys.TRAIN:
      # Epochs only apply to `fit`.
      callbacks.on_epoch_end(epoch, epoch_logs)
    progbar.on_epoch_end(epoch, epoch_logs)

    # Recreate dataset iterator for the next epoch.
    if reset_dataset_after_each_epoch and epoch < epochs - 1:
      generator = dataset_ops.make_one_shot_iterator(original_dataset)

  callbacks._call_end_hook(mode)

  if enqueuer is not None:
    enqueuer.stop()

  if should_set_learning_phase:
    backend.set_eager_learning_phase(old_learning_phase)

  if mode == ModeKeys.TRAIN:
    return model.history
  return results
コード例 #3
0
def model_iteration(model,
                    data,
                    steps_per_epoch=None,
                    epochs=1,
                    verbose=1,
                    callbacks=None,
                    validation_data=None,
                    validation_steps=None,
                    validation_freq=1,
                    class_weight=None,
                    max_queue_size=10,
                    workers=1,
                    use_multiprocessing=False,
                    shuffle=False,
                    initial_epoch=0,
                    mode=ModeKeys.TRAIN,
                    batch_size=None,
                    steps_name='steps',
                    **kwargs):
    """Loop function for arrays of data with modes TRAIN/TEST/PREDICT.

  Arguments:
      model: Keras Model instance.
      data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or `(x, y)` or
        `(x, y, sample_weights)`) or a generator or
        `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset.
      steps_per_epoch: Total number of steps (batches of samples) before
        declaring one epoch finished and starting the next epoch. Ignored with
        the default value of `None`.
      epochs: Number of times to iterate over the data.
      verbose: 0, 1, or 2. Verbosity mode.
        0 = silent, 1 = progress bar, 2 = one line per epoch.
        Note that the progress bar is not particularly useful when
        logged to a file, so verbose=2 is recommended when not running
        interactively (eg, in a production environment).
      callbacks: List of callbacks to be called during training.
      validation_data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or
        `(x, y)` or `(x, y, sample_weights)`) or a generator or
        `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset.
      validation_steps: Total number of steps (batches of samples) before
        declaring validation finished.
      validation_freq: Only relevant if validation data is provided. Integer or
        `collections.abc.Container` instance (e.g. list, tuple, etc.). If an
        integer, specifies how many training epochs to run before a new
        validation run is performed, e.g. `validation_freq=2` runs
        validation every 2 epochs. If a Container, specifies the epochs on
        which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
        validation at the end of the 1st, 2nd, and 10th epochs.
      class_weight: Dictionary mapping class indices to a weight for the class.
      max_queue_size: Integer. Maximum size for the generator queue. If
        unspecified, `max_queue_size` will default to 10.
      workers: Integer. Maximum number of processes to spin up when using
        process-based threading. If unspecified, `workers` will default to 1. If
        0, will execute the generator on the main thread.
      use_multiprocessing: Boolean. If `True`, use process-based threading. If
        unspecified, `use_multiprocessing` will default to `False`. Note that
        because this implementation relies on multiprocessing, you should not
        pass non-picklable arguments to the generator as they can't be passed
        easily to children processes.
      shuffle: Boolean. Whether to shuffle the order of the batches at the
        beginning of each epoch. Only used with instances of `Sequence`
        (`keras.utils.Sequence`). Has no effect when `steps_per_epoch` is not
        `None`.
      initial_epoch: Epoch at which to start training (useful for resuming a
        previous training run).
      mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
      batch_size: Integer batch size or None if unknown. Will only be used if
        `data` is in NumPy/Tensor format.
      steps_name: The string name of the steps argument, either `steps`,
        `validation_steps`, or `steps_per_epoch`. Only used for error message
        formatting.
      **kwargs: Additional arguments for backwards compatibility. `steps` is
        accepted as an alias for `steps_per_epoch`.

  Returns:
      - In TRAIN mode: `History` object.
      - In TEST mode: Evaluation metrics.
      - In PREDICT mode: Outputs of the Model called on inputs.

  Raises:
      ValueError: in case of invalid arguments.
  """
    if 'steps' in kwargs:
        steps_per_epoch = kwargs['steps']

    # Determine the number of steps per epoch and whether we should reset the
    # dataset at the end of each epoch.
    reset_dataset_after_each_epoch = False
    original_dataset = None
    is_dataset = isinstance(data,
                            (dataset_ops.DatasetV2, dataset_ops.DatasetV1))
    if is_dataset:
        original_dataset = data
        if steps_per_epoch is None:
            reset_dataset_after_each_epoch = True
            steps_per_epoch = training_utils.infer_steps_for_dataset(
                model,
                data,
                steps_per_epoch,
                epochs=epochs,
                steps_name=steps_name)

    # Convert to a format that supports `next(generator)`.
    generator, steps_per_epoch = convert_to_generator_like(
        data,
        steps_per_epoch=steps_per_epoch,
        batch_size=batch_size,
        epochs=epochs - initial_epoch,
        shuffle=shuffle)

    do_validation = validation_data is not None
    is_sequence = isinstance(generator, data_utils.Sequence)
    _validate_arguments(is_sequence, is_dataset, use_multiprocessing, workers,
                        steps_per_epoch, validation_data, validation_steps,
                        mode, kwargs)

    batch_function = _make_execution_function(model,
                                              mode,
                                              class_weight=class_weight)

    # Create the queue for the generator.
    enqueuer = None
    if not is_dataset:
        generator, enqueuer = _make_enqueued_generator(
            generator,
            workers=workers,
            use_multiprocessing=use_multiprocessing,
            max_queue_size=max_queue_size,
            shuffle=shuffle)

    num_samples_or_steps, use_steps = _get_num_samples_or_steps(
        data, steps_per_epoch)

    count_mode = 'steps' if use_steps else 'samples'
    callbacks = cbks.configure_callbacks(callbacks,
                                         model,
                                         do_validation=do_validation,
                                         epochs=epochs,
                                         steps_per_epoch=steps_per_epoch,
                                         batch_size=batch_size,
                                         samples=num_samples_or_steps,
                                         count_mode=count_mode,
                                         verbose=verbose,
                                         mode=mode)

    if mode == ModeKeys.PREDICT:
        aggregator = training_utils.OutputsAggregator(True,
                                                      steps=steps_per_epoch)
    else:
        aggregator = training_utils.MetricsAggregator(True,
                                                      steps=steps_per_epoch)

    should_set_learning_phase = context.executing_eagerly(
    ) and model.run_eagerly
    if should_set_learning_phase:
        learning_phase_scope = backend.eager_learning_phase_scope(
            1 if mode == ModeKeys.TRAIN else 0)
        learning_phase_scope.__enter__()

    callbacks.model.stop_training = False
    callbacks._call_begin_hook(mode)

    initial_epoch = model._maybe_load_initial_epoch_from_ckpt(
        initial_epoch, mode)

    for epoch in range(initial_epoch, epochs):
        if callbacks.model.stop_training:
            break

        # Setup work for each epoch.
        model.reset_metrics()
        epoch_logs = {}
        if mode == ModeKeys.TRAIN:
            callbacks.on_epoch_begin(epoch, epoch_logs)

        if steps_per_epoch is None:
            # Loop over dataset until `OutOfRangeError` is raised.
            target_steps = np.inf
        else:
            # Loop over dataset for the specified number of steps.
            target_steps = steps_per_epoch

        step = 0
        while step < target_steps:
            batch_data = _get_next_batch(generator)
            if batch_data is None:
                if is_dataset:
                    # The dataset passed by the user ran out of batches.
                    # Now we know the cardinality of the dataset.
                    # If steps_per_epoch was specified, then running out of data is
                    # unexpected, so we stop training and inform the user.
                    if steps_per_epoch:
                        callbacks.model.stop_training = True
                        logging.warning(
                            'Your dataset ran out of data; interrupting training. '
                            'Make sure that your dataset can generate at least '
                            '`%s * epochs` batches (in this case, %d batches). '
                            'You may need to use the repeat() function when '
                            'building your dataset.' %
                            (steps_name, steps_per_epoch * epochs))
                    elif step > 0:
                        steps_per_epoch = step
                        aggregator.steps = steps_per_epoch
                else:
                    # We ran out of batches while the user passed an iterator (legacy).
                    callbacks.model.stop_training = True
                    logging.warning(
                        'Your dataset iterator ran out of data; '
                        'interrupting training. Make sure that your iterator '
                        'can generate at least `%s * epochs` '
                        'batches (in this case, %d batches). You may need to'
                        'use the repeat() function when building your '
                        'dataset.' % (steps_name, steps_per_epoch * epochs))
                break

            # `batch_size` used for validation data if validation
            # data is NumPy/EagerTensors.
            batch_size = int(nest.flatten(batch_data)[0].shape[0])

            # Callbacks batch begin.
            batch_logs = {'batch': step, 'size': batch_size}
            callbacks._call_batch_hook(mode, 'begin', step, batch_logs)

            is_deferred = not model._is_compiled
            batch_outs = batch_function(*batch_data)
            if not isinstance(batch_outs, list):
                batch_outs = [batch_outs]

            if step == 0:
                aggregator.create(batch_outs)

                if is_deferred:
                    # Set callbacks params. We do this here when model is compiled only
                    # in the first iteration of this loop (deferred build scenario).
                    cbks.set_callback_parameters(
                        callbacks,
                        model,
                        do_validation=do_validation,
                        batch_size=batch_size,
                        epochs=epochs,
                        steps_per_epoch=steps_per_epoch,
                        samples=num_samples_or_steps,
                        verbose=verbose,
                        mode=mode)

            # Aggregate results.
            aggregator.aggregate(batch_outs)

            # Callbacks batch end.
            batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
            callbacks._call_batch_hook(mode, 'end', step, batch_logs)
            step += 1

            if callbacks.model.stop_training:
                break

        aggregator.finalize()
        results = aggregator.results
        epoch_logs = cbks.make_logs(model, epoch_logs, results, mode)
        if len(results) == 1:
            results = results[0]

        # Run the test loop every epoch during training.
        if (do_validation and training_utils.should_run_validation(
                validation_freq, epoch) and not callbacks.model.stop_training):
            val_results = model_iteration(
                model,
                validation_data,
                steps_per_epoch=validation_steps,
                batch_size=batch_size,
                class_weight=class_weight,
                workers=workers,
                use_multiprocessing=use_multiprocessing,
                max_queue_size=max_queue_size,
                callbacks=callbacks,
                verbose=verbose,
                mode=ModeKeys.TEST,
                steps_name='validation_steps')

            if not isinstance(val_results, list):
                val_results = [val_results]
            epoch_logs = cbks.make_logs(model,
                                        epoch_logs,
                                        val_results,
                                        mode,
                                        prefix='val_')

        if mode == ModeKeys.TRAIN:
            # Epochs only apply to `fit`.
            callbacks.on_epoch_end(epoch, epoch_logs)

        # Recreate dataset iterator for the next epoch.
        if reset_dataset_after_each_epoch and epoch < epochs - 1:
            generator = dataset_ops.make_one_shot_iterator(original_dataset)

    model._successful_loop_finish = True
    callbacks._call_end_hook(mode)

    if enqueuer is not None:
        enqueuer.stop()

    if should_set_learning_phase:
        learning_phase_scope.__exit__(None, None, None)

    if mode == ModeKeys.TRAIN:
        return model.history
    return results
コード例 #4
0
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
  """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      Returns `None`.

  Raises:
      ValueError: in case of invalid arguments.
  """
  mode = ModeKeys.TRAIN
  # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=1)
  scope.__enter__()

  def _per_device_fit_function(model):
    model._make_fit_function()
    return (model._fit_function.inputs, model._fit_function.outputs,
            model._fit_function.updates_op, model._fit_function.session_kwargs)

  out_labels = model.metrics_names or []

  def step_fn(ctx, inputs):
    """Clones the model and calls make_fit_function."""
    inputs, targets = inputs
    if model._compile_distribution:
      distributed_training_utils.clone_model_on_replicas(
          model, current_strategy, mode, inputs=inputs, targets=targets)
    else:
      distributed_training_utils._build_distributed_network(
          model, current_strategy, mode, inputs, targets)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_fit_function,
         args=(distributed_training_utils.get_distributed_model(
             model, ModeKeys.TRAIN),))
    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs,
         grouped_updates, grouped_session_args)
    combined_fn = K.function(
        all_inputs,
        all_outputs,
        updates=all_updates,
        name='distributed_fit_function',
        **all_session_args)

    for label, output in zip(out_labels, combined_fn.outputs):
      if label == 'loss':
        reduce_op = ds_reduce_util.ReduceOp.SUM
      else:
        # We reduce all other metrics using mean for now. This is temporary
        # workaround until new metrics are in place.
        reduce_op = ds_reduce_util.ReduceOp.MEAN
      ctx.set_last_step_output(label, output, reduce_op)

    # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
    # feed_dict, session kwargs, run options, run_metadata for now. These should
    # be handled appropriately
    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name in model.metrics_names[1:]:
    tensor = model._all_stateful_metrics_tensors[name]
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  if steps_per_epoch is None:
    raise ValueError('`steps_per_epoch` should be specified when calling '
                     '`fit` on the model.')
  steps_per_run = K.variable(
      value=min(steps_per_epoch, current_strategy.extended.steps_per_run),
      dtype='int32',
      name='steps_per_run')

  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=steps_per_run,
      initial_loop_values=initial_loop_values)

  train_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  do_validation = bool(validation_steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      verbose=verbose,
      count_mode='steps',
      mode=mode)

  # Calculate the steps each time on the device.
  steps_to_run = [current_strategy.extended.steps_per_run] * (
      steps_per_epoch // current_strategy.extended.steps_per_run)
  if steps_per_epoch % current_strategy.extended.steps_per_run:
    steps_to_run.append(
        steps_per_epoch % current_strategy.extended.steps_per_run)

  callbacks._call_begin_hook(mode)
  for epoch in range(initial_epoch, epochs):
    distributed_training_utils._reset_metrics(model)
    callbacks.on_epoch_begin(epoch)
    epoch_logs = {}
    step_index = 0
    prev_step_count = None
    for step_count in steps_to_run:
      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
      callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
      if prev_step_count is None or step_count != prev_step_count:
        steps_per_run.load(step_count, K.get_session())
        prev_step_count = step_count
      try:
        _, outputs = K.get_session().run([train_op, output_tensors])
      except errors.OutOfRangeError:
        logging.warning('Your dataset iterator ran out of data; '
                        'interrupting training. Make sure that your dataset '
                        'can generate at least `steps_per_epoch * epochs` '
                        'batches (in this case, %d batches).' %
                        steps_per_epoch * epochs)
        break

      batch_logs.update(outputs)
      callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
      step_index = step_index + step_count
      if callbacks.model.stop_training:
        break

    if (do_validation and
        training_utils.should_run_validation(validation_freq, epoch)):
      logging.info('Running validation at fit epoch: %s', epoch)

      if model._compile_distribution:
        # Since we create a new clone from the original model we need to copy
        # the weights back to the original model before we can run validation.
        distributed_training_utils._copy_weights_to_original_model(
            model, ModeKeys.TRAIN)

      val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
          model,
          val_dataset,
          steps=validation_steps,
          verbose=verbose,
          callbacks=callbacks)
      if not isinstance(val_outs, list):
        val_outs = [val_outs]
      # Same labels assumed.
      for label, val_out in zip(out_labels, val_outs):
        epoch_logs['val_' + label] = val_out

    callbacks.on_epoch_end(epoch, epoch_logs)
    if callbacks.model.stop_training:
      break
  callbacks._call_end_hook(mode)

  if model._compile_distribution:
    # Copy the weights back from the replicated model to the original model.
    distributed_training_utils._copy_weights_to_original_model(
        model, ModeKeys.TRAIN)
  scope.__exit__(None, None, None)
  return model.history
コード例 #5
0
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
    """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      Returns `None`.

  Raises:
      ValueError: in case of invalid arguments.
  """
    # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
    current_strategy = model._distribution_strategy
    iterator = distributed_training_utils.get_iterator(dataset,
                                                       current_strategy)

    scope = distributed_training_utils.distributed_scope(
        strategy=current_strategy, learning_phase=1)
    scope.__enter__()

    def _per_device_fit_function(model):
        model._make_fit_function()
        return (model._fit_function.inputs, model._fit_function.outputs,
                model._fit_function.updates_op,
                model._fit_function.session_kwargs)

    out_labels = model.metrics_names or []

    def step_fn(ctx, inputs):
        """Clones the model and calls make_fit_function."""
        inputs, targets = inputs
        if model._compile_distribution:
            distributed_training_utils.clone_model_on_replicas(
                model,
                current_strategy,
                ModeKeys.TRAIN,
                inputs=inputs,
                targets=targets)
        else:
            distributed_training_utils._build_distributed_network(
                model, current_strategy, ModeKeys.TRAIN, inputs, targets)

        (grouped_inputs, grouped_outputs, grouped_updates, grouped_session_args
         ) = current_strategy.extended.call_for_each_replica(
             _per_device_fit_function, args=(model._distributed_model_train, ))
        (all_inputs, all_outputs, all_updates,
         all_session_args) = distributed_training_utils.unwrap_values(
             current_strategy, grouped_inputs, grouped_outputs,
             grouped_updates, grouped_session_args)
        combined_fn = K.function(all_inputs,
                                 all_outputs,
                                 updates=all_updates,
                                 name='distributed_fit_function',
                                 **all_session_args)

        for label, output in zip(out_labels, combined_fn.outputs):
            if label == 'loss':
                reduce_op = distribute_lib.get_loss_reduction()
            else:
                # We reduce all other metrics using mean for now. This is temporary
                # workaround until new metrics are in place.
                reduce_op = ds_reduce_util.ReduceOp.MEAN
            ctx.set_last_step_output(label, output, reduce_op)

        # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
        # feed_dict, session kwargs, run options, run_metadata for now. These should
        # be handled appropriately
        return combined_fn.updates_op

    # Add initial dummy values for loss and other metric tensors.
    initial_loop_values = {}
    initial_loop_values['loss'] = constant_op.constant(1e7)
    for name in model.metrics_names[1:]:
        tensor = model._all_stateful_metrics_tensors[name]
        initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

    if steps_per_epoch is None:
        raise ValueError('`steps_per_epoch` should be specified when calling '
                         '`fit` on the model.')
    steps_per_run = K.variable(value=min(
        steps_per_epoch, current_strategy.extended.steps_per_run),
                               dtype='int32',
                               name='steps_per_run')

    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn,
        iterator,
        iterations=steps_per_run,
        initial_loop_values=initial_loop_values)

    train_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    do_validation = bool(validation_steps)

    if model._compile_distribution:
        distributed_training_utils._copy_weights_to_distributed_model(
            model, ModeKeys.TRAIN)

    callbacks = cbks.configure_callbacks(callbacks,
                                         model,
                                         do_validation=do_validation,
                                         epochs=epochs,
                                         steps_per_epoch=steps_per_epoch,
                                         verbose=verbose)

    # Calculate the steps each time on the device.
    steps_to_run = [current_strategy.extended.steps_per_run] * (
        steps_per_epoch // current_strategy.extended.steps_per_run)
    if steps_per_epoch % current_strategy.extended.steps_per_run:
        steps_to_run.append(steps_per_epoch %
                            current_strategy.extended.steps_per_run)

    callbacks.on_train_begin()
    for epoch in range(initial_epoch, epochs):
        distributed_training_utils._reset_metrics(model)
        callbacks.on_epoch_begin(epoch)
        epoch_logs = {}
        step_index = 0
        prev_step_count = None
        for step_count in steps_to_run:
            batch_logs = {
                'batch': step_index,
                'size': 1,
                'num_steps': step_count
            }
            callbacks.on_batch_begin(step_index, batch_logs)
            if prev_step_count is None or step_count != prev_step_count:
                steps_per_run.load(step_count, K.get_session())
                prev_step_count = step_count
            try:
                _, outputs = K.get_session().run([train_op, output_tensors])
            except errors.OutOfRangeError:
                logging.warning(
                    'Your dataset iterator ran out of data; '
                    'interrupting training. Make sure that your dataset '
                    'can generate at least `steps_per_epoch * epochs` '
                    'batches (in this case, %d batches).' % steps_per_epoch *
                    epochs)
                break

            batch_logs.update(outputs)
            callbacks.on_batch_end(step_index, batch_logs)
            step_index = step_index + step_count
            if callbacks.model.stop_training:
                break

        if (do_validation and training_utils.should_run_validation(
                validation_freq, epoch)):
            logging.info('Running validation at fit epoch: %s', epoch)

            if model._compile_distribution:
                # Since we create a new clone from the original model we need to copy
                # the weights back to the original model before we can run validation.
                distributed_training_utils._copy_weights_to_original_model(
                    model, ModeKeys.TRAIN)

            val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
                model,
                val_dataset,
                steps=validation_steps,
                verbose=verbose)
            if not isinstance(val_outs, list):
                val_outs = [val_outs]
            # Same labels assumed.
            for label, val_out in zip(out_labels, val_outs):
                epoch_logs['val_' + label] = val_out

        callbacks.on_epoch_end(epoch, epoch_logs)
        if callbacks.model.stop_training:
            break
    callbacks.on_train_end()

    if model._compile_distribution:
        # Copy the weights back from the replicated model to the original model.
        distributed_training_utils._copy_weights_to_original_model(
            model, ModeKeys.TRAIN)
    scope.__exit__(None, None, None)
    return model.history
コード例 #6
0
    def fit(self,
            model,
            x=None,
            y=None,
            batch_size=None,
            epochs=1,
            verbose=1,
            callbacks=None,
            validation_split=0.,
            validation_data=None,
            shuffle=True,
            class_weight=None,
            sample_weight=None,
            initial_epoch=0,
            steps_per_epoch=None,
            validation_steps=None,
            validation_freq=1,
            **kwargs):
        batch_size = model._validate_or_infer_batch_size(
            batch_size, steps_per_epoch, x)

        strategy = _get_distribution_strategy(model)
        batch_size, steps_per_epoch = dist_utils.process_batch_and_step_size(
            strategy, x, batch_size, steps_per_epoch, ModeKeys.TRAIN)
        dist_utils.validate_callbacks(input_callbacks=callbacks,
                                      optimizer=model.optimizer)
        # Enter tf.distribute.Strategy scope.
        with strategy.scope():
            training_data_adapter, validation_adapter = _process_training_inputs(
                model,
                x,
                y,
                batch_size=batch_size,
                sample_weights=sample_weight,
                class_weights=class_weight,
                validation_split=validation_split,
                steps_per_epoch=steps_per_epoch,
                shuffle=shuffle,
                validation_data=validation_data,
                validation_steps=validation_steps,
                distribution_strategy=strategy)

            total_samples = _get_total_number_of_samples(training_data_adapter)
            use_sample = total_samples is not None
            do_validation = (validation_adapter is not None)

            if not steps_per_epoch:
                steps_per_epoch = training_data_adapter.get_size()

            # tf.print('{} on {} steps.'.format(ModeKeys.TRAIN, steps_per_epoch))
            training_context = TrainingContext()

            initial_epoch = model._maybe_load_initial_epoch_from_ckpt(
                initial_epoch, ModeKeys.TRAIN)

            training_dataset = training_data_adapter.get_dataset()
            # Raise an error if steps_per_epoch isn't specified but the dataset
            # is infinite.
            # TODO(scottzhu): This check should probably happen in the adapter
            training_utils.infer_steps_for_dataset(
                training_dataset,
                steps_per_epoch,
                steps_name='steps_per_epoch',
                epochs=0)

            training_dataset = strategy.experimental_distribute_dataset(
                training_dataset)

            _update_sample_weight_mode(model, ModeKeys.TRAIN, training_dataset)
            training_function = training_v2_utils._get_or_make_execution_function(
                model, ModeKeys.TRAIN)

            training_data_iter = None
            # Only recreate iterator when the data has a fixed length, which will be
            # fully consumed every epoch, or has a unknown length (dataset, generator)
            # and will be fully consumed (steps_per_epoch is None)
            recreate_training_iterator = (training_data_adapter.get_size()
                                          is not None
                                          or steps_per_epoch is None)

            if do_validation:
                if not validation_steps:
                    validation_steps = validation_adapter.get_size()
                eval_function = training_v2_utils._get_or_make_execution_function(
                    model, ModeKeys.TEST)
                eval_data_iter = None

                validation_dataset = validation_adapter.get_dataset()
                # Raise an error if validation_steps isn't specified but the validation
                # dataset is infinite.
                # TODO(scottzhu): This check should probably happen in the adapter
                training_utils.infer_steps_for_dataset(
                    validation_dataset,
                    validation_steps,
                    steps_name='validation_steps',
                    epochs=0)
                validation_dataset = strategy.experimental_distribute_dataset(
                    validation_dataset)

            callbacks = cbks.configure_callbacks(
                callbacks,
                model,
                do_validation=do_validation,
                batch_size=batch_size,
                epochs=epochs,
                steps_per_epoch=steps_per_epoch,
                samples=total_samples,
                count_mode='samples' if use_sample else 'steps',
                verbose=0,  # Handle ProgBarLogger separately in this loop.
                mode=ModeKeys.TRAIN)

            with training_context.on_start(model, callbacks, use_sample,
                                           verbose, ModeKeys.TRAIN):
                # TODO(scottzhu): Handle TPUStrategy training loop
                for epoch in range(initial_epoch, epochs):
                    if training_context.callbacks.model.stop_training:
                        break

                    # Training
                    with training_context.on_epoch(
                            epoch, ModeKeys.TRAIN) as epoch_logs:
                        model.reset_metrics()
                        if training_data_iter is None or recreate_training_iterator:
                            if (training_data_iter is not None
                                    and distribution_strategy_context.
                                    has_strategy()):
                                # TODO(kaftan): remove this when MultiDeviceIterator is a
                                ## compositetensor (unless this is more efficient)
                                training_data_iter._initializer  # pylint: disable=pointless-statement
                            else:
                                training_data_iter = iter(training_dataset)

                        training_result = run_one_epoch(
                            model,
                            training_data_iter,
                            training_function,
                            dataset_size=training_data_adapter.get_size(),
                            batch_size=training_data_adapter.batch_size(),
                            strategy=strategy,
                            steps_per_epoch=steps_per_epoch,
                            num_samples=total_samples,
                            mode=ModeKeys.TRAIN,
                            training_context=training_context,
                            total_epochs=epochs)
                        cbks.make_logs(model, epoch_logs, training_result,
                                       ModeKeys.TRAIN)

                        # Evaluation
                        if (do_validation
                                and training_utils.should_run_validation(
                                    validation_freq, epoch)
                                and not callbacks.model.stop_training):
                            if (eval_data_iter is not None
                                    and distribution_strategy_context.
                                    has_strategy()):
                                # TODO(kaftan): remove this when MultiDeviceIterator is a
                                ## compositetensor (unless this is more efficient)
                                eval_data_iter._initializer  # pylint: disable=pointless-statement
                            else:
                                eval_data_iter = iter(validation_dataset)

                            val_total_samples = _get_total_number_of_samples(
                                validation_adapter)
                            eval_context = TrainingContext()
                            with eval_context.on_start(model,
                                                       callbacks,
                                                       use_sample,
                                                       verbose=0,
                                                       mode=ModeKeys.TEST):
                                with eval_context.on_epoch(
                                        epoch, ModeKeys.TEST):
                                    model.reset_metrics()
                                    eval_result = run_one_epoch(
                                        model,
                                        eval_data_iter,
                                        eval_function,
                                        dataset_size=validation_adapter.
                                        get_size(),
                                        batch_size=validation_adapter.
                                        batch_size(),
                                        strategy=strategy,
                                        steps_per_epoch=validation_steps,
                                        num_samples=val_total_samples,
                                        mode=ModeKeys.TEST,
                                        training_context=eval_context,
                                        total_epochs=1)
                                    cbks.make_logs(model,
                                                   epoch_logs,
                                                   eval_result,
                                                   ModeKeys.TEST,
                                                   prefix='val_')

        return model.history
コード例 #7
0
    def fit(self,
            model,
            x=None,
            y=None,
            batch_size=None,
            epochs=1,
            verbose=1,
            callbacks=None,
            validation_split=0.,
            validation_data=None,
            shuffle=True,
            class_weight=None,
            sample_weight=None,
            initial_epoch=0,
            steps_per_epoch=None,
            validation_steps=None,
            validation_freq=1,
            **kwargs):
        batch_size = model._validate_or_infer_batch_size(
            batch_size, steps_per_epoch, x)

        strategy = _get_distribution_strategy(model)
        if strategy:
            batch_size, steps_per_epoch = dist_utils.process_batch_and_step_size(
                strategy, x, batch_size, steps_per_epoch, ModeKeys.TRAIN)
            dist_utils.validate_callbacks(input_callbacks=callbacks,
                                          optimizer=model.optimizer)
            # Enter tf.distribute.Strategy scope.
            scope = dist_utils.distributed_scope(strategy=strategy,
                                                 learning_phase=1)
            scope.__enter__()

        training_data_adapter, validation_adapter = _process_training_inputs(
            model,
            x,
            y,
            batch_size=batch_size,
            sample_weights=sample_weight,
            class_weights=class_weight,
            validation_split=validation_split,
            steps_per_epoch=steps_per_epoch,
            shuffle=shuffle,
            validation_data=validation_data,
            validation_steps=validation_steps,
            distribution_strategy=strategy)

        do_validation = (validation_adapter is not None)

        if not steps_per_epoch:
            steps_per_epoch = training_data_adapter.get_size()

        # tf.print('{} on {} steps.'.format(ModeKeys.TRAIN, steps_per_epoch))
        training_context = TrainingContext()

        initial_epoch = model._maybe_load_initial_epoch_from_ckpt(
            initial_epoch, ModeKeys.TRAIN)

        _update_sample_weight_mode(model, ModeKeys.TRAIN,
                                   training_data_adapter)
        training_function = _make_execution_function(model, ModeKeys.TRAIN)

        training_data_iter = None
        # Only recreate iterator when the data has a fixed length, which will be
        # fully consumed every epoch, or has a unknown length (dataset, generator)
        # and will be fully consumed (steps_per_epoch is None)
        recreate_training_iterator = (training_data_adapter.get_size()
                                      is not None or steps_per_epoch is None)

        if do_validation:
            if not validation_steps:
                validation_steps = validation_adapter.get_size()
            eval_function = _make_execution_function(model, ModeKeys.TEST)
            eval_data_iter = None
            recreate_eval_iterator = (validation_adapter.get_size() is not None
                                      or validation_steps is None)

        callbacks = cbks.configure_callbacks(
            callbacks,
            model,
            do_validation=do_validation,
            batch_size=batch_size,
            epochs=epochs,
            steps_per_epoch=steps_per_epoch,
            samples=None,
            verbose=0,  # Handle ProgBarLogger separately in this loop.
            mode=ModeKeys.TRAIN)

        with training_context.on_start(model, callbacks, verbose,
                                       ModeKeys.TRAIN):
            # TODO(scottzhu): Handle TPUStrategy training loop
            for epoch in range(initial_epoch, epochs):
                if training_context.callbacks.model.stop_training:
                    break

                # Training
                with training_context.on_epoch(epoch,
                                               ModeKeys.TRAIN) as epoch_logs:
                    model.reset_metrics()
                    if training_data_iter is None or recreate_training_iterator:
                        training_data_iter = _create_dataset_iterator(
                            strategy, training_data_adapter.get_dataset())

                    training_result = run_one_epoch(
                        model,
                        training_data_iter,
                        training_function,
                        dataset_size=training_data_adapter.get_size(),
                        strategy=strategy,
                        steps_per_epoch=steps_per_epoch,
                        mode=ModeKeys.TRAIN,
                        training_context=training_context,
                        current_epoch=epoch)
                    cbks.make_logs(model, epoch_logs, training_result,
                                   ModeKeys.TRAIN)

                    # Evaluation
                    if (do_validation and training_utils.should_run_validation(
                            validation_freq, epoch)
                            and not callbacks.model.stop_training):
                        if eval_data_iter is None or recreate_eval_iterator:
                            eval_data_iter = _create_dataset_iterator(
                                strategy, validation_adapter.get_dataset())
                        eval_context = TrainingContext()
                        with eval_context.on_start(model,
                                                   callbacks,
                                                   verbose=0,
                                                   mode=ModeKeys.TEST):
                            with eval_context.on_epoch(epoch, ModeKeys.TEST):
                                model.reset_metrics()
                                eval_result = run_one_epoch(
                                    model,
                                    eval_data_iter,
                                    eval_function,
                                    dataset_size=validation_adapter.get_size(),
                                    strategy=strategy,
                                    steps_per_epoch=validation_steps,
                                    mode=ModeKeys.TEST,
                                    training_context=eval_context,
                                    current_epoch=epochs)
                                cbks.make_logs(model,
                                               epoch_logs,
                                               eval_result,
                                               ModeKeys.TRAIN,
                                               prefix='val_')

        if strategy:
            scope.__exit__(None, None, None)

        return model.history
コード例 #8
0
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
  """Fit loop for training with TPU tf.distribute.Strategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      Returns `None`.

  Raises:
      ValueError: in case of invalid arguments.
  """
  mode = ModeKeys.TRAIN
  # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
  steps_per_epoch = training_utils.infer_steps_for_dataset(
      dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch')

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=1)
  scope.__enter__()

  out_labels = model.metrics_names or []

  step_fn = _make_train_step_fn(model, ModeKeys.TRAIN, current_strategy,
                                out_labels)

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name in model.metrics_names[1:]:
    tensor = model._all_metrics_tensors[name]
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  if steps_per_epoch is not None:
    iteration_value = min(steps_per_epoch,
                          current_strategy.extended.steps_per_run)
  else:
    raise ValueError('Number of steps could not be infered from the data, '
                     'please pass the steps_per_epoch argument.')

  steps_per_run = K.variable(
      value=iteration_value,
      dtype='int32',
      name='steps_per_run')
  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=steps_per_run,
      initial_loop_values=initial_loop_values)
  train_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  do_validation = bool(validation_steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      verbose=verbose,
      count_mode='steps',
      mode=mode)

  # Calculate the steps each time on the device.
  steps_to_run = ([current_strategy.extended.steps_per_run] *
                  (steps_per_epoch //
                   current_strategy.extended.steps_per_run))
  if steps_per_epoch % current_strategy.extended.steps_per_run:
    steps_to_run.append(
        steps_per_epoch % current_strategy.extended.steps_per_run)
  target_steps = len(steps_to_run)

  callbacks._call_begin_hook(mode)

  initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode)

  for epoch in range(initial_epoch, epochs):
    distributed_training_utils._reset_metrics(model)
    callbacks.on_epoch_begin(epoch)
    epoch_logs = {}
    step_index = 0
    prev_step_count = None
    current_step = 0
    while current_step < target_steps:
      step_count = steps_to_run[current_step]
      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
      callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
      if prev_step_count is None or step_count != prev_step_count:
        steps_per_run.load(step_count, K.get_session())
        prev_step_count = step_count
      try:
        _, outputs = K.batch_get_value([train_op, output_tensors])
      except errors.OutOfRangeError:
        logging.warning('Your dataset iterator ran out of data; '
                        'interrupting training. Make sure that your dataset '
                        'can generate at least `steps_per_epoch * epochs` '
                        'batches (in this case, %d batches).' %
                        steps_per_epoch * epochs)
        break

      batch_logs.update(outputs)
      callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
      step_index = step_index + step_count
      current_step += 1

      if callbacks.model.stop_training:
        break

    if (do_validation and
        training_utils.should_run_validation(validation_freq, epoch)):
      logging.info('Running validation at fit epoch: %s', epoch)

      if model._compile_distribution:
        # Since we create a new clone from the original model we need to copy
        # the weights back to the original model before we can run validation.
        distributed_training_utils._copy_weights_to_original_model(
            model, ModeKeys.TRAIN)

      val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
          model,
          val_dataset,
          steps=validation_steps,
          verbose=verbose,
          callbacks=callbacks)
      if not isinstance(val_outs, list):
        val_outs = [val_outs]
      # Same labels assumed.
      for label, val_out in zip(out_labels, val_outs):
        epoch_logs['val_' + label] = val_out

    callbacks.on_epoch_end(epoch, epoch_logs)
    if callbacks.model.stop_training:
      break
  callbacks._call_end_hook(mode)

  if model._compile_distribution:
    # Copy the weights back from the replicated model to the original model.
    distributed_training_utils._copy_weights_to_original_model(
        model, ModeKeys.TRAIN)
  scope.__exit__(None, None, None)
  return model.history
コード例 #9
0
def model_iteration(model,
                    inputs,
                    targets=None,
                    sample_weights=None,
                    batch_size=None,
                    epochs=1,
                    verbose=1,
                    callbacks=None,
                    val_inputs=None,
                    val_targets=None,
                    val_sample_weights=None,
                    shuffle=True,
                    initial_epoch=0,
                    steps_per_epoch=None,
                    validation_steps=None,
                    validation_freq=1,
                    mode=ModeKeys.TRAIN,
                    validation_in_fit=False,
                    prepared_feed_values_from_dataset=False,
                    steps_name='steps',
                    **kwargs):
  """Loop function for arrays of data with modes TRAIN/TEST/PREDICT.

  Arguments:
      model: Keras Model instance.
      inputs: Either a list or dictionary of arrays, or a dataset instance.
      targets: List/dictionary of input arrays.
      sample_weights: Optional list of sample weight arrays.
      batch_size: Integer batch size or None if unknown.
      epochs: Number of times to iterate over the data
      verbose: Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      val_inputs: Either a list or dictionary of arrays, or a dataset instance.
      val_targets: List/dictionary of target arrays.
      val_sample_weights: Optional list of sample weight arrays.
      shuffle: Whether to shuffle the data at the beginning of each epoch
        concatenation of list the display names of the outputs of `f` and the
        list of display names of the outputs of `f_val`.
      initial_epoch: Epoch at which to start training (useful for resuming a
        previous training run)
      steps_per_epoch: Total number of steps (batches of samples) before
        declaring one epoch finished and starting the next epoch. Ignored with
        the default value of `None`.
      validation_steps: Number of steps to run validation for (only if doing
        validation from data tensors). Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
        `collections.Container` instance (e.g. list, tuple, etc.). If an
        integer, specifies how many training epochs to run before a new
        validation run is performed, e.g. `validation_freq=2` runs
        validation every 2 epochs. If a Container, specifies the epochs on
        which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
        validation at the end of the 1st, 2nd, and 10th epochs.
      mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
      validation_in_fit: if true, then this method is invoked from within
        training iteration (for validation). In the case where `val_inputs` is a
        dataset, this flag indicates that its iterator and feed values are
        already created so should properly reuse resources.
      prepared_feed_values_from_dataset: if True, `inputs` is a list of feed
        tensors returned from `_prepare_feed_values` call on the validation
        dataset, so do not call it again on `inputs`. Should only be used for
        inline validation (i.e., only if `validation_in_fit` is also True).
      steps_name: The string name of the steps argument, either `steps`,
        `validation_steps`, or `steps_per_epoch`. Only used for error message
        formatting.
      **kwargs: Additional arguments for backwards compatibility.

  Returns:
      - In TRAIN mode: `History` object.
      - In TEST mode: Evaluation metrics.
      - In PREDICT mode: Outputs of the Model called on inputs.

  Raises:
      ValueError: in case of invalid arguments.
  """
  # Backwards compatibility.
  if 'steps' in kwargs:
    steps_per_epoch = kwargs.pop('steps')
  if kwargs:
    raise TypeError('Unknown arguments: %s' % (kwargs,))

  # In case we were passed a dataset, we extract symbolic tensors from it.
  reset_dataset_after_each_epoch = False
  input_iterator = None
  is_dataset = isinstance(inputs,
                          (dataset_ops.DatasetV1, dataset_ops.DatasetV2))
  # TODO(fchollet): consider moving `steps_per_epoch` inference to
  # _standardize_user_data and set reset_dataset_after_each_epoch as an
  # attribute on the dataset instance.
  if is_dataset:
    if steps_per_epoch is None:
      reset_dataset_after_each_epoch = True
      steps_per_epoch = training_utils.infer_steps_for_dataset(
          inputs, steps_per_epoch, epochs=epochs, steps_name=steps_name)
    input_iterator = _get_iterator(inputs, model._distribution_strategy)

  if mode == ModeKeys.TRAIN:
    _print_train_info(inputs, val_inputs, steps_per_epoch, verbose)

  # Enter DistributionStrategy scope.
  if model._distribution_strategy:
    scope = distributed_training_utils.distributed_scope(
        strategy=model._distribution_strategy,
        learning_phase=(1 if mode == ModeKeys.TRAIN else 0))
    scope.__enter__()

  # Get step function and loop type.
  f = _make_execution_function(model, mode)
  use_steps = is_dataset or steps_per_epoch is not None
  do_validation = val_inputs is not None

  # Convert Eager Tensors to NumPy arrays to support batching/shuffling.
  inputs, targets, sample_weights = training_utils. \
      convert_eager_tensors_to_numpy((inputs, targets, sample_weights))

  # Prepare input data.
  inputs = input_iterator or inputs
  if validation_in_fit and prepared_feed_values_from_dataset:
    # When invoking validation in training loop, avoid creating iterator and
    # list of feed values for the same validation dataset multiple times (which
    # essentially would call `iterator.get_next()` that slows down execution and
    # leads to OOM errors eventually.
    ins = inputs
  else:
    ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode)
  if not is_dataset:
    num_samples_or_steps = _get_num_samples_or_steps(ins, batch_size,
                                                     steps_per_epoch)
  else:
    num_samples_or_steps = steps_per_epoch

  # Prepare validation data. Hold references to the iterator and the input list
  # to properly reinitialize and reuse in multiple validation passes.
  val_iterator = None
  if isinstance(val_inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)):
    if validation_steps is None:
      # Because we pass an iterator feed instead of a Dataset to the eval
      # model_iteration() call, it will not trigger the dataset-input path
      # that determines the number of steps required. To avoid this issue,
      # set validation_steps here if validation_steps is None.
      validation_steps = training_utils.infer_steps_for_dataset(
          val_inputs,
          validation_steps,
          epochs=epochs,
          steps_name='validation_steps')
    val_iterator = _get_iterator(val_inputs, model._distribution_strategy)
    val_inputs = _prepare_feed_values(
        model, val_iterator, val_targets, val_sample_weights, ModeKeys.TEST)

  # Configure callbacks.
  count_mode = 'steps' if use_steps else 'samples'
  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      batch_size=batch_size,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      samples=num_samples_or_steps,
      verbose=0,  # Handle ProgBarLogger separately in this loop.
      mode=mode)
  # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready.
  progbar = training_utils.get_progbar(model, count_mode)
  progbar.params = callbacks.params
  progbar.params['verbose'] = verbose

  # Find beforehand arrays that need sparse-to-dense conversion.
  if issparse is not None and not use_steps:
    indices_for_conversion_to_dense = []
    feed = _get_model_feed(model, mode)
    for i, (input_data, feed_tensor) in enumerate(zip(ins, feed)):
      if issparse(input_data) and not K.is_sparse(feed_tensor):
        indices_for_conversion_to_dense.append(i)

  # Select aggregation method.
  if mode == ModeKeys.PREDICT:
    aggregator = training_utils.OutputsAggregator(use_steps,
                                                  num_samples_or_steps)
  else:
    aggregator = training_utils.MetricsAggregator(use_steps,
                                                  num_samples_or_steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  callbacks.model.stop_training = False
  callbacks._call_begin_hook(mode)
  progbar.on_train_begin()

  for epoch in range(initial_epoch, epochs):
    if callbacks.model.stop_training:
      break

    # Setup work for each epoch
    epoch_logs = {}
    model.reset_metrics()
    if mode == ModeKeys.TRAIN:
      callbacks.on_epoch_begin(epoch, epoch_logs)
    progbar.on_epoch_begin(epoch, epoch_logs)

    if use_steps:
      # Step-wise loop.
      if steps_per_epoch is None:
        # Loop over dataset until `OutOfRangeError` is raised.
        target_steps = np.inf
      else:
        # Loop over dataset for the specified number of steps.
        target_steps = steps_per_epoch

      step = 0
      while step < target_steps:
        batch_logs = {'batch': step, 'size': 1}
        callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
        progbar.on_batch_begin(step, batch_logs)

        # Get outputs.
        try:
          # `ins` can be callable in DistributionStrategy + eager case.
          actual_inputs = ins() if callable(ins) else ins
          batch_outs = f(actual_inputs)
        except errors.OutOfRangeError:
          if is_dataset:
            # The dataset passed by the user ran out of batches.
            # Now we know the cardinality of the dataset.
            # If steps_per_epoch was specified, then running out of data is
            # unexpected, so we stop training and inform the user.
            if steps_per_epoch:
              callbacks.model.stop_training = True
              logging.warning(
                  'Your dataset ran out of data; interrupting training. '
                  'Make sure that your dataset can generate at least '
                  '`%s * epochs` batches (in this case, %d batches). '
                  'You may need to use the repeat() function when '
                  'building your dataset.'
                  % (steps_name, steps_per_epoch * epochs))
            elif step > 0:
              steps_per_epoch = step
              aggregator.num_samples_or_steps = steps_per_epoch
              if mode == ModeKeys.TRAIN:
                progbar.params['steps'] = steps_per_epoch
                progbar.progbar.target = steps_per_epoch
          else:
            # We ran out of batches while the user passed an iterator (legacy).
            callbacks.model.stop_training = True
            logging.warning(
                'Your dataset iterator ran out of data; '
                'interrupting training. Make sure that your iterator '
                'can generate at least `%s * epochs` '
                'batches (in this case, %d batches). You may need to'
                'use the repeat() function when building your '
                'dataset.' % (steps_name, steps_per_epoch * epochs))
          break

        if not isinstance(batch_outs, list):
          batch_outs = [batch_outs]

        if model._distribution_strategy:
          batch_outs = distributed_training_utils._per_device_aggregate_batch(
              batch_outs, model, mode)

        # Aggregate results.
        if step == 0:
          aggregator.create(batch_outs)
        aggregator.aggregate(batch_outs)

        # Callbacks batch end.
        batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
        callbacks._call_batch_hook(mode, 'end', step, batch_logs)
        progbar.on_batch_end(step, batch_logs)
        step += 1

        if callbacks.model.stop_training:
          break
    else:
      # Sample-wise loop.
      index_array = np.arange(num_samples_or_steps)
      if shuffle == 'batch':
        index_array = training_utils.batch_shuffle(index_array, batch_size)
      elif shuffle:
        np.random.shuffle(index_array)
      batches = make_batches(num_samples_or_steps, batch_size)

      for batch_index, (batch_start, batch_end) in enumerate(batches):
        batch_ids = index_array[batch_start:batch_end]

        # Slice into a batch.
        try:
          if ins and isinstance(ins[-1], int):
            # Do not slice the training phase flag.
            ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
          else:
            ins_batch = slice_arrays(ins, batch_ids)
        except TypeError:
          raise TypeError('TypeError while preparing batch. '
                          'If using HDF5 input data, '
                          'pass shuffle="batch".')

        # Sparse to dense conversion.
        if issparse is not None:
          for i in indices_for_conversion_to_dense:
            ins_batch[i] = ins_batch[i].toarray()

        # Callbacks batch_begin.
        batch_logs = {'batch': batch_index, 'size': len(batch_ids)}
        callbacks._call_batch_hook(mode, 'begin', batch_index, batch_logs)
        progbar.on_batch_begin(batch_index, batch_logs)

        # Get outputs.
        batch_outs = f(ins_batch)
        if not isinstance(batch_outs, list):
          batch_outs = [batch_outs]

        # Aggregate results.
        if batch_index == 0:
          aggregator.create(batch_outs)
        aggregator.aggregate(batch_outs, batch_start, batch_end)

        # Callbacks batch end.
        batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
        callbacks._call_batch_hook(mode, 'end', batch_index, batch_logs)
        progbar.on_batch_end(batch_index, batch_logs)

        if callbacks.model.stop_training:
          break

    aggregator.finalize()
    results = aggregator.results
    epoch_logs = cbks.make_logs(model, epoch_logs, results, mode)
    if len(results) == 1:
      results = results[0]

    # Run the test loop every `validation_freq` epochs during training.
    if (do_validation and
        training_utils.should_run_validation(validation_freq, epoch) and
        not callbacks.model.stop_training):

      if model._compile_distribution:
        # Since we create a new clone from the original model we need to copy
        # the weights back to the original model before we can run validation.
        distributed_training_utils._copy_weights_to_original_model(
            model, ModeKeys.TRAIN)

      val_results = model_iteration(
          model,
          val_inputs,
          targets=val_targets,
          sample_weights=val_sample_weights,
          batch_size=batch_size,
          steps_per_epoch=validation_steps,
          callbacks=callbacks,
          verbose=0,
          mode=ModeKeys.TEST,
          validation_in_fit=True,
          prepared_feed_values_from_dataset=(val_iterator is not None),
          steps_name='validation_steps')
      if not isinstance(val_results, list):
        val_results = [val_results]
      epoch_logs = cbks.make_logs(
          model, epoch_logs, val_results, mode, prefix='val_')
      if val_iterator and epoch < epochs - 1:
        _reinitialize_iterator(val_iterator, model._distribution_strategy)

    if mode == ModeKeys.TRAIN:
      # Epochs only apply to `fit`.
      callbacks.on_epoch_end(epoch, epoch_logs)
    progbar.on_epoch_end(epoch, epoch_logs)

    # Reinitialize dataset iterator for the next epoch.
    if reset_dataset_after_each_epoch and epoch < epochs - 1:
      _reinitialize_iterator(input_iterator, model._distribution_strategy)

  callbacks._call_end_hook(mode)

  if model._distribution_strategy:
    if model._compile_distribution:
      # TODO(priyag, psv): Copy back metrics to the original model as well?
      distributed_training_utils._copy_weights_to_original_model(model, mode)
    scope.__exit__(None, None, None)

  if mode == ModeKeys.TRAIN:
    return model.history
  return results
コード例 #10
0
def model_iteration(model,
                    inputs,
                    targets=None,
                    sample_weights=None,
                    batch_size=None,
                    epochs=1,
                    verbose=1,
                    callbacks=None,
                    val_inputs=None,
                    val_targets=None,
                    val_sample_weights=None,
                    shuffle=True,
                    initial_epoch=0,
                    steps_per_epoch=None,
                    validation_steps=None,
                    validation_freq=1,
                    mode=ModeKeys.TRAIN,
                    validation_in_fit=False,
                    prepared_feed_values_from_dataset=False,
                    steps_name='steps',
                    **kwargs):
    """Loop function for arrays of data with modes TRAIN/TEST/PREDICT.

  Arguments:
      model: Keras Model instance.
      inputs: Either a list or dictionary of arrays, or a dataset instance.
      targets: List/dictionary of input arrays.
      sample_weights: Optional list of sample weight arrays.
      batch_size: Integer batch size or None if unknown.
      epochs: Number of times to iterate over the data
      verbose: 0, 1, or 2. Verbosity mode.
        0 = silent, 1 = progress bar, 2 = one line per epoch.
        Note that the progress bar is not particularly useful when
        logged to a file, so verbose=2 is recommended when not running
        interactively (eg, in a production environment).
      callbacks: List of callbacks to be called during training
      val_inputs: Either a list or dictionary of arrays, or a dataset instance.
      val_targets: List/dictionary of target arrays.
      val_sample_weights: Optional list of sample weight arrays.
      shuffle: Whether to shuffle the data at the beginning of each epoch
        concatenation of list the display names of the outputs of `f` and the
        list of display names of the outputs of `f_val`.
      initial_epoch: Epoch at which to start training (useful for resuming a
        previous training run)
      steps_per_epoch: Total number of steps (batches of samples) before
        declaring one epoch finished and starting the next epoch. Ignored with
        the default value of `None`.
      validation_steps: Number of steps to run validation for (only if doing
        validation from data tensors). Ignored with the default value of
        `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
        `collections_abc.Container` instance (e.g. list, tuple, etc.). If an
        integer, specifies how many training epochs to run before a new
        validation run is performed, e.g. `validation_freq=2` runs
        validation every 2 epochs. If a Container, specifies the epochs on
        which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
        validation at the end of the 1st, 2nd, and 10th epochs.
      mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
      validation_in_fit: if true, then this method is invoked from within
        training iteration (for validation). In the case where `val_inputs` is
        a dataset, this flag indicates that its iterator and feed values are
        already created so should properly reuse resources.
      prepared_feed_values_from_dataset: if True, `inputs` is a list of feed
        tensors returned from `_prepare_feed_values` call on the validation
        dataset, so do not call it again on `inputs`. Should only be used for
        inline validation (i.e., only if `validation_in_fit` is also True).
      steps_name: The string name of the steps argument, either `steps`,
        `validation_steps`, or `steps_per_epoch`. Only used for error message
        formatting.
      **kwargs: Additional arguments for backwards compatibility.

  Returns:
      - In TRAIN mode: `History` object.
      - In TEST mode: Evaluation metrics.
      - In PREDICT mode: Outputs of the Model called on inputs.

  Raises:
      ValueError: in case of invalid arguments.
  """
    # Backwards compatibility.
    if 'steps' in kwargs:
        steps_per_epoch = kwargs.pop('steps')
    if kwargs:
        raise TypeError('Unknown arguments: %s' % (kwargs, ))

    # In case we were passed a dataset, we extract symbolic tensors from it.
    reset_dataset_after_each_epoch = False
    input_iterator = None
    is_dataset = isinstance(inputs,
                            (dataset_ops.DatasetV1, dataset_ops.DatasetV2))
    # TODO(fchollet): consider moving `steps_per_epoch` inference to
    # _standardize_user_data and set reset_dataset_after_each_epoch as an
    # attribute on the dataset instance.
    if is_dataset:
        if steps_per_epoch is None:
            reset_dataset_after_each_epoch = True
            steps_per_epoch = training_utils.infer_steps_for_dataset(
                model,
                inputs,
                steps_per_epoch,
                epochs=epochs,
                steps_name=steps_name)
        input_iterator = _get_iterator(inputs, model._distribution_strategy)

    # Enter tf.distribute.Strategy scope.
    if model._distribution_strategy:
        scope = distributed_training_utils.distributed_scope(
            strategy=model._distribution_strategy,
            learning_phase=(1 if mode == ModeKeys.TRAIN else 0))
        scope.__enter__()

    use_steps = is_dataset or steps_per_epoch is not None
    do_validation = val_inputs is not None

    # Convert Eager Tensors to NumPy arrays to support batching/shuffling.
    inputs, targets, sample_weights = training_utils. \
        convert_eager_tensors_to_numpy((inputs, targets, sample_weights))

    # Prepare input data.
    inputs = input_iterator or inputs
    if validation_in_fit and prepared_feed_values_from_dataset:
        # When invoking validation in training loop, avoid creating iterator and
        # list of feed values for the same validation dataset multiple times (which
        # essentially would call `iterator.get_next()` that slows down execution and
        # leads to OOM errors eventually.
        ins = inputs
    else:
        ins = _prepare_feed_values(model, inputs, targets, sample_weights,
                                   mode)
        # `ins` is a function when a distribute strategy is used in Eager mode.  In
        # that case `is_dataset` is True.  The code branches that have requirements
        # about the type of `ins` do not trigger in the distributed case.

    if not is_dataset:
        num_samples_or_steps = _get_num_samples_or_steps(
            ins, batch_size, steps_per_epoch)
    else:
        num_samples_or_steps = steps_per_epoch

    # Update sample_weight_mode of the model if sample_weights is specified by the
    # user. We need to call this function after we have a handle on the inputs
    # (both numpy arrays and datasets) in order to determine if the user has
    # specified sample_weights.
    _update_sample_weight_mode(model, mode, ins)

    # Get step function and loop type. As part of building the execution
    # function we recompile the metrics based on the updated
    # sample_weight_mode value.
    f = _make_execution_function(model, mode)

    # Prepare validation data. Hold references to the iterator and the input list
    # to properly reinitialize and reuse in multiple validation passes.
    val_iterator = None
    if isinstance(val_inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)):
        if validation_steps is None:
            # Because we pass an iterator feed instead of a Dataset to the eval
            # model_iteration() call, it will not trigger the dataset-input path
            # that determines the number of steps required. To avoid this issue,
            # set validation_steps here if validation_steps is None.
            validation_steps = training_utils.infer_steps_for_dataset(
                model,
                val_inputs,
                validation_steps,
                epochs=epochs,
                steps_name='validation_steps')
        val_iterator = _get_iterator(val_inputs, model._distribution_strategy)
        val_inputs = _prepare_feed_values(model, val_iterator, val_targets,
                                          val_sample_weights, ModeKeys.TEST)
        # Get num steps for printing.
        val_samples_or_steps = validation_steps
    else:
        # Get num samples for printing.
        val_samples_or_steps = val_inputs and nest.flatten(
            val_inputs)[0].shape[0] or None

    if mode == ModeKeys.TRAIN and verbose:
        _print_train_info(num_samples_or_steps, val_samples_or_steps,
                          is_dataset)

    # Configure callbacks.
    count_mode = 'steps' if use_steps else 'samples'
    callbacks = cbks.configure_callbacks(
        callbacks,
        model,
        do_validation=do_validation,
        batch_size=batch_size,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        samples=num_samples_or_steps,
        verbose=0,  # Handle ProgBarLogger separately in this loop.
        mode=mode)
    # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready.
    progbar = training_utils.get_progbar(model, count_mode,
                                         mode != ModeKeys.PREDICT)
    progbar.params = callbacks.params
    progbar.params['verbose'] = verbose

    # Find beforehand arrays that need sparse-to-dense conversion.
    if issparse is not None and not use_steps:
        indices_for_conversion_to_dense = []
        feed = _get_model_feed(model, mode)
        for i, (input_data, feed_tensor) in enumerate(zip(ins, feed)):
            if issparse(input_data) and not K.is_sparse(feed_tensor):
                indices_for_conversion_to_dense.append(i)

    # Select aggregation method.
    if mode == ModeKeys.PREDICT:
        aggregator = training_utils.OutputsAggregator(
            use_steps,
            num_samples=None if steps_per_epoch else num_samples_or_steps,
            steps=steps_per_epoch)
    else:
        aggregator = training_utils.MetricsAggregator(
            use_steps,
            num_samples=None if steps_per_epoch else num_samples_or_steps,
            steps=steps_per_epoch)

    if model._compile_distribution:
        distributed_training_utils._copy_weights_to_distributed_model(
            model, mode)

    callbacks.model.stop_training = False
    callbacks._call_begin_hook(mode)
    progbar.on_train_begin()

    initial_epoch = model._maybe_load_initial_epoch_from_ckpt(
        initial_epoch, mode)

    for epoch in range(initial_epoch, epochs):
        if callbacks.model.stop_training:
            break

        # Setup work for each epoch
        epoch_logs = {}
        if mode != ModeKeys.PREDICT:
            # Collecting and resetting metrics has non-zero cost and will needlessly
            # slow down model.predict.
            model.reset_metrics()
        if mode == ModeKeys.TRAIN:
            callbacks.on_epoch_begin(epoch, epoch_logs)
        progbar.on_epoch_begin(epoch, epoch_logs)

        if use_steps:
            # Step-wise loop.
            if steps_per_epoch is None:
                # Loop over dataset until `OutOfRangeError` is raised.
                target_steps = np.inf
            else:
                # Loop over dataset for the specified number of steps.
                target_steps = steps_per_epoch

            step = 0
            while step < target_steps:
                batch_logs = {'batch': step, 'size': 1}
                callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
                progbar.on_batch_begin(step, batch_logs)

                # Get outputs.
                try:
                    # `ins` can be callable in tf.distribute.Strategy + eager case.
                    if not callable(ins) or (
                            model._distribution_strategy
                            and not distributed_training_utils.
                            is_distributing_by_cloning(model)):
                        actual_inputs = ins
                    else:
                        actual_inputs = ins()
                    batch_outs = f(actual_inputs)
                except errors.OutOfRangeError:
                    if is_dataset:
                        # The dataset passed by the user ran out of batches.
                        # Now we know the cardinality of the dataset.
                        # If steps_per_epoch was specified, then running out of data is
                        # unexpected, so we stop training and inform the user.
                        if steps_per_epoch:
                            callbacks.model.stop_training = True
                            logging.warning(
                                'Your dataset ran out of data; interrupting training. '
                                'Make sure that your dataset can generate at least '
                                '`%s * epochs` batches (in this case, %d batches). '
                                'You may need to use the repeat() function when '
                                'building your dataset.' %
                                (steps_name, steps_per_epoch * epochs))
                        elif step > 0:
                            steps_per_epoch = step
                            aggregator.steps = steps_per_epoch
                            if mode == ModeKeys.TRAIN:
                                progbar.params['steps'] = steps_per_epoch
                                progbar.progbar.target = steps_per_epoch
                    else:
                        # We ran out of batches while the user passed an iterator (legacy).
                        callbacks.model.stop_training = True
                        logging.warning(
                            'Your dataset iterator ran out of data; '
                            'interrupting training. Make sure that your iterator '
                            'can generate at least `%s * epochs` '
                            'batches (in this case, %d batches). You may need to'
                            'use the repeat() function when building your '
                            'dataset.' %
                            (steps_name, steps_per_epoch * epochs))
                    break

                if not isinstance(batch_outs, list):
                    batch_outs = [batch_outs]

                if model._distribution_strategy:
                    batch_outs = distributed_training_utils._per_replica_aggregate_batch(
                        model._distribution_strategy, batch_outs, model, mode)

                # Aggregate results.
                if step == 0:
                    aggregator.create(batch_outs)
                aggregator.aggregate(batch_outs)

                # Callbacks batch end.
                batch_logs = cbks.make_logs(model, batch_logs, batch_outs,
                                            mode)
                callbacks._call_batch_hook(mode, 'end', step, batch_logs)
                progbar.on_batch_end(step, batch_logs)
                step += 1

                if callbacks.model.stop_training:
                    break
        else:
            # Sample-wise loop.
            index_array = np.arange(num_samples_or_steps)
            if shuffle == 'batch':
                index_array = training_utils.batch_shuffle(
                    index_array, batch_size)
            elif shuffle:
                np.random.shuffle(index_array)
            batches = make_batches(num_samples_or_steps, batch_size)
            for batch_index, (batch_start, batch_end) in enumerate(batches):
                batch_ids = index_array[batch_start:batch_end]
                # Slice into a batch.
                if len(batches) == 1:
                    # If we only have one batch, do not slice. This takes care of
                    # composite tensors in non-Dataset modes; we currently don't support
                    # slicing them.
                    # TODO(b/133517906): Add slicing support.
                    ins_batch = ins
                else:
                    try:
                        if ins and isinstance(ins[-1], int):
                            # Do not slice the training phase flag.
                            ins_batch = slice_arrays(ins[:-1],
                                                     batch_ids) + [ins[-1]]
                        else:
                            ins_batch = slice_arrays(ins, batch_ids)
                    except TypeError:
                        raise TypeError('TypeError while preparing batch. '
                                        'If using HDF5 input data, '
                                        'pass shuffle="batch".')

                # Sparse to dense conversion.
                if issparse is not None:
                    for i in indices_for_conversion_to_dense:
                        ins_batch[i] = ins_batch[i].toarray()

                # Callbacks batch_begin.
                batch_logs = {'batch': batch_index, 'size': len(batch_ids)}
                callbacks._call_batch_hook(mode, 'begin', batch_index,
                                           batch_logs)
                progbar.on_batch_begin(batch_index, batch_logs)

                # Get outputs.
                batch_outs = f(ins_batch)
                if not isinstance(batch_outs, list):
                    batch_outs = [batch_outs]

                # Aggregate results.
                if batch_index == 0:
                    aggregator.create(batch_outs)
                aggregator.aggregate(batch_outs, batch_start, batch_end)

                # Callbacks batch end.
                batch_logs = cbks.make_logs(model, batch_logs, batch_outs,
                                            mode)
                callbacks._call_batch_hook(mode, 'end', batch_index,
                                           batch_logs)
                progbar.on_batch_end(batch_index, batch_logs)

                if callbacks.model.stop_training:
                    break

        aggregator.finalize()
        results = aggregator.results
        epoch_logs = cbks.make_logs(model, epoch_logs, results, mode)
        if len(results) == 1:
            results = results[0]

        # Run the test loop every `validation_freq` epochs during training.
        if (do_validation and training_utils.should_run_validation(
                validation_freq, epoch) and not callbacks.model.stop_training):

            if model._compile_distribution:
                # Since we create a new clone from the original model we need to copy
                # the weights back to the original model before we can run validation.
                distributed_training_utils._copy_weights_to_original_model(
                    model, ModeKeys.TRAIN)

            val_results = model_iteration(
                model,
                val_inputs,
                targets=val_targets,
                sample_weights=val_sample_weights,
                batch_size=batch_size,
                steps_per_epoch=validation_steps,
                callbacks=callbacks,
                verbose=0,
                mode=ModeKeys.TEST,
                validation_in_fit=True,
                prepared_feed_values_from_dataset=(val_iterator is not None),
                steps_name='validation_steps')
            if not isinstance(val_results, list):
                val_results = [val_results]
            epoch_logs = cbks.make_logs(model,
                                        epoch_logs,
                                        val_results,
                                        mode,
                                        prefix='val_')
            if val_iterator and epoch < epochs - 1:
                _reinitialize_iterator(val_iterator,
                                       model._distribution_strategy)

        if mode == ModeKeys.TRAIN:
            # Epochs only apply to `fit`.
            callbacks.on_epoch_end(epoch, epoch_logs)
        progbar.on_epoch_end(epoch, epoch_logs)

        # Reinitialize dataset iterator for the next epoch.
        if reset_dataset_after_each_epoch and epoch < epochs - 1:
            _reinitialize_iterator(input_iterator,
                                   model._distribution_strategy)

    callbacks._call_end_hook(mode)

    if model._distribution_strategy:
        if model._compile_distribution:
            # TODO(priyag, psv): Copy back metrics to the original model as well?
            distributed_training_utils._copy_weights_to_original_model(
                model, mode)
        scope.__exit__(None, None, None)

    if mode == ModeKeys.TRAIN:
        return model.history
    return results
コード例 #11
0
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
  """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      Returns `None`.

  Raises:
      ValueError: in case of invalid arguments.
  """
  mode = ModeKeys.TRAIN
  # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
  steps_per_epoch = training_utils.infer_steps_for_dataset(
      dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch')
  if (current_strategy.extended.steps_per_run != 1 and
      steps_per_epoch is None):
    raise ValueError('`steps_per_epoch` should be specified when calling '
                     '`fit` on the model with TPUStrategy when '
                     '`steps_per_run` != 1 .')

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=1)
  scope.__enter__()

  out_labels = model.metrics_names or []

  step_fn = _make_step_fn(model, ModeKeys.TRAIN, current_strategy, out_labels)

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name in model.metrics_names[1:]:
    tensor = model._all_stateful_metrics_tensors[name]
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  use_steps = steps_per_epoch is not None
  if use_steps:
    iteration_value = min(steps_per_epoch,
                          current_strategy.extended.steps_per_run)
  else:
    iteration_value = current_strategy.extended.steps_per_run

  steps_per_run = K.variable(
      value=iteration_value,
      dtype='int32',
      name='steps_per_run')
  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=steps_per_run,
      initial_loop_values=initial_loop_values)
  train_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  do_validation = bool(validation_steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      verbose=verbose,
      count_mode='steps',
      mode=mode)

  # Calculate the steps each time on the device.
  if use_steps:
    steps_to_run = ([current_strategy.extended.steps_per_run] *
                    (steps_per_epoch //
                     current_strategy.extended.steps_per_run))
    if steps_per_epoch % current_strategy.extended.steps_per_run:
      steps_to_run.append(
          steps_per_epoch % current_strategy.extended.steps_per_run)
    target_steps = len(steps_to_run)
  else:
    target_steps = np.inf

  callbacks._call_begin_hook(mode)
  for epoch in range(initial_epoch, epochs):
    distributed_training_utils._reset_metrics(model)
    callbacks.on_epoch_begin(epoch)
    epoch_logs = {}
    step_index = 0
    prev_step_count = None
    current_step = 0
    while current_step < target_steps:
      step_count = steps_to_run[current_step] if use_steps else 1
      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
      callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
      if prev_step_count is None or step_count != prev_step_count:
        steps_per_run.load(step_count, K.get_session())
        prev_step_count = step_count
      try:
        _, outputs = K.batch_get_value([train_op, output_tensors])
      except errors.OutOfRangeError:
        if use_steps:
          logging.warning('Your dataset iterator ran out of data; '
                          'interrupting training. Make sure that your dataset '
                          'can generate at least `steps_per_epoch * epochs` '
                          'batches (in this case, %d batches).' %
                          steps_per_epoch * epochs)
        else:
          target_steps = current_step
          logging.info('Dataset iterator ran out of data. Inferring the '
                       'value of `steps_per_epoch` as %s  .' % target_steps)
          distributed_training_utils.initialize_iterator(iterator,
                                                         current_strategy)
        break

      batch_logs.update(outputs)
      callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
      step_index = step_index + step_count
      current_step += 1

      if callbacks.model.stop_training:
        break

    if (do_validation and
        training_utils.should_run_validation(validation_freq, epoch)):
      logging.info('Running validation at fit epoch: %s', epoch)

      if model._compile_distribution:
        # Since we create a new clone from the original model we need to copy
        # the weights back to the original model before we can run validation.
        distributed_training_utils._copy_weights_to_original_model(
            model, ModeKeys.TRAIN)

      val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
          model,
          val_dataset,
          steps=validation_steps,
          verbose=verbose,
          callbacks=callbacks)
      if not isinstance(val_outs, list):
        val_outs = [val_outs]
      # Same labels assumed.
      for label, val_out in zip(out_labels, val_outs):
        epoch_logs['val_' + label] = val_out

    callbacks.on_epoch_end(epoch, epoch_logs)
    if callbacks.model.stop_training:
      break
  callbacks._call_end_hook(mode)

  if model._compile_distribution:
    # Copy the weights back from the replicated model to the original model.
    distributed_training_utils._copy_weights_to_original_model(
        model, ModeKeys.TRAIN)
  scope.__exit__(None, None, None)
  return model.history
コード例 #12
0
def model_iteration(model,
                    data,
                    steps_per_epoch=None,
                    epochs=1,
                    verbose=1,
                    callbacks=None,
                    validation_data=None,
                    validation_steps=None,
                    validation_freq=1,
                    train_class_weight=None,
                    val_class_weight=None,
                    max_queue_size=10,
                    workers=1,
                    use_multiprocessing=False,
                    shuffle=False,
                    initial_epoch=0,
                    mode=ModeKeys.TRAIN,
                    batch_size=None,
                    steps_name='steps',
                    **kwargs):

    if 'steps' in kwargs:
        steps_per_epoch = kwargs['steps']

    # Determine the number of steps per epoch and whether we should reset the
    # dataset at the end of each epoch.
    reset_dataset_after_each_epoch = False
    original_dataset = None
    is_dataset = isinstance(data,
                            (dataset_ops.DatasetV2, dataset_ops.DatasetV1))
    if is_dataset:
        original_dataset = data
        if steps_per_epoch is None:
            reset_dataset_after_each_epoch = True
            steps_per_epoch = training_utils.infer_steps_for_dataset(
                data, steps_per_epoch, epochs=epochs, steps_name=steps_name)

    # Convert to a format that supports `next(generator)`.
    generator, steps_per_epoch = convert_to_generator_like(
        data,
        steps_per_epoch=steps_per_epoch,
        batch_size=batch_size,
        epochs=epochs - initial_epoch,
        shuffle=shuffle)

    do_validation = validation_data is not None
    is_sequence = isinstance(generator, data_utils.Sequence)
    _validate_arguments(is_sequence, is_dataset, use_multiprocessing, workers,
                        steps_per_epoch, validation_data, validation_steps,
                        mode, kwargs)

    # print(train_class_weight, 'before make execution')
    ######################################################################
    batch_function = _make_execution_function(
        model,
        mode,
        train_class_weight=train_class_weight,
        val_class_weight=val_class_weight)
    ######################################################################

    # Create the queue for the generator.
    enqueuer = None
    if not is_dataset:
        generator, enqueuer = _make_enqueued_generator(
            generator,
            workers=workers,
            use_multiprocessing=use_multiprocessing,
            max_queue_size=max_queue_size,
            shuffle=shuffle)

    num_samples_or_steps, use_steps = _get_num_samples_or_steps(
        data, steps_per_epoch)

    count_mode = 'steps' if use_steps else 'samples'
    callbacks = cbks.configure_callbacks(callbacks,
                                         model,
                                         do_validation=do_validation,
                                         epochs=epochs,
                                         steps_per_epoch=steps_per_epoch,
                                         batch_size=batch_size,
                                         samples=num_samples_or_steps,
                                         verbose=verbose,
                                         count_mode=count_mode,
                                         mode=mode)

    if mode == ModeKeys.PREDICT:
        aggregator = training_utils.OutputsAggregator(True,
                                                      steps=steps_per_epoch)
    else:
        aggregator = training_utils.MetricsAggregator(True,
                                                      steps=steps_per_epoch)

    should_set_learning_phase = context.executing_eagerly(
    ) and model.run_eagerly
    if should_set_learning_phase:
        learning_phase_scope = backend.eager_learning_phase_scope(
            1 if mode == ModeKeys.TRAIN else 0)
        learning_phase_scope.__enter__()

    callbacks.model.stop_training = False
    callbacks._call_begin_hook(mode)

    print(initial_epoch, mode)
    # TODO: mode is a bug?
    # https://github.com/tensorflow/tensorflow/blob/r2.2/tensorflow/python/keras/engine/training.py
    initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch)

    for epoch in range(initial_epoch, epochs):
        if callbacks.model.stop_training:
            break

        # Setup work for each epoch.
        model.reset_metrics()
        epoch_logs = {}
        if mode == ModeKeys.TRAIN:
            callbacks.on_epoch_begin(epoch, epoch_logs)

        if steps_per_epoch is None:
            # Loop over dataset until `OutOfRangeError` is raised.
            target_steps = np.inf
        else:
            # Loop over dataset for the specified number of steps.
            target_steps = steps_per_epoch

        step = 0
        while step < target_steps:
            batch_data = _get_next_batch(generator)
            if batch_data is None:
                if is_dataset:
                    # The dataset passed by the user ran out of batches.
                    # Now we know the cardinality of the dataset.
                    # If steps_per_epoch was specified, then running out of data is
                    # unexpected, so we stop training and inform the user.
                    if steps_per_epoch:
                        callbacks.model.stop_training = True
                        logging.warning(
                            'Your dataset ran out of data; interrupting training. '
                            'Make sure that your dataset can generate at least '
                            '`%s * epochs` batches (in this case, %d batches). '
                            'You may need to use the repeat() function when '
                            'building your dataset.' %
                            (steps_name, steps_per_epoch * epochs))
                    elif step > 0:
                        steps_per_epoch = step
                        aggregator.steps = steps_per_epoch
                else:
                    # We ran out of batches while the user passed an iterator (legacy).
                    callbacks.model.stop_training = True
                    logging.warning(
                        'Your dataset iterator ran out of data; '
                        'interrupting training. Make sure that your iterator '
                        'can generate at least `%s * epochs` '
                        'batches (in this case, %d batches). You may need to'
                        'use the repeat() function when building your '
                        'dataset.' % (steps_name, steps_per_epoch * epochs))
                break

            # `batch_size` used for validation data if validation
            # data is NumPy/EagerTensors.
            batch_size = int(nest.flatten(batch_data)[0].shape[0])

            # Callbacks batch begin.
            batch_logs = {'batch': step, 'size': batch_size}
            callbacks._call_batch_hook(mode, 'begin', step, batch_logs)

            is_deferred = not model._is_compiled
            ######################################################
            batch_outs = batch_function(*batch_data)
            ######################################################
            if not isinstance(batch_outs, list):
                batch_outs = [batch_outs]

            if step == 0:
                aggregator.create(batch_outs)

                if is_deferred:
                    # Set callbacks params. We do this here when model is compiled only
                    # in the first iteration of this loop (deferred build scenario).
                    cbks.set_callback_parameters(
                        callbacks,
                        model,
                        do_validation=do_validation,
                        batch_size=batch_size,
                        epochs=epochs,
                        steps_per_epoch=steps_per_epoch,
                        samples=num_samples_or_steps,
                        verbose=verbose,
                        mode=mode)

            # Aggregate results.
            aggregator.aggregate(batch_outs)

            # Callbacks batch end.
            batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
            callbacks._call_batch_hook(mode, 'end', step, batch_logs)
            step += 1

            if callbacks.model.stop_training:
                break

        aggregator.finalize()
        results = aggregator.results
        epoch_logs = cbks.make_logs(model, epoch_logs, results, mode)
        if len(results) == 1:
            results = results[0]

        # Run the test loop every epoch during training.
        if (do_validation and training_utils.should_run_validation(
                validation_freq, epoch) and not callbacks.model.stop_training):
            ############################################################################
            val_results = model_iteration(
                model,
                validation_data,
                steps_per_epoch=validation_steps,
                batch_size=batch_size,
                val_class_weight=val_class_weight,  ######## HACK
                workers=workers,
                use_multiprocessing=use_multiprocessing,
                max_queue_size=max_queue_size,
                callbacks=callbacks,
                verbose=0,
                mode=ModeKeys.TEST,
                steps_name='validation_steps')
            ############################################################################

            if not isinstance(val_results, list):
                val_results = [val_results]
            epoch_logs = cbks.make_logs(model,
                                        epoch_logs,
                                        val_results,
                                        mode,
                                        prefix='val_')

        if mode == ModeKeys.TRAIN:
            # Epochs only apply to `fit`.
            callbacks.on_epoch_end(epoch, epoch_logs)

        # Recreate dataset iterator for the next epoch.
        if reset_dataset_after_each_epoch and epoch < epochs - 1:
            generator = dataset_ops.make_one_shot_iterator(original_dataset)

    callbacks._call_end_hook(mode)

    if enqueuer is not None:
        enqueuer.stop()

    if should_set_learning_phase:
        learning_phase_scope.__exit__(None, None, None)

    if mode == ModeKeys.TRAIN:
        return model.history
    return results
コード例 #13
0
ファイル: training.py プロジェクト: vlad-user/deep-tempering
def model_iteration(
    model,
    inputs,
    targets=None,
    batch_size=2,
    epochs=1,
    verbose=1,
    callbacks=None,
    validation_data=None,
    exchange_data=None,
    shuffle=False,
    validation_split=0.0,
    exchange_split=0.0,
    validation_freq=1,
    random_data_split_state=0,
    mode=ModeKeys.TRAIN,
):
    """Loop function for arrays of data with modes TRAIN/TEST/PREDICT.
  Args:
    model: `EnsembleModel` instance.
    inputs: Either a list or dictionary of arrays, or a dataset instance.
    targets: List/dictionary of input arrays.
    batch_size: Integer batch size or None if unknown.
    epochs: Number of times to iterate over the data
    verbose: 0, 1, or 2. Verbosity mode.
      0 = silent, 1 = progress bar, 2 = one line per epoch.
      Note that the progress bar is not particularly useful when
      logged to a file, so verbose=2 is recommended when not running
      interactively (eg, in a production environment).
    callbacks: List of callbacks to be called during training
    validation_data:
    shuffle: Whether to shuffle the data at the beginning of each epoch
      concatenation of list the display names of the outputs of `f` and the
      list of display names of the outputs of `f_val`.
    validation_freq: Only relevant if validation data is provided. Integer or
      `collections_abc.Container` instance (e.g. list, tuple, etc.). If an
      integer, specifies how many training epochs to run before a new
      validation run is performed, e.g. `validation_freq=2` runs
      validation every 2 epochs. If a Container, specifies the epochs on
      which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
      validation at the end of the 1st, 2nd, and 10th epochs.
    mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.


  Returns:
    - In TRAIN mode: `History` object.
    - In TEST mode: Evaluation metrics.
    - In PREDICT mode: Outputs of the Model called on inputs.
  Raises:
    ValueError: in case of invalid arguments.
  """
    datasets = training_utils.prepare_data_iterables(
        inputs,
        targets,
        validation_split=validation_split,
        validation_data=validation_data,
        exchange_data=exchange_data,
        batch_size=batch_size,
        shuffle=shuffle,
        exchange_split=exchange_split,
        random_state=random_data_split_state)

    val_samples_or_steps = None
    exchange_data = None
    if (isinstance(datasets, training_utils.DataIterable)
            or len(datasets) == 3 and datasets[1] is None):
        # TEST, PREDICT modes or TRAIN mode without validation data
        do_validation = False
        if isinstance(datasets, training_utils.DataIterable):
            train_dataset = datasets
        else:
            train_dataset = datasets[0]
            exchange_data = datasets[2]
    else:
        # TRAIN mode with validation data
        do_validation = True
        train_dataset, test_dataset, exchange_data = datasets
        val_samples_or_steps = len(test_dataset)

    num_samples_or_steps = len(train_dataset)

    f = _make_execution_function(model,
                                 mode)  #whether to train, test or predict

    if mode == ModeKeys.PREDICT:
        aggregator = keras_train_utils.OutputsAggregator(
            use_steps=False, num_samples=num_samples_or_steps)
    else:
        aggregator = training_utils.MetricsAggregator(
            n_replicas=model.n_replicas, num_samples=len(train_dataset))

    if mode == ModeKeys.TRAIN and verbose:
        _print_train_info(num_samples_or_steps,
                          val_samples_or_steps,
                          replicas=model.n_replicas,
                          increment='samples')

    # Configure callbacks
    callbacks = cbks.configure_callbacks(
        callbacks,
        model,
        do_validation=do_validation,
        batch_size=batch_size,
        samples=num_samples_or_steps,
        epochs=epochs,
        verbose=verbose,
        mode=mode,
        exchange_data=exchange_data,
    )

    callbacks.model.stop_training = False
    callbacks._call_begin_hook(mode)

    for epoch in range(epochs):
        if callbacks.model.stop_training:
            break

        # Setup work for each epoch
        epoch_logs = {}
        if mode != ModeKeys.PREDICT:
            # Collecting and resetting metrics has non-zero cost and will needlessly
            # slow down model.predict.
            model.reset_metrics()

        callbacks._call_epoch_hook(mode, 'begin', epoch, epoch_logs)

        # batch_start and batch_end are added so we can use the
        # Keras' aggregator. It accepts it as args to compute
        # weighted batch size average of the overall losses.
        batch_start = 0
        for batch_index, batch_data in enumerate(train_dataset):
            # Callbacks batch_begin.
            if mode == ModeKeys.PREDICT:
                x, y = batch_data, None
            else:
                x, y = batch_data

            batch_end = batch_start + x.shape[0]
            batch_logs = {'batch': batch_index, 'size': x.shape[0]}
            callbacks._call_batch_hook(mode, 'begin', batch_index, batch_logs)

            # Get outputs.
            batch_outs = f(x, y)

            if not isinstance(batch_outs, list):
                batch_outs = [batch_outs]

            # Aggregate results.
            if batch_index == 0:
                aggregator.create(batch_outs)
            aggregator.aggregate(batch_outs, batch_start, batch_end)

            # Callbacks batch end.
            batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
            callbacks._call_batch_hook(mode, 'end', batch_index, batch_logs)

            batch_start = batch_end

            if callbacks.model.stop_training:
                break

        aggregator.finalize()

        results = aggregator.results
        epoch_logs = cbks.make_logs(model, epoch_logs, results, mode)
        if len(results) == 1:
            results = results[0]

        if (do_validation and keras_train_utils.should_run_validation(
                validation_freq, epochs)):
            val_results = model_iteration(model,
                                          test_dataset,
                                          targets=None,
                                          batch_size=batch_size,
                                          callbacks=callbacks,
                                          verbose=0,
                                          mode=ModeKeys.TEST)
            if not isinstance(val_results, list):
                val_results = [val_results]

            epoch_logs = cbks.make_logs(model,
                                        epoch_logs,
                                        val_results,
                                        mode,
                                        prefix='val_')

        callbacks._call_epoch_hook(mode, 'end', epoch, epoch_logs)

    callbacks._call_end_hook(mode)
    callbacks.on_train_end()  # TODO: move this to CallbackListWrap

    if mode == ModeKeys.TRAIN:
        return model.history
    return results
コード例 #14
0
def fit_generator(model,
                  generator,
                  steps_per_epoch=None,
                  epochs=1,
                  verbose=1,
                  callbacks=None,
                  validation_data=None,
                  validation_steps=None,
                  class_weight=None,
                  max_queue_size=10,
                  workers=1,
                  use_multiprocessing=False,
                  shuffle=True,
                  initial_epoch=0):
    """See docstring for `Model.fit_generator`."""
    epoch = initial_epoch

    do_validation = bool(validation_data)
    if not context.executing_eagerly():
        model._make_train_function()
        if do_validation:
            model._make_test_function()

    is_sequence = isinstance(generator, Sequence)
    if not is_sequence and use_multiprocessing and workers > 1:
        logging.warning(
            UserWarning('Using a generator with `use_multiprocessing=True`'
                        ' and multiple workers may duplicate your data.'
                        ' Please consider using the`keras.utils.Sequence'
                        ' class.'))
    if steps_per_epoch is None:
        if is_sequence:
            steps_per_epoch = len(generator)
        else:
            raise ValueError('`steps_per_epoch=None` is only valid for a'
                             ' generator based on the `keras.utils.Sequence`'
                             ' class. Please specify `steps_per_epoch` or use'
                             ' the `keras.utils.Sequence` class.')

            is_deferred = not model._is_compiled
            batch_outs = batch_function(*batch_data)
            if not isinstance(batch_outs, list):
                batch_outs = [batch_outs]

            if step == 0:
                aggregator.create(batch_outs)

                if is_deferred:
                    # Set callbacks params. We do this here when model is compiled only
                    # in the first iteration of this loop (deferred build scenario).
                    cbks.set_callback_parameters(
                        callbacks,
                        model,
                        do_validation=do_validation,
                        batch_size=batch_size,
                        epochs=epochs,
                        steps_per_epoch=steps_per_epoch,
                        samples=num_samples_or_steps,
                        verbose=verbose,
                        mode=mode)

                    progbar.params = callbacks.params
                    progbar.params['verbose'] = verbose

            # Aggregate results.
            aggregator.aggregate(batch_outs)

            # Callbacks batch end.
            batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
            callbacks._call_batch_hook(mode, 'end', step, batch_logs)
            progbar.on_batch_end(step, batch_logs)
            step += 1

        callbacks.on_train_begin()
        # Construct epoch logs.
        epoch_logs = {}
        while epoch < epochs:
            for m in model.stateful_metric_functions:
                m.reset_states()
            callbacks.on_epoch_begin(epoch)
            steps_done = 0
            batch_index = 0
            while steps_done < steps_per_epoch:
                generator_output = next(output_generator)

                if not hasattr(generator_output, '__len__'):
                    raise ValueError('Output of generator should be '
                                     'a tuple `(x, y, sample_weight)` '
                                     'or `(x, y)`. Found: ' +
                                     str(generator_output))

                if len(generator_output) == 2:
                    x, y = generator_output
                    sample_weight = None
                elif len(generator_output) == 3:
                    x, y, sample_weight = generator_output
                else:
                    raise ValueError('Output of generator should be '
                                     'a tuple `(x, y, sample_weight)` '
                                     'or `(x, y)`. Found: ' +
                                     str(generator_output))
                # build batch logs
                batch_logs = {}
                if isinstance(x, list):
                    batch_size = x[0].shape[0]
                elif isinstance(x, dict):
                    batch_size = list(x.values())[0].shape[0]
                else:
                    batch_size = x.shape[0]
                batch_logs['batch'] = batch_index
                batch_logs['size'] = batch_size
                callbacks.on_batch_begin(batch_index, batch_logs)

                outs = model.train_on_batch(x,
                                            y,
                                            sample_weight=sample_weight,
                                            class_weight=class_weight)

                if not isinstance(outs, list):
                    outs = [outs]
                for l, o in zip(model.metrics_names, outs):
                    batch_logs[l] = o

                callbacks.on_batch_end(batch_index, batch_logs)

                batch_index += 1
                steps_done += 1

                # Epoch finished.
                if steps_done >= steps_per_epoch and do_validation:
                    if val_gen:
                        val_outs = evaluate_generator(
                            model,
                            validation_data,
                            validation_steps,
                            workers=workers,
                            use_multiprocessing=use_multiprocessing,
                            max_queue_size=max_queue_size)
                    else:
                        # No need for try/except because
                        # data has already been validated.
                        val_outs = model.evaluate(
                            val_x,
                            val_y,
                            batch_size=batch_size,
                            sample_weight=val_sample_weights,
                            verbose=0)
                    if not isinstance(val_outs, list):
                        val_outs = [val_outs]
                    # Same labels assumed.
                    for l, o in zip(model.metrics_names, val_outs):
                        epoch_logs['val_' + l] = o

                if callbacks.model.stop_training:
                    break

            callbacks.on_epoch_end(epoch, epoch_logs)
            epoch += 1
            if callbacks.model.stop_training:
                break

        aggregator.finalize()
        results = aggregator.results
        epoch_logs = cbks.make_logs(model, epoch_logs, results, mode)
        if len(results) == 1:
            results = results[0]

        # Run the test loop every epoch during training.
        if (do_validation and training_utils.should_run_validation(
                validation_freq, epoch) and not callbacks.model.stop_training):
            val_results = model_iteration(
                model,
                validation_data,
                steps_per_epoch=validation_steps,
                batch_size=batch_size,
                class_weight=class_weight,
                workers=workers,
                use_multiprocessing=use_multiprocessing,
                max_queue_size=max_queue_size,
                callbacks=callbacks,
                verbose=0,
                mode=ModeKeys.TEST,
                steps_name='validation_steps')

            if not isinstance(val_results, list):
                val_results = [val_results]
            epoch_logs = cbks.make_logs(model,
                                        epoch_logs,
                                        val_results,
                                        mode,
                                        prefix='val_')

        if mode == ModeKeys.TRAIN:
            # Epochs only apply to `fit`.
            callbacks.on_epoch_end(epoch, epoch_logs)
        progbar.on_epoch_end(epoch, epoch_logs)

        # Recreate dataset iterator for the next epoch.
        if reset_dataset_after_each_epoch and epoch < epochs - 1:
            generator = dataset_ops.make_one_shot_iterator(original_dataset)

    callbacks._call_end_hook(mode)

    if enqueuer is not None:
        enqueuer.stop()

    if should_set_learning_phase:
        backend.set_eager_learning_phase(old_learning_phase)

    if mode == ModeKeys.TRAIN:
        return model.history
    return results
コード例 #15
0
def model_iteration(model,
                    inputs,
                    targets=None,
                    sample_weights=None,
                    batch_size=None,
                    epochs=1,
                    verbose=1,
                    callbacks=None,
                    val_inputs=None,
                    val_targets=None,
                    val_sample_weights=None,
                    shuffle=True,
                    initial_epoch=0,
                    steps_per_epoch=None,
                    validation_steps=None,
                    validation_freq=1,
                    mode=ModeKeys.TRAIN,
                    validation_in_fit=False,
                    steps_name='steps',
                    **kwargs):
    """Loop function for arrays of data with modes TRAIN/TEST/PREDICT.

  Arguments:
      model: Keras Model instance.
      inputs: Either a list or dictionary of arrays, or a dataset instance.
      targets: List/dictionary of input arrays.
      sample_weights: Optional list of sample weight arrays.
      batch_size: Integer batch size or None if unknown.
      epochs: Number of times to iterate over the data
      verbose: Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      val_inputs: Either a list or dictionary of arrays, or a dataset instance.
      val_targets: List/dictionary of target arrays.
      val_sample_weights: Optional list of sample weight arrays.
      shuffle: Whether to shuffle the data at the beginning of each epoch
        concatenation of list the display names of the outputs of `f` and the
        list of display names of the outputs of `f_val`.
      initial_epoch: Epoch at which to start training (useful for resuming a
        previous training run)
      steps_per_epoch: Total number of steps (batches of samples) before
        declaring one epoch finished and starting the next epoch. Ignored with
        the default value of `None`.
      validation_steps: Number of steps to run validation for (only if doing
        validation from data tensors). Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
        `collections.Container` instance (e.g. list, tuple, etc.). If an
        integer, specifies how many training epochs to run before a new
        validation run is performed, e.g. `validation_freq=2` runs
        validation every 2 epochs. If a Container, specifies the epochs on
        which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
        validation at the end of the 1st, 2nd, and 10th epochs.
      mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
      validation_in_fit: DEPRECATED: if true, then this method is invoked from
        within training iteration (for validation). In this case, do not copy
        weights when using a tf.distribute.Strategy. The input is deprecated as
        it is not required if the user creates a distributed model under the
        distribution strategy scope rather than passing it to compile.
      steps_name: The string name of the steps argument, either `steps`,
        `validation_steps`, or `steps_per_epoch`. Only used for error message
        formatting.
      **kwargs: Additional arguments for backwards compatibility.

  Returns:
      - In TRAIN mode: `History` object.
      - In TEST mode: Evaluation metrics.
      - In PREDICT mode: Outputs of the Model called on inputs.

  Raises:
      ValueError: in case of invalid arguments.
  """
    # Backwards compatibility.
    if 'steps' in kwargs:
        steps_per_epoch = kwargs.pop('steps')
    if kwargs:
        raise TypeError('Unknown arguments: %s' % (kwargs, ))

    # In case we were passed a dataset, we extract symbolic tensors from it.
    reset_dataset_after_each_epoch = False
    original_dataset = None
    is_dataset = isinstance(inputs,
                            (dataset_ops.DatasetV1, dataset_ops.DatasetV2))
    # TODO(fchollet): consider moving `steps_per_epoch` inference to
    # _standardize_user_data and set reset_dataset_after_each_epoch as an
    # attribute on the dataset instance.
    if is_dataset:
        original_dataset = inputs
        if steps_per_epoch is None:
            reset_dataset_after_each_epoch = True
            steps_per_epoch = training_utils.infer_steps_for_dataset(
                inputs, steps_per_epoch, epochs=epochs, steps_name=steps_name)

    if mode == ModeKeys.TRAIN:
        _print_train_info(inputs, val_inputs, steps_per_epoch, verbose)

    # Enter DistributionStrategy scope.
    if model._distribution_strategy:
        scope = model._distribution_strategy.scope()
        scope.__enter__()

    # Get step function and loop type.
    f = _make_execution_function(model, mode)
    use_steps = is_dataset or steps_per_epoch is not None
    do_validation = val_inputs is not None

    # Convert Eager Tensors to NumPy arrays to support batching/shuffling.
    inputs, targets, sample_weights = training_utils. \
        convert_eager_tensors_to_numpy((inputs, targets, sample_weights))

    # Prepare input data.
    ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode)
    if not is_dataset:
        num_samples_or_steps = _get_num_samples_or_steps(
            ins, batch_size, steps_per_epoch)
    else:
        num_samples_or_steps = steps_per_epoch

    # Configure callbacks.
    count_mode = 'steps' if use_steps else 'samples'
    callbacks = cbks.configure_callbacks(
        callbacks,
        model,
        do_validation=do_validation,
        batch_size=batch_size,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        samples=num_samples_or_steps,
        verbose=0,  # Handle ProgBarLogger separately in this loop.
        mode=mode)
    # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready.
    progbar = training_utils.get_progbar(model, count_mode)
    progbar.params = callbacks.params
    progbar.params['verbose'] = verbose

    # Find beforehand arrays that need sparse-to-dense conversion.
    if issparse is not None and not use_steps:
        indices_for_conversion_to_dense = []
        feed = _get_model_feed(model, mode)
        for i, (input_data, feed_tensor) in enumerate(zip(ins, feed)):
            if issparse(input_data) and not K.is_sparse(feed_tensor):
                indices_for_conversion_to_dense.append(i)

    # Select aggregation method.
    if mode == ModeKeys.PREDICT:
        aggregator = training_utils.OutputsAggregator(use_steps,
                                                      num_samples_or_steps)
    else:
        aggregator = training_utils.MetricsAggregator(use_steps,
                                                      num_samples_or_steps)

    if model._compile_distribution and not validation_in_fit:
        distributed_training_utils._copy_weights_to_distributed_model(
            model, model._distributed_model)

    callbacks.model.stop_training = False
    callbacks._call_begin_hook(mode)
    progbar.on_train_begin()

    for epoch in range(initial_epoch, epochs):
        if callbacks.model.stop_training:
            break

        # Setup work for each epoch
        epoch_logs = {}
        model.reset_metrics()
        if mode == ModeKeys.TRAIN:
            callbacks.on_epoch_begin(epoch, epoch_logs)
        progbar.on_epoch_begin(epoch, epoch_logs)

        if use_steps:
            # Step-wise loop.
            if steps_per_epoch is None:
                # Loop over dataset until `OutOfRangeError` is raised.
                target_steps = np.inf
            else:
                # Loop over dataset for the specified number of steps.
                target_steps = steps_per_epoch

            step = 0
            while step < target_steps:
                batch_logs = {'batch': step, 'size': 1}
                callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
                progbar.on_batch_begin(step, batch_logs)

                # Get outputs.
                try:
                    # `ins` can be callable in DistributionStrategy + eager case.
                    actual_inputs = ins() if callable(ins) else ins
                    batch_outs = f(actual_inputs)
                except errors.OutOfRangeError:
                    if original_dataset is None:
                        # We ran out of batches while the user passed an iterator (legacy).
                        logging.warning(
                            'Your dataset iterator ran out of data; '
                            'interrupting training. Make sure that your iterator '
                            'can generate at least `%s * epochs` '
                            'batches (in this case, %d batches). You may need to'
                            'use the repeat() function when building your '
                            'dataset.' %
                            (steps_name, steps_per_epoch * epochs))
                        callbacks.model.stop_training = True
                    else:
                        # The dataset passed by the user ran out of batches.
                        # Now we know the cardinality of the dataset.
                        if step > 0:
                            steps_per_epoch = step
                            aggregator.num_samples_or_steps = steps_per_epoch
                            progbar.params['steps'] = steps_per_epoch
                            progbar.progbar.target = steps_per_epoch
                    break

                if not isinstance(batch_outs, list):
                    batch_outs = [batch_outs]

                if model._distribution_strategy:
                    batch_outs = distributed_training_utils._per_device_aggregate_batch(
                        batch_outs, model, mode)

                # Aggregate results.
                if step == 0:
                    aggregator.create(batch_outs)
                aggregator.aggregate(batch_outs)

                # Callbacks batch end.
                batch_logs = cbks.make_logs(model, batch_logs, batch_outs,
                                            mode)
                callbacks._call_batch_hook(mode, 'end', step, batch_logs)
                progbar.on_batch_end(step, batch_logs)
                step += 1

                if callbacks.model.stop_training:
                    break
        else:
            # Sample-wise loop.
            index_array = np.arange(num_samples_or_steps)
            if shuffle == 'batch':
                index_array = training_utils.batch_shuffle(
                    index_array, batch_size)
            elif shuffle:
                np.random.shuffle(index_array)
            batches = make_batches(num_samples_or_steps, batch_size)

            for batch_index, (batch_start, batch_end) in enumerate(batches):
                batch_ids = index_array[batch_start:batch_end]

                # Slice into a batch.
                try:
                    if ins and isinstance(ins[-1], int):
                        # Do not slice the training phase flag.
                        ins_batch = slice_arrays(ins[:-1],
                                                 batch_ids) + [ins[-1]]
                    else:
                        ins_batch = slice_arrays(ins, batch_ids)
                except TypeError:
                    raise TypeError('TypeError while preparing batch. '
                                    'If using HDF5 input data, '
                                    'pass shuffle="batch".')

                # Sparse to dense conversion.
                if issparse is not None:
                    for i in indices_for_conversion_to_dense:
                        ins_batch[i] = ins_batch[i].toarray()

                # Callbacks batch_begin.
                batch_logs = {'batch': batch_index, 'size': len(batch_ids)}
                callbacks._call_batch_hook(mode, 'begin', batch_index,
                                           batch_logs)
                progbar.on_batch_begin(batch_index, batch_logs)

                # Get outputs.
                batch_outs = f(ins_batch)
                if not isinstance(batch_outs, list):
                    batch_outs = [batch_outs]

                # Aggregate results.
                if batch_index == 0:
                    aggregator.create(batch_outs)
                aggregator.aggregate(batch_outs, batch_start, batch_end)

                # Callbacks batch end.
                batch_logs = cbks.make_logs(model, batch_logs, batch_outs,
                                            mode)
                callbacks._call_batch_hook(mode, 'end', batch_index,
                                           batch_logs)
                progbar.on_batch_end(batch_index, batch_logs)

                if callbacks.model.stop_training:
                    break

        aggregator.finalize()
        results = aggregator.results
        epoch_logs = cbks.make_logs(model, epoch_logs, results, mode)
        if len(results) == 1:
            results = results[0]

        # Run the test loop every `validation_freq` epochs during training.
        if (do_validation and training_utils.should_run_validation(
                validation_freq, epoch) and not callbacks.model.stop_training):
            val_results = model_iteration(model,
                                          val_inputs,
                                          targets=val_targets,
                                          sample_weights=val_sample_weights,
                                          batch_size=batch_size,
                                          steps_per_epoch=validation_steps,
                                          callbacks=callbacks,
                                          verbose=0,
                                          mode=ModeKeys.TEST,
                                          validation_in_fit=True,
                                          steps_name='validation_steps')
            if not isinstance(val_results, list):
                val_results = [val_results]
            epoch_logs = cbks.make_logs(model,
                                        epoch_logs,
                                        val_results,
                                        mode,
                                        prefix='val_')

        if mode == ModeKeys.TRAIN:
            # Epochs only apply to `fit`.
            callbacks.on_epoch_end(epoch, epoch_logs)
        progbar.on_epoch_end(epoch, epoch_logs)

        # Recreate dataset iterator for the next epoch.
        if reset_dataset_after_each_epoch and epoch < epochs - 1:
            ins = _prepare_feed_values(model, original_dataset, None, None,
                                       mode)

    callbacks._call_end_hook(mode)

    if model._distribution_strategy:
        if model._compile_distribution and not validation_in_fit:
            # TODO(priyag, psv): Copy back metrics to the original model as well?
            distributed_training_utils._copy_weights_to_original_model(
                model, model._distributed_model, mode)
        scope.__exit__(None, None, None)

    if mode == ModeKeys.TRAIN:
        return model.history
    return results