def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
    """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      Returns `None`.

  Raises:
      ValueError: in case of invalid arguments.
  """
    mode = ModeKeys.TRAIN
    # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
    current_strategy = model._distribution_strategy
    iterator = distributed_training_utils.get_iterator(dataset,
                                                       current_strategy)
    steps_per_epoch = training_utils.infer_steps_for_dataset(
        dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch')
    if (current_strategy.extended.steps_per_run != 1
            and steps_per_epoch is None):
        raise ValueError('`steps_per_epoch` should be specified when calling '
                         '`fit` on the model with TPUStrategy when '
                         '`steps_per_run` != 1 .')

    scope = distributed_training_utils.distributed_scope(
        strategy=current_strategy, learning_phase=1)
    scope.__enter__()

    def _per_device_fit_function(model):
        model._make_fit_function()
        return (model._fit_function.inputs, model._fit_function.outputs,
                model._fit_function.updates_op,
                model._fit_function.session_kwargs)

    out_labels = model.metrics_names or []

    def step_fn(ctx, inputs):
        """Clones the model and calls make_fit_function."""
        inputs, targets = inputs
        if model._compile_distribution:
            distributed_training_utils.clone_model_on_replicas(
                model, current_strategy, mode, inputs=inputs, targets=targets)
        else:
            distributed_training_utils._build_distributed_network(
                model, current_strategy, mode, inputs, targets)

        (grouped_inputs, grouped_outputs, grouped_updates, grouped_session_args
         ) = current_strategy.extended.call_for_each_replica(
             _per_device_fit_function,
             args=(distributed_training_utils.get_distributed_model(
                 model, ModeKeys.TRAIN), ))
        (all_inputs, all_outputs, all_updates,
         all_session_args) = distributed_training_utils.unwrap_values(
             current_strategy, grouped_inputs, grouped_outputs,
             grouped_updates, grouped_session_args)
        combined_fn = K.function(all_inputs,
                                 all_outputs,
                                 updates=all_updates,
                                 name='distributed_fit_function',
                                 **all_session_args)

        for label, output in zip(out_labels, combined_fn.outputs):
            if label == 'loss':
                reduce_op = ds_reduce_util.ReduceOp.SUM
            else:
                # We reduce all other metrics using mean for now. This is temporary
                # workaround until new metrics are in place.
                reduce_op = ds_reduce_util.ReduceOp.MEAN
            ctx.set_last_step_output(label, output, reduce_op)

        # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
        # feed_dict, session kwargs, run options, run_metadata for now. These should
        # be handled appropriately
        return combined_fn.updates_op

    # Add initial dummy values for loss and other metric tensors.
    initial_loop_values = {}
    initial_loop_values['loss'] = constant_op.constant(1e7)
    for name in model.metrics_names[1:]:
        tensor = model._all_stateful_metrics_tensors[name]
        initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

    use_steps = steps_per_epoch is not None
    if use_steps:
        iteration_value = min(steps_per_epoch,
                              current_strategy.extended.steps_per_run)
    else:
        iteration_value = current_strategy.extended.steps_per_run

    steps_per_run = K.variable(value=iteration_value,
                               dtype='int32',
                               name='steps_per_run')
    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn,
        iterator,
        iterations=steps_per_run,
        initial_loop_values=initial_loop_values)
    train_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    do_validation = bool(validation_steps)

    if model._compile_distribution:
        distributed_training_utils._copy_weights_to_distributed_model(
            model, mode)

    callbacks = cbks.configure_callbacks(callbacks,
                                         model,
                                         do_validation=do_validation,
                                         epochs=epochs,
                                         steps_per_epoch=steps_per_epoch,
                                         verbose=verbose,
                                         count_mode='steps',
                                         mode=mode)

    # Calculate the steps each time on the device.
    if use_steps:
        steps_to_run = (
            [current_strategy.extended.steps_per_run] *
            (steps_per_epoch // current_strategy.extended.steps_per_run))
        if steps_per_epoch % current_strategy.extended.steps_per_run:
            steps_to_run.append(steps_per_epoch %
                                current_strategy.extended.steps_per_run)
        target_steps = len(steps_to_run)
    else:
        target_steps = np.inf

    callbacks._call_begin_hook(mode)
    for epoch in range(initial_epoch, epochs):
        distributed_training_utils._reset_metrics(model)
        callbacks.on_epoch_begin(epoch)
        epoch_logs = {}
        step_index = 0
        prev_step_count = None
        current_step = 0
        while current_step < target_steps:
            step_count = steps_to_run[current_step] if use_steps else 1
            batch_logs = {
                'batch': step_index,
                'size': 1,
                'num_steps': step_count
            }
            callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
            if prev_step_count is None or step_count != prev_step_count:
                steps_per_run.load(step_count, K.get_session())
                prev_step_count = step_count
            try:
                _, outputs = K.get_session().run([train_op, output_tensors])
            except errors.OutOfRangeError:
                if use_steps:
                    logging.warning(
                        'Your dataset iterator ran out of data; '
                        'interrupting training. Make sure that your dataset '
                        'can generate at least `steps_per_epoch * epochs` '
                        'batches (in this case, %d batches).' %
                        steps_per_epoch * epochs)
                else:
                    target_steps = current_step
                    logging.info(
                        'Dataset iterator ran out of data. Inferring the '
                        'value of `steps_per_epoch` as %s  .' % target_steps)
                    distributed_training_utils.initialize_iterator(
                        iterator, current_strategy)
                break

            batch_logs.update(outputs)
            callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
            step_index = step_index + step_count
            current_step += 1

            if callbacks.model.stop_training:
                break

        if (do_validation and training_utils.should_run_validation(
                validation_freq, epoch)):
            logging.info('Running validation at fit epoch: %s', epoch)

            if model._compile_distribution:
                # Since we create a new clone from the original model we need to copy
                # the weights back to the original model before we can run validation.
                distributed_training_utils._copy_weights_to_original_model(
                    model, ModeKeys.TRAIN)

            val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
                model,
                val_dataset,
                steps=validation_steps,
                verbose=verbose,
                callbacks=callbacks)
            if not isinstance(val_outs, list):
                val_outs = [val_outs]
            # Same labels assumed.
            for label, val_out in zip(out_labels, val_outs):
                epoch_logs['val_' + label] = val_out

        callbacks.on_epoch_end(epoch, epoch_logs)
        if callbacks.model.stop_training:
            break
    callbacks._call_end_hook(mode)

    if model._compile_distribution:
        # Copy the weights back from the replicated model to the original model.
        distributed_training_utils._copy_weights_to_original_model(
            model, ModeKeys.TRAIN)
    scope.__exit__(None, None, None)
    return model.history
Example #2
0
def _reinitialize_iterator(iterator, distribution_strategy=None):
  if distribution_strategy:
    distributed_training_utils.initialize_iterator(
        iterator, distribution_strategy)
  else:
    training_utils.initialize_iterator(iterator)
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
  """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      Returns `None`.

  Raises:
      ValueError: in case of invalid arguments.
  """
  mode = ModeKeys.TRAIN
  # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
  steps_per_epoch = training_utils.infer_steps_for_dataset(
      dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch')
  if (current_strategy.extended.steps_per_run != 1 and
      steps_per_epoch is None):
    raise ValueError('`steps_per_epoch` should be specified when calling '
                     '`fit` on the model with TPUStrategy when '
                     '`steps_per_run` != 1 .')

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=1)
  scope.__enter__()

  out_labels = model.metrics_names or []

  step_fn = _make_step_fn(model, ModeKeys.TRAIN, current_strategy, out_labels)

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name in model.metrics_names[1:]:
    tensor = model._all_stateful_metrics_tensors[name]
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  use_steps = steps_per_epoch is not None
  if use_steps:
    iteration_value = min(steps_per_epoch,
                          current_strategy.extended.steps_per_run)
  else:
    iteration_value = current_strategy.extended.steps_per_run

  steps_per_run = K.variable(
      value=iteration_value,
      dtype='int32',
      name='steps_per_run')
  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=steps_per_run,
      initial_loop_values=initial_loop_values)
  train_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  do_validation = bool(validation_steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      verbose=verbose,
      count_mode='steps',
      mode=mode)

  # Calculate the steps each time on the device.
  if use_steps:
    steps_to_run = ([current_strategy.extended.steps_per_run] *
                    (steps_per_epoch //
                     current_strategy.extended.steps_per_run))
    if steps_per_epoch % current_strategy.extended.steps_per_run:
      steps_to_run.append(
          steps_per_epoch % current_strategy.extended.steps_per_run)
    target_steps = len(steps_to_run)
  else:
    target_steps = np.inf

  callbacks._call_begin_hook(mode)
  for epoch in range(initial_epoch, epochs):
    distributed_training_utils._reset_metrics(model)
    callbacks.on_epoch_begin(epoch)
    epoch_logs = {}
    step_index = 0
    prev_step_count = None
    current_step = 0
    while current_step < target_steps:
      step_count = steps_to_run[current_step] if use_steps else 1
      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
      callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
      if prev_step_count is None or step_count != prev_step_count:
        steps_per_run.load(step_count, K.get_session())
        prev_step_count = step_count
      try:
        _, outputs = K.batch_get_value([train_op, output_tensors])
      except errors.OutOfRangeError:
        if use_steps:
          logging.warning('Your dataset iterator ran out of data; '
                          'interrupting training. Make sure that your dataset '
                          'can generate at least `steps_per_epoch * epochs` '
                          'batches (in this case, %d batches).' %
                          steps_per_epoch * epochs)
        else:
          target_steps = current_step
          logging.info('Dataset iterator ran out of data. Inferring the '
                       'value of `steps_per_epoch` as %s  .' % target_steps)
          distributed_training_utils.initialize_iterator(iterator,
                                                         current_strategy)
        break

      batch_logs.update(outputs)
      callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
      step_index = step_index + step_count
      current_step += 1

      if callbacks.model.stop_training:
        break

    if (do_validation and
        training_utils.should_run_validation(validation_freq, epoch)):
      logging.info('Running validation at fit epoch: %s', epoch)

      if model._compile_distribution:
        # Since we create a new clone from the original model we need to copy
        # the weights back to the original model before we can run validation.
        distributed_training_utils._copy_weights_to_original_model(
            model, ModeKeys.TRAIN)

      val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
          model,
          val_dataset,
          steps=validation_steps,
          verbose=verbose,
          callbacks=callbacks)
      if not isinstance(val_outs, list):
        val_outs = [val_outs]
      # Same labels assumed.
      for label, val_out in zip(out_labels, val_outs):
        epoch_logs['val_' + label] = val_out

    callbacks.on_epoch_end(epoch, epoch_logs)
    if callbacks.model.stop_training:
      break
  callbacks._call_end_hook(mode)

  if model._compile_distribution:
    # Copy the weights back from the replicated model to the original model.
    distributed_training_utils._copy_weights_to_original_model(
        model, ModeKeys.TRAIN)
  scope.__exit__(None, None, None)
  return model.history