Example #1
def experimental_tpu_predict_loop(model, dataset, verbose=0, steps=None):
    """Predict loop for predicting with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, verbosity mode (0 or 1).
      steps: Total number of steps (batches of samples)
          before declaring `_predict_loop` finished.
          Ignored with the default value of `None`.

  Returns:
      Array of predictions (if the model has a single output)
      or list of arrays of predictions
      (if the model has multiple outputs).
  """
    dataset_fully_shaped = (
        distributed_training_utils.is_dataset_shape_fully_defined(dataset))
    padding_handler = None
    if not dataset_fully_shaped:
        # TODO(hongjunchoi): Investigate whether operations from
        # PartialBatchPaddingHandler are unnecessarily pruned out
        # during graph optimization.
        padding_handler = padding_util.PartialBatchPaddingHandler(
            model._feed_output_shapes)
        batched_dataset = input_lib._get_batched_dataset(dataset)
        batch_size, _, prefetch_buffer = input_lib._get_batched_dataset_attributes(
            batched_dataset)
        padding_handler.padded_batch_size = batch_size
        padding_handler.padding_mask = dataset.reduce(
            padding_handler.padding_mask, padding_handler.update_mask)

        dataset = dataset.map(padding_handler.pad_batch)
        dataset = dataset.apply(batching.unbatch())
        # At this point, the dataset is guaranteed to have no partial
        # batches, so we can set `drop_remainder=True` to obtain static
        # shape information for the dataset elements.
        dataset = dataset.batch(batch_size, drop_remainder=True)

        if prefetch_buffer is not None:
            dataset = dataset.prefetch(prefetch_buffer)

    current_strategy = model._distribution_strategy
    iterator = distributed_training_utils.get_iterator(dataset,
                                                       current_strategy)

    scope = distributed_training_utils.distributed_scope(
        strategy=current_strategy, learning_phase=0)
    scope.__enter__()
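    # Note: the distribution scope is entered manually (rather than via a
    # `with` block) so that it stays active while the helpers below are
    # defined; it is exited explicitly once the loop finishes.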

    def _per_device_predict_function(model):
        model._make_predict_function()
        return (model.predict_function.inputs, model.predict_function.outputs,
                model.predict_function.updates_op,
                model.predict_function.session_kwargs)

    def step_fn(ctx, inputs):
        """Clones the model and calls make_predict_function."""
        if model._compile_distribution:
            distributed_training_utils.clone_model_on_replicas(
                model, current_strategy, ModeKeys.PREDICT, inputs=inputs)
        else:
            distributed_training_utils._build_distributed_network(
                model, current_strategy, ModeKeys.PREDICT, inputs)

        (grouped_inputs, grouped_outputs, grouped_updates, grouped_session_args
         ) = current_strategy.extended.call_for_each_replica(
             _per_device_predict_function,
             args=(model._distributed_model_predict, ))

        (all_inputs, all_outputs, all_updates,
         all_session_args) = distributed_training_utils.unwrap_values(
             current_strategy, grouped_inputs, grouped_outputs,
             grouped_updates, grouped_session_args)

        combined_fn = K.function(all_inputs,
                                 all_outputs,
                                 updates=all_updates,
                                 name='distributed_predict_function',
                                 **all_session_args)

        for label, output in zip(model.output_names, combined_fn.outputs):
            ctx.set_last_step_output(label, output)

        return combined_fn.updates_op

    # Add initial dummy values for outputs.
    initial_loop_values = {}
    batch_dimension = distributed_training_utils.get_batch_dimension(iterator)
    for name, tensor in zip(model.output_names, model.outputs):
        # TODO(priyag): This is a workaround as we do not know the batch dimension
        # of the model's output at this point.
        shape = tensor_shape.TensorShape(tensor.shape.dims)
        shape.dims = [batch_dimension] + shape.dims[1:]
        initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype)

    # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn,
        iterator,
        iterations=1,
        initial_loop_values=initial_loop_values)

    predict_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    if verbose == 1:
        progbar = Progbar(target=steps)

    if model._compile_distribution:
        distributed_training_utils._copy_weights_to_distributed_model(
            model, ModeKeys.PREDICT)

    distributed_training_utils._reset_metrics(model)

    assert steps is not None
    # Since we do not know how many samples we will see, we cannot pre-allocate
    # the returned Numpy arrays. Instead, we store one array per batch seen
    # and concatenate them upon returning.
    unconcatenated_outs = [[] for _ in model.outputs]
    for step in range(steps):
        _, batch_outs = K.get_session().run([predict_op, output_tensors])
        # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
        for i, label in enumerate(model.output_names):
            unconcatenated_outs[i].extend(batch_outs[label])
        if verbose >= 1:
            progbar.update(step + 1)

    scope.__exit__(None, None, None)

    if len(unconcatenated_outs) == 1:
        prediction_result = np.concatenate(unconcatenated_outs[0], axis=0)
    else:
        prediction_result = [
            np.concatenate(unconcatenated_outs[i], axis=0)
            for i in range(len(unconcatenated_outs))
        ]

    if padding_handler:
        prediction_result = padding_handler.apply_mask(prediction_result)

    return prediction_result
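
A minimal, self-contained sketch of the partial-batch padding idea used above, written against the modern public tf.data API rather than the internal helpers (`BATCH_SIZE` and `pad_batch` are illustrative stand-ins for the values that `PartialBatchPaddingHandler` derives from the model and dataset):

import tensorflow as tf

BATCH_SIZE = 4
dataset = tf.data.Dataset.range(10)  # 10 samples -> the final batch is partial
dataset = dataset.batch(BATCH_SIZE)  # batch shapes: [4], [4], [2]

def pad_batch(batch):
    # Zero-pad the batch dimension up to BATCH_SIZE so every batch has a
    # fully defined static shape.
    pad_amount = BATCH_SIZE - tf.shape(batch)[0]
    return tf.pad(batch, [[0, pad_amount]])

dataset = dataset.map(pad_batch)
dataset = dataset.unbatch()  # no partial batches remain at this point
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)  # static shapes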
Example #2
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
  """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets.
      epochs: Number of times to iterate over the data.
      verbose: Integer, verbosity mode (0, 1, or 2).
      callbacks: List of callbacks to be called during training.
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run).
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      The trained model's `History` object.

  Raises:
      ValueError: in case of invalid arguments.
  """
  mode = ModeKeys.TRAIN
  # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=1)
  scope.__enter__()

  def _per_device_fit_function(model):
    model._make_fit_function()
    return (model._fit_function.inputs, model._fit_function.outputs,
            model._fit_function.updates_op, model._fit_function.session_kwargs)

  out_labels = model.metrics_names or []

  def step_fn(ctx, inputs):
    """Clones the model and calls make_fit_function."""
    inputs, targets = inputs
    if model._compile_distribution:
      distributed_training_utils.clone_model_on_replicas(
          model, current_strategy, mode, inputs=inputs, targets=targets)
    else:
      distributed_training_utils._build_distributed_network(
          model, current_strategy, mode, inputs, targets)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_fit_function,
         args=(distributed_training_utils.get_distributed_model(
             model, ModeKeys.TRAIN),))
    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs,
         grouped_updates, grouped_session_args)
    combined_fn = K.function(
        all_inputs,
        all_outputs,
        updates=all_updates,
        name='distributed_fit_function',
        **all_session_args)

    for label, output in zip(out_labels, combined_fn.outputs):
      if label == 'loss':
        reduce_op = ds_reduce_util.ReduceOp.SUM
      else:
        # We reduce all other metrics using mean for now. This is a temporary
        # workaround until new metrics are in place.
        reduce_op = ds_reduce_util.ReduceOp.MEAN
      ctx.set_last_step_output(label, output, reduce_op)

    # TODO(priyag, sourabhbajaj): For now, we are ignoring the following from
    # the combined_fn: feed_dict, session kwargs, run options, and
    # run_metadata. These should be handled appropriately.
    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
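  # The 1e7 above is only an arbitrary placeholder for the loop's initial
  # loss output; real values replace it on the first step.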
  for name in model.metrics_names[1:]:
    tensor = model._all_stateful_metrics_tensors[name]
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  if steps_per_epoch is None:
    raise ValueError('`steps_per_epoch` should be specified when calling '
                     '`fit` on the model.')
  steps_per_run = K.variable(
      value=min(steps_per_epoch, current_strategy.extended.steps_per_run),
      dtype='int32',
      name='steps_per_run')

  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=steps_per_run,
      initial_loop_values=initial_loop_values)

  train_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  do_validation = bool(validation_steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      verbose=verbose,
      count_mode='steps',
      mode=mode)

  # Calculate the steps each time on the device.
  steps_to_run = [current_strategy.extended.steps_per_run] * (
      steps_per_epoch // current_strategy.extended.steps_per_run)
  if steps_per_epoch % current_strategy.extended.steps_per_run:
    steps_to_run.append(
        steps_per_epoch % current_strategy.extended.steps_per_run)

  callbacks._call_begin_hook(mode)
  for epoch in range(initial_epoch, epochs):
    distributed_training_utils._reset_metrics(model)
    callbacks.on_epoch_begin(epoch)
    epoch_logs = {}
    step_index = 0
    prev_step_count = None
    for step_count in steps_to_run:
      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
      callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
      if prev_step_count is None or step_count != prev_step_count:
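        # `steps_per_run` is a graph variable, so it is reloaded only when the
        # chunk size actually changes (typically just for the last, smaller
        # chunk of an epoch).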
        steps_per_run.load(step_count, K.get_session())
        prev_step_count = step_count
      try:
        _, outputs = K.get_session().run([train_op, output_tensors])
      except errors.OutOfRangeError:
        logging.warning('Your dataset iterator ran out of data; '
                        'interrupting training. Make sure that your dataset '
                        'can generate at least `steps_per_epoch * epochs` '
                        'batches (in this case, %d batches).' %
                        (steps_per_epoch * epochs))
        break

      batch_logs.update(outputs)
      callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
      step_index = step_index + step_count
      if callbacks.model.stop_training:
        break

    if (do_validation and
        training_utils.should_run_validation(validation_freq, epoch)):
      logging.info('Running validation at fit epoch: %s', epoch)

      if model._compile_distribution:
        # Since we create a new clone from the original model we need to copy
        # the weights back to the original model before we can run validation.
        distributed_training_utils._copy_weights_to_original_model(
            model, ModeKeys.TRAIN)

      val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
          model,
          val_dataset,
          steps=validation_steps,
          verbose=verbose,
          callbacks=callbacks)
      if not isinstance(val_outs, list):
        val_outs = [val_outs]
      # Same labels assumed.
      for label, val_out in zip(out_labels, val_outs):
        epoch_logs['val_' + label] = val_out

    callbacks.on_epoch_end(epoch, epoch_logs)
    if callbacks.model.stop_training:
      break
  callbacks._call_end_hook(mode)

  if model._compile_distribution:
    # Copy the weights back from the replicated model to the original model.
    distributed_training_utils._copy_weights_to_original_model(
        model, ModeKeys.TRAIN)
  scope.__exit__(None, None, None)
  return model.history
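
The `steps_to_run` computation above splits an epoch into device-side chunks of at most `steps_per_run` iterations, with one smaller trailing chunk if the division is not exact. A hedged pure-Python sketch of that chunking (the helper name `chunk_steps` is illustrative, not part of the TF API):

def chunk_steps(steps_per_epoch, steps_per_run):
    # An epoch of `steps_per_epoch` batches runs in chunks of at most
    # `steps_per_run` iterations each.
    chunks = [steps_per_run] * (steps_per_epoch // steps_per_run)
    if steps_per_epoch % steps_per_run:
        chunks.append(steps_per_epoch % steps_per_run)
    return chunks

assert chunk_steps(10, 4) == [4, 4, 2]
assert chunk_steps(8, 4) == [4, 4]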
Example #3
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
    """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets.
      epochs: Number of times to iterate over the data.
      verbose: Integer, verbosity mode (0, 1, or 2).
      callbacks: List of callbacks to be called during training.
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run).
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      The trained model's `History` object.

  Raises:
      ValueError: in case of invalid arguments.
  """
    # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
    current_strategy = model._distribution_strategy
    iterator = distributed_training_utils.get_iterator(dataset,
                                                       current_strategy)

    scope = distributed_training_utils.distributed_scope(
        strategy=current_strategy, learning_phase=1)
    scope.__enter__()

    def _per_device_fit_function(model):
        model._make_fit_function()
        return (model._fit_function.inputs, model._fit_function.outputs,
                model._fit_function.updates_op,
                model._fit_function.session_kwargs)

    out_labels = model.metrics_names or []

    def step_fn(ctx, inputs):
        """Clones the model and calls make_fit_function."""
        inputs, targets = inputs
        if model._compile_distribution:
            distributed_training_utils.clone_model_on_replicas(
                model,
                current_strategy,
                ModeKeys.TRAIN,
                inputs=inputs,
                targets=targets)
        else:
            distributed_training_utils._build_distributed_network(
                model, current_strategy, ModeKeys.TRAIN, inputs, targets)

        (grouped_inputs, grouped_outputs, grouped_updates, grouped_session_args
         ) = current_strategy.extended.call_for_each_replica(
             _per_device_fit_function, args=(model._distributed_model_train, ))
        (all_inputs, all_outputs, all_updates,
         all_session_args) = distributed_training_utils.unwrap_values(
             current_strategy, grouped_inputs, grouped_outputs,
             grouped_updates, grouped_session_args)
        combined_fn = K.function(all_inputs,
                                 all_outputs,
                                 updates=all_updates,
                                 name='distributed_fit_function',
                                 **all_session_args)

        for label, output in zip(out_labels, combined_fn.outputs):
            if label == 'loss':
                reduce_op = distribute_lib.get_loss_reduction()
            else:
                # We reduce all other metrics using mean for now. This is a
                # temporary workaround until new metrics are in place.
                reduce_op = ds_reduce_util.ReduceOp.MEAN
            ctx.set_last_step_output(label, output, reduce_op)

        # TODO(priyag, sourabhbajaj): For now, we are ignoring the following
        # from the combined_fn: feed_dict, session kwargs, run options, and
        # run_metadata. These should be handled appropriately.
        return combined_fn.updates_op

    # Add initial dummy values for loss and other metric tensors.
    initial_loop_values = {}
    initial_loop_values['loss'] = constant_op.constant(1e7)
    for name in model.metrics_names[1:]:
        tensor = model._all_stateful_metrics_tensors[name]
        initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

    if steps_per_epoch is None:
        raise ValueError('`steps_per_epoch` should be specified when calling '
                         '`fit` on the model.')
    steps_per_run = K.variable(
        value=min(steps_per_epoch, current_strategy.extended.steps_per_run),
        dtype='int32',
        name='steps_per_run')

    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn,
        iterator,
        iterations=steps_per_run,
        initial_loop_values=initial_loop_values)

    train_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    do_validation = bool(validation_steps)

    if model._compile_distribution:
        distributed_training_utils._copy_weights_to_distributed_model(
            model, ModeKeys.TRAIN)

    callbacks = cbks.configure_callbacks(callbacks,
                                         model,
                                         do_validation=do_validation,
                                         epochs=epochs,
                                         steps_per_epoch=steps_per_epoch,
                                         verbose=verbose)

    # Calculate the steps each time on the device.
    steps_to_run = [current_strategy.extended.steps_per_run] * (
        steps_per_epoch // current_strategy.extended.steps_per_run)
    if steps_per_epoch % current_strategy.extended.steps_per_run:
        steps_to_run.append(steps_per_epoch %
                            current_strategy.extended.steps_per_run)

    callbacks.on_train_begin()
    for epoch in range(initial_epoch, epochs):
        distributed_training_utils._reset_metrics(model)
        callbacks.on_epoch_begin(epoch)
        epoch_logs = {}
        step_index = 0
        prev_step_count = None
        for step_count in steps_to_run:
            batch_logs = {
                'batch': step_index,
                'size': 1,
                'num_steps': step_count
            }
            callbacks.on_batch_begin(step_index, batch_logs)
            if prev_step_count is None or step_count != prev_step_count:
                steps_per_run.load(step_count, K.get_session())
                prev_step_count = step_count
            try:
                _, outputs = K.get_session().run([train_op, output_tensors])
            except errors.OutOfRangeError:
                logging.warning(
                    'Your dataset iterator ran out of data; '
                    'interrupting training. Make sure that your dataset '
                    'can generate at least `steps_per_epoch * epochs` '
                    'batches (in this case, %d batches).' %
                    (steps_per_epoch * epochs))
                break

            batch_logs.update(outputs)
            callbacks.on_batch_end(step_index, batch_logs)
            step_index = step_index + step_count
            if callbacks.model.stop_training:
                break

        if (do_validation and training_utils.should_run_validation(
                validation_freq, epoch)):
            logging.info('Running validation at fit epoch: %s', epoch)

            if model._compile_distribution:
                # Since we create a new clone from the original model we need to copy
                # the weights back to the original model before we can run validation.
                distributed_training_utils._copy_weights_to_original_model(
                    model, ModeKeys.TRAIN)

            val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
                model,
                val_dataset,
                steps=validation_steps,
                verbose=verbose)
            if not isinstance(val_outs, list):
                val_outs = [val_outs]
            # Same labels assumed.
            for label, val_out in zip(out_labels, val_outs):
                epoch_logs['val_' + label] = val_out

        callbacks.on_epoch_end(epoch, epoch_logs)
        if callbacks.model.stop_training:
            break
    callbacks.on_train_end()

    if model._compile_distribution:
        # Copy the weights back from the replicated model to the original model.
        distributed_training_utils._copy_weights_to_original_model(
            model, ModeKeys.TRAIN)
    scope.__exit__(None, None, None)
    return model.history
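
The `validation_freq` argument documented above accepts either an integer period or a container of explicit epochs. A small sketch of the semantics that `training_utils.should_run_validation` is assumed to implement (internal epochs are 0-indexed, while the docstring speaks in 1-indexed epochs):

def should_run_validation(validation_freq, epoch):
    one_indexed_epoch = epoch + 1
    if isinstance(validation_freq, int):
        return one_indexed_epoch % validation_freq == 0
    return one_indexed_epoch in validation_freq

assert should_run_validation(2, epoch=1)           # runs after the 2nd epoch
assert not should_run_validation(2, epoch=2)       # skipped after the 3rd
assert should_run_validation([1, 2, 10], epoch=0)  # explicit epoch list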
Example #4
def experimental_tpu_test_loop(model, dataset, verbose=0, steps=None):
    """Test loop for evaluating with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, verbosity mode (0 or 1).
      steps: Total number of steps (batches of samples)
          before declaring evaluation finished.
          Ignored with the default value of `None`.

  Returns:
      Scalar loss (if the model has a single output and no metrics)
      or list of scalars (if the model has multiple outputs
      and/or metrics). The attribute `model.metrics_names` will give you
      the display labels for the outputs.
  """
    current_strategy = model._distribution_strategy
    iterator = distributed_training_utils.get_iterator(dataset,
                                                       current_strategy)
    scope = distributed_training_utils.distributed_scope(
        strategy=current_strategy, learning_phase=0)
    scope.__enter__()

    def _per_device_eval_function(model):
        model._make_eval_function()
        return (model._eval_function.inputs, model._eval_function.outputs,
                model._eval_function.updates_op,
                model._eval_function.session_kwargs)

    def step_fn(ctx, inputs):
        """Clones the model and calls make_eval_function."""
        inputs, targets = inputs
        if model._compile_distribution:
            distributed_training_utils.clone_model_on_replicas(
                model,
                current_strategy,
                mode=ModeKeys.TEST,
                inputs=inputs,
                targets=targets)
        else:
            distributed_training_utils._build_distributed_network(
                model, current_strategy, ModeKeys.TEST, inputs, targets)

        (grouped_inputs, grouped_outputs, grouped_updates, grouped_session_args
         ) = current_strategy.extended.call_for_each_replica(
             _per_device_eval_function, args=(model._distributed_model_test, ))

        (all_inputs, all_outputs, all_updates,
         all_session_args) = distributed_training_utils.unwrap_values(
             current_strategy, grouped_inputs, grouped_outputs,
             grouped_updates, grouped_session_args)

        combined_fn = K.function(all_inputs,
                                 all_outputs,
                                 updates=all_updates,
                                 name='distributed_test_function',
                                 **all_session_args)

        for label, output in zip(model.metrics_names, combined_fn.outputs):
            if label == 'loss':
                reduce_op = distribute_lib.get_loss_reduction()
            else:
                # We reduce all other metrics using mean for now. This is a
                # temporary workaround until new metrics are in place.
                reduce_op = ds_reduce_util.ReduceOp.MEAN
            ctx.set_last_step_output(label, output, reduce_op)

        return combined_fn.updates_op

    # Add initial dummy values for loss and other metric tensors.
    initial_loop_values = {}
    initial_loop_values['loss'] = constant_op.constant(1e7)
    for name in model.metrics_names[1:]:
        tensor = model._all_stateful_metrics_tensors[name]
        initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

    # TODO(priyag): Use steps_per_run when we use new metrics as they will
    # allow handling metric computation at each step using variables.
    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn,
        iterator,
        iterations=1,
        initial_loop_values=initial_loop_values)

    test_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    if verbose == 1:
        progbar = Progbar(target=steps)

    if model._compile_distribution:
        distributed_training_utils._copy_weights_to_distributed_model(
            model, ModeKeys.TEST)

    distributed_training_utils._reset_metrics(model)

    assert steps is not None
    outs = [0.] * len(model.metrics_names)
    for step in range(steps):
        _, batch_outs = K.get_session().run([test_op, output_tensors])
        for i, label in enumerate(model.metrics_names):
            if i == 0:
                # Loss is a stateless metric; accumulate it across batches.
                outs[i] += batch_outs[label]
            else:
                # For all stateful metrics, the aggregation is handled by mirrored vars.
                outs[i] = batch_outs[label]

        if verbose >= 1:
            progbar.update(step + 1)

    scope.__exit__(None, None, None)
    if outs:
        outs[0] /= steps

    if len(outs) == 1:
        return outs[0]
    return outs
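
A toy illustration of the aggregation scheme in the loop above: the stateless loss is summed per batch and averaged at the end, while stateful metrics already aggregate internally, so only their latest value is kept (the batch values below are made up):

metrics_names = ['loss', 'acc']
batch_outputs = [{'loss': 0.9, 'acc': 0.50},
                 {'loss': 0.7, 'acc': 0.55},
                 {'loss': 0.5, 'acc': 0.60}]

outs = [0.] * len(metrics_names)
for batch_outs in batch_outputs:
    outs[0] += batch_outs['loss']  # stateless loss: accumulate per batch
    outs[1] = batch_outs['acc']    # stateful metric: keep the latest value
outs[0] /= len(batch_outputs)      # mean loss over the steps that ran

assert abs(outs[0] - 0.7) < 1e-9 and outs[1] == 0.60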
Example #5
def experimental_tpu_test_loop(model,
                               dataset,
                               verbose=0,
                               steps=None,
                               callbacks=None):
    """Test loop for evaluating with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, verbosity mode (0 or 1).
      steps: Total number of steps (batches of samples)
          before declaring evaluation finished.
          Ignored with the default value of `None`.
      callbacks: List of callbacks to be called during evaluation.

  Returns:
      Scalar loss (if the model has a single output and no metrics)
      or list of scalars (if the model has multiple outputs
      and/or metrics). The attribute `model.metrics_names` will give you
      the display labels for the outputs.
  """
    mode = ModeKeys.TEST
    current_strategy = model._distribution_strategy
    iterator = distributed_training_utils.get_iterator(dataset,
                                                       current_strategy)
    steps = training_utils.infer_steps_for_dataset(dataset,
                                                   steps,
                                                   steps_name='steps')

    scope = distributed_training_utils.distributed_scope(
        strategy=current_strategy, learning_phase=0)
    scope.__enter__()

    def _per_device_eval_function(model):
        model._make_eval_function()
        return (model._eval_function.inputs, model._eval_function.outputs,
                model._eval_function.updates_op,
                model._eval_function.session_kwargs)

    def step_fn(ctx, inputs):
        """Clones the model and calls make_eval_function."""
        inputs, targets = inputs
        if model._compile_distribution:
            distributed_training_utils.clone_model_on_replicas(
                model,
                current_strategy,
                mode=mode,
                inputs=inputs,
                targets=targets)
        else:
            distributed_training_utils._build_distributed_network(
                model, current_strategy, mode, inputs, targets)

        (grouped_inputs, grouped_outputs, grouped_updates, grouped_session_args
         ) = current_strategy.extended.call_for_each_replica(
             _per_device_eval_function,
             args=(distributed_training_utils.get_distributed_model(
                 model, ModeKeys.TEST), ))

        (all_inputs, all_outputs, all_updates,
         all_session_args) = distributed_training_utils.unwrap_values(
             current_strategy, grouped_inputs, grouped_outputs,
             grouped_updates, grouped_session_args)

        combined_fn = K.function(all_inputs,
                                 all_outputs,
                                 updates=all_updates,
                                 name='distributed_test_function',
                                 **all_session_args)

        for label, output in zip(model.metrics_names, combined_fn.outputs):
            if label == 'loss':
                reduce_op = ds_reduce_util.ReduceOp.SUM
            else:
                # We reduce all other metrics using mean for now. This is a
                # temporary workaround until new metrics are in place.
                reduce_op = ds_reduce_util.ReduceOp.MEAN
            ctx.set_last_step_output(label, output, reduce_op)

        return combined_fn.updates_op

    # Add initial dummy values for loss and other metric tensors.
    initial_loop_values = {}
    initial_loop_values['loss'] = constant_op.constant(1e7)
    for name in model.metrics_names[1:]:
        tensor = model._all_stateful_metrics_tensors[name]
        initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

    # TODO(priyag): Use steps_per_run when we use new metrics as they will
    # allow handling metric computation at each step using variables.
    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn,
        iterator,
        iterations=1,
        initial_loop_values=initial_loop_values)

    test_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    if verbose == 1:
        progbar = Progbar(target=steps)

    if model._compile_distribution:
        distributed_training_utils._copy_weights_to_distributed_model(
            model, mode)

    distributed_training_utils._reset_metrics(model)

    callbacks = cbks.configure_callbacks(callbacks,
                                         model,
                                         do_validation=False,
                                         epochs=1,
                                         steps_per_epoch=steps,
                                         verbose=verbose,
                                         count_mode='steps',
                                         mode=ModeKeys.TEST)
    callbacks._call_begin_hook(mode)

    outs = [0.] * len(model.metrics_names)
    if steps is not None:
        target_steps = steps
    else:
        target_steps = np.inf

    current_step = 0
    while current_step < target_steps:
        batch_logs = {'batch': current_step, 'size': 1}
        callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs)
        try:
            _, batch_outs = K.get_session().run([test_op, output_tensors])
        except errors.OutOfRangeError:
            if steps is not None:
                warning_msg = ('Make sure that your dataset can generate at '
                               'least `steps` batches (in this case, {} '
                               'batches).'.format(steps))
            else:
                warning_msg = 'Number of steps ran: {} steps'.format(
                    current_step)

            logging.warning('Your dataset iterator ran out of data; '
                            'interrupting evaluation. ' + warning_msg)
            target_steps = current_step
            break
        for i, label in enumerate(model.metrics_names):
            if i == 0:
                # Loss is a stateless metric; accumulate it across batches.
                outs[i] += batch_outs[label]
            else:
                # For all stateful metrics, the aggregation is handled by mirrored vars.
                outs[i] = batch_outs[label]

        batch_logs = cbks.make_logs(model, batch_logs, outs, mode)
        callbacks._call_batch_hook(mode, 'end', current_step, batch_logs)
        if verbose >= 1:
            progbar.update(current_step + 1)
        current_step += 1

    callbacks._call_end_hook(mode)

    scope.__exit__(None, None, None)
    if outs:
        outs[0] /= target_steps

    if len(outs) == 1:
        return outs[0]
    return outs
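
The `target_steps = np.inf` pattern above lets a single loop body handle both known and unknown dataset cardinality. A framework-free sketch of that control flow, with `StopIteration` standing in for `errors.OutOfRangeError`:

import numpy as np

def run_until_exhausted(batches, steps=None):
    target_steps = steps if steps is not None else np.inf
    iterator = iter(batches)
    current_step = 0
    while current_step < target_steps:
        try:
            next(iterator)  # stands in for K.get_session().run(...)
        except StopIteration:
            break
        current_step += 1
    return current_step

assert run_until_exhausted(range(5)) == 5           # unknown cardinality
assert run_until_exhausted(range(5), steps=3) == 3  # explicit step budget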
Example #6
def model_iteration(model,
                    inputs,
                    targets=None,
                    sample_weights=None,
                    batch_size=None,
                    epochs=1,
                    verbose=1,
                    callbacks=None,
                    val_inputs=None,
                    val_targets=None,
                    val_sample_weights=None,
                    shuffle=True,
                    initial_epoch=0,
                    steps_per_epoch=None,
                    validation_steps=None,
                    validation_freq=1,
                    mode=ModeKeys.TRAIN,
                    validation_in_fit=False,
                    prepared_feed_values_from_dataset=False,
                    steps_name='steps',
                    **kwargs):
  """Loop function for arrays of data with modes TRAIN/TEST/PREDICT.

  Arguments:
      model: Keras Model instance.
      inputs: Either a list or dictionary of arrays, or a dataset instance.
      targets: List/dictionary of target arrays.
      sample_weights: Optional list of sample weight arrays.
      batch_size: Integer batch size or None if unknown.
      epochs: Number of times to iterate over the data.
      verbose: Verbosity mode (0, 1, or 2).
      callbacks: List of callbacks to be called during training.
      val_inputs: Either a list or dictionary of arrays, or a dataset instance.
      val_targets: List/dictionary of target arrays.
      val_sample_weights: Optional list of sample weight arrays.
      shuffle: Whether to shuffle the data at the beginning of each epoch.
      initial_epoch: Epoch at which to start training (useful for resuming a
        previous training run).
      steps_per_epoch: Total number of steps (batches of samples) before
        declaring one epoch finished and starting the next epoch. Ignored with
        the default value of `None`.
      validation_steps: Number of steps to run validation for (only if doing
        validation from data tensors). Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
        `collections.Container` instance (e.g. list, tuple, etc.). If an
        integer, specifies how many training epochs to run before a new
        validation run is performed, e.g. `validation_freq=2` runs
        validation every 2 epochs. If a Container, specifies the epochs on
        which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
        validation at the end of the 1st, 2nd, and 10th epochs.
      mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
      validation_in_fit: if true, then this method is invoked from within
        training iteration (for validation). In the case where `val_inputs` is
        a dataset, this flag indicates that its iterator and feed values are
        already created, so they should be reused rather than recreated.
      prepared_feed_values_from_dataset: if True, `inputs` is a list of feed
        tensors returned from `_prepare_feed_values` call on the validation
        dataset, so do not call it again on `inputs`. Should only be used for
        inline validation (i.e., only if `validation_in_fit` is also True).
      steps_name: The string name of the steps argument, either `steps`,
        `validation_steps`, or `steps_per_epoch`. Only used for error message
        formatting.
      **kwargs: Additional arguments for backwards compatibility.

  Returns:
      - In TRAIN mode: `History` object.
      - In TEST mode: Evaluation metrics.
      - In PREDICT mode: Outputs of the Model called on inputs.

  Raises:
      ValueError: in case of invalid arguments.
  """
  # Backwards compatibility.
  if 'steps' in kwargs:
    steps_per_epoch = kwargs.pop('steps')
  if kwargs:
    raise TypeError('Unknown arguments: %s' % (kwargs,))

  # In case we were passed a dataset, we extract symbolic tensors from it.
  reset_dataset_after_each_epoch = False
  input_iterator = None
  is_dataset = isinstance(inputs,
                          (dataset_ops.DatasetV1, dataset_ops.DatasetV2))
  # TODO(fchollet): consider moving `steps_per_epoch` inference to
  # _standardize_user_data and set reset_dataset_after_each_epoch as an
  # attribute on the dataset instance.
  if is_dataset:
    if steps_per_epoch is None:
      reset_dataset_after_each_epoch = True
      steps_per_epoch = training_utils.infer_steps_for_dataset(
          inputs, steps_per_epoch, epochs=epochs, steps_name=steps_name)
    input_iterator = _get_iterator(inputs, model._distribution_strategy)

  if mode == ModeKeys.TRAIN:
    _print_train_info(inputs, val_inputs, steps_per_epoch, verbose)

  # Enter DistributionStrategy scope.
  if model._distribution_strategy:
    scope = distributed_training_utils.distributed_scope(
        strategy=model._distribution_strategy,
        learning_phase=(1 if mode == ModeKeys.TRAIN else 0))
    scope.__enter__()

  # Get step function and loop type.
  f = _make_execution_function(model, mode)
  use_steps = is_dataset or steps_per_epoch is not None
  do_validation = val_inputs is not None

  # Convert Eager Tensors to NumPy arrays to support batching/shuffling.
  inputs, targets, sample_weights = training_utils.convert_eager_tensors_to_numpy(
      (inputs, targets, sample_weights))

  # Prepare input data.
  inputs = input_iterator or inputs
  if validation_in_fit and prepared_feed_values_from_dataset:
    # When invoking validation inside the training loop, avoid creating the
    # iterator and the list of feed values for the same validation dataset
    # multiple times (each recreation calls `iterator.get_next()`, which slows
    # down execution and can eventually lead to OOM errors).
    ins = inputs
  else:
    ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode)
  if not is_dataset:
    num_samples_or_steps = _get_num_samples_or_steps(ins, batch_size,
                                                     steps_per_epoch)
  else:
    num_samples_or_steps = steps_per_epoch

  # Prepare validation data. Hold references to the iterator and the input list
  # to properly reinitialize and reuse in multiple validation passes.
  val_iterator = None
  if isinstance(val_inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)):
    if validation_steps is None:
      # Because we pass an iterator feed instead of a Dataset to the eval
      # model_iteration() call, it will not trigger the dataset-input path
      # that determines the number of steps required. To avoid this issue,
      # set validation_steps here if validation_steps is None.
      validation_steps = training_utils.infer_steps_for_dataset(
          val_inputs,
          validation_steps,
          epochs=epochs,
          steps_name='validation_steps')
    val_iterator = _get_iterator(val_inputs, model._distribution_strategy)
    val_inputs = _prepare_feed_values(
        model, val_iterator, val_targets, val_sample_weights, ModeKeys.TEST)

  # Configure callbacks.
  count_mode = 'steps' if use_steps else 'samples'
  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      batch_size=batch_size,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      samples=num_samples_or_steps,
      verbose=0,  # Handle ProgBarLogger separately in this loop.
      mode=mode)
  # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready.
  progbar = training_utils.get_progbar(model, count_mode)
  progbar.params = callbacks.params
  progbar.params['verbose'] = verbose

  # Find beforehand arrays that need sparse-to-dense conversion.
  if issparse is not None and not use_steps:
    indices_for_conversion_to_dense = []
    feed = _get_model_feed(model, mode)
    for i, (input_data, feed_tensor) in enumerate(zip(ins, feed)):
      if issparse(input_data) and not K.is_sparse(feed_tensor):
        indices_for_conversion_to_dense.append(i)

  # Select aggregation method.
  if mode == ModeKeys.PREDICT:
    aggregator = training_utils.OutputsAggregator(use_steps,
                                                  num_samples_or_steps)
  else:
    aggregator = training_utils.MetricsAggregator(use_steps,
                                                  num_samples_or_steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  callbacks.model.stop_training = False
  callbacks._call_begin_hook(mode)
  progbar.on_train_begin()

  for epoch in range(initial_epoch, epochs):
    if callbacks.model.stop_training:
      break

    # Setup work for each epoch
    epoch_logs = {}
    model.reset_metrics()
    if mode == ModeKeys.TRAIN:
      callbacks.on_epoch_begin(epoch, epoch_logs)
    progbar.on_epoch_begin(epoch, epoch_logs)

    if use_steps:
      # Step-wise loop.
      if steps_per_epoch is None:
        # Loop over dataset until `OutOfRangeError` is raised.
        target_steps = np.inf
      else:
        # Loop over dataset for the specified number of steps.
        target_steps = steps_per_epoch

      step = 0
      while step < target_steps:
        batch_logs = {'batch': step, 'size': 1}
        callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
        progbar.on_batch_begin(step, batch_logs)

        # Get outputs.
        try:
          # `ins` can be callable in DistributionStrategy + eager case.
          actual_inputs = ins() if callable(ins) else ins
          batch_outs = f(actual_inputs)
        except errors.OutOfRangeError:
          if is_dataset:
            # The dataset passed by the user ran out of batches.
            # Now we know the cardinality of the dataset.
            # If steps_per_epoch was specified, then running out of data is
            # unexpected, so we stop training and inform the user.
            if steps_per_epoch:
              callbacks.model.stop_training = True
              logging.warning(
                  'Your dataset ran out of data; interrupting training. '
                  'Make sure that your dataset can generate at least '
                  '`%s * epochs` batches (in this case, %d batches). '
                  'You may need to use the repeat() function when '
                  'building your dataset.'
                  % (steps_name, steps_per_epoch * epochs))
            elif step > 0:
              steps_per_epoch = step
              aggregator.num_samples_or_steps = steps_per_epoch
              if mode == ModeKeys.TRAIN:
                progbar.params['steps'] = steps_per_epoch
                progbar.progbar.target = steps_per_epoch
          else:
            # We ran out of batches while the user passed an iterator (legacy).
            callbacks.model.stop_training = True
            logging.warning(
                'Your dataset iterator ran out of data; '
                'interrupting training. Make sure that your iterator '
                'can generate at least `%s * epochs` '
                'batches (in this case, %d batches). You may need to '
                'use the repeat() function when building your '
                'dataset.' % (steps_name, steps_per_epoch * epochs))
          break

        if not isinstance(batch_outs, list):
          batch_outs = [batch_outs]

        if model._distribution_strategy:
          batch_outs = distributed_training_utils._per_device_aggregate_batch(
              batch_outs, model, mode)

        # Aggregate results.
        if step == 0:
          aggregator.create(batch_outs)
        aggregator.aggregate(batch_outs)

        # Callbacks batch end.
        batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
        callbacks._call_batch_hook(mode, 'end', step, batch_logs)
        progbar.on_batch_end(step, batch_logs)
        step += 1

        if callbacks.model.stop_training:
          break
    else:
      # Sample-wise loop.
      index_array = np.arange(num_samples_or_steps)
      if shuffle == 'batch':
        index_array = training_utils.batch_shuffle(index_array, batch_size)
      elif shuffle:
        np.random.shuffle(index_array)
      batches = make_batches(num_samples_or_steps, batch_size)

      for batch_index, (batch_start, batch_end) in enumerate(batches):
        batch_ids = index_array[batch_start:batch_end]

        # Slice into a batch.
        try:
          if ins and isinstance(ins[-1], int):
            # Do not slice the training phase flag.
            ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
          else:
            ins_batch = slice_arrays(ins, batch_ids)
        except TypeError:
          raise TypeError('TypeError while preparing batch. '
                          'If using HDF5 input data, '
                          'pass shuffle="batch".')

        # Sparse to dense conversion.
        if issparse is not None:
          for i in indices_for_conversion_to_dense:
            ins_batch[i] = ins_batch[i].toarray()

        # Callbacks batch_begin.
        batch_logs = {'batch': batch_index, 'size': len(batch_ids)}
        callbacks._call_batch_hook(mode, 'begin', batch_index, batch_logs)
        progbar.on_batch_begin(batch_index, batch_logs)

        # Get outputs.
        batch_outs = f(ins_batch)
        if not isinstance(batch_outs, list):
          batch_outs = [batch_outs]

        # Aggregate results.
        if batch_index == 0:
          aggregator.create(batch_outs)
        aggregator.aggregate(batch_outs, batch_start, batch_end)

        # Callbacks batch end.
        batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
        callbacks._call_batch_hook(mode, 'end', batch_index, batch_logs)
        progbar.on_batch_end(batch_index, batch_logs)

        if callbacks.model.stop_training:
          break

    aggregator.finalize()
    results = aggregator.results
    epoch_logs = cbks.make_logs(model, epoch_logs, results, mode)
    if len(results) == 1:
      results = results[0]

    # Run the test loop every `validation_freq` epochs during training.
    if (do_validation and
        training_utils.should_run_validation(validation_freq, epoch) and
        not callbacks.model.stop_training):

      if model._compile_distribution:
        # Since we create a new clone from the original model we need to copy
        # the weights back to the original model before we can run validation.
        distributed_training_utils._copy_weights_to_original_model(
            model, ModeKeys.TRAIN)

      val_results = model_iteration(
          model,
          val_inputs,
          targets=val_targets,
          sample_weights=val_sample_weights,
          batch_size=batch_size,
          steps_per_epoch=validation_steps,
          callbacks=callbacks,
          verbose=0,
          mode=ModeKeys.TEST,
          validation_in_fit=True,
          prepared_feed_values_from_dataset=(val_iterator is not None),
          steps_name='validation_steps')
      if not isinstance(val_results, list):
        val_results = [val_results]
      epoch_logs = cbks.make_logs(
          model, epoch_logs, val_results, mode, prefix='val_')
      if val_iterator and epoch < epochs - 1:
        _reinitialize_iterator(val_iterator, model._distribution_strategy)

    if mode == ModeKeys.TRAIN:
      # Epochs only apply to `fit`.
      callbacks.on_epoch_end(epoch, epoch_logs)
    progbar.on_epoch_end(epoch, epoch_logs)

    # Reinitialize dataset iterator for the next epoch.
    if reset_dataset_after_each_epoch and epoch < epochs - 1:
      _reinitialize_iterator(input_iterator, model._distribution_strategy)

  callbacks._call_end_hook(mode)

  if model._distribution_strategy:
    if model._compile_distribution:
      # TODO(priyag, psv): Copy back metrics to the original model as well?
      distributed_training_utils._copy_weights_to_original_model(model, mode)
    scope.__exit__(None, None, None)

  if mode == ModeKeys.TRAIN:
    return model.history
  return results
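
A sketch of the sample-wise branch above: indices are (optionally) shuffled, cut into `(start, end)` pairs, and used to slice each batch. The `make_batches` helper below mirrors the documented behavior of the Keras utility of the same name:

import numpy as np

def make_batches(size, batch_size):
    # Return (start, end) index pairs covering `size` samples.
    num_batches = (size + batch_size - 1) // batch_size
    return [(i * batch_size, min(size, (i + 1) * batch_size))
            for i in range(num_batches)]

index_array = np.arange(10)
np.random.shuffle(index_array)  # the `shuffle=True` path
for batch_start, batch_end in make_batches(10, batch_size=4):
    batch_ids = index_array[batch_start:batch_end]
    # ... slice `ins` with `batch_ids` and call the execution function ...

assert make_batches(10, 4) == [(0, 4), (4, 8), (8, 10)]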
Example #7
def experimental_tpu_predict_loop(model,
                                  dataset,
                                  verbose=0,
                                  steps=None,
                                  callbacks=None):
    """Predict loop for predicting with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, verbosity mode (0 or 1).
      steps: Total number of steps (batches of samples)
          before declaring `_predict_loop` finished.
          Ignored with the default value of `None`.
      callbacks: List of callbacks to be called during prediction.

  Returns:
      Array of predictions (if the model has a single output)
      or list of arrays of predictions
      (if the model has multiple outputs).
  """
    mode = ModeKeys.PREDICT
    steps = training_utils.infer_steps_for_dataset(dataset,
                                                   steps,
                                                   steps_name='steps')
    dataset_fully_shaped = (
        distributed_training_utils.is_dataset_shape_fully_defined(dataset))
    padding_handler = None
    if not dataset_fully_shaped:
        # TODO(hongjunchoi): Investigate whether operations from
        # PartialBatchPaddingHandler are unnecessarily pruned out
        # during graph optimization.
        padding_handler = padding_util.PartialBatchPaddingHandler(
            model._feed_output_shapes)
        batch_size, _, prefetch_buffer = input_lib._get_dataset_attributes(
            dataset)
        padding_handler.padded_batch_size = batch_size
        padding_handler.padding_mask = dataset.reduce(
            padding_handler.padding_mask, padding_handler.update_mask)

        dataset = dataset.map(padding_handler.pad_batch)
        dataset = dataset.apply(batching.unbatch())
        # At this point, the dataset is guaranteed to have no partial
        # batches, so we can set `drop_remainder=True` to obtain static
        # shape information for the dataset elements.
        dataset = dataset.batch(batch_size, drop_remainder=True)

        if prefetch_buffer is not None:
            dataset = dataset.prefetch(prefetch_buffer)

    current_strategy = model._distribution_strategy
    iterator = distributed_training_utils.get_iterator(dataset,
                                                       current_strategy)

    scope = distributed_training_utils.distributed_scope(
        strategy=current_strategy, learning_phase=0)
    scope.__enter__()

    out_labels = model.output_names
    step_fn = _make_step_fn(model, ModeKeys.PREDICT, current_strategy,
                            out_labels)

    # Add initial dummy values for outputs.
    initial_loop_values = {}
    batch_dimension = distributed_training_utils.get_batch_dimension(iterator)
    for name, tensor in zip(model.output_names, model.outputs):
        # TODO(priyag): This is a workaround as we do not know the batch dimension
        # of the model's output at this point.
        shape = tensor_shape.TensorShape(tensor.shape.dims)
        shape.dims = [batch_dimension] + shape.dims[1:]
        initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype)

    # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn,
        iterator,
        iterations=1,
        initial_loop_values=initial_loop_values)

    predict_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    if verbose == 1:
        progbar = Progbar(target=steps)

    if model._compile_distribution:
        distributed_training_utils._copy_weights_to_distributed_model(
            model, mode)

    distributed_training_utils._reset_metrics(model)

    callbacks = cbks.configure_callbacks(callbacks,
                                         model,
                                         do_validation=False,
                                         epochs=1,
                                         steps_per_epoch=steps,
                                         verbose=verbose,
                                         count_mode='steps',
                                         mode=mode)
    callbacks._call_begin_hook(mode)

    # Since we do not know how many samples we will see, we cannot pre-allocate
    # the returned Numpy arrays. Instead, we store one array per batch seen
    # and concatenate them upon returning.
    unconcatenated_outs = [[] for _ in model.outputs]
    if steps is not None:
        target_steps = steps
    else:
        target_steps = np.inf

    current_step = 0
    while current_step < target_steps:
        batch_logs = {'batch': current_step, 'size': 1}
        callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs)
        try:
            _, batch_outs = K.batch_get_value([predict_op, output_tensors])
        except errors.OutOfRangeError:
            if steps is not None:
                warning_msg = ('Make sure that your dataset can generate at '
                               'least `steps` batches (in this case, {} '
                               'batches).'.format(steps))
            else:
                warning_msg = 'Number of steps ran: {} steps'.format(
                    current_step)

            logging.warning('Your dataset iterator ran out of data; '
                            'interrupting evaluation. ' + warning_msg)
            break

        # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
        for i, label in enumerate(model.output_names):
            unconcatenated_outs[i].extend(batch_outs[label])
        batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
        callbacks._call_batch_hook(mode, 'end', current_step, batch_logs)
        if verbose >= 1:
            progbar.update(current_step + 1)
        current_step += 1

    callbacks._call_end_hook(mode)

    scope.__exit__(None, None, None)

    if len(unconcatenated_outs) == 1:
        prediction_result = np.concatenate(unconcatenated_outs[0], axis=0)
    else:
        prediction_result = [
            np.concatenate(unconcatenated_outs[i], axis=0)
            for i in range(len(unconcatenated_outs))
        ]

    if padding_handler:
        prediction_result = padding_handler.apply_mask(prediction_result)

    return prediction_result
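
A minimal, runnable sketch of the "run until exhaustion" control flow used above when `steps` is None; `run_one_step` is a hypothetical stand-in for the `K.batch_get_value` call and signals the end of data by raising StopIteration:

import numpy as np

def drain_iterator(run_one_step, steps=None):
    # Run until `steps` batches, or until the data source runs out.
    target_steps = steps if steps is not None else np.inf
    outs, current_step = [], 0
    while current_step < target_steps:
        try:
            outs.append(run_one_step())  # may raise when data is exhausted
        except StopIteration:
            break
        current_step += 1
    return outs

# Example: an iterator that yields three batches, then runs out.
it = iter([1, 2, 3])
print(drain_iterator(lambda: next(it)))  # [1, 2, 3]
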
def experimental_tpu_test_loop(model,
                               dataset,
                               verbose=0,
                               steps=None,
                               callbacks=None):
  """Test loop for evaluating with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, Verbosity mode 0 or 1.
      steps: Total number of steps (batches of samples)
          before declaring predictions finished.
          Ignored with the default value of `None`.
      callbacks: List of callbacks to be called during training

  Returns:
      Scalar loss (if the model has a single output and no metrics)
      or list of scalars (if the model has multiple outputs
      and/or metrics). The attribute `model.metrics_names` will give you
      the display labels for the outputs.
  """
  mode = ModeKeys.TEST
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset,
                                                     current_strategy)
  steps = training_utils.infer_steps_for_dataset(dataset, steps,
                                                 steps_name='steps')

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=0)
  scope.__enter__()

  out_labels = model.metrics_names
  step_fn = _make_step_fn(model, ModeKeys.TEST, current_strategy, out_labels)

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name in model.metrics_names[1:]:
    tensor = model._all_stateful_metrics_tensors[name]
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  # TODO(priyag): Use steps_per_run when we use new metrics as they will
  # allow handling metric computation at each step using variables.
  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=1,
      initial_loop_values=initial_loop_values)

  test_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  if verbose >= 1:
    progbar = Progbar(target=steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  distributed_training_utils._reset_metrics(model)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=False,
      epochs=1,
      steps_per_epoch=steps,
      verbose=verbose,
      count_mode='steps',
      mode=ModeKeys.TEST)
  callbacks._call_begin_hook(mode)

  outs = [0.] * len(model.metrics_names)
  if steps is not None:
    target_steps = steps
  else:
    target_steps = np.inf

  current_step = 0
  while current_step < target_steps:
    batch_logs = {'batch': current_step, 'size': 1}
    callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs)
    try:
      _, batch_outs = K.batch_get_value([test_op, output_tensors])
    except errors.OutOfRangeError:
      if steps is not None:
        warning_msg = (
            'Make sure that your dataset can generate at least '
            '`steps` batches (in this case, {} batches).'.format(steps))
      else:
        warning_msg = 'Number of steps ran: {} steps'.format(current_step)

      logging.warning('Your dataset iterator ran out of data; '
                      'interrupting evaluation. ' + warning_msg)
      target_steps = current_step
      break
    for i, label in enumerate(model.metrics_names):
      if i == 0:
        # Loss is a stateless metric.
        outs[i] += batch_outs[label]
      else:
        # For all stateful metrics, the aggregation is handled by mirrored vars.
        outs[i] = batch_outs[label]

    batch_logs = cbks.make_logs(model, batch_logs, outs, mode)
    callbacks._call_batch_hook(mode, 'end', current_step, batch_logs)
    if verbose >= 1:
      progbar.update(current_step + 1)
    current_step += 1

  callbacks._call_end_hook(mode)

  scope.__exit__(None, None, None)
  if len(outs) > 0:
    outs[0] /= target_steps

  if len(outs) == 1:
    return outs[0]
  return outs
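
A toy illustration (not the TF API) of the aggregation rule in the loop above: the stateless loss is summed per step and averaged at the end, while stateful metrics keep only their last, already-accumulated value:

def aggregate_eval_outs(per_step_outs):
    # Average the loss across steps; keep the final value of everything else.
    num_steps = len(per_step_outs)
    loss = sum(step['loss'] for step in per_step_outs) / num_steps
    metrics = {k: v for k, v in per_step_outs[-1].items() if k != 'loss'}
    return loss, metrics

# Example: two evaluation steps.
print(aggregate_eval_outs([{'loss': 2.0, 'acc': 0.5},
                           {'loss': 4.0, 'acc': 0.75}]))  # (3.0, {'acc': 0.75})
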
def experimental_tpu_predict_loop(model, dataset, verbose=0, steps=None):
  """Predict loop for predicting with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, Verbosity mode 0 or 1.
      steps: Total number of steps (batches of samples)
          before declaring `_predict_loop` finished.
          Ignored with the default value of `None`.

  Returns:
      Array of predictions (if the model has a single output)
      or list of arrays of predictions
      (if the model has multiple outputs).
  """
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
  scope = current_strategy.scope()
  scope.__enter__()

  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
  K.set_learning_phase(0)

  def _per_device_predict_function(model):
    model._make_predict_function()
    return (model.predict_function.inputs,
            model.predict_function.outputs,
            model.predict_function.updates_op,
            model.predict_function.session_kwargs)

  def step_fn(ctx, inputs):
    """Clones the model and calls make_predict_function."""
    if model._compile_distribution:
      distributed_training_utils.clone_model_on_replicas(
          model, current_strategy,
          make_callback_model=False, inputs=inputs,
          mode=distributed_training_utils.ModeKeys.PREDICT)
    else:
      distributed_training_utils._build_distributed_network(
          model, current_strategy, inputs,
          mode=distributed_training_utils.ModeKeys.PREDICT)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_predict_function, args=(model._distributed_model_predict,))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs, all_outputs,
        updates=all_updates,
        name='distributed_predict_function',
        **all_session_args)

    for label, output in zip(model.output_names, combined_fn.outputs):
      ctx.set_last_step_output(label, output)

    return combined_fn.updates_op

  # Add initial dummy values for outputs.
  initial_loop_values = {}
  batch_dimension = distributed_training_utils.get_batch_dimension(iterator)
  for name, tensor in zip(model.output_names, model.outputs):
    # TODO(priyag): This is a workaround as we do not know the batch dimension
    # of the model's output at this point.
    shape = tensor_shape.TensorShape(tensor.shape.dims)
    shape.dims = [batch_dimension] + shape.dims[1:]
    initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype)

  # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=1,
      initial_loop_values=initial_loop_values)

  predict_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  if verbose >= 1:
    progbar = Progbar(target=steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(
        model, model._distributed_model_predict)

  distributed_training_utils._reset_metrics(
      model, model._distributed_model_predict)

  assert steps is not None
  # Since we do not know how many samples we will see, we cannot pre-allocate
  # the returned Numpy arrays. Instead, we store one array per batch seen
  # and concatenate them upon returning.
  unconcatenated_outs = [[] for _ in model.outputs]
  for step in range(steps):
    _, batch_outs = K.get_session().run([predict_op, output_tensors])
    # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
    for i, label in enumerate(model.output_names):
      unconcatenated_outs[i].extend(batch_outs[label])
    if verbose >= 1:
      progbar.update(step + 1)

  scope.__exit__(None, None, None)
  if len(unconcatenated_outs) == 1:
    return np.concatenate(unconcatenated_outs[0], axis=0)
  return [
      np.concatenate(unconcatenated_outs[i], axis=0)
      for i in range(len(unconcatenated_outs))
  ]
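
A toy version of the final concatenation step, with made-up arrays standing in for the per-batch outputs collected in `unconcatenated_outs`:

import numpy as np

# One list of per-batch arrays per model output, joined along the batch axis.
unconcatenated_outs = [[np.zeros((2, 4)), np.ones((2, 4))]]  # 1 output, 2 batches
if len(unconcatenated_outs) == 1:
    result = np.concatenate(unconcatenated_outs[0], axis=0)
else:
    result = [np.concatenate(out, axis=0) for out in unconcatenated_outs]
print(result.shape)  # (4, 4)
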
Example #10
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
    """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      Returns `None`.

  Raises:
      ValueError: in case of invalid arguments.
  """
    mode = ModeKeys.TRAIN
    # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
    current_strategy = model._distribution_strategy
    iterator = distributed_training_utils.get_iterator(dataset,
                                                       current_strategy)
    steps_per_epoch = training_utils.infer_steps_for_dataset(
        dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch')
    if (current_strategy.extended.steps_per_run != 1
            and steps_per_epoch is None):
        raise ValueError('`steps_per_epoch` should be specified when calling '
                         '`fit` on the model with TPUStrategy when '
                         '`steps_per_run` != 1 .')

    scope = distributed_training_utils.distributed_scope(
        strategy=current_strategy, learning_phase=1)
    scope.__enter__()

    out_labels = model.metrics_names or []

    step_fn = _make_step_fn(model, ModeKeys.TRAIN, current_strategy,
                            out_labels)

    # Add initial dummy values for loss and other metric tensors.
    initial_loop_values = {}
    initial_loop_values['loss'] = constant_op.constant(1e7)
    for name in model.metrics_names[1:]:
        tensor = model._all_metrics_tensors[name]
        initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

    use_steps = steps_per_epoch is not None
    if use_steps:
        iteration_value = min(steps_per_epoch,
                              current_strategy.extended.steps_per_run)
    else:
        iteration_value = current_strategy.extended.steps_per_run

    steps_per_run = K.variable(value=iteration_value,
                               dtype='int32',
                               name='steps_per_run')
    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn,
        iterator,
        iterations=steps_per_run,
        initial_loop_values=initial_loop_values)
    train_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    do_validation = bool(validation_steps)

    if model._compile_distribution:
        distributed_training_utils._copy_weights_to_distributed_model(
            model, mode)

    callbacks = cbks.configure_callbacks(callbacks,
                                         model,
                                         do_validation=do_validation,
                                         epochs=epochs,
                                         steps_per_epoch=steps_per_epoch,
                                         verbose=verbose,
                                         count_mode='steps',
                                         mode=mode)

    # Calculate the steps each time on the device.
    if use_steps:
        steps_to_run = (
            [current_strategy.extended.steps_per_run] *
            (steps_per_epoch // current_strategy.extended.steps_per_run))
        if steps_per_epoch % current_strategy.extended.steps_per_run:
            steps_to_run.append(steps_per_epoch %
                                current_strategy.extended.steps_per_run)
        target_steps = len(steps_to_run)
    else:
        target_steps = np.inf

    callbacks._call_begin_hook(mode)
    for epoch in range(initial_epoch, epochs):
        distributed_training_utils._reset_metrics(model)
        callbacks.on_epoch_begin(epoch)
        epoch_logs = {}
        step_index = 0
        prev_step_count = None
        current_step = 0
        while current_step < target_steps:
            step_count = steps_to_run[current_step] if use_steps else 1
            batch_logs = {
                'batch': step_index,
                'size': 1,
                'num_steps': step_count
            }
            callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
            if prev_step_count is None or step_count != prev_step_count:
                steps_per_run.load(step_count, K.get_session())
                prev_step_count = step_count
            try:
                _, outputs = K.batch_get_value([train_op, output_tensors])
            except errors.OutOfRangeError:
                if use_steps:
                    logging.warning(
                        'Your dataset iterator ran out of data; '
                        'interrupting training. Make sure that your dataset '
                        'can generate at least `steps_per_epoch * epochs` '
                        'batches (in this case, %d batches).' %
                        (steps_per_epoch * epochs))
                else:
                    target_steps = current_step
                    logging.info(
                        'Dataset iterator ran out of data. Inferring the '
                        'value of `steps_per_epoch` as %s.' % target_steps)
                    distributed_training_utils.initialize_iterator(
                        iterator, current_strategy)
                break

            batch_logs.update(outputs)
            callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
            step_index = step_index + step_count
            current_step += 1

            if callbacks.model.stop_training:
                break

        if (do_validation and training_utils.should_run_validation(
                validation_freq, epoch)):
            logging.info('Running validation at fit epoch: %s', epoch)

            if model._compile_distribution:
                # Since we create a new clone from the original model we need to copy
                # the weights back to the original model before we can run validation.
                distributed_training_utils._copy_weights_to_original_model(
                    model, ModeKeys.TRAIN)

            val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
                model,
                val_dataset,
                steps=validation_steps,
                verbose=verbose,
                callbacks=callbacks)
            if not isinstance(val_outs, list):
                val_outs = [val_outs]
            # Same labels assumed.
            for label, val_out in zip(out_labels, val_outs):
                epoch_logs['val_' + label] = val_out

        callbacks.on_epoch_end(epoch, epoch_logs)
        if callbacks.model.stop_training:
            break
    callbacks._call_end_hook(mode)

    if model._compile_distribution:
        # Copy the weights back from the replicated model to the original model.
        distributed_training_utils._copy_weights_to_original_model(
            model, ModeKeys.TRAIN)
    scope.__exit__(None, None, None)
    return model.history
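
A small sketch of how the fit loop above splits an epoch into device bursts via `steps_to_run` (the values passed to `chunk_epoch` are hypothetical):

def chunk_epoch(steps_per_epoch, steps_per_run):
    # Full bursts of `steps_per_run`, plus one remainder burst if needed.
    steps_to_run = [steps_per_run] * (steps_per_epoch // steps_per_run)
    if steps_per_epoch % steps_per_run:
        steps_to_run.append(steps_per_epoch % steps_per_run)
    return steps_to_run

print(chunk_epoch(10, 4))  # [4, 4, 2] -> three host-side loop iterations
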
Example #11
def experimental_tpu_predict_loop(model,
                                  dataset,
                                  verbose=0,
                                  steps=None,
                                  callbacks=None):
    """Predict loop for predicting with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, Verbosity mode 0 or 1.
      steps: Total number of steps (batches of samples)
          before declaring `_predict_loop` finished.
          Ignored with the default value of `None`.
      callbacks: List of callbacks to be called during training

  Returns:
      Array of predictions (if the model has a single output)
      or list of arrays of predictions
      (if the model has multiple outputs).
  """
    mode = ModeKeys.PREDICT
    steps = training_utils.infer_steps_for_dataset(dataset,
                                                   steps,
                                                   steps_name='steps')
    dataset_fully_shaped = (
        distributed_training_utils.is_dataset_shape_fully_defined(dataset))
    padding_handler = None
    if not dataset_fully_shaped:
        # TODO(hongjunchoi): Investigate whether operations from
        # PartialBatchPaddingHandler are unnecessarily pruned out
        # during graph optimization.
        padding_handler = padding_util.PartialBatchPaddingHandler(
            model._feed_output_shapes)
        batch_size, _, prefetch_buffer = input_lib._get_dataset_attributes(
            dataset)
        padding_handler.padded_batch_size = batch_size
        padding_handler.padding_mask = dataset.reduce(
            padding_handler.padding_mask, padding_handler.update_mask)

        dataset = dataset.map(padding_handler.pad_batch)
        dataset = dataset.apply(batching.unbatch())
        # Upon this point, it is guaranteed that the dataset does not
        # have partial batches. Thus, we set `drop_remainder=True` to
        # get static shape information about the elements in the dataset.
        dataset = dataset.batch(batch_size, drop_remainder=True)

        if prefetch_buffer is not None:
            dataset = dataset.prefetch(prefetch_buffer)

    current_strategy = model._distribution_strategy
    iterator = distributed_training_utils.get_iterator(dataset,
                                                       current_strategy)

    scope = distributed_training_utils.distributed_scope(
        strategy=current_strategy, learning_phase=0)
    scope.__enter__()

    def _predict_step_fn(inputs):
        """A fn that returns output of single prediction step."""

        (distribution_strategy_context.get_replica_context().merge_call(
            _build_model, args=(model, mode, inputs)))

        (_, outputs, updates, _) = (_per_device_execution_function(
            distributed_training_utils.get_distributed_model(model, mode),
            mode))

        with ops.control_dependencies([updates]):
            return outputs

    # TODO(hongjunchoi): When numpy array is passed as an input to `predict()`
    # use numpy arrays directly to avoid cumulating unnecessary input pipeline
    # ops.
    predict_input_data = iterator.get_next()
    per_replica_outputs = current_strategy.experimental_run_v2(
        _predict_step_fn, args=(predict_input_data, ))
    output_tensors = distributed_training_utils.flatten_perdevice_values(
        current_strategy, per_replica_outputs)

    if verbose >= 1:
        progbar = Progbar(target=steps)

    if model._compile_distribution:
        distributed_training_utils._copy_weights_to_distributed_model(
            model, mode)

    distributed_training_utils._reset_metrics(model)

    callbacks = cbks.configure_callbacks(callbacks,
                                         model,
                                         do_validation=False,
                                         epochs=1,
                                         steps_per_epoch=steps,
                                         verbose=verbose,
                                         count_mode='steps',
                                         mode=mode)
    callbacks._call_begin_hook(mode)

    # Since we do not know how many samples we will see, we cannot pre-allocate
    # the returned Numpy arrays. Instead, we store one array per batch seen
    # and concatenate them upon returning.
    num_model_outputs = len(model.output_names)
    unconcatenated_outs = [[] for _ in range(num_model_outputs)]
    if steps is not None:
        target_steps = steps
    else:
        target_steps = np.inf

    current_step = 0
    while current_step < target_steps:
        batch_logs = {'batch': current_step, 'size': 1}
        callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs)
        try:
            predict_ops = control_flow_ops.group(output_tensors)
            _, batch_outs = K.batch_get_value([predict_ops, output_tensors])

        except errors.OutOfRangeError:
            if steps is not None:
                warning_msg = (
                    'Make sure that your dataset can generate at least '
                    '`steps` batches (in this case, {} batches).'.format(steps))
            else:
                warning_msg = 'Number of steps ran: {} steps'.format(
                    current_step)

            logging.warning('Your dataset iterator ran out of data; '
                            'interrupting evaluation. ' + warning_msg)
            break

        # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
        for i in range(num_model_outputs):
            output_start_index = i * current_strategy.num_replicas_in_sync
            output_end_index = (output_start_index +
                                current_strategy.num_replicas_in_sync)
            single_model_output = batch_outs[
                output_start_index:output_end_index]
            unconcatenated_outs[i].extend(single_model_output)

        batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
        callbacks._call_batch_hook(mode, 'end', current_step, batch_logs)
        if verbose >= 1:
            progbar.update(current_step + 1)
        current_step += 1

    callbacks._call_end_hook(mode)

    scope.__exit__(None, None, None)

    if len(unconcatenated_outs) == 1:
        prediction_result = np.concatenate(unconcatenated_outs[0], axis=0)
    else:
        prediction_result = [
            np.concatenate(unconcatenated_outs[i], axis=0)
            for i in range(len(unconcatenated_outs))
        ]

    if padding_handler:
        prediction_result = padding_handler.apply_mask(prediction_result)

    return prediction_result
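
A minimal tf.data sketch of the pad-unbatch-rebatch trick above, assuming TF 2.x where `Dataset.unbatch()` replaces `batching.unbatch()`; the short final batch is zero-padded to full size so every batch gets a fully defined static shape (the padding is masked out afterwards by the padding handler):

import tensorflow as tf

batch_size = 4
ds = tf.data.Dataset.range(10).batch(batch_size)  # last batch holds 2 elements

def pad_batch(batch):
    # Zero-pad a possibly partial batch up to `batch_size`.
    pad = batch_size - tf.shape(batch)[0]
    return tf.pad(batch, [[0, pad]])

ds = ds.map(pad_batch).unbatch().batch(batch_size, drop_remainder=True)
for batch in ds:
    print(batch.numpy())  # three full batches of 4
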
Example #12
def model_iteration(model,
                    inputs,
                    targets=None,
                    sample_weights=None,
                    batch_size=None,
                    epochs=1,
                    verbose=1,
                    callbacks=None,
                    val_inputs=None,
                    val_targets=None,
                    val_sample_weights=None,
                    shuffle=True,
                    initial_epoch=0,
                    steps_per_epoch=None,
                    validation_steps=None,
                    mode=ModeKeys.TRAIN,
                    validation_in_fit=False,
                    **kwargs):
  """Loop function for arrays of data with modes TRAIN/TEST/PREDICT.

  Arguments:
      model: Keras Model instance.
      inputs: Either a list of arrays or a dictionary.
      targets: List of target arrays.
      sample_weights: Optional list of sample weight arrays.
      batch_size: Integer batch size or None if unknown.
      epochs: Number of times to iterate over the data
      verbose: Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      val_inputs: List of input arrays.
      val_targets: List of target arrays.
      val_sample_weights: Optional list of sample weight arrays.
      shuffle: Whether to shuffle the data at the beginning of each epoch.
      initial_epoch: Epoch at which to start training (useful for resuming a
        previous training run)
      steps_per_epoch: Total number of steps (batches of samples) before
        declaring one epoch finished and starting the next epoch. Ignored with
        the default value of `None`.
      validation_steps: Number of steps to run validation for (only if doing
        validation from data tensors). Ignored with the default value of `None`.
      mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
      validation_in_fit: DEPRECATED: if true, then this method is invoked from
        within training iteration (for validation). In this case, do not copy
        weights when using a tf.distribute.Strategy. The input is deprecated as
        it is not required if the user creates a distributed model under the
        distribution strategy scope rather than passing it to compile.
      **kwargs: Additional arguments for backwards compatibility.

  Returns:
      - In TRAIN mode: `History` object.
      - In TEST mode: Evaluation metrics.
      - In PREDICT mode: Outputs of the Model called on inputs.

  Raises:
      ValueError: in case of invalid arguments.
  """
  # Backwards compatibility.
  if 'steps' in kwargs:
    steps_per_epoch = kwargs['steps']

  _validate_arguments(steps_per_epoch, validation_steps, kwargs)
  if mode == ModeKeys.TRAIN:
    _print_train_info(inputs, val_inputs, steps_per_epoch, verbose)

  # Enter DistributionStrategy scope.
  if model._distribution_strategy:
    scope = model._distribution_strategy.scope()
    scope.__enter__()

  # Get step function and loop type.
  f = _make_execution_function(model, mode)
  use_steps = steps_per_epoch is not None
  do_validation = val_inputs is not None

  # Convert Eager Tensors to NumPy arrays to support batching/shuffling.
  inputs, targets, sample_weights = training_utils.convert_eager_tensors_to_numpy(
      (inputs, targets, sample_weights))

  # Prepare input data.
  ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode)
  num_samples_or_steps = _get_num_samples_or_steps(ins, batch_size,
                                                   steps_per_epoch)

  # Configure callbacks.
  count_mode = 'steps' if use_steps else 'samples'
  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      batch_size=batch_size,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      samples=num_samples_or_steps,
      verbose=0,  # Handle ProgBarLogger separately in this loop.
      mode=mode)
  # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready.
  progbar = training_utils.get_progbar(model, count_mode)
  progbar.params = callbacks.params
  progbar.params['verbose'] = verbose

  # Find beforehand arrays that need sparse-to-dense conversion.
  if issparse is not None and not use_steps:
    indices_for_conversion_to_dense = []
    feed = _get_model_feed(model, mode)
    for i, (input_data, feed_tensor) in enumerate(zip(ins, feed)):
      if issparse(input_data) and not K.is_sparse(feed_tensor):
        indices_for_conversion_to_dense.append(i)

  # Select aggregation method.
  if mode == ModeKeys.PREDICT:
    aggregator = training_utils.OutputsAggregator(use_steps,
                                                  num_samples_or_steps)
  else:
    aggregator = training_utils.MetricsAggregator(use_steps,
                                                  num_samples_or_steps)

  if model._compile_distribution and not validation_in_fit:
    distributed_training_utils._copy_weights_to_distributed_model(
        model, model._distributed_model)

  callbacks.model.stop_training = False
  callbacks._call_begin_hook(mode)
  progbar.on_train_begin()

  for epoch in range(initial_epoch, epochs):
    if callbacks.model.stop_training:
      break

    # Setup work for each epoch
    epoch_logs = {}
    model.reset_metrics()
    if mode == ModeKeys.TRAIN:
      callbacks.on_epoch_begin(epoch, epoch_logs)
    progbar.on_epoch_begin(epoch, epoch_logs)

    if use_steps:
      # Step-wise loop.
      for step in range(steps_per_epoch):
        batch_logs = {'batch': step, 'size': 1}
        callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
        progbar.on_batch_begin(step, batch_logs)

        # Get outputs.
        try:
          # `ins` can be callable in DistributionStrategy + eager case.
          actual_inputs = ins() if callable(ins) else ins
          batch_outs = f(actual_inputs)
        except errors.OutOfRangeError:
          logging.warning('Your dataset iterator ran out of data; '
                          'interrupting training. Make sure that your dataset '
                          'can generate at least `steps_per_epoch * epochs` '
                          'batches (in this case, %d batches). You may need '
                          'to use the repeat() function when building your '
                          'dataset.' % (steps_per_epoch * epochs))
          break
        if not isinstance(batch_outs, list):
          batch_outs = [batch_outs]

        if model._distribution_strategy:
          batch_outs = distributed_training_utils._per_device_aggregate_batch(
              batch_outs, model, mode)

        # Aggregate results.
        if step == 0:
          aggregator.create(batch_outs)
        aggregator.aggregate(batch_outs)

        # Callbacks batch end.
        batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
        callbacks._call_batch_hook(mode, 'end', step, batch_logs)
        progbar.on_batch_end(step, batch_logs)

        if callbacks.model.stop_training:
          break
    else:
      # Sample-wise loop.
      index_array = np.arange(num_samples_or_steps)
      if shuffle == 'batch':
        index_array = training_utils.batch_shuffle(index_array, batch_size)
      elif shuffle:
        np.random.shuffle(index_array)
      batches = make_batches(num_samples_or_steps, batch_size)

      for batch_index, (batch_start, batch_end) in enumerate(batches):
        batch_ids = index_array[batch_start:batch_end]

        # Slice into a batch.
        try:
          if ins and isinstance(ins[-1], int):
            # Do not slice the training phase flag.
            ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
          else:
            ins_batch = slice_arrays(ins, batch_ids)
        except TypeError:
          raise TypeError('TypeError while preparing batch. '
                          'If using HDF5 input data, '
                          'pass shuffle="batch".')

        # Sparse to dense conversion.
        if issparse is not None:
          for i in indices_for_conversion_to_dense:
            ins_batch[i] = ins_batch[i].toarray()

        # Callbacks batch_begin.
        batch_logs = {'batch': batch_index, 'size': len(batch_ids)}
        callbacks._call_batch_hook(mode, 'begin', batch_index, batch_logs)
        progbar.on_batch_begin(batch_index, batch_logs)

        # Get outputs.
        batch_outs = f(ins_batch)
        if not isinstance(batch_outs, list):
          batch_outs = [batch_outs]

        # Aggregate results.
        if batch_index == 0:
          aggregator.create(batch_outs)
        aggregator.aggregate(batch_outs, batch_start, batch_end)

        # Callbacks batch end.
        batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
        callbacks._call_batch_hook(mode, 'end', batch_index, batch_logs)
        progbar.on_batch_end(batch_index, batch_logs)

        if callbacks.model.stop_training:
          break

    aggregator.finalize()
    results = aggregator.results
    epoch_logs = cbks.make_logs(model, epoch_logs, results, mode)
    if len(results) == 1:
      results = results[0]

    # Run the test loop every epoch during training.
    if do_validation and not callbacks.model.stop_training:
      val_results = model_iteration(
          model,
          val_inputs,
          targets=val_targets,
          sample_weights=val_sample_weights,
          batch_size=batch_size,
          steps_per_epoch=validation_steps,
          callbacks=callbacks,
          verbose=0,
          mode=ModeKeys.TEST,
          validation_in_fit=True)
      if not isinstance(val_results, list):
        val_results = [val_results]
      epoch_logs = cbks.make_logs(
          model, epoch_logs, val_results, mode, prefix='val_')

    if mode == ModeKeys.TRAIN:
      # Epochs only apply to `fit`.
      callbacks.on_epoch_end(epoch, epoch_logs)
      progbar.on_epoch_end(epoch, epoch_logs)

  callbacks._call_end_hook(mode)

  if model._distribution_strategy:
    if model._compile_distribution and not validation_in_fit:
      # TODO(priyag, psv): Copy back metrics to the original model as well?
      distributed_training_utils._copy_weights_to_original_model(
          model, model._distributed_model, mode)
    scope.__exit__(None, None, None)

  if mode == ModeKeys.TRAIN:
    return model.history
  return results
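
A toy version of the sample-wise loop's batching above: shuffle the index array once per epoch, then slice fixed-size batches out of the permutation (`make_batches` is re-implemented here only to keep the sketch self-contained):

import numpy as np

def make_batches(size, batch_size):
    # (start, end) index pairs covering `size` samples in `batch_size` chunks.
    return [(i, min(i + batch_size, size)) for i in range(0, size, batch_size)]

num_samples, batch_size = 10, 4
index_array = np.arange(num_samples)
np.random.shuffle(index_array)
for batch_start, batch_end in make_batches(num_samples, batch_size):
    batch_ids = index_array[batch_start:batch_end]
    # ...slice inputs/targets with batch_ids and run the step function...
    print(batch_ids)
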
Example #13
def experimental_tpu_test_loop(model,
                               dataset,
                               verbose=0,
                               steps=None,
                               callbacks=None):
    """Test loop for evaluating with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, Verbosity mode 0 or 1.
      steps: Total number of steps (batches of samples)
          before declaring predictions finished.
          Ignored with the default value of `None`.
      callbacks: List of callbacks to be called during training

  Returns:
      Scalar loss (if the model has a single output and no metrics)
      or list of scalars (if the model has multiple outputs
      and/or metrics). The attribute `model.metrics_names` will give you
      the display labels for the outputs.
  """
    mode = ModeKeys.TEST
    current_strategy = model._distribution_strategy
    iterator = distributed_training_utils.get_iterator(dataset,
                                                       current_strategy)
    steps = training_utils.infer_steps_for_dataset(dataset,
                                                   steps,
                                                   steps_name='steps')

    scope = distributed_training_utils.distributed_scope(
        strategy=current_strategy, learning_phase=0)
    scope.__enter__()

    out_labels = model.metrics_names

    def _test_step_fn(inputs):
        """A fn that returns output of single test step."""
        inputs, targets = inputs
        (distribution_strategy_context.get_replica_context().merge_call(
            _build_model, args=(model, mode, inputs, targets)))

        (_, outputs, updates, _) = (_per_device_execution_function(
            distributed_training_utils.get_distributed_model(model, mode),
            mode))
        with ops.control_dependencies([updates]):
            return outputs

    test_input_data = iterator.get_next()
    per_replica_outputs = current_strategy.experimental_run_v2(
        _test_step_fn, args=(test_input_data, ))
    output_tensors = {}
    for label, output in zip(out_labels, per_replica_outputs):
        if label == 'loss':
            reduce_op = ds_reduce_util.ReduceOp.SUM
        else:
            # We reduce all other metrics using mean for now. This is temporary
            # workaround until new metrics are in place.
            reduce_op = ds_reduce_util.ReduceOp.MEAN
        output_tensors[label] = current_strategy.reduce(reduce_op, output)
    test_op = control_flow_ops.group(list(output_tensors.values()))

    if verbose >= 1:
        progbar = Progbar(target=steps)

    if model._compile_distribution:
        distributed_training_utils._copy_weights_to_distributed_model(
            model, mode)

    distributed_training_utils._reset_metrics(model)

    callbacks = cbks.configure_callbacks(callbacks,
                                         model,
                                         do_validation=False,
                                         epochs=1,
                                         steps_per_epoch=steps,
                                         verbose=verbose,
                                         count_mode='steps',
                                         mode=ModeKeys.TEST)
    callbacks._call_begin_hook(mode)

    outs = [0.] * len(model.metrics_names)
    if steps is not None:
        target_steps = steps
    else:
        raise ValueError('Number of steps could not be inferred from the '
                         'data, please pass the steps argument.')

    current_step = 0
    while current_step < target_steps:
        batch_logs = {'batch': current_step, 'size': 1}
        callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs)
        try:
            _, batch_outs = K.batch_get_value([test_op, output_tensors])
        except errors.OutOfRangeError:
            warning_msg = (
                'Make sure that your dataset can generate at least '
                '`steps` batches (in this case, {} batches).'.format(steps))

            logging.warning('Your dataset iterator ran out of data; '
                            'interrupting evaluation. ' + warning_msg)
            target_steps = current_step
            break
        for i, label in enumerate(model.metrics_names):
            if i == 0:
                # Loss is a stateless metric.
                outs[i] += batch_outs[label]
            else:
                # For all stateful metrics, the aggregation is handled by mirrored vars.
                outs[i] = batch_outs[label]

        batch_logs = cbks.make_logs(model, batch_logs, outs, mode)
        callbacks._call_batch_hook(mode, 'end', current_step, batch_logs)
        if verbose >= 1:
            progbar.update(current_step + 1)
        current_step += 1

    if verbose >= 1:
        # Progress bar finishes at the end.
        progbar.update(target_steps)
    callbacks._call_end_hook(mode)

    scope.__exit__(None, None, None)
    if len(outs) > 0:
        outs[0] /= target_steps

    if len(outs) == 1:
        return outs[0]
    return outs
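
A toy reduction mirroring the rule above, outside any distribution strategy: per-replica loss values are summed across replicas, while every other metric is averaged as a stop-gap until stateful metrics handle it:

def reduce_metric(per_replica_values, label):
    # SUM for loss, MEAN for everything else.
    if label == 'loss':
        return sum(per_replica_values)
    return sum(per_replica_values) / len(per_replica_values)

print(reduce_metric([0.2, 0.4], 'loss'))  # ~0.6
print(reduce_metric([0.8, 0.6], 'acc'))   # 0.7
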
def experimental_tpu_test_loop(model,
                               dataset,
                               verbose=0,
                               steps=None,
                               callbacks=None):
  """Test loop for evaluating with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, Verbosity mode 0 or 1.
      steps: Total number of steps (batches of samples)
          before declaring predictions finished.
          Ignored with the default value of `None`.
      callbacks: List of callbacks to be called during training

  Returns:
      Scalar loss (if the model has a single output and no metrics)
      or list of scalars (if the model has multiple outputs
      and/or metrics). The attribute `model.metrics_names` will give you
      the display labels for the outputs.
  """
  mode = ModeKeys.TEST
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=0)
  scope.__enter__()

  def _per_device_eval_function(model):
    model._make_eval_function()
    return (model._eval_function.inputs, model._eval_function.outputs,
            model._eval_function.updates_op,
            model._eval_function.session_kwargs)

  def step_fn(ctx, inputs):
    """Clones the model and calls make_eval_function."""
    inputs, targets = inputs
    if model._compile_distribution:
      distributed_training_utils.clone_model_on_replicas(
          model, current_strategy, mode=mode, inputs=inputs, targets=targets)
    else:
      distributed_training_utils._build_distributed_network(
          model, current_strategy, mode, inputs, targets)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_eval_function,
         args=(distributed_training_utils.get_distributed_model(
             model, ModeKeys.TEST),))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs, all_outputs,
        updates=all_updates,
        name='distributed_test_function',
        **all_session_args)

    for label, output in zip(model.metrics_names, combined_fn.outputs):
      if label == 'loss':
        reduce_op = ds_reduce_util.ReduceOp.SUM
      else:
        # We reduce all other metrics using mean for now. This is temporary
        # workaround until new metrics are in place.
        reduce_op = ds_reduce_util.ReduceOp.MEAN
      ctx.set_last_step_output(label, output, reduce_op)

    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name in model.metrics_names[1:]:
    tensor = model._all_stateful_metrics_tensors[name]
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  # TODO(priyag): Use steps_per_run when we use new metrics as they will
  # allow handling metric computation at each step using variables.
  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=1,
      initial_loop_values=initial_loop_values)

  test_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  if verbose >= 1:
    progbar = Progbar(target=steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  distributed_training_utils._reset_metrics(model)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=False,
      epochs=1,
      steps_per_epoch=steps,
      verbose=verbose,
      count_mode='steps',
      mode=ModeKeys.TEST)
  callbacks._call_begin_hook(mode)

  assert steps is not None
  outs = [0.] * len(model.metrics_names)
  for step in range(steps):
    batch_logs = {'batch': step, 'size': 1}
    callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
    _, batch_outs = K.get_session().run([test_op, output_tensors])
    for i, label in enumerate(model.metrics_names):
      if i == 0:
        # Loss is a stateless metric.
        outs[i] += batch_outs[label]
      else:
        # For all stateful metrics, the aggregation is handled by mirrored vars.
        outs[i] = batch_outs[label]

    batch_logs = cbks.make_logs(model, batch_logs, outs, mode)
    callbacks._call_batch_hook(mode, 'end', step, batch_logs)
    if verbose >= 1:
      progbar.update(step + 1)

  callbacks._call_end_hook(mode)

  scope.__exit__(None, None, None)
  if len(outs) > 0:
    outs[0] /= steps

  if len(outs) == 1:
    return outs[0]
  return outs
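
A hedged re-implementation of the `validation_freq` rule the fit loops delegate to `training_utils.should_run_validation`: an integer means "validate every N epochs", a container lists the 1-based epochs to validate on (written from the docstring above, not copied from the library):

def should_run_validation(validation_freq, epoch):
    one_indexed = epoch + 1  # `epoch` is 0-based inside the loop
    if isinstance(validation_freq, int):
        return one_indexed % validation_freq == 0
    return one_indexed in validation_freq

print(should_run_validation(2, epoch=1))       # True: end of 2nd epoch
print(should_run_validation([1, 2, 10], 0))    # True: 1st epoch
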
def experimental_tpu_predict_loop(model, iterator, verbose=0, steps=None):
  """Predict loop for predicting with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      iterator: Iterator for input data.
      verbose: Integer, Verbosity mode 0 or 1.
      steps: Total number of steps (batches of samples)
          before declaring `_predict_loop` finished.
          Ignored with the default value of `None`.

  Returns:
      Array of predictions (if the model has a single output)
      or list of arrays of predictions
      (if the model has multiple outputs).
  """
  current_strategy = model._distribution_strategy
  scope = current_strategy.scope()
  scope.__enter__()

  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
  K.set_learning_phase(0)

  def _per_device_predict_function(model):
    model._make_predict_function()
    return (model.predict_function.inputs,
            model.predict_function.outputs,
            model.predict_function.updates_op,
            model.predict_function.session_kwargs)

  def step_fn(ctx, inputs):
    """Clones the model and calls make_predict_function."""
    if model._compile_distribution:
      distributed_training_utils.clone_model_on_replicas(
          model, current_strategy,
          make_callback_model=False, inputs=inputs,
          mode=distributed_training_utils.ModeKeys.PREDICT)
    else:
      distributed_training_utils._build_distributed_network(
          model, current_strategy, inputs,
          mode=distributed_training_utils.ModeKeys.PREDICT)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_predict_function, args=(model._distributed_model_predict,))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs, all_outputs,
        updates=all_updates,
        name='distributed_predict_function',
        **all_session_args)

    for label, output in zip(model.output_names, combined_fn.outputs):
      ctx.set_last_step_output(label, output)

    return combined_fn.updates_op

  # Add initial dummy values for outputs.
  initial_loop_values = {}
  batch_dimension = distributed_training_utils.get_batch_dimension(iterator)
  for name, tensor in zip(model.output_names, model.outputs):
    # TODO(priyag): This is a workaround as we do not know the batch dimension
    # of the model's output at this point.
    shape = tensor_shape.TensorShape(tensor.shape.dims)
    shape.dims = [batch_dimension] + shape.dims[1:]
    initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype)

  with current_strategy.scope():
    # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn, iterator, iterations=1,
        initial_loop_values=initial_loop_values)

  predict_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  if verbose >= 1:
    progbar = Progbar(target=steps)

  if model._compile_distribution:
    with current_strategy.scope():
      distributed_training_utils._copy_weights_to_distributed_model(
          model, model._distributed_model_predict)
  with current_strategy.scope():
    distributed_training_utils._reset_metrics(
        model, model._distributed_model_predict)

  assert steps is not None
  # Since we do not know how many samples we will see, we cannot pre-allocate
  # the returned Numpy arrays. Instead, we store one array per batch seen
  # and concatenate them upon returning.
  unconcatenated_outs = [[] for _ in model.outputs]
  for step in range(steps):
    _, batch_outs = K.get_session().run([predict_op, output_tensors])
    # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
    for i, label in enumerate(model.output_names):
      unconcatenated_outs[i].extend(batch_outs[label])
    if verbose >= 1:
      progbar.update(step + 1)

  scope.__exit__(None, None, None)
  if len(unconcatenated_outs) == 1:
    return np.concatenate(unconcatenated_outs[0], axis=0)
  return [
      np.concatenate(unconcatenated_outs[i], axis=0)
      for i in range(len(unconcatenated_outs))
  ]
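
A toy version of the `initial_loop_values` workaround used above: each output's batch dimension is unknown at this point, so dim 0 is replaced with the iterator's batch dimension and the dummy tensor is zero-filled (NumPy stands in for `array_ops.zeros`):

import numpy as np

def initial_loop_value(output_shape, batch_dimension, dtype=np.float32):
    # Swap the unknown batch dim for a concrete one, keep the rest.
    shape = (batch_dimension,) + tuple(output_shape[1:])
    return np.zeros(shape, dtype)

print(initial_loop_value((None, 10), batch_dimension=8).shape)  # (8, 10)
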
def experimental_tpu_predict_loop(model,
                                  dataset,
                                  verbose=0,
                                  steps=None,
                                  callbacks=None):
  """Predict loop for predicting with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, Verbosity mode 0 or 1.
      steps: Total number of steps (batches of samples)
          before declaring `_predict_loop` finished.
          Ignored with the default value of `None`.
      callbacks: List of callbacks to be called during training

  Returns:
      Array of predictions (if the model has a single output)
      or list of arrays of predictions
      (if the model has multiple outputs).
  """
  mode = ModeKeys.PREDICT
  dataset_fully_shaped = (distributed_training_utils.
                          is_dataset_shape_fully_defined(dataset))
  padding_handler = None
  if not dataset_fully_shaped:
    # TODO(hongjunchoi): Investigate whether operations from
    # PartialBatchPaddingHandler are unnecessarily pruned out
    # during graph optimization.
    padding_handler = padding_util.PartialBatchPaddingHandler(
        model._feed_output_shapes)
    batch_size, _, prefetch_buffer = input_lib._get_dataset_attributes(dataset)
    padding_handler.padded_batch_size = batch_size
    padding_handler.padding_mask = dataset.reduce(padding_handler.padding_mask,
                                                  padding_handler.update_mask)

    dataset = dataset.map(padding_handler.pad_batch)
    dataset = dataset.apply(batching.unbatch())
    # Upon this point, it is guaranteed that the dataset does not
    # have partial batches. Thus, we set `drop_remainder=True` to
    # get static shape information about the elements in the dataset.
    dataset = dataset.batch(batch_size, drop_remainder=True)

    if prefetch_buffer is not None:
      dataset = dataset.prefetch(prefetch_buffer)

  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=0)
  scope.__enter__()

  def _per_device_predict_function(model):
    model._make_predict_function()
    return (model.predict_function.inputs,
            model.predict_function.outputs,
            model.predict_function.updates_op,
            model.predict_function.session_kwargs)

  def step_fn(ctx, inputs):
    """Clones the model and calls make_predict_function."""
    if model._compile_distribution:
      distributed_training_utils.clone_model_on_replicas(
          model, current_strategy, mode, inputs=inputs)
    else:
      distributed_training_utils._build_distributed_network(
          model, current_strategy, mode, inputs)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_predict_function,
         args=(distributed_training_utils.get_distributed_model(
             model, ModeKeys.PREDICT),))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs, all_outputs,
        updates=all_updates,
        name='distributed_predict_function',
        **all_session_args)

    for label, output in zip(model.output_names, combined_fn.outputs):
      ctx.set_last_step_output(label, output)

    return combined_fn.updates_op

  # Add initial dummy values for outputs.
  initial_loop_values = {}
  batch_dimension = distributed_training_utils.get_batch_dimension(iterator)
  for name, tensor in zip(model.output_names, model.outputs):
    # TODO(priyag): This is a workaround as we do not know the batch dimension
    # of the model's output at this point.
    shape = tensor_shape.TensorShape(tensor.shape.dims)
    shape.dims = [batch_dimension] + shape.dims[1:]
    initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype)

  # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=1,
      initial_loop_values=initial_loop_values)

  predict_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  if verbose >= 1:
    progbar = Progbar(target=steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  distributed_training_utils._reset_metrics(model)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=False,
      epochs=1,
      steps_per_epoch=steps,
      verbose=verbose,
      count_mode='steps',
      mode=mode)
  callbacks._call_begin_hook(mode)

  assert steps is not None
  # Since we do not know how many samples we will see, we cannot pre-allocate
  # the returned Numpy arrays. Instead, we store one array per batch seen
  # and concatenate them upon returning.
  unconcatenated_outs = [[] for _ in model.outputs]
  for step in range(steps):
    batch_logs = {'batch': step, 'size': 1}
    callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
    _, batch_outs = K.get_session().run([predict_op, output_tensors])
    # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
    for i, label in enumerate(model.output_names):
      unconcatenated_outs[i].extend(batch_outs[label])
    batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
    callbacks._call_batch_hook(mode, 'end', step, batch_logs)
    if verbose >= 1:
      progbar.update(step + 1)

  callbacks._call_end_hook(mode)

  scope.__exit__(None, None, None)

  if len(unconcatenated_outs) == 1:
    prediction_result = np.concatenate(unconcatenated_outs[0], axis=0)
  else:
    prediction_result = [
        np.concatenate(unconcatenated_outs[i], axis=0)
        for i in range(len(unconcatenated_outs))
    ]

  if padding_handler:
    prediction_result = padding_handler.apply_mask(prediction_result)

  return prediction_result
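
The `OutOfRangeError` warnings above depend on implicit string-literal concatenation; both pieces must sit inside a single expression (parentheses), otherwise the second literal is a dead statement and the logged message is silently truncated:

steps = 5
# Correct: one parenthesized expression, so the literals concatenate.
warning_msg = ('Make sure that your dataset can generate at least '
               '`steps` batches (in this case, {} batches).'.format(steps))
print(warning_msg)
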
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
  """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      Returns `None`.

  Raises:
      ValueError: in case of invalid arguments.
  """
  mode = ModeKeys.TRAIN
  # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
  steps_per_epoch = training_utils.infer_steps_for_dataset(
      dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch')
  if (current_strategy.extended.steps_per_run != 1 and
      steps_per_epoch is None):
    raise ValueError('`steps_per_epoch` should be specified when calling '
                     '`fit` on the model with TPUStrategy when '
                     '`steps_per_run` != 1.')

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=1)
  scope.__enter__()

  out_labels = model.metrics_names or []

  step_fn = _make_step_fn(model, ModeKeys.TRAIN, current_strategy, out_labels)
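  # `step_fn` mirrors the predict-loop pattern above: it clones the model onto
  # each replica (when `_compile_distribution` is set) and returns the grouped
  # train tensors that `experimental_run_steps_on_iterator` drives below.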

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name in model.metrics_names[1:]:
    tensor = model._all_stateful_metrics_tensors[name]
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  use_steps = steps_per_epoch is not None
  if use_steps:
    iteration_value = min(steps_per_epoch,
                          current_strategy.extended.steps_per_run)
  else:
    iteration_value = current_strategy.extended.steps_per_run

  steps_per_run = K.variable(
      value=iteration_value,
      dtype='int32',
      name='steps_per_run')
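  # `steps_per_run` is a variable rather than a constant so that the last,
  # possibly smaller chunk of an epoch can be loaded into it at run time via
  # `steps_per_run.load(...)` in the epoch loop below.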
  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=steps_per_run,
      initial_loop_values=initial_loop_values)
  train_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  do_validation = bool(validation_steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      verbose=verbose,
      count_mode='steps',
      mode=mode)

  # Pre-compute how many steps to run on-device for each outer-loop iteration.
  if use_steps:
    steps_to_run = ([current_strategy.extended.steps_per_run] *
                    (steps_per_epoch //
                     current_strategy.extended.steps_per_run))
    if steps_per_epoch % current_strategy.extended.steps_per_run:
      steps_to_run.append(
          steps_per_epoch % current_strategy.extended.steps_per_run)
    target_steps = len(steps_to_run)
  else:
    target_steps = np.inf
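
  # Illustrative values: with `steps_per_epoch=10` and `steps_per_run=4`,
  # `steps_to_run` is [4, 4, 2] and `target_steps` is 3, i.e. each epoch
  # issues two full 4-step chunks plus one 2-step remainder on the device.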

  callbacks._call_begin_hook(mode)
  for epoch in range(initial_epoch, epochs):
    distributed_training_utils._reset_metrics(model)
    callbacks.on_epoch_begin(epoch)
    epoch_logs = {}
    step_index = 0
    prev_step_count = None
    current_step = 0
    while current_step < target_steps:
      step_count = steps_to_run[current_step] if use_steps else 1
      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
      callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
      if prev_step_count is None or step_count != prev_step_count:
        steps_per_run.load(step_count, K.get_session())
        prev_step_count = step_count
      try:
        _, outputs = K.batch_get_value([train_op, output_tensors])
      except errors.OutOfRangeError:
        if use_steps:
          logging.warning('Your dataset iterator ran out of data; '
                          'interrupting training. Make sure that your dataset '
                          'can generate at least `steps_per_epoch * epochs` '
                          'batches (in this case, %d batches).' %
                          (steps_per_epoch * epochs))
        else:
          target_steps = current_step
          logging.info('Dataset iterator ran out of data. Inferring the '
                       'value of `steps_per_epoch` as %s.' % target_steps)
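          # Re-initialize the iterator so the next epoch starts from the
          # beginning of the dataset.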
          distributed_training_utils.initialize_iterator(iterator,
                                                         current_strategy)
        break

      batch_logs.update(outputs)
      callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
      step_index = step_index + step_count
      current_step += 1

      if callbacks.model.stop_training:
        break

    if (do_validation and
        training_utils.should_run_validation(validation_freq, epoch)):
      logging.info('Running validation at fit epoch: %s', epoch)

      if model._compile_distribution:
        # Since we create a new clone from the original model we need to copy
        # the weights back to the original model before we can run validation.
        distributed_training_utils._copy_weights_to_original_model(
            model, ModeKeys.TRAIN)

      val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
          model,
          val_dataset,
          steps=validation_steps,
          verbose=verbose,
          callbacks=callbacks)
      if not isinstance(val_outs, list):
        val_outs = [val_outs]
      # Same labels assumed.
      for label, val_out in zip(out_labels, val_outs):
        epoch_logs['val_' + label] = val_out

    callbacks.on_epoch_end(epoch, epoch_logs)
    if callbacks.model.stop_training:
      break
  callbacks._call_end_hook(mode)

  if model._compile_distribution:
    # Copy the weights back from the replicated model to the original model.
    distributed_training_utils._copy_weights_to_original_model(
        model, ModeKeys.TRAIN)
  scope.__exit__(None, None, None)
  return model.history
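

# --- Hedged usage sketch (illustrative; not part of the original snippet) ---
# Assuming `model` was built and compiled under a TPU DistributionStrategy,
# and `train_ds` / `val_ds` are placeholder names for batched
# `tf.data.Dataset` instances, the fit loop above could be driven like:
#
#   history = experimental_tpu_fit_loop(
#       model,
#       train_ds,
#       epochs=5,
#       steps_per_epoch=100,
#       val_dataset=val_ds,
#       validation_steps=10,
#       validation_freq=1)
#   print(history.history['loss'])  # per-epoch training loss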