Example No. 1
def filter_distributed_callbacks(callbacks_list):
  """Filter Callbacks based on the worker context when running multi-worker.

  Arguments:
    callbacks_list: A list of `Callback` instances.

  Returns:
    The list of `Callback` instances that should be run on this worker.
  """

  if not multi_worker_util.in_multi_worker_mode():
    raise ValueError(
        'filter_distributed_callbacks() should only be called when Keras '
        'is in multi worker mode.')

  callbacks_list = callbacks_list or []
  if not [
      c for c in callbacks_list if isinstance(c, callbacks.ModelCheckpoint)
  ]:
    # TODO(rchao): Consider providing a ModelCheckpoint here if the user
    # fails to provide one (possibly saving to a tempfile directory).
    logging.warning('ModelCheckpoint callback is not provided. '
                    'Workers will need to restart training if any fails.')

  if callbacks_list is None or is_current_worker_chief():
    return callbacks_list

  # Some Callbacks should only run on the chief worker.
  return [
      callback for callback in callbacks_list if not callback._chief_worker_only
  ]  # pylint: disable=protected-access
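
To make the filtering step easier to see in isolation, here is a minimal, self-contained sketch. The DummyCallback class, its names, and the is_chief flag are assumptions for illustration only; they stand in for keras.callbacks.Callback objects and the is_current_worker_chief() check used above, so the logic can run without a multi-worker cluster.

# Standalone sketch of the chief-only filtering behaviour shown above.
# DummyCallback and is_chief are stand-ins (assumptions), not Keras APIs.
class DummyCallback(object):

  def __init__(self, name, chief_worker_only=False):
    self.name = name
    self._chief_worker_only = chief_worker_only


def filter_for_worker(callbacks_list, is_chief):
  """Return the callbacks this worker should run; the chief keeps them all."""
  callbacks_list = callbacks_list or []
  if is_chief:
    return callbacks_list
  return [c for c in callbacks_list if not c._chief_worker_only]


cbs = [DummyCallback('chief_only_saver', chief_worker_only=True),
       DummyCallback('per_worker_logger')]
print([c.name for c in filter_for_worker(cbs, is_chief=True)])
# -> ['chief_only_saver', 'per_worker_logger']
print([c.name for c in filter_for_worker(cbs, is_chief=False)])
# -> ['per_worker_logger']

The same in_multi_worker_mode() check also appears inside MultiWorkerTrainingState, where it serves as a defensive assertion:
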
  def _assert_in_multi_worker_mode(self):
    if not multi_worker_util.in_multi_worker_mode():
      raise ValueError(
          'MultiWorkerTrainingState is only supposed to be used '
          'in multi-worker training. This indicates some error '
          'that needs to be fixed. Please submit a bug issue to '
          'tf.keras team.')
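
As a rough usage sketch, such a guard is typically called at the top of the public methods so that misuse outside multi-worker training fails immediately. The class and methods below are assumptions for illustration, not the actual MultiWorkerTrainingState implementation, and a plain boolean replaces multi_worker_util for the purposes of the example.

# Hypothetical sketch: the assertion is invoked at the start of each public
# method; in_multi_worker_mode is a stand-in flag, not a real TF API.
class TrainingStateSketch(object):

  def __init__(self, in_multi_worker_mode):
    self._multi_worker = in_multi_worker_mode

  def _assert_in_multi_worker_mode(self):
    if not self._multi_worker:
      raise ValueError('Only supposed to be used in multi-worker training.')

  def back_up(self, epoch):
    self._assert_in_multi_worker_mode()
    print('backing up training state at epoch %d' % epoch)


TrainingStateSketch(in_multi_worker_mode=True).back_up(epoch=3)
try:
  TrainingStateSketch(in_multi_worker_mode=False).back_up(epoch=3)
except ValueError as e:
  print('guard tripped:', e)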