Example #1
  def __init__(self,
               container_strategy,
               tpu_cluster_resolver=None,
               steps_per_run=None,
               device_assignment=None):
    super(TPUExtended, self).__init__(container_strategy)

    if tpu_cluster_resolver is None:
      tpu_cluster_resolver = TPUClusterResolver("")

    if steps_per_run is None:
      # TODO(frankchn): Warn when we are being used by DS/Keras and this is
      # not specified.
      steps_per_run = 1

    self._tpu_function_cache = weakref.WeakKeyDictionary()
    self._tpu_cluster_resolver = tpu_cluster_resolver
    self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
    self._device_assignment = device_assignment

    # Device assignment is currently only supported for 1 core case.
    if self._device_assignment:
      assert isinstance(self._device_assignment,
                        device_assignment_lib.DeviceAssignment)
      if self._device_assignment.num_replicas != 1:
        raise ValueError("Device assignment is only supported for a single "
                         "core single replica case currently.")
      if self._device_assignment.num_cores_per_replica != 1:
        raise ValueError("Device assignment is only supported for a single "
                         "core single replica case currently.")
      if not all(self._device_assignment.core_assignment[0][0] == [0, 0, 0]):
        raise ValueError("Device assignment is only supported for a single "
                         "core single replica case currently.")

    # TODO(jhseu): Switch to DeviceAssignment to support pods and model
    # parallelism.
    self._tpu_devices = [d.name for d in self._tpu_metadata.devices
                         if "device:TPU:" in d.name]

    self._host_device = device_util.get_host_for_device(self._tpu_devices[0])

    # Only create variables for the number of replicas we're running.
    self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
    self._device_map = values.ReplicaDeviceMap(self._tpu_devices)

    # Preload the data onto the TPUs.
    input_worker_devices = collections.OrderedDict()
    for tpu_device in self._tpu_devices:
      host_device = device_util.get_host_for_device(tpu_device)
      input_worker_devices.setdefault(host_device, [])
      input_worker_devices[host_device].append(tpu_device)
    self._input_workers = input_lib.InputWorkers(
        self._device_map, tuple(input_worker_devices.items()))

    # TODO(sourabhbajaj): Remove this once performance of running one step
    # at a time is comparable to multiple steps.
    self.steps_per_run = steps_per_run
    self._require_static_shapes = True

    self.experimental_enable_get_next_as_optional = True
Example #2
    def __init__(self,
                 container_strategy,
                 tpu_cluster_resolver=None,
                 steps_per_run=None,
                 device_assignment=None):
        super(TPUExtended, self).__init__(container_strategy)

        if tpu_cluster_resolver is None:
            tpu_cluster_resolver = TPUClusterResolver("")

        if steps_per_run is None:
            # TODO(frankchn): Warn when we are being used by DS/Keras and this is
            # not specified.
            steps_per_run = 1

        self._tpu_function_cache = weakref.WeakKeyDictionary()
        self._tpu_cluster_resolver = tpu_cluster_resolver
        self._tpu_metadata = get_tpu_system_metadata(
            self._tpu_cluster_resolver)
        self._device_assignment = device_assignment

        self._tpu_devices = [
            d.name for d in self._tpu_metadata.devices
            if "device:TPU:" in d.name
        ]

        # Only create variables for the number of replicas we're running.
        if device_assignment is not None:
            job_name = device_spec.DeviceSpecV2.from_string(
                self._tpu_devices[0]).job

            self._tpu_devices = []
            for replica_id in range(device_assignment.num_replicas):
                tpu_device = device_assignment.tpu_device(replica=replica_id,
                                                          logical_core=0,
                                                          job=job_name)
                tpu_device = device_util.canonicalize(tpu_device)
                self._tpu_devices.append(tpu_device)

        self._host_device = device_util.get_host_for_device(
            self._tpu_devices[0])

        self._device_map = values.ReplicaDeviceMap(self._tpu_devices)

        # Preload the data onto the TPUs.
        input_worker_devices = collections.OrderedDict()
        for tpu_device in self._tpu_devices:
            host_device = device_util.get_host_for_device(tpu_device)
            input_worker_devices.setdefault(host_device, [])
            input_worker_devices[host_device].append(tpu_device)
        self._input_workers = input_lib.InputWorkers(
            self._device_map, tuple(input_worker_devices.items()))

        # TODO(sourabhbajaj): Remove this once performance of running one step
        # at a time is comparable to multiple steps.
        self.steps_per_run = steps_per_run
        self._require_static_shapes = True

        self.experimental_enable_get_next_as_optional = True
        self.experimental_enable_dynamic_batch_size = True
Example #3
 def _input_workers_with_options(self, options=None):
     host_device = device_util.get_host_for_device(self._worker_device)
     if not options or options.experimental_prefetch_to_device:
         return input_lib.InputWorkers([(host_device, self.worker_devices)])
     else:
         return input_lib.InputWorkers([(host_device, [
             device_util.get_host_for_device(worker)
             for worker in self.worker_devices
         ])])
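The two branches above choose where input data is prefetched: onto the worker devices themselves (the default) or onto their hosts. A hypothetical sketch of the resulting device tuples, using made-up device strings and TensorFlow's internal `device_util` module path:

from tensorflow.python.distribute import device_util  # internal module path

worker_devices = ["/job:worker/task:0/device:GPU:0",
                  "/job:worker/task:0/device:GPU:1"]
host = device_util.get_host_for_device(worker_devices[0])

# Prefetch to device (default): the input workers keep the devices themselves.
prefetch_to_device = [(host, worker_devices)]
# Prefetch to host: every device collapses to its host CPU.
prefetch_to_host = [(host, [device_util.get_host_for_device(d)
                            for d in worker_devices])]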
Example #4
  def __init__(self,
               container_strategy,
               tpu_cluster_resolver=None,
               steps_per_run=None,
               device_assignment=None):
    super(TPUExtended, self).__init__(container_strategy)

    if tpu_cluster_resolver is None:
      tpu_cluster_resolver = TPUClusterResolver("")

    if steps_per_run is None:
      # TODO(frankchn): Warn when we are being used by DS/Keras and this is
      # not specified.
      steps_per_run = 1

    self._tpu_cluster_resolver = tpu_cluster_resolver
    self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
    self._device_assignment = device_assignment

    # Device assignment is currently only supported for 1 core case.
    if self._device_assignment:
      assert isinstance(self._device_assignment,
                        device_assignment_lib.DeviceAssignment)
      if self._device_assignment.num_replicas != 1:
        raise ValueError("Device assignment is only supported for a single "
                         "core single replica case currently.")
      if self._device_assignment.num_cores_per_replica != 1:
        raise ValueError("Device assignment is only supported for a single "
                         "core single replica case currently.")
      if not all(self._device_assignment.core_assignment[0][0] == [0, 0, 0]):
        raise ValueError("Device assignment is only supported for a single "
                         "core single replica case currently.")

    # TODO(jhseu): Switch to DeviceAssignment to support pods and model
    # parallelism.
    self._tpu_devices = [d.name for d in self._tpu_metadata.devices
                         if "device:TPU:" in d.name]

    self._host_device = device_util.get_host_for_device(self._tpu_devices[0])

    # Only create variables for the number of replicas we're running.
    self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
    self._device_map = values.ReplicaDeviceMap(self._tpu_devices)

    # Preload the data onto the TPUs.
    input_worker_devices = collections.OrderedDict()
    for tpu_device in self._tpu_devices:
      host_device = device_util.get_host_for_device(tpu_device)
      input_worker_devices.setdefault(host_device, [])
      input_worker_devices[host_device].append(tpu_device)
    self._input_workers = input_lib.InputWorkers(
        self._device_map, tuple(input_worker_devices.items()))

    # TODO(sourabhbajaj): Remove this once performance of running one step
    # at a time is comparable to multiple steps.
    self.steps_per_run = steps_per_run
    self._require_static_shapes = True
Example #5
  def testTPU(self, input_type, api_type, iteration_type, distribution,
              enable_get_next_as_optional):
    worker_device_pairs = collections.OrderedDict()
    for tpu_device in distribution.extended._tpu_devices:
      host_device = device_util.get_host_for_device(tpu_device)
      worker_device_pairs.setdefault(host_device, [])
      worker_device_pairs[host_device].append(tpu_device)
    worker_device_pairs = worker_device_pairs.items()
    if tf2.enabled():
      dataset_fn = lambda _: dataset_ops.DatasetV2.range(10)
    else:
      dataset_fn = lambda _: dataset_ops.Dataset.range(10)
    dataset_or_input_fn = self._create_dataset_or_input_fn(
        input_type, dataset_fn)

    expected_values = [[i, i + 1] for i in range(0, 10, 2)]

    distribution.extended.experimental_enable_get_next_as_optional = (
        enable_get_next_as_optional)
    self._test_input_iteration(
        input_type,
        api_type,
        iteration_type,
        dataset_or_input_fn,
        worker_device_pairs,
        expected_values,
        distribution)
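The `expected_values` list above encodes how two synchronized replicas split `range(10)` (this sketch assumes a two-core TPU configuration, which is what the pairing reflects): each step yields one element per replica.

expected_values = [[i, i + 1] for i in range(0, 10, 2)]
print(expected_values)  # [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]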
Example #6
 def _set_prefetch_on_host(self, value):
   if self._prefetch_on_host == value:
     return
   if self._input_workers_obj is not None:
     raise RuntimeError("Unable to change prefetch on host behavior as "
                        "InputWorkers are already created.")
   self._prefetch_on_host = value
   if value:
     # To prefetch on the host, we must set all the input worker devices to the
     # corresponding host devices.
     self._input_worker_devices = tuple([
         tuple([host,
                [device_util.get_host_for_device(d) for d in devices]])
         for host, devices in self._input_worker_devices])
     # Force creation of the workers.
     workers = self._input_workers
     del workers
Example #7
  def testTPU(self, input_type, api_type, iteration_type, distribution,
              enable_get_next_as_optional):
    worker_device_pairs = collections.OrderedDict()
    for tpu_device in distribution.extended._tpu_devices:
      host_device = device_util.get_host_for_device(tpu_device)
      worker_device_pairs.setdefault(host_device, [])
      worker_device_pairs[host_device].append(tpu_device)
    worker_device_pairs = worker_device_pairs.items()
    if tf2.enabled():
      dataset_fn = lambda _: dataset_ops.DatasetV2.range(10)
    else:
      dataset_fn = lambda _: dataset_ops.Dataset.range(10)

    expected_values = [[i, i + 1] for i in range(0, 10, 2)]

    self._test_input_iteration(
        input_type,
        api_type,
        iteration_type,
        dataset_fn,
        worker_device_pairs,
        expected_values,
        distribution,
        enable_get_next_as_optional=enable_get_next_as_optional)
Example #8
    def __init__(self,
                 container_strategy,
                 tpu_cluster_resolver=None,
                 steps_per_run=None,
                 device_assignment=None):
        super(TPUExtended, self).__init__(container_strategy)

        if tpu_cluster_resolver is None:
            tpu_cluster_resolver = TPUClusterResolver("")

        if steps_per_run is None:
            # TODO(frankchn): Warn when we are being used by DS/Keras and this is
            # not specified.
            steps_per_run = 1

        self._tpu_function_cache = weakref.WeakKeyDictionary()
        self._tpu_cluster_resolver = tpu_cluster_resolver
        self._tpu_metadata = get_tpu_system_metadata(
            self._tpu_cluster_resolver)
        self._device_assignment = device_assignment

        tpu_devices_flat = [
            d.name for d in self._tpu_metadata.devices
            if "device:TPU:" in d.name
        ]

        # `self._tpu_devices` is a two-dimensional NumPy array of strings. It is
        # indexed using `[replica_id][logical_device_id]`.
        if device_assignment is None:
            self._tpu_devices = np.array([[d] for d in tpu_devices_flat],
                                         dtype=object)
        else:
            job_name = device_spec.DeviceSpecV2.from_string(
                tpu_devices_flat[0]).job

            tpu_devices = []
            for replica_id in range(device_assignment.num_replicas):
                replica_devices = []

                for logical_core in range(
                        device_assignment.num_cores_per_replica):
                    replica_devices.append(
                        device_util.canonicalize(
                            device_assignment.tpu_device(
                                replica=replica_id,
                                logical_core=logical_core,
                                job=job_name)))

                tpu_devices.append(replica_devices)
            self._tpu_devices = np.array(tpu_devices, dtype=object)

        self._host_device = device_util.get_host_for_device(
            self._tpu_devices[0][0])

        # Preload the data onto the TPUs. Currently we always preload onto logical
        # device 0 for each replica.
        # TODO(cjfj): Create `InputWorkers` lazily, allowing users to place the
        # input onto a different logical device?
        input_worker_devices = collections.OrderedDict()
        for tpu_device in self._tpu_devices[:, 0]:
            host_device = device_util.get_host_for_device(tpu_device)
            input_worker_devices.setdefault(host_device, [])
            input_worker_devices[host_device].append(tpu_device)
        self._input_worker_devices = tuple(input_worker_devices.items())
        self._input_workers_obj = None

        # TODO(sourabhbajaj): Remove this once performance of running one step
        # at a time is comparable to multiple steps.
        self.steps_per_run = steps_per_run
        self._require_static_shapes = True

        # TPUStrategy handles the graph replication in TF-XLA bridge, so we don't
        # need to retrace functions for each device.
        self._retrace_functions_for_each_device = False

        self.experimental_enable_get_next_as_optional = True
        self.experimental_enable_dynamic_batch_size = True
        self._prefetch_on_host = False

        self._logical_device_stack = [0]
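The comment above describes `self._tpu_devices` as a two-dimensional array indexed by `[replica_id][logical_device_id]`. A small self-contained illustration with made-up device names, assuming two replicas of one logical core each:

import numpy as np

tpu_devices = np.array(
    [["/job:worker/task:0/device:TPU:0"],
     ["/job:worker/task:0/device:TPU:1"]], dtype=object)
# The row selects the replica; the column selects the logical core within it.
assert tpu_devices[1][0] == "/job:worker/task:0/device:TPU:1"
# The column slice used for input placement: one device per replica.
assert list(tpu_devices[:, 0]) == ["/job:worker/task:0/device:TPU:0",
                                   "/job:worker/task:0/device:TPU:1"]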
Example #9
 def __init__(self, container_strategy, device):
     super(OneDeviceExtended, self).__init__(container_strategy)
     self._device = device_util.resolve(device)
     self._input_device = device_util.get_host_for_device(self._device)
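Every example on this page leans on the same mapping: `get_host_for_device` keeps the job, replica, and task of a device string and swaps the device itself for the host's `CPU:0`, which is where input pipelines run. A minimal sketch (the internal module path and the device string are illustrative, not a public API guarantee):

from tensorflow.python.distribute import device_util  # internal module path

tpu_device = "/job:worker/replica:0/task:0/device:TPU:3"
print(device_util.get_host_for_device(tpu_device))
# /job:worker/replica:0/task:0/device:CPU:0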
Example #10
    def _initialize_multi_worker(self, cluster_resolver):
        """Initializes the object for multi-worker training."""
        cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_resolver.cluster_spec())
        task_type = cluster_resolver.task_type
        task_id = cluster_resolver.task_id
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`.")
        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id

        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError(
                "No `worker`, `chief` or `evaluator` tasks can be found "
                "in `cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)

        if (ops.executing_eagerly_outside_functions() and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            context.context().configure_collective_ops(
                collective_leader=multi_worker_util.collective_leader(
                    cluster_spec, task_type, task_id),
                scoped_allocator_enabled_ops=("CollectiveReduce", ),
                device_filters=("/job:%s/task:%d" % (task_type, task_id), ))
            self._collective_ops_configured = True

        # Starting a std server in eager mode and in independent worker mode.
        if (context.executing_eagerly()
                and not getattr(self, "_std_server_started", False) and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            # Checking _local_or_standalone_client_mode as well because we should not
            # create the std server in standalone client mode.
            config_proto = config_pb2.ConfigProto()
            config_proto = self._update_config_proto(config_proto)

            if hasattr(cluster_resolver, "port"):
                port = cluster_resolver.port
            else:
                port = 0
            server_def = tensorflow_server_pb2.ServerDef(
                cluster=cluster_spec.as_cluster_def(),
                default_session_config=config_proto,
                job_name=task_type,
                task_index=task_id,
                protocol=cluster_resolver.rpc_layer or "grpc",
                port=port)
            context.context().enable_collective_ops(server_def)
            self._std_server_started = True
            # The `ensure_initialized` is needed before calling
            # `context.context().devices()`.
            context.context().ensure_initialized()
            logging.info(
                "Enabled multi-worker collective ops with available devices: %r",
                context.context().devices())

        # TODO(yuefengz): The `num_gpus` is only for this particular task. It
        # assumes all workers have the same number of GPUs. We should remove this
        # assumption by querying all tasks for their numbers of GPUs.
        # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
        # some cases.
        if isinstance(cluster_resolver, TFConfigClusterResolver):
            num_gpus = context.num_gpus()
        else:
            num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

        if num_gpus:
            local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                                  for i in range(num_gpus))
        else:
            local_devices = (self._worker_device, )

        self._collective_keys = cross_device_utils.CollectiveKeys()
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            num_workers=self._num_workers,
            num_gpus_per_worker=num_gpus,
            collective_keys=self._collective_keys,
            communication=self._communication)
        super(CollectiveAllReduceExtended,
              self)._initialize_single_worker(local_devices)
        host_device = device_util.get_host_for_device(self._worker_device)
        self._input_workers = input_lib.InputWorkers([(host_device,
                                                       self.worker_devices)])

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        # Save the num_gpus_per_worker and rpc_layer for configure method.
        self._num_gpus_per_worker = num_gpus
        self._rpc_layer = cluster_resolver.rpc_layer
        self._warn_nccl_no_gpu()

        logging.info(
            "MultiWorkerMirroredStrategy with cluster_spec = %r, task_type = %r, "
            "task_id = %r, num_workers = %r, local_devices = %r, "
            "communication = %s", cluster_spec.as_dict(), task_type, task_id,
            self._num_workers, local_devices, self._communication)
Example #11
    def __init__(self,
                 container_strategy,
                 tpu_cluster_resolver=None,
                 steps_per_run=None,
                 device_assignment=None):
        super(TPUExtended, self).__init__(container_strategy)

        if tpu_cluster_resolver is None:
            tpu_cluster_resolver = TPUClusterResolver("")

        if steps_per_run is None:
            # TODO(frankchn): Warn when we are being used by DS/Keras and this is
            # not specified.
            steps_per_run = 1

        # `self._tpu_function_cache` is a dict of `tf.function`s, thus if a
        # `tf.function` is passed into `strategy.run` in eager mode, the
        # `tf.function` won't get retraced.
        self._tpu_function_cache = weakref.WeakKeyDictionary()

        self._tpu_cluster_resolver = tpu_cluster_resolver
        self._tpu_metadata = self._tpu_cluster_resolver.get_tpu_system_metadata(
        )
        self._device_assignment = device_assignment

        tpu_devices_flat = [
            d.name for d in self._tpu_metadata.devices
            if "device:TPU:" in d.name
        ]

        # `self._tpu_devices` is a two-dimensional NumPy array of strings. It is
        # indexed using `[replica_id][logical_device_id]`.
        if device_assignment is None:
            self._tpu_devices = np.array([[d] for d in tpu_devices_flat],
                                         dtype=object)
        else:
            job_name = device_spec.DeviceSpecV2.from_string(
                tpu_devices_flat[0]).job

            tpu_devices = []
            for replica_id in range(device_assignment.num_replicas):
                replica_devices = []

                for logical_core in range(
                        device_assignment.num_cores_per_replica):
                    replica_devices.append(
                        device_util.canonicalize(
                            device_assignment.tpu_device(
                                replica=replica_id,
                                logical_core=logical_core,
                                job=job_name)))

                tpu_devices.append(replica_devices)
            self._tpu_devices = np.array(tpu_devices, dtype=object)

        self._host_device = device_util.get_host_for_device(
            self._tpu_devices[0][0])

        # Preload the data onto the TPUs. Currently we always preload onto logical
        # device 0 for each replica.
        # TODO(cjfj): Create `InputWorkers` lazily, allowing users to place the
        # input onto a different logical device?
        self._device_input_worker_devices = collections.OrderedDict()
        self._host_input_worker_devices = collections.OrderedDict()
        for tpu_device in self._tpu_devices[:, 0]:
            host_device = device_util.get_host_for_device(tpu_device)
            self._device_input_worker_devices.setdefault(host_device, [])
            self._device_input_worker_devices[host_device].append(tpu_device)
            self._host_input_worker_devices.setdefault(host_device, [])
            self._host_input_worker_devices[host_device].append(host_device)

        # TODO(sourabhbajaj): Remove this once performance of running one step
        # at a time is comparable to multiple steps.
        self.steps_per_run = steps_per_run
        self._require_static_shapes = True

        self.experimental_enable_get_next_as_optional = True

        self._logical_device_stack = [0]

        if context.executing_eagerly():
            # In async remote eager, we want to sync the executors before exiting the
            # program.
            def async_wait():
                if context.context()._context_handle is not None:  # pylint: disable=protected-access
                    context.async_wait()

            atexit.register(async_wait)

        # Flag to turn on VariablePolicy
        self._use_var_policy = False
Example #12
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution.
  """
  if cluster_resolver is None:
    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    logging.warning("TPU system %s has already been initialized. "
                    "Reinitializing the TPU can cause previously created "
                    "variables on TPU to be lost.")

  logging.info("Initializing the TPU system.")

  if context.executing_eagerly():
    # This function is written the way it is for the following non-intuitive
    # reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    # The easiest way to trigger a rewrite is to run the function with
    # TPUPartitionedCallOp.
    @function.defun
    def _tpu_init_fn():
      return tpu.initialize_system()

    # We can't call _tpu_init_fn normally (because it contains just a dummy op,
    # see above) but need to define it to get it added to eager context
    # and get its assigned name.
    # pylint: disable=protected-access
    graph_func = _tpu_init_fn._get_concrete_function_internal()
    func_name = compat.as_str(graph_func._inference_function.name)
    # pylint: enable=protected-access

    tpu_devices = sorted(
        [x for x in context.list_devices() if "device:TPU:" in x])

    if not tpu_devices:
      raise RuntimeError("Could not find any TPU devices")

    with ops.device(device_util.get_host_for_device(tpu_devices[0])):
      output = tpu_functional_ops.TPUPartitionedCall(
          args=[], device_ordinal=0, Tout=[dtypes.string], f=func_name)
    serialized_topology = output[0].numpy()
  else:
    master = cluster_resolver.master()
    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())

  logging.info("Finished initializing TPU system.")
  tpu_topology = topology.Topology(serialized=serialized_topology)
  _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

  return tpu_topology
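A hedged usage sketch for the function above (the TPU name is a placeholder; in a managed environment the resolver typically discovers the address itself):

resolver = TPUClusterResolver(tpu="my-tpu")  # placeholder TPU name
topology = initialize_tpu_system(resolver)
print(topology.num_tasks, topology.num_tpus_per_task)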