Example #1
  def _reduce_to(self, reduce_op, value, destinations):
    if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
      if reduce_op == reduce_util.ReduceOp.MEAN:
        # TODO(jhseu):  Revisit once we support model-parallelism.
        value *= (1. / self._num_replicas_in_sync)
      elif reduce_op != reduce_util.ReduceOp.SUM:
        raise NotImplementedError(
            "Currently only support sum & mean in TPUStrategy.")
      return tpu_ops.cross_replica_sum(value)

    if not isinstance(value, values.DistributedValues):
      # This function handles reducing values that are not PerReplica or
      # Mirrored values. For example, the same value could be present on all
      # replicas in which case `value` would be a single value or value could
      # be 0.
      return cross_device_ops_lib.reduce_non_distributed_value(
          reduce_op, self._device_map, value, destinations)

    # Validate that the destination is the same as the host device.
    # Note we don't do this when in a replica context, as the reduction is
    # performed on the TPU device itself.
    devices = cross_device_ops_lib.get_devices_from(destinations)
    if len(devices) == 1:
      assert device_util.canonicalize(devices[0]) == device_util.canonicalize(
          self._host_device)
    else:
      raise ValueError("Multiple devices are not supported for TPUStrategy")

    output = math_ops.add_n(value)
    if reduce_op == reduce_util.ReduceOp.MEAN:
      return output * (1. / len(value))
    return output
Example #2
  def __init__(self, device_map, worker_device_pairs=None, logical_device=0):
    """Initialize an `InputWorkers` object.

    Args:
      device_map: A `DeviceMap` with the computation devices fed by the
        input workers.
      worker_device_pairs: A sequence of pairs:
        `(input device, a tuple of compute devices fed by that input device)`.
      logical_device: The logical device of `device_map` to feed.
    """
    self._device_map = device_map
    self._logical_device = logical_device
    if worker_device_pairs is None:
      worker_device_pairs = ((
          device_util.canonicalize("/device:CPU:0"),
          device_map.logical_to_actual_devices(logical_device)),)
    self._input_worker_devices = tuple(d for d, _ in worker_device_pairs)
    self._fed_devices = tuple(tuple(device_util.canonicalize(d) for d in f)
                              for _, f in worker_device_pairs)
    flattened = tuple(d for l in self._fed_devices for d in l)
    assert (flattened ==
            device_map.logical_to_actual_devices(logical_device)), (
                "flattened: %s logical device %d: %s" %
                (flattened, logical_device,
                 device_map.logical_to_actual_devices(logical_device)))
Example #3
 def testCanonicalizeWithDefaultDevice(self):
   self.assertEqual(
       device_util.canonicalize("/job:worker/task:1/cpu:0", default="/gpu:0"),
       "/job:worker/replica:0/task:1/device:CPU:0")
   self.assertEqual(
       device_util.canonicalize("/job:worker/task:1", default="/gpu:0"),
       "/job:worker/replica:0/task:1/device:GPU:0")
   self.assertEqual(
       device_util.canonicalize("/cpu:0", default="/job:worker"),
       "/job:worker/replica:0/task:0/device:CPU:0")
Example #4
  def _initialize_local(self, num_gpus_per_worker):
    """Initialize internal devices for local training."""
    self._worker_device = device_util.canonicalize("/device:CPU:0")
    # Define compute devices, which is a list of device strings, one for each
    # replica. When there are GPUs, replicate operations on these GPUs.
    # Otherwise, place operations on CPU.
    if num_gpus_per_worker > 0:
      self._compute_devices = list(
          map("/device:GPU:{}".format, range(num_gpus_per_worker)))
    else:
      self._compute_devices = [_LOCAL_CPU]

    self._compute_devices = list(
        map(device_util.resolve, self._compute_devices))
    self._canonical_compute_device_set = set(self._compute_devices)

    # If there is only one GPU, put everything on that GPU. Otherwise, place
    # variables on CPU.
    if num_gpus_per_worker == 1:
      assert len(list(self._compute_devices)) == 1
      self._variable_device = _LOCAL_GPU_0
      self._parameter_devices = [_LOCAL_GPU_0]
    else:
      self._variable_device = _LOCAL_CPU
      self._parameter_devices = [_LOCAL_CPU]

    self._is_chief = True
    self._cluster_spec = None
    self._task_type = None
    self._task_id = None

    logging.info(
        "ParameterServerStrategy with compute_devices = %r, "
        "variable_device = %r", self._compute_devices, self._variable_device)
Example #5
  def _initialize_local_worker(self, num_gpus_per_worker):
    """Initializes the object for local training."""
    self._is_chief = True
    self._num_workers = 1

    if num_gpus_per_worker:
      local_devices = [
          "/device:GPU:%d" % i for i in range(num_gpus_per_worker)
      ]
    else:
      local_devices = ["/device:CPU:0"]
    self._worker_device = device_util.canonicalize("/device:CPU:0")

    self._collective_keys = cross_device_utils.CollectiveKeys()
    self._initialize_local(local_devices)
    self._cross_tower_ops = cross_device_ops_lib.CollectiveAllReduce(
        num_workers=self._num_workers,
        num_gpus_per_worker=num_gpus_per_worker,
        collective_keys=self._collective_keys)

    self._cluster_spec = None
    self._task_type = None
    self._task_id = None

    logging.info("CollectiveAllReduceStrategy with local_devices = %r",
                 local_devices)
Example #6
  def _initialize_local_worker(self, num_gpus_per_worker):
    """Initializes the object for local training."""
    self._is_chief = True
    self._num_workers = 1

    if num_gpus_per_worker:
      local_devices = tuple(
          "/device:GPU:%d" % i for i in range(num_gpus_per_worker)
      )
    else:
      local_devices = ("/device:CPU:0",)
    self._worker_device = device_util.canonicalize("/device:CPU:0")
    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

    self._collective_keys = cross_device_utils.CollectiveKeys()
    self._initialize_local(local_devices)
    # TODO(yuefengz): remove num_gpus_per_worker from CollectiveAllReduce.
    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
        num_workers=self._num_workers,
        num_gpus_per_worker=num_gpus_per_worker,
        collective_keys=self._collective_keys)

    self._cluster_spec = None
    self._task_type = None
    self._task_id = None

    logging.info("CollectiveAllReduceStrategy with local_devices = %r",
                 local_devices)
Example #7
  def _initialize_local(self, cluster_resolver):
    """Initialize internal devices for local training."""
    worker_device = device_util.canonicalize("/device:CPU:0")
    self._input_host_device = numpy_dataset.SingleDevice(worker_device)
    num_gpus = cluster_resolver.num_accelerators()
    # Define compute devices, which is a list of device strings, one for each
    # replica. When there are GPUs, replicate operations on these GPUs.
    # Otherwise, place operations on CPU.
    if num_gpus > 0:
      compute_devices = tuple(map("/device:GPU:{}".format, range(num_gpus)))
    else:
      compute_devices = (_LOCAL_CPU,)

    self._device_map = values.ReplicaDeviceMap(compute_devices)
    self._input_workers = input_lib.InputWorkers(
        self._device_map, [(worker_device, compute_devices)])

    # If there is only one GPU, put everything on that GPU. Otherwise, place
    # variables on CPU.
    if num_gpus == 1:
      assert len(compute_devices) == 1
      self._variable_device = _LOCAL_GPU_0
      self._parameter_devices = (_LOCAL_GPU_0,)
    else:
      self._variable_device = _LOCAL_CPU
      self._parameter_devices = (_LOCAL_CPU,)

    self._is_chief = True
    self._cluster_spec = None
    self._task_type = None
    self._task_id = None

    logging.info(
        "ParameterServerStrategy with compute_devices = %r, "
        "variable_device = %r", compute_devices, self._variable_device)
Example #8
 def __init__(self, container_strategy, device):
   super(OneDeviceExtended, self).__init__(container_strategy)
   self._device = device
   self._input_device = device_util.canonicalize("/device:CPU:0")
   worker_device_pairs = [(self._input_device, [self._device])]
   device_map = values.SingleDeviceMap(device)
   self._input_workers = input_lib.InputWorkers(
       device_map, worker_device_pairs)
Example #9
 def _device_scope(self):
     if (self._packed_handle is None
             or values_util.is_saving_non_distributed()
             or tpu_util.enclosing_tpu_context() is not None):
         return ops.NullContextmanager()
     device = device_util.canonicalize(device_util.current())
     if device in self._device_to_handle:
         return ops.NullContextmanager()
     return ops.device(self._primary_handle.device)
Example #10
    def _make_dataset_iterator(self, dataset):
        if self._local_mode:
            worker = device_util.canonicalize("/device:CPU:0")
            worker_device_pairs = [(worker, self._devices)]
        else:
            worker_device_pairs = self._worker_devices

        return values.DatasetIterator(dataset, worker_device_pairs,
                                      self._num_replicas_in_sync)
Example #11
 def __init__(self, container_strategy, device):
   super(OneDeviceExtended, self).__init__(container_strategy)
   self._device = device
   self._default_device = device
   self._input_device = device_util.canonicalize("/device:CPU:0")
   worker_device_pairs = [(self._input_device, [self._device])]
   device_map = values.SingleDeviceMap(device)
   self._input_workers = input_lib.InputWorkers(
       device_map, worker_device_pairs)
Example #12
  def __init__(self,
               container_strategy,
               tpu_cluster_resolver=None,
               steps_per_run=None,
               device_assignment=None):
    super(TPUExtended, self).__init__(container_strategy)

    if tpu_cluster_resolver is None:
      tpu_cluster_resolver = TPUClusterResolver("")

    if steps_per_run is None:
      # TODO(frankchn): Warn when we are being used by DS/Keras and this is
      # not specified.
      steps_per_run = 1

    self._tpu_function_cache = weakref.WeakKeyDictionary()
    self._tpu_cluster_resolver = tpu_cluster_resolver
    self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
    self._device_assignment = device_assignment

    self._tpu_devices = [d.name for d in self._tpu_metadata.devices
                         if "device:TPU:" in d.name]

    # Only create variables for the number of replicas we're running.
    if device_assignment is not None:
      job_name = device_spec.DeviceSpecV2.from_string(self._tpu_devices[0]).job

      self._tpu_devices = []
      for replica_id in range(device_assignment.num_replicas):
        tpu_device = device_assignment.tpu_device(
            replica=replica_id, logical_core=0, job=job_name)
        tpu_device = device_util.canonicalize(tpu_device)
        self._tpu_devices.append(tpu_device)

    self._host_device = device_util.get_host_for_device(self._tpu_devices[0])

    # Preload the data onto the TPUs.
    input_worker_devices = collections.OrderedDict()
    for tpu_device in self._tpu_devices:
      host_device = device_util.get_host_for_device(tpu_device)
      input_worker_devices.setdefault(host_device, [])
      input_worker_devices[host_device].append(tpu_device)
    self._input_worker_devices = tuple(input_worker_devices.items())
    self._input_workers_obj = None

    # TODO(sourabhbajaj): Remove this once performance of running one step
    # at a time is comparable to multiple steps.
    self.steps_per_run = steps_per_run
    self._require_static_shapes = True

    # TPUStrategy handles the graph replication in TF-XLA bridge, so we don't
    # need to retrace functions for each device.
    self._retrace_functions_for_each_device = False

    self.experimental_enable_get_next_as_optional = True
    self.experimental_enable_dynamic_batch_size = True
    self._prefetch_on_host = False
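
The per-host bucketing above is the piece that decides which host feeds which TPU devices; a small standalone sketch of the same pattern, with `get_host_for_device` standing in for `device_util.get_host_for_device`:

import collections

def group_devices_by_host(tpu_devices, get_host_for_device):
  # Bucket each TPU device under its host, preserving first-seen host order,
  # exactly as the OrderedDict loop above does.
  buckets = collections.OrderedDict()
  for d in tpu_devices:
    buckets.setdefault(get_host_for_device(d), []).append(d)
  return tuple(buckets.items())
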
Example #13
 def _initialize_single_worker(self, devices):
     """Initializes the object for single-worker training."""
     self._devices = tuple(device_util.canonicalize(d) for d in devices)
     self._input_workers = input_lib.InputWorkers(
         ((device_util.canonicalize("/device:CPU:0",
                                    devices[0]), devices), ))
     self._inferred_cross_device_ops = None if self._cross_device_ops else (
         cross_device_ops_lib.choose_the_best(devices))
     self._host_input_device = numpy_dataset.SingleDevice(
         self._input_workers.worker_devices[0])
     self._is_multi_worker_training = False
     logging.info("Using MirroredStrategy with devices %r", devices)
     device_spec = tf_device.DeviceSpec.from_string(
         self._input_workers.worker_devices[0])
     # Ensures when we enter strategy.scope() we use the correct default device
     if device_spec.job is not None and device_spec.job != "localhost":
         self._default_device = "/job:%s/replica:%d/task:%d" % (
             device_spec.job, device_spec.replica, device_spec.task)
Example #14
 def _make_input_fn_iterator(
     self,
     input_fn,
     replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
   worker = device_util.canonicalize("/device:CPU:0")
   worker_device_pairs = [(worker, [self._device])]
   return values.InputFunctionIterator(
       input_fn, worker_device_pairs,
       [distribute_lib.InputContext()])
Example #15
  def _make_dataset_iterator(self, dataset):
    if self._local_mode:
      worker = device_util.canonicalize("/device:CPU:0")
      worker_device_pairs = [(worker, self._devices)]
    else:
      worker_device_pairs = self._worker_devices

    return values.DatasetIterator(dataset, worker_device_pairs,
                                  self._num_replicas_in_sync)
Example #16
  def _initialize_multi_worker(self, num_gpus, cluster_spec):
    """Initializes the object for multi-worker training."""
    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
    self._cluster_spec = cluster_spec

    self._workers = []
    for job in ["chief", "worker"]:
      for task in range(len(cluster_spec.as_dict().get(job, []))):
        self._workers.append("/job:%s/task:%d" % (job, task))

    if num_gpus is None:
      raise ValueError("`num_gpus` is required if `cluster_spec` is given.")
    if num_gpus > 0:
      self._worker_devices = [
          (worker, [
              device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
              for gpu in range(num_gpus)
          ]) for worker in self._workers
      ]
    else:
      self._worker_devices = [
          (worker, [device_util.canonicalize(worker, "/device:CPU:0")])
          for worker in self._workers
      ]

    devices = nest.flatten([l for _, l in self._worker_devices])

    # Setting `_default_device` will add a device scope in the
    # distribution.scope. We set the default device to the first worker. When
    # users specify device under distribution.scope by
    #   with tf.device("/cpu:0"):
    #     ...
    # their ops will end up on the CPU device of the first worker, e.g.
    # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
    self._default_device = self._workers[0]

    assert devices, "Must specify at least one device."
    assert len(set(devices)) == len(devices), (
        "No duplicates allowed in `devices` argument.")
    # TODO(josh11b): Require at least 2 devices?
    self._devices = [device_util.resolve(d) for d in devices]
    self._canonical_device_set = set(self._devices)
    self._device_index = values.PerReplica(
        {d: i for i, d in enumerate(devices)})
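
The worker-device expansion above reduces to prefixing each `/job:.../task:N` worker string onto a local device suffix and canonicalizing the result; for example, reusing the `device_util` import from these snippets (the value is consistent with the canonicalization test earlier on this page):

worker = "/job:worker/task:0"
expanded = device_util.canonicalize(worker + "/device:GPU:1")
# expanded == "/job:worker/replica:0/task:0/device:GPU:1"
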
Example #17
def choose_the_best(devices, session_config=None):
  """Find the best CrossDeviceOps locally given a `tf.compat.v1.ConfigProto`.

  Args:
    devices: a list of devices passed to `tf.distribute.Strategy`.
    session_config: a `tf.compat.v1.ConfigProto` or `None`. If `None`, it will
      make decision based on all logical devices.

  Returns:
    A subclass of `CrossDeviceOps`.
  """
  requested_devices = set(device_util.canonicalize(d) for d in devices)
  if ops.executing_eagerly_outside_functions():
    logical_gpus = context.context().list_logical_devices(device_type="GPU")
    physical_gpus = context.context().list_physical_devices(device_type="GPU")
    if len(logical_gpus) != len(physical_gpus):
      logging.warning("NCCL is not supported when using virtual GPUs, falling"
                      "back to reduction to one device")
      return ReductionToOneDevice()

    machine_devices = context.context().list_logical_devices()
  else:
    machine_devices = device_lib.list_local_devices(
        session_config=session_config)
  using_devices = set()
  for d in machine_devices:
    if device_util.canonicalize(d.name) in requested_devices:
      using_devices.add(d.name)

  if len(using_devices) != len(requested_devices):
    logging.warning(
        "Some requested devices in `tf.distribute.Strategy` are not visible "
        "to TensorFlow: %s", ",".join(list(requested_devices - using_devices)))

  if any("gpu" not in d.lower() for d in requested_devices):
    logging.warning("There are non-GPU devices in `tf.distribute.Strategy`, "
                    "not using nccl allreduce.")
    return ReductionToOneDevice()

  if kernels.get_registered_kernels_for_op("NcclAllReduce"):
    return NcclAllReduce(num_packs=1)
  else:
    logging.warning("Nccl kernel is not found, not using nccl allreduce.")
    return ReductionToOneDevice()
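
A hedged usage sketch for the helper above; the device list is whatever was passed to the strategy, and the names here are purely illustrative:

# With two visible local GPUs and a registered NcclAllReduce kernel this
# returns NcclAllReduce(num_packs=1); otherwise it falls back to
# ReductionToOneDevice, per the branches above.
cross_ops = choose_the_best(["/gpu:0", "/gpu:1"])
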
Example #18
  def _initialize_local(self,
                        compute_devices,
                        parameter_device,
                        cluster_resolver=None):
    """Initialize local devices for training."""
    worker_device = device_util.canonicalize("/device:CPU:0")
    self._input_host_device = numpy_dataset.SingleDevice(worker_device)

    if compute_devices is None:
      if not cluster_resolver:
        num_gpus = context.num_gpus()
      else:
        num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
      # Save the num_gpus_per_worker for configure method which is used by the
      # contrib version.
      self._num_gpus_per_worker = num_gpus

      compute_devices = device_util.local_devices_from_num_gpus(num_gpus)

    compute_devices = [device_util.canonicalize(d) for d in compute_devices]

    if parameter_device is None:
      # If there is only one GPU, put everything on that GPU. Otherwise, place
      # variables on CPU.
      if len(compute_devices) == 1:
        parameter_device = compute_devices[0]
      else:
        parameter_device = _LOCAL_CPU

    self._input_workers = input_lib.InputWorkers(
        [(worker_device, compute_devices)])

    self._variable_device = parameter_device
    self._compute_devices = compute_devices
    self._parameter_devices = (parameter_device,)
    self._is_chief = True
    self._cluster_spec = None
    self._task_type = None
    self._task_id = None

    logging.info(
        "ParameterServerStrategy (CentralStorageStrategy if you are using a "
        "single machine) with compute_devices = %r, variable_device = %r",
        compute_devices, self._variable_device)
Example #19
    def _initialize_multi_worker(self, num_gpus, cluster_spec):
        """Initializes the object for multi-worker training."""
        cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
        self._cluster_spec = cluster_spec

        self._workers = []
        for job in ["chief", "worker"]:
            for task in range(len(cluster_spec.as_dict().get(job, []))):
                self._workers.append("/job:%s/task:%d" % (job, task))

        if num_gpus is None:
            raise ValueError(
                "`num_gpus` is required if `cluster_spec` is given.")
        if num_gpus > 0:
            self._worker_devices = [(worker, [
                device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
                for gpu in range(num_gpus)
            ]) for worker in self._workers]
        else:
            self._worker_devices = [
                (worker, [device_util.canonicalize(worker, "/device:CPU:0")])
                for worker in self._workers
            ]

        devices = nest.flatten([l for _, l in self._worker_devices])

        # Setting `_default_device` will add a device scope in the
        # distribution.scope. We set the default device to the first worker. When
        # users specify device under distribution.scope by
        #   with tf.device("/cpu:0"):
        #     ...
        # their ops will end up on the CPU device of the first worker, e.g.
        # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
        self._default_device = self._workers[0]

        assert devices, "Must specify at least one device."
        assert len(set(devices)) == len(devices), (
            "No duplicates allowed in `devices` argument.")
        # TODO(josh11b): Require at least 2 devices?
        self._devices = [device_util.resolve(d) for d in devices]
        self._canonical_device_set = set(self._devices)
        self._device_index = values.PerReplica(
            {d: i
             for i, d in enumerate(devices)})
Example #20
 def _is_per_replica(self, result, expected, klass=values.PerReplica):
   self.assertIsInstance(result, klass)
   # We canonicalize the devices to match the device strings returned
   # by PerReplica, which also does device string canonicalization.
   devices = [device_util.canonicalize(_device_str(i))
              for i in range(len(expected))]
   self.assertEqual(set(devices), set(result.devices))
   for i, d in enumerate(devices):
     self.assertEqual(expected[i], result.get(d))
     self.assertEqual(expected[i], result.get(_device_str(i)))
Example #21
 def _is_per_replica(self, result, expected, klass=values.PerReplica):
   self.assertIsInstance(result, klass)
   # We canonicalize the devices to match the device strings returned
   # by PerReplica, which also does device string canonicalization.
   devices = [device_util.canonicalize(_device_str(i))
              for i in range(len(expected))]
   self.assertEqual(set(devices), set(result.devices))
   for i, d in enumerate(devices):
     self.assertEqual(expected[i], result.get(d))
     self.assertEqual(expected[i], result.get(_device_str(i)))
Example #22
  def __init__(self, worker_device_pairs):
    """Initialize an `InputWorkers` object.

    Args:
      worker_device_pairs: A sequence of pairs:
        `(input device, a tuple of compute devices fed by that input device)`.
    """
    self._input_worker_devices = tuple(d for d, _ in worker_device_pairs)
    self._fed_devices = tuple(tuple(device_util.canonicalize(d) for d in f)
                              for _, f in worker_device_pairs)
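
For reference, a minimal sketch of the `worker_device_pairs` shape this constructor expects, mirroring the single-host pattern used throughout these examples (device strings are illustrative):

# One input worker (the host CPU) feeding two compute devices.
worker_device_pairs = (
    ("/device:CPU:0", ("/device:GPU:0", "/device:GPU:1")),
)
input_workers = InputWorkers(worker_device_pairs)
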
Example #23
  def _initialize_local(self, cluster_resolver):
    """Initializes the object for local training."""
    self._is_chief = True
    self._num_workers = 1

    if ops.executing_eagerly_outside_functions():
      try:
        context.context().configure_collective_ops(
            scoped_allocator_enabled_ops=("CollectiveReduce",),
            use_nccl_communication=(self._communication == cross_device_ops_lib
                                    .CollectiveCommunication.NCCL))
      except RuntimeError:
        logging.warning("Collective ops is not configured at program startup. "
                        "Some performance features may not be enabled.")
      self._collective_ops_configured = True

    # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
    # some cases.
    if isinstance(cluster_resolver, TFConfigClusterResolver):
      num_gpus = context.num_gpus()
    else:
      num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

    if num_gpus:
      local_devices = tuple("/device:GPU:%d" % i for i in range(num_gpus))
    else:
      local_devices = ("/device:CPU:0",)
    self._worker_device = device_util.canonicalize("/device:CPU:0")
    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

    self._collective_keys = cross_device_utils.CollectiveKeys()
    super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
    # TODO(yuefengz): remove num_gpus_per_worker from CollectiveAllReduce.
    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
        num_workers=self._num_workers,
        num_gpus_per_worker=num_gpus,
        collective_keys=self._collective_keys)

    self._cluster_spec = None
    self._task_type = None
    self._task_id = None

    # This is a flag that tells whether we are running with a standalone client
    # or an independent worker. Right now with the standalone client, the
    # strategy object is created as a local strategy and then turned into a
    # multi-worker strategy via the configure call.
    self._local_or_standalone_client_mode = True

    # Save the num_gpus_per_worker and rpc_layer for configure method.
    self._num_gpus_per_worker = num_gpus
    self._rpc_layer = cluster_resolver.rpc_layer
    self._warn_nccl_no_gpu()

    logging.info("Single-worker CollectiveAllReduceStrategy with local_devices "
                 "= %r, communication = %s", local_devices, self._communication)
Example #24
    def _reduce_to(self, reduce_op, value, destinations):
        if (isinstance(value, values.DistributedValues)
                or tensor_util.is_tensor(value)
            ) and values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
            if reduce_op == reduce_util.ReduceOp.MEAN:
                # TODO(jhseu):  Revisit once we support model-parallelism.
                value *= (1. / self._num_replicas_in_sync)
            elif reduce_op != reduce_util.ReduceOp.SUM:
                raise NotImplementedError(
                    "Currently only support sum & mean in TPUStrategy.")
            return tpu_ops.cross_replica_sum(value)

        if not isinstance(value, values.DistributedValues):
            # This function handles reducing values that are not PerReplica or
            # Mirrored values. For example, the same value could be present on all
            # replicas in which case `value` would be a single value or value could
            # be 0.
            return cross_device_ops_lib.reduce_non_distributed_value(
                reduce_op, value, destinations, self._num_replicas_in_sync)

        # TODO(cjfj): Detect when it is possible to use `cross_replica_sum`.
        # Always performs the reduction on the TPU host.
        with ops.device(self._host_device):
            output = math_ops.add_n(value.values)
            if reduce_op == reduce_util.ReduceOp.MEAN:
                output *= (1. / len(value.values))

        devices = cross_device_ops_lib.get_devices_from(destinations)

        if len(devices) == 1:
            # If necessary, copy to requested destination.
            dest_canonical = device_util.canonicalize(devices[0])
            host_canonical = device_util.canonicalize(self._host_device)

            if dest_canonical != host_canonical:
                with ops.device(dest_canonical):
                    output = array_ops.identity(output)
        else:
            output = cross_device_ops_lib.simple_broadcast(
                output, destinations)

        return output
Example #25
 def verifyWorkerLocalInstance(self, coordinator, model):
   # assert capturing a worker-local resource on each worker
   for worker in coordinator._cluster.workers:
     with coordinator_context.with_dispatch_context(worker):
       captures = model.use_table.get_concrete_function().captured_inputs
       resource_capture = [t for t in captures if t.dtype == dtypes.resource]
       self.assertNotEmpty(resource_capture)
       for capture in resource_capture:
         self.assertEqual(
             capture.device,
             device_util.canonicalize("/CPU:0", default=worker.device_name))
Example #26
def choose_the_best(devices, session_config=None):
    """Find the best subclass of CrossDeviceOps given a session config.

  Args:
    devices: a list of devices passed to `tf.distribute.Strategy`.
    session_config: a `tf.ConfigProto` or `None`. If `None`, it will make
      decision based on all local devices.

  Returns:
    A subclass of `CrossDeviceOps`.
  """
    requested_devices = set([device_util.canonicalize(d) for d in devices])
    machine_devices = device_lib.list_local_devices(
        session_config=session_config)
    using_devices = []
    for d in machine_devices:
        if device_util.canonicalize(d.name) in requested_devices:
            using_devices.append(d)
        else:
            logging.info(
                "Device is available but not used by distribute strategy: %s",
                d.name)

    if len(using_devices) != len(requested_devices):
        logging.warning(
            "Not all devices in `tf.distribute.Strategy` are visible "
            "to TensorFlow.")
        return ReductionToOneDevice()

    if any(d.device_type.lower() != "gpu" for d in using_devices):
        logging.warning(
            "Not all devices in `tf.distribute.Strategy` are visible "
            "to TensorFlow.")
        return ReductionToOneDevice()

    device_links = [[] for _ in range(len(using_devices))]
    for i, device in enumerate(using_devices):
        for link in device.locality.links.link:
            device_links[i].append(link.device_id)

    return _choose_all_reduce_algorithm(device_links)
Example #27
 def _input_workers_with_options(self, options=None):
   if not options:
     return input_lib.InputWorkers(self._input_workers_devices)
   if (options.experimental_replication_mode ==
       distribute_lib.InputReplicationMode.PER_REPLICA):
     if options.experimental_place_dataset_on_device:
       self._input_workers_devices = (
           tuple(
               (device_util.canonicalize(d, d), (d,)) for d in self._devices))
     else:
       self._input_workers_devices = (
           tuple((device_util.canonicalize("/device:CPU:0", d), (d,))
                 for d in self._devices))
     return input_lib.InputWorkers(self._input_workers_devices)
   else:
     if not options.experimental_prefetch_to_device:
       return input_lib.InputWorkers([
           (host_device, (host_device,) * len(compute_devices))
           for host_device, compute_devices in self._input_workers_devices
       ])
     else:
       return input_lib.InputWorkers(self._input_workers_devices)
Example #28
def choose_the_best(devices, session_config=None):
    """Find the best CrossDeviceOps locally given a `tf.compat.v1.ConfigProto`.

  Args:
    devices: a list of devices passed to `tf.distribute.Strategy`.
    session_config: a `tf.compat.v1.ConfigProto` or `None`. If `None`, it will
      make decision based on all local devices.

  Returns:
    A subclass of `CrossDeviceOps`.
  """
    requested_devices = set([device_util.canonicalize(d) for d in devices])
    machine_devices = device_lib.list_local_devices(
        session_config=session_config)
    using_devices = set()
    for d in machine_devices:
        if device_util.canonicalize(d.name) in requested_devices:
            using_devices.add(d.name)

    if len(using_devices) != len(requested_devices):
        logging.warning(
            "Some requested devices in `tf.distribute.Strategy` are not visible "
            "to TensorFlow: %s",
            ",".join(list(requested_devices - using_devices)))
        return ReductionToOneDevice()

    if any("gpu" not in d.lower() for d in using_devices):
        logging.warning(
            "There is non-GPU devices in `tf.distribute.Strategy`, not "
            "using nccl allreduce.")
        return ReductionToOneDevice()

    if kernels.get_registered_kernels_for_op("NcclAllReduce"):
        return NcclAllReduce(num_packs=1)
    else:
        logging.warning("Nccl kernel is not found, not using nccl allreduce.")
        return ReductionToOneDevice()
Example #29
  def _reduce_to(self, reduce_op, value, destinations):
    if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
      if reduce_op == reduce_util.ReduceOp.MEAN:
        # TODO(jhseu):  Revisit once we support model-parallelism.
        value *= (1. / self._num_replicas_in_sync)
      elif reduce_op != reduce_util.ReduceOp.SUM:
        raise NotImplementedError(
            "Currently only support sum & mean in TPUStrategy.")
      return tpu_ops.cross_replica_sum(value)

    if not isinstance(value, values.DistributedValues):
      # This function handles reducing values that are not PerReplica or
      # Mirrored values. For example, the same value could be present on all
      # replicas in which case `value` would be a single value or value could
      # be 0.
      return cross_device_ops_lib.reduce_non_distributed_value(
          reduce_op, self._device_map, value, destinations)

    devices = cross_device_ops_lib.get_devices_from(destinations)
    if len(devices) != 1:
      raise ValueError("Multiple devices are not supported for TPUStrategy")

    # Always performs the reduction on the TPU host.
    with ops.device(self._host_device):
      output = math_ops.add_n(value.values)
      if reduce_op == reduce_util.ReduceOp.MEAN:
        output *= (1. / len(value.values))

    # If necessary, copy to requested destination.
    dest_canonical = device_util.canonicalize(devices[0])
    host_canonical = device_util.canonicalize(self._host_device)

    if dest_canonical != host_canonical:
      with ops.device(devices[0]):
        output = array_ops.identity(output)

    return output
Example #30
def choose_the_best(devices, session_config=None):
  """Find the best subclass of CrossDeviceOps given a session config.

  Args:
    devices: a list of devices passed to `tf.distribute.Strategy`.
    session_config: a `tf.ConfigProto` or `None`. If `None`, it will make
      decision based on all local devices.

  Returns:
    A subclass of `CrossDeviceOps`.
  """
  requested_devices = set([device_util.canonicalize(d) for d in devices])
  machine_devices = device_lib.list_local_devices(session_config=session_config)
  using_devices = []
  for d in machine_devices:
    if device_util.canonicalize(d.name) in requested_devices:
      using_devices.append(d)
    else:
      logging.info(
          "Device is available but not used by distribute strategy: %s", d.name)

  if len(using_devices) != len(requested_devices):
    logging.warning("Not all devices in `tf.distribute.Strategy` are visible "
                    "to TensorFlow.")
    return ReductionToOneDevice()

  if any(d.device_type.lower() != "gpu" for d in using_devices):
    logging.warning("Not all devices in `tf.distribute.Strategy` are visible "
                    "to TensorFlow.")
    return ReductionToOneDevice()

  device_links = [[] for _ in range(len(using_devices))]
  for i, device in enumerate(using_devices):
    for link in device.locality.links.link:
      device_links[i].append(link.device_id)

  return _choose_all_reduce_algorithm(device_links)
Example #31
  def _reduce_to(self, reduce_op, value, destinations):
    if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
      if reduce_op == reduce_util.ReduceOp.MEAN:
        # TODO(jhseu):  Revisit once we support model-parallelism.
        value *= (1. / self._num_replicas_in_sync)
      elif reduce_op != reduce_util.ReduceOp.SUM:
        raise NotImplementedError(
            "Currently only support sum & mean in TPUStrategy.")
      return tpu_ops.cross_replica_sum(value)

    # Validate that the destination is the same as the host device.
    # Note we don't do this when in a replica context, as the reduction is
    # performed on the TPU device itself.
    devices = cross_device_ops_lib.get_devices_from(destinations)
    if len(devices) == 1:
      assert device_util.canonicalize(devices[0]) == device_util.canonicalize(
          self._host_device)
    else:
      raise ValueError("Multiple devices are not supported for TPUStrategy")

    output = math_ops.add_n(value)
    if reduce_op == reduce_util.ReduceOp.MEAN:
      return output * (1. / len(value))
    return output
Example #32
 def testCanonicalizeWithoutDefaultDeviceCollectiveEnabled(self):
     cluster_spec = server_lib.ClusterSpec(
         multi_worker_test_base.create_cluster_spec(has_chief=False,
                                                    num_workers=1,
                                                    num_ps=0,
                                                    has_eval=False))
     server_def = tensorflow_server_pb2.ServerDef(
         cluster=cluster_spec.as_cluster_def(),
         job_name="worker",
         task_index=0,
         protocol="grpc",
         port=0)
     context.context().enable_collective_ops(server_def)
     self.assertEqual(device_util.canonicalize("/cpu:0"),
                      "/job:worker/replica:0/task:0/device:CPU:0")
Example #33
  def _reduce_to(self, reduce_op, value, destinations):
    if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
      if reduce_op == reduce_util.ReduceOp.MEAN:
        # TODO(jhseu):  Revisit once we support model-parallelism.
        value *= (1. / self._num_replicas_in_sync)
      elif reduce_op != reduce_util.ReduceOp.SUM:
        raise NotImplementedError(
            "Currently only support sum & mean in TPUStrategy.")
      return tpu_ops.cross_replica_sum(value)

    # Validate that the destination is the same as the host device.
    # Note we don't do this when in a replica context, as the reduction is
    # performed on the TPU device itself.
    devices = cross_device_ops_lib.get_devices_from(destinations)
    if len(devices) == 1:
      assert device_util.canonicalize(devices[0]) == device_util.canonicalize(
          self._host_device)
    else:
      raise ValueError("Multiple devices are not supported for TPUStrategy")

    output = math_ops.add_n(value)
    if reduce_op == reduce_util.ReduceOp.MEAN:
      return output * (1. / len(value))
    return output
Example #34
    def __init__(self,
                 devices,
                 group_size,
                 collective_keys=None,
                 communication=CollectiveCommunication.AUTO):
        """Initializes the object.

    Args:
      devices: a list of device strings to run collectives on.
      group_size: the global group size. For between-graph replicated training
        it's the total number of devices across all workers.
      collective_keys: an optional CollectiveKey object.
      communication: indicates which collective communication to use.
    """
        if group_size % len(devices) > 0:
            raise ValueError(
                "group_size must be divisible by the number of devices.")

        self._devices = tuple(device_util.canonicalize(d) for d in devices)
        self._group_size = group_size
        self._collective_keys = (collective_keys
                                 or cross_device_utils.CollectiveKeys())
        self._communication = communication
        # This lock guards all collective launches, i.e. calls to
        # cross_device_utils.build_collective_*.
        #
        # In a multi threaded eager program we need to ensure different groups of
        # collectives don't interleave each other, otherwise there could be
        # deadlocks. E.g. if two user threads both are launching collectives:
        #   user-thread-0  device0                 device1
        #   user-thread-1          device0 device1
        # In eager mode, we use one executor per device. Executors use single FIFO
        # queues, so the above launch sequences end up with the following queues:
        #   device-0  collective-0  collective-1
        #   device-1  collective-1  collective-0
        # This deadlocks since neither collective is able to finish.
        self._lock = threading.Lock()

        # Collective ops requires all devices to participate and is blocking. In
        # eager, we need one async executor for each device to be able to launch
        # them altogether. Note that async doesn't imply concurrency. Within an
        # async executor operations are still executed sequentially. In graph or
        # function building, the executors are not used.
        self._executors = []
        for _ in range(len(devices)):
            self._executors.append(executor.new_executor(enable_async=True))

        super(CollectiveAllReduce, self).__init__()
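
A toy sketch of the serialization that the deadlock comment above argues for; `launch_on_device` is a hypothetical stand-in for the per-device collective launch, not a real API:

import threading

_launch_lock = threading.Lock()

def launch_collective_group(launch_on_device, devices):
  # Hold the lock for the whole group so two user threads cannot enqueue their
  # per-device collectives in opposite orders and deadlock the per-device FIFO
  # executors described above.
  with _launch_lock:
    for d in devices:
      launch_on_device(d)
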
Example #35
    def testDefaultDeviceInsideFunctionWithScope(self, distribution,
                                                 run_functions_eagerly):

        def_function.run_functions_eagerly(run_functions_eagerly)
        expected_device = (device_util.canonicalize("cpu:0")
                           if run_functions_eagerly else "")
        with distribution.scope():
            with ops.device_v2("cpu:0"):

                @def_function.function
                def add():
                    one = array_ops.ones([])
                    self.assertEqual(expected_device, one.device)
                    return one + 1

                add()
Example #36
    def _initialize_local(self, cluster_resolver):
        """Initialize internal devices for local training."""
        worker_device = device_util.canonicalize("/device:CPU:0")
        self._input_host_device = numpy_dataset.SingleDevice(worker_device)

        # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
        # some cases.
        if isinstance(cluster_resolver, TFConfigClusterResolver):
            num_gpus = context.num_gpus()
        else:
            num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

        # Save the num_gpus_per_worker for configure method.
        self._num_gpus_per_worker = num_gpus

        # Define compute devices, which is a list of device strings, one for each
        # replica. When there are GPUs, replicate operations on these GPUs.
        # Otherwise, place operations on CPU.
        if num_gpus > 0:
            compute_devices = tuple(
                map("/device:GPU:{}".format, range(num_gpus)))
        else:
            compute_devices = (_LOCAL_CPU, )

        self._device_map = values.ReplicaDeviceMap(compute_devices)
        self._input_workers = input_lib.InputWorkers(
            self._device_map, [(worker_device, compute_devices)])

        # If there is only one GPU, put everything on that GPU. Otherwise, place
        # variables on CPU.
        if num_gpus == 1:
            assert len(compute_devices) == 1
            self._variable_device = _LOCAL_GPU_0
            self._parameter_devices = (_LOCAL_GPU_0, )
        else:
            self._variable_device = _LOCAL_CPU
            self._parameter_devices = (_LOCAL_CPU, )

        self._is_chief = True
        self._cluster_spec = None
        self._task_type = None
        self._task_id = None

        logging.info(
            "ParameterServerStrategy with compute_devices = %r, "
            "variable_device = %r", compute_devices, self._variable_device)
Example #37
    def _initialize_local(self, cluster_resolver):
        """Initializes the object for local training."""
        self._is_chief = True
        self._num_workers = 1

        # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
        # some cases.
        if isinstance(cluster_resolver, TFConfigClusterResolver):
            num_gpus = context.num_gpus()
        else:
            num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

        if num_gpus:
            local_devices = tuple("/device:GPU:%d" % i
                                  for i in range(num_gpus))
        else:
            local_devices = ("/device:CPU:0", )
        self._worker_device = device_util.canonicalize("/device:CPU:0")
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)

        self._collective_keys = cross_device_utils.CollectiveKeys()
        super(CollectiveAllReduceExtended,
              self)._initialize_local(local_devices)
        # TODO(yuefengz): remove num_gpus_per_worker from CollectiveAllReduce.
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            num_workers=self._num_workers,
            num_gpus_per_worker=num_gpus,
            collective_keys=self._collective_keys)

        self._cluster_spec = None
        self._task_type = None
        self._task_id = None

        # This is a flag that tells whether we are running with a standalone
        # client or an independent worker. Right now with the standalone
        # client, the strategy object is created as a local strategy and then
        # turned into a multi-worker strategy via the configure call.
        self._local_or_standalone_client_mode = True

        # Save the num_gpus_per_worker and rpc_layer for configure method.
        self._num_gpus_per_worker = num_gpus
        self._rpc_layer = cluster_resolver.rpc_layer

        logging.info("CollectiveAllReduceStrategy with local_devices = %r",
                     local_devices)
Example #38
 def _start_check_health_thread(self):
     # Allocate group and instance key before starting the thread to avoid
     # indeterminism. There can only be one thread that assigns group keys and
     # instance keys, otherwise different workers may end up with unmatched keys
      # since execution order between threads is arbitrary.
     device = device_util.canonicalize(self._worker_device)
     group_key = self._collective_keys.get_group_key([device])
     instance_key = self._collective_keys.get_op_instance_key()
     self._check_health_thread_should_stop = threading.Event()
     # Start the thread as daemon to avoid it blocking the program from exiting.
      # We try our best to shut down the thread, but __del__ is not guaranteed
      # to be called when the program exits.
     self._check_health_thread = threading.Thread(target=self._check_health,
                                                  args=(device, group_key,
                                                        instance_key),
                                                  daemon=True)
     self._check_health_thread.start()
Example #39
  def _make_dataset_iterator(self, dataset):
    """Make iterator from dataset without splitting the batch.

    This implementation is different from the one in
    `tf.distribute.MirroredStrategy` for purposes of backward compatibility.
    We treat the incoming dataset's batch size as the per-replica batch size.

    Args:
      dataset: `tf.data.Dataset` for input.
    Returns:
      An `InputIterator` which returns inputs for each step of the computation.
    """
    if self._local_mode:
      worker = device_util.canonicalize("/device:CPU:0")
      worker_device_pairs = [(worker, self._devices)]
    else:
      worker_device_pairs = self._worker_devices
    return values.DatasetIterator(dataset, worker_device_pairs)
Example #40
    def _make_dataset_iterator(self, dataset):
        """Make iterator from dataset without splitting the batch.

    This implementation is different from the one in
    `tf.distribute.MirroredStrategy` for purposes of backward compatibility.
    We treat the incoming dataset's batch size as the per-replica batch size.

    Args:
      dataset: `tf.data.Dataset` for input.
    Returns:
      An `InputIterator` which returns inputs for each step of the computation.
    """
        if self._local_mode:
            worker = device_util.canonicalize("/device:CPU:0")
            worker_device_pairs = [(worker, self._devices)]
        else:
            worker_device_pairs = self._worker_devices
        return values.DatasetIterator(dataset, worker_device_pairs)
Example #41
    def testInModelAndCapture(self, source):

        file_path = os.path.join(self.get_temp_dir(), "text_file_initializer")

        model = self.Model(source, file_path)
        func_captures = model.use_table.get_concrete_function(
        ).graph.external_captures
        self.assertLen(func_captures, 2)
        self.assertTrue(
            any(model.table.resource_handle is t for t in func_captures))
        deferred_captures = model.use_table.get_concrete_function(
        ).graph.deferred_external_captures
        self.assertEmpty(deferred_captures)

        strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
            self.cluster_resolver)
        coordinator = coordinator_lib.ClusterCoordinator(strategy)
        with strategy.scope():
            distributed_model = self.Model("value", file_path)
        func_captures = distributed_model.use_table.get_concrete_function(
        ).graph.external_captures
        # One less external_capture, since the table handle becomes a closure in the
        # deferred_external_capture
        self.assertLen(func_captures, 1)
        self.assertFalse(
            any(model.table.resource_handle is t for t in func_captures))
        deferred_captures = distributed_model.use_table.get_concrete_function(
        ).graph.deferred_external_captures
        self.assertNotEmpty(deferred_captures)

        # assert capturing a worker-local resource on each worker
        for worker in coordinator._cluster.workers:
            with coordinator_context.with_dispatch_context(worker):
                for capture in [
                        t for t in distributed_model.use_table.
                        get_concrete_function().captured_inputs
                        if t.dtype == dtypes.resource
                ]:
                    if capture.dtype == dtypes.resource:
                        self.assertEqual(
                            capture.device,
                            device_util.canonicalize(
                                "/CPU:0", default=worker.device_name))
Example #42
  def testDefaultDeviceInsideFunctionWithScope(
      self, distribution, run_functions_eagerly):

    def_function.run_functions_eagerly(run_functions_eagerly)
    try:
      worker = distribution.extended.worker_devices[0]
    except RuntimeError:
      worker = None
    expected_device = (device_util.canonicalize("cpu:0", worker)
                       if run_functions_eagerly else "")
    with distribution.scope():
      with ops.device_v2("cpu:0"):
        @def_function.function
        def add():
          one = array_ops.ones([])
          self.assertEqual(expected_device, one.device)
          return one + 1

        add()
Example #43
 def handle(self):
     if values_util.is_saving_non_distributed():
         return self._primary_handle
     tpu_context = tpu_util.enclosing_tpu_context()
     if tpu_context and not context.executing_eagerly():
         is_mirrored = (self._variables[0].synchronization !=
                        variables_lib.VariableSynchronization.ON_READ)
         if self._packed_handle is None:
             handles = [v.handle for v in self._variables]
             is_packed = False
         else:
             handles = [self._packed_handle]
             is_packed = True
         return tpu_context.get_replicated_var_handle(
             self._unique_id, handles, is_mirrored, is_packed)
     if self._packed_handle is not None and not context.executing_eagerly():
         return self._packed_handle
     device = device_util.canonicalize(device_util.current())
     return self._device_to_handle.get(device, self._primary_handle)
Example #44
  def _initialize_local(self, cluster_resolver):
    """Initializes the object for local training."""
    self._is_chief = True
    self._num_workers = 1

    # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
    # some cases.
    if isinstance(cluster_resolver, TFConfigClusterResolver):
      num_gpus = context.num_gpus()
    else:
      num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

    if num_gpus:
      local_devices = tuple("/device:GPU:%d" % i for i in range(num_gpus))
    else:
      local_devices = ("/device:CPU:0",)
    self._worker_device = device_util.canonicalize("/device:CPU:0")
    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

    self._collective_keys = cross_device_utils.CollectiveKeys()
    super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
    # TODO(yuefengz): remove num_gpus_per_worker from CollectiveAllReduce.
    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
        num_workers=self._num_workers,
        num_gpus_per_worker=num_gpus,
        collective_keys=self._collective_keys)

    self._cluster_spec = None
    self._task_type = None
    self._task_id = None

    # This is a flag that tells whether we are running with a standalone client
    # or an independent worker. Right now with the standalone client, the
    # strategy object is created as a local strategy and then turned into a
    # multi-worker strategy via the configure call.
    self._local_or_standalone_client_mode = True

    # Save the num_gpus_per_worker and rpc_layer for configure method.
    self._num_gpus_per_worker = num_gpus
    self._rpc_layer = cluster_resolver.rpc_layer

    logging.info("CollectiveAllReduceStrategy with local_devices = %r",
                 local_devices)
Example #45
  def _make_input_fn_iterator(
      self,
      input_fn,
      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
    input_contexts = []
    if self._local_mode:
      num_workers = 1
      worker = device_util.canonicalize("/device:CPU:0")
      worker_device_pairs = [(worker, self._devices)]
    else:
      num_workers = len(self._worker_devices)
      worker_device_pairs = self._worker_devices

    for i in range(num_workers):
      input_contexts.append(distribute_lib.InputContext(
          num_input_pipelines=num_workers,
          input_pipeline_id=i,
          num_replicas_in_sync=self._num_replicas_in_sync))
    return values.InputFunctionIterator(
        input_fn, worker_device_pairs, input_contexts)
Example #46
  def _initialize_local(self,
                        compute_devices,
                        parameter_device,
                        cluster_resolver=None):
    """Initialize internal devices for local training."""
    worker_device = device_util.canonicalize("/device:CPU:0")
    self._input_host_device = numpy_dataset.SingleDevice(worker_device)

    if compute_devices is None:
      if not cluster_resolver:
        num_gpus = context.num_gpus()
      else:
        num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
        # Save the num_gpus_per_worker for configure method which is used by the
        # contrib version.
        self._num_gpus_per_worker = num_gpus

      compute_devices = device_util.local_devices_from_num_gpus(num_gpus)

    if parameter_device is None:
      # If there is only one GPU, put everything on that GPU. Otherwise, place
      # variables on CPU.
      if len(compute_devices) == 1:
        parameter_device = compute_devices[0]
      else:
        parameter_device = _LOCAL_CPU

    self._device_map = values.ReplicaDeviceMap(compute_devices)
    self._input_workers = input_lib.InputWorkers(
        self._device_map, [(worker_device, compute_devices)])

    self._variable_device = parameter_device
    self._parameter_devices = (parameter_device,)
    self._is_chief = True
    self._cluster_spec = None
    self._task_type = None
    self._task_id = None

    logging.info(
        "ParameterServerStrategy with compute_devices = %r, "
        "variable_device = %r", compute_devices, self._variable_device)
Example #47
  def __init__(self,
               devices,
               group_size,
               collective_keys=None,
               communication=CollectiveCommunication.AUTO):
    """Initializes the object.

    Args:
      devices: a list of device strings to run collectives on.
      group_size: the global group size. For between-graph replicated training
        it's the total number of devices across all workers.
      collective_keys: an optional CollectiveKey object.
      communication: indicates which collective communication to use.
    """
    if group_size % len(devices) > 0:
      raise ValueError("group_size must be divisible by the number of devices.")

    self._devices = tuple(device_util.canonicalize(d) for d in devices)
    self._group_size = group_size
    self._collective_keys = (collective_keys or
                             cross_device_utils.CollectiveKeys())
    self._communication = communication
    # In a multi threaded eager program we need to ensure different groups of
    # collectives don't interleave each other, otherwise there will be deadlock.
    self._lock = threading.Lock()

    # Collective ops requires all devices to participate and is blocking. In
    # eager, we need one async executor for each device to be able to launch
    # them altogether. Note that async doesn't imply concurrency. Within an
    # async executor operations are still executed sequentially. In graph or
    # function building, the executors are not used.
    self._executors = []
    for _ in range(len(devices)):
      self._executors.append(executor.new_executor(enable_async=True))

    super(CollectiveAllReduce, self).__init__()
Example #48
 def handle(self):
     if values_util.is_saving_non_distributed():
         return self._primary_handle
     tpu_context = tpu_util.enclosing_tpu_context()
     if tpu_context and not context.executing_eagerly():
         is_mirrored = (self._variables[0].synchronization !=
                        variables_lib.VariableSynchronization.ON_READ)
         if self._packed_handle is None:
             handles = [v.handle for v in self._variables]
             is_packed = False
         else:
             handles = [self._packed_handle]
             is_packed = True
         common_name = self._handle_name
         # BaseResourceVariable appends ":0" to the handle name, which makes it not
         # a valid root scope name.
         if ":" in common_name:
             common_name = common_name.split(":")[0]
         return tpu_context.get_replicated_var_handle(
             common_name, self._unique_id, handles, is_mirrored, is_packed)
     if self._packed_handle is not None and not context.executing_eagerly():
         return self._packed_handle
     device = device_util.canonicalize(device_util.current())
     return self._device_to_handle.get(device, self._primary_handle)
Example #49
 def _make_dataset_iterator(self, dataset):
   """Make iterator from dataset without splitting the batch."""
   worker = device_util.canonicalize("/device:CPU:0")
   worker_device_pairs = [(worker, [self._device])]
   return values.DatasetIterator(dataset, worker_device_pairs)
Example #50
      def model_fn():
        if 'CPU' in compute_device:
          replica_compute_device = '/device:CPU:0'
        else:
          replica_id = _get_replica_id_integer()
          replica_compute_device = ('/device:GPU:%d' % replica_id)
        replica_compute_device = device_util.canonicalize(
            replica_compute_device)

        if 'CPU' in variable_device:
          replica_variable_device = '/device:CPU:0'
        else:
          replica_id = _get_replica_id_integer()
          replica_variable_device = ('/device:GPU:%d' % replica_id)
        replica_variable_device = device_util.canonicalize(
            replica_variable_device)

        a = constant_op.constant(1.0)
        b = constant_op.constant(2.0)
        c = a + b
        self.assertEqual(a.device, replica_compute_device)
        self.assertEqual(b.device, replica_compute_device)
        self.assertEqual(c.device, replica_compute_device)

        # The device scope is ignored for variables but not for normal ops.
        with ops.device('/device:GPU:2'):
          x = variable_scope.get_variable(
              'x', initializer=10.0,
              aggregation=variable_scope.VariableAggregation.SUM)
          x_add = x.assign_add(c)
          e = a + c
        self.assertEqual(
            device_util.canonicalize(x.device), replica_variable_device)
        self.assertEqual(x_add.device, x.device)
        self.assertEqual(e.device, device_util.canonicalize('/device:GPU:2'))

        # The colocate_vars_with can override the distribution's device.
        with d.extended.colocate_vars_with(x):
          y = variable_scope.get_variable(
              'y', initializer=20.0,
              aggregation=variable_scope.VariableAggregation.SUM)
        # We add an identity here to avoid complaints about summing
        # non-distributed values.
        y_add = y.assign_add(array_ops.identity(x_add))
        self.assertEqual(
            device_util.canonicalize(y.device), replica_variable_device)
        self.assertEqual(y_add.device, y.device)
        self.assertEqual(y.device, x.device)

        z = variable_scope.get_variable(
            'z', initializer=10.0,
            aggregation=variable_scope.VariableAggregation.SUM)
        self.assertEqual(
            device_util.canonicalize(z.device), replica_variable_device)

        with ops.control_dependencies([y_add]):
          # We add an identity here to avoid complaints about summing
          # non-distributed values.
          z_add = z.assign_add(array_ops.identity(y))
        with ops.control_dependencies([z_add]):
          f = z + c
        self.assertEqual(f.device, replica_compute_device)

        # The device scope would merge with the default worker device.
        with ops.device('/CPU:1'):
          g = e + 1.0
        self.assertEqual(g.device, device_util.canonicalize('/device:CPU:1'))

        # The ops.colocate_with will be ignored when defining a variable but
        # not for a normal tensor.
        with ops.colocate_with(x):
          u = variable_scope.get_variable('u', initializer=30.0)
          h = f + 1.0
        self.assertEqual(
            device_util.canonicalize(u.device), replica_variable_device)
        self.assertEqual(
            device_util.canonicalize(x.device),
            device_util.canonicalize(h.device))
        return y_add, z_add, f