def _reduce_to(self, reduce_op, value, destinations):
  if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
    if reduce_op == reduce_util.ReduceOp.MEAN:
      # TODO(jhseu): Revisit once we support model-parallelism.
      value *= (1. / self._num_replicas_in_sync)
    elif reduce_op != reduce_util.ReduceOp.SUM:
      raise NotImplementedError(
          "Currently only support sum & mean in TPUStrategy.")
    return tpu_ops.cross_replica_sum(value)

  if not isinstance(value, values.DistributedValues):
    # This function handles reducing values that are not PerReplica or
    # Mirrored values. For example, the same value could be present on all
    # replicas, in which case `value` would be a single value, or value could
    # be 0.
    return cross_device_ops_lib.reduce_non_distributed_value(
        reduce_op, self._device_map, value, destinations)

  # Validate that the destination is the same as the host device.
  # Note we don't do this when in replicate context, as the reduction is
  # performed on the TPU device itself.
  devices = cross_device_ops_lib.get_devices_from(destinations)
  if len(devices) == 1:
    assert device_util.canonicalize(devices[0]) == device_util.canonicalize(
        self._host_device)
  else:
    raise ValueError("Multiple devices are not supported for TPUStrategy")

  output = math_ops.add_n(value)
  if reduce_op == reduce_util.ReduceOp.MEAN:
    return output * (1. / len(value))
  return output
def __init__(self, device_map, worker_device_pairs=None, logical_device=0):
  """Initialize an `InputWorkers` object.

  Args:
    device_map: A `DeviceMap` with the computation devices fed by the
      input workers.
    worker_device_pairs: A sequence of pairs:
      `(input device, a tuple of compute devices fed by that input device)`.
    logical_device: The logical device of `device_map` to feed.
  """
  self._device_map = device_map
  self._logical_device = logical_device
  if worker_device_pairs is None:
    worker_device_pairs = ((
        device_util.canonicalize("/device:CPU:0"),
        device_map.logical_to_actual_devices(logical_device)),)
  self._input_worker_devices = tuple(d for d, _ in worker_device_pairs)
  self._fed_devices = tuple(tuple(device_util.canonicalize(d) for d in f)
                            for _, f in worker_device_pairs)
  flattened = tuple(d for l in self._fed_devices for d in l)
  assert (flattened ==
          device_map.logical_to_actual_devices(logical_device)), (
              "flattened: %s logical device %d: %s" %
              (flattened, logical_device,
               device_map.logical_to_actual_devices(logical_device)))
def testCanonicalizeWithDefaultDevice(self):
  self.assertEqual(
      device_util.canonicalize("/job:worker/task:1/cpu:0", default="/gpu:0"),
      "/job:worker/replica:0/task:1/device:CPU:0")
  self.assertEqual(
      device_util.canonicalize("/job:worker/task:1", default="/gpu:0"),
      "/job:worker/replica:0/task:1/device:GPU:0")
  self.assertEqual(
      device_util.canonicalize("/cpu:0", default="/job:worker"),
      "/job:worker/replica:0/task:0/device:CPU:0")
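The test above pins down the merge order in `device_util.canonicalize`: fields missing from the input are filled first from the `default` spec and then from the built-in defaults (job "localhost", replica 0, task 0, device CPU:0). A minimal standalone sketch of the same behavior, assuming only that the internal module path `tensorflow.python.distribute.device_util` used throughout these snippets is importable:

from tensorflow.python.distribute import device_util

# "/gpu:0" supplies only the device field; replica and task come from the
# built-in defaults, and the job comes from the input string.
print(device_util.canonicalize("/job:worker/task:1", default="/gpu:0"))
# -> /job:worker/replica:0/task:1/device:GPU:0
print(device_util.canonicalize("/cpu:0", default="/job:worker"))
# -> /job:worker/replica:0/task:0/device:CPU:0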
def _initialize_local(self, num_gpus_per_worker):
  """Initialize internal devices for local training."""
  self._worker_device = device_util.canonicalize("/device:CPU:0")
  # Define compute devices, which is a list of device strings, one for each
  # replica. When there are GPUs, replicate operations on these GPUs.
  # Otherwise, place operations on CPU.
  if num_gpus_per_worker > 0:
    self._compute_devices = list(
        map("/device:GPU:{}".format, range(num_gpus_per_worker)))
  else:
    self._compute_devices = [_LOCAL_CPU]

  self._compute_devices = list(
      map(device_util.resolve, self._compute_devices))
  self._canonical_compute_device_set = set(self._compute_devices)

  # If there is only one GPU, put everything on that GPU. Otherwise, place
  # variables on CPU.
  if num_gpus_per_worker == 1:
    assert len(list(self._compute_devices)) == 1
    self._variable_device = _LOCAL_GPU_0
    self._parameter_devices = [_LOCAL_GPU_0]
  else:
    self._variable_device = _LOCAL_CPU
    self._parameter_devices = [_LOCAL_CPU]

  self._is_chief = True
  self._cluster_spec = None
  self._task_type = None
  self._task_id = None

  logging.info(
      "ParameterServerStrategy with compute_devices = %r, "
      "variable_device = %r", self._compute_devices, self._variable_device)
def _initialize_local_worker(self, num_gpus_per_worker):
  """Initializes the object for local training."""
  self._is_chief = True
  self._num_workers = 1

  if num_gpus_per_worker:
    local_devices = [
        "/device:GPU:%d" % i for i in range(num_gpus_per_worker)
    ]
  else:
    local_devices = ["/device:CPU:0"]
  self._worker_device = device_util.canonicalize("/device:CPU:0")

  self._collective_keys = cross_device_utils.CollectiveKeys()
  self._initialize_local(local_devices)
  self._cross_tower_ops = cross_device_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus_per_worker,
      collective_keys=self._collective_keys)

  self._cluster_spec = None
  self._task_type = None
  self._task_id = None

  logging.info("CollectiveAllReduceStrategy with local_devices = %r",
               local_devices)
def _initialize_local_worker(self, num_gpus_per_worker):
  """Initializes the object for local training."""
  self._is_chief = True
  self._num_workers = 1

  if num_gpus_per_worker:
    local_devices = tuple(
        "/device:GPU:%d" % i for i in range(num_gpus_per_worker))
  else:
    local_devices = ("/device:CPU:0",)
  self._worker_device = device_util.canonicalize("/device:CPU:0")
  self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

  self._collective_keys = cross_device_utils.CollectiveKeys()
  self._initialize_local(local_devices)
  # TODO(yuefengz): remove num_gpus_per_worker from CollectiveAllReduce.
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus_per_worker,
      collective_keys=self._collective_keys)

  self._cluster_spec = None
  self._task_type = None
  self._task_id = None

  logging.info("CollectiveAllReduceStrategy with local_devices = %r",
               local_devices)
def _initialize_local(self, cluster_resolver):
  """Initialize internal devices for local training."""
  worker_device = device_util.canonicalize("/device:CPU:0")
  self._input_host_device = numpy_dataset.SingleDevice(worker_device)
  num_gpus = cluster_resolver.num_accelerators()
  # Define compute devices, which is a list of device strings, one for each
  # replica. When there are GPUs, replicate operations on these GPUs.
  # Otherwise, place operations on CPU.
  if num_gpus > 0:
    compute_devices = tuple(map("/device:GPU:{}".format, range(num_gpus)))
  else:
    compute_devices = (_LOCAL_CPU,)

  self._device_map = values.ReplicaDeviceMap(compute_devices)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, [(worker_device, compute_devices)])

  # If there is only one GPU, put everything on that GPU. Otherwise, place
  # variables on CPU.
  if num_gpus == 1:
    assert len(compute_devices) == 1
    self._variable_device = _LOCAL_GPU_0
    self._parameter_devices = (_LOCAL_GPU_0,)
  else:
    self._variable_device = _LOCAL_CPU
    self._parameter_devices = (_LOCAL_CPU,)

  self._is_chief = True
  self._cluster_spec = None
  self._task_type = None
  self._task_id = None

  logging.info(
      "ParameterServerStrategy with compute_devices = %r, "
      "variable_device = %r", compute_devices, self._variable_device)
def __init__(self, container_strategy, device):
  super(OneDeviceExtended, self).__init__(container_strategy)
  self._device = device
  self._input_device = device_util.canonicalize("/device:CPU:0")
  worker_device_pairs = [(self._input_device, [self._device])]
  device_map = values.SingleDeviceMap(device)
  self._input_workers = input_lib.InputWorkers(
      device_map, worker_device_pairs)
def _device_scope(self):
  if (self._packed_handle is None or
      values_util.is_saving_non_distributed() or
      tpu_util.enclosing_tpu_context() is not None):
    return ops.NullContextmanager()
  device = device_util.canonicalize(device_util.current())
  if device in self._device_to_handle:
    return ops.NullContextmanager()
  return ops.device(self._primary_handle.device)
def _make_dataset_iterator(self, dataset):
  if self._local_mode:
    worker = device_util.canonicalize("/device:CPU:0")
    worker_device_pairs = [(worker, self._devices)]
  else:
    worker_device_pairs = self._worker_devices
  return values.DatasetIterator(dataset, worker_device_pairs,
                                self._num_replicas_in_sync)
def __init__(self, container_strategy, device):
  super(OneDeviceExtended, self).__init__(container_strategy)
  self._device = device
  self._default_device = device
  self._input_device = device_util.canonicalize("/device:CPU:0")
  worker_device_pairs = [(self._input_device, [self._device])]
  device_map = values.SingleDeviceMap(device)
  self._input_workers = input_lib.InputWorkers(
      device_map, worker_device_pairs)
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
  super(TPUExtended, self).__init__(container_strategy)

  if tpu_cluster_resolver is None:
    tpu_cluster_resolver = TPUClusterResolver("")

  if steps_per_run is None:
    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    steps_per_run = 1

  self._tpu_function_cache = weakref.WeakKeyDictionary()
  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
  self._device_assignment = device_assignment

  self._tpu_devices = [d.name for d in self._tpu_metadata.devices
                       if "device:TPU:" in d.name]

  # Only create variables for the number of replicas we're running.
  if device_assignment is not None:
    job_name = device_spec.DeviceSpecV2.from_string(self._tpu_devices[0]).job

    self._tpu_devices = []
    for replica_id in range(device_assignment.num_replicas):
      tpu_device = device_assignment.tpu_device(
          replica=replica_id, logical_core=0, job=job_name)
      tpu_device = device_util.canonicalize(tpu_device)
      self._tpu_devices.append(tpu_device)

  self._host_device = device_util.get_host_for_device(self._tpu_devices[0])

  # Preload the data onto the TPUs.
  input_worker_devices = collections.OrderedDict()
  for tpu_device in self._tpu_devices:
    host_device = device_util.get_host_for_device(tpu_device)
    input_worker_devices.setdefault(host_device, [])
    input_worker_devices[host_device].append(tpu_device)
  self._input_worker_devices = tuple(input_worker_devices.items())
  self._input_workers_obj = None

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run
  self._require_static_shapes = True

  # TPUStrategy handles the graph replication in TF-XLA bridge, so we don't
  # need to retrace functions for each device.
  self._retrace_functions_for_each_device = False

  self.experimental_enable_get_next_as_optional = True
  self.experimental_enable_dynamic_batch_size = True
  self._prefetch_on_host = False
def _initialize_single_worker(self, devices):
  """Initializes the object for single-worker training."""
  self._devices = tuple(device_util.canonicalize(d) for d in devices)
  self._input_workers = input_lib.InputWorkers(
      ((device_util.canonicalize("/device:CPU:0", devices[0]), devices),))
  self._inferred_cross_device_ops = None if self._cross_device_ops else (
      cross_device_ops_lib.choose_the_best(devices))
  self._host_input_device = numpy_dataset.SingleDevice(
      self._input_workers.worker_devices[0])
  self._is_multi_worker_training = False
  logging.info("Using MirroredStrategy with devices %r", devices)
  device_spec = tf_device.DeviceSpec.from_string(
      self._input_workers.worker_devices[0])
  # Ensures that when we enter strategy.scope() we use the correct default
  # device.
  if device_spec.job is not None and device_spec.job != "localhost":
    self._default_device = "/job:%s/replica:%d/task:%d" % (
        device_spec.job, device_spec.replica, device_spec.task)
def _make_input_fn_iterator(
    self,
    input_fn,
    replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
  worker = device_util.canonicalize("/device:CPU:0")
  worker_device_pairs = [(worker, [self._device])]
  return values.InputFunctionIterator(
      input_fn, worker_device_pairs, [distribute_lib.InputContext()])
def _initialize_multi_worker(self, num_gpus, cluster_spec):
  """Initializes the object for multi-worker training."""
  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._cluster_spec = cluster_spec

  self._workers = []
  for job in ["chief", "worker"]:
    for task in range(len(cluster_spec.as_dict().get(job, []))):
      self._workers.append("/job:%s/task:%d" % (job, task))

  if num_gpus is None:
    raise ValueError("`num_gpus` is required if `cluster_spec` is given.")
  if num_gpus > 0:
    self._worker_devices = [
        (worker, [
            device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
            for gpu in range(num_gpus)
        ]) for worker in self._workers
    ]
  else:
    self._worker_devices = [
        (worker, [device_util.canonicalize(worker, "/device:CPU:0")])
        for worker in self._workers
    ]

  devices = nest.flatten([l for _, l in self._worker_devices])

  # Setting `_default_device` will add a device scope in the
  # distribution.scope. We set the default device to the first worker. When
  # users specify a device under distribution.scope by
  #   with tf.device("/cpu:0"):
  #     ...
  # their ops will end up on the CPU device of the first worker, e.g.
  # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
  self._default_device = self._workers[0]

  assert devices, "Must specify at least one device."
  assert len(set(devices)) == len(devices), (
      "No duplicates allowed in `devices` argument.")
  # TODO(josh11b): Require at least 2 devices?
  self._devices = [device_util.resolve(d) for d in devices]
  self._canonical_device_set = set(self._devices)
  self._device_index = values.PerReplica(
      {d: i for i, d in enumerate(devices)})
def choose_the_best(devices, session_config=None):
  """Find the best CrossDeviceOps locally given a `tf.compat.v1.ConfigProto`.

  Args:
    devices: a list of devices passed to `tf.distribute.Strategy`.
    session_config: a `tf.compat.v1.ConfigProto` or `None`. If `None`, it
      will make the decision based on all logical devices.

  Returns:
    A subclass of `CrossDeviceOps`.
  """
  requested_devices = set(device_util.canonicalize(d) for d in devices)
  if ops.executing_eagerly_outside_functions():
    logical_gpus = context.context().list_logical_devices(device_type="GPU")
    physical_gpus = context.context().list_physical_devices(device_type="GPU")
    if len(logical_gpus) != len(physical_gpus):
      logging.warning("NCCL is not supported when using virtual GPUs, "
                      "falling back to reduction to one device")
      return ReductionToOneDevice()

    machine_devices = context.context().list_logical_devices()
  else:
    machine_devices = device_lib.list_local_devices(
        session_config=session_config)
  using_devices = set()
  for d in machine_devices:
    if device_util.canonicalize(d.name) in requested_devices:
      using_devices.add(d.name)

  if len(using_devices) != len(requested_devices):
    logging.warning(
        "Some requested devices in `tf.distribute.Strategy` are not visible "
        "to TensorFlow: %s", ",".join(list(requested_devices - using_devices)))

  if any("gpu" not in d.lower() for d in requested_devices):
    logging.warning("There are non-GPU devices in `tf.distribute.Strategy`, "
                    "not using nccl allreduce.")
    return ReductionToOneDevice()

  if kernels.get_registered_kernels_for_op("NcclAllReduce"):
    return NcclAllReduce(num_packs=1)
  else:
    logging.warning("Nccl kernel is not found, not using nccl allreduce.")
    return ReductionToOneDevice()
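For callers the function is a one-shot decision: hand it the device list and get back a ready-to-use `CrossDeviceOps`. A hedged usage sketch, assuming the internal module path `tensorflow.python.distribute.cross_device_ops` (end users normally reach this logic via the `cross_device_ops` argument of `tf.distribute.MirroredStrategy`):

from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib

# NcclAllReduce(num_packs=1) when every requested device is a visible GPU
# and the NcclAllReduce kernel is registered; ReductionToOneDevice otherwise.
best = cross_device_ops_lib.choose_the_best(["/gpu:0", "/gpu:1"])
print(type(best).__name__)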
def _initialize_local(self, compute_devices, parameter_device,
                      cluster_resolver=None):
  """Initialize local devices for training."""
  worker_device = device_util.canonicalize("/device:CPU:0")
  self._input_host_device = numpy_dataset.SingleDevice(worker_device)

  if compute_devices is None:
    if not cluster_resolver:
      num_gpus = context.num_gpus()
    else:
      num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
    # Save the num_gpus_per_worker for the configure method, which is used by
    # the contrib version.
    self._num_gpus_per_worker = num_gpus
    compute_devices = device_util.local_devices_from_num_gpus(num_gpus)

  compute_devices = [device_util.canonicalize(d) for d in compute_devices]

  if parameter_device is None:
    # If there is only one GPU, put everything on that GPU. Otherwise, place
    # variables on CPU.
    if len(compute_devices) == 1:
      parameter_device = compute_devices[0]
    else:
      parameter_device = _LOCAL_CPU

  self._input_workers = input_lib.InputWorkers(
      [(worker_device, compute_devices)])

  self._variable_device = parameter_device
  self._compute_devices = compute_devices
  self._parameter_devices = (parameter_device,)
  self._is_chief = True
  self._cluster_spec = None
  self._task_type = None
  self._task_id = None

  logging.info(
      "ParameterServerStrategy (CentralStorageStrategy if you are using a "
      "single machine) with compute_devices = %r, variable_device = %r",
      compute_devices, self._variable_device)
def _is_per_replica(self, result, expected, klass=values.PerReplica):
  self.assertIsInstance(result, klass)
  # We canonicalize the devices to match the device strings returned
  # by PerReplica, which also does device string canonicalization.
  devices = [device_util.canonicalize(_device_str(i))
             for i in range(len(expected))]
  self.assertEqual(set(devices), set(result.devices))
  for i, d in enumerate(devices):
    self.assertEqual(expected[i], result.get(d))
    self.assertEqual(expected[i], result.get(_device_str(i)))
def __init__(self, worker_device_pairs):
  """Initialize an `InputWorkers` object.

  Args:
    worker_device_pairs: A sequence of pairs:
      `(input device, a tuple of compute devices fed by that input device)`.
  """
  self._input_worker_devices = tuple(d for d, _ in worker_device_pairs)
  self._fed_devices = tuple(tuple(device_util.canonicalize(d) for d in f)
                            for _, f in worker_device_pairs)
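Call sites elsewhere in this section (for example the newer `_initialize_local` variants) build this map-free `InputWorkers` directly from worker/device pairs. A small construction sketch under the same assumption about the internal module paths:

from tensorflow.python.distribute import device_util, input_lib

worker_device = device_util.canonicalize("/device:CPU:0")
compute_devices = ("/device:GPU:0", "/device:GPU:1")
# One input worker (the host CPU) feeding both replica devices.
input_workers = input_lib.InputWorkers([(worker_device, compute_devices)])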
def _initialize_local(self, cluster_resolver):
  """Initializes the object for local training."""
  self._is_chief = True
  self._num_workers = 1

  if ops.executing_eagerly_outside_functions():
    try:
      context.context().configure_collective_ops(
          scoped_allocator_enabled_ops=("CollectiveReduce",),
          use_nccl_communication=(self._communication == cross_device_ops_lib
                                  .CollectiveCommunication.NCCL))
    except RuntimeError:
      logging.warning("Collective ops are not configured at program startup. "
                      "Some performance features may not be enabled.")
    self._collective_ops_configured = True

  # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
  # some cases.
  if isinstance(cluster_resolver, TFConfigClusterResolver):
    num_gpus = context.num_gpus()
  else:
    num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

  if num_gpus:
    local_devices = tuple("/device:GPU:%d" % i for i in range(num_gpus))
  else:
    local_devices = ("/device:CPU:0",)
  self._worker_device = device_util.canonicalize("/device:CPU:0")
  self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

  self._collective_keys = cross_device_utils.CollectiveKeys()
  super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
  # TODO(yuefengz): remove num_gpus_per_worker from CollectiveAllReduce.
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus,
      collective_keys=self._collective_keys)

  self._cluster_spec = None
  self._task_type = None
  self._task_id = None

  # This flag tells whether we are running with a standalone client or an
  # independent worker. Right now with a standalone client, the strategy
  # object is created as a local strategy and then turned into a multi-worker
  # strategy via a configure call.
  self._local_or_standalone_client_mode = True

  # Save the num_gpus_per_worker and rpc_layer for the configure method.
  self._num_gpus_per_worker = num_gpus
  self._rpc_layer = cluster_resolver.rpc_layer
  self._warn_nccl_no_gpu()

  logging.info("Single-worker CollectiveAllReduceStrategy with local_devices "
               "= %r, communication = %s", local_devices, self._communication)
def _reduce_to(self, reduce_op, value, destinations):
  if (isinstance(value, values.DistributedValues) or
      tensor_util.is_tensor(value)
     ) and values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
    if reduce_op == reduce_util.ReduceOp.MEAN:
      # TODO(jhseu): Revisit once we support model-parallelism.
      value *= (1. / self._num_replicas_in_sync)
    elif reduce_op != reduce_util.ReduceOp.SUM:
      raise NotImplementedError(
          "Currently only support sum & mean in TPUStrategy.")
    return tpu_ops.cross_replica_sum(value)

  if not isinstance(value, values.DistributedValues):
    # This function handles reducing values that are not PerReplica or
    # Mirrored values. For example, the same value could be present on all
    # replicas in which case `value` would be a single value or value could
    # be 0.
    return cross_device_ops_lib.reduce_non_distributed_value(
        reduce_op, value, destinations, self._num_replicas_in_sync)

  # TODO(cjfj): Detect when it is possible to use `cross_replica_sum`.
  # Always performs the reduction on the TPU host.
  with ops.device(self._host_device):
    output = math_ops.add_n(value.values)
    if reduce_op == reduce_util.ReduceOp.MEAN:
      output *= (1. / len(value.values))

  devices = cross_device_ops_lib.get_devices_from(destinations)

  if len(devices) == 1:
    # If necessary, copy to requested destination.
    dest_canonical = device_util.canonicalize(devices[0])
    host_canonical = device_util.canonicalize(self._host_device)

    if dest_canonical != host_canonical:
      with ops.device(dest_canonical):
        output = array_ops.identity(output)
  else:
    output = cross_device_ops_lib.simple_broadcast(output, destinations)

  return output
def verifyWorkerLocalInstance(self, coordinator, model):
  # Assert that a worker-local resource is captured on each worker.
  for worker in coordinator._cluster.workers:
    with coordinator_context.with_dispatch_context(worker):
      captures = model.use_table.get_concrete_function().captured_inputs
      resource_capture = [t for t in captures if t.dtype == dtypes.resource]
      self.assertNotEmpty(resource_capture)
      for capture in resource_capture:
        self.assertEqual(
            capture.device,
            device_util.canonicalize("/CPU:0", default=worker.device_name))
def choose_the_best(devices, session_config=None):
  """Find the best subclass of CrossDeviceOps given a session config.

  Args:
    devices: a list of devices passed to `tf.distribute.Strategy`.
    session_config: a `tf.ConfigProto` or `None`. If `None`, it will make the
      decision based on all local devices.

  Returns:
    A subclass of `CrossDeviceOps`.
  """
  requested_devices = set([device_util.canonicalize(d) for d in devices])
  machine_devices = device_lib.list_local_devices(
      session_config=session_config)
  using_devices = []
  for d in machine_devices:
    if device_util.canonicalize(d.name) in requested_devices:
      using_devices.append(d)
    else:
      logging.info(
          "Device is available but not used by distribute strategy: %s",
          d.name)

  if len(using_devices) != len(requested_devices):
    logging.warning("Not all devices in `tf.distribute.Strategy` are visible "
                    "to TensorFlow.")
    return ReductionToOneDevice()

  if any(d.device_type.lower() != "gpu" for d in using_devices):
    logging.warning("There are non-GPU devices in `tf.distribute.Strategy`, "
                    "not using all-reduce.")
    return ReductionToOneDevice()

  device_links = [[] for _ in range(len(using_devices))]
  for i, device in enumerate(using_devices):
    for link in device.locality.links.link:
      device_links[i].append(link.device_id)

  return _choose_all_reduce_algorithm(device_links)
def _input_workers_with_options(self, options=None):
  if not options:
    return input_lib.InputWorkers(self._input_workers_devices)
  if (options.experimental_replication_mode ==
      distribute_lib.InputReplicationMode.PER_REPLICA):
    if options.experimental_place_dataset_on_device:
      self._input_workers_devices = (
          tuple((device_util.canonicalize(d, d), (d,)) for d in self._devices))
    else:
      self._input_workers_devices = (
          tuple((device_util.canonicalize("/device:CPU:0", d), (d,))
                for d in self._devices))
    return input_lib.InputWorkers(self._input_workers_devices)
  else:
    if not options.experimental_prefetch_to_device:
      return input_lib.InputWorkers([
          (host_device, (host_device,) * len(compute_devices))
          for host_device, compute_devices in self._input_workers_devices
      ])
    else:
      return input_lib.InputWorkers(self._input_workers_devices)
def choose_the_best(devices, session_config=None):
  """Find the best CrossDeviceOps locally given a `tf.compat.v1.ConfigProto`.

  Args:
    devices: a list of devices passed to `tf.distribute.Strategy`.
    session_config: a `tf.compat.v1.ConfigProto` or `None`. If `None`, it
      will make the decision based on all local devices.

  Returns:
    A subclass of `CrossDeviceOps`.
  """
  requested_devices = set([device_util.canonicalize(d) for d in devices])
  machine_devices = device_lib.list_local_devices(
      session_config=session_config)
  using_devices = set()
  for d in machine_devices:
    if device_util.canonicalize(d.name) in requested_devices:
      using_devices.add(d.name)

  if len(using_devices) != len(requested_devices):
    logging.warning(
        "Some requested devices in `tf.distribute.Strategy` are not visible "
        "to TensorFlow: %s", ",".join(list(requested_devices - using_devices)))
    return ReductionToOneDevice()

  if any("gpu" not in d.lower() for d in using_devices):
    logging.warning("There are non-GPU devices in `tf.distribute.Strategy`, "
                    "not using nccl allreduce.")
    return ReductionToOneDevice()

  if kernels.get_registered_kernels_for_op("NcclAllReduce"):
    return NcclAllReduce(num_packs=1)
  else:
    logging.warning("Nccl kernel is not found, not using nccl allreduce.")
    return ReductionToOneDevice()
def _reduce_to(self, reduce_op, value, destinations):
  if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
    if reduce_op == reduce_util.ReduceOp.MEAN:
      # TODO(jhseu): Revisit once we support model-parallelism.
      value *= (1. / self._num_replicas_in_sync)
    elif reduce_op != reduce_util.ReduceOp.SUM:
      raise NotImplementedError(
          "Currently only support sum & mean in TPUStrategy.")
    return tpu_ops.cross_replica_sum(value)

  if not isinstance(value, values.DistributedValues):
    # This function handles reducing values that are not PerReplica or
    # Mirrored values. For example, the same value could be present on all
    # replicas in which case `value` would be a single value or value could
    # be 0.
    return cross_device_ops_lib.reduce_non_distributed_value(
        reduce_op, self._device_map, value, destinations)

  devices = cross_device_ops_lib.get_devices_from(destinations)
  if len(devices) != 1:
    raise ValueError("Multiple devices are not supported for TPUStrategy")

  # Always performs the reduction on the TPU host.
  with ops.device(self._host_device):
    output = math_ops.add_n(value.values)
    if reduce_op == reduce_util.ReduceOp.MEAN:
      output *= (1. / len(value.values))

  # If necessary, copy to requested destination.
  dest_canonical = device_util.canonicalize(devices[0])
  host_canonical = device_util.canonicalize(self._host_device)

  if dest_canonical != host_canonical:
    with ops.device(devices[0]):
      output = array_ops.identity(output)

  return output
def _reduce_to(self, reduce_op, value, destinations):
  if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
    if reduce_op == reduce_util.ReduceOp.MEAN:
      # TODO(jhseu): Revisit once we support model-parallelism.
      value *= (1. / self._num_replicas_in_sync)
    elif reduce_op != reduce_util.ReduceOp.SUM:
      raise NotImplementedError(
          "Currently only support sum & mean in TPUStrategy.")
    return tpu_ops.cross_replica_sum(value)

  # Validate that the destination is the same as the host device.
  # Note we don't do this when in replicate context, as the reduction is
  # performed on the TPU device itself.
  devices = cross_device_ops_lib.get_devices_from(destinations)
  if len(devices) == 1:
    assert device_util.canonicalize(devices[0]) == device_util.canonicalize(
        self._host_device)
  else:
    raise ValueError("Multiple devices are not supported for TPUStrategy")

  output = math_ops.add_n(value)
  if reduce_op == reduce_util.ReduceOp.MEAN:
    return output * (1. / len(value))
  return output
def testCanonicalizeWithoutDefaultDeviceCollectiveEnabled(self):
  cluster_spec = server_lib.ClusterSpec(
      multi_worker_test_base.create_cluster_spec(
          has_chief=False, num_workers=1, num_ps=0, has_eval=False))
  server_def = tensorflow_server_pb2.ServerDef(
      cluster=cluster_spec.as_cluster_def(),
      job_name="worker",
      task_index=0,
      protocol="grpc",
      port=0)
  context.context().enable_collective_ops(server_def)
  self.assertEqual(device_util.canonicalize("/cpu:0"),
                   "/job:worker/replica:0/task:0/device:CPU:0")
def __init__(self,
             devices,
             group_size,
             collective_keys=None,
             communication=CollectiveCommunication.AUTO):
  """Initializes the object.

  Args:
    devices: a list of device strings to run collectives on.
    group_size: the global group size. For between-graph replicated training
      it's the total number of devices across all workers.
    collective_keys: an optional CollectiveKey object.
    communication: indicates which collective communication to use.
  """
  if group_size % len(devices) > 0:
    raise ValueError("group_size must be divisible by the number of devices.")

  self._devices = tuple(device_util.canonicalize(d) for d in devices)
  self._group_size = group_size
  self._collective_keys = (collective_keys or
                           cross_device_utils.CollectiveKeys())
  self._communication = communication
  # This lock guards all collective launches, i.e. calls to
  # cross_device_utils.build_collective_*.
  #
  # In a multi threaded eager program we need to ensure different groups of
  # collectives don't interleave each other, otherwise there could be
  # deadlocks. E.g. if two user threads both are launching collectives:
  #   user-thread-0  device0  device1
  #   user-thread-1  device0  device1
  # In eager mode, we use one executor per device. Executors use single FIFO
  # queues, so the above launch sequences end up with the following queues:
  #   device-0  collective-0  collective-1
  #   device-1  collective-1  collective-0
  # This deadlocks since neither collective is able to finish.
  self._lock = threading.Lock()

  # Collective ops requires all devices to participate and is blocking. In
  # eager, we need one async executor for each device to be able to launch
  # them altogether. Note that async doesn't imply concurrency. Within an
  # async executor operations are still executed sequentially. In graph or
  # function building, the executors are not used.
  self._executors = []
  for _ in range(len(devices)):
    self._executors.append(executor.new_executor(enable_async=True))

  super(CollectiveAllReduce, self).__init__()
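The queueing argument in the comment is worth making concrete: with one FIFO executor per device, correctness requires every thread to enqueue its per-device launches in the same global order, and the single lock is what enforces that. A generic illustration of the pattern only (the names here are hypothetical, not the TF API):

import threading

_launch_lock = threading.Lock()

def launch_collective(per_device_launch_fns):
  # Holding the lock across *all* per-device enqueues means two concurrent
  # collectives can never interleave their launches, which is exactly the
  # deadlock scenario sketched in the comment above.
  with _launch_lock:
    for launch in per_device_launch_fns:
      launch()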
def testDefaultDeviceInsideFunctionWithScope(self, distribution,
                                             run_functions_eagerly):
  def_function.run_functions_eagerly(run_functions_eagerly)
  expected_device = (device_util.canonicalize("cpu:0")
                     if run_functions_eagerly else "")
  with distribution.scope():
    with ops.device_v2("cpu:0"):

      @def_function.function
      def add():
        one = array_ops.ones([])
        self.assertEqual(expected_device, one.device)
        return one + 1

      add()
def _initialize_local(self, cluster_resolver):
  """Initialize internal devices for local training."""
  worker_device = device_util.canonicalize("/device:CPU:0")
  self._input_host_device = numpy_dataset.SingleDevice(worker_device)

  # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
  # some cases.
  if isinstance(cluster_resolver, TFConfigClusterResolver):
    num_gpus = context.num_gpus()
  else:
    num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

  # Save the num_gpus_per_worker for the configure method.
  self._num_gpus_per_worker = num_gpus

  # Define compute devices, which is a list of device strings, one for each
  # replica. When there are GPUs, replicate operations on these GPUs.
  # Otherwise, place operations on CPU.
  if num_gpus > 0:
    compute_devices = tuple(map("/device:GPU:{}".format, range(num_gpus)))
  else:
    compute_devices = (_LOCAL_CPU,)

  self._device_map = values.ReplicaDeviceMap(compute_devices)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, [(worker_device, compute_devices)])

  # If there is only one GPU, put everything on that GPU. Otherwise, place
  # variables on CPU.
  if num_gpus == 1:
    assert len(compute_devices) == 1
    self._variable_device = _LOCAL_GPU_0
    self._parameter_devices = (_LOCAL_GPU_0,)
  else:
    self._variable_device = _LOCAL_CPU
    self._parameter_devices = (_LOCAL_CPU,)

  self._is_chief = True
  self._cluster_spec = None
  self._task_type = None
  self._task_id = None

  logging.info(
      "ParameterServerStrategy with compute_devices = %r, "
      "variable_device = %r", compute_devices, self._variable_device)
def _initialize_local(self, cluster_resolver):
  """Initializes the object for local training."""
  self._is_chief = True
  self._num_workers = 1

  # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
  # some cases.
  if isinstance(cluster_resolver, TFConfigClusterResolver):
    num_gpus = context.num_gpus()
  else:
    num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

  if num_gpus:
    local_devices = tuple("/device:GPU:%d" % i for i in range(num_gpus))
  else:
    local_devices = ("/device:CPU:0",)
  self._worker_device = device_util.canonicalize("/device:CPU:0")
  self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

  self._collective_keys = cross_device_utils.CollectiveKeys()
  super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
  # TODO(yuefengz): remove num_gpus_per_worker from CollectiveAllReduce.
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus,
      collective_keys=self._collective_keys)

  self._cluster_spec = None
  self._task_type = None
  self._task_id = None

  # This flag tells whether we are running with a standalone client or an
  # independent worker. Right now with a standalone client, the strategy
  # object is created as a local strategy and then turned into a multi-worker
  # strategy via a configure call.
  self._local_or_standalone_client_mode = True

  # Save the num_gpus_per_worker and rpc_layer for the configure method.
  self._num_gpus_per_worker = num_gpus
  self._rpc_layer = cluster_resolver.rpc_layer

  logging.info("CollectiveAllReduceStrategy with local_devices = %r",
               local_devices)
def _start_check_health_thread(self):
  # Allocate group and instance keys before starting the thread to avoid
  # indeterminism. There can only be one thread that assigns group keys and
  # instance keys, otherwise different workers may end up with unmatched keys
  # since the execution order between threads is arbitrary.
  device = device_util.canonicalize(self._worker_device)
  group_key = self._collective_keys.get_group_key([device])
  instance_key = self._collective_keys.get_op_instance_key()
  self._check_health_thread_should_stop = threading.Event()
  # Start the thread as daemon to avoid it blocking the program from exiting.
  # We try our best to shut down the thread, but __del__ is not guaranteed to
  # be called when the program exits.
  self._check_health_thread = threading.Thread(
      target=self._check_health,
      args=(device, group_key, instance_key),
      daemon=True)
  self._check_health_thread.start()
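The same pre-allocation idea applies to any keyed multi-party protocol: acquire shared identifiers deterministically on the main thread, then hand them to the worker thread. A generic sketch of the daemon watchdog pattern used above (all names hypothetical):

import threading

stop_event = threading.Event()

def check_health(device, group_key, instance_key):
  # The keys were allocated on the main thread; this loop only uses them.
  while not stop_event.wait(timeout=10.0):
    pass  # a real implementation would ping peers with a collective here

t = threading.Thread(
    target=check_health, args=("/device:CPU:0", 1, 1), daemon=True)
t.start()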
def _make_dataset_iterator(self, dataset):
  """Make iterator from dataset without splitting the batch.

  This implementation is different from the one in
  `tf.distribute.MirroredStrategy` for purposes of backward compatibility.
  We treat the incoming dataset's batch size as the per-replica batch size.

  Args:
    dataset: `tf.data.Dataset` for input.

  Returns:
    An `InputIterator` which returns inputs for each step of the computation.
  """
  if self._local_mode:
    worker = device_util.canonicalize("/device:CPU:0")
    worker_device_pairs = [(worker, self._devices)]
  else:
    worker_device_pairs = self._worker_devices
  return values.DatasetIterator(dataset, worker_device_pairs)
def testInModelAndCapture(self, source):
  file_path = os.path.join(self.get_temp_dir(), "text_file_initializer")
  model = self.Model(source, file_path)
  func_captures = model.use_table.get_concrete_function(
  ).graph.external_captures
  self.assertLen(func_captures, 2)
  self.assertTrue(
      any(model.table.resource_handle is t for t in func_captures))
  deferred_captures = model.use_table.get_concrete_function(
  ).graph.deferred_external_captures
  self.assertEmpty(deferred_captures)

  strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
      self.cluster_resolver)
  coordinator = coordinator_lib.ClusterCoordinator(strategy)
  with strategy.scope():
    distributed_model = self.Model("value", file_path)
  func_captures = distributed_model.use_table.get_concrete_function(
  ).graph.external_captures
  # One less external capture, since the table handle becomes a closure in
  # the deferred_external_captures.
  self.assertLen(func_captures, 1)
  self.assertFalse(
      any(model.table.resource_handle is t for t in func_captures))
  deferred_captures = distributed_model.use_table.get_concrete_function(
  ).graph.deferred_external_captures
  self.assertNotEmpty(deferred_captures)

  # Assert that a worker-local resource is captured on each worker. The
  # comprehension below already filters for resource dtype.
  for worker in coordinator._cluster.workers:
    with coordinator_context.with_dispatch_context(worker):
      for capture in [
          t for t in distributed_model.use_table.get_concrete_function()
          .captured_inputs if t.dtype == dtypes.resource
      ]:
        self.assertEqual(
            capture.device,
            device_util.canonicalize("/CPU:0", default=worker.device_name))
def testDefaultDeviceInsideFunctionWithScope(self, distribution,
                                             run_functions_eagerly):
  def_function.run_functions_eagerly(run_functions_eagerly)
  try:
    worker = distribution.extended.worker_devices[0]
  except RuntimeError:
    worker = None
  expected_device = (device_util.canonicalize("cpu:0", worker)
                     if run_functions_eagerly else "")
  with distribution.scope():
    with ops.device_v2("cpu:0"):

      @def_function.function
      def add():
        one = array_ops.ones([])
        self.assertEqual(expected_device, one.device)
        return one + 1

      add()
def handle(self):
  if values_util.is_saving_non_distributed():
    return self._primary_handle
  tpu_context = tpu_util.enclosing_tpu_context()
  if tpu_context and not context.executing_eagerly():
    is_mirrored = (self._variables[0].synchronization !=
                   variables_lib.VariableSynchronization.ON_READ)
    if self._packed_handle is None:
      handles = [v.handle for v in self._variables]
      is_packed = False
    else:
      handles = [self._packed_handle]
      is_packed = True
    return tpu_context.get_replicated_var_handle(self._unique_id, handles,
                                                 is_mirrored, is_packed)
  if self._packed_handle is not None and not context.executing_eagerly():
    return self._packed_handle
  device = device_util.canonicalize(device_util.current())
  return self._device_to_handle.get(device, self._primary_handle)
def _make_input_fn_iterator(
    self,
    input_fn,
    replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
  input_contexts = []
  if self._local_mode:
    num_workers = 1
    worker = device_util.canonicalize("/device:CPU:0")
    worker_device_pairs = [(worker, self._devices)]
  else:
    num_workers = len(self._worker_devices)
    worker_device_pairs = self._worker_devices
  for i in range(num_workers):
    input_contexts.append(distribute_lib.InputContext(
        num_input_pipelines=num_workers,
        input_pipeline_id=i,
        num_replicas_in_sync=self._num_replicas_in_sync))
  return values.InputFunctionIterator(
      input_fn, worker_device_pairs, input_contexts)
def _initialize_local(self, compute_devices, parameter_device,
                      cluster_resolver=None):
  """Initialize internal devices for local training."""
  worker_device = device_util.canonicalize("/device:CPU:0")
  self._input_host_device = numpy_dataset.SingleDevice(worker_device)

  if compute_devices is None:
    if not cluster_resolver:
      num_gpus = context.num_gpus()
    else:
      num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
    # Save the num_gpus_per_worker for the configure method, which is used by
    # the contrib version.
    self._num_gpus_per_worker = num_gpus
    compute_devices = device_util.local_devices_from_num_gpus(num_gpus)

  if parameter_device is None:
    # If there is only one GPU, put everything on that GPU. Otherwise, place
    # variables on CPU.
    if len(compute_devices) == 1:
      parameter_device = compute_devices[0]
    else:
      parameter_device = _LOCAL_CPU

  self._device_map = values.ReplicaDeviceMap(compute_devices)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, [(worker_device, compute_devices)])

  self._variable_device = parameter_device
  self._parameter_devices = (parameter_device,)
  self._is_chief = True
  self._cluster_spec = None
  self._task_type = None
  self._task_id = None

  logging.info(
      "ParameterServerStrategy with compute_devices = %r, "
      "variable_device = %r", compute_devices, self._variable_device)
def __init__(self,
             devices,
             group_size,
             collective_keys=None,
             communication=CollectiveCommunication.AUTO):
  """Initializes the object.

  Args:
    devices: a list of device strings to run collectives on.
    group_size: the global group size. For between-graph replicated training
      it's the total number of devices across all workers.
    collective_keys: an optional CollectiveKey object.
    communication: indicates which collective communication to use.
  """
  if group_size % len(devices) > 0:
    raise ValueError("group_size must be divisible by the number of devices.")

  self._devices = tuple(device_util.canonicalize(d) for d in devices)
  self._group_size = group_size
  self._collective_keys = (collective_keys or
                           cross_device_utils.CollectiveKeys())
  self._communication = communication
  # In a multi threaded eager program we need to ensure different groups of
  # collectives don't interleave each other, otherwise there will be
  # deadlocks.
  self._lock = threading.Lock()

  # Collective ops requires all devices to participate and is blocking. In
  # eager, we need one async executor for each device to be able to launch
  # them altogether. Note that async doesn't imply concurrency. Within an
  # async executor operations are still executed sequentially. In graph or
  # function building, the executors are not used.
  self._executors = []
  for _ in range(len(devices)):
    self._executors.append(executor.new_executor(enable_async=True))

  super(CollectiveAllReduce, self).__init__()
def handle(self):
  if values_util.is_saving_non_distributed():
    return self._primary_handle
  tpu_context = tpu_util.enclosing_tpu_context()
  if tpu_context and not context.executing_eagerly():
    is_mirrored = (self._variables[0].synchronization !=
                   variables_lib.VariableSynchronization.ON_READ)
    if self._packed_handle is None:
      handles = [v.handle for v in self._variables]
      is_packed = False
    else:
      handles = [self._packed_handle]
      is_packed = True
    common_name = self._handle_name
    # BaseResourceVariable appends ":0" to the handle name, which makes it
    # not a valid root scope name.
    if ":" in common_name:
      common_name = common_name.split(":")[0]
    return tpu_context.get_replicated_var_handle(common_name, self._unique_id,
                                                 handles, is_mirrored,
                                                 is_packed)
  if self._packed_handle is not None and not context.executing_eagerly():
    return self._packed_handle
  device = device_util.canonicalize(device_util.current())
  return self._device_to_handle.get(device, self._primary_handle)
def _make_dataset_iterator(self, dataset):
  """Make iterator from dataset without splitting the batch."""
  worker = device_util.canonicalize("/device:CPU:0")
  worker_device_pairs = [(worker, [self._device])]
  return values.DatasetIterator(dataset, worker_device_pairs)
def model_fn():
  if 'CPU' in compute_device:
    replica_compute_device = '/device:CPU:0'
  else:
    replica_id = _get_replica_id_integer()
    replica_compute_device = ('/device:GPU:%d' % replica_id)
  replica_compute_device = device_util.canonicalize(replica_compute_device)

  if 'CPU' in variable_device:
    replica_variable_device = '/device:CPU:0'
  else:
    replica_id = _get_replica_id_integer()
    replica_variable_device = ('/device:GPU:%d' % replica_id)
  replica_variable_device = device_util.canonicalize(replica_variable_device)

  a = constant_op.constant(1.0)
  b = constant_op.constant(2.0)
  c = a + b
  self.assertEqual(a.device, replica_compute_device)
  self.assertEqual(b.device, replica_compute_device)
  self.assertEqual(c.device, replica_compute_device)

  # The device scope is ignored for variables but not for normal ops.
  with ops.device('/device:GPU:2'):
    x = variable_scope.get_variable(
        'x', initializer=10.0,
        aggregation=variable_scope.VariableAggregation.SUM)
    x_add = x.assign_add(c)
    e = a + c
    self.assertEqual(
        device_util.canonicalize(x.device), replica_variable_device)
    self.assertEqual(x_add.device, x.device)
    self.assertEqual(e.device, device_util.canonicalize('/device:GPU:2'))

  # The colocate_vars_with can override the distribution's device.
  with d.extended.colocate_vars_with(x):
    y = variable_scope.get_variable(
        'y', initializer=20.0,
        aggregation=variable_scope.VariableAggregation.SUM)
  # We add an identity here to avoid complaints about summing
  # non-distributed values.
  y_add = y.assign_add(array_ops.identity(x_add))
  self.assertEqual(
      device_util.canonicalize(y.device), replica_variable_device)
  self.assertEqual(y_add.device, y.device)
  self.assertEqual(y.device, x.device)

  z = variable_scope.get_variable(
      'z', initializer=10.0,
      aggregation=variable_scope.VariableAggregation.SUM)
  self.assertEqual(
      device_util.canonicalize(z.device), replica_variable_device)

  with ops.control_dependencies([y_add]):
    # We add an identity here to avoid complaints about summing
    # non-distributed values.
    z_add = z.assign_add(array_ops.identity(y))
    with ops.control_dependencies([z_add]):
      f = z + c
  self.assertEqual(f.device, replica_compute_device)

  # The device scope would merge with the default worker device.
  with ops.device('/CPU:1'):
    g = e + 1.0
  self.assertEqual(g.device, device_util.canonicalize('/device:CPU:1'))

  # The ops.colocate_with will be ignored when defining a variable but not
  # for a normal tensor.
  with ops.colocate_with(x):
    u = variable_scope.get_variable('u', initializer=30.0)
    h = f + 1.0
  self.assertEqual(
      device_util.canonicalize(u.device), replica_variable_device)
  self.assertEqual(
      device_util.canonicalize(x.device),
      device_util.canonicalize(h.device))

  return y_add, z_add, f