def __init__(self, container_strategy, tpu_cluster_resolver, steps_per_run,
             num_cores=None):
  super(TPUExtended, self).__init__(container_strategy)
  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
  # TODO(sourabhbajaj): Change this from num_cores to metadata_override
  self._num_cores_override = num_cores

  # TODO(jhseu): Switch to DeviceAssignment to support pods and model
  # parallelism.
  device_map = {d.name: i for i, d in enumerate(self._tpu_metadata.devices)
                if "device:TPU:" in d.name}
  self._device_index = values.PerReplica(device_map)
  self._host_device = self.get_host_cpu_device(0)
  self._tpu_devices = sorted(device_map.keys())
  # Only create variables for the number of replicas we're running.
  self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run
  self._require_static_shapes = True
def _initialize_local(self, num_gpus, devices):
  """Initializes the object for local training."""
  self._cluster_spec = None
  # Convert `num_gpus` into `devices`; the caller shouldn't specify both.
  if devices is None:
    if num_gpus is None:
      num_gpus = context.num_gpus()
    if num_gpus == 0:
      devices = ["/device:CPU:0"]
    else:
      devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
  elif num_gpus is not None:
    raise ValueError("Must only specify one of `devices` and `num_gpus`.")
  self._num_gpus = num_gpus
  # TODO(yuefengz): consider setting the default device.

  assert devices, "Must specify at least one device."
  assert len(set(devices)) == len(devices), (
      "No duplicates allowed in `devices` argument.")
  # TODO(josh11b): Require at least 2 devices?
  self._devices = [device_util.resolve(d) for d in devices]
  self._canonical_device_set = set(self._devices)
  self._device_index = values.PerReplica({d: i for i, d in enumerate(devices)})
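# A minimal sketch of the device-inference behavior above, assuming a machine
# where context.num_gpus() reports 2. The call traces below are illustrative,
# not part of the original module:
#
#   _initialize_local(num_gpus=None, devices=None)
#       -> devices == ["/device:GPU:0", "/device:GPU:1"]
#   _initialize_local(num_gpus=0, devices=None)
#       -> devices == ["/device:CPU:0"]
#   _initialize_local(num_gpus=1, devices=["/device:GPU:0"])
#       -> ValueError: Must only specify one of `devices` and `num_gpus`.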
def testContainsIndexedSlices_PerReplica(self):
  t0 = math_ops._as_indexed_slices(
      constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
  t1 = math_ops._as_indexed_slices(
      constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
  per_replica = value_lib.PerReplica({"/gpu:0": t0, "/cpu:0": t1})
  self.assertTrue(cross_tower_utils.contains_indexed_slices(per_replica))
def map(self, map_over, fn, *args, **kwargs):
  # TODO(josh11b): In eager mode, use one thread per device.
  index = {}
  for i, m in enumerate(map_over):
    d = self._devices[i % len(self._devices)]
    with ops.device(d):
      l = index.get(d, [])
      l.append(fn(m,
                  *values.select_device_mirrored(d, args),
                  **values.select_device_mirrored(d, kwargs)))
      index[d] = l
  # TODO(josh11b): Need a values.regroup equivalent that handles MapOutput
  # in addition to PerReplica data.
  return values.PerReplica({k: values.MapOutput(v) for k, v in index.items()})
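# A sketch of how `map` distributes work, assuming an extended object `ext`
# with self._devices == ["/gpu:0", "/gpu:1"] (names are illustrative):
#
#   ext.map([a, b, c, d], fn)
#
# places fn(a) and fn(c) on "/gpu:0", and fn(b) and fn(d) on "/gpu:1"
# (round-robin by index), then returns a PerReplica mapping each device to a
# MapOutput holding that device's list of results.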
def _make_tensor_into_per_replica(input_tensor):
  """Converts a single tensor into a PerReplica object."""
  if isinstance(input_tensor, (tuple, list)):
    raise ValueError("Cannot convert `input_tensor` to a `PerReplica` object, "
                     "got %r but expected an object that is not a tuple or "
                     "list." % (input_tensor,))
  if isinstance(input_tensor, value_lib.PerReplica):
    return input_tensor
  try:
    device = input_tensor.device
  except AttributeError:
    raise ValueError("Cannot convert `input_tensor` to a `PerReplica` object "
                     "because it doesn't have a device set.")
  return value_lib.PerReplica({device: input_tensor})
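# A minimal usage sketch for _make_tensor_into_per_replica, assuming the
# standard TF internal imports; the device string is illustrative.
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import ops

with ops.device("/device:CPU:0"):
  t = constant_op.constant([1., 2.])
# Wraps the tensor in a single-entry PerReplica keyed by the tensor's
# `.device` string.
per_replica = _make_tensor_into_per_replica(t)
# Passing a tuple or list raises ValueError; passing an existing PerReplica
# returns it unchanged.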
def testSimpleReduceWithIndexedSlices(self):
  devices = ["/cpu:0", "/gpu:0"]
  t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
  t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
  per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1})
  result = cross_tower_ops_lib._simple_reduce(
      per_replica, devices[0], math_ops.add_n, vs.VariableAggregation.SUM)

  # Test that the result is semantically equal to both the concatenated
  # IndexedSlices with and without duplicate indices.
  total_with_dups = _make_indexed_slices(
      [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], devices[0])
  total_without_dups = _make_indexed_slices(
      [[4., 6.], [5., 6.]], [1, 3], [5, 2], devices[0])
  self._assert_indexed_slices_equal(total_with_dups, result)
  self._assert_indexed_slices_equal(total_without_dups, result)
def _make_per_replica(values, devices, regroup=False):
  devices = cross_tower_ops_lib.get_devices_from(devices)
  assert len(values) == len(devices)

  # We simulate the result of regroup called on a PerReplica, which strips
  # the PerReplica wrapper if it has only one value.
  if len(values) == 1 and regroup:
    with ops.device(devices[0]):
      placed_v = array_ops.identity(values[0])
    return placed_v

  index = {}
  for d, v in zip(devices, values):
    with ops.device(d):
      placed_v = array_ops.identity(v)
    index[d] = placed_v
  return value_lib.PerReplica(index)
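# A usage sketch for _make_per_replica, assuming constant_op is imported as in
# the tests above; the devices and values are illustrative.
vals = [constant_op.constant(1.), constant_op.constant(2.)]
per_replica = _make_per_replica(vals, ["/cpu:0", "/gpu:0"])
# per_replica maps each device to an identity copy of the corresponding value.
single = _make_per_replica([constant_op.constant(3.)], ["/cpu:0"],
                           regroup=True)
# With regroup=True and a single value, the PerReplica wrapper is stripped and
# the bare placed tensor is returned instead.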
def _initialize_multi_worker(self, num_gpus, cluster_spec):
  """Initializes the object for multi-worker training."""
  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._cluster_spec = cluster_spec

  self._workers = []
  for job in ["chief", "worker"]:
    for task in range(len(cluster_spec.as_dict().get(job, []))):
      self._workers.append("/job:%s/task:%d" % (job, task))

  if num_gpus is None:
    raise ValueError("`num_gpus` is required if `cluster_spec` is given.")
  if num_gpus > 0:
    self._worker_devices = [
        (worker, [
            device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
            for gpu in range(num_gpus)
        ]) for worker in self._workers
    ]
  else:
    self._worker_devices = [
        (worker, [device_util.canonicalize(worker, "/device:CPU:0")])
        for worker in self._workers
    ]
  devices = nest.flatten([l for _, l in self._worker_devices])

  # Setting `_default_device` will add a device scope in the
  # distribution.scope. We set the default device to the first worker. When
  # users specify a device under distribution.scope by
  #   with tf.device("/cpu:0"):
  #     ...
  # their ops will end up on the CPU device of the first worker, e.g.
  # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
  self._default_device = self._workers[0]

  assert devices, "Must specify at least one device."
  assert len(set(devices)) == len(devices), (
      "No duplicates allowed in `devices` argument.")
  # TODO(josh11b): Require at least 2 devices?
  self._devices = [device_util.resolve(d) for d in devices]
  self._canonical_device_set = set(self._devices)
  self._device_index = values.PerReplica({d: i for i, d in enumerate(devices)})
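# An example expansion of the logic above, assuming
# cluster_spec == {"worker": ["host0:2222", "host1:2222"]} and num_gpus == 2
# (host names are illustrative):
#
#   self._workers == ["/job:worker/task:0", "/job:worker/task:1"]
#   self._worker_devices[0] ==
#       ("/job:worker/task:0",
#        ["/job:worker/task:0/device:GPU:0",
#         "/job:worker/task:0/device:GPU:1"])
#
# With num_gpus == 0, each worker instead contributes a single
# ".../device:CPU:0" entry.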
def testIndexedSlicesAllReduce(self, cross_tower_ops_instance, reduce_op,
                               batch_reduce):
  devices = ["/cpu:0", "/gpu:0"]
  dense_shape = [5, 2]
  t0 = _make_indexed_slices([[1., 2.]], [1], dense_shape, devices[0])
  t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], dense_shape,
                            devices[1])
  per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1})

  if batch_reduce:
    result = cross_tower_ops_instance.batch_reduce(reduce_op,
                                                   [(per_replica, devices)])
  else:
    result = cross_tower_ops_instance.reduce(reduce_op, per_replica, devices)

  total_indices_with_dups = [1, 1, 3]
  total_indices_without_dups = [1, 3]
  if reduce_op == reduce_util.ReduceOp.SUM:
    total_values_with_dups = [[1., 2.], [3., 4.], [5., 6.]]
    total_values_without_dups = [[4., 6.], [5., 6.]]
  else:
    assert reduce_op == reduce_util.ReduceOp.MEAN
    total_values_with_dups = [[0.5, 1.], [1.5, 2.], [2.5, 3.]]
    total_values_without_dups = [[2., 3.], [2.5, 3.]]

  total_mirrored_with_dups = _make_mirrored_indexed_slices(
      devices, total_values_with_dups, total_indices_with_dups, dense_shape)
  total_mirrored_without_dups = _make_mirrored_indexed_slices(
      devices, total_values_without_dups, total_indices_without_dups,
      dense_shape)

  # Test that the result is semantically equal to both the concatenated
  # IndexedSlices, as well as when the duplicate indices are summed up.
  if batch_reduce:
    total_mirrored_with_dups = [total_mirrored_with_dups]
    total_mirrored_without_dups = [total_mirrored_without_dups]
  self._assert_values_equal(total_mirrored_with_dups, result)
  self._assert_values_equal(total_mirrored_without_dups, result)
def __init__(self, tpu_cluster_resolver, steps_per_run, num_cores=None):
  """Initializes the TPUStrategy object.

  Args:
    tpu_cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver,
      which provides information about the TPU cluster.
    steps_per_run: Number of steps to run on device before returning to the
      host. Note that this can have side-effects on performance, hooks,
      metrics, summaries, etc. This parameter is only used when Distribution
      Strategy is used with estimator or keras.
    num_cores: Number of cores to use on the TPU. If None, the cores and
      topology of the TPU system are auto-detected.
  """
  super(TPUStrategy, self).__init__()

  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
  # TODO(sourabhbajaj): Change this from num_cores to metadata_override
  self._num_cores_override = num_cores

  # TODO(jhseu): Switch to DeviceAssignment to support pods and model
  # parallelism.
  device_map = {d.name: i for i, d in enumerate(self._tpu_metadata.devices)
                if "device:TPU:" in d.name}
  self._device_index = values.PerReplica(device_map)
  self._host_device = self.get_host_cpu_device(0)
  self._tpu_devices = sorted(device_map.keys())
  # Only create variables for the number of replicas we're running.
  self._tpu_devices = self._tpu_devices[:self.num_replicas_in_sync]

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run
  self._require_static_shapes = True
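# A construction sketch based on the docstring above; the contrib import
# paths match the TF 1.x era this code is from, and the TPU address and
# steps_per_run value are placeholders, not recommendations.
from tensorflow.contrib.cluster_resolver import TPUClusterResolver
from tensorflow.contrib.distribute import TPUStrategy

resolver = TPUClusterResolver(tpu="grpc://10.0.0.1:8470")
strategy = TPUStrategy(resolver, steps_per_run=100)
# num_cores is left as None so the cores and topology are auto-detected.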