def testPassPerDevice(self):
  # Passes a `PerDevice` value through a defun-wrapped callable and compares
  # the outcome with the same per-device factors scaled by 1.25.
  # NOTE(review): the 1.25 multiplier presumably comes from the mock model
  # supplied by `_call_and_check` -- confirm against that helper.
  self._skip_eager_if_gpus_less_than(1)

  @function.defun
  def fn1(mock_model, factor):
    return mock_model(factor)

  raw_factors = {"CPU:0": 5.0, "GPU:0": 3.0}
  per_device_factors = values.PerDevice(raw_factors)
  scaled_factors = values.PerDevice(
      {device: factor * 1.25 for device, factor in raw_factors.items()})

  self._call_and_check(fn1, [per_device_factors], scaled_factors, [fn1])
def __init__(self,
             devices=None,
             num_gpus=None,
             cross_tower_ops=None,
             prefetch_on_device=None):
  """Sets up the strategy's device list and related bookkeeping.

  Args:
    devices: Optional list of device strings. Mutually exclusive with
      `num_gpus`.
    num_gpus: Optional number of GPUs; expanded into
      "/device:GPU:<n>" strings. When both `devices` and `num_gpus` are
      None, `context.num_gpus()` is used.
    cross_tower_ops: Stored unchanged on `self._cross_tower_ops`.
    prefetch_on_device: Stored unchanged on `self._prefetch_on_device`.

  Raises:
    ValueError: If both `devices` and `num_gpus` are given.
    AssertionError: If the resulting device list is empty or has duplicates.
  """
  super(MirroredStrategy, self).__init__()

  # Exactly one of `devices` / `num_gpus` may be supplied by the caller.
  if devices is not None:
    if num_gpus is not None:
      raise ValueError("Must only specify one of `devices` and `num_gpus`.")
  else:
    if num_gpus is None:
      num_gpus = context.num_gpus()
    devices = ["/device:GPU:%d" % gpu for gpu in range(num_gpus)]

  assert devices, "Must specify at least one device."
  assert len(set(devices)) == len(devices), (
      "No duplicates allowed in `devices` argument.")
  # TODO(josh11b): Require at least 2 devices?

  self._devices = devices
  self._canonical_device_set = {
      device_util.canonicalize(device) for device in devices
  }
  # Maps every device string to its position in `devices`.
  self._device_index = values.PerDevice(
      {device: position for position, device in enumerate(devices)})
  self._cross_tower_ops = cross_tower_ops
  self._prefetch_on_device = prefetch_on_device
def _initialize_local(self, num_gpus, devices): """Initializes the object for local training.""" self._cluster_spec = None # Convert `num_gpus` into `devices`, shouldn't specify both. if devices is None: if num_gpus is None: num_gpus = context.num_gpus() if num_gpus == 0: devices = ["/device:CPU:0"] else: devices = ["/device:GPU:%d" % d for d in range(num_gpus)] elif num_gpus is not None: raise ValueError( "Must only specify one of `devices` and `num_gpus`.") self._num_gpus = num_gpus # TODO(yuefengz): consider setting the default device. assert devices, "Must specify at least one device." assert len(set(devices)) == len(devices), ( "No duplicates allowed in `devices` argument.") # TODO(josh11b): Require at least 2 devices? self._devices = [device_util.resolve(d) for d in devices] self._canonical_device_set = set(self._devices) self._device_index = values.PerDevice( {d: i for i, d in enumerate(devices)})
def _reduce(self, method_string, value, destinations):
  """Reduces `value` across devices via the cross-tower ops object.

  With exactly one device configured, a value that is not already a
  `values.PerDevice` is wrapped in one keyed by that device.

  Args:
    method_string: Forwarded unchanged to the cross-tower ops `reduce`.
    value: A `values.PerDevice`, or any value when only one device is used.
    destinations: Forwarded as the `destinations` keyword argument.

  Returns:
    Whatever `self._get_cross_tower_ops().reduce(...)` returns.

  Raises:
    AssertionError: If `value` is not a `values.PerDevice` after the optional
      single-device wrapping.
  """
  single_device = len(self._devices) == 1
  if single_device and not isinstance(value, values.PerDevice):
    only_device = self._devices[0]
    value = values.PerDevice({only_device: value})
  assert isinstance(value, values.PerDevice)

  reducer = self._get_cross_tower_ops()
  return reducer.reduce(method_string, value, destinations=destinations)
def testContainsIndexedSlices_PerDevice(self):
  # `contains_indexed_slices` must report True for a `PerDevice` whose
  # per-device values were produced by `math_ops._as_indexed_slices`.
  gpu_dense = constant_op.constant([[1., 2.], [0, 0], [3., 4.]])
  gpu_slices = math_ops._as_indexed_slices(gpu_dense)
  cpu_dense = constant_op.constant([[0., 0.], [5, 6], [7., 8.]])
  cpu_slices = math_ops._as_indexed_slices(cpu_dense)

  wrapped = value_lib.PerDevice({"/gpu:0": gpu_slices, "/cpu:0": cpu_slices})
  found = cross_tower_utils.contains_indexed_slices(wrapped)
  self.assertTrue(found)
def _make_per_device(values, devices):
  """Builds a `PerDevice` holding one identity copy of a value per device.

  Args:
    values: Sequence of values, one per device.
    devices: Passed through `cross_tower_ops_lib._get_devices_from` to obtain
      the device list.

  Returns:
    A `value_lib.PerDevice` keyed by device, each entry being
    `array_ops.identity` of the matching value, created under that device's
    scope.

  Raises:
    AssertionError: If the number of values differs from the number of devices.
  """
  device_list = cross_tower_ops_lib._get_devices_from(devices)
  assert len(values) == len(device_list)

  def _place_on(device, tensor):
    # Create the identity op inside the device scope.
    with ops.device(device):
      return array_ops.identity(tensor)

  placed = {
      device: _place_on(device, tensor)
      for device, tensor in zip(device_list, values)
  }
  return value_lib.PerDevice(placed)
def map(self, map_over, fn, *args, **kwargs):
  """Applies `fn` to every element of `map_over`, cycling through devices.

  Element number `i` is processed under the scope of
  `self._devices[i % len(self._devices)]`. `args` and `kwargs` are passed
  through `values.select_device_mirrored` for that device before being
  handed to `fn`.

  Args:
    map_over: Iterable of elements to process.
    fn: Callable invoked as `fn(element, *args, **kwargs)`.
    *args: Extra positional arguments for `fn`.
    **kwargs: Extra keyword arguments for `fn`.

  Returns:
    A `values.PerDevice` mapping each device that received at least one
    element to a `values.MapOutput` wrapping that device's list of results.
  """
  # TODO(josh11b): In eager mode, use one thread per device.
  results_by_device = {}
  for position, element in enumerate(map_over):
    device = self._devices[position % len(self._devices)]
    with ops.device(device):
      device_args = values.select_device_mirrored(device, args)
      device_kwargs = values.select_device_mirrored(device, kwargs)
      outcome = fn(element, *device_args, **device_kwargs)
      results_by_device.setdefault(device, []).append(outcome)

  # TODO(josh11b): Need a values.regroup equivalent that handles MapOutput
  # in addition to PerDevice data.
  wrapped = {}
  for device, outcomes in results_by_device.items():
    wrapped[device] = values.MapOutput(outcomes)
  return values.PerDevice(wrapped)
def testSimpleReduceWithIndexedSlices(self):
  # Sums two IndexedSlices living on different devices onto the first device
  # via `_simple_reduce` and compares against two expected forms.
  devices = ["/cpu:0", "/gpu:0"]
  target_device, other_device = devices
  dense_shape = [5, 2]

  first = _make_indexed_slices([[1., 2.]], [1], dense_shape, target_device)
  second = _make_indexed_slices(
      [[3., 4.], [5., 6.]], [1, 3], dense_shape, other_device)
  per_device = value_lib.PerDevice({target_device: first, other_device: second})

  result = cross_tower_ops_lib._simple_reduce(
      per_device, target_device, math_ops.add_n, "sum")

  # Test that the result is semantically equal to both the concatenated
  # IndexedSlices with and without duplicate indices.
  expected_with_dups = _make_indexed_slices(
      [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], dense_shape, target_device)
  expected_without_dups = _make_indexed_slices(
      [[4., 6.], [5., 6.]], [1, 3], dense_shape, target_device)
  for expected in (expected_with_dups, expected_without_dups):
    self._assert_indexed_slices_equal(expected, result)
def _initialize_multi_worker(self, num_gpus, cluster_spec):
  """Initializes the object for multi-worker training.

  Args:
    num_gpus: Number of GPUs per worker; required. When it is not greater
      than 0 each worker contributes a single CPU device instead.
    cluster_spec: Cluster description; normalized with
      `multi_worker_util.normalize_cluster_spec` before use.

  Raises:
    ValueError: If `num_gpus` is None.
    AssertionError: If the resulting device list is empty or has duplicates.
  """
  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._cluster_spec = cluster_spec

  # One "/job:<job>/task:<n>" entry per task of the "chief" and "worker" jobs.
  self._workers = [
      "/job:%s/task:%d" % (job, task)
      for job in ["chief", "worker"]
      for task in range(len(cluster_spec.as_dict().get(job, [])))
  ]

  if num_gpus is None:
    raise ValueError("`num_gpus` is required if `cluster_spec` is given.")

  if num_gpus > 0:

    def _devices_of(worker):
      return [
          device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
          for gpu in range(num_gpus)
      ]
  else:

    def _devices_of(worker):
      return [device_util.canonicalize(worker, "/device:CPU:0")]

  self._worker_device_map = {
      worker: _devices_of(worker) for worker in self._workers
  }
  devices = nest.flatten(self._worker_device_map)

  # Setting `_default_device` will add a device scope in the
  # distribution.scope. We set the default device to the first worker. When
  # users specify device under distribution.scope by
  #   with tf.device("/cpu:0"):
  #     ...
  # their ops will end up on the cpu device of its first worker, e.g.
  # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
  self._default_device = self._workers[0]

  assert devices, "Must specify at least one device."
  assert len(set(devices)) == len(devices), (
      "No duplicates allowed in `devices` argument.")
  # TODO(josh11b): Require at least 2 devices?

  resolved_devices = [device_util.resolve(device) for device in devices]
  self._devices = resolved_devices
  self._canonical_device_set = set(resolved_devices)
  # The index is keyed by the flattened device strings, not the resolved ones.
  self._device_index = values.PerDevice(
      {device: position for position, device in enumerate(devices)})
def _make_per_device(values, devices, regroup=False):
  """Builds a `PerDevice` holding one identity copy of a value per device.

  Args:
    values: Sequence of values, one per device.
    devices: Passed through `cross_tower_ops_lib.get_devices_from` to obtain
      the device list.
    regroup: When True and there is exactly one value, the bare placed tensor
      is returned instead of a `PerDevice` wrapper.

  Returns:
    A `value_lib.PerDevice` keyed by device, or a single placed tensor in the
    one-value `regroup` case.

  Raises:
    AssertionError: If the number of values differs from the number of devices.
  """
  device_list = cross_tower_ops_lib.get_devices_from(devices)
  assert len(values) == len(device_list)

  def _place_on(device, tensor):
    # Create the identity op inside the device scope.
    with ops.device(device):
      return array_ops.identity(tensor)

  # We simulate the result of regroup called on PerDevice which strips the
  # PerDevice wrapper if it has only one value.
  if regroup and len(values) == 1:
    return _place_on(device_list[0], values[0])

  placed = {
      device: _place_on(device, tensor)
      for device, tensor in zip(device_list, values)
  }
  return value_lib.PerDevice(placed)
def _make_tensor_into_per_device(input_tensor): """Converts a single tensor into a PerDevice object.""" if isinstance(input_tensor, (tuple, list)): raise ValueError( "Cannot convert `input_tensor` to a `PerDevice` object, " "got %r but expected a object that is not a tuple or list." % (input_tensor, )) if isinstance(input_tensor, value_lib.PerDevice): return input_tensor try: device = input_tensor.device except AttributeError: raise ValueError( "Cannot convert `input_tensor` to a `PerDevice` object " "because it doesn't have device set.") return value_lib.PerDevice({device: input_tensor})
def map(self, map_over, fn, *args, **kwargs):
  """Applies `fn` to every element of `map_over`, cycling through devices.

  Element number `i` is processed under the scope of
  `self._devices[i % len(self._devices)]`. `args` and `kwargs` are passed
  through `values.select_device_mirrored` for that device before being
  handed to `fn`.

  Args:
    map_over: Iterable of elements to process.
    fn: Callable invoked as `fn(element, *args, **kwargs)`.
    *args: Extra positional arguments for `fn`.
    **kwargs: Extra keyword arguments for `fn`.

  Returns:
    A `values.PerDevice` mapping each device that received at least one
    element to a `values.MapOutput` wrapping that device's list of results.
  """
  # TODO (josh11b): In eager mode, use one thread per device. id:1098
  # https://github.com/imdone/tensorflow/issues/1099
  index = {}
  # `enumerate` supplies the running element position. A counter that is
  # never advanced would make the modulo below always select the first
  # device, placing every element there.
  for i, m in enumerate(map_over):
    d = self._devices[i % len(self._devices)]
    with ops.device(d):
      l = index.get(d, [])
      l.append(
          fn(m, *values.select_device_mirrored(d, args),
             **values.select_device_mirrored(d, kwargs)))
      index[d] = l
  # TODO (josh11b): Need a values.regroup equivalent that handles MapOutput
  # in addition to PerDevice data. id:1079
  # https://github.com/imdone/tensorflow/issues/1080
  return values.PerDevice(
      {k: values.MapOutput(v) for k, v in index.items()})
def testIndexedSlicesAllReduce(self, cross_tower_ops_instance, aggregation,
                               batch_reduce):
  # Reduces two IndexedSlices placed on different devices, through either
  # `reduce` or `batch_reduce`, and compares against mirrored expectations
  # for SUM or MEAN aggregation.
  devices = ["/cpu:0", "/gpu:0"]
  first_device, second_device = devices
  dense_shape = [5, 2]

  first = _make_indexed_slices([[1., 2.]], [1], dense_shape, first_device)
  second = _make_indexed_slices(
      [[3., 4.], [5., 6.]], [1, 3], dense_shape, second_device)
  per_device = value_lib.PerDevice({first_device: first, second_device: second})

  if batch_reduce:
    result = cross_tower_ops_instance.batch_reduce(
        aggregation, [(per_device, devices)])
  else:
    result = cross_tower_ops_instance.reduce(aggregation, per_device, devices)

  indices_with_dups = [1, 1, 3]
  indices_without_dups = [1, 3]

  if aggregation == vs.VariableAggregation.SUM:
    values_with_dups = [[1., 2.], [3., 4.], [5., 6.]]
    values_without_dups = [[4., 6.], [5., 6.]]
  else:
    assert aggregation == vs.VariableAggregation.MEAN
    values_with_dups = [[0.5, 1.], [1.5, 2.], [2.5, 3.]]
    values_without_dups = [[2., 3.], [2.5, 3.]]

  mirrored_with_dups = _make_mirrored_indexed_slices(
      devices, values_with_dups, indices_with_dups, dense_shape)
  mirrored_without_dups = _make_mirrored_indexed_slices(
      devices, values_without_dups, indices_without_dups, dense_shape)

  # Test that the result is semantically equal to both the concatenated
  # IndexedSlices, as well as when the duplicate indices are summed up.
  expectations = [mirrored_with_dups, mirrored_without_dups]
  if batch_reduce:
    # `batch_reduce` was given a one-element list, so wrap each expectation
    # in a one-element list as well.
    expectations = [[expected] for expected in expectations]
  for expected in expectations:
    self._assert_values_equal(expected, result)
def __init__(self, tpu_cluster_resolver, steps_per_run, num_cores=None):
  """Initializes the TPUStrategy object.

  Args:
    tpu_cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver,
      which provides information about the TPU cluster.
    steps_per_run: Number of steps to run on device before returning to the
      host. Note that this can have side-effects on performance, hooks,
      metrics, summaries etc.
      This parameter is only used when Distribution Strategy is used with
      estimator or keras.
    num_cores: Number of cores to use on the TPU. If None specified, then
      auto-detect the cores and topology of the TPU system.
  """
  # TODO(sourabhbajaj): OneDeviceStrategy should be initialized with the
  # master node fetched from the cluster resolver.
  super(TPUStrategy, self).__init__("/device:CPU:0")

  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
  # TODO(sourabhbajaj): Change this from num_cores to metadata_override
  self._num_cores_override = num_cores

  # TODO(jhseu): Switch to DeviceAssignment to support pods and model
  # parallelism.
  # Record, for every TPU device, its position in the full metadata list.
  tpu_positions = {}
  for position, device in enumerate(self._tpu_metadata.devices):
    if "device:TPU:" in device.name:
      tpu_positions[device.name] = position
  self._device_index = values.PerDevice(tpu_positions)

  # `_tpu_devices` is assigned before `self.num_towers` is read, and only
  # then trimmed.
  self._tpu_devices = sorted(tpu_positions)
  # Only create variables for the number of towers we're running.
  tower_count = self.num_towers
  self._tpu_devices = self._tpu_devices[:tower_count]

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run

  self._require_static_shapes = True
def __init__(self,
             devices=None,
             num_gpus=None,
             cluster_spec=None,
             cross_tower_ops=None,
             prefetch_on_device=None):
  """Sets up the strategy for local or multi-worker use.

  Args:
    devices: Optional list of device strings for local use. Must be None when
      `cluster_spec` is given, and is mutually exclusive with `num_gpus`
      otherwise.
    num_gpus: Number of GPUs. Required when `cluster_spec` is given (GPUs per
      worker; a value not greater than 0 selects one CPU device per worker).
      For local use, when both `devices` and `num_gpus` are None,
      `context.num_gpus()` is used.
    cluster_spec: Optional dict, `cluster_pb2.ClusterDef` or
      `server_lib.ClusterSpec`. A truthy value selects multi-worker setup.
    cross_tower_ops: Stored unchanged on `self._cross_tower_ops`.
    prefetch_on_device: Stored unchanged on `self._prefetch_on_device`.

  Raises:
    ValueError: On conflicting arguments, an unsupported `cluster_spec` type,
      or a missing `num_gpus` when `cluster_spec` is given.
    AssertionError: If the resulting device list is empty or has duplicates.
  """
  super(MirroredStrategy, self).__init__()

  if not cluster_spec:
    self._cluster_spec = None
    # Exactly one of `devices` / `num_gpus` may be supplied by the caller.
    if devices is not None:
      if num_gpus is not None:
        raise ValueError(
            "Must only specify one of `devices` and `num_gpus`.")
    else:
      if num_gpus is None:
        num_gpus = context.num_gpus()
      devices = ["/device:GPU:%d" % gpu for gpu in range(num_gpus)]
    # TODO(yuefengz): consider setting the default device.
  else:
    if devices is not None:
      raise ValueError(
          "Specifying devices when `cluster_spec` is also given "
          "is not supported in MirroredStrategy.")

    # TODO(yuefengz): use the utility method to normalize cluster_spec.
    if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
      cluster_spec = server_lib.ClusterSpec(cluster_spec)
    elif not isinstance(cluster_spec, server_lib.ClusterSpec):
      raise ValueError(
          "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
          "`tf.train.ClusterDef` object")
    self._cluster_spec = cluster_spec

    # One "/job:<job>/task:<n>" entry per task, jobs in sorted order.
    self._workers = [
        "/job:%s/task:%d" % (job, task)
        for job in sorted(cluster_spec.jobs)
        for task in range(cluster_spec.num_tasks(job))
    ]

    if num_gpus is None:
      raise ValueError("`num_gpus` is required if `cluster_spec` is given.")
    self._num_gpus = num_gpus

    if num_gpus > 0:

      def _devices_of(worker):
        return [
            device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
            for gpu in range(num_gpus)
        ]
    else:

      def _devices_of(worker):
        return [device_util.canonicalize(worker, "/device:CPU:0")]

    self._worker_device_map = {
        worker: _devices_of(worker) for worker in self._workers
    }
    devices = nest.flatten(self._worker_device_map)

    # Setting `_default_device` will add a device scope in the
    # distribution.scope. We set the default device to the first worker. When
    # users specify device under distribution.scope by
    #   with tf.device("/cpu:0"):
    #     ...
    # their ops will end up on the cpu device of its first worker, e.g.
    # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
    self._default_device = self._workers[0]

  assert devices, "Must specify at least one device."
  assert len(set(devices)) == len(devices), (
      "No duplicates allowed in `devices` argument.")
  # TODO(josh11b): Require at least 2 devices?

  resolved_devices = [device_util.resolve(device) for device in devices]
  self._devices = resolved_devices
  self._canonical_device_set = set(resolved_devices)
  # The index is keyed by the unresolved device strings.
  self._device_index = values.PerDevice(
      {device: position for position, device in enumerate(devices)})
  self._cross_tower_ops = cross_tower_ops
  self._prefetch_on_device = prefetch_on_device