def _reduce(self, aggregation, value, destinations):
  graph = ops.get_default_graph()
  cf_context = graph._get_control_flow_context()  # pylint: disable=protected-access
  # If we're inside a TPUReplicateContext, reduction should be done using
  # CrossReplicaSum; outside of one we can directly use an add_n op.
  while cf_context:
    if isinstance(cf_context, tpu.TPUReplicateContext):
      if aggregation == vs.VariableAggregation.MEAN:
        # TODO(jhseu): Revisit once we support model-parallelism.
        value *= (1. / self.num_towers)
      return tpu_ops.cross_replica_sum(value)
    cf_context = cf_context.outer_context

  # Validate that the destination is the same as the host device.
  # Note we don't do this when in replicate context as the reduction is
  # performed on the TPU device itself.
  devices = cross_tower_ops_lib.get_devices_from(destinations)
  if len(devices) == 1:
    assert device_util.canonicalize(
        devices[0]) == device_util.canonicalize(self._host)
  else:
    raise ValueError("Multiple devices are not supported for TPUStrategy")

  output = math_ops.add_n(value)
  if aggregation == vs.VariableAggregation.MEAN:
    return output * (1. / len(value))
  return output

def __call__(self, op):
    if get_tf_version_tuple() >= (1, 8):
        from tensorflow.python.training.device_util import canonicalize
    else:
        def canonicalize(name):  # tensorflow/tensorflow#11484
            return tf.DeviceSpec.from_string(name).to_string()

    if op.device:
        return op.device
    if op.type not in ['Variable', 'VariableV2']:
        return canonicalize(self.worker_device)

    # Place the variable on the parameter-server device with the least
    # bytes assigned so far.
    device_index, _ = min(enumerate(self.ps_sizes),
                          key=operator.itemgetter(1))
    device_name = self.ps_devices[device_index]
    var_size = op.outputs[0].get_shape().num_elements()
    if var_size is None:
        logger.warn("[LeastLoadedDeviceSetter] Shape of variable {} is not "
                    "fully defined!".format(op.name))
        var_size = 0
    self.ps_sizes[device_index] += var_size
    return canonicalize(device_name)

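# Hypothetical usage of the setter above as a device function (the constructor
# arguments are inferred from the attributes it references; only __call__ is
# shown in this snippet):
#   setter = LeastLoadedDeviceSetter(worker_device='/gpu:0',
#                                    ps_devices=['/cpu:0'])
#   with tf.device(setter):
#       ...  # variables are placed on the least-loaded ps device
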
def _reduce(self, aggregation, value, destinations):
  if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
    if aggregation == vs.VariableAggregation.MEAN:
      # TODO(jhseu): Revisit once we support model-parallelism.
      value *= (1. / self.num_towers)
    elif aggregation != vs.VariableAggregation.SUM:
      raise NotImplementedError(
          "Currently only support sum & mean in TPUStrategy.")
    return tpu_ops.cross_replica_sum(value)

  # Validate that the destination is the same as the host device.
  # Note we don't do this when in replicate context as the reduction is
  # performed on the TPU device itself.
  devices = cross_tower_ops_lib.get_devices_from(destinations)
  if len(devices) == 1:
    assert device_util.canonicalize(devices[0]) == device_util.canonicalize(
        self.get_host_cpu_device(0))
  else:
    raise ValueError("Multiple devices are not supported for TPUStrategy")

  if aggregation == vs.VariableAggregation.ONLY_FIRST_TOWER:
    return value[0]
  output = math_ops.add_n(value)
  if aggregation == vs.VariableAggregation.MEAN:
    return output * (1. / len(value))
  return output

def testCanonicalizeWithoutDefaultDevice(self):
  self.assertEqual(device_util.canonicalize("/cpu:0"),
                   "/replica:0/task:0/device:CPU:0")
  self.assertEqual(device_util.canonicalize("/job:worker/cpu:0"),
                   "/job:worker/replica:0/task:0/device:CPU:0")
  self.assertEqual(device_util.canonicalize("/job:worker/task:1/cpu:0"),
                   "/job:worker/replica:0/task:1/device:CPU:0")

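# A self-contained sketch (plain Python, not the real device_util code) of the
# canonicalization rules the test above asserts: unset fields are filled from
# the optional `default` spec, then from fixed fallbacks (replica:0, task:0,
# device CPU:0), and shorthands like "/cpu:0" expand to "/device:CPU:0". The
# helper name and parsing strategy are hypothetical. (A later TF version,
# tested further below, additionally defaults the job to "localhost".)
def canonicalize_sketch(name, default=None):
    def parse(spec):
        fields = {}
        for part in spec.strip("/").split("/"):
            if not part:
                continue
            key, _, rest = part.partition(":")
            if key.lower() in ("cpu", "gpu"):  # "/cpu:0" shorthand
                fields["device"] = "%s:%s" % (key.upper(), rest)
            elif key == "device":              # "/device:CPU:0" long form
                fields["device"] = rest
            else:                              # job / replica / task
                fields[key] = rest
        return fields

    fields = {"replica": "0", "task": "0", "device": "CPU:0"}
    if default:
        fields.update(parse(default))
    fields.update(parse(name))
    prefix = "/job:%s" % fields["job"] if "job" in fields else ""
    return prefix + "/replica:%s/task:%s/device:%s" % (
        fields["replica"], fields["task"], fields["device"])


assert canonicalize_sketch("/cpu:0") == "/replica:0/task:0/device:CPU:0"
assert (canonicalize_sketch("/job:worker/task:1", default="/gpu:0") ==
        "/job:worker/replica:0/task:1/device:GPU:0")
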
def _reduce_to(self, reduce_op, value, destinations):
  if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
    if reduce_op == reduce_util.ReduceOp.MEAN:
      # TODO(jhseu): Revisit once we support model-parallelism.
      value *= (1. / self._num_replicas_in_sync)
    elif reduce_op != reduce_util.ReduceOp.SUM:
      raise NotImplementedError(
          "Currently only support sum & mean in TPUStrategy.")
    return tpu_ops.cross_replica_sum(value)

  # Validate that the destination is the same as the host device.
  # Note we don't do this when in replicate context as the reduction is
  # performed on the TPU device itself.
  devices = cross_device_ops_lib.get_devices_from(destinations)
  if len(devices) == 1:
    assert device_util.canonicalize(
        devices[0]) == device_util.canonicalize(self._host_device)
  else:
    raise ValueError("Multiple devices are not supported for TPUStrategy")

  output = math_ops.add_n(value)
  if reduce_op == reduce_util.ReduceOp.MEAN:
    return output * (1. / len(value))
  return output

def _get_devices_from(destinations):
  if isinstance(destinations, value_lib.DistributedValues):
    return list(destinations.devices)
  elif isinstance(destinations, six.string_types):
    return [device_util.canonicalize(destinations)]
  else:
    return [
        device_util.canonicalize(destination) for destination in destinations
    ]

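# Hypothetical call sites covering the three accepted destination forms (the
# first assumes `mirrored` is a DistributedValues with a `.devices` attribute):
#   _get_devices_from(mirrored)              # -> list(mirrored.devices)
#   _get_devices_from("/gpu:0")              # -> one canonicalized string
#   _get_devices_from(["/gpu:0", "/cpu:0"])  # -> one canonicalized string each
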
def testCanonicalizeWithDefaultDevice(self):
  self.assertEqual(
      device_util.canonicalize("/job:worker/task:1/cpu:0", default="/gpu:0"),
      "/job:worker/replica:0/task:1/device:CPU:0")
  self.assertEqual(
      device_util.canonicalize("/job:worker/task:1", default="/gpu:0"),
      "/job:worker/replica:0/task:1/device:GPU:0")
  self.assertEqual(
      device_util.canonicalize("/cpu:0", default="/job:worker"),
      "/job:worker/replica:0/task:0/device:CPU:0")

def testCanonicalizeWithoutDefaultDevice(self):
  self.assertEqual(
      device_util.canonicalize("/cpu:0"),
      "/job:localhost/replica:0/task:0/device:CPU:0")
  self.assertEqual(
      device_util.canonicalize("/job:worker/cpu:0"),
      "/job:worker/replica:0/task:0/device:CPU:0")
  self.assertEqual(
      device_util.canonicalize("/job:worker/task:1/cpu:0"),
      "/job:worker/replica:0/task:1/device:CPU:0")

def __init__(self,
             devices=None,
             num_gpus=None,
             cross_tower_ops=None,
             prefetch_on_device=None):
  super(MirroredStrategy, self).__init__()
  # Convert `num_gpus` into `devices`; the caller should not specify both.
  if devices is None:
    if num_gpus is None:
      num_gpus = context.num_gpus()
    devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
  elif num_gpus is not None:
    raise ValueError("Must only specify one of `devices` and `num_gpus`.")

  assert devices, "Must specify at least one device."
  assert len(set(devices)) == len(devices), (
      "No duplicates allowed in `devices` argument.")
  # TODO(josh11b): Require at least 2 devices?
  self._devices = devices
  self._canonical_device_set = set(
      [device_util.canonicalize(d) for d in devices])
  self._device_index = values.PerDevice(
      dict((d, i) for i, d in enumerate(devices)))
  self._cross_tower_ops = cross_tower_ops
  self._prefetch_on_device = prefetch_on_device

def _initialize_multi_worker(self, num_gpus, cluster_spec):
  """Initializes the object for multi-worker training."""
  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._cluster_spec = cluster_spec

  self._workers = []
  for job in ["chief", "worker"]:
    for task in range(len(cluster_spec.as_dict().get(job, []))):
      self._workers.append("/job:%s/task:%d" % (job, task))

  if num_gpus is None:
    raise ValueError("`num_gpus` is required if `cluster_spec` is given.")
  if num_gpus > 0:
    self._worker_device_map = {
        worker: [
            device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
            for gpu in range(num_gpus)
        ] for worker in self._workers
    }
  else:
    self._worker_device_map = {
        worker: [device_util.canonicalize(worker, "/device:CPU:0")]
        for worker in self._workers
    }
  devices = nest.flatten(self._worker_device_map)

  # Setting `_default_device` will add a device scope in the
  # distribution.scope. We set the default device to the first worker. When
  # users specify a device under distribution.scope by
  #   with tf.device("/cpu:0"):
  #     ...
  # their ops will end up on the CPU device of the first worker, e.g.
  # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
  self._default_device = self._workers[0]

  assert devices, "Must specify at least one device."
  assert len(set(devices)) == len(devices), (
      "No duplicates allowed in `devices` argument.")
  # TODO(josh11b): Require at least 2 devices?
  self._devices = [device_util.resolve(d) for d in devices]
  self._canonical_device_set = set(self._devices)
  self._device_index = values.PerDevice(
      {d: i for i, d in enumerate(devices)})

def _is_per_device(self, result, expected, klass=values.PerDevice):
  self.assertIsInstance(result, klass)
  # We canonicalize the devices to match the device strings returned
  # by PerDevice, which also does device string canonicalization.
  devices = [device_util.canonicalize(_device_str(i))
             for i in range(len(expected))]
  self.assertEqual(set(devices), set(result.devices))
  for i, d in enumerate(devices):
    self.assertEqual(expected[i], result.get(d))
    self.assertEqual(expected[i], result.get(_device_str(i)))

def choose_the_best(devices, session_config=None):
  """Finds the best subclass of CrossTowerOps given a TensorFlow session.

  Args:
    devices: a list of devices passed to the distribute strategy.
    session_config: a TensorFlow session config, or None. If None, the
      decision is based on all local devices.

  Returns:
    A subclass of CrossTowerOps.
  """
  requested_devices = set([device_util.canonicalize(d) for d in devices])
  machine_devices = device_lib.list_local_devices(
      session_config=session_config)
  using_devices = []
  for d in machine_devices:
    if device_util.canonicalize(d.name) in requested_devices:
      using_devices.append(d)
    else:
      logging.info(
          "Device is available but not used by distribute strategy: %s",
          d.name)

  if len(using_devices) != len(requested_devices):
    logging.warning("Not all devices in the distribute strategy are visible "
                    "to the TensorFlow session.")
    return ReductionToOneDeviceCrossTowerOps()

  if any([d.device_type.lower() != "gpu" for d in using_devices]):
    logging.warning("Not all devices in the distribute strategy are GPUs; "
                    "falling back to reduction to one device.")
    return ReductionToOneDeviceCrossTowerOps()

  device_links = [[] for _ in range(len(using_devices))]
  for i, device in enumerate(using_devices):
    for link in device.locality.links.link:
      device_links[i].append(link.device_id)

  return _choose_all_reduce_algorithm(device_links)

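# Hypothetical call site: choose a cross-tower-ops implementation for the
# requested devices before building a strategy (device names are examples;
# the pairing with MirroredStrategy is an assumption based on the
# constructors shown elsewhere in this file):
#   cross_tower_ops = choose_the_best(["/device:GPU:0", "/device:GPU:1"])
#   strategy = MirroredStrategy(devices=["/device:GPU:0", "/device:GPU:1"],
#                               cross_tower_ops=cross_tower_ops)
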
def _call_and_check(self, model_fn, inputs, expected_result, defuns,
                    two_variables=False):
  cpu_dev = device_util.canonicalize("CPU:0")
  gpu_dev = device_util.canonicalize("GPU:0")
  devices = [cpu_dev, gpu_dev]
  dist = mirrored_strategy.MirroredStrategy(devices)

  with dist.scope():
    mock_model = MockModel(two_variables)
    self.evaluate(variables.global_variables_initializer())

    result = dist.call_for_each_tower(model_fn, mock_model, *inputs,
                                      run_concurrently=False)
    for device in devices:
      device_result = values.select_device(device, result)
      device_expected_result = values.select_device(device, expected_result)
      self.assertAllClose(device_expected_result,
                          self.evaluate(device_result))

    for defun in defuns:
      self.assertEqual(set(mock_model.variables), set(defun.variables))

def get(self, device=None):
  """Returns the value for the current device or raises a ValueError."""
  if device is None:
    tower_context = distribute_lib.get_tower_context()
    if tower_context:
      device = tower_context.device
    else:
      device = distribute_lib.get_update_device()
      if device is None:
        device = device_util.current()
  device = device_util.canonicalize(device)
  try:
    return self._index[device]
  except KeyError:
    raise ValueError("Device %s not found in %s (current device %s)" %
                     (device, self._index.keys(), device_util.current()))

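# Sketch of get()'s lookup order as implemented above: an explicit `device`
# argument wins; otherwise the tower context's device, then the update
# device, then the current device scope. Every spelling is canonicalized
# before the dict lookup, so e.g. (names here are illustrative)
#   per_device.get("/gpu:0")
#   per_device.get("/device:GPU:0")
# return the same entry, assuming the index holds that device.
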
def on_device(self, device):
  device = device_util.canonicalize(device)
  return device in self._index

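# Because the index keys are canonicalized at construction (see the
# __init__ below) and the query is canonicalized here, membership agrees
# for any equivalent spelling, e.g.
#   on_device("/gpu:0") == on_device("/device:GPU:0")
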
def _get_cross_tower(self):
  device = device_util.canonicalize(device_util.current())
  if device in self._index:
    return self._index[device]
  return list(self._index.values())[0]

def __init__(self, index):
  self._index = {device_util.canonicalize(key): value
                 for key, value in six.iteritems(index)}

def __init__(self,
             devices=None,
             num_gpus=None,
             cluster_spec=None,
             cross_tower_ops=None,
             prefetch_on_device=None):
  super(MirroredStrategy, self).__init__()

  if cluster_spec:
    if devices is not None:
      raise ValueError("Specifying devices when `cluster_spec` is also "
                       "given is not supported in MirroredStrategy.")

    # TODO(yuefengz): use the utility method to normalize cluster_spec.
    if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
      cluster_spec = server_lib.ClusterSpec(cluster_spec)
    elif not isinstance(cluster_spec, server_lib.ClusterSpec):
      raise ValueError(
          "`cluster_spec` should be a dict, a `tf.train.ClusterSpec`, or a "
          "`tf.train.ClusterDef` object.")
    self._cluster_spec = cluster_spec

    self._workers = []
    for job in sorted(cluster_spec.jobs):
      for task in range(cluster_spec.num_tasks(job)):
        self._workers.append("/job:%s/task:%d" % (job, task))

    if num_gpus is None:
      raise ValueError("`num_gpus` is required if `cluster_spec` is given.")
    self._num_gpus = num_gpus
    if num_gpus > 0:
      self._worker_device_map = {
          worker: [
              device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
              for gpu in range(num_gpus)
          ] for worker in self._workers
      }
    else:
      self._worker_device_map = {
          worker: [device_util.canonicalize(worker, "/device:CPU:0")]
          for worker in self._workers
      }
    devices = nest.flatten(self._worker_device_map)

    # Setting `_default_device` will add a device scope in the
    # distribution.scope. We set the default device to the first worker.
    # When users specify a device under distribution.scope by
    #   with tf.device("/cpu:0"):
    #     ...
    # their ops will end up on the CPU device of the first worker, e.g.
    # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
    self._default_device = self._workers[0]
  else:
    self._cluster_spec = None
    # Convert `num_gpus` into `devices`; the caller should not specify both.
    if devices is None:
      if num_gpus is None:
        num_gpus = context.num_gpus()
      devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
    elif num_gpus is not None:
      raise ValueError("Must only specify one of `devices` and `num_gpus`.")
    # TODO(yuefengz): consider setting the default device.

  assert devices, "Must specify at least one device."
  assert len(set(devices)) == len(devices), (
      "No duplicates allowed in `devices` argument.")
  # TODO(josh11b): Require at least 2 devices?
  self._devices = [device_util.resolve(d) for d in devices]
  self._canonical_device_set = set(self._devices)
  self._device_index = values.PerDevice(
      {d: i for i, d in enumerate(devices)})
  self._cross_tower_ops = cross_tower_ops
  self._prefetch_on_device = prefetch_on_device

def __init__(self,
             num_gpus_per_worker=1,
             worker_job_name=None,
             num_workers=None,
             cluster=None,
             cross_tower_ops=None,
             prefetch_on_device=None):
  """Initializes the strategy object.

  Args:
    num_gpus_per_worker: number of GPUs per worker. If it is zero, the
      local CPU will be used.
    worker_job_name: the job name for `worker`, typically just 'worker'.
    num_workers: the number of workers. If it is 0, it degenerates to a
      single-worker MirroredStrategy.
    cluster: a `tf.train.ClusterSpec` object, a dict that can be used to
      construct a `tf.train.ClusterSpec` object, or a `tf.train.ClusterDef`
      protocol buffer. It is an alternative way to initialize this object.
    cross_tower_ops: the cross-tower ops to use. If None, a default one
      will be used. If the `configure` method is called, the best one for
      the configuration will be chosen.
    prefetch_on_device: a boolean specifying whether to prefetch input to
      each worker's devices.

  Raises:
    ValueError: if it gets an unexpected `cluster`.
  """
  if cluster is None:
    self._workers = [
        '/job:%s/task:%d' % (worker_job_name, task_index)
        for task_index in range(num_workers)
    ]
  else:
    if isinstance(cluster, (dict, cluster_pb2.ClusterDef)):
      cluster_spec = server_lib.ClusterSpec(cluster)
    elif isinstance(cluster, server_lib.ClusterSpec):
      cluster_spec = cluster
    else:
      raise ValueError(
          '`cluster` should be a dict, a `tf.train.ClusterSpec`, or a '
          '`tf.train.ClusterDef` object')

    self._workers = []
    for job in sorted(cluster_spec.jobs):
      for task in range(cluster_spec.num_tasks(job)):
        self._workers.append('/job:%s/task:%d' % (job, task))

  self._num_gpus_per_worker = num_gpus_per_worker
  if num_gpus_per_worker > 0:
    self._worker_device_map = {
        worker: [
            device_util.canonicalize(worker + '/device:GPU:%d' % gpu)
            for gpu in range(num_gpus_per_worker)
        ] for worker in self._workers
    }
  else:
    self._worker_device_map = {
        worker: [device_util.canonicalize(worker, '/device:CPU:0')]
        for worker in self._workers
    }
  self._devices = nest.flatten(self._worker_device_map)

  super(MultiWorkerMirroredStrategy, self).__init__(
      devices=self._devices, prefetch_on_device=prefetch_on_device)

  # Setting `_default_device` will add a device scope in the
  # distribution.scope. We set the default device to the first worker. When
  # users specify a device under distribution.scope by
  #   with tf.device("/cpu:0"):
  #     ...
  # their ops will end up on the CPU device of the first worker, e.g.
  # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
  self._default_device = self._workers[0]

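# Hypothetical construction, following the docstring above: a dict `cluster`
# is converted to a tf.train.ClusterSpec. Host addresses are placeholders.
#   strategy = MultiWorkerMirroredStrategy(
#       num_gpus_per_worker=2,
#       cluster={'worker': ['host1:2222', 'host2:2222']})
#   # worker devices then look like "/job:worker/task:0/device:GPU:0", ...
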
def _get_cross_tower(self):
  device = device_util.canonicalize(device_util.current())
  if device in self._index:
    return array_ops.identity(self._index[device])
  return array_ops.identity(self._primary_var)

def model_fn():
  if 'CPU' in compute_device:
    tower_compute_device = '/device:CPU:0'
  else:
    tower_compute_device = (
        '/device:GPU:%d' %
        distribution_strategy_context.get_tower_context().tower_id)
  tower_compute_device = device_util.canonicalize(tower_compute_device)

  if 'CPU' in variable_device:
    tower_variable_device = '/device:CPU:0'
  else:
    tower_variable_device = (
        '/device:GPU:%d' %
        distribution_strategy_context.get_tower_context().tower_id)
  tower_variable_device = device_util.canonicalize(tower_variable_device)

  a = constant_op.constant(1.0)
  b = constant_op.constant(2.0)
  c = a + b
  self.assertEqual(a.device, tower_compute_device)
  self.assertEqual(b.device, tower_compute_device)
  self.assertEqual(c.device, tower_compute_device)

  # The device scope is ignored for variables but not for normal ops.
  with ops.device('/device:GPU:2'):
    x = variable_scope.get_variable(
        'x', initializer=10.0,
        aggregation=variable_scope.VariableAggregation.SUM)
    x_add = x.assign_add(c)
    e = a + c
    self.assertEqual(
        device_util.canonicalize(x.device), tower_variable_device)
    self.assertEqual(x_add.device, x.device)
    self.assertEqual(e.device, device_util.canonicalize('/device:GPU:2'))

  # colocate_vars_with can override the distribution's device placement.
  with d.colocate_vars_with(x):
    y = variable_scope.get_variable(
        'y', initializer=20.0,
        aggregation=variable_scope.VariableAggregation.SUM)
  # We add an identity here to avoid complaints about summing
  # non-distributed values.
  y_add = y.assign_add(array_ops.identity(x_add))
  self.assertEqual(
      device_util.canonicalize(y.device), tower_variable_device)
  self.assertEqual(y_add.device, y.device)
  self.assertEqual(y.device, x.device)

  z = variable_scope.get_variable(
      'z', initializer=10.0,
      aggregation=variable_scope.VariableAggregation.SUM)
  self.assertEqual(
      device_util.canonicalize(z.device), tower_variable_device)

  with ops.control_dependencies([y_add]):
    # We add an identity here to avoid complaints about summing
    # non-distributed values.
    z_add = z.assign_add(array_ops.identity(y))
  with ops.control_dependencies([z_add]):
    f = z + c
  self.assertEqual(f.device, tower_compute_device)

  # The device scope merges with the default worker device.
  with ops.device('/CPU:1'):
    g = e + 1.0
  self.assertEqual(g.device, device_util.canonicalize('/device:CPU:1'))

  # ops.colocate_with will be ignored when defining a variable, but not
  # for a normal tensor.
  with ops.colocate_with(x):
    u = variable_scope.get_variable('u', initializer=30.0)
    h = f + 1.0
  self.assertEqual(
      device_util.canonicalize(u.device), tower_variable_device)
  self.assertEqual(device_util.canonicalize(x.device), h.device)

  return y_add, z_add, f

def _devices_match(d1, d2):
  return device_util.canonicalize(d1) == device_util.canonicalize(d2)

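# For example, each of these pairs compares equal after canonicalization:
#   _devices_match("/cpu:0", "/device:CPU:0")
#   _devices_match("/job:worker/task:1/gpu:0",
#                  "/job:worker/replica:0/task:1/device:GPU:0")
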