def testCurrentDeviceWithGlobalGraph(self):
  with ops.device("/cpu:0"):
    self.assertEqual(device_util.current(), "/device:CPU:0")

  with ops.device("/job:worker"):
    with ops.device("/cpu:0"):
      self.assertEqual(device_util.current(), "/job:worker/device:CPU:0")

  with ops.device("/cpu:0"):
    with ops.device("/gpu:0"):
      self.assertEqual(device_util.current(), "/device:GPU:0")
def _reduce(self, reduce_op, value):
  self._assert_being_scheduled_by_cluster_coordinator()
  dst = device_util.current() or self._default_device or "/device:CPU:0"
  destinations = device_util.canonicalize_without_job_and_task(dst)
  result = self._local_results(
      self.reduce_to(reduce_op, value, destinations))[0]
  return result
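# Hedged sketch of the destination fallback used in _reduce above. It uses
# TF's internal tensorflow.python.distribute.device_util module, and the
# "/device:CPU:0" fallback string is an assumption for illustration only.
from tensorflow.python.distribute import device_util

dst = device_util.current() or "/device:CPU:0"
# canonicalize_without_job_and_task fills in missing device fields and then
# drops the job/replica/task fields, keeping only the device part.
print(device_util.canonicalize_without_job_and_task(dst))  # e.g. "/device:CPU:0"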
def _device_scope(self):
  if (self._packed_handle is None or
      values_util.is_saving_non_distributed() or
      tpu_util.enclosing_tpu_context() is not None):
    return ops.NullContextmanager()
  device = device_util.canonicalize(device_util.current())
  if device in self._device_to_handle:
    return ops.NullContextmanager()
  return ops.device(self._primary_handle.device)
def _get_on_device_or_primary(self):
  """Returns value in same replica or device if possible, else the _primary."""
  replica_id = values_util.get_current_replica_id_as_int()
  if replica_id is None:
    # Try to find a value on the current device.
    current_device = device_util.canonicalize(device_util.current())
    for value in self._values:
      if device_util.canonicalize(value.device) == current_device:
        return value
    return self._primary
  else:
    return self._values[replica_id]
def handle(self):
  if values_util.is_saving_non_distributed():
    return self._primary_handle
  tpu_context = tpu_util.enclosing_tpu_context()
  if tpu_context and not context.executing_eagerly():
    is_mirrored = (
        self._variables[0].synchronization !=
        variables_lib.VariableSynchronization.ON_READ)
    if self._packed_handle is None:
      handles = [v.handle for v in self._variables]
      is_packed = False
    else:
      handles = [self._packed_handle]
      is_packed = True
    return tpu_context.get_replicated_var_handle(self._unique_id, handles,
                                                 is_mirrored, is_packed)
  if self._packed_handle is not None and not context.executing_eagerly():
    return self._packed_handle
  device = device_util.canonicalize(device_util.current())
  return self._device_to_handle.get(device, self._primary_handle)
def handle(self):
  if values_util.is_saving_non_distributed():
    return self._primary_handle
  tpu_context = tpu_util.enclosing_tpu_context()
  if tpu_context and not context.executing_eagerly():
    is_mirrored = (
        self._variables[0].synchronization !=
        variables_lib.VariableSynchronization.ON_READ)
    if self._packed_handle is None:
      handles = [v.handle for v in self._variables]
      is_packed = False
    else:
      handles = [self._packed_handle]
      is_packed = True
    common_name = self._handle_name
    # BaseResourceVariable appends ":0" to the handle name, which makes it not
    # a valid root scope name.
    if ":" in common_name:
      common_name = common_name.split(":")[0]
    return tpu_context.get_replicated_var_handle(common_name, self._unique_id,
                                                 handles, is_mirrored,
                                                 is_packed)
  if self._packed_handle is not None and not context.executing_eagerly():
    return self._packed_handle
  device = device_util.canonicalize(device_util.current())
  return self._device_to_handle.get(device, self._primary_handle)
def get_var_on_current_device(self):
  current_device = device_util.canonicalize(device_util.current())
  return self.get_var_on_device(current_device)
def _is_current_device_ipu():
  current_device = tf_device.DeviceSpec.from_string(device_util.current())
  return current_device.device_type == "IPU"
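# A minimal sketch of the same check using the public tf.DeviceSpec API. The
# "IPU" device type assumes the Graphcore TensorFlow port is available, and the
# device string below is a placeholder.
import tensorflow as tf

spec = tf.DeviceSpec.from_string("/job:worker/replica:0/task:0/device:IPU:0")
print(spec.device_type)   # "IPU"
print(spec.device_index)  # 0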
def connect_to_cluster(cluster_spec_or_resolver,
                       job_name="localhost",
                       task_index=0,
                       protocol=None,
                       make_master_device_default=True):
  """Connects to the given cluster.

  Will make devices on the cluster available to use. Note that calling this
  more than once will work, but will invalidate any tensor handles on the old
  remote devices.

  If the given local job name is not present in the cluster specification, it
  will be automatically added, using an unused port on the localhost.

  Args:
    cluster_spec_or_resolver: A `ClusterSpec` or `ClusterResolver` describing
      the cluster.
    job_name: The name of the local job.
    task_index: The local task index.
    protocol: The communication protocol, such as `"grpc"`. If unspecified, will
      use the default from `python/platform/remote_utils.py`.
    make_master_device_default: If True and a cluster resolver is passed, will
      automatically enter the master task device scope, which indicates the
      master becomes the default device to run ops. It won't do anything if a
      cluster spec is passed. Will throw an error if the caller is currently
      already in some device scope.
  """
  if not context.executing_eagerly():
    raise ValueError(
        "`tf.config.experimental_connect_to_cluster` can only be called in "
        "eager mode.")
  protocol = protocol or remote_utils.get_default_communication_protocol()
  if isinstance(cluster_spec_or_resolver, server_lib.ClusterSpec):
    cluster_spec = cluster_spec_or_resolver
  elif isinstance(cluster_spec_or_resolver, cluster_resolver.ClusterResolver):
    if cluster_spec_or_resolver.master() in _LOCAL_MASTERS:
      # Do nothing if the master is local.
      return
    cluster_spec = cluster_spec_or_resolver.cluster_spec()
  else:
    raise ValueError(
        "`cluster_spec_or_resolver` must be a `ClusterSpec` or a "
        "`ClusterResolver`.")

  cluster_def = copy.deepcopy(cluster_spec.as_cluster_def())

  # Automatically add local job, if not part of the cluster spec.
  if job_name not in cluster_spec.jobs:
    local_port = pywrap_tensorflow.TF_PickUnusedPortOrDie()
    job_def = cluster_def.job.add()
    job_def.name = job_name
    # TODO(fishx): Update this to make sure remote worker has valid ip address
    # to connect with local.
    job_def.tasks[0] = "localhost:{}".format(local_port)

  server_def = ServerDef(
      cluster=cluster_def,
      job_name=job_name,
      task_index=task_index,
      protocol=protocol,
      default_session_config=context.context().config)

  if context.get_server_def() is None:
    context.set_server_def(server_def)
  else:
    context.update_server_def(server_def)

  if make_master_device_default and isinstance(
      cluster_spec_or_resolver,
      cluster_resolver.ClusterResolver) and cluster_spec_or_resolver.master():
    master = cluster_spec_or_resolver.master()
    master_job_name = None
    master_task_id = None
    for job_name in cluster_spec.jobs:
      for task_id in cluster_spec.task_indices(job_name):
        task_address = cluster_spec.task_address(job_name, task_id)
        if master in task_address or task_address in master:
          master_job_name = job_name
          master_task_id = task_id
          break

    if not master_job_name:
      raise ValueError(
          "`make_master_device_default` is set to True but cannot find "
          "master %s in the cluster" % master)

    master_device = "/job:{}/replica:0/task:{}".format(master_job_name,
                                                       master_task_id)
    master_device = device_util.canonicalize(master_device)
    current_device = device_util.current()
    if current_device:
      current_device = device_util.canonicalize(current_device)
    if current_device and current_device != master_device:
      raise ValueError("`connect_to_cluster` is called inside existing device "
                       "scope %s, which is different from the master device "
                       "scope %s to enter. This is not allowed." %
                       (current_device, master_device))

    # TODO(b/138389076): Think of the entering device scope behavior in the
    # failure recovery case when dealing with preemptions.
    if not current_device:
      logging.info("Entering into master device scope: %s", master_device)
      ops.device(master_device).__enter__()
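# A minimal usage sketch of the function above via its public alias,
# tf.config.experimental_connect_to_cluster. The worker address is a
# placeholder and assumes a tf.distribute.Server is already listening there.
import tensorflow as tf

cluster_spec = tf.train.ClusterSpec({"worker": ["worker0.example.com:2222"]})
tf.config.experimental_connect_to_cluster(cluster_spec)

# Ops placed under a remote device scope now run on that worker.
with tf.device("/job:worker/replica:0/task:0/device:CPU:0"):
  x = tf.constant(1.0) + tf.constant(2.0)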
def connect_to_cluster(cluster_spec_or_resolver,
                       job_name="localhost",
                       task_index=0,
                       protocol=None,
                       make_master_device_default=True,
                       cluster_device_filters=None):
  """Connects to the given cluster.

  Will make devices on the cluster available to use. Note that calling this
  more than once will work, but will invalidate any tensor handles on the old
  remote devices.

  If the given local job name is not present in the cluster specification, it
  will be automatically added, using an unused port on the localhost.

  Device filters can be specified to isolate groups of remote tasks to avoid
  undesired accesses between workers. Workers accessing resources or launching
  ops / functions on filtered remote devices will result in errors (unknown
  devices). For any remote task, if no device filter is present, all cluster
  devices will be visible; if any device filter is specified, it can only see
  devices matching at least one filter. Devices on the task itself are always
  visible. Device filters can be partially specified.

  For example, for a cluster set up for parameter server training, the
  following device filters might be specified:

  ```python
  cdf = tf.config.experimental.ClusterDeviceFilters()
  # For any worker, only the devices on PS nodes and itself are visible
  for i in range(num_workers):
    cdf.set_device_filters('worker', i, ['/job:ps'])
  # Similarly for any ps, only the devices on workers and itself are visible
  for i in range(num_ps):
    cdf.set_device_filters('ps', i, ['/job:worker'])

  tf.config.experimental_connect_to_cluster(cluster_def,
                                            cluster_device_filters=cdf)
  ```

  Args:
    cluster_spec_or_resolver: A `ClusterSpec` or `ClusterResolver` describing
      the cluster.
    job_name: The name of the local job.
    task_index: The local task index.
    protocol: The communication protocol, such as `"grpc"`. If unspecified, will
      use the default from `python/platform/remote_utils.py`.
    make_master_device_default: If True and a cluster resolver is passed, will
      automatically enter the master task device scope, which indicates the
      master becomes the default device to run ops. It won't do anything if a
      cluster spec is passed. Will throw an error if the caller is currently
      already in some device scope.
    cluster_device_filters: an instance of
      `tf.train.experimental.ClusterDeviceFilters` that specifies device filters
      for the remote tasks in the cluster.
  """
  if not context.executing_eagerly():
    raise ValueError(
        "`tf.config.experimental_connect_to_cluster` can only be called in "
        "eager mode.")
  protocol = protocol or remote_utils.get_default_communication_protocol()
  if isinstance(cluster_spec_or_resolver, server_lib.ClusterSpec):
    cluster_spec = cluster_spec_or_resolver
  elif isinstance(cluster_spec_or_resolver, cluster_resolver.ClusterResolver):
    if cluster_spec_or_resolver.master() in _LOCAL_MASTERS:
      # Do nothing if the master is local.
      return
    cluster_spec = cluster_spec_or_resolver.cluster_spec()
  else:
    raise ValueError(
        "`cluster_spec_or_resolver` must be a `ClusterSpec` or a "
        "`ClusterResolver`.")

  cluster_def = copy.deepcopy(cluster_spec.as_cluster_def())
  if cluster_device_filters:
    if isinstance(cluster_device_filters, server_lib.ClusterDeviceFilters):
      cluster_device_filters = copy.deepcopy(
          cluster_device_filters._as_cluster_device_filters())  # pylint: disable=protected-access
    else:
      raise ValueError("`cluster_device_filters` must be an instance of "
                       "`tf.train.experimental.ClusterDeviceFilters`.")

  # Automatically add local job, if not part of the cluster spec.
  if job_name not in cluster_spec.jobs:
    local_port = pywrap_tfe.TF_PickUnusedPortOrDie()
    job_def = cluster_def.job.add()
    job_def.name = job_name
    ipstr = _get_local_ip_address(local_port)
    if ipstr:
      job_def.tasks[0] = "{}:{}".format(ipstr, local_port)
    else:
      job_def.tasks[0] = "localhost:{}".format(local_port)

  server_def = ServerDef(
      cluster=cluster_def,
      job_name=job_name,
      task_index=task_index,
      protocol=protocol,
      default_session_config=context.context().config,
      cluster_device_filters=cluster_device_filters)

  if context.get_server_def() is None:
    context.set_server_def(server_def)
  else:
    context.update_server_def(server_def)

  if make_master_device_default and isinstance(
      cluster_spec_or_resolver,
      cluster_resolver.ClusterResolver) and cluster_spec_or_resolver.master():
    master = cluster_spec_or_resolver.master()
    master_job_name = None
    master_task_id = None
    for job_name in cluster_spec.jobs:
      for task_id in cluster_spec.task_indices(job_name):
        task_address = cluster_spec.task_address(job_name, task_id)
        if master in task_address or task_address in master:
          master_job_name = job_name
          master_task_id = task_id
          break

    if not master_job_name:
      raise ValueError(
          "`make_master_device_default` is set to True but cannot find "
          "master %s in the cluster" % master)

    master_device = "/job:{}/replica:0/task:{}".format(master_job_name,
                                                       master_task_id)
    master_device = device_util.canonicalize(master_device)
    current_device = device_util.current()
    if current_device:
      current_device = device_util.canonicalize(current_device)
    if current_device and current_device != master_device:
      raise ValueError("`connect_to_cluster` is called inside existing device "
                       "scope %s, which is different from the master device "
                       "scope %s to enter. This is not allowed." %
                       (current_device, master_device))

    # TODO(b/138389076): Think of the entering device scope behavior in the
    # failure recovery case when dealing with preemptions.
    if not current_device:
      logging.info("Entering into master device scope: %s", master_device)
      ops.device(master_device).__enter__()
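# Hedged sketch of the make_master_device_default path above: when a
# ClusterResolver with a remote master is passed, the call also enters the
# master task's device scope. The cluster addresses are placeholders and
# assume a server is already running at that address.
import tensorflow as tf

resolver = tf.distribute.cluster_resolver.SimpleClusterResolver(
    tf.train.ClusterSpec({"worker": ["worker0.example.com:2222"]}),
    master="grpc://worker0.example.com:2222")
tf.config.experimental_connect_to_cluster(resolver)
# Subsequent ops default to /job:worker/replica:0/task:0 unless a different
# device scope is entered explicitly.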
def testCurrentDeviceWithEager(self):
  with context.eager_mode():
    with ops.device("/cpu:0"):
      self.assertEqual(device_util.current(),
                       "/job:localhost/replica:0/task:0/device:CPU:0")
def testCurrentDeviceWithNonGlobalGraph(self):
  with ops.Graph().as_default():
    with ops.device("/cpu:0"):
      self.assertEqual(device_util.current(), "/device:CPU:0")
def testCurrentDeviceWithEager(self):
  with context.eager_mode():
    with ops.device("/cpu:0"):
      self.assertEqual(device_util.current(),
                       "/job:localhost/replica:0/task:0/device:CPU:0")