def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
    """Initialize TPU devices, the device map and the input workers.

    Args:
        container_strategy: Forwarded unchanged to the parent initializer.
        tpu_cluster_resolver: Resolver used to look up the TPU system
            metadata. A `TPUClusterResolver("")` is created when omitted.
        steps_per_run: Value stored on `self.steps_per_run`; 1 when omitted.
        device_assignment: Optional `DeviceAssignment`. Only one replica
            with one core whose first core assignment is `[0, 0, 0]` is
            accepted.

    Raises:
        ValueError: If `device_assignment` is given and is not the single
            core, single replica case.
    """
    super(TPUExtended, self).__init__(container_strategy)

    if tpu_cluster_resolver is None:
        tpu_cluster_resolver = TPUClusterResolver("")

    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    if steps_per_run is None:
        steps_per_run = 1

    self._tpu_function_cache = weakref.WeakKeyDictionary()
    self._tpu_cluster_resolver = tpu_cluster_resolver
    self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
    self._device_assignment = device_assignment

    # Device assignment is currently only supported for 1 core case.
    if self._device_assignment:
        assert isinstance(self._device_assignment,
                          device_assignment_lib.DeviceAssignment)
        assignment = self._device_assignment
        # The checks short-circuit in the same order as three separate
        # `if` statements would run; all of them share one error message.
        unsupported = (
            assignment.num_replicas != 1
            or assignment.num_cores_per_replica != 1
            or not all(assignment.core_assignment[0][0] == [0, 0, 0]))
        if unsupported:
            raise ValueError("Device assignment is only supported for a single "
                             "core single replica case currently.")

    # TODO(jhseu): Switch to DeviceAssignment to support pods and model
    # parallelism.
    self._tpu_devices = [
        dev.name for dev in self._tpu_metadata.devices
        if "device:TPU:" in dev.name
    ]
    first_tpu_device = self._tpu_devices[0]
    self._host_device = device_util.get_host_for_device(first_tpu_device)

    # Only create variables for the number of replicas we're running.
    replica_count = self._num_replicas_in_sync
    self._tpu_devices = self._tpu_devices[:replica_count]
    self._device_map = values.ReplicaDeviceMap(self._tpu_devices)

    # Preload the data onto the TPUs: group the TPU devices by their host,
    # keeping the order in which hosts are first seen.
    devices_by_host = collections.OrderedDict()
    for tpu_device in self._tpu_devices:
        host = device_util.get_host_for_device(tpu_device)
        devices_by_host.setdefault(host, []).append(tpu_device)
    self._input_workers = input_lib.InputWorkers(
        self._device_map, tuple(devices_by_host.items()))

    # TODO(sourabhbajaj): Remove this once performance of running one step
    # at a time is comparable to multiple steps.
    self.steps_per_run = steps_per_run
    self._require_static_shapes = True

    self.experimental_enable_get_next_as_optional = True
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
    """Initialize TPU devices, the device map and the input workers.

    Args:
        container_strategy: Forwarded unchanged to the parent initializer.
        tpu_cluster_resolver: Resolver used to look up the TPU system
            metadata. A `TPUClusterResolver("")` is created when omitted.
        steps_per_run: Value stored on `self.steps_per_run`; 1 when omitted.
        device_assignment: Optional device assignment. When given, one TPU
            device (logical core 0) is selected per replica from it.
    """
    super(TPUExtended, self).__init__(container_strategy)

    if tpu_cluster_resolver is None:
        tpu_cluster_resolver = TPUClusterResolver("")

    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    if steps_per_run is None:
        steps_per_run = 1

    self._tpu_function_cache = weakref.WeakKeyDictionary()
    self._tpu_cluster_resolver = tpu_cluster_resolver
    self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
    self._device_assignment = device_assignment

    self._tpu_devices = [
        dev.name for dev in self._tpu_metadata.devices
        if "device:TPU:" in dev.name
    ]

    # Only create variables for the number of replicas we're running.
    if device_assignment is not None:
        first_spec = device_spec.DeviceSpecV2.from_string(self._tpu_devices[0])
        job_name = first_spec.job
        self._tpu_devices = [
            device_util.canonicalize(
                device_assignment.tpu_device(
                    replica=replica_id, logical_core=0, job=job_name))
            for replica_id in range(device_assignment.num_replicas)
        ]

    self._host_device = device_util.get_host_for_device(self._tpu_devices[0])
    self._device_map = values.ReplicaDeviceMap(self._tpu_devices)

    # Preload the data onto the TPUs: group the TPU devices by their host,
    # keeping the order in which hosts are first seen.
    devices_by_host = collections.OrderedDict()
    for tpu_device in self._tpu_devices:
        host = device_util.get_host_for_device(tpu_device)
        devices_by_host.setdefault(host, []).append(tpu_device)
    self._input_workers = input_lib.InputWorkers(
        self._device_map, tuple(devices_by_host.items()))

    # TODO(sourabhbajaj): Remove this once performance of running one step
    # at a time is comparable to multiple steps.
    self.steps_per_run = steps_per_run
    self._require_static_shapes = True

    self.experimental_enable_get_next_as_optional = True
    self.experimental_enable_dynamic_batch_size = True
def _input_workers_with_options(self, options=None):
    """Build the `InputWorkers` for this worker, honoring prefetch options.

    Args:
        options: Optional options object. When it is falsy or its
            `experimental_prefetch_to_device` is truthy, the worker devices
            themselves are used; otherwise each worker device is replaced
            by its host device.

    Returns:
        An `input_lib.InputWorkers` with a single (host, devices) pair.
    """
    worker_host = device_util.get_host_for_device(self._worker_device)

    prefetch_to_device = (
        not options or options.experimental_prefetch_to_device)
    if prefetch_to_device:
        target_devices = self.worker_devices
    else:
        target_devices = [
            device_util.get_host_for_device(device)
            for device in self.worker_devices
        ]
    return input_lib.InputWorkers([(worker_host, target_devices)])
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
    """Initialize TPU devices, the device map and the input workers.

    Args:
        container_strategy: Forwarded unchanged to the parent initializer.
        tpu_cluster_resolver: Resolver used to look up the TPU system
            metadata. A `TPUClusterResolver("")` is created when omitted.
        steps_per_run: Value stored on `self.steps_per_run`; 1 when omitted.
        device_assignment: Optional `DeviceAssignment`. Only one replica
            with one core whose first core assignment is `[0, 0, 0]` is
            accepted.

    Raises:
        ValueError: If `device_assignment` is given and is not the single
            core, single replica case.
    """
    super(TPUExtended, self).__init__(container_strategy)

    if tpu_cluster_resolver is None:
        tpu_cluster_resolver = TPUClusterResolver("")

    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    if steps_per_run is None:
        steps_per_run = 1

    self._tpu_cluster_resolver = tpu_cluster_resolver
    self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
    self._device_assignment = device_assignment

    # Device assignment is currently only supported for 1 core case.
    if self._device_assignment:
        assert isinstance(self._device_assignment,
                          device_assignment_lib.DeviceAssignment)
        assignment = self._device_assignment
        # The checks short-circuit in the same order as three separate
        # `if` statements would run; all of them share one error message.
        unsupported = (
            assignment.num_replicas != 1
            or assignment.num_cores_per_replica != 1
            or not all(assignment.core_assignment[0][0] == [0, 0, 0]))
        if unsupported:
            raise ValueError("Device assignment is only supported for a single "
                             "core single replica case currently.")

    # TODO(jhseu): Switch to DeviceAssignment to support pods and model
    # parallelism.
    self._tpu_devices = [
        dev.name for dev in self._tpu_metadata.devices
        if "device:TPU:" in dev.name
    ]
    first_tpu_device = self._tpu_devices[0]
    self._host_device = device_util.get_host_for_device(first_tpu_device)

    # Only create variables for the number of replicas we're running.
    replica_count = self._num_replicas_in_sync
    self._tpu_devices = self._tpu_devices[:replica_count]
    self._device_map = values.ReplicaDeviceMap(self._tpu_devices)

    # Preload the data onto the TPUs: group the TPU devices by their host,
    # keeping the order in which hosts are first seen.
    devices_by_host = collections.OrderedDict()
    for tpu_device in self._tpu_devices:
        host = device_util.get_host_for_device(tpu_device)
        devices_by_host.setdefault(host, []).append(tpu_device)
    self._input_workers = input_lib.InputWorkers(
        self._device_map, tuple(devices_by_host.items()))

    # TODO(sourabhbajaj): Remove this once performance of running one step
    # at a time is comparable to multiple steps.
    self.steps_per_run = steps_per_run
    self._require_static_shapes = True
def testTPU(self, input_type, api_type, iteration_type, distribution,
            enable_get_next_as_optional):
    """Iterate a 10-element range dataset on TPU and check paired outputs.

    The strategy's `experimental_enable_get_next_as_optional` flag is set
    from the test parameter before the iteration check runs.
    """
    # Group the strategy's TPU devices by host, in first-seen host order.
    devices_by_host = collections.OrderedDict()
    for tpu_device in distribution.extended._tpu_devices:
        host = device_util.get_host_for_device(tpu_device)
        devices_by_host.setdefault(host, []).append(tpu_device)
    worker_device_pairs = devices_by_host.items()

    if tf2.enabled():
        dataset_fn = lambda _: dataset_ops.DatasetV2.range(10)
    else:
        dataset_fn = lambda _: dataset_ops.Dataset.range(10)
    dataset_or_input_fn = self._create_dataset_or_input_fn(
        input_type, dataset_fn)

    # Consecutive pairs: [0, 1], [2, 3], ..., [8, 9].
    expected_values = [[start, start + 1] for start in range(0, 10, 2)]

    distribution.extended.experimental_enable_get_next_as_optional = (
        enable_get_next_as_optional)

    self._test_input_iteration(
        input_type,
        api_type,
        iteration_type,
        dataset_or_input_fn,
        worker_device_pairs,
        expected_values,
        distribution)
def _set_prefetch_on_host(self, value):
    """Switch host prefetching on or off before input workers exist.

    Args:
        value: New value for `self._prefetch_on_host`. Nothing happens when
            it equals the current value.

    Raises:
        RuntimeError: If the value would change but `self._input_workers_obj`
            has already been created.
    """
    if value == self._prefetch_on_host:
        return

    workers_already_built = self._input_workers_obj is not None
    if workers_already_built:
        raise RuntimeError("Unable to change prefetch on host behavior as "
                           "InputWorkers are already created.")

    self._prefetch_on_host = value
    if value:
        # To prefetch on the host, we must set all the input worker devices
        # to the corresponding host devices.
        remapped = []
        for host, devices in self._input_worker_devices:
            host_devices = [
                device_util.get_host_for_device(device) for device in devices
            ]
            remapped.append((host, host_devices))
        self._input_worker_devices = tuple(remapped)

    # Force creation of the workers by reading the attribute once.
    self._input_workers  # pylint: disable=pointless-statement
def testTPU(self, input_type, api_type, iteration_type, distribution,
            enable_get_next_as_optional):
    """Iterate a 10-element range dataset on TPU and check paired outputs.

    `enable_get_next_as_optional` is forwarded to `_test_input_iteration`
    as a keyword argument.
    """
    # Group the strategy's TPU devices by host, in first-seen host order.
    devices_by_host = collections.OrderedDict()
    for tpu_device in distribution.extended._tpu_devices:
        host = device_util.get_host_for_device(tpu_device)
        devices_by_host.setdefault(host, []).append(tpu_device)
    worker_device_pairs = devices_by_host.items()

    if tf2.enabled():
        dataset_fn = lambda _: dataset_ops.DatasetV2.range(10)
    else:
        dataset_fn = lambda _: dataset_ops.Dataset.range(10)

    # Consecutive pairs: [0, 1], [2, 3], ..., [8, 9].
    expected_values = [[start, start + 1] for start in range(0, 10, 2)]

    self._test_input_iteration(
        input_type,
        api_type,
        iteration_type,
        dataset_fn,
        worker_device_pairs,
        expected_values,
        distribution,
        enable_get_next_as_optional=enable_get_next_as_optional)
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
    """Initialize the TPU device grid and the input worker device list.

    Args:
        container_strategy: Forwarded unchanged to the parent initializer.
        tpu_cluster_resolver: Resolver used to look up the TPU system
            metadata. A `TPUClusterResolver("")` is created when omitted.
        steps_per_run: Value stored on `self.steps_per_run`; 1 when omitted.
        device_assignment: Optional device assignment. When given, the
            device grid is built from its replicas and logical cores;
            otherwise every discovered TPU device becomes its own replica.
    """
    super(TPUExtended, self).__init__(container_strategy)

    if tpu_cluster_resolver is None:
        tpu_cluster_resolver = TPUClusterResolver("")

    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    if steps_per_run is None:
        steps_per_run = 1

    self._tpu_function_cache = weakref.WeakKeyDictionary()
    self._tpu_cluster_resolver = tpu_cluster_resolver
    self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
    self._device_assignment = device_assignment

    tpu_devices_flat = [
        dev.name for dev in self._tpu_metadata.devices
        if "device:TPU:" in dev.name
    ]

    # `self._tpu_devices` is a two-dimensional NumPy array of strings. It is
    # indexed using `[replica_id][logical_device_id]`.
    if device_assignment is None:
        device_grid = [[name] for name in tpu_devices_flat]
    else:
        job_name = device_spec.DeviceSpecV2.from_string(
            tpu_devices_flat[0]).job
        device_grid = [
            [
                device_util.canonicalize(
                    device_assignment.tpu_device(
                        replica=replica_id,
                        logical_core=logical_core,
                        job=job_name))
                for logical_core in range(
                    device_assignment.num_cores_per_replica)
            ]
            for replica_id in range(device_assignment.num_replicas)
        ]
    self._tpu_devices = np.array(device_grid, dtype=object)

    self._host_device = device_util.get_host_for_device(
        self._tpu_devices[0][0])

    # Preload the data onto the TPUs. Currently we always preload onto logical
    # device 0 for each replica.
    # TODO(cjfj): Create `InputWorkers` lazily, allowing users to place the
    # input onto a different logical device?
    devices_by_host = collections.OrderedDict()
    for tpu_device in self._tpu_devices[:, 0]:
        host = device_util.get_host_for_device(tpu_device)
        devices_by_host.setdefault(host, []).append(tpu_device)
    self._input_worker_devices = tuple(devices_by_host.items())
    self._input_workers_obj = None

    # TODO(sourabhbajaj): Remove this once performance of running one step
    # at a time is comparable to multiple steps.
    self.steps_per_run = steps_per_run
    self._require_static_shapes = True

    # TPUStrategy handles the graph replication in TF-XLA bridge, so we don't
    # need to retrace functions for each device.
    self._retrace_functions_for_each_device = False

    self.experimental_enable_get_next_as_optional = True
    self.experimental_enable_dynamic_batch_size = True
    self._prefetch_on_host = False

    self._logical_device_stack = [0]
def __init__(self, container_strategy, device):
    """Record the resolved device and its host as the input device.

    Args:
        container_strategy: Forwarded unchanged to the parent initializer.
        device: Device specification passed through `device_util.resolve`.
    """
    super(OneDeviceExtended, self).__init__(container_strategy)
    resolved_device = device_util.resolve(device)
    self._device = resolved_device
    self._input_device = device_util.get_host_for_device(resolved_device)
def _initialize_multi_worker(self, cluster_resolver):
    """Initializes the object for multi-worker training.

    Args:
        cluster_resolver: Resolver providing the cluster spec, task type,
            task id, rpc layer and accelerator counts for this task.

    Raises:
        ValueError: If the task type or task id is missing, or if the
            cluster spec yields no workers for this task type.
    """
    cluster_spec = multi_worker_util.normalize_cluster_spec(
        cluster_resolver.cluster_spec())
    task_type = cluster_resolver.task_type
    task_id = cluster_resolver.task_id
    if task_type is None or task_id is None:
        raise ValueError(
            "When `cluster_spec` is given, you must also specify "
            "`task_type` and `task_id`.")

    self._cluster_spec = cluster_spec
    self._task_type = task_type
    self._task_id = task_id

    self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
    if not self._num_workers:
        raise ValueError(
            "No `worker`, `chief` or `evaluator` tasks can be found "
            "in `cluster_spec`.")

    self._is_chief = multi_worker_util.is_chief(
        cluster_spec, task_type, task_id)

    task_device = "/job:%s/task:%d" % (task_type, task_id)
    self._worker_device = task_device
    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

    in_client_mode = getattr(self, "_local_or_standalone_client_mode", False)

    if ops.executing_eagerly_outside_functions() and not in_client_mode:
        leader = multi_worker_util.collective_leader(
            cluster_spec, task_type, task_id)
        context.context().configure_collective_ops(
            collective_leader=leader,
            scoped_allocator_enabled_ops=("CollectiveReduce",),
            device_filters=(task_device,))
        self._collective_ops_configured = True

    # Starting a std server in eager mode and in independent worker mode.
    # Checking _local_or_standalone_client_mode as well because we should not
    # create the std server in standalone client mode.
    if (context.executing_eagerly()
            and not getattr(self, "_std_server_started", False)
            and not in_client_mode):
        config_proto = self._update_config_proto(config_pb2.ConfigProto())

        port = getattr(cluster_resolver, "port", 0)

        server_def = tensorflow_server_pb2.ServerDef(
            cluster=cluster_spec.as_cluster_def(),
            default_session_config=config_proto,
            job_name=task_type,
            task_index=task_id,
            protocol=cluster_resolver.rpc_layer or "grpc",
            port=port)
        context.context().enable_collective_ops(server_def)
        self._std_server_started = True
        # The `ensure_initialized` is needed before calling
        # `context.context().devices()`.
        context.context().ensure_initialized()
        logging.info(
            "Enabled multi-worker collective ops with available devices: %r",
            context.context().devices())

    # TODO(yuefengz): The `num_gpus` is only for this particular task. It
    # assumes all workers have the same number of GPUs. We should remove this
    # assumption by querying all tasks for their numbers of GPUs.
    # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
    # some cases.
    num_gpus = (
        context.num_gpus()
        if isinstance(cluster_resolver, TFConfigClusterResolver)
        else cluster_resolver.num_accelerators().get("GPU", 0))

    if num_gpus:
        local_devices = tuple(
            "%s/device:GPU:%d" % (self._worker_device, gpu_index)
            for gpu_index in range(num_gpus))
    else:
        local_devices = (self._worker_device,)

    self._collective_keys = cross_device_utils.CollectiveKeys()
    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
        num_workers=self._num_workers,
        num_gpus_per_worker=num_gpus,
        collective_keys=self._collective_keys,
        communication=self._communication)
    super(CollectiveAllReduceExtended,
          self)._initialize_single_worker(local_devices)

    worker_host = device_util.get_host_for_device(self._worker_device)
    self._input_workers = input_lib.InputWorkers(
        [(worker_host, self.worker_devices)])

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = task_device

    # Save the num_gpus_per_worker and rpc_layer for configure method.
    self._num_gpus_per_worker = num_gpus
    self._rpc_layer = cluster_resolver.rpc_layer

    self._warn_nccl_no_gpu()

    logging.info(
        "MultiWorkerMirroredStrategy with cluster_spec = %r, task_type = %r, "
        "task_id = %r, num_workers = %r, local_devices = %r, "
        "communication = %s", cluster_spec.as_dict(), task_type, task_id,
        self._num_workers, local_devices, self._communication)
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
    """Initialize the TPU device grid and per-host input device maps.

    Args:
        container_strategy: Forwarded unchanged to the parent initializer.
        tpu_cluster_resolver: Resolver whose `get_tpu_system_metadata()` is
            queried. A `TPUClusterResolver("")` is created when omitted.
        steps_per_run: Value stored on `self.steps_per_run`; 1 when omitted.
        device_assignment: Optional device assignment. When given, the
            device grid is built from its replicas and logical cores;
            otherwise every discovered TPU device becomes its own replica.
    """
    super(TPUExtended, self).__init__(container_strategy)

    if tpu_cluster_resolver is None:
        tpu_cluster_resolver = TPUClusterResolver("")

    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    if steps_per_run is None:
        steps_per_run = 1

    # `self._tpu_function_cache` is a dict of `tf.function`s, thus if a
    # `tf.function` is passed into `strategy.run` in eager mode, the
    # `tf.function` won't get retraced.
    self._tpu_function_cache = weakref.WeakKeyDictionary()

    self._tpu_cluster_resolver = tpu_cluster_resolver
    self._tpu_metadata = self._tpu_cluster_resolver.get_tpu_system_metadata()
    self._device_assignment = device_assignment

    tpu_devices_flat = [
        dev.name for dev in self._tpu_metadata.devices
        if "device:TPU:" in dev.name
    ]

    # `self._tpu_devices` is a two-dimensional NumPy array of strings. It is
    # indexed using `[replica_id][logical_device_id]`.
    if device_assignment is None:
        device_grid = [[name] for name in tpu_devices_flat]
    else:
        job_name = device_spec.DeviceSpecV2.from_string(
            tpu_devices_flat[0]).job
        device_grid = [
            [
                device_util.canonicalize(
                    device_assignment.tpu_device(
                        replica=replica_id,
                        logical_core=logical_core,
                        job=job_name))
                for logical_core in range(
                    device_assignment.num_cores_per_replica)
            ]
            for replica_id in range(device_assignment.num_replicas)
        ]
    self._tpu_devices = np.array(device_grid, dtype=object)

    self._host_device = device_util.get_host_for_device(
        self._tpu_devices[0][0])

    # Preload the data onto the TPUs. Currently we always preload onto logical
    # device 0 for each replica.
    # TODO(cjfj): Create `InputWorkers` lazily, allowing users to place the
    # input onto a different logical device?
    self._device_input_worker_devices = collections.OrderedDict()
    self._host_input_worker_devices = collections.OrderedDict()
    for tpu_device in self._tpu_devices[:, 0]:
        host = device_util.get_host_for_device(tpu_device)
        self._device_input_worker_devices.setdefault(host, []).append(
            tpu_device)
        # The host map lists the host itself once per TPU device on it.
        self._host_input_worker_devices.setdefault(host, []).append(host)

    # TODO(sourabhbajaj): Remove this once performance of running one step
    # at a time is comparable to multiple steps.
    self.steps_per_run = steps_per_run
    self._require_static_shapes = True

    self.experimental_enable_get_next_as_optional = True

    self._logical_device_stack = [0]

    if context.executing_eagerly():
        # In async remote eager, we want to sync the executors before exiting
        # the program.
        def async_wait():
            if context.context()._context_handle is not None:  # pylint: disable=protected-access
                context.async_wait()

        atexit.register(async_wait)

    # Flag to turn on VariablePolicy
    self._use_var_policy = False
def initialize_tpu_system(cluster_resolver=None):
    """Initialize the TPU devices.

    Args:
        cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
            which provides information about the TPU cluster. When omitted,
            `TPUClusterResolver("")` is used.

    Returns:
        The tf.tpu.Topology object for the topology of the TPU cluster.

    Raises:
        RuntimeError: If no TPU devices found for eager execution.
    """
    if cluster_resolver is None:
        cluster_resolver = TPUClusterResolver("")
    assert isinstance(cluster_resolver, TPUClusterResolver)

    tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
    if tpu_name in _INITIALIZED_TPU_SYSTEMS:
        # Pass `tpu_name` as the lazy-format argument so that the `%s`
        # placeholder in the message is actually filled in.
        logging.warning(
            "TPU system %s has already been initialized. "
            "Reinitializing the TPU can cause previously created "
            "variables on TPU to be lost.", tpu_name)

    logging.info("Initializing the TPU system.")

    if context.executing_eagerly():
        # This function looks as it is for the following non-intuitive reasons.
        # tpu.initialize_system creates a dummy op whose sole purpose is to
        # trigger DistributedTPURewritePass. This pass actually adds real ops
        # that initialize the TPU system. Thus, we can't simply run
        # tpu.initialize_system eagerly. We need to wrap it in defun and
        # trigger the rewrite passes on it. The easiest way to trigger a
        # rewrite is to run the function with TPUPartitionedCallOp.
        @function.defun
        def _tpu_init_fn():
            return tpu.initialize_system()

        # We can't call _tpu_init_fn normally (because it contains just a
        # dummy op, see above) but need to define it to get it added to eager
        # context and get its assigned name.
        # pylint: disable=protected-access
        graph_func = _tpu_init_fn._get_concrete_function_internal()
        func_name = compat.as_str(graph_func._inference_function.name)
        # pylint: enable=protected-access

        tpu_devices = sorted(
            [x for x in context.list_devices() if "device:TPU:" in x])

        if not tpu_devices:
            raise RuntimeError("Could not find any TPU devices")

        # Run the partitioned call on the host of the first (sorted) TPU
        # device; its single string output is the serialized topology.
        with ops.device(device_util.get_host_for_device(tpu_devices[0])):
            output = tpu_functional_ops.TPUPartitionedCall(
                args=[], device_ordinal=0, Tout=[dtypes.string], f=func_name)
        serialized_topology = output[0].numpy()
    else:
        master = cluster_resolver.master()
        session_config = config_pb2.ConfigProto(allow_soft_placement=True)
        with ops.Graph().as_default():
            with session_lib.Session(
                    config=session_config, target=master) as sess:
                serialized_topology = sess.run(tpu.initialize_system())

    logging.info("Finished initializing TPU system.")
    tpu_topology = topology.Topology(serialized=serialized_topology)
    # Remember the topology by TPU name so repeat initialization is detected.
    _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

    return tpu_topology