def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
  super(TPUExtended, self).__init__(container_strategy)

  if tpu_cluster_resolver is None:
    tpu_cluster_resolver = resolver_lib.TPUClusterResolver("")

  if steps_per_run is None:
    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    steps_per_run = 1

  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
  self._device_assignment = device_assignment

  # Device assignment is currently only supported for 1 core case.
  if self._device_assignment:
    assert isinstance(self._device_assignment,
                      device_assignment_lib.DeviceAssignment)
    if self._device_assignment.num_replicas != 1:
      raise ValueError("Device assignment is only supported for a single "
                       "core single replica case currently.")

    if self._device_assignment.num_cores_per_replica != 1:
      raise ValueError("Device assignment is only supported for a single "
                       "core single replica case currently.")

    if not all(self._device_assignment.core_assignment[0][0] == [0, 0, 0]):
      raise ValueError("Device assignment is only supported for a single "
                       "core single replica case currently.")

  # TODO(jhseu): Switch to DeviceAssignment to support pods and model
  # parallelism.
  self._device_index = {
      d.name: i
      for i, d in enumerate(self._tpu_metadata.devices)
      if "device:TPU:" in d.name
  }
  self._host_device = self.get_host_cpu_device(0)
  self._tpu_devices = tuple(sorted(self._device_index.keys()))
  # Only create variables for the number of replicas we're running.
  self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
  self._device_map = values.ReplicaDeviceMap(self._tpu_devices)

  # For input:
  input_device_map = values.ReplicaDeviceMap(tuple(
      self.get_host_cpu_device(hid) for hid in range(self.num_hosts)))
  worker_devices = [
      (self.get_host(hid), [self.get_host_cpu_device(hid)])
      for hid in range(self.num_hosts)
  ]
  self._input_workers = input_lib.InputWorkers(
      input_device_map, worker_devices)

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run
  self._require_static_shapes = True
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             num_cores=None):
  super(TPUExtended, self).__init__(container_strategy)

  if tpu_cluster_resolver is None:
    tpu_cluster_resolver = resolver_lib.TPUClusterResolver("")

  if steps_per_run is None:
    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    steps_per_run = 1

  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
  # TODO(sourabhbajaj): Change this from num_cores to metadata_override
  self._num_cores_override = num_cores

  # TODO(jhseu): Switch to DeviceAssignment to support pods and model
  # parallelism.
  self._device_index = {
      d.name: i
      for i, d in enumerate(self._tpu_metadata.devices)
      if "device:TPU:" in d.name
  }
  self._host_device = self.get_host_cpu_device(0)
  self._tpu_devices = tuple(sorted(self._device_index.keys()))
  # Only create variables for the number of replicas we're running.
  self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
  self._device_map = values.ReplicaDeviceMap(self._tpu_devices)

  # For input:
  input_device_map = values.ReplicaDeviceMap(tuple(
      self.get_host_cpu_device(hid) for hid in range(self.num_hosts)))
  worker_devices = [
      (self.get_host(hid), [self.get_host_cpu_device(hid)])
      for hid in range(self.num_hosts)
  ]
  self._input_workers = values.InputWorkers(input_device_map, worker_devices)

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run
  self._require_static_shapes = True

  # Initialize the TPU devices.
  self._initialize_tpu()
def testInitializableIterator(self):
  with context.graph_mode():
    devices = ["/device:CPU:0"]
    # Using random input since that is only allowed with initializable
    # iterator.
    dataset = dataset_ops.Dataset.from_tensor_slices(
        random_ops.random_uniform((10,)))

    device_map = values.ReplicaDeviceMap(devices)
    input_workers = values.InputWorkers(device_map)
    per_replica_dataset = values.PerReplicaDataset(dataset, input_workers, 0)
    iterator = per_replica_dataset.make_initializable_iterator()

    self.evaluate(iterator.initializer)
    next_element = iterator.get_next_as_list()
    for _ in range(10):
      self.evaluate(next_element)

    # Should fail after the input is finished.
    with self.assertRaises(errors.OutOfRangeError):
      self.evaluate(next_element)

    # After re-initializing the iterator, should be able to iterate again.
    self.evaluate(iterator.initializer)
    for _ in range(10):
      self.evaluate(next_element)
def testMirroredStratParaAsync(self):
  """Tests RNG/MirrorStrategy interaction #3.

  The user can create n independent RNGs outside strategy.scope(), where n
  is the number of replicas, and give one to each replica. The replicas can
  thus get different random-number streams.
  """
  shape = [3, 4]
  dtype = dtypes.int32
  gens = random.get_global_generator().split(count=2)
  devices = ["/cpu:0", test_util.gpu_device_name()]
  strat = MirroredStrategy(devices=devices)
  # Use `PerReplica` to specify which `gen` is sent to which replica
  gens = dist_values.PerReplica(
      device_map=dist_values.ReplicaDeviceMap(devices),
      values=[[g] for g in gens])

  with strat.scope():
    def f(gen):
      t1 = gen.uniform_full_int(shape=shape, dtype=dtype)
      t2 = gen.uniform_full_int(shape=shape, dtype=dtype)
      t = array_ops.stack([t1, t2])
      return t

    results = strat.extended.call_for_each_replica(fn=f, args=gens)
    values = results.values
    self.assertAllEqual(2, len(values))
    self.assertAllDifferent(values)
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
  super(TPUExtended, self).__init__(container_strategy)

  if tpu_cluster_resolver is None:
    tpu_cluster_resolver = TPUClusterResolver("")

  if steps_per_run is None:
    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    steps_per_run = 1

  self._tpu_function_cache = weakref.WeakKeyDictionary()

  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
  self._device_assignment = device_assignment

  # Device assignment is currently only supported for 1 core case.
  if self._device_assignment:
    assert isinstance(self._device_assignment,
                      device_assignment_lib.DeviceAssignment)
    if self._device_assignment.num_replicas != 1:
      raise ValueError("Device assignment is only supported for a single "
                       "core single replica case currently.")

    if self._device_assignment.num_cores_per_replica != 1:
      raise ValueError("Device assignment is only supported for a single "
                       "core single replica case currently.")

    if not all(self._device_assignment.core_assignment[0][0] == [0, 0, 0]):
      raise ValueError("Device assignment is only supported for a single "
                       "core single replica case currently.")

  # TODO(jhseu): Switch to DeviceAssignment to support pods and model
  # parallelism.
  self._tpu_devices = [d.name for d in self._tpu_metadata.devices
                       if "device:TPU:" in d.name]

  self._host_device = device_util.get_host_for_device(self._tpu_devices[0])

  # Only create variables for the number of replicas we're running.
  self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
  self._device_map = values.ReplicaDeviceMap(self._tpu_devices)

  # Preload the data onto the TPUs.
  input_worker_devices = collections.OrderedDict()
  for tpu_device in self._tpu_devices:
    host_device = device_util.get_host_for_device(tpu_device)
    input_worker_devices.setdefault(host_device, [])
    input_worker_devices[host_device].append(tpu_device)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, tuple(input_worker_devices.items()))

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run
  self._require_static_shapes = True

  self.experimental_enable_get_next_as_optional = True
def _create_iterator(self, input_type, dataset_fn, worker_device_pairs,
                     devices, split_batch_by, enable_get_next_as_optional):
  device_map = values.ReplicaDeviceMap(devices)
  input_workers = input_lib.InputWorkers(device_map, worker_device_pairs)

  if input_type == "input_fn":
    input_contexts = []
    for i in range(input_workers.num_workers):
      input_contexts.append(
          distribute_lib.InputContext(
              num_input_pipelines=input_workers.num_workers,
              input_pipeline_id=i,
              num_replicas_in_sync=len(devices)))

    iterator = input_lib.InputFunctionIterator(
        dataset_fn,
        input_workers,
        input_contexts,
        _enable_get_next_as_optional=enable_get_next_as_optional)
  else:
    iterator = input_lib.DatasetIterator(
        dataset_fn(distribute_lib.InputContext()),
        input_workers,
        split_batch_by,
        _enable_get_next_as_optional=enable_get_next_as_optional)
  return iterator
def predict(actions, state):
  state = state.copy()
  # Break down the inputs along the batch dimension to form equal sized
  # tensors in each replica.
  num_replicas = strategy.num_replicas_in_sync
  actions = tf.split(actions, num_replicas)
  state = {
      key: tf.split(value, num_replicas) for key, value in state.items()
  }
  devices = values.ReplicaDeviceMap(strategy.extended.worker_devices)
  dist_actions = values.PerReplica(devices, tuple(actions))
  dist_state = []
  for i in range(num_replicas):
    dist_state.append({key: value[i] for key, value in state.items()})
  dist_state = values.PerReplica(devices, tuple(dist_state))

  dist_predictions = strategy.experimental_run_v2(
      model.predict, args=(dist_actions, dist_state))
  dist_predictions = {
      key: strategy.experimental_local_results(value)
      for key, value in dist_predictions.items()
  }
  predictions = {
      key: tf.concat(value, axis=0)
      for key, value in dist_predictions.items()
  }
  return predictions
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
  super(TPUExtended, self).__init__(container_strategy)

  if tpu_cluster_resolver is None:
    tpu_cluster_resolver = TPUClusterResolver("")

  if steps_per_run is None:
    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    steps_per_run = 1

  self._tpu_function_cache = weakref.WeakKeyDictionary()

  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
  self._device_assignment = device_assignment

  self._tpu_devices = [
      d.name for d in self._tpu_metadata.devices if "device:TPU:" in d.name
  ]

  # Only create variables for the number of replicas we're running.
  if device_assignment is not None:
    job_name = device_spec.DeviceSpecV2.from_string(self._tpu_devices[0]).job

    self._tpu_devices = []
    for replica_id in range(device_assignment.num_replicas):
      tpu_device = device_assignment.tpu_device(
          replica=replica_id, logical_core=0, job=job_name)
      tpu_device = device_util.canonicalize(tpu_device)
      self._tpu_devices.append(tpu_device)

  self._host_device = device_util.get_host_for_device(self._tpu_devices[0])

  self._device_map = values.ReplicaDeviceMap(self._tpu_devices)

  # Preload the data onto the TPUs.
  input_worker_devices = collections.OrderedDict()
  for tpu_device in self._tpu_devices:
    host_device = device_util.get_host_for_device(tpu_device)
    input_worker_devices.setdefault(host_device, [])
    input_worker_devices[host_device].append(tpu_device)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, tuple(input_worker_devices.items()))

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run
  self._require_static_shapes = True

  self.experimental_enable_get_next_as_optional = True
  self.experimental_enable_dynamic_batch_size = True
def testNamedTupleEstimatorSpec(self):
  with context.graph_mode(), ops.Graph().as_default():
    devices = []
    created_estimator_specs = []

    for device_id in range(3):
      spec = model_fn_lib.EstimatorSpec(
          mode=model_fn_lib.ModeKeys.TRAIN,
          loss=constant_op.constant(device_id / 2),
          train_op=array_ops.identity(constant_op.constant(device_id)))
      devices.append(_device_str(device_id))
      created_estimator_specs.append(spec)

    device_map = values.ReplicaDeviceMap(devices)
    merged_estimator_spec = values.regroup(
        device_map, created_estimator_specs)

    self.assertTrue(
        isinstance(merged_estimator_spec, model_fn_lib.EstimatorSpec))
    self.assertEqual(model_fn_lib.ModeKeys.TRAIN, merged_estimator_spec.mode)
    for device_id in range(3):
      d = _device_str(device_id)
      self.assertEqual(created_estimator_specs[device_id].loss,
                       merged_estimator_spec.loss.get(d))
      self.assertEqual(created_estimator_specs[device_id].train_op,
                       merged_estimator_spec.train_op.get(d))
      # Scaffold is populated by `EstimatorSpec.__new__`.
      self.assertEqual(created_estimator_specs[device_id].scaffold,
                       merged_estimator_spec.scaffold.get(d))
      # Also test that we can undo the merge using select_replica()
      self.assertEqual(created_estimator_specs[device_id],
                       values.select_replica(device_id,
                                             merged_estimator_spec))
def testNested(self):
  device_map = values.ReplicaDeviceMap((_device_str(0), _device_str(1)))
  result = values.regroup(device_map,
                          (_nested_value("1"), _nested_value("2")))
  self.assertIsInstance(result, tuple)
  self.assertEqual(3, len(result))
  self._is_per_replica(result[0], ["a1", "a2"])
  self._is_per_replica(result[2], ["h1", "h2"])

  self.assertIsInstance(result[1], list)
  self.assertEqual(3, len(result[1]))
  self._is_per_replica(result[1][0], ["b1", "b2"])
  self._is_per_replica(result[1][2], ["g1", "g2"])

  self.assertIsInstance(result[1][1], dict)
  self.assertEqual(set(["c", "e"]), set(result[1][1].keys()))
  self._is_per_replica(result[1][1]["c"], ["d1", "d2"])
  self._is_per_replica(result[1][1]["e"], ["f1", "f2"])

  # Also test that we can undo the merge using select_replica()
  self.assertEqual(_nested_value("1"),
                   values.select_replica(0, result))
  self.assertEqual(_nested_value("2"),
                   values.select_replica(1, result))
  # select_device_mirrored() should fail due to non-mirrored values
  with self.assertRaises(TypeError):
    values.select_device_mirrored(_device_str(0), result)
  with self.assertRaises(TypeError):
    values.select_device_mirrored(_device_str(1), result)
def testWrapClass(self):
  # Normally a mirrored value would be the same across devices, but
  # for a test it is convenient to be able to tell the values apart.
  device_map = values.ReplicaDeviceMap((_device_str(0), _device_str(1)))
  result = values.regroup(device_map,
                          (_nested_value("1"), _nested_value("2")),
                          values.Mirrored)
  self.assertIsInstance(result, tuple)
  self.assertEqual(3, len(result))
  self._is_per_replica(result[0], ["a1", "a2"], values.Mirrored)
  self._is_per_replica(result[2], ["h1", "h2"], values.Mirrored)

  self.assertIsInstance(result[1], list)
  self.assertEqual(3, len(result[1]))
  self._is_per_replica(result[1][0], ["b1", "b2"], values.Mirrored)
  self._is_per_replica(result[1][2], ["g1", "g2"], values.Mirrored)

  self.assertIsInstance(result[1][1], dict)
  self.assertEqual(set(["c", "e"]), set(result[1][1].keys()))
  self._is_per_replica(result[1][1]["c"], ["d1", "d2"], values.Mirrored)
  self._is_per_replica(result[1][1]["e"], ["f1", "f2"], values.Mirrored)

  # Also test that we can undo the merge using select_replica()
  self.assertEqual(_nested_value("1"),
                   values.select_replica(0, result))
  self.assertEqual(_nested_value("2"),
                   values.select_replica(1, result))
  # Values are marked as mirrored, so select_device_mirrored() is allowed.
  self.assertEqual(_nested_value("1"),
                   values.select_device_mirrored(_device_str(0), result))
  self.assertEqual(_nested_value("2"),
                   values.select_device_mirrored(_device_str(1), result))
def _make_mirrored_indexed_slices(devices, values, indices, dense_shape):
  values = [_make_indexed_slices(values, indices, dense_shape, d)
            for d in devices]
  return value_lib.regroup(
      value_lib.ReplicaDeviceMap(devices),
      values,
      wrap_class=value_lib.Mirrored)
def distributed_function(x, y, sample_weights, learning_phase=None):
  """A single step of the distributed execution across replicas."""
  del learning_phase

  # TODO(b/129653859): Simplify after PerReplica can be the input of
  # `def_function.function`. `regroup` calls and re-wrapping in
  # PerReplica won't be needed then.
  if isinstance(strategy, one_device_strategy.OneDeviceStrategy):
    device_map = values.SingleDeviceMap(devices[0])
    wrap_class = lambda d, x: x
  else:
    device_map = values.ReplicaDeviceMap(devices)
    wrap_class = values.PerReplica

  # Transform each list of lists of values into per-replica objects
  # in the case of mirrored strategy. For example, for 2 replicas:
  #   [[x0, y0], [x1, y1]] > [PerReplica(d0:x0, d1:x1),
  #                           PerReplica(d0:y0, d1:y1)]
  x = values.regroup(device_map, x, wrap_class)
  y = values.regroup(device_map, y, wrap_class) if y else None
  sample_weights = values.regroup(device_map, sample_weights,
                                  wrap_class) if sample_weights else None

  # Call `Model.{train,test,predict}_on_batch` on every replica passing
  # PerReplicas as arguments. On every replica inside this call, each
  # PerReplica object will return the value for that replica. The outputs
  # are PerReplicas too.
  outputs = strategy.experimental_run_v2(
      per_replica_function, args=(x, y, sample_weights))

  # Out of PerReplica outputs reduce or pick values to return.
  all_outputs = unwrap_outputs(
      strategy, outputs, with_loss_tensor=(mode != ModeKeys.PREDICT))
  return all_outputs
def _initialize_multi_worker(self, devices):
  """Initializes the object for multi-worker training."""
  self._local_mode = False

  assert devices, "Must specify at least one device."
  devices = tuple(device_util.resolve(d) for d in devices)
  assert len(set(devices)) == len(devices), (
      "No duplicates allowed in `devices` argument: %s" % devices)
  # TODO(josh11b): Require at least 2 devices?

  device_dict = _group_device_list(devices)
  workers = []
  worker_devices = []
  for job in ("chief", "worker"):
    for task in range(len(device_dict.get(job, []))):
      worker = "/job:%s/task:%d" % (job, task)
      workers.append(worker)
      worker_devices.append((worker, device_dict[job][task]))

  # Setting `_default_device` will add a device scope in the
  # distribution.scope. We set the default device to the first worker. When
  # users specify device under distribution.scope by
  #   with tf.device("/cpu:0"):
  #     ...
  # their ops will end up on the cpu device of its first worker, e.g.
  # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
  self._default_device = workers[0]
  self._host_input_device = numpy_dataset.SingleDevice(workers[0])

  self._device_map = values.ReplicaDeviceMap(devices)
  self._input_workers = input_lib.InputWorkers(self._device_map,
                                               worker_devices)
  self._inferred_cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
      workers, _infer_num_gpus_per_worker(devices))
def testWrapAListOfTwoTuples(self):
  device_map = values.ReplicaDeviceMap((_device_str(0), _device_str(1)))
  result = values.regroup(device_map, [("1", "2"), ("3", "4")])
  self.assertIsInstance(result, tuple)
  self.assertEqual(2, len(result))
  self._is_per_replica(result[0], ("1", "3"), values.PerReplica)
  self._is_per_replica(result[1], ("2", "4"), values.PerReplica)
def _initialize_local(self, cluster_resolver):
  """Initialize internal devices for local training."""
  worker_device = device_util.canonicalize("/device:CPU:0")
  num_gpus = cluster_resolver.num_accelerators()

  # Define compute devices which is a list of device strings and one for each
  # replica. When there are GPUs, replicate operations on these GPUs.
  # Otherwise, place operations on CPU.
  if num_gpus > 0:
    compute_devices = tuple(map("/device:GPU:{}".format, range(num_gpus)))
  else:
    compute_devices = (_LOCAL_CPU,)

  self._device_map = values.ReplicaDeviceMap(compute_devices)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, [(worker_device, compute_devices)])

  # If there is only one GPU, put everything on that GPU. Otherwise, place
  # variables on CPU.
  if num_gpus == 1:
    assert len(compute_devices) == 1
    self._variable_device = _LOCAL_GPU_0
    self._parameter_devices = (_LOCAL_GPU_0,)
  else:
    self._variable_device = _LOCAL_CPU
    self._parameter_devices = (_LOCAL_CPU,)

  self._is_chief = True
  self._cluster_spec = None
  self._task_type = None
  self._task_id = None

  logging.info(
      "ParameterServerStrategy with compute_devices = %r, "
      "variable_device = %r", compute_devices, self._variable_device)
def observe(images, actions, rewards, state):
  images = tf.to_float(images) / 255.0 - 0.5
  # Break down the inputs along the batch dimension to form equal sized
  # tensors in each replica.
  num_replicas = strategy.num_replicas_in_sync
  images = tf.split(images, num_replicas)
  actions = tf.split(actions, num_replicas)
  state = {
      key: tf.split(value, num_replicas) for key, value in state.items()
  }
  devices = values.ReplicaDeviceMap(strategy.extended.worker_devices)
  dist_images = values.PerReplica(devices, tuple(images))
  dist_actions = values.PerReplica(devices, tuple(actions))
  dist_state = []
  for i in range(num_replicas):
    dist_state.append({key: value[i] for key, value in state.items()})
  dist_state = values.PerReplica(devices, tuple(dist_state))

  _, dist_posteriors = strategy.experimental_run_v2(
      model.observe, args=(dist_actions, dist_images, dist_state))
  dist_posteriors = {
      key: strategy.experimental_local_results(value)
      for key, value in dist_posteriors.items()
  }
  posteriors = {
      key: tf.concat(value, axis=0)
      for key, value in dist_posteriors.items()
  }
  posteriors = {key: value[:, -1] for key, value in posteriors.items()}
  posteriors['rewards'] = rewards[:, -1]
  return posteriors
def _initialize_local(self, devices):
  """Initializes the object for local training."""
  self._local_mode = True
  self._device_map = values.ReplicaDeviceMap(devices)
  self._input_workers = input_lib.InputWorkers(self._device_map)
  self._inferred_cross_device_ops = None if self._cross_device_ops else (
      cross_device_ops_lib.choose_the_best(devices))
  self._host_input_device = numpy_dataset.SingleDevice("/cpu:0")
def testValueErrorForIterator(self):
  # Incompatible arguments.
  d1 = "/device:GPU:0"
  d2 = "/device:GPU:1"
  device_map = values.ReplicaDeviceMap([d1, d2])
  input_workers = values.InputWorkers(
      device_map, (("w1", (d1,)), ("w2", (d2,))))
  with self.assertRaises(ValueError):
    values.MultiWorkerDataIterator([("w1", None)], input_workers)
def _fake_mirrored(value, devices):
  """Create a faked Mirrored object for testing.

  All components of the returned Mirrored have the same objects, which is not
  true in reality.
  """
  devices = _get_devices(devices)
  return value_lib.Mirrored(value_lib.ReplicaDeviceMap(devices),
                            [value] * len(devices))
def _make_replica_local(method, strategy=None):
  device_map = values.ReplicaDeviceMap(_devices)
  v = []
  for d, n, init in zip(_devices, ["v", "v/replica"], [1., 2.]):
    with ops.device(d):
      v.append(variable_scope.get_variable(
          name=n, initializer=init, use_resource=True))
  replica_local = values.ReplicaLocalVariable(strategy, device_map, v, method)
  return v, replica_local
def testOneDevice(self):
  device_map = values.ReplicaDeviceMap((_device_str(0),))
  result = values.regroup(device_map, (_nested_value("1"),))
  # On one device regroup() and select_replica() are basically identity.
  self.assertEqual(_nested_value("1"), result)
  self.assertEqual(_nested_value("1"), values.select_replica(0, result))

  # The one exception has to do with MirroredVariables.
  d = "/device:CPU:0"
  with ops.device(d):
    v = variable_scope.get_variable(
        name="v", initializer=1., use_resource=True)
    device_map = values.ReplicaDeviceMap((d,))
    mirrored = values.MirroredVariable(None, device_map, (v,),
                                       variable_scope.VariableAggregation.SUM)
  result = values.regroup(device_map, (v,))
  self.assertIs(mirrored, result)
def testContainsIndexedSlices_PerReplica(self):
  t0 = math_ops._as_indexed_slices(
      constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
  t1 = math_ops._as_indexed_slices(
      constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
  device_map = value_lib.ReplicaDeviceMap(("/gpu:0", "/cpu:0"))
  per_replica = value_lib.PerReplica(device_map, (t0, t1))
  self.assertTrue(cross_device_utils.contains_indexed_slices(per_replica))
def testPassPerReplica(self, distribution):
  @function.defun
  def fn1(mock_model, factor):
    return mock_model(factor)

  device_map = values.ReplicaDeviceMap(("/device:CPU:0", "/device:GPU:0"))
  factors = values.PerReplica(device_map, (5.0, 3.0))
  expected_result = values.PerReplica(device_map, (5.0 * 1.25, 3.0 * 1.25))
  self._call_and_check(distribution, fn1, [factors], expected_result, [fn1])
def _create_variable(self, next_creator, *args, **kwargs):
  colocate_with = kwargs.pop("colocate_with", None)
  if colocate_with is None:
    device_map = values.ReplicaDeviceMap([self._variable_device])
    logical_device = 0
  elif isinstance(colocate_with, numpy_dataset.SingleDevice):
    with ops.device(colocate_with.device):
      return next_creator(*args, **kwargs)
  else:
    device_map = colocate_with.device_map
    logical_device = colocate_with.logical_device

  def _real_creator(devices, *args, **kwargs):
    assert len(devices) == 1
    assert devices[0] == self._variable_device

    # The chief worker will initialize and broadcast the value to
    # the other workers. Always done on the host.
    kwargs["initial_value"] = self._get_variable_creator_initial_value(
        replica_id=0,  # First (and only) replica on each worker.
        device=self._host_device,
        primary_var=None,
        **kwargs)

    # We always place sync-on-read variables on the IPU. They will
    # be transferred and reduced on the hosts only when read.
    synchronization = kwargs.get("synchronization")
    if (not self._variables_on_host or
        synchronization == variable_scope.VariableSynchronization.ON_READ):
      with ops.device(self._ipu_device):
        return [next_creator(*args, **kwargs)]

    # Cache a snapshot of the variable on the IPU device,
    # otherwise the XLA cluster containing the ops consuming the
    # variable might be moved to the host to be colocated with it.
    kwargs["caching_device"] = self._ipu_device

    # In case we are inside an ipu_jit_scope, we need to override it
    # to disable XLA for variable initialization on the host.
    disable_xla = {
        "_XlaCompile": attr_value_pb2.AttrValue(b=False),
        "_XlaScope": attr_value_pb2.AttrValue(s=b''),
    }
    graph = ops.get_default_graph()
    with ops.device(self._host_device), \
        graph._attr_scope(disable_xla):  # pylint: disable=protected-access
      return [next_creator(*args, **kwargs)]

  # For tf1: use distribute_lib.create_mirrored_variable
  return values.create_mirrored_variable(self._container_strategy(),
                                         device_map, logical_device,
                                         _real_creator, IPUMirroredVariable,
                                         IPUSyncOnReadVariable, *args,
                                         **kwargs)
def testGetEager(self):
  with ops.device("/device:CPU:0"):
    one = constant_op.constant(1)
    two = constant_op.constant(2)
    device_map = values.ReplicaDeviceMap(("/device:CPU:0", "/device:GPU:0"))
    v = values.DistributedValues(device_map, (one, two))
    self.assertEqual(two, v.get("/device:GPU:0"))
    self.assertEqual(one, v.get())
    with self.assertRaises(ValueError):
      self.assertIsNone(v.get("/device:GPU:2"))
def __init__(self, container_strategy, ipu_device, cpu_device):
  super().__init__(container_strategy)
  self._ipu_device = ipu_device
  self._cpu_device = cpu_device

  device_map = values.ReplicaDeviceMap([self._cpu_device])
  worker_device_pairs = [("", [self._cpu_device])]
  self._input_workers = input_lib.InputWorkers(device_map,
                                               worker_device_pairs)
def _test_dataset(self, dataset_fn, worker_devices, devices, expected_values):
  device_map = values.ReplicaDeviceMap(devices)
  input_workers = values.InputWorkers(device_map, worker_devices)
  multi_worker_dataset = values.MultiWorkerDataset(dataset_fn, input_workers)
  multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
  with self.cached_session() as sess:
    sess.run(multi_worker_iterator.initializer)
    self._test_iterator(sess, multi_worker_iterator, devices, expected_values)
def testVariableOnAnotherDevice(self):
  v = variable_scope.get_variable(
      name="v", initializer=[1.], use_resource=True)
  device_map = values.ReplicaDeviceMap(("/job:foo/device:CPU:0",))
  mirrored = values.MirroredVariable(None, device_map, (v,),
                                     variable_scope.VariableAggregation.MEAN)

  self.assertEqual(v.name, mirrored.name)
  self.assertEqual(v.dtype, mirrored.dtype)
  self.assertEqual(v.shape, mirrored.shape)
def testFetchAMirroredVariable(self, distribution):
  with self.session(graph=ops.Graph()) as sess, distribution.scope():
    with ops.device("/device:GPU:0"):
      v = variable_scope.get_variable(
          name="v", initializer=1., use_resource=True)
    mirrored = values.MirroredVariable(
        distribution, values.ReplicaDeviceMap(("/device:GPU:0",)), (v,),
        variable_scope.VariableAggregation.MEAN)
    sess.run(variables_lib.global_variables_initializer())
    sess.run({"complicated": mirrored})