def _create_tpu_strategy():
  # `use_single_core`, `steps_per_run`, and `kwargs` are expected to be bound
  # in the enclosing scope of this factory function.
  resolver = tpu_cluster_resolver.TPUClusterResolver("")
  topology = tpu_strategy_util.initialize_tpu_system(resolver)
  device_assignment = None
  if use_single_core:
    device_assignment = device_assignment_lib.DeviceAssignment(
        topology,
        core_assignment=device_assignment_lib.SINGLE_CORE_ASSIGNMENT)

  # Steps per run is only supported in TF 1.x.
  if tf2.enabled():
    return tpu_lib.TPUStrategy(resolver, device_assignment, **kwargs)
  else:
    return tpu_lib.TPUStrategyV1(resolver, steps_per_run, device_assignment,
                                 **kwargs)
def test_device_assignment_constants(self):
  resolver = get_tpu_cluster_resolver()
  remote.connect_to_cluster(resolver)
  topology = tpu_strategy_util.initialize_tpu_system(resolver)
  device_assignment = device_assignment_lib.DeviceAssignment(
      topology, core_assignment=device_assignment_lib.SINGLE_CORE_ASSIGNMENT)
  self.assertAllEqual([[[0, 0, 0, 0]]], device_assignment.core_assignment)
  self.assertEqual(1, device_assignment.num_cores_per_replica)
  self.assertEqual(1, device_assignment.num_replicas)
  self.assertEqual("/task:0/device:TPU:0", device_assignment.tpu_device())
  self.assertEqual("/task:0/device:CPU:0", device_assignment.host_device())
def _create_tpu_strategy():
  FLAGS = flags.FLAGS  # pylint: disable=invalid-name
  global _did_connect_to_cluster
  global _topology

  try:
    # Attempt to locally discover the TPU. This will fail for Cloud TPU, in
    # which case we fall back to the values passed as flags.
    resolver = tpu_cluster_resolver.TPUClusterResolver()
    did_automatically_resolve = True
  except ValueError:
    did_automatically_resolve = False

    # These flags will be defined by tpu_test_wrapper.py.
    resolver = tpu_cluster_resolver.TPUClusterResolver(
        tpu=hasattr(FLAGS, "tpu") and FLAGS.tpu or "",
        zone=hasattr(FLAGS, "zone") and FLAGS.zone or None,
        project=hasattr(FLAGS, "project") and FLAGS.project or None,
    )

  # Only connect once per process, rather than per test method.
  if not _did_connect_to_cluster:
    if getattr(FLAGS, "tpu", "") or did_automatically_resolve:
      remote.connect_to_cluster(resolver)
      _did_connect_to_cluster = True
    _topology = tpu_strategy_util.initialize_tpu_system(resolver)

  device_assignment = None
  if use_single_core:
    device_assignment = device_assignment_lib.DeviceAssignment(
        _topology,
        core_assignment=device_assignment_lib.SINGLE_CORE_ASSIGNMENT)

  # Steps per run is only supported in TF 1.x.
  if tf2.enabled():
    strategy = tpu_lib.TPUStrategyV2(
        resolver,
        device_assignment,
        experimental_spmd_xla_partitioning=enable_spmd_xla_paritioning,
        **kwargs)
  else:
    strategy = tpu_lib.TPUStrategyV1(resolver, steps_per_run,
                                     device_assignment, **kwargs)
  if enable_packed_variable and enable_spmd_xla_paritioning:
    raise ValueError("Packed Variable is not compatible with SPMD mode")
  strategy._enable_packed_variable_in_eager_mode = enable_packed_variable  # pylint: disable=protected-access
  return strategy
def test_model_parallelism(self):
  resolver = get_tpu_cluster_resolver()
  remote.connect_to_cluster(resolver)
  topology = tpu_strategy_util.initialize_tpu_system(resolver)
  device_assignment = device_assignment_lib.DeviceAssignment(
      topology, core_assignment=[[[0, 0, 0, 0], [0, 0, 0, 1]]])
  strategy = tpu_lib.TPUStrategyV2(
      resolver, experimental_device_assignment=device_assignment)

  with strategy.scope():
    v = variables.Variable(2.)
    with strategy.extended.experimental_logical_device(1):
      w = variables.Variable(3.)

  self.assertLen(strategy.experimental_local_results(v), 1)
  self.assertLen(strategy.experimental_local_results(w), 1)
  self.assertEqual("/job:localhost/replica:0/task:0/device:TPU:0",
                   strategy.experimental_local_results(v)[0].device)
  self.assertEqual("/job:localhost/replica:0/task:0/device:TPU:1",
                   strategy.experimental_local_results(w)[0].device)

  logical_devices = []

  @def_function.function
  def f(x):
    replica_ctx = distribution_strategy_context.get_replica_context()
    with replica_ctx.experimental_logical_device(0):
      y = v * x
    with replica_ctx.experimental_logical_device(1):
      z = w * y
    logical_devices.append((y.device, z.device))
    return z

  result = strategy.run(f, args=(5.,))

  self.assertEqual(
      [("/device:TPU_REPLICATED_CORE:0", "/device:TPU_REPLICATED_CORE:1")],
      logical_devices)

  with self.cached_session():
    self.evaluate(variables.global_variables_initializer())
    self.assertEqual(30., self.evaluate(result))
def test_computation_on_subset_cores(self, enable_packed_var):
  resolver = get_tpu_cluster_resolver()
  remote.connect_to_cluster(resolver)
  topology = tpu_strategy_util.initialize_tpu_system(resolver)
  all_core_strategy = tpu_lib.TPUStrategyV2(resolver)
  all_core_strategy._enable_packed_variable_in_eager_mode = enable_packed_var

  with all_core_strategy.scope():
    v = variables.Variable(0.0,
                           aggregation=variables.VariableAggregation.MEAN)

  # Computation on the 1st core.
  device_assignment = device_assignment_lib.DeviceAssignment.build(
      topology, num_replicas=1)
  first_core_strategy = tpu_lib.TPUStrategyV2(
      resolver, experimental_device_assignment=device_assignment)
  first_core_strategy._enable_packed_variable_in_eager_mode = (
      enable_packed_var)

  # Computation on the 2nd core.
  device_assignment2 = device_assignment_lib.DeviceAssignment(
      topology, [[[0, 0, 0, 1]]])
  second_core_strategy = tpu_lib.TPUStrategyV2(
      resolver, experimental_device_assignment=device_assignment2)
  second_core_strategy._enable_packed_variable_in_eager_mode = (
      enable_packed_var)

  @def_function.function
  def train_step():

    def step_fn():
      return v + 1.0

    all_core_strategy.run(step_fn)
    r1 = first_core_strategy.run(step_fn)
    r2 = second_core_strategy.run(step_fn)
    return r1 + r2

  train_step()
  self.assertAllEqual(2., train_step())
def test_variables_mismatched_device_assignment(self):
  resolver = get_tpu_cluster_resolver()
  remote.connect_to_cluster(resolver)
  topology = tpu_strategy_util.initialize_tpu_system(resolver)

  strategy0 = tpu_lib.TPUStrategyV2(resolver)
  self.assertEqual(
      ("/job:localhost/replica:0/task:0/device:TPU:0",
       "/job:localhost/replica:0/task:0/device:TPU:1"),
      strategy0.extended.worker_devices)

  with strategy0.scope():
    v = variables.Variable(1.)

  v1_assign_op = strategy0.experimental_local_results(v)[1].assign(42.)

  with self.cached_session():
    self.evaluate(variables.global_variables_initializer())
    self.evaluate(v1_assign_op)
    self.assertAllEqual(
        [1., 42.],
        self.evaluate(strategy0.experimental_local_results(v)))

  # Second strategy has devices reversed relative to the first.
  device_assignment = device_assignment_lib.DeviceAssignment(
      topology, core_assignment=[[[0, 0, 0, 1]], [[0, 0, 0, 0]]])
  strategy1 = tpu_lib.TPUStrategyV2(
      resolver, experimental_device_assignment=device_assignment)
  self.assertEqual(
      ("/job:localhost/replica:0/task:0/device:TPU:1",
       "/job:localhost/replica:0/task:0/device:TPU:0"),
      strategy1.extended.worker_devices)

  v_read = strategy1.run(def_function.function(v.read_value))

  with self.cached_session():
    self.assertAllEqual(
        [42., 1.],
        self.evaluate(strategy0.experimental_local_results(v_read)))
def _create_tpu_strategy():
  global _did_connect_to_cluster

  try:
    # Attempt to locally discover the TPU. This will fail for Cloud TPU, in
    # which case we fall back to the values passed as flags.
    resolver = tpu_cluster_resolver.TPUClusterResolver()
    did_automatically_resolve = True
  except ValueError:
    did_automatically_resolve = False

    # These flags will be defined by tpu_test_wrapper.py.
    resolver = tpu_cluster_resolver.TPUClusterResolver(
        tpu=hasattr(FLAGS, "tpu") and FLAGS.tpu or "",
        zone=hasattr(FLAGS, "zone") and FLAGS.zone or None,
        project=hasattr(FLAGS, "project") and FLAGS.project or None,
    )

  # Only connect once per process, rather than per test method.
  if getattr(FLAGS, "tpu", "") or did_automatically_resolve:
    if not _did_connect_to_cluster:
      remote.connect_to_cluster(resolver)
      _did_connect_to_cluster = True

  topology = tpu_strategy_util.initialize_tpu_system(resolver)

  device_assignment = None
  if use_single_core:
    device_assignment = device_assignment_lib.DeviceAssignment(
        topology,
        core_assignment=device_assignment_lib.SINGLE_CORE_ASSIGNMENT)

  # Steps per run is only supported in TF 1.x.
  if tf2.enabled():
    return tpu_lib.TPUStrategy(resolver, device_assignment, **kwargs)
  else:
    return tpu_lib.TPUStrategyV1(resolver, steps_per_run, device_assignment,
                                 **kwargs)
def test_worker_devices_on_subset_cores(self):
  resolver = get_tpu_cluster_resolver()
  remote.connect_to_cluster(resolver)
  topology = tpu_strategy_util.initialize_tpu_system(resolver)

  # Strategy for the 1st core.
  device_assignment = device_assignment_lib.DeviceAssignment.build(
      topology, num_replicas=1)
  first_core_strategy = tpu_lib.TPUStrategy(
      resolver, device_assignment=device_assignment)

  # Strategy for the 2nd core.
  device_assignment2 = device_assignment_lib.DeviceAssignment(
      topology, [[[0, 0, 0, 1]]])
  second_core_strategy = tpu_lib.TPUStrategy(
      resolver, device_assignment=device_assignment2)

  self.assertLen(first_core_strategy.extended.worker_devices, 1)
  self.assertEndsWith(first_core_strategy.extended.worker_devices[0],
                      "device:TPU:0")

  self.assertLen(second_core_strategy.extended.worker_devices, 1)
  self.assertEndsWith(second_core_strategy.extended.worker_devices[0],
                      "device:TPU:1")
def get_core_assignment(*core_ids):
  return device_assignment_lib.DeviceAssignment(
      get_topology(),
      [[get_topology().device_coordinates[0][i]] for i in core_ids])