def testSimpleSuccessfulRetrieval(self):
  tpu_map = {
      'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
          'ipAddress': '10.1.2.3',
          'port': '8470',
          'health': 'HEALTHY'
      }
  }

  resolver = TPUClusterResolver(
      project='test-project',
      zone='us-central1-c',
      tpu=['test-tpu-1'],
      coordinator_name='coordinator',
      coordinator_address='10.128.1.5:10203',
      credentials=None,
      service=self.mock_service_client(tpu_map=tpu_map))

  actual_cluster_spec = resolver.cluster_spec()
  expected_proto = """
  job { name: 'coordinator' tasks { key: 0 value: '10.128.1.5:10203' } }
  job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
  """
  self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
  self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
def testRetrieveProjectAndZoneFromMetadata(self):
  tpu_map = {
      'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
          'ipAddress': '10.1.2.3',
          'port': '8470',
          'health': 'HEALTHY'
      }
  }

  resolver = TPUClusterResolver(
      project=None,
      zone=None,
      tpu=['test-tpu-1'],
      credentials=None,
      service=self.mock_service_client(tpu_map=tpu_map),
      coordinator_name='coordinator')

  actual_cluster_spec = resolver.cluster_spec()
  expected_proto = """
  job { name: 'coordinator' tasks { key: 0 value: '10.128.1.2:%s' } }
  job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
  """ % resolver._coordinator_port
  self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
  self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
def testNewNetworkEndpointFormat(self):
  tpu_map = {
      'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
          'health': 'HEALTHY',
          'networkEndpoints': [{
              'ipAddress': '10.2.3.4',
              'port': 8470,
          }]
      }
  }

  resolver = TPUClusterResolver(
      project='test-project',
      zone='us-central1-c',
      tpu='test-tpu-1',
      coordinator_name='coordinator',
      coordinator_address='10.128.1.5:10203',
      credentials=None,
      service=self.mock_service_client(tpu_map=tpu_map))

  actual_cluster_spec = resolver.cluster_spec()
  expected_proto = """
  job { name: 'coordinator' tasks { key: 0 value: '10.128.1.5:10203' } }
  job { name: 'worker' tasks { key: 0 value: '10.2.3.4:8470' } }
  """
  self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
  self.assertEqual('grpc://10.2.3.4:8470', resolver.master())
def testGkeEnvironmentForPod(self):
  os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = ('grpc://10.120.27.5:8470,'
                                                   'grpc://10.120.27.6:8470,'
                                                   'grpc://10.120.27.7:8470,'
                                                   'grpc://10.120.27.8:8470')

  self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
  self.assertTrue(TPUClusterResolver._inGke())
  self.assertEqual(
      compat.as_bytes('grpc://10.120.27.5:8470,'
                      'grpc://10.120.27.6:8470,'
                      'grpc://10.120.27.7:8470,'
                      'grpc://10.120.27.8:8470'),
      compat.as_bytes(TPUClusterResolver._gkeEndpoints()))

  resolver = TPUClusterResolver()
  self.assertEqual(
      compat.as_bytes('grpc://10.120.27.5:8470'),
      compat.as_bytes(resolver.master()))
  actual_cluster_spec = resolver.cluster_spec()
  expected_proto = """
  job {
    name: 'worker'
    tasks { key: 0 value: '10.120.27.5:8470' }
    tasks { key: 1 value: '10.120.27.6:8470' }
    tasks { key: 2 value: '10.120.27.7:8470' }
    tasks { key: 3 value: '10.120.27.8:8470' }
  }
  """
  self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)

  del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS']
def testNumAcceleratorsRetryFailure(self, mock_list_devices,
                                    mock_eager_list_devices):
  resolver = TPUClusterResolver(tpu='')
  mock_list_devices.side_effect = errors.DeadlineExceededError(
      None, None, 'timeout')
  mock_eager_list_devices.side_effect = errors.DeadlineExceededError(
      None, None, 'timeout')
  with self.assertRaises(RuntimeError):
    resolver.num_accelerators()
def verifyShouldResolve(self, tpu, should_resolve):
  resolver = TPUClusterResolver(
      project='test-project',
      zone='us-central1-c',
      tpu=tpu,
      coordinator_name=None,
      credentials=None,
      service=self.mock_service_client(tpu_map={}))
  self.assertEqual(should_resolve, resolver._shouldResolve(),
                   "TPU: '%s'" % tpu)
def testVerifySameCoreCount(self):
  self.assertEqual(
      TPUClusterResolver._verify_and_return_same_core_count(
          {0: [0, 1, 2, 3, 4, 5, 6, 7]}), 8)
  self.assertEqual(
      TPUClusterResolver._verify_and_return_same_core_count(
          {0: [0, 1], 1: [2, 3]}), 2)
  with self.assertRaises(RuntimeError):
    TPUClusterResolver._verify_and_return_same_core_count(
        {0: [0], 1: [1, 2]})
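# Illustrative sketch, not part of the original test file: the behaviour the
# test above exercises can be written as a small helper that checks every
# task exposes the same number of cores and returns that shared count. The
# helper name `same_core_count` is hypothetical.
def same_core_count(device_map):
  counts = {len(cores) for cores in device_map.values()}
  if len(counts) != 1:
    raise RuntimeError('TPU tasks expose differing core counts: %s' % counts)
  return counts.pop()

# e.g. same_core_count({0: [0, 1], 1: [2, 3]}) == 2, while
# same_core_count({0: [0], 1: [1, 2]}) raises RuntimeError.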
def testPodResolution(self):
  tpu_map = {
      'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
          'health': 'HEALTHY',
          'networkEndpoints': [
              {'ipAddress': '10.2.3.4', 'port': 8470},
              {'ipAddress': '10.2.3.5', 'port': 8470},
              {'ipAddress': '10.2.3.6', 'port': 8470},
              {'ipAddress': '10.2.3.7', 'port': 8470},
          ]
      }
  }

  resolver = TPUClusterResolver(
      tpu='test-tpu-1',
      credentials=None,
      service=self.mock_service_client(tpu_map=tpu_map),
      coordinator_name='coordinator')

  actual_cluster_spec = resolver.cluster_spec()
  expected_proto = """
  job { name: 'coordinator', tasks { key: 0 value: '10.128.1.2:%s' } }
  job {
    name: 'worker'
    tasks { key: 0 value: '10.2.3.4:8470' }
    tasks { key: 1 value: '10.2.3.5:8470' }
    tasks { key: 2 value: '10.2.3.6:8470' }
    tasks { key: 3 value: '10.2.3.7:8470' }
  }
  """ % resolver._coordinator_port
  self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
  self.assertEqual(resolver.master(), 'grpc://10.2.3.4:8470')
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices in a separate session and graph.

  Args:
    cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.contrib.tpu.Topology object for the topology of the TPU cluster.
  """
  if cluster_resolver is None:
    cluster_resolver = TPUClusterResolver("")
  master = cluster_resolver.master()

  logging.info("Initializing the TPU system.")

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    # The easiest way to trigger a rewrite is to run the function with
    # TPUPartitionedCallOp.
    @function.defun
    def _tpu_init_fn():
      return tpu.initialize_system()

    # We can't call _tpu_init_fn normally (because it contains just a dummy op,
    # see above) but need to define it to get it added to eager context
    # and get its assigned name.
    # pylint: disable=protected-access
    graph_func = _tpu_init_fn._get_concrete_function_internal()
    func_name = compat.as_str(graph_func._inference_function.name)
    # pylint: enable=protected-access

    output = tpu_functional_ops.TPUPartitionedCall(
        args=[], device_ordinal=0, Tout=[dtypes.string], f=func_name)
    serialized_topology = output[0].numpy()
  else:
    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())

  logging.info("Finished initializing TPU system.")
  return topology.Topology(serialized=serialized_topology)
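# Illustrative usage sketch, not part of the original module: the intended
# call pattern is to build a resolver once, initialize the TPU system, and
# reuse the returned topology. The TPU name 'my-tpu' and the helper name
# `_example_initialize` are placeholders and assume a reachable Cloud TPU.
def _example_initialize(tpu_name='my-tpu'):
  resolver = TPUClusterResolver(tpu=tpu_name)
  tpu_topology = initialize_tpu_system(resolver)
  logging.info("TPU topology has %d tasks.", tpu_topology.num_tasks)
  return tpu_topology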
def testNotReadyCloudTpu(self):
  tpu_map = {
      'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
          'ipAddress': '10.1.2.3',
          'port': '8470',
          'state': 'CREATING'
      }
  }

  resolver = TPUClusterResolver(
      project=None,
      zone=None,
      tpu='test-tpu-1',
      coordinator_name=None,
      credentials=None,
      service=self.mock_service_client(tpu_map=tpu_map))

  with self.assertRaises(RuntimeError):
    resolver.cluster_spec()
def testNumAcceleratorsSuccess(self, mock_list_devices):
  device_names = [
      '/job:tpu_worker/task:0/device:TPU:0',
      '/job:tpu_worker/task:1/device:TPU:1',
      '/job:tpu_worker/task:2/device:TPU:0',
      '/job:tpu_worker/task:3/device:TPU:1',
      '/job:tpu_worker/task:0/device:TPU:4',
      '/job:tpu_worker/task:1/device:TPU:5',
      '/job:tpu_worker/task:2/device:TPU:4',
      '/job:tpu_worker/task:3/device:TPU:5',
  ]
  device_list = [
      session._DeviceAttributes(name, 'TPU', 1024, 0) for name in device_names
  ]
  mock_list_devices.return_value = device_list

  resolver = TPUClusterResolver(tpu='')
  self.assertEqual(resolver.num_accelerators(), 2)
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices in a separate session and graph.

  Args:
    cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.contrib.tpu.Topology object for the topology of the TPU cluster.
  """
  if cluster_resolver is None:
    cluster_resolver = TPUClusterResolver("")
  master = cluster_resolver.master()

  logging.info("Initializing the TPU system.")
  session_config = config_pb2.ConfigProto(allow_soft_placement=True)
  with ops.Graph().as_default():
    with session_lib.Session(config=session_config, target=master) as sess:
      serialized_topology = sess.run(tpu.initialize_system())
  logging.info("Finished initializing TPU system.")

  return topology.Topology(serialized=serialized_topology)
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
  super(TPUExtended, self).__init__(container_strategy)

  if tpu_cluster_resolver is None:
    tpu_cluster_resolver = TPUClusterResolver("")

  if steps_per_run is None:
    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    steps_per_run = 1

  self._tpu_function_cache = weakref.WeakKeyDictionary()
  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
  self._device_assignment = device_assignment

  # TODO(jhseu): Switch to DeviceAssignment to support pods and model
  # parallelism.
  self._tpu_devices = [
      d.name for d in self._tpu_metadata.devices if "device:TPU:" in d.name
  ]

  self._host_device = device_util.get_host_for_device(self._tpu_devices[0])

  # Only create variables for the number of replicas we're running.
  self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
  self._device_map = values.ReplicaDeviceMap(self._tpu_devices)

  # Preload the data onto the TPUs.
  input_worker_devices = collections.OrderedDict()
  for tpu_device in self._tpu_devices:
    host_device = device_util.get_host_for_device(tpu_device)
    input_worker_devices.setdefault(host_device, [])
    input_worker_devices[host_device].append(tpu_device)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, tuple(input_worker_devices.items()))

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run
  self._require_static_shapes = True

  self.experimental_enable_get_next_as_optional = True
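# Standalone sketch, not part of the original class: the per-host grouping
# performed above simply buckets each TPU device under its host device. The
# helper name and the device strings below are hypothetical.
import collections

def _group_devices_by_host(tpu_devices, host_for_device):
  grouped = collections.OrderedDict()
  for tpu_device in tpu_devices:
    grouped.setdefault(host_for_device(tpu_device), []).append(tpu_device)
  return grouped

# e.g. _group_devices_by_host(
#     ['/job:worker/task:0/device:TPU:0', '/job:worker/task:0/device:TPU:1',
#      '/job:worker/task:1/device:TPU:0'],
#     lambda d: d.rsplit('/device:', 1)[0] + '/device:CPU:0')
# groups the two task:0 TPUs under task:0's CPU host and the task:1 TPU under
# task:1's CPU host, preserving insertion order.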
def testRetrieveProjectAndZoneFromMetadataNoCoordinator(self):
  tpu_map = {
      'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
          'ipAddress': '10.1.2.3',
          'port': '8470',
          'health': 'HEALTHY'
      }
  }

  resolver = TPUClusterResolver(
      project=None,
      zone=None,
      tpu=['test-tpu-1'],
      coordinator_name=None,
      credentials=None,
      service=self.mock_service_client(tpu_map=tpu_map))

  actual_cluster_spec = resolver.cluster_spec()
  expected_proto = """
  job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
  """
  self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
  self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
def testGetDeviceDictAndCoresWithCPUsAndGPUs(self):
  device_names = [
      '/job:tpu_worker/task:0/device:CPU:0',
      '/job:tpu_worker/task:1/device:CPU:0',
      '/job:tpu_worker/task:2/device:CPU:0',
      '/job:tpu_worker/task:3/device:CPU:0',
      '/job:tpu_worker/task:0/device:GPU:1',
      '/job:tpu_worker/task:1/device:GPU:1',
      '/job:tpu_worker/task:2/device:GPU:1',
      '/job:tpu_worker/task:3/device:GPU:1',
  ]
  device_list = [
      session._DeviceAttributes(name, 'XLA', 1024, 0) for name in device_names
  ]

  device_dict, num_cores = TPUClusterResolver._get_device_dict_and_cores(
      device_list)
  self.assertEqual(num_cores, 0)
  self.assertEqual(device_dict, {})
def testOverrideTaskTypeAndIndexAndGetMaster(self):
  tpu_map = {
      'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
          'health': 'HEALTHY',
          'networkEndpoints': [
              {'ipAddress': '10.2.3.4', 'port': 8470},
              {'ipAddress': '10.2.3.5', 'port': 8470},
              {'ipAddress': '10.2.3.6', 'port': 8470},
              {'ipAddress': '10.2.3.7', 'port': 8470},
          ]
      }
  }

  resolver = TPUClusterResolver(
      project='test-project',
      zone='us-central1-c',
      tpu='test-tpu-1',
      coordinator_name=None,
      credentials=None,
      service=self.mock_service_client(tpu_map=tpu_map))

  self.assertEqual(resolver.master(), 'grpc://10.2.3.4:8470')

  resolver.task_type = 'worker'
  resolver.task_id = 3
  self.assertEqual(resolver.master(), 'grpc://10.2.3.7:8470')

  self.assertEqual(
      resolver.master(task_type='worker', task_id=2, rpc_layer='test'),
      'test://10.2.3.6:8470')
def testGetDeviceDictAndCoresWithTPUs(self):
  device_names = [
      '/job:tpu_worker/task:0/device:TPU:0',
      '/job:tpu_worker/task:1/device:TPU:1',
      '/job:tpu_worker/task:2/device:TPU:0',
      '/job:tpu_worker/task:3/device:TPU:1',
      '/job:tpu_worker/task:0/device:TPU:4',
      '/job:tpu_worker/task:1/device:TPU:5',
      '/job:tpu_worker/task:2/device:TPU:4',
      '/job:tpu_worker/task:3/device:TPU:5',
  ]
  device_list = [
      session._DeviceAttributes(name, 'TPU', 1024, 0) for name in device_names
  ]

  device_details = TPUClusterResolver._get_device_dict_and_cores(device_list)
  self.assertEqual(device_details.total_cores, 8)
  self.assertEqual(device_details.device_map, {
      '0': ['0', '4'],
      '1': ['1', '5'],
      '2': ['0', '4'],
      '3': ['1', '5']
  })
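# Standalone sketch, not part of the original tests: the per-task core map
# checked above can be built by parsing the task index and TPU core index out
# of each device name. The helper name `build_device_map` is hypothetical.
import re

def build_device_map(device_names):
  device_map = {}
  total_cores = 0
  for name in device_names:
    match = re.search(r'/task:(\d+)/device:TPU:(\d+)$', name)
    if match:
      device_map.setdefault(match.group(1), []).append(match.group(2))
      total_cores += 1
  return device_map, total_cores

# e.g. build_device_map(['/job:w/task:0/device:TPU:0',
#                        '/job:w/task:0/device:TPU:4']) == ({'0': ['0', '4']}, 2)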
def shutdown_tpu_system(cluster_resolver=None):
  """Shuts down the TPU devices.

  This will clear all caches, even those that are maintained through sequential
  calls to tf.tpu.experimental.initialize_tpu_system, such as the compilation
  cache.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution or if run in a
        tf.function.
  """
  job = None
  if cluster_resolver is None:
    # If no cluster resolver is specified, and running eagerly, execute the
    # init ops in the current device scope.
    if context.executing_eagerly():
      curr_device = device.DeviceSpec.from_string(
          context.context().device_name)
      if curr_device.job is not None:
        job = "{}/replica:0/task:0".format(curr_device.job)

    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name not in _INITIALIZED_TPU_SYSTEMS:
    logging.warning("You are shutting down a TPU system %s that has not been "
                    "initialized.", tpu_name)

  logging.info("Shutting down the TPU system: %s", tpu_name)

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.shutdown_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # shutdown the TPU system. Thus, we can't simply run tpu.shutdown_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    if tpu_name not in _LOCAL_MASTERS:
      # Explicitly place the tpu.shutdown_system in the first worker to
      # avoid the output node match multiple devices error.
      job = "{}/replica:0/task:0".format(cluster_resolver.get_job_name())

    @function.defun
    def _tpu_shutdown_fn():
      tpu.shutdown_system(job=job)

    # The TPU_SYSTEM device must match the device used in tpu.shutdown_system
    # exactly, otherwise you can get errors if there are multiple TPU_SYSTEM
    # devices available.
    with ops.device(tpu._tpu_system_device_name(job)):  # pylint: disable=protected-access
      _tpu_shutdown_fn()

    # Clear out the eager context caches since the memory is invalid now.
    logging.info("Clearing out eager caches")
    context.context()._clear_caches()  # pylint: disable=protected-access
  elif not ops.executing_eagerly_outside_functions():
    master = cluster_resolver.master()
    cluster_spec = cluster_resolver.cluster_spec()

    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    if cluster_spec:
      session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        sess.run(tpu.shutdown_system())
  else:
    raise RuntimeError("shutdown_tpu_system is not supported within "
                       "tf.functions.")

  logging.info("Finished shutting down TPU system.")
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    del _INITIALIZED_TPU_SYSTEMS[tpu_name]
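# Illustrative lifecycle sketch, not part of the original module: initialize
# the TPU system once per process, run work, then shut it down to drop
# compilation caches. The TPU name 'my-tpu' and the helper name
# `_example_reset_tpu` are placeholders.
def _example_reset_tpu(tpu_name='my-tpu'):
  resolver = TPUClusterResolver(tpu=tpu_name)
  initialize_tpu_system(resolver)
  # ... build and run TPU computations here ...
  shutdown_tpu_system(resolver)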
def testLocalhostMaster(self):
  resolver = TPUClusterResolver(tpu='localhost:12345')
  self.assertEqual('localhost:12345', resolver.master())
def testNumAcceleratorsRetryFailure(self, mock_list_devices):
  resolver = TPUClusterResolver(tpu='')
  mock_list_devices.side_effect = errors.DeadlineExceededError(
      None, None, 'timeout')
  with self.assertRaises(RuntimeError):
    resolver.num_accelerators()
def testNoCallComputeMetadata(self):
  tpu_cluster_resolver = TPUClusterResolver(tpu='/bns/foo/bar')
  self.assertEqual(
      compat.as_bytes('/bns/foo/bar'), tpu_cluster_resolver.master())
  self.assertEqual(None, tpu_cluster_resolver.cluster_spec())
def testEnvironmentAndRpcDetectionForGrpcString(self):
  resolver = TPUClusterResolver(tpu='grpc://10.1.2.3:8470')
  self.assertEqual(resolver.environment, '')
  self.assertEqual(resolver.rpc_layer, 'grpc')
  self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution.
  """
  if cluster_resolver is None:
    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    logging.warning("TPU system %s has already been initialized. "
                    "Reinitializing the TPU can cause previously created "
                    "variables on TPU to be lost.", tpu_name)

  logging.info("Initializing the TPU system.")

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    @function.defun
    def _tpu_init_fn():
      return tpu.initialize_system()

    tpu_devices = sorted(
        [x for x in context.list_devices() if "device:TPU:" in x])
    if not tpu_devices:
      raise RuntimeError("Could not find any TPU devices")

    # Replace the remote TPU device with the remote TPU_SYSTEM system device.
    # As in the remote TPU device case, we will try to compile it instead of
    # running through optimization passes and TF Executor, but TPU_SYSTEM
    # should work.
    tpu_system_device = tpu_devices[0].replace("TPU", "TPU_SYSTEM")

    with ops.device(tpu_system_device):
      output = _tpu_init_fn()
    serialized_topology = output.numpy()
  else:
    master = cluster_resolver.master()
    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())

  logging.info("Finished initializing TPU system.")

  tpu_topology = topology.Topology(serialized=serialized_topology)
  _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

  return tpu_topology
def testEnvironmentDiscoveryUrl(self):
  os.environ['TPU_API_DISCOVERY_URL'] = 'https://{api}.internal/{apiVersion}'
  self.assertEqual('https://{api}.internal/{apiVersion}',
                   TPUClusterResolver._environmentDiscoveryUrl())
def testIsNotRunningInGce(self):
  self.assertFalse(TPUClusterResolver._isRunningInGCE())
def testIsRunningInGce(self):
  self.assertTrue(TPUClusterResolver._isRunningInGCE())
def testCheckRunningInGceWithNoTpuName(self):
  with self.assertRaisesRegexp(RuntimeError, '.*Google Cloud.*'):
    TPUClusterResolver(tpu='')
def testNoCallComputeMetadata(self):
  resolver = TPUClusterResolver(tpu='/bns/foo/bar')
  self.assertEqual('/bns/foo/bar', resolver.master())
  self.assertEqual(None, resolver.cluster_spec())
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
  super(TPUExtended, self).__init__(container_strategy)

  if tpu_cluster_resolver is None:
    tpu_cluster_resolver = TPUClusterResolver("")

  if steps_per_run is None:
    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    steps_per_run = 1

  # `self._tpu_function_cache` is a dict of `tf.function`s, thus if a
  # `tf.function` is passed into `strategy.run` in eager mode, the
  # `tf.function` won't get retraced.
  self._tpu_function_cache = weakref.WeakKeyDictionary()

  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = self._tpu_cluster_resolver.get_tpu_system_metadata()
  self._device_assignment = device_assignment

  tpu_devices_flat = [
      d.name for d in self._tpu_metadata.devices if "device:TPU:" in d.name
  ]

  # `self._tpu_devices` is a two-dimensional NumPy array of strings. It is
  # indexed using `[replica_id][logical_device_id]`.
  if device_assignment is None:
    self._tpu_devices = np.array(
        [[d] for d in tpu_devices_flat], dtype=object)
  else:
    job_name = device_spec.DeviceSpecV2.from_string(tpu_devices_flat[0]).job

    tpu_devices = []
    for replica_id in range(device_assignment.num_replicas):
      replica_devices = []

      for logical_core in range(device_assignment.num_cores_per_replica):
        replica_devices.append(
            device_util.canonicalize(
                device_assignment.tpu_device(
                    replica=replica_id,
                    logical_core=logical_core,
                    job=job_name)))

      tpu_devices.append(replica_devices)
    self._tpu_devices = np.array(tpu_devices, dtype=object)

  self._host_device = device_util.get_host_for_device(self._tpu_devices[0][0])

  # Preload the data onto the TPUs. Currently we always preload onto logical
  # device 0 for each replica.
  # TODO(cjfj): Create `InputWorkers` lazily, allowing users to place the
  # input onto a different logical device?
  self._device_input_worker_devices = collections.OrderedDict()
  self._host_input_worker_devices = collections.OrderedDict()
  for tpu_device in self._tpu_devices[:, 0]:
    host_device = device_util.get_host_for_device(tpu_device)
    self._device_input_worker_devices.setdefault(host_device, [])
    self._device_input_worker_devices[host_device].append(tpu_device)
    self._host_input_worker_devices.setdefault(host_device, [])
    self._host_input_worker_devices[host_device].append(host_device)

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run
  self._require_static_shapes = True

  self.experimental_enable_get_next_as_optional = True

  self._logical_device_stack = [0]

  if context.executing_eagerly():
    # In async remote eager, we want to sync the executors before exiting the
    # program.
    def async_wait():
      if context.context()._context_handle is not None:  # pylint: disable=protected-access
        context.async_wait()
    atexit.register(async_wait)

  # Flag to turn on VariablePolicy
  self._use_var_policy = False
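# Standalone sketch, not part of the original class: with a device assignment,
# `self._tpu_devices` above becomes a [num_replicas, num_cores_per_replica]
# array of device strings indexed by [replica_id][logical_device_id]. The
# helper name, replica counts, and device strings below are hypothetical.
import numpy as np

def _example_replica_device_array(num_replicas=2, num_cores_per_replica=2):
  tpu_devices = [[
      '/job:worker/replica:0/task:0/device:TPU:%d' % (
          replica * num_cores_per_replica + core)
      for core in range(num_cores_per_replica)
  ] for replica in range(num_replicas)]
  return np.array(tpu_devices, dtype=object)

# _example_replica_device_array()[1][0] is the first logical core of replica 1.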
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution or if run in a
        tf.function.
  """
  job = None
  if cluster_resolver is None:
    # If no cluster resolver is specified, and running eagerly, execute the
    # init ops in the current device scope.
    if context.executing_eagerly():
      curr_device = device.DeviceSpec.from_string(
          context.context().device_name)
      if curr_device.job is not None:
        job = "{}/replica:0/task:0".format(curr_device.job)

    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    logging.warning(
        "TPU system %s has already been initialized. "
        "Reinitializing the TPU can cause previously created "
        "variables on TPU to be lost.", tpu_name)

  logging.info("Initializing the TPU system: %s", tpu_name)

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    if tpu_name not in _LOCAL_MASTERS:
      # Explicitly place the tpu.initialize_system in the first worker to
      # avoid the output node match multiple devices error.
      job = "{}/replica:0/task:0".format(cluster_resolver.get_job_name())

    @function.defun
    def _tpu_init_fn():
      # In TF1, we usually close chips when compilation fails to clear the data
      # in infeed. In TF2, we don't need to do this because infeed is no longer
      # used, so user can recover from TPU compilation failures more smoothly.
      return tpu.initialize_system(
          job=job, compilation_failure_closes_chips=False)

    # The TPU_SYSTEM device must match the device used in tpu.initialize_system
    # exactly, otherwise you can get errors if there are multiple TPU_SYSTEM
    # devices available.
    with ops.device(tpu._tpu_system_device_name(job)):  # pylint: disable=protected-access
      output = _tpu_init_fn()

    # Clear out the eager context caches since the memory is invalid now.
    logging.info("Clearing out eager caches")
    context.context()._clear_caches()  # pylint: disable=protected-access

    serialized_topology = output.numpy()

    # TODO(b/134094971): Remove this when lazy tensor copy in multi-device
    # function has been implemented.
    context.context().mirroring_policy = context.MIRRORING_ALL
  elif not ops.executing_eagerly_outside_functions():
    master = cluster_resolver.master()
    cluster_spec = cluster_resolver.cluster_spec()

    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    if cluster_spec:
      session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())
  else:
    raise RuntimeError("initialize_tpu_system is not supported within "
                       "tf.functions.")

  logging.info("Finished initializing TPU system.")

  tpu_topology = topology.Topology(serialized=serialized_topology)
  _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

  return tpu_topology
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
  super(TPUExtended, self).__init__(container_strategy)

  if tpu_cluster_resolver is None:
    tpu_cluster_resolver = TPUClusterResolver("")

  if steps_per_run is None:
    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    steps_per_run = 1

  self._tpu_function_cache = weakref.WeakKeyDictionary()

  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
  self._device_assignment = device_assignment

  self._tpu_devices = [
      d.name for d in self._tpu_metadata.devices if "device:TPU:" in d.name
  ]

  # Only create variables for the number of replicas we're running.
  if device_assignment is not None:
    job_name = device_spec.DeviceSpecV2.from_string(self._tpu_devices[0]).job

    self._tpu_devices = []
    for replica_id in range(device_assignment.num_replicas):
      tpu_device = device_assignment.tpu_device(
          replica=replica_id, logical_core=0, job=job_name)
      tpu_device = device_util.canonicalize(tpu_device)
      self._tpu_devices.append(tpu_device)

  self._host_device = device_util.get_host_for_device(self._tpu_devices[0])

  self._device_map = values.ReplicaDeviceMap(self._tpu_devices)

  # Preload the data onto the TPUs.
  input_worker_devices = collections.OrderedDict()
  for tpu_device in self._tpu_devices:
    host_device = device_util.get_host_for_device(tpu_device)
    input_worker_devices.setdefault(host_device, [])
    input_worker_devices[host_device].append(tpu_device)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, tuple(input_worker_devices.items()))

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run
  self._require_static_shapes = True

  # TPUStrategy handles the graph replication in TF-XLA bridge, so we don't
  # need to retrace functions for each device.
  self._retrace_functions_for_each_device = False

  self.experimental_enable_get_next_as_optional = True
  self.experimental_enable_dynamic_batch_size = True
def testEnvironmentAndRpcDetectionForGoogle(self):
  resolver = TPUClusterResolver(tpu='/bns/ab/cd/ef')
  self.assertEqual(resolver.environment, 'google')
  self.assertEqual(resolver.rpc_layer, None)
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution.
  """
  if cluster_resolver is None:
    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    logging.warning("TPU system %s has already been initialized. "
                    "Reinitializing the TPU can cause previously created "
                    "variables on TPU to be lost.", tpu_name)

  logging.info("Initializing the TPU system: %s", tpu_name)

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    job = None
    if tpu_name not in _LOCAL_MASTERS:
      # Explicitly place the tpu.initialize_system in the first worker to
      # avoid the output node match multiple devices error.
      job = "{}/replica:0/task:0".format(cluster_resolver.get_job_name())

    @function.defun
    def _tpu_init_fn():
      return tpu.initialize_system(job=job)

    # The TPU_SYSTEM device must match the device used in tpu.initialize_system
    # exactly, otherwise you can get errors if there are multiple TPU_SYSTEM
    # devices available.
    with ops.device(tpu._tpu_system_device_name(job)):  # pylint: disable=protected-access
      output = _tpu_init_fn()

    # Clear out the eager context caches since the memory is invalid now.
    logging.info("Clearing out eager caches")
    context.context()._clear_caches()  # pylint: disable=protected-access

    serialized_topology = output.numpy()
  else:
    master = cluster_resolver.master()
    cluster_spec = cluster_resolver.cluster_spec()

    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    if cluster_spec:
      session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())

  logging.info("Finished initializing TPU system.")

  tpu_topology = topology.Topology(serialized=serialized_topology)
  _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

  return tpu_topology
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
  super(TPUExtended, self).__init__(container_strategy)

  if tpu_cluster_resolver is None:
    tpu_cluster_resolver = TPUClusterResolver("")

  if steps_per_run is None:
    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    steps_per_run = 1

  self._tpu_function_cache = weakref.WeakKeyDictionary()

  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
  self._device_assignment = device_assignment

  tpu_devices_flat = [
      d.name for d in self._tpu_metadata.devices if "device:TPU:" in d.name
  ]

  # `self._tpu_devices` is a two-dimensional NumPy array of strings. It is
  # indexed using `[replica_id][logical_device_id]`.
  if device_assignment is None:
    self._tpu_devices = np.array(
        [[d] for d in tpu_devices_flat], dtype=object)
  else:
    job_name = device_spec.DeviceSpecV2.from_string(tpu_devices_flat[0]).job

    tpu_devices = []
    for replica_id in range(device_assignment.num_replicas):
      replica_devices = []

      for logical_core in range(device_assignment.num_cores_per_replica):
        replica_devices.append(
            device_util.canonicalize(
                device_assignment.tpu_device(
                    replica=replica_id,
                    logical_core=logical_core,
                    job=job_name)))

      tpu_devices.append(replica_devices)
    self._tpu_devices = np.array(tpu_devices, dtype=object)

  self._host_device = device_util.get_host_for_device(self._tpu_devices[0][0])

  # Preload the data onto the TPUs. Currently we always preload onto logical
  # device 0 for each replica.
  # TODO(cjfj): Create `InputWorkers` lazily, allowing users to place the
  # input onto a different logical device?
  input_worker_devices = collections.OrderedDict()
  for tpu_device in self._tpu_devices[:, 0]:
    host_device = device_util.get_host_for_device(tpu_device)
    input_worker_devices.setdefault(host_device, [])
    input_worker_devices[host_device].append(tpu_device)
  self._input_worker_devices = tuple(input_worker_devices.items())
  self._input_workers_obj = None

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run
  self._require_static_shapes = True

  # TPUStrategy handles the graph replication in TF-XLA bridge, so we don't
  # need to retrace functions for each device.
  self._retrace_functions_for_each_device = False

  self.experimental_enable_get_next_as_optional = True
  self.experimental_enable_dynamic_batch_size = True
  self._prefetch_on_host = False

  self._logical_device_stack = [0]
def __init__(self,
             container_strategy,
             tpu_cluster_resolver=None,
             steps_per_run=None,
             device_assignment=None):
  super(TPUExtended, self).__init__(container_strategy)

  if tpu_cluster_resolver is None:
    tpu_cluster_resolver = TPUClusterResolver("")

  if steps_per_run is None:
    # TODO(frankchn): Warn when we are being used by DS/Keras and this is
    # not specified.
    steps_per_run = 1

  self._tpu_cluster_resolver = tpu_cluster_resolver
  self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
  self._device_assignment = device_assignment

  # Device assignment is currently only supported for 1 core case.
  if self._device_assignment:
    assert isinstance(self._device_assignment,
                      device_assignment_lib.DeviceAssignment)
    if self._device_assignment.num_replicas != 1:
      raise ValueError("Device assignment is only supported for a single "
                       "core single replica case currently.")
    if self._device_assignment.num_cores_per_replica != 1:
      raise ValueError("Device assignment is only supported for a single "
                       "core single replica case currently.")
    if not all(self._device_assignment.core_assignment[0][0] == [0, 0, 0]):
      raise ValueError("Device assignment is only supported for a single "
                       "core single replica case currently.")

  # TODO(jhseu): Switch to DeviceAssignment to support pods and model
  # parallelism.
  self._tpu_devices = [
      d.name for d in self._tpu_metadata.devices if "device:TPU:" in d.name
  ]

  self._host_device = device_util.get_host_for_device(self._tpu_devices[0])

  # Only create variables for the number of replicas we're running.
  self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
  self._device_map = values.ReplicaDeviceMap(self._tpu_devices)

  # Preload the data onto the TPUs.
  input_worker_devices = collections.OrderedDict()
  for tpu_device in self._tpu_devices:
    host_device = device_util.get_host_for_device(tpu_device)
    input_worker_devices.setdefault(host_device, [])
    input_worker_devices[host_device].append(tpu_device)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, tuple(input_worker_devices.items()))

  # TODO(sourabhbajaj): Remove this once performance of running one step
  # at a time is comparable to multiple steps.
  self.steps_per_run = steps_per_run
  self._require_static_shapes = True
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution.
  """
  if cluster_resolver is None:
    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    logging.warning("TPU system %s has already been initialized. "
                    "Reinitializing the TPU can cause previously created "
                    "variables on TPU to be lost.", tpu_name)

  logging.info("Initializing the TPU system.")

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    # The easiest way to trigger a rewrite is to run the function with
    # TPUPartitionedCallOp.
    @function.defun
    def _tpu_init_fn():
      return tpu.initialize_system()

    # We can't call _tpu_init_fn normally (because it contains just a dummy op,
    # see above) but need to define it to get it added to eager context
    # and get its assigned name.
    # pylint: disable=protected-access
    graph_func = _tpu_init_fn._get_concrete_function_internal()
    func_name = compat.as_str(graph_func._inference_function.name)
    # pylint: enable=protected-access

    tpu_devices = sorted(
        [x for x in context.list_devices() if "device:TPU:" in x])
    if not tpu_devices:
      raise RuntimeError("Could not find any TPU devices")

    with ops.device(device_util.get_host_for_device(tpu_devices[0])):
      output = tpu_functional_ops.TPUPartitionedCall(
          args=[], device_ordinal=0, Tout=[dtypes.string], f=func_name)
    serialized_topology = output[0].numpy()
  else:
    master = cluster_resolver.master()
    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())

  logging.info("Finished initializing TPU system.")

  tpu_topology = topology.Topology(serialized=serialized_topology)
  _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

  return tpu_topology