def testNumAcceleratorsFilterTasks(self, mock_list_devices,
                                   mock_eager_list_devices):
  """Checks that num_accelerators only counts devices of the given task."""
  logical_devices = [
      LogicalDevice("/job:worker1/task:0/device:TPU:0", "TPU"),
      LogicalDevice("/job:worker1/task:0/device:TPU:1", "TPU"),
      LogicalDevice("/job:worker1/task:0/device:GPU:0", "GPU"),
      LogicalDevice("/job:worker1/task:0/device:GPU:1", "GPU"),
      LogicalDevice("/job:worker2/task:1/device:TPU:2", "TPU"),
      LogicalDevice("/job:worker2/task:2/device:TPU:3", "TPU"),
      LogicalDevice("/job:worker2/task:3/device:GPU:2", "GPU"),
      LogicalDevice("/job:worker2/task:4/device:GPU:3", "GPU"),
  ]
  attributes = [
      session._DeviceAttributes(dev.name, dev.device_type, 1024, 0)
      for dev in logical_devices
  ]
  mock_eager_list_devices.return_value = logical_devices
  mock_list_devices.return_value = attributes

  cluster = MockBaseClusterResolver()
  # worker1/task 0 hosts two TPUs and two GPUs.
  self.assertEqual(
      cluster.num_accelerators(task_type="worker1", task_id=0),
      {"TPU": 2, "GPU": 2})
  # Each worker2 task hosts exactly one accelerator.
  self.assertEqual(
      cluster.num_accelerators(task_type="worker2", task_id=3), {"GPU": 1})
  self.assertEqual(
      cluster.num_accelerators(task_type="worker2", task_id=4), {"GPU": 1})
def testGetDeviceDictAndCoresWithTPUs(self):
  """Verifies TPU core ids are grouped per task by the helper."""
  names = [
      '/job:tpu_worker/task:0/device:TPU:0',
      '/job:tpu_worker/task:1/device:TPU:1',
      '/job:tpu_worker/task:2/device:TPU:0',
      '/job:tpu_worker/task:3/device:TPU:1',
      '/job:tpu_worker/task:0/device:TPU:4',
      '/job:tpu_worker/task:1/device:TPU:5',
      '/job:tpu_worker/task:2/device:TPU:4',
      '/job:tpu_worker/task:3/device:TPU:5',
  ]
  attrs = [session._DeviceAttributes(n, 'TPU', 1024, 0) for n in names]
  details = tpu_cluster_resolver._get_device_dict_and_cores(attrs)
  # Eight cores total, two per task.
  self.assertEqual(details.total_cores, 8)
  expected_map = {
      '0': ['0', '4'],
      '1': ['1', '5'],
      '2': ['0', '4'],
      '3': ['1', '5'],
  }
  self.assertEqual(details.device_map, expected_map)
def testNumAcceleratorsFilterTasks(self, mock_list_devices,
                                   mock_eager_list_devices):
  """num_accelerators should honor the task_type/task_id filter."""
  names = [
      "/job:worker1/task:0/device:TPU:0",
      "/job:worker1/task:0/device:TPU:1",
      "/job:worker1/task:0/device:GPU:0",
      "/job:worker1/task:0/device:GPU:1",
      "/job:worker2/task:1/device:TPU:2",
      "/job:worker2/task:2/device:TPU:3",
      "/job:worker2/task:3/device:GPU:2",
      "/job:worker2/task:4/device:GPU:3",
  ]
  # The device type occupies characters 27-29 of these fixed-format names.
  attrs = [
      session._DeviceAttributes(n, n[27:30], 1024, 0) for n in names
  ]
  mock_eager_list_devices.return_value = names
  mock_list_devices.return_value = attrs

  cluster = MockBaseClusterResolver()
  self.assertEqual(
      cluster.num_accelerators(task_type="worker1", task_id=0),
      {"TPU": 2, "GPU": 2})
  self.assertEqual(
      cluster.num_accelerators(task_type="worker2", task_id=3), {"GPU": 1})
  self.assertEqual(
      cluster.num_accelerators(task_type="worker2", task_id=4), {"GPU": 1})
def testNumAcceleratorsSuccess(self, mock_list_devices,
                               mock_eager_list_devices):
  """A 4-host, 8-core TPU fleet reports 2 TPU cores per worker."""
  logical = [
      LogicalDevice('/job:tpu_worker/task:0/device:TPU:0', 'TPU'),
      LogicalDevice('/job:tpu_worker/task:1/device:TPU:1', 'TPU'),
      LogicalDevice('/job:tpu_worker/task:2/device:TPU:0', 'TPU'),
      LogicalDevice('/job:tpu_worker/task:3/device:TPU:1', 'TPU'),
      LogicalDevice('/job:tpu_worker/task:0/device:TPU:4', 'TPU'),
      LogicalDevice('/job:tpu_worker/task:1/device:TPU:5', 'TPU'),
      LogicalDevice('/job:tpu_worker/task:2/device:TPU:4', 'TPU'),
      LogicalDevice('/job:tpu_worker/task:3/device:TPU:5', 'TPU'),
  ]
  attrs = [
      session._DeviceAttributes(dev.name, dev.device_type, 1024, 0)
      for dev in logical
  ]
  mock_eager_list_devices.return_value = logical
  mock_list_devices.return_value = attrs

  endpoints = [
      {'ipAddress': '10.2.3.4', 'port': 8470},
      {'ipAddress': '10.2.3.5', 'port': 8470},
      {'ipAddress': '10.2.3.6', 'port': 8470},
      {'ipAddress': '10.2.3.7', 'port': 8470},
  ]
  tpu_map = {
      'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
          'state': 'READY',
          'health': 'HEALTHY',
          'networkEndpoints': endpoints,
      }
  }
  cluster_resolver = resolver.TPUClusterResolver(
      project='test-project',
      zone='us-central1-c',
      tpu='test-tpu-1',
      service=self.mock_service_client(tpu_map=tpu_map))
  # 8 cores spread over 4 hosts -> 2 cores on each worker.
  self.assertEqual(cluster_resolver.num_accelerators(), {'TPU': 2})
def testNumAcceleratorsFilterSuccess(self, mock_list_devices):
  """Checks num_accelerators against a mocked 4-TPU device list."""
  names = [
      "/job:worker/task:0/device:TPU:0",
      "/job:worker/task:0/device:TPU:1",
      "/job:worker/task:0/device:TPU:2",
      "/job:worker/task:0/device:TPU:3",
  ]
  mock_list_devices.return_value = [
      session._DeviceAttributes(n, "TPU", 1024, 0) for n in names
  ]
  cluster = MockBaseClusterResolver()
  # NOTE(review): the base resolver is expected to report 0 here even though
  # four TPUs are listed — presumably this code path does not see the mocked
  # session devices; confirm against the resolver contract.
  self.assertEqual(cluster.num_accelerators(), 0)
def testNumAcceleratorsFilterSuccess(self, mock_list_devices):
  """Checks num_accelerators against a single-task mocked TPU list."""
  attrs = []
  for core in range(4):
    name = "/job:worker/task:0/device:TPU:%d" % core
    attrs.append(session._DeviceAttributes(name, "TPU", 1024, 0))
  mock_list_devices.return_value = attrs
  cluster = MockBaseClusterResolver()
  # NOTE(review): expected value is 0 despite four TPU entries — verify this
  # matches the base resolver's intended default behavior.
  self.assertEqual(cluster.num_accelerators(), 0)
def get_accelerator_devices(master, config_proto):
  """Returns accelerator devices given a master and a configuration."""
  if context.executing_eagerly():
    accelerators = []
    for device in config.list_logical_devices():
      # Skip host (CPU) devices; only accelerators are of interest.
      if device.device_type in ('CPU', 'XLA_CPU'):
        continue
      accelerators.append(
          session._DeviceAttributes(device.name, device.device_type, 0, 0))  # pylint: disable=protected-access
    return accelerators
  # Graph mode: ask the master for its device list over a session.
  with ops.Graph().as_default():
    with session.Session(master, config=config_proto) as s:
      return s.list_devices()
def testNumAcceleratorsSuccess(self, mock_list_devices,
                               mock_eager_list_devices):
  """num_accelerators counts all four GPUs on the single worker."""
  logical = [
      LogicalDevice("/job:worker/task:0/device:GPU:%d" % i, "GPU")
      for i in range(4)
  ]
  attrs = [
      session._DeviceAttributes(dev.name, dev.device_type, 1024, 0)
      for dev in logical
  ]
  mock_eager_list_devices.return_value = logical
  mock_list_devices.return_value = attrs
  self.assertEqual(MockBaseClusterResolver().num_accelerators(), {"GPU": 4})
def list_devices(session):
  """Enumerates the devices visible to `session` as _DeviceAttributes.

  Supports both the new and the deprecated C session APIs; any non-OK
  status raised by the C layer surfaces as an exception on context exit.
  """
  with errors.raise_exception_on_not_ok_status() as status:
    if session._created_with_new_api:
      raw = tf_session.TF_SessionListDevices(session._session, status)
    else:
      raw = tf_session.TF_DeprecatedSessionListDevices(
          session._session, status)
    result = []
    for idx in range(tf_session.TF_DeviceListCount(raw)):
      dev_name = tf_session.TF_DeviceListName(raw, idx, status)
      dev_type = tf_session.TF_DeviceListType(raw, idx, status)
      # Memory reporting is disabled: TF_DeviceListMemoryBytes is not
      # queried and 0 is recorded instead.
      result.append(_DeviceAttributes(dev_name, dev_type, 0))
    tf_session.TF_DeleteDeviceList(raw)
  return result
def testNumAcceleratorsSuccess(self, mock_list_devices):
  """Reports two TPU cores per worker for an 8-core, 4-task fleet."""
  names = [
      '/job:tpu_worker/task:0/device:TPU:0',
      '/job:tpu_worker/task:1/device:TPU:1',
      '/job:tpu_worker/task:2/device:TPU:0',
      '/job:tpu_worker/task:3/device:TPU:1',
      '/job:tpu_worker/task:0/device:TPU:4',
      '/job:tpu_worker/task:1/device:TPU:5',
      '/job:tpu_worker/task:2/device:TPU:4',
      '/job:tpu_worker/task:3/device:TPU:5',
  ]
  mock_list_devices.return_value = [
      session._DeviceAttributes(n, 'TPU', 1024, 0) for n in names
  ]
  tpu_resolver = cluster_resolver.TPUClusterResolver(tpu='')
  self.assertEqual(tpu_resolver.num_accelerators(), 2)
def get_accelerator_devices(master, config_proto):
  """Returns accelerator devices given a master and a configuration."""
  if context.executing_eagerly():
    # list_devices returns list(string)
    accelerators = []
    for name in context.list_devices():
      match = DEVICE_TYPE_REGEX.match(name)
      # Fall back to 'GPU' when the name does not encode a device type.
      device_type = match.group(1) if match else 'GPU'
      if device_type in ('CPU', 'XLA_CPU'):  # Filter CPUs
        continue
      accelerators.append(
          session._DeviceAttributes(name, device_type, 0, 0))  # pylint: disable=protected-access
    return accelerators
  # Graph mode: query the master over a session.
  with ops.Graph().as_default():
    with session.Session(master, config=config_proto) as s:
      return s.list_devices()
def testGetDeviceDictAndCoresWithCPUsAndGPUs(self):
  """Neither CPUs nor GPUs should register as TPU cores."""
  host_devices = [
      '/job:tpu_worker/task:0/device:CPU:0',
      '/job:tpu_worker/task:1/device:CPU:0',
      '/job:tpu_worker/task:2/device:CPU:0',
      '/job:tpu_worker/task:3/device:CPU:0',
      '/job:tpu_worker/task:0/device:GPU:1',
      '/job:tpu_worker/task:1/device:GPU:1',
      '/job:tpu_worker/task:2/device:GPU:1',
      '/job:tpu_worker/task:3/device:GPU:1',
  ]
  attrs = [
      session._DeviceAttributes(name, 'XLA', 1024, 0)
      for name in host_devices
  ]
  device_dict, num_cores = (
      resolver.TPUClusterResolver._get_device_dict_and_cores(attrs))
  self.assertEqual(num_cores, 0)
  self.assertEqual(device_dict, {})
def testNumAcceleratorsSuccess(self, mock_list_devices):
  """num_accelerators yields the per-task core count (8 cores / 4 tasks)."""
  cores_by_task = [(0, 0), (1, 1), (2, 0), (3, 1), (0, 4), (1, 5), (2, 4),
                   (3, 5)]
  device_list = [
      session._DeviceAttributes(
          '/job:tpu_worker/task:%d/device:TPU:%d' % (task, core), 'TPU',
          1024, 0) for task, core in cores_by_task
  ]
  mock_list_devices.return_value = device_list
  tpu_resolver = cluster_resolver.TPUClusterResolver(tpu='')
  self.assertEqual(tpu_resolver.num_accelerators(), 2)
def testNumAcceleratorsFilterTasksByEnvVar(self, mock_list_devices,
                                           mock_eager_list_devices):
  """TF_CONFIG supplies the default task; explicit arguments still win."""
  os.environ['TF_CONFIG'] = """ { "cluster": { "worker1": ["w10:2222"], "worker2": ["w21:2222", "w22:2222", "w23:2222", "w24:2222"] }, "rpc_layer": "grpc", "task": { "type": "worker1", "index": "0" } } """

  logical = [
      LogicalDevice('/job:worker1/task:0/device:TPU:0', 'TPU'),
      LogicalDevice('/job:worker1/task:0/device:TPU:1', 'TPU'),
      LogicalDevice('/job:worker1/task:0/device:GPU:0', 'GPU'),
      LogicalDevice('/job:worker1/task:0/device:GPU:1', 'GPU'),
      LogicalDevice('/job:worker2/task:1/device:TPU:2', 'TPU'),
      LogicalDevice('/job:worker2/task:2/device:TPU:3', 'TPU'),
      LogicalDevice('/job:worker2/task:3/device:GPU:2', 'GPU'),
      LogicalDevice('/job:worker2/task:4/device:GPU:3', 'GPU'),
  ]
  attrs = [
      session._DeviceAttributes(dev.name, dev.device_type, 1024, 0)
      for dev in logical
  ]
  mock_eager_list_devices.return_value = logical
  mock_list_devices.return_value = attrs

  cluster = TFConfigClusterResolver()
  # By default we read from TF_CONFIG
  self.assertEqual(cluster.num_accelerators(), {'TPU': 2, 'GPU': 2})
  # Override still works when we want it to
  self.assertEqual(
      cluster.num_accelerators(task_type='worker2', task_id=3), {'GPU': 1})
def testGetDeviceDictAndCoresWithCPUsAndGPUs(self):
  """CPU and GPU devices yield an empty device map and zero cores."""
  names = []
  for suffix in ('CPU:0', 'GPU:1'):
    for task in range(4):
      names.append('/job:tpu_worker/task:%d/device:%s' % (task, suffix))
  attrs = [session._DeviceAttributes(n, 'XLA', 1024, 0) for n in names]
  device_dict, num_cores = tpu_cluster_resolver._get_device_dict_and_cores(
      attrs)
  self.assertEqual(num_cores, 0)
  self.assertEqual(device_dict, {})
def testNumAcceleratorsMultiDeviceSuccess(self, mock_list_devices,
                                          mock_eager_list_devices):
  """Counts accelerators per device type when TPUs and GPUs coexist."""
  names = [
      "/job:worker/task:0/device:TPU:0",
      "/job:worker/task:0/device:TPU:1",
      "/job:worker/task:0/device:TPU:2",
      "/job:worker/task:0/device:TPU:3",
      "/job:worker/task:0/device:GPU:0",
      "/job:worker/task:0/device:GPU:1",
      "/job:worker/task:0/device:GPU:2",
      "/job:worker/task:0/device:GPU:3",
  ]
  # The device type occupies characters 26-28 of these fixed-format names.
  attrs = [
      session._DeviceAttributes(n, n[26:29], 1024, 0) for n in names
  ]
  mock_eager_list_devices.return_value = names
  mock_list_devices.return_value = attrs
  cluster = MockBaseClusterResolver()
  self.assertEqual(cluster.num_accelerators(), {"TPU": 4, "GPU": 4})
def testNumAcceleratorsMultiDeviceSuccess(self, mock_list_devices,
                                          mock_eager_list_devices):
  """Mixed TPU/GPU device lists are tallied per accelerator type."""
  names = []
  for dev_type in ("TPU", "GPU"):
    for idx in range(4):
      names.append("/job:worker/task:0/device:%s:%d" % (dev_type, idx))
  # The device type sits at a fixed offset (26-28) in these names.
  device_list = [
      session._DeviceAttributes(name, name[26:29], 1024, 0) for name in names
  ]
  mock_eager_list_devices.return_value = names
  mock_list_devices.return_value = device_list
  self.assertEqual(MockBaseClusterResolver().num_accelerators(),
                   {"TPU": 4, "GPU": 4})
def testGetDeviceDictAndCoresWithTPUs(self):
  """Groups TPU core ids by host task id and totals the core count."""
  attrs = []
  for task, core in ((0, 0), (1, 1), (2, 0), (3, 1), (0, 4), (1, 5), (2, 4),
                     (3, 5)):
    attrs.append(
        session._DeviceAttributes(
            '/job:tpu_worker/task:%d/device:TPU:%d' % (task, core), 'TPU',
            1024, 0))
  device_details = tpu_cluster_resolver._get_device_dict_and_cores(attrs)
  self.assertEqual(device_details.total_cores, 8)
  self.assertEqual(
      device_details.device_map, {
          '0': ['0', '4'],
          '1': ['1', '5'],
          '2': ['0', '4'],
          '3': ['1', '5'],
      })
def _query_tpu_system_metadata(master_address, cluster_def=None,
                               query_topology=False):
  """Automatically detects the TPU system metadata in the system.

  Lists the devices reachable at `master_address` (via the eager context or
  a graph-mode session, with retries), tallies TPU cores per host, and
  packages the result as a `_TPUSystemMetadata`.

  Args:
    master_address: Address of the TensorFlow master to query.
    cluster_def: Optional cluster definition forwarded to the session config
      and topology query.
    query_topology: If True, also query the TPU topology; raises if no TPU
      cores are found.

  Returns:
    A `_TPUSystemMetadata` with core/host counts, optional topology, and the
    sorted device tuple.

  Raises:
    ValueError: If the master cannot be reached after the retry budget.
    RuntimeError: If hosts disagree on core count, or if `query_topology` is
      set but no TPU cores are visible.
  """
  tpu_core_count = 0
  devices = []
  device_dict = collections.defaultdict(list)

  if context.executing_eagerly():
    device_names = context.list_devices()
    devices = []

    # We want the output type to match in both eager and session mode
    for name in device_names:
      device_match = _DEVICE_TYPE_REGEX.match(name)
      # Default to 'CPU' when the name does not encode a device type.
      device_type = 'CPU'
      if device_match:
        device_type = device_match.group(1)
      devices.append(session_lib._DeviceAttributes(name, device_type, 0, 0))  # pylint: disable=protected-access
  else:
    # TODO(b/120564445): Replace with standard library for retries.
    retry_count = 1
    while True:
      logging.info(
          'Querying Tensorflow master (%s) for TPU system metadata.',
          master_address)
      try:
        with ops.Graph().as_default():
          with session_lib.Session(
              master_address,
              config=get_session_config_with_timeout(
                  _PINGING_MASTER_TIMEOUT_IN_MS, cluster_def)) as sess:
            devices = sess.list_devices()
            break
      except errors.DeadlineExceededError:
        msg = (
            'Failed to connect to the Tensorflow master. The TPU worker may '
            'not be ready (still scheduling) or the Tensorflow master '
            'address is incorrect: got (%s).' % (master_address))

        # TODO(xiejw): For local or grpc master we might not need retry logic
        # here.
        if retry_count <= _RETRY_TIMES:
          logging.warning('%s', msg)
          logging.warning('Retrying (%d/%d).', retry_count, _RETRY_TIMES)
          retry_count += 1
        else:
          raise ValueError(msg)

  # Bucket TPU core ids by host id so core counts can be cross-checked.
  for device in devices:
    match = _TPU_DEVICE_REG.match(device.name)
    if match:
      host_id = match.group(1)
      core_id = match.group(2)
      device_dict[host_id].append(core_id)
      tpu_core_count += 1

  num_of_cores_per_host = 0
  if tpu_core_count:
    # Every host must expose the same number of cores.
    num_cores_per_host_set = set(
        [len(core_ids) for core_ids in device_dict.values()])
    if len(num_cores_per_host_set) != 1:
      raise RuntimeError(
          'TPU cores on each host is not same. This should not happen!. '
          'devices: {}'.format(devices))
    num_of_cores_per_host = num_cores_per_host_set.pop()

  topology = None
  if query_topology:
    if not tpu_core_count:
      raise RuntimeError(
          'Cannot find any TPU cores in the system (master address {}). '
          'This usually means the master address is incorrect or the '
          'TPU worker has some problems. Available devices: {}'.format(
              master_address, devices))
    topology = _obtain_topology(master_address, cluster_def)

  # We sort the metadata devices so that downstream users get a sorted list
  # for creating mirrored variables correctly.
  def _sort_key(device):
    spec = tf_device.DeviceSpec.from_string(device.name)
    return (spec.job, spec.replica, spec.task, spec.device_type,
            spec.device_index)

  devices = tuple(sorted(devices, key=_sort_key))

  metadata = _TPUSystemMetadata(
      num_cores=tpu_core_count,
      num_hosts=len(device_dict),
      num_of_cores_per_host=num_of_cores_per_host,
      topology=topology,
      devices=devices)

  if tpu_core_count:
    logging.info('Found TPU system:')
    logging.info('*** Num TPU Cores: %d', metadata.num_cores)
    logging.info('*** Num TPU Workers: %d', metadata.num_hosts)
    logging.info('*** Num TPU Cores Per Worker: %d',
                 metadata.num_of_cores_per_host)
    for device in metadata.devices:
      logging.info('*** Available Device: %s', device)
  else:
    logging.info('Failed to find TPU: %s', metadata)
  return metadata