def testRemote(self):
  gpus = config.list_logical_devices('GPU')
  self.assertNotEqual(len(gpus), 0)

  context.ensure_initialized()

  gpus = config.list_logical_devices('GPU')
  self.assertNotEqual(len(gpus), 0)
  for gpu in gpus:
    self.assertIsNotNone(gpu.name)

  context.ensure_initialized()

  job_name = 'test'
  cluster_def = cluster_pb2.ClusterDef()
  job_def = cluster_def.job.add()
  job_def.name = job_name
  job_def.tasks[0] = 'localhost:0'

  server_def = tensorflow_server_pb2.ServerDef(
      cluster=cluster_def, job_name=job_name, task_index=0, protocol='grpc')

  context.set_server_def(server_def)

  gpus = config.list_logical_devices('GPU')
  for gpu in gpus:
    self.assertIsNotNone(gpu.name)
def testCpuMultiple(self):
  cpus = config.list_physical_devices('CPU')
  self.assertEqual(len(cpus), 1)

  config.set_virtual_device_configuration(cpus[0], [
      context.VirtualDeviceConfiguration(),
      context.VirtualDeviceConfiguration()
  ])

  context.ensure_initialized()

  cpus = config.list_logical_devices('CPU')
  self.assertEqual(len(cpus), 2)

  with ops.device('/device:CPU:0'):
    a = constant_op.constant(1.0)
    self.evaluate(a)

  with ops.device('/device:CPU:1'):
    b = constant_op.constant(1.0)
    self.evaluate(b)

  with self.assertRaisesRegexp(RuntimeError, 'unknown device'):
    with ops.device('/device:CPU:2'):
      c = constant_op.constant(1.0)
      self.evaluate(c)

  # Ensure we can place ops on each of the device names
  for cpu in cpus:
    with ops.device(cpu.name):
      d = constant_op.constant(1.0)
      self.evaluate(d)
def testCpuMultiple(self):
  cpus = config.list_physical_devices('CPU')
  self.assertEqual(len(cpus), 1)

  config.set_logical_device_configuration(cpus[0], [
      context.LogicalDeviceConfiguration(),
      context.LogicalDeviceConfiguration()
  ])

  context.ensure_initialized()

  vcpus = config.list_logical_devices('CPU')
  self.assertEqual(len(vcpus), 2)

  with ops.device('/device:CPU:0'):
    a = constant_op.constant(1.0)
    self.evaluate(a)

  with ops.device('/device:CPU:1'):
    b = constant_op.constant(1.0)
    self.evaluate(b)

  with ops.device('/device:CPU:2'):
    c = constant_op.constant(1.0)
    self.evaluate(c)
  self.assertIn('CPU:0', c.device)

  # Ensure we can place ops on each of the device names
  for vcpu in vcpus:
    with ops.device(vcpu.name):
      d = constant_op.constant(1.0)
      self.evaluate(d)

  # Modifying the CPU configuration is not supported
  with self.assertRaisesRegex(RuntimeError, 'cannot be modified'):
    config.set_logical_device_configuration(cpus[0], [
        context.LogicalDeviceConfiguration(),
        context.LogicalDeviceConfiguration(),
        context.LogicalDeviceConfiguration()
    ])

  # Setting the same CPU configuration is fine
  config.set_logical_device_configuration(cpus[0], [
      context.LogicalDeviceConfiguration(),
      context.LogicalDeviceConfiguration()
  ])
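# Hedged usage sketch (not part of the tests above): the same CPU splitting via
# the public tf.config API in a standalone program. The count of two logical
# CPUs is an illustrative choice; the configuration must be set before the
# runtime initializes its devices, since it cannot be changed afterwards.
import tensorflow as tf

physical_cpus = tf.config.list_physical_devices('CPU')
tf.config.set_logical_device_configuration(
    physical_cpus[0],
    [tf.config.LogicalDeviceConfiguration(),
     tf.config.LogicalDeviceConfiguration()])

logical_cpus = tf.config.list_logical_devices('CPU')
assert len(logical_cpus) == 2

# Ops can now be pinned to either logical CPU by name.
with tf.device(logical_cpus[1].name):
  x = tf.constant(1.0)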
def testVirtualGpu(self):
  config.set_soft_device_placement(False)
  gpus = config.list_physical_devices('GPU')
  self.assertNotEqual(len(gpus), 0)

  self.assertIsNone(config.get_logical_device_configuration(gpus[-1]))
  config.set_logical_device_configuration(gpus[-1], [
      context.LogicalDeviceConfiguration(memory_limit=10),
      context.LogicalDeviceConfiguration(memory_limit=10)
  ])
  self.assertEqual(len(config.get_logical_device_configuration(gpus[-1])), 2)

  logical_gpus = config.list_logical_devices('GPU')
  self.assertEqual(len(logical_gpus), len(gpus) + 1)
  for i in range(0, len(logical_gpus)):
    with ops.device('/device:GPU:' + str(i)):
      a = array_ops.identity(1.0)
      self.evaluate(a)

  with self.assertRaisesRegex(errors.InvalidArgumentError,
                              'Could not satisfy'):
    with ops.device('/device:GPU:' + str(len(logical_gpus))):
      a = array_ops.identity(1.0)
      self.evaluate(a)

  # Modifying the GPU configuration is not supported
  with self.assertRaisesRegex(RuntimeError, 'cannot be modified'):
    config.set_logical_device_configuration(gpus[-1], [
        context.LogicalDeviceConfiguration(memory_limit=20),
        context.LogicalDeviceConfiguration(memory_limit=20)
    ])

  with self.assertRaisesRegex(RuntimeError, 'cannot be modified'):
    config.set_logical_device_configuration(gpus[-1], [
        context.LogicalDeviceConfiguration(memory_limit=10),
        context.LogicalDeviceConfiguration(memory_limit=10),
        context.LogicalDeviceConfiguration(memory_limit=10)
    ])

  # Setting the same GPU configuration is fine
  config.set_logical_device_configuration(gpus[-1], [
      context.LogicalDeviceConfiguration(memory_limit=10),
      context.LogicalDeviceConfiguration(memory_limit=10)
  ])
def testVirtualGpu(self):
  gpus = config.list_physical_devices('GPU')
  self.assertNotEqual(len(gpus), 0)

  self.assertIsNone(config.get_logical_device_configuration(gpus[-1]))
  config.set_logical_device_configuration(gpus[-1], [
      context.LogicalDeviceConfiguration(memory_limit=10),
      context.LogicalDeviceConfiguration(memory_limit=10)
  ])
  self.assertEqual(
      len(config.get_logical_device_configuration(gpus[-1])), 2)

  logical_gpus = config.list_logical_devices('GPU')
  self.assertEqual(len(logical_gpus), len(gpus) + 1)
  for i in range(0, len(logical_gpus)):
    with ops.device('/device:GPU:' + str(i)):
      a = constant_op.constant(1.0)
      self.evaluate(a)

  with self.assertRaisesRegexp(RuntimeError, 'unknown device'):
    with ops.device('/device:GPU:' + str(len(logical_gpus))):
      a = constant_op.constant(1.0)
      self.evaluate(a)

  # Modifying the GPU configuration is not supported
  with self.assertRaisesRegexp(RuntimeError, 'cannot be modified'):
    config.set_logical_device_configuration(gpus[-1], [
        context.LogicalDeviceConfiguration(memory_limit=20),
        context.LogicalDeviceConfiguration(memory_limit=20)
    ])

  with self.assertRaisesRegexp(RuntimeError, 'cannot be modified'):
    config.set_logical_device_configuration(gpus[-1], [
        context.LogicalDeviceConfiguration(memory_limit=10),
        context.LogicalDeviceConfiguration(memory_limit=10),
        context.LogicalDeviceConfiguration(memory_limit=10)
    ])

  # Setting the same GPU configuration is fine
  config.set_logical_device_configuration(gpus[-1], [
      context.LogicalDeviceConfiguration(memory_limit=10),
      context.LogicalDeviceConfiguration(memory_limit=10)
  ])
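# Hedged usage sketch mirroring the tests above with the public tf.config API:
# split one physical GPU into two logical GPUs with fixed memory limits. The
# 1024 MB limit is an illustrative value; as the tests assert, the
# configuration cannot be modified once the runtime has initialized devices.
import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  tf.config.set_logical_device_configuration(
      gpus[0],
      [tf.config.LogicalDeviceConfiguration(memory_limit=1024),
       tf.config.LogicalDeviceConfiguration(memory_limit=1024)])
  logical_gpus = tf.config.list_logical_devices('GPU')
  # One physical GPU was replaced by two logical ones.
  assert len(logical_gpus) == len(gpus) + 1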
def _set_num_gpus(self):
  devices = config.list_logical_devices("GPU")
  per_worker_gpus = {}
  for d in devices:
    d_spec = tf_device.DeviceSpec.from_string(d.name)
    if d_spec.device_type == "GPU" and d_spec.job == "worker":
      # TODO(b/167894802): update if worker name is customizable
      job_spec = d_spec.replace(device_type=None, device_index=None)
      per_worker_gpus[job_spec] = per_worker_gpus.get(job_spec, 0) + 1

  num_gpus = 0
  for _, count in per_worker_gpus.items():
    if num_gpus > 0 and count != num_gpus:
      raise ValueError("Mismatched number of GPUs per worker")
    num_gpus = count

  self._num_gpus_per_worker = num_gpus
  logging.info(f"Number of GPUs on workers: {self._num_gpus_per_worker}")
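# Hedged sketch of the DeviceSpec parsing used above, via the public
# tf.DeviceSpec API; the device string is illustrative.
import tensorflow as tf

spec = tf.DeviceSpec.from_string('/job:worker/replica:0/task:1/device:GPU:0')
assert spec.job == 'worker'
assert spec.device_type == 'GPU' and spec.device_index == 0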
def testKeepLogicalDevice(self):
  # Cannot change logical devices after the context initialization.
  context._reset_context()  # pylint: disable=protected-access
  cluster_spec = multi_worker_test_base.create_cluster_spec(
      has_chief=False, num_workers=1)
  resolver = cluster_resolver_lib.SimpleClusterResolver(
      cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
      task_type='worker',
      task_id=0)
  gpus = tf_config.list_physical_devices('GPU')
  tf_config.set_logical_device_configuration(gpus[-1], [
      context.LogicalDeviceConfiguration(64),
      context.LogicalDeviceConfiguration(64),
  ])
  collective_all_reduce_strategy.CollectiveAllReduceStrategy(
      cluster_resolver=resolver)
  # Since we create two logical GPUs out of the last GPU, there should be one
  # more logical GPU than there are physical GPUs.
  self.assertLen(tf_config.list_logical_devices('GPU'), len(gpus) + 1)
  context._reset_context()  # pylint: disable=protected-access
def testForcedCompile(self):
  """Tests whole-function forced-compilation.

  This test checks that stateless_random_* can be used in forced-compilation
  scenarios (e.g. TPU). The new version of stateless_random_* requires the
  intermediate tensor `alg` to be compile-time constant, so we need to check
  that this requirement is met. We use xla.compile instead of tf.function's
  jit_compile because the latter doesn't throw an error even if the
  compile-time-constant constraint is not met.
  """
  if config.list_logical_devices('TPU'):
    self.skipTest('To accommodate OSS, xla.compile support for TPU is not '
                  'linked in.')

  @def_function.function
  def f(x):
    return xla.compile(
        lambda x: stateless.stateless_random_normal([], seed=x), [x])

  f([1, 2])
def testVirtualGpu(self):
  gpus = config.list_physical_devices('GPU')
  self.assertNotEqual(len(gpus), 0)

  self.assertIsNone(config.get_virtual_device_configuration(gpus[-1]))
  config.set_virtual_device_configuration(gpus[-1], [
      context.VirtualDeviceConfiguration(memory_limit=10),
      context.VirtualDeviceConfiguration(memory_limit=10)
  ])
  self.assertEqual(len(config.get_virtual_device_configuration(gpus[-1])), 2)

  logical_gpus = config.list_logical_devices('GPU')
  self.assertEqual(len(logical_gpus), len(gpus) + 1)
  for i in range(0, len(logical_gpus)):
    with ops.device('/device:GPU:' + str(i)):
      a = constant_op.constant(1.0)
      self.evaluate(a)

  with self.assertRaisesRegexp(RuntimeError, 'unknown device'):
    with ops.device('/device:GPU:' + str(len(logical_gpus))):
      a = constant_op.constant(1.0)
      self.evaluate(a)
def testGpuNone(self):
  config.set_soft_device_placement(False)
  gpus = config.list_physical_devices('GPU')
  self.assertGreater(len(gpus), 0)

  cpus = config.list_physical_devices('CPU')
  self.assertEqual(len(cpus), 1)

  self.assertEqual(len(config.get_visible_devices('CPU')), 1)
  self.assertGreater(len(config.get_visible_devices('GPU')), 0)

  # get_visible_devices filters out XLA_* devices. list_logical_devices does
  # not, but we can't call it here because it initializes the devices and
  # calling set_visible_devices after that is disallowed.
  self.assertEqual(len(config.get_visible_devices('XLA_GPU')), 0)

  config.set_visible_devices(cpus[0])

  self.assertEqual(len(config.get_visible_devices('CPU')), 1)
  self.assertEqual(len(config.get_visible_devices('GPU')), 0)
  self.assertEqual(len(config.list_logical_devices('XLA_GPU')), 0)

  with self.assertRaisesRegexp(errors.InvalidArgumentError,
                               'Could not satisfy'):
    with ops.device('/device:GPU:0'):
      a = array_ops.identity(1.0)
      self.evaluate(a)

  with self.assertRaisesRegexp(errors.InvalidArgumentError,
                               'Could not satisfy'):
    with ops.device('/device:XLA_GPU:0'):
      a = array_ops.identity(1.0)
      self.evaluate(a)

  # Modifying the visible devices is not supported
  with self.assertRaisesRegexp(RuntimeError, 'cannot be modified'):
    config.set_visible_devices(gpus)

  # Setting the same visible devices is fine
  config.set_visible_devices(cpus[0])
def testGpuGrowth(self):
  gpus = config.list_physical_devices('GPU')
  self.assertNotEqual(len(gpus), 0)

  self.assertIsNone(config.get_memory_growth(gpus[-1]))
  for gpu in gpus:
    config.set_memory_growth(gpu, True)

  c = context.context().config
  self.assertTrue(c.gpu_options.allow_growth)

  logical_gpus = config.list_logical_devices('GPU')
  self.assertEqual(len(logical_gpus), len(gpus))

  # Modifying the GPU configuration is not supported
  with self.assertRaisesRegex(RuntimeError, 'cannot be modified'):
    for gpu in gpus:
      config.set_memory_growth(gpu, False)

  # Setting the same GPU configuration is fine
  for gpu in gpus:
    config.set_memory_growth(gpu, True)
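# Hedged usage sketch of the memory-growth option exercised above, via the
# public tf.config API: enable growth on every visible GPU before the runtime
# is initialized, so TensorFlow allocates GPU memory on demand instead of
# reserving it all up front.
import tensorflow as tf

for gpu in tf.config.list_physical_devices('GPU'):
  tf.config.experimental.set_memory_growth(gpu, True)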
def _make_device_specs(
    devices: Optional[List[str]] = None,
    device_type: Optional[str] = None
) -> Tuple[List[tf_device.DeviceSpec], str]:
  """Makes device specs from local device names or from a device type."""
  if devices is None:
    if device_type is None:
      device_type = 'CPU'
    devices = [
        tf_device.DeviceSpec.from_string(d.name)
        for d in tf_config.list_logical_devices(device_type)
    ]
  else:
    devices = [tf_device.DeviceSpec.from_string(d) for d in devices]
    if device_type is None:
      device_type = devices[0].device_type
    if device_type.upper() != devices[0].device_type.upper():
      raise ValueError(
          f'Conflicting devices {str(devices)} and device_type {device_type}')
  return devices, device_type
def build(self, input_shape):
  # Note: most sparse optimizers do not have GPU kernels defined. When
  # building graphs, the placement algorithm is able to place variables on CPU
  # since it knows all kernels using the variable only exist on CPU.
  # When eager execution is enabled, the placement decision has to be made
  # right now. Checking for the presence of GPUs to avoid complicating the
  # TPU codepaths which can handle sparse optimizers.
  if context.executing_eagerly() and tf_config.list_logical_devices('GPU'):
    with ops.device('cpu:0'):
      self.embeddings = self.add_weight(
          shape=(self.input_dim, self.output_dim),
          initializer=self.embeddings_initializer,
          name='embeddings',
          regularizer=self.embeddings_regularizer,
          constraint=self.embeddings_constraint)
  else:
    self.embeddings = self.add_weight(
        shape=(self.input_dim, self.output_dim),
        initializer=self.embeddings_initializer,
        name='embeddings',
        regularizer=self.embeddings_regularizer,
        constraint=self.embeddings_constraint)
  self.built = True
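# Hedged sketch of the same eager-mode placement decision in user code: keep a
# large embedding variable on the CPU when a GPU is present, since sparse
# optimizer updates may lack GPU kernels. The variable shape is illustrative.
import tensorflow as tf

if tf.executing_eagerly() and tf.config.list_logical_devices('GPU'):
  with tf.device('CPU:0'):
    embeddings = tf.Variable(tf.random.uniform([10000, 128]))
else:
  embeddings = tf.Variable(tf.random.uniform([10000, 128]))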
def testWrapFuncDatasetDevice(self, device_type, dataset_reduce_fn):
  devices = config.list_logical_devices(device_type=device_type)
  if not devices:
    self.skipTest('Skip when {} is not detected by TF'.format(device_type))

  @def_function.function
  def comp():
    return dataset_reduce_fn(dataset_ops.Dataset.range(10))

  graph = comp.get_concrete_function().graph

  def function_to_wrap():
    with ops.device(devices[0].name):
      return graph_def_importer.import_graph_def(graph.as_graph_def())

  with ops.device(devices[0].name):
    wrapped_noarg_fn = wrap_function.wrap_function(
        function_to_wrap, signature=[])

  wrapped_noarg_fn()
def testForcedCompile(self):
  """Tests whole-function forced-compilation.

  This test checks that stateless_random_* can be used in forced-compilation
  scenarios (e.g. TPU). The new version of stateless_random_* requires the
  intermediate tensor `alg` to be compile-time constant, so we need to check
  that this requirement won't prevent `seed` from depending on variables.
  """
  if config.list_logical_devices('TPU'):
    self.skipTest('To accommodate OSS, experimental_compile support for TPU '
                  'is not linked in.')
  # GPU doesn't support int32 variables, so we use int64.
  v = variables.Variable([1, 2], dtype=dtypes.int64)

  @def_function.function(experimental_compile=True)
  def f():
    key, counter = (
        gen_stateless_random_ops_v2.stateless_random_get_key_counter(
            seed=math_ops.cast(v.read_value(), dtypes.int32)))
    alg = gen_stateless_random_ops_v2.stateless_random_get_alg()
    return gen_stateless_random_ops_v2.stateless_random_normal_v2(
        shape=[], key=key, counter=counter, alg=alg)

  f()
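# Hedged sketch using the public API: a jit-compiled function drawing
# stateless random numbers, as discussed in the docstrings above. The seed
# values are illustrative.
import tensorflow as tf

@tf.function(jit_compile=True)
def sample(seed):
  return tf.random.stateless_normal([], seed=seed)

sample(tf.constant([1, 2], dtype=tf.int32))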
def testGpuNone(self):
  gpus = config.list_physical_devices('GPU')
  self.assertGreater(len(gpus), 0)

  cpus = config.list_physical_devices('CPU')
  self.assertEqual(len(cpus), 1)

  self.assertEqual(len(config.get_visible_devices('CPU')), 1)
  self.assertGreater(len(config.get_visible_devices('GPU')), 0)

  # get_visible_devices filters out XLA_* devices. list_logical_devices does
  # not, but we can't call it here because it initializes the devices and
  # calling set_visible_devices after that is disallowed.
  self.assertEqual(len(config.get_visible_devices('XLA_GPU')), 0)

  config.set_visible_devices(cpus[0])

  self.assertEqual(len(config.get_visible_devices('CPU')), 1)
  self.assertEqual(len(config.get_visible_devices('GPU')), 0)
  self.assertEqual(len(config.list_logical_devices('XLA_GPU')), 0)

  with self.assertRaisesRegexp(RuntimeError, 'unknown device'):
    with ops.device('/device:GPU:0'):
      a = constant_op.constant(1.0)
      self.evaluate(a)

  with self.assertRaisesRegexp(RuntimeError, 'unknown device'):
    with ops.device('/device:XLA_GPU:0'):
      a = constant_op.constant(1.0)
      self.evaluate(a)

  # Modifying the visible devices is not supported
  with self.assertRaisesRegexp(RuntimeError, 'cannot be modified'):
    config.set_visible_devices(gpus)

  # Setting the same visible devices is fine
  config.set_visible_devices(cpus[0])
def all_local_devices(num_gpus=None):
  devices = config.list_logical_devices("GPU")
  if num_gpus is not None:
    devices = devices[:num_gpus]
  return devices or config.list_logical_devices("CPU")
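# Hedged sketch of the same fallback logic with the public tf.config API:
# prefer logical GPUs (optionally capped), otherwise fall back to CPUs. The
# function name local_compute_devices is hypothetical and only illustrates
# the helper above.
import tensorflow as tf

def local_compute_devices(num_gpus=None):
  gpus = tf.config.list_logical_devices('GPU')
  if num_gpus is not None:
    gpus = gpus[:num_gpus]
  return gpus or tf.config.list_logical_devices('CPU')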
def _query_tpu_system_metadata(master_address, cluster_def=None,
                               query_topology=False):
  """Automatically detects the TPU system metadata in the system."""
  tpu_core_count = 0
  devices = []
  device_dict = collections.defaultdict(list)

  if context.executing_eagerly():
    logical_devices = config.list_logical_devices()

    # We want the output type to match in both eager and session mode
    devices = [session_lib._DeviceAttributes(device_util.canonicalize(d.name),  # pylint: disable=protected-access
                                             d.device_type, 0, 0)
               for d in logical_devices]
  else:
    # TODO(b/120564445): Replace with standard library for retries.
    retry_count = 1
    while True:
      logging.info('Querying Tensorflow master (%s) for TPU system metadata.',
                   master_address)
      try:
        with ops.Graph().as_default():
          with session_lib.Session(
              master_address,
              config=get_session_config_with_timeout(
                  _PINGING_MASTER_TIMEOUT_IN_MS, cluster_def)) as sess:
            devices = sess.list_devices()
            break
      except errors.DeadlineExceededError:
        msg = ('Failed to connect to the Tensorflow master. The TPU worker '
               'may not be ready (still scheduling) or the Tensorflow master '
               'address is incorrect: got (%s).' % (master_address))

        # TODO(xiejw): For local or grpc master we might not need retry logic
        # here.
        if retry_count <= _RETRY_TIMES:
          logging.warning('%s', msg)
          logging.warning('Retrying (%d/%d).', retry_count, _RETRY_TIMES)
          retry_count += 1
        else:
          raise ValueError(msg)

  for device in devices:
    spec = tf_device.DeviceSpec.from_string(device.name)
    if spec.device_type == 'TPU':
      device_dict[spec.task].append(spec.device_index)
      tpu_core_count += 1

  num_of_cores_per_host = 0
  if tpu_core_count:
    num_cores_per_host_set = set(
        [len(core_ids) for core_ids in device_dict.values()])
    if len(num_cores_per_host_set) != 1:
      raise RuntimeError(
          'The number of TPU cores on each host is not the same. This should '
          'not happen. devices: {}'.format(devices))
    num_of_cores_per_host = num_cores_per_host_set.pop()

  topology = None
  if query_topology:
    if not tpu_core_count:
      raise RuntimeError(
          'Cannot find any TPU cores in the system (master address {}). '
          'This usually means the master address is incorrect or the '
          'TPU worker has some problems. Available devices: {}'.format(
              master_address, devices))

    topology = _obtain_topology(master_address, cluster_def)

  # We sort the metadata devices so that downstream users get a sorted list
  # for creating mirrored variables correctly.
  def _sort_key(device):
    spec = tf_device.DeviceSpec.from_string(device.name)
    return (spec.job, spec.replica, spec.task, spec.device_type,
            spec.device_index)
  devices = tuple(sorted(devices, key=_sort_key))

  metadata = _TPUSystemMetadata(
      num_cores=tpu_core_count,
      num_hosts=len(device_dict),
      num_of_cores_per_host=num_of_cores_per_host,
      topology=topology,
      devices=devices)

  if tpu_core_count:
    logging.info('Found TPU system:')
    logging.info('*** Num TPU Cores: %d', metadata.num_cores)
    logging.info('*** Num TPU Workers: %d', metadata.num_hosts)
    logging.info('*** Num TPU Cores Per Worker: %d',
                 metadata.num_of_cores_per_host)
    for device in metadata.devices:
      logging.info('*** Available Device: %s', device)
  else:
    logging.info('Failed to find TPU: %s', metadata)
  return metadata
def create_mesh(mesh_dims: Optional[List[Tuple[str, int]]] = None,
                mesh_name: str = '',
                devices: Optional[List[str]] = None,
                device_type: Optional[str] = None) -> layout.Mesh:
  """Creates a single-client mesh.

  If both `mesh_dims` and `devices` are specified, they must match each other.
  As a special case, when all arguments are missing, this creates a 1D CPU
  mesh with an empty name, assigning all available devices to that dimension.

  Args:
    mesh_dims: A list of (dim_name, dim_size) tuples. Defaults to a single
      batch-parallel dimension called 'x' using all devices. As a special
      case, a single-element mesh_dims whose dim_size is -1 also uses all
      devices.
    mesh_name: Name of the created mesh. Defaults to ''.
    devices: String representations of devices to use. This is the device part
      of tf.DeviceSpec, e.g. 'CPU:0'. Defaults to all available logical
      devices.
    device_type: If `devices` is missing, the type of devices to use. Defaults
      to 'CPU'.

  Returns:
    A single-client mesh created from specified or default arguments.
  """
  if devices is None:
    if device_type is None:
      device_type = 'CPU'
    devices = [
        tf_device.DeviceSpec.from_string(d.name)
        for d in tf_config.list_logical_devices(device_type)
    ]
  else:
    devices = [tf_device.DeviceSpec.from_string(d) for d in devices]
    if device_type is None:
      device_type = devices[0].device_type
    if device_type.upper() != devices[0].device_type.upper():
      raise ValueError(
          f'Conflicting devices {str(devices)} and device_type {device_type}')

  local_spec = tf_device.DeviceSpec(job='localhost', replica=0, task=0)
  devices = [local_spec.make_merged_spec(d) for d in devices]

  if mesh_dims is None:
    mesh_dims = [('x', len(devices))]
  elif len(mesh_dims) == 1 and mesh_dims[0][1] == -1:
    # Replace a -1 dim_size in a 1D mesh with the number of all devices.
    mesh_dims[0] = (mesh_dims[0][0], len(devices))

  dim_names = [d[0] for d in mesh_dims]
  shape = [d[1] for d in mesh_dims]

  global_device_ids = np.arange(len(devices)).reshape(shape)
  local_device_ids = np.ravel(global_device_ids).tolist()
  mesh = layout.Mesh(
      dim_names=dim_names,
      global_device_ids=global_device_ids,
      local_device_ids=local_device_ids,
      local_devices=devices,
      mesh_name=mesh_name)
  _print_context(
      num_global_devices=len(devices),
      num_clients=1,
      client_id=0,
      device_type=devices[0].device_type,
      mesh=mesh)
  return mesh
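# Hedged usage sketch: building a small 2-D CPU mesh with a helper like the
# one above. This assumes two logical CPUs have been configured and that the
# helper is exposed as tf.experimental.dtensor.create_mesh; the dimension
# names 'x' and 'y' are illustrative.
import tensorflow as tf
from tensorflow.experimental import dtensor

tf.config.set_logical_device_configuration(
    tf.config.list_physical_devices('CPU')[0],
    [tf.config.LogicalDeviceConfiguration(),
     tf.config.LogicalDeviceConfiguration()])
mesh = dtensor.create_mesh([('x', 2), ('y', 1)], device_type='CPU')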
def create_central_storage_strategy():
  """Create a CentralStorageStrategy, using a GPU if it is available."""
  compute_devices = ['cpu:0', 'gpu:0'] if (
      tf_config.list_logical_devices('GPU')) else ['cpu:0']
  return central_storage_strategy.CentralStorageStrategy(
      compute_devices, parameter_device='cpu:0')
def create_mirrored_strategy():
  """Create a MirroredStrategy, using a GPU if it is available."""
  if tf_config.list_logical_devices('GPU'):
    return mirrored_strategy.MirroredStrategy(['cpu:0', 'gpu:0'])
  else:
    return mirrored_strategy.MirroredStrategy(['cpu:0'])
def create_mirrored_strategy():
  if tf_config.list_logical_devices('GPU'):
    return mirrored_strategy.MirroredStrategy(['cpu:0', 'gpu:0'])
  else:
    return mirrored_strategy.MirroredStrategy(['cpu:0'])
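# Hedged sketch of the same device selection through the public tf.distribute
# API rather than the internal mirrored_strategy module used above; the device
# names are illustrative.
import tensorflow as tf

def make_strategy():
  if tf.config.list_logical_devices('GPU'):
    return tf.distribute.MirroredStrategy(['/cpu:0', '/gpu:0'])
  return tf.distribute.MirroredStrategy(['/cpu:0'])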