def testCpuMultiple(self):
    """Split the one physical CPU into two logical CPUs and run ops on both.

    Also checks that placing an op on a third, unconfigured CPU raises a
    `RuntimeError` matching 'unknown device'.
    """
    physical_cpus = config.list_physical_devices('CPU')
    self.assertEqual(len(physical_cpus), 1)

    virtual_cpus = [context.VirtualDeviceConfiguration() for _ in range(2)]
    config.set_virtual_device_configuration(physical_cpus[0], virtual_cpus)

    context.ensure_initialized()

    logical_cpus = config.list_logical_devices('CPU')
    self.assertEqual(len(logical_cpus), 2)

    # Both configured CPUs must accept an explicitly placed op.
    for device_name in ('/device:CPU:0', '/device:CPU:1'):
        with ops.device(device_name):
            self.evaluate(constant_op.constant(1.0))

    # Only two CPUs were configured, so index 2 is expected to be rejected.
    with self.assertRaisesRegexp(RuntimeError, 'unknown device'):
        with ops.device('/device:CPU:2'):
            self.evaluate(constant_op.constant(1.0))

    # Ensure we can place ops on each of the device names
    for logical_cpu in logical_cpus:
        with ops.device(logical_cpu.name):
            self.evaluate(constant_op.constant(1.0))
def testGpuInvalidConfig(self):
    """Exercise the interaction of memory growth and virtual GPU configs.

    Requires at least one physical GPU.  The test enables memory growth on
    every GPU, checks that a virtual configuration without memory limits is
    rejected, then applies one with limits and checks that growth is
    reported as disabled and can no longer be changed on that GPU.
    """
    physical_gpus = config.list_physical_devices('GPU')
    self.assertNotEqual(len(physical_gpus), 0)

    for physical_gpu in physical_gpus:
        config.set_memory_growth(physical_gpu, True)

    self.assertTrue(context.context().config.gpu_options.allow_growth)

    last_gpu = physical_gpus[-1]

    # Virtual devices without an explicit memory limit are expected to fail.
    unlimited = [
        context.VirtualDeviceConfiguration(),
        context.VirtualDeviceConfiguration(),
    ]
    with self.assertRaisesRegexp(ValueError, 'memory limit'):
        config.set_virtual_device_configuration(last_gpu, unlimited)

    # The rejected call must not have left a configuration behind.
    self.assertIsNone(config.get_virtual_device_configuration(last_gpu))

    limited = [
        context.VirtualDeviceConfiguration(memory_limit=10),
        context.VirtualDeviceConfiguration(memory_limit=10),
    ]
    config.set_virtual_device_configuration(last_gpu, limited)

    self.assertFalse(context.context().config.gpu_options.allow_growth)

    with self.assertRaisesRegexp(ValueError, 'virtual devices'):
        config.set_memory_growth(last_gpu, False)
def _mimic_two_cpus():
    """Configure the first physical CPU as two virtual CPU devices."""
    first_cpu = config.list_physical_devices("CPU")[0]
    two_virtual_cpus = [context.VirtualDeviceConfiguration() for _ in range(2)]
    config.set_virtual_device_configuration(first_cpu, two_virtual_cpus)
def testGpuInvalidConfig(self):
    """Exercise the interaction of memory growth and virtual GPU configs.

    Requires at least one physical GPU.  The test enables memory growth on
    every GPU, checks that a virtual configuration without memory limits is
    rejected, then applies one with limits and checks that growth is
    reported as disabled and can no longer be changed on that GPU.
    """
    physical_gpus = config.list_physical_devices('GPU')
    self.assertNotEqual(len(physical_gpus), 0)

    for physical_gpu in physical_gpus:
        config.set_memory_growth(physical_gpu, True)

    self.assertTrue(context.context().config.gpu_options.allow_growth)

    last_gpu = physical_gpus[-1]

    # Virtual devices without an explicit memory limit are expected to fail.
    unlimited = [
        context.VirtualDeviceConfiguration(),
        context.VirtualDeviceConfiguration(),
    ]
    with self.assertRaisesRegexp(ValueError, 'memory limit'):
        config.set_virtual_device_configuration(last_gpu, unlimited)

    # The rejected call must not have left a configuration behind.
    self.assertIsNone(config.get_virtual_device_configuration(last_gpu))

    limited = [
        context.VirtualDeviceConfiguration(memory_limit=10),
        context.VirtualDeviceConfiguration(memory_limit=10),
    ]
    config.set_virtual_device_configuration(last_gpu, limited)

    self.assertFalse(context.context().config.gpu_options.allow_growth)

    with self.assertRaisesRegexp(ValueError, 'virtual devices'):
        config.set_memory_growth(last_gpu, False)
def configure_virtual_cpus():
    """Set up two virtual CPUs on top of the first physical CPU."""
    physical_cpus = config.list_physical_devices('CPU')
    # Set 2 virtual CPUs
    virtual_cpus = [context.VirtualDeviceConfiguration() for _ in range(2)]
    config.set_virtual_device_configuration(physical_cpus[0], virtual_cpus)
def testCpuMultiple(self):
    """Split the one physical CPU into two logical CPUs and run ops on both.

    Also checks that placing an op on a third, unconfigured CPU raises a
    `RuntimeError` matching 'unknown device'.
    """
    physical_cpus = config.list_physical_devices('CPU')
    self.assertEqual(len(physical_cpus), 1)

    virtual_cpus = [context.VirtualDeviceConfiguration() for _ in range(2)]
    config.set_virtual_device_configuration(physical_cpus[0], virtual_cpus)

    context.ensure_initialized()

    logical_cpus = config.list_logical_devices('CPU')
    self.assertEqual(len(logical_cpus), 2)

    # Both configured CPUs must accept an explicitly placed op.
    for device_name in ('/device:CPU:0', '/device:CPU:1'):
        with ops.device(device_name):
            self.evaluate(constant_op.constant(1.0))

    # Only two CPUs were configured, so index 2 is expected to be rejected.
    with self.assertRaisesRegexp(RuntimeError, 'unknown device'):
        with ops.device('/device:CPU:2'):
            self.evaluate(constant_op.constant(1.0))

    # Ensure we can place ops on each of the device names
    for logical_cpu in logical_cpus:
        with ops.device(logical_cpu.name):
            self.evaluate(constant_op.constant(1.0))
def worker_fn():
    """Optionally cap GPU memory, then run `worker_step_fn` 100 times."""
    physical_gpus = config.list_physical_devices('GPU')
    if physical_gpus:
        # Set virtual GPU with memory limit of 64MB so that multiple worker
        # processes can share the physical GPU
        shared_gpu_config = [context.VirtualDeviceConfiguration(64)]
        config.set_virtual_device_configuration(physical_gpus[0],
                                                shared_gpu_config)

    # The steps run whether or not a GPU was found.
    for _ in range(100):
        worker_step_fn()
def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
    """Initialise the test case; with exactly one GPU, split it into two.

    Args:
      methodName: name of the test method, forwarded to the base class.
    """
    super(TestMultiGPUModel, self).__init__(methodName)
    gpus = config.list_physical_devices('GPU')
    if len(gpus) != 1:
        return

    # A GPU is available, simulate 2 instead.
    simulated_gpus = [
        context.VirtualDeviceConfiguration(500),
        context.VirtualDeviceConfiguration(500),
    ]
    config.set_virtual_device_configuration(gpus[0], simulated_gpus)
def _ensure_context_initialized(self):
    """Split the first GPU into two virtual devices and initialise the context.

    Skips the calling test when no physical GPU is present.
    """
    physical_gpus = config.list_physical_devices('GPU')
    gpu_count = len(physical_gpus)
    if gpu_count < 1:
        self.skipTest(
            'Expected at least 1 GPU but found {} GPUs'.format(gpu_count))

    virtual_gpus = [
        context.VirtualDeviceConfiguration(1024),
        context.VirtualDeviceConfiguration(1024),
    ]
    config.set_virtual_device_configuration(physical_gpus[0], virtual_gpus)
    context.ensure_initialized()
def setUp(self):
    """Run the base-class set-up, then configure four virtual CPUs."""
    super(FunctionGradientsTest, self).setUp()
    physical_cpus = config.list_physical_devices('CPU')
    # Set 4 virtual CPUs
    virtual_cpus = [context.VirtualDeviceConfiguration() for _ in range(4)]
    config.set_virtual_device_configuration(physical_cpus[0], virtual_cpus)
def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
    """Initialise the test case; simulate two GPUs when only one exists.

    The split is skipped when any XLA_GPU device is listed.

    Args:
      methodName: name of the test method, forwarded to the base class.
    """
    super(TestMultiGPUModel, self).__init__(methodName)
    physical_gpus = config.list_physical_devices('GPU')
    physical_xla_gpus = config.list_physical_devices('XLA_GPU')

    # NOTE: XLA devices don't support the set_virtual_device_configuration
    # codepaths.
    if physical_xla_gpus or len(physical_gpus) != 1:
        return

    # A GPU is available, simulate 2 instead.
    simulated_gpus = [
        context.VirtualDeviceConfiguration(500),
        context.VirtualDeviceConfiguration(500),
    ]
    config.set_virtual_device_configuration(physical_gpus[0], simulated_gpus)
def testGpuInvalidConfig(self):
    """Exercise invalid combinations of growth, visibility and virtual GPUs.

    Requires at least one physical GPU.  With more than one GPU the test
    first checks that differing memory-growth settings across visible GPUs
    are rejected.  It then checks that virtual devices need memory limits
    and that memory growth cannot be changed on a GPU that has them.
    """
    physical_gpus = config.list_physical_devices('GPU')
    self.assertNotEqual(len(physical_gpus), 0)

    if len(physical_gpus) > 1:
        first_gpu = physical_gpus[0]
        second_gpu = physical_gpus[1]

        # Assert if other GPUs were not configured
        config.set_memory_growth(first_gpu, True)
        with self.assertRaisesRegexp(ValueError, 'cannot differ'):
            _ = context.context().config

        # If we limit visibility to GPU 0, growth is fine
        config.set_visible_devices(first_gpu, 'GPU')
        self.assertTrue(context.context().config.gpu_options.allow_growth)

        # Default setting for second GPU is False and works if we set
        # visibility
        config.set_visible_devices(second_gpu, 'GPU')
        self.assertFalse(context.context().config.gpu_options.allow_growth)

        # Growth now fails because all the GPUs are visible and not the same
        config.set_visible_devices(physical_gpus, 'GPU')
        with self.assertRaisesRegexp(ValueError, 'cannot differ'):
            _ = context.context().config

    for physical_gpu in physical_gpus:
        config.set_memory_growth(physical_gpu, True)

    self.assertTrue(context.context().config.gpu_options.allow_growth)

    last_gpu = physical_gpus[-1]

    # Virtual devices without an explicit memory limit are expected to fail.
    unlimited = [
        context.VirtualDeviceConfiguration(),
        context.VirtualDeviceConfiguration(),
    ]
    with self.assertRaisesRegexp(ValueError, 'memory limit'):
        config.set_virtual_device_configuration(last_gpu, unlimited)

    # The rejected call must not have left a configuration behind.
    self.assertIsNone(config.get_virtual_device_configuration(last_gpu))

    limited = [
        context.VirtualDeviceConfiguration(memory_limit=10),
        context.VirtualDeviceConfiguration(memory_limit=10),
    ]
    config.set_virtual_device_configuration(last_gpu, limited)

    self.assertFalse(context.context().config.gpu_options.allow_growth)

    with self.assertRaisesRegexp(ValueError, 'virtual devices'):
        config.set_memory_growth(last_gpu, False)
def testCollectiveReduceMinMax(self):
    """All-reduce two vectors across two virtual GPUs with 'Max' and 'Min'.

    Skipped unless exactly one physical GPU is present; that GPU is split
    into two virtual devices, one per collective participant.
    """
    physical_gpus = config.list_physical_devices('GPU')
    if len(physical_gpus) != 1:
        self.skipTest(
            'Expected 1 GPU but found {} GPUs'.format(len(physical_gpus)))

    virtual_gpus = [
        context.VirtualDeviceConfiguration(1024),
        context.VirtualDeviceConfiguration(1024),
    ]
    config.set_virtual_device_configuration(physical_gpus[0], virtual_gpus)
    context.ensure_initialized()

    @def_function.function
    def run_all_reduce(group_key, instance_key, merge_op):
        """Reduce one fixed vector per virtual GPU and return both results."""
        group_size = 2
        values_gpu0 = [1., 20., 3., 40., 5.]
        values_gpu1 = [10., 2., 30., 4., 50.]
        os.environ['NCCL_DEBUG'] = 'INFO'
        os.environ['NCCL_LAUNCH_MODE'] = 'PARALLEL'
        with ops.device('/GPU:0'):
            input_gpu0 = constant_op.constant(values_gpu0)
            reduced_gpu0 = collective_ops.all_reduce(
                input_gpu0, group_size, group_key, instance_key, merge_op,
                final_op='Id', communication_hint='nccl')
        with ops.device('/GPU:1'):
            input_gpu1 = constant_op.constant(values_gpu1)
            reduced_gpu1 = collective_ops.all_reduce(
                input_gpu1, group_size, group_key, instance_key, merge_op,
                final_op='Id', communication_hint='nccl')
        return reduced_gpu0, reduced_gpu1

    # Each case pairs a merge op with the element-wise result it must yield.
    cases = [
        ('Max', [10., 20., 30., 40., 50.]),
        ('Min', [1., 2., 3., 4., 5.]),
    ]
    for merge_op, expected in cases:
        results = run_all_reduce(
            group_key=10, instance_key=20, merge_op=merge_op)
        for result in results:
            self.assertAllClose(result, expected, rtol=1e-5, atol=1e-5)
def SetVirtualCpus(num_virtual_cpus):
    """Create virtual CPU devices if they haven't yet been created.

    When the first physical CPU already has a virtual device configuration,
    nothing is changed; the existing configuration only has to provide at
    least `num_virtual_cpus` devices.

    Args:
      num_virtual_cpus: number of virtual CPUs required; must be at least 1.

    Raises:
      ValueError: if `num_virtual_cpus` is less than 1.
      RuntimeError: if no physical CPU is listed, or if the existing
        configuration has fewer than `num_virtual_cpus` devices.
    """
    if num_virtual_cpus < 1:
        raise ValueError('`num_virtual_cpus` must be at least 1 not %r' %
                         (num_virtual_cpus,))

    cpus = device_config.list_physical_devices('CPU')
    if not cpus:
        raise RuntimeError('No CPUs found')

    first_cpu = cpus[0]
    existing = device_config.get_virtual_device_configuration(first_cpu)
    if existing is None:
        requested = [context.VirtualDeviceConfiguration()
                     for _ in range(num_virtual_cpus)]
        device_config.set_virtual_device_configuration(first_cpu, requested)
        return

    if len(existing) < num_virtual_cpus:
        raise RuntimeError('Already configured with %d < %d virtual CPUs' %
                           (len(existing), num_virtual_cpus))
def set_virtual_cpus_to_at_least(num_virtual_cpus):
    """Create virtual CPU devices if they haven't yet been created.

    An existing virtual device configuration on the first physical CPU is
    left untouched, but it must contain at least `num_virtual_cpus` devices.

    Args:
      num_virtual_cpus: minimum number of virtual CPUs; must be at least 1.

    Raises:
      ValueError: if `num_virtual_cpus` is less than 1.
      RuntimeError: if no physical CPU is listed, or if fewer than
        `num_virtual_cpus` virtual CPUs are already configured.
    """
    if num_virtual_cpus < 1:
        raise ValueError("`num_virtual_cpus` must be at least 1 not %r" %
                         (num_virtual_cpus, ))

    cpu_devices = config.list_physical_devices("CPU")
    if not cpu_devices:
        raise RuntimeError("No CPUs found")

    primary_cpu = cpu_devices[0]
    current = config.get_virtual_device_configuration(primary_cpu)
    if current is None:
        wanted = [
            context.VirtualDeviceConfiguration()
            for _ in range(num_virtual_cpus)
        ]
        config.set_virtual_device_configuration(primary_cpu, wanted)
    elif len(current) < num_virtual_cpus:
        raise RuntimeError("Already configured with %d < %d virtual CPUs" %
                           (len(current), num_virtual_cpus))
def testVirtualGpu(self):
    """Split the last physical GPU in two and place ops on every logical GPU.

    Requires at least one physical GPU.  After the split, the number of
    logical GPUs must be the number of physical GPUs plus one, every logical
    index must accept an op, and the first index past the end must raise a
    `RuntimeError` matching 'unknown device'.
    """
    gpus = config.list_physical_devices('GPU')
    self.assertNotEqual(len(gpus), 0)

    last_gpu = gpus[-1]
    self.assertIsNone(config.get_virtual_device_configuration(last_gpu))
    config.set_virtual_device_configuration(last_gpu, [
        context.VirtualDeviceConfiguration(memory_limit=10),
        context.VirtualDeviceConfiguration(memory_limit=10),
    ])
    self.assertEqual(len(config.get_virtual_device_configuration(last_gpu)), 2)

    logical_gpus = config.list_logical_devices('GPU')
    # One physical GPU became two logical ones, so the logical count is the
    # physical count plus one.  This must be assertEqual: assertTrue would
    # take the second argument as the failure message and pass for any
    # non-zero count.
    self.assertEqual(len(logical_gpus), len(gpus) + 1)

    for index in range(len(logical_gpus)):
        with ops.device('/device:GPU:' + str(index)):
            self.evaluate(constant_op.constant(1.0))

    # The index one past the last logical GPU does not exist.
    with self.assertRaisesRegexp(RuntimeError, 'unknown device'):
        with ops.device('/device:GPU:' + str(len(logical_gpus))):
            self.evaluate(constant_op.constant(1.0))
def testVirtualGpu(self):
    """Split the last physical GPU in two and place ops on every logical GPU.

    Requires at least one physical GPU.  After the split, the number of
    logical GPUs must be the number of physical GPUs plus one, every logical
    index must accept an op, and the first index past the end must raise a
    `RuntimeError` matching 'unknown device'.
    """
    gpus = config.list_physical_devices('GPU')
    self.assertNotEqual(len(gpus), 0)

    last_gpu = gpus[-1]
    self.assertIsNone(config.get_virtual_device_configuration(last_gpu))
    config.set_virtual_device_configuration(last_gpu, [
        context.VirtualDeviceConfiguration(memory_limit=10),
        context.VirtualDeviceConfiguration(memory_limit=10),
    ])
    self.assertEqual(len(config.get_virtual_device_configuration(last_gpu)), 2)

    logical_gpus = config.list_logical_devices('GPU')
    # One physical GPU became two logical ones, so the logical count is the
    # physical count plus one.  This must be assertEqual: assertTrue would
    # take the second argument as the failure message and pass for any
    # non-zero count.
    self.assertEqual(len(logical_gpus), len(gpus) + 1)

    for index in range(len(logical_gpus)):
        with ops.device('/device:GPU:' + str(index)):
            self.evaluate(constant_op.constant(1.0))

    # The index one past the last logical GPU does not exist.
    with self.assertRaisesRegexp(RuntimeError, 'unknown device'):
        with ops.device('/device:GPU:' + str(len(logical_gpus))):
            self.evaluate(constant_op.constant(1.0))
def testCollectiveGroupSizeMismatch(self):
    """Check that all-reduce with disagreeing group sizes raises an error.

    Two virtual CPUs join the same group and instance key but declare group
    sizes 2 and 3; running the function must raise `errors.InternalError`
    matching 'but that group has size'.
    """
    physical_cpus = config.list_physical_devices('CPU')
    self.assertEqual(len(physical_cpus), 1)

    virtual_cpus = [context.VirtualDeviceConfiguration() for _ in range(2)]
    config.set_virtual_device_configuration(physical_cpus[0], virtual_cpus)
    context.ensure_initialized()

    @def_function.function
    def run_all_reduce():
        """Issue one all-reduce per virtual CPU with mismatched sizes."""
        group_key = 10
        instance_key = 20
        values_cpu0 = [1, 2, 3, 4]
        values_cpu1 = [5, 6, 7, 8]

        with ops.device('/CPU:0'):
            input_cpu0 = constant_op.constant(values_cpu0)
            reduced_cpu0 = collective_ops.all_reduce(
                input_cpu0, group_size=2, group_key=group_key,
                instance_key=instance_key, merge_op='Add', final_op='Id')
        with ops.device('/CPU:1'):
            input_cpu1 = constant_op.constant(values_cpu1)
            # Deliberately differs from the group size used on CPU:0.
            reduced_cpu1 = collective_ops.all_reduce(
                input_cpu1, group_size=3, group_key=group_key,
                instance_key=instance_key, merge_op='Add', final_op='Id')
        return reduced_cpu0, reduced_cpu1

    with self.assertRaisesRegexp(errors.InternalError,
                                 'but that group has size'):
        run_all_reduce()