async def restore_from_container(
    self,
    container: Container,
    alloc_map: AbstractAllocMap,
) -> None:
    """
    Restore the CUDA device allocations recorded for an already-running
    container into the agent's allocation map.
    """
    # Do nothing when the CUDA plugin is disabled.
    if not self.enabled:
        return
    resource_spec = await get_resource_spec_from_container(container.backend_obj)
    if resource_spec is None:
        # No resource record is available for this container; nothing to restore.
        return
    # Newer alloc-map implementations expose apply_allocation(); fall back to
    # mutating the allocations mapping in place for older implementations.
    if hasattr(alloc_map, 'apply_allocation'):
        alloc_map.apply_allocation({
            SlotName('cuda.device'): resource_spec.allocations.get(
                DeviceName('cuda'), {}
            ).get(
                SlotName('cuda.device'), {}
            ),
        })
    else:
        alloc_map.allocations[SlotName('cuda.device')].update(
            resource_spec.allocations.get(
                DeviceName('cuda'), {}
            ).get(
                SlotName('cuda.device'), {}
            )
        )
def test_fraction_alloc_map_random_generated_allocations():
    """
    Allocate random fractional amounts repeatedly and free them all,
    checking that usage never goes negative and always returns to zero.
    """
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId('a0'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(1.0)),
            DeviceId('a1'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(1.0)),
        },
        allocation_strategy=FractionAllocationStrategy.FILL,
    )
    assert alloc_map.allocations[SlotName('x')][DeviceId('a0')] == Decimal('0')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal('0')
    quantum = Decimal('.01')
    for _ in range(5):
        allocations = []
        for _ in range(10):
            # Request a random amount in [0, 0.1), truncated down to the quantum.
            result = alloc_map.allocate({
                SlotName('x'):
                Decimal(random.uniform(0, 0.1)).quantize(quantum, ROUND_DOWN),
            })
            allocations.append(result)
            # Usage must never drop below zero while allocations are active.
            assert alloc_map.allocations[SlotName('x')][DeviceId('a0')] >= Decimal('0')
            assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] >= Decimal('0')
        for a in allocations:
            alloc_map.free(a)
        # After freeing every allocation the usage must be exactly zero again.
        assert alloc_map.allocations[SlotName('x')][DeviceId('a0')] == Decimal('0')
        assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal('0')
def check_clean():
    # Verify that every device tracked by the (enclosing scope's) alloc_map
    # has returned to a zero allocation: two 1g.5gb MIG slices, two
    # fractional-share devices, and one 3g.20gb MIG slice.
    assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][
        DeviceId('a0')] == Decimal('0')
    assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][
        DeviceId('a1')] == Decimal('0')
    assert alloc_map.allocations[SlotName('cuda.shares')][
        DeviceId('a2')] == Decimal('0')
    assert alloc_map.allocations[SlotName('cuda.shares')][
        DeviceId('a3')] == Decimal('0')
    assert alloc_map.allocations[SlotName('cuda.device:3g.20gb-mig')][
        DeviceId('a4')] == Decimal('0')
async def restore_from_container(
    self,
    container: Container,
    alloc_map: AbstractAllocMap,
) -> None:
    """
    Re-register an existing container's CPU core allocation into the
    agent's allocation map.
    """
    assert isinstance(alloc_map, DiscretePropertyAllocMap)
    # Docker does not report the cpuset the container was started with,
    # so we consult our own persisted resource records instead.
    resource_spec = await get_resource_spec_from_container(container.backend_obj)
    if resource_spec is None:
        return
    cpu_allocation = resource_spec.allocations[DeviceName('cpu')][SlotName('cpu')]
    alloc_map.apply_allocation({SlotName('cpu'): cpu_allocation})
async def create_alloc_map(self) -> AbstractAllocMap:
    """Build a discrete allocation map with one 'cpu' count slot per device."""
    slot_infos = {
        dev.device_id: DeviceSlotInfo(
            SlotTypes.COUNT, SlotName('cpu'), Decimal(dev.processing_units),
        )
        for dev in await self.list_devices()
    }
    return DiscretePropertyAllocMap(device_slots=slot_infos)
async def create_alloc_map(self) -> AbstractAllocMap:
    """Build a discrete allocation map with one 'mem' byte-sized slot per device."""
    slot_infos = {
        dev.device_id: DeviceSlotInfo(
            SlotTypes.BYTES, SlotName('mem'), Decimal(dev.memory_size),
        )
        for dev in await self.list_devices()
    }
    return DiscretePropertyAllocMap(device_slots=slot_infos)
def read_from_string(cls, text: str) -> 'KernelResourceSpec':
    """
    Parse a key=value resource record dump (as written alongside a kernel)
    back into a KernelResourceSpec instance.
    """
    kvpairs = {}
    for line in text.split('\n'):
        # Skip lines that are not key=value pairs (blank lines, headers).
        if '=' not in line:
            continue
        key, val = line.strip().split('=', maxsplit=1)
        kvpairs[key] = val
    allocations = cast(
        MutableMapping[
            DeviceName,
            MutableMapping[SlotName, Mapping[DeviceId, Decimal]]],
        defaultdict(lambda: defaultdict(Decimal)),
    )
    for key, val in kvpairs.items():
        # Keys like "CUDA_DEVICE_SHARES" carry per-device allocation lists.
        if key.endswith('_SHARES'):
            slot_name = SlotName(key[:-7].lower())  # strip the '_SHARES' suffix
            # The device name is the slot name's prefix before the first dot.
            device_name = DeviceName(slot_name.split('.')[0])
            per_device_alloc: MutableMapping[DeviceId, Decimal] = {}
            for entry in val.split(','):
                # Each entry is "<device-id>:<amount>".
                raw_dev_id, _, raw_alloc = entry.partition(':')
                if not raw_dev_id or not raw_alloc:
                    continue
                dev_id = DeviceId(raw_dev_id)
                try:
                    if known_slot_types.get(slot_name, 'count') == 'bytes':
                        alloc = Decimal(BinarySize.from_str(raw_alloc))
                    else:
                        alloc = Decimal(raw_alloc)
                except KeyError as e:
                    # NOTE(review): with .get() above, the slot-type lookup
                    # cannot raise KeyError — confirm which call this handler
                    # is meant to guard.
                    log.warning(
                        'A previously launched container has '
                        'unknown slot type: {}. Ignoring it.', e.args[0])
                    continue
                per_device_alloc[dev_id] = alloc
            allocations[device_name][slot_name] = per_device_alloc
    mounts = [Mount.from_str(m) for m in kvpairs['MOUNTS'].split(',') if m]
    return cls(
        container_id=kvpairs.get('CID', 'unknown'),
        scratch_disk_size=BinarySize.finite_from_str(kvpairs['SCRATCH_SIZE']),
        allocations=dict(allocations),
        slots=ResourceSlot(json.loads(kvpairs['SLOTS'])),
        mounts=mounts,
    )
async def get_attached_devices(
    self,
    device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
) -> Sequence[DeviceModelInfo]:
    """
    Return model information for the CUDA devices referenced by the given
    per-slot allocation mapping.
    """
    device_ids: List[DeviceId] = []
    # FIX: the allocation map uses the singular slot name 'cuda.device'
    # (see available_slots / restore_from_container); the previous lookup
    # of 'cuda.devices' never matched, so this always returned an empty
    # list.  The old key is still checked for backward compatibility.
    for slot_name in (SlotName('cuda.device'), SlotName('cuda.devices')):
        if slot_name in device_alloc:
            device_ids.extend(device_alloc[slot_name].keys())
    available_devices = await self.list_devices()
    attached_devices: List[DeviceModelInfo] = []
    for device in available_devices:
        if device.device_id in device_ids:
            proc = device.processing_units
            mem = BinarySize(device.memory_size)
            attached_devices.append({
                # TODO: update common.types.DeviceModelInfo
                'device_id': device.device_id,
                'model_name': device.model_name,
                'smp': proc,
                'mem': mem,
            })
    return attached_devices
async def restore_from_container(
    self,
    container: Container,
    alloc_map: AbstractAllocMap,
) -> None:
    """
    Re-register an existing container's memory allocation into the
    agent's allocation map using its configured hard memory limit.
    """
    assert isinstance(alloc_map, DiscretePropertyAllocMap)
    limit_bytes = container.backend_obj['HostConfig']['Memory']
    alloc_map.apply_allocation(
        {SlotName('mem'): {DeviceId('root'): limit_bytes}},
    )
async def get_attached_devices(
    self,
    device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
) -> Sequence[DeviceModelInfo]:
    """
    Return minimal device-info entries for the memory devices referenced
    by the given 'mem' slot allocation.
    """
    allocated_ids = set(device_alloc[SlotName('mem')].keys())
    return [
        {
            'device_id': dev.device_id,
            'model_name': '',
            'data': {},
        }
        for dev in await self.list_devices()
        if dev.device_id in allocated_ids
    ]
async def resolve_occupied_slots(
        self, info: graphene.ResolveInfo) -> Mapping[str, Any]:
    """
    Calculate the sum of occupied resource slots of all sub-kernels,
    and return the JSON-serializable object from the sum result.
    """
    manager = info.context['dlmgr']
    loader = manager.get_loader('ComputeContainer.by_session')
    containers = await loader.load(self.session_id)
    # Start from an empty ResourceSlot so an empty container list still
    # sums to a valid (all-zero) slot mapping.
    zero = ResourceSlot()
    return sum(
        (ResourceSlot(
            {SlotName(k): Decimal(v) for k, v in c.occupied_slots.items()})
         for c in containers),
        start=zero,
    ).to_json()
def test_exclusive_resource_slots():
    """
    Slot types configured as mutually exclusive must not be allocatable
    together within a single request.
    """
    alloc_map = DiscretePropertyAllocMap(
        device_slots={
            DeviceId('a0'):
            DeviceSlotInfo(SlotTypes.UNIQUE, SlotName('cuda.device:1g.5gb-mig'), Decimal(1)),  # noqa
            DeviceId('a1'):
            DeviceSlotInfo(SlotTypes.UNIQUE, SlotName('cuda.device:1g.5gb-mig'), Decimal(1)),  # noqa
            DeviceId('a2'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('cuda.device'), Decimal(1)),
            DeviceId('a3'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('cuda.device'), Decimal(1)),
            DeviceId('a4'):
            DeviceSlotInfo(SlotTypes.UNIQUE, SlotName('cuda.device:3g.20gb-mig'), Decimal(1)),  # noqa
        },
        exclusive_slot_types={'cuda.device:*-mig', 'cuda.device', 'cuda.shares'},
    )

    def check_clean():
        # Every device must report zero usage.
        assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][
            DeviceId('a0')] == Decimal('0')
        assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][
            DeviceId('a1')] == Decimal('0')
        assert alloc_map.allocations[SlotName('cuda.device')][
            DeviceId('a2')] == Decimal('0')
        assert alloc_map.allocations[SlotName('cuda.device')][
            DeviceId('a3')] == Decimal('0')
        assert alloc_map.allocations[SlotName('cuda.device:3g.20gb-mig')][
            DeviceId('a4')] == Decimal('0')

    # Requesting plain devices together with a MIG slice violates the
    # exclusivity rule and must not change the map's state.
    with pytest.raises(InvalidResourceCombination):
        alloc_map.allocate({
            SlotName('cuda.device'): Decimal('2'),
            SlotName('cuda.device:1g.5gb-mig'): Decimal('1'),
        })
    check_clean()
def test_fraction_alloc_map_even_allocation_many_devices_2():
    """
    EVENLY strategy over 8 identical 1.0-capacity devices: a request of 6
    saturates exactly six devices and leaves two untouched.
    """
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId('a0'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('1.0')),
            DeviceId('a1'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('1.0')),
            DeviceId('a2'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('1.0')),
            DeviceId('a3'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('1.0')),
            DeviceId('a4'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('1.0')),
            DeviceId('a5'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('1.0')),
            DeviceId('a6'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('1.0')),
            DeviceId('a7'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('1.0')),
        },
        allocation_strategy=FractionAllocationStrategy.EVENLY,
    )
    result = alloc_map.allocate({SlotName('x'): Decimal('6')})
    count_0 = 0
    count_1 = 0
    # NOTE: the even allocator favors the tail of the device list when it
    # fills up, so we count the per-device allocation values instead of
    # matching specific device indices to allocations.
    for idx in range(8):
        if alloc_map.allocations[SlotName('x')][DeviceId(
                f'a{idx}')] == Decimal('1.0'):
            count_1 += 1
        if alloc_map.allocations[SlotName('x')][DeviceId(
                f'a{idx}')] == Decimal('0'):
            count_0 += 1
    assert count_0 == 2
    assert count_1 == 6
    alloc_map.free(result)
    for idx in range(8):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')
def test_fraction_alloc_map_even_allocation_many_devices():
    """
    EVENLY strategy over heterogeneous device capacities, with and without
    a ``min_memory`` filter; every scenario must free back to exact zero.
    """
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId('a0'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(2)),
            DeviceId('a1'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(3)),
            DeviceId('a2'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(3)),
            DeviceId('a3'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(5)),
        },
        allocation_strategy=FractionAllocationStrategy.EVENLY,
    )
    result = alloc_map.allocate({SlotName('x'): Decimal('6')})
    assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal('3')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal('3')
    alloc_map.free(result)
    for idx in range(4):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')

    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId('a0'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(1)),
            DeviceId('a1'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(1.5)),
            DeviceId('a2'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(2)),
            DeviceId('a3'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(3)),
            DeviceId('a4'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(3)),
            DeviceId('a5'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(4)),
            DeviceId('a6'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(4.5)),
            DeviceId('a7'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(5)),
            DeviceId('a8'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(5)),
        },
        allocation_strategy=FractionAllocationStrategy.EVENLY,
    )
    result = alloc_map.allocate({SlotName('x'): Decimal('6')},
                                min_memory=Decimal('2.5'))
    assert alloc_map.allocations[SlotName('x')][DeviceId('a3')] == Decimal('3')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a4')] == Decimal('3')
    alloc_map.free(result)
    for idx in range(9):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')

    # 11 units spread evenly over four eligible devices: 4 x 2.75.
    result = alloc_map.allocate({SlotName('x'): Decimal('11')},
                                min_memory=Decimal('0.84'))
    assert alloc_map.allocations[SlotName('x')][DeviceId('a3')] == Decimal('2.75')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a4')] == Decimal('2.75')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a5')] == Decimal('2.75')
    # BUG FIX: this assertion previously re-checked 'a5'; the fourth 2.75
    # share (4 x 2.75 == 11) belongs to 'a6', which was never verified.
    assert alloc_map.allocations[SlotName('x')][DeviceId('a6')] == Decimal('2.75')
    alloc_map.free(result)
    for idx in range(9):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')
def test_fraction_alloc_map_even_allocation_fractions():
    """
    EVENLY strategy splits a fractional request as equally as possible,
    clamping each device at its remaining capacity.
    """
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId('a0'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('0.8')),
            DeviceId('a1'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('0.75')),
            DeviceId('a2'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('0.7')),
            DeviceId('a3'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('0.3')),
            DeviceId('a4'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('0.0')),
        },
        allocation_strategy=FractionAllocationStrategy.EVENLY,
    )
    # 2.31 == 3 * 0.67 + 0.3 (a3 is capped at its 0.3 capacity).
    result = alloc_map.allocate({SlotName('x'): Decimal('2.31')})
    assert alloc_map.allocations[SlotName('x')][DeviceId('a0')] == Decimal('0.67')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal('0.67')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal('0.67')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a3')] == Decimal('0.3')
    alloc_map.free(result)
    for idx in range(4):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')
    # 2 == 0.67 + 0.67 + 0.66 (the rounding remainder comes off one device).
    result = alloc_map.allocate({SlotName('x'): Decimal('2')})
    assert alloc_map.allocations[SlotName('x')][DeviceId('a0')] == Decimal('0.67')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal('0.67')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal('0.66')
    alloc_map.free(result)
    for idx in range(3):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')
def test_fraction_alloc_map_even_allocation():
    """
    EVENLY strategy over small heterogeneous capacities, including
    ``min_memory`` filtering and exact re-zeroing after each free().
    """
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId('a0'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(0.05)),
            DeviceId('a1'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(0.1)),
            DeviceId('a2'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(0.2)),
            DeviceId('a3'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(0.3)),
            DeviceId('a4'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(0.0)),
        },
        allocation_strategy=FractionAllocationStrategy.EVENLY,
    )
    for idx in range(5):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')
    # Total capacity is 0.65, so 0.66 cannot be satisfied.
    with pytest.raises(InsufficientResource):
        alloc_map.allocate({
            SlotName('x'): Decimal('0.66'),
        })
    # No single device has 0.6 capacity, so min_memory filters all out.
    with pytest.raises(InsufficientResource):
        alloc_map.allocate({
            SlotName('x'): Decimal('0.06'),
        }, min_memory=Decimal(0.6))
    for _ in range(20):
        alloc_map.allocate({
            SlotName('x'): Decimal('0.01'),
        })
    assert alloc_map.allocations[SlotName('x')][DeviceId('a0')] == Decimal('0.05')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal('0.1')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal('0.05')
    alloc_map.free({
        SlotName('x'): {
            DeviceId('a0'): Decimal('0.05'),
            DeviceId('a1'): Decimal('0.1'),
            DeviceId('a2'): Decimal('0.05'),
        }
    })
    # BUG FIX: this loop was ``range(0)`` and therefore never executed;
    # it is meant to verify all five devices are back at zero after the
    # bulk free above.
    for idx in range(5):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')
    result = alloc_map.allocate({SlotName('x'): Decimal('0.2')})
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal('0.2')
    alloc_map.free(result)
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal('0')
    result = alloc_map.allocate({SlotName('x'): Decimal('0.2')},
                                min_memory=Decimal('0.25'))
    assert alloc_map.allocations[SlotName('x')][DeviceId('a3')] == Decimal('0.2')
    alloc_map.free(result)
    for idx in range(5):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')
    result = alloc_map.allocate({SlotName('x'): Decimal('0.5')})
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal('0.2')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a3')] == Decimal('0.3')
    alloc_map.free(result)
    for idx in range(5):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')
    result = alloc_map.allocate({SlotName('x'): Decimal('0.65')})
    assert alloc_map.allocations[SlotName('x')][DeviceId('a0')] == Decimal('0.05')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal('0.1')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal('0.2')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a3')] == Decimal('0.3')
    alloc_map.free(result)
    for idx in range(5):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')
    result = alloc_map.allocate({SlotName('x'): Decimal('0.6')},
                                min_memory=Decimal('0.1'))
    assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal('0.1')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal('0.2')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a3')] == Decimal('0.3')
    alloc_map.free(result)
    for idx in range(5):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')
    # A second map using the default allocation strategy.
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId('a0'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('0.3')),
            DeviceId('a1'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('0.3')),
            DeviceId('a2'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('0.9')),
        },
    )
    result = alloc_map.allocate({SlotName('x'): Decimal('1')})
    assert alloc_map.allocations[SlotName('x')][DeviceId('a0')] == Decimal('0.3')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal('0.3')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal('0.4')
class MemoryPlugin(AbstractComputePlugin):
    """
    Represents the main memory.

    When collecting statistics, it also measures network and I/O usage
    in addition to the memory usage.
    """

    # This plugin has no watchable configuration.
    config_watch_enabled = False

    key = DeviceName('mem')
    slot_types = [(SlotName('mem'), SlotTypes.BYTES)]

    async def init(self, context: Any = None) -> None:
        pass

    async def cleanup(self) -> None:
        pass

    async def update_plugin_config(
            self, new_plugin_config: Mapping[str, Any]) -> None:
        pass

    async def list_devices(self) -> Collection[MemoryDevice]:
        # The whole host memory is modeled as a single 'root' device.
        # TODO: support NUMA?
        memory_size = psutil.virtual_memory().total
        return [
            MemoryDevice(
                device_id=DeviceId('root'),
                hw_location='root',
                numa_node=0,
                memory_size=memory_size,
                processing_units=0,
            )
        ]

    async def available_slots(self) -> Mapping[SlotName, Decimal]:
        """Report the summed memory capacity of all devices as 'mem'."""
        devices = await self.list_devices()
        return {
            SlotName('mem'): Decimal(sum(dev.memory_size for dev in devices)),
        }

    def get_version(self) -> str:
        return __version__

    async def extra_info(self) -> Mapping[str, str]:
        return {}

    async def gather_node_measures(
            self, ctx: StatContext) -> Sequence[NodeMeasurement]:
        """Collect host-wide memory, disk, and network counters."""
        _mstat = psutil.virtual_memory()
        # Used = total minus available (not "free"), matching psutil's advice.
        total_mem_used_bytes = Decimal(_mstat.total - _mstat.available)
        total_mem_capacity_bytes = Decimal(_mstat.total)
        _nstat = psutil.net_io_counters()
        net_rx_bytes = _nstat.bytes_recv
        net_tx_bytes = _nstat.bytes_sent

        def get_disk_stat():
            # Aggregate usage/capacity over filesystems, skipping
            # pseudo/ephemeral filesystem types.
            pruned_disk_types = frozenset(['squashfs', 'vfat', 'tmpfs'])
            total_disk_usage = Decimal(0)
            total_disk_capacity = Decimal(0)
            per_disk_stat = {}
            for disk_info in psutil.disk_partitions():
                if disk_info.fstype not in pruned_disk_types:
                    dstat = os.statvfs(disk_info.mountpoint)
                    disk_usage = Decimal(
                        dstat.f_frsize * (dstat.f_blocks - dstat.f_bavail))
                    disk_capacity = Decimal(dstat.f_frsize * dstat.f_blocks)
                    per_disk_stat[disk_info.device] = Measurement(
                        disk_usage, disk_capacity)
                    total_disk_usage += disk_usage
                    total_disk_capacity += disk_capacity
            return total_disk_usage, total_disk_capacity, per_disk_stat

        # statvfs() calls may block; run them in the default executor.
        loop = current_loop()
        total_disk_usage, total_disk_capacity, per_disk_stat = \
            await loop.run_in_executor(None, get_disk_stat)
        return [
            NodeMeasurement(
                MetricKey('mem'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'max'}),
                per_node=Measurement(total_mem_used_bytes,
                                     total_mem_capacity_bytes),
                per_device={
                    DeviceId('root'):
                    Measurement(total_mem_used_bytes,
                                total_mem_capacity_bytes)
                },
            ),
            NodeMeasurement(
                MetricKey('disk'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                per_node=Measurement(total_disk_usage, total_disk_capacity),
                per_device=per_disk_stat,
            ),
            NodeMeasurement(
                MetricKey('net_rx'),
                MetricTypes.RATE,
                unit_hint='bps',
                current_hook=lambda metric: metric.stats.rate,
                per_node=Measurement(Decimal(net_rx_bytes)),
                per_device={
                    DeviceId('node'): Measurement(Decimal(net_rx_bytes))
                },
            ),
            NodeMeasurement(
                MetricKey('net_tx'),
                MetricTypes.RATE,
                unit_hint='bps',
                current_hook=lambda metric: metric.stats.rate,
                per_node=Measurement(Decimal(net_tx_bytes)),
                per_device={
                    DeviceId('node'): Measurement(Decimal(net_tx_bytes))
                },
            ),
        ]

    async def gather_container_measures(self, ctx: StatContext,
                                        container_ids: Sequence[str]) \
            -> Sequence[ContainerMeasurement]:
        """Collect per-container memory, block-I/O, and scratch-dir sizes."""

        def get_scratch_size(container_id: str) -> int:
            # Look up the kernel owning this container; unknown containers
            # contribute zero scratch usage.
            for kernel_id, info in ctx.agent.kernel_registry.items():
                if info['container_id'] == container_id:
                    break
            else:
                return 0
            work_dir = ctx.agent.local_config['container'][
                'scratch-root'] / str(kernel_id) / 'work'
            total_size = 0
            for path in work_dir.rglob('*'):
                # Use lstat() for symlinks so we don't follow (and double
                # count or fail on) their targets.
                if path.is_symlink():
                    total_size += path.lstat().st_size
                elif path.is_file():
                    total_size += path.stat().st_size
            return total_size

        async def sysfs_impl(container_id):
            # Read stats directly from the cgroup-v1 sysfs hierarchy.
            mem_prefix = f'/sys/fs/cgroup/memory/docker/{container_id}/'
            io_prefix = f'/sys/fs/cgroup/blkio/docker/{container_id}/'
            try:
                mem_cur_bytes = read_sysfs(
                    mem_prefix + 'memory.usage_in_bytes', int)
                io_stats = Path(
                    io_prefix + 'blkio.throttle.io_service_bytes').read_text()
                # example data:
                #   8:0 Read 13918208
                #   8:0 Write 0
                #   8:0 Sync 0
                #   8:0 Async 13918208
                #   8:0 Total 13918208
                #   Total 13918208
                io_read_bytes = 0
                io_write_bytes = 0
                for line in io_stats.splitlines():
                    # Skip the trailing grand-total line (only two fields).
                    if line.startswith('Total '):
                        continue
                    dev, op, nbytes = line.strip().split()
                    if op == 'Read':
                        io_read_bytes += int(nbytes)
                    elif op == 'Write':
                        io_write_bytes += int(nbytes)
            except IOError as e:
                log.warning(
                    'cannot read stats: sysfs unreadable for container {0}\n{1!r}',
                    container_id[:7], e)
                return None
            loop = current_loop()
            scratch_sz = await loop.run_in_executor(None, get_scratch_size,
                                                    container_id)
            return mem_cur_bytes, io_read_bytes, io_write_bytes, scratch_sz

        async def api_impl(container_id):
            # Fetch the same stats via the Docker API instead of sysfs.
            container = DockerContainer(ctx.agent.docker, id=container_id)
            ret = await fetch_api_stats(container)
            if ret is None:
                return None
            mem_cur_bytes = nmget(ret, 'memory_stats.usage', 0)
            io_read_bytes = 0
            io_write_bytes = 0
            for item in nmget(ret, 'blkio_stats.io_service_bytes_recursive', []):
                if item['op'] == 'Read':
                    io_read_bytes += item['value']
                elif item['op'] == 'Write':
                    io_write_bytes += item['value']
            loop = current_loop()
            scratch_sz = await loop.run_in_executor(None, get_scratch_size,
                                                    container_id)
            return mem_cur_bytes, io_read_bytes, io_write_bytes, scratch_sz

        # Choose the collection backend based on the configured stat mode.
        if ctx.mode == StatModes.CGROUP:
            impl = sysfs_impl
        elif ctx.mode == StatModes.DOCKER:
            impl = api_impl
        else:
            raise RuntimeError("should not reach here")

        per_container_mem_used_bytes = {}
        per_container_io_read_bytes = {}
        per_container_io_write_bytes = {}
        per_container_io_scratch_size = {}
        # Collect all containers' stats concurrently.
        tasks = []
        for cid in container_ids:
            tasks.append(asyncio.ensure_future(impl(cid)))
        results = await asyncio.gather(*tasks)
        for cid, result in zip(container_ids, results):
            if result is None:
                # Stats were unreadable for this container; skip it.
                continue
            per_container_mem_used_bytes[cid] = Measurement(Decimal(result[0]))
            per_container_io_read_bytes[cid] = Measurement(Decimal(result[1]))
            per_container_io_write_bytes[cid] = Measurement(Decimal(result[2]))
            per_container_io_scratch_size[cid] = Measurement(Decimal(result[3]))
        return [
            ContainerMeasurement(
                MetricKey('mem'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'max'}),
                per_container=per_container_mem_used_bytes,
            ),
            ContainerMeasurement(
                MetricKey('io_read'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'rate'}),
                per_container=per_container_io_read_bytes,
            ),
            ContainerMeasurement(
                MetricKey('io_write'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'rate'}),
                per_container=per_container_io_write_bytes,
            ),
            ContainerMeasurement(
                MetricKey('io_scratch_size'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'max'}),
                per_container=per_container_io_scratch_size,
            ),
        ]

    async def create_alloc_map(self) -> AbstractAllocMap:
        """Build a discrete allocation map with one byte-typed 'mem' slot per device."""
        devices = await self.list_devices()
        return DiscretePropertyAllocMap(
            device_slots={
                dev.device_id:
                DeviceSlotInfo(SlotTypes.BYTES, SlotName('mem'),
                               Decimal(dev.memory_size))
                for dev in devices
            },
        )

    async def get_hooks(self, distro: str, arch: str) -> Sequence[Path]:
        return []

    async def generate_docker_args(
        self,
        docker: Docker,
        device_alloc,
    ) -> Mapping[str, Any]:
        """Translate the 'mem' allocation into Docker HostConfig limits."""
        memory = sum(device_alloc['mem'].values())
        return {
            'HostConfig': {
                'MemorySwap': int(memory),  # prevent using swap!
                'Memory': int(memory),
            }
        }

    async def restore_from_container(
        self,
        container: Container,
        alloc_map: AbstractAllocMap,
    ) -> None:
        """Re-register the container's hard memory limit into the alloc map."""
        assert isinstance(alloc_map, DiscretePropertyAllocMap)
        memory_limit = container.backend_obj['HostConfig']['Memory']
        alloc_map.apply_allocation({
            SlotName('mem'): {
                DeviceId('root'): memory_limit
            },
        })

    async def get_attached_devices(
        self,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Sequence[DeviceModelInfo]:
        """Return minimal info entries for devices present in the 'mem' allocation."""
        device_ids = [*device_alloc[SlotName('mem')].keys()]
        available_devices = await self.list_devices()
        attached_devices: List[DeviceModelInfo] = []
        for device in available_devices:
            if device.device_id in device_ids:
                attached_devices.append({
                    'device_id': device.device_id,
                    'model_name': '',
                    'data': {},
                })
        return attached_devices
""" Common definitions/constants used throughout the manager. """ from typing import Final from ai.backend.common.types import SlotName, SlotTypes INTRINSIC_SLOTS: Final = { SlotName('cpu'): SlotTypes('count'), SlotName('mem'): SlotTypes('bytes'), }
async def available_slots(self) -> Mapping[SlotName, Decimal]:
    """Report the summed memory capacity of all devices as the 'mem' slot."""
    total_bytes = sum(dev.memory_size for dev in await self.list_devices())
    return {SlotName('mem'): Decimal(total_bytes)}
async def available_slots(self) -> Mapping[SlotName, Decimal]:
    """Report the number of CUDA devices as the 'cuda.device' slot."""
    devices = await self.list_devices()
    device_count = len(devices)
    return {SlotName('cuda.device'): Decimal(device_count)}
async def _get_resource_slots(self):
    """Fetch the configured resource slot names and types from etcd."""
    raw_data = await self.etcd.get_prefix_dict('config/resource_slots')
    return {
        SlotName(slot_key): SlotTypes(slot_type)
        for slot_key, slot_type in raw_data.items()
    }
def test_quantum_size(alloc_strategy):
    """
    quantum_size must constrain both the requested amounts and the
    computed per-device splits to multiples of the quantum.
    """
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId('a0'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(1)),  # noqa
            DeviceId('a1'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(1)),  # noqa
        },
        quantum_size=Decimal("0.25"),
        allocation_strategy=alloc_strategy,
    )
    result = alloc_map.allocate({
        SlotName('x'): Decimal("0.5"),
    })
    assert sum(alloc_map.allocations[SlotName('x')].values()) == Decimal("0.5")
    alloc_map.free(result)
    result = alloc_map.allocate({
        SlotName('x'): Decimal("1.5"),
    })
    assert sum(alloc_map.allocations[SlotName('x')].values()) == Decimal("1.5")
    if alloc_strategy == FractionAllocationStrategy.EVENLY:
        # EVENLY splits the request half-and-half.
        assert alloc_map.allocations[SlotName('x')][DeviceId('a0')] == Decimal("0.75")
        assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal("0.75")
    else:
        # FILL saturates the first device before touching the second.
        assert alloc_map.allocations[SlotName('x')][DeviceId('a0')] == Decimal("1.00")
        assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal("0.50")
    alloc_map.free(result)
    # inputs are not multiples of 0.25
    with pytest.raises(NotMultipleOfQuantum):
        alloc_map.allocate({
            SlotName('x'): Decimal("0.52"),
        })
    with pytest.raises(NotMultipleOfQuantum):
        alloc_map.allocate({
            SlotName('x'): Decimal("0.42"),
        })
    with pytest.raises(NotMultipleOfQuantum):
        alloc_map.allocate({
            SlotName('x'): Decimal("3.99"),
        })
    if alloc_strategy == FractionAllocationStrategy.EVENLY:
        # input IS a multiple of 0.25 but the CALCULATED allocations are not
        # multiples of 0.25
        with pytest.raises(InsufficientResource, match="multiple-of-quantum"):
            alloc_map.allocate({
                SlotName('x'): Decimal("1.75"),  # divided to 0.88 and 0.87
            })
    else:
        # In this case, it satisfies the quantum condition, because the
        # capacity of devices are multiples of the quantum.
        alloc_map.allocate({
            SlotName('x'): Decimal("1.75"),
        })
        assert alloc_map.allocations[SlotName('x')][DeviceId('a0')] == Decimal("1.00")
        assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal("0.75")
    # So let's change the situation: a quantum that does not divide the
    # device capacities evenly.
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId('a0'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(1)),  # noqa
            DeviceId('a1'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(1)),  # noqa
        },
        quantum_size=Decimal("0.3"),
        allocation_strategy=alloc_strategy,
    )
    with pytest.raises(NotMultipleOfQuantum):
        alloc_map.allocate({
            SlotName('x'): Decimal("0.5"),
        })
    with pytest.raises(InsufficientResource, match="multiple-of-quantum"):
        alloc_map.allocate({
            SlotName('x'): Decimal("1.2"),
        })
def test_discrete_alloc_map_large_number():
    """A large discrete request spreads across devices and frees back to zero."""
    alloc_map = DiscretePropertyAllocMap(
        device_slots={
            DeviceId('a0'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(100)),
            DeviceId('a1'): DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(100)),
        },
    )
    x_slot = SlotName('x')
    dev0, dev1 = DeviceId('a0'), DeviceId('a1')
    assert alloc_map.allocations[x_slot][dev0] == 0
    assert alloc_map.allocations[x_slot][dev1] == 0
    # 130 units: the first device is filled (100) and 30 spill to the second.
    allocation = alloc_map.allocate({x_slot: Decimal('130')})
    assert allocation[x_slot][dev0] == 100
    assert allocation[x_slot][dev1] == 30
    assert alloc_map.allocations[x_slot][dev0] == 100
    assert alloc_map.allocations[x_slot][dev1] == 30
    # Only 70 units remain; requesting 71 must fail without changing state.
    with pytest.raises(InsufficientResource):
        alloc_map.allocate({x_slot: Decimal('71')})
    assert alloc_map.allocations[x_slot][dev0] == 100
    assert alloc_map.allocations[x_slot][dev1] == 30
    alloc_map.free(allocation)
    assert alloc_map.allocations[x_slot][dev0] == 0
    assert alloc_map.allocations[x_slot][dev1] == 0
async def available_slots(self) -> Mapping[SlotName, Decimal]:
    """Report the summed processing-unit count of all devices as the 'cpu' slot."""
    core_total = sum(dev.processing_units for dev in await self.list_devices())
    return {SlotName('cpu'): Decimal(core_total)}
def test_heterogeneous_resource_slots_with_fractional_alloc_map():
    """
    A FractionAllocMap mixing UNIQUE (MIG slice) slots with COUNT
    (fractional share) slots: unique slots only accept whole-device
    (amount == 1) allocation requests.
    """
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId('a0'):
            DeviceSlotInfo(SlotTypes.UNIQUE, SlotName('cuda.device:1g.5gb-mig'), Decimal(1)),  # noqa
            DeviceId('a1'):
            DeviceSlotInfo(SlotTypes.UNIQUE, SlotName('cuda.device:1g.5gb-mig'), Decimal(1)),  # noqa
            DeviceId('a2'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('cuda.shares'), Decimal('1.0')),
            DeviceId('a3'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('cuda.shares'), Decimal('1.0')),
            DeviceId('a4'):
            DeviceSlotInfo(SlotTypes.UNIQUE, SlotName('cuda.device:3g.20gb-mig'), Decimal(1)),  # noqa
        },
        exclusive_slot_types={'cuda.device:*-mig', 'cuda.device', 'cuda.shares'},
        allocation_strategy=FractionAllocationStrategy.FILL,
    )

    def check_clean():
        # Every device must report zero usage.
        assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][
            DeviceId('a0')] == Decimal('0')
        assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][
            DeviceId('a1')] == Decimal('0')
        assert alloc_map.allocations[SlotName('cuda.shares')][
            DeviceId('a2')] == Decimal('0')
        assert alloc_map.allocations[SlotName('cuda.shares')][
            DeviceId('a3')] == Decimal('0')
        assert alloc_map.allocations[SlotName('cuda.device:3g.20gb-mig')][
            DeviceId('a4')] == Decimal('0')

    check_clean()
    # check allocation of non-unique slots
    result = alloc_map.allocate({SlotName('cuda.shares'): Decimal('2.0')})
    assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][
        DeviceId('a0')] == Decimal('0')
    assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][
        DeviceId('a1')] == Decimal('0')
    assert alloc_map.allocations[SlotName('cuda.shares')][
        DeviceId('a2')] == Decimal('1.0')
    assert alloc_map.allocations[SlotName('cuda.shares')][
        DeviceId('a3')] == Decimal('1.0')
    assert alloc_map.allocations[SlotName('cuda.device:3g.20gb-mig')][
        DeviceId('a4')] == Decimal('0')
    alloc_map.free(result)
    check_clean()
    # Requesting more shares than the total capacity (2.0) must fail.
    with pytest.raises(InsufficientResource):
        alloc_map.allocate({SlotName('cuda.shares'): Decimal('2.5')})
    check_clean()
    # allocating zero means no-op.
    alloc_map.allocate({SlotName('cuda.device:1g.5gb-mig'): Decimal('0')})
    check_clean()
    # any allocation request for unique slots should specify the amount 1.
    with pytest.raises(InvalidResourceArgument):
        alloc_map.allocate(
            {SlotName('cuda.device:1g.5gb-mig'): Decimal('0.3')})
    with pytest.raises(InvalidResourceArgument):
        alloc_map.allocate(
            {SlotName('cuda.device:1g.5gb-mig'): Decimal('1.5')})
    check_clean()
    # test allocation of unique slots
    result1 = alloc_map.allocate(
        {SlotName('cuda.device:1g.5gb-mig'): Decimal('1')})
    assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][
        DeviceId('a0')] == Decimal('1')
    assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][
        DeviceId('a1')] == Decimal('0')
    result2 = alloc_map.allocate(
        {SlotName('cuda.device:1g.5gb-mig'): Decimal('1')})
    assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][
        DeviceId('a0')] == Decimal('1')
    assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][
        DeviceId('a1')] == Decimal('1')
    # Both 1g.5gb slices are taken; a third request must fail.
    with pytest.raises(InsufficientResource):
        alloc_map.allocate({SlotName('cuda.device:1g.5gb-mig'): Decimal('1')})
    alloc_map.free(result1)
    alloc_map.free(result2)
    check_clean()
def test_fraction_alloc_map_iteration():
    """Many quantum-sized allocations distribute evenly and free back exactly."""
    x = SlotName('x')
    quantum = Decimal('0.00001')
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId('a0'): DeviceSlotInfo(SlotTypes.COUNT, x, Decimal(1.0)),
            DeviceId('a1'): DeviceSlotInfo(SlotTypes.COUNT, x, Decimal(1.0)),
        },
        allocation_strategy=FractionAllocationStrategy.FILL,
        quantum_size=Decimal("0.00001"))
    assert alloc_map.allocations[x][DeviceId('a0')] == Decimal('0')
    assert alloc_map.allocations[x][DeviceId('a1')] == Decimal('0')

    # 1000 x 0.00001 = 0.01 in total, split evenly across the two devices.
    for _ in range(1000):
        alloc_map.allocate({x: quantum})
    assert alloc_map.allocations[x][DeviceId('a0')] == Decimal('0.005')
    assert alloc_map.allocations[x][DeviceId('a1')] == Decimal('0.005')

    # Free a single quantum from a0 only; a1 must be untouched.
    alloc_map.free({x: {DeviceId('a0'): quantum}})
    assert alloc_map.allocations[x][DeviceId('a0')] == Decimal('0.00499')
    assert alloc_map.allocations[x][DeviceId('a1')] == Decimal('0.005')

    # Drain the remaining 499 quanta from a0.
    for _ in range(499):
        alloc_map.free({x: {DeviceId('a0'): quantum}})
    assert alloc_map.allocations[x][DeviceId('a0')] == Decimal('0')
    assert alloc_map.allocations[x][DeviceId('a1')] == Decimal('0.005')
def test_fraction_alloc_map():
    """Basic allocate/free round trip on a two-device fractional map."""
    x = SlotName('x')
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId('a0'): DeviceSlotInfo(SlotTypes.COUNT, x, Decimal(1.0)),
            DeviceId('a1'): DeviceSlotInfo(SlotTypes.COUNT, x, Decimal(1.0)),
        },
        allocation_strategy=FractionAllocationStrategy.FILL,
    )
    for dev in ('a0', 'a1'):
        assert alloc_map.allocations[x][DeviceId(dev)] == Decimal('0')

    # 1.5 fills a0 completely and spills 0.5 onto a1 (FILL strategy).
    result = alloc_map.allocate({x: Decimal('1.5')})
    assert result[x][DeviceId('a0')] == Decimal('1.0')
    assert result[x][DeviceId('a1')] == Decimal('0.5')
    assert alloc_map.allocations[x][DeviceId('a0')] == Decimal('1.0')
    assert alloc_map.allocations[x][DeviceId('a1')] == Decimal('0.5')

    # Only 0.5 remains; another 1.5 must fail and must not mutate state.
    with pytest.raises(InsufficientResource):
        alloc_map.allocate({x: Decimal('1.5')})
    assert alloc_map.allocations[x][DeviceId('a0')] == Decimal('1.0')
    assert alloc_map.allocations[x][DeviceId('a1')] == Decimal('0.5')

    alloc_map.free(result)
    for dev in ('a0', 'a1'):
        assert alloc_map.allocations[x][DeviceId(dev)] == Decimal('0')
class CUDAPlugin(AbstractComputePlugin):
    """Accelerator plugin exposing NVIDIA CUDA GPUs as ``cuda.device`` slots.

    At ``init()`` it detects the nvidia-docker (v1/v2) and Docker server
    versions and disables itself gracefully (instead of raising) whenever
    any part of the CUDA stack is unavailable.
    """

    config_watch_enabled = False

    key = DeviceName('cuda')
    slot_types: Sequence[Tuple[SlotName, SlotTypes]] = (
        (SlotName('cuda.device'), SlotTypes('count')),
    )

    # Versions detected during init(); (0, 0, 0) means "not detected yet".
    nvdocker_version: Tuple[int, ...] = (0, 0, 0)
    docker_version: Tuple[int, ...] = (0, 0, 0)

    # Device IDs hidden from allocation (from the 'device_mask' plugin config).
    device_mask: Sequence[DeviceId] = []
    enabled: bool = True

    async def init(self, context: Any = None) -> None:
        """Probe nvidia-docker/docker versions and enumerate CUDA devices.

        Sets ``self.enabled = False`` on any failure so the agent keeps
        running without CUDA acceleration.
        """
        rx_triple_version = re.compile(r'(\d+\.\d+\.\d+)')
        # Check nvidia-docker and docker versions
        try:
            proc = await asyncio.create_subprocess_exec(
                'nvidia-docker', 'version', '-f', '{{json .}}',
                stdout=asyncio.subprocess.PIPE,
            )
            stdout, _ = await proc.communicate()
            lines = stdout.decode().splitlines()
        except FileNotFoundError:
            log.warning('nvidia-docker is not installed.')
            log.info('CUDA acceleration is disabled.')
            self.enabled = False
            return
        m = rx_triple_version.search(lines[0])
        if m:
            self.nvdocker_version = tuple(map(int, m.group(1).split('.')))
        else:
            log.error('could not detect nvidia-docker version!')
            log.info('CUDA acceleration is disabled.')
            self.enabled = False
            return
        docker_version_data = json.loads(lines[1])
        m = rx_triple_version.search(docker_version_data['Server']['Version'])
        if m:
            self.docker_version = tuple(map(int, m.group(1).split('.')))
        else:
            log.error('could not detect docker version!')
            log.info('CUDA acceleration is disabled.')
            self.enabled = False
            return
        raw_device_mask = self.plugin_config.get('device_mask')
        if raw_device_mask is not None:
            self.device_mask = [
                *map(lambda dev_id: DeviceId(dev_id),
                     raw_device_mask.split(','))
            ]
        try:
            detected_devices = await self.list_devices()
            log.info('detected devices:\n' + pformat(detected_devices))
            log.info('nvidia-docker version: {}', self.nvdocker_version)
            log.info('CUDA acceleration is enabled.')
        except ImportError:
            log.warning('could not load the CUDA runtime library.')
            log.info('CUDA acceleration is disabled.')
            self.enabled = False
        except RuntimeError as e:
            log.warning('CUDA init error: {}', e)
            log.info('CUDA acceleration is disabled.')
            self.enabled = False

    async def cleanup(self) -> None:
        pass

    async def update_plugin_config(
        self,
        new_plugin_config: Mapping[str, Any],
    ) -> None:
        pass

    async def list_devices(self) -> Collection[CUDADevice]:
        """Enumerate visible CUDA devices, skipping masked ones."""
        if not self.enabled:
            return []
        all_devices = []
        num_devices = libcudart.get_device_count()
        for dev_id in map(lambda idx: DeviceId(str(idx)), range(num_devices)):
            if dev_id in self.device_mask:
                continue
            raw_info = libcudart.get_device_props(int(dev_id))
            sysfs_node_path = "/sys/bus/pci/devices/" \
                              f"{raw_info['pciBusID_str'].lower()}/numa_node"
            node: Optional[int]
            try:
                node = int(Path(sysfs_node_path).read_text().strip())
            except OSError:
                # sysfs may be unavailable (e.g., inside containers).
                node = None
            # The runtime may not report a UUID; fall back to the nil UUID.
            raw_dev_uuid = raw_info.get('uuid', None)
            if raw_dev_uuid is not None:
                dev_uuid = str(uuid.UUID(bytes=raw_dev_uuid))
            else:
                dev_uuid = '00000000-0000-0000-0000-000000000000'
            dev_info = CUDADevice(
                device_id=dev_id,
                hw_location=raw_info['pciBusID_str'],
                numa_node=node,
                memory_size=raw_info['totalGlobalMem'],
                processing_units=raw_info['multiProcessorCount'],
                model_name=raw_info['name'],
                uuid=dev_uuid,
            )
            all_devices.append(dev_info)
        return all_devices

    async def available_slots(self) -> Mapping[SlotName, Decimal]:
        """Report the total device count as the 'cuda.device' slot capacity."""
        devices = await self.list_devices()
        return {
            SlotName('cuda.device'): Decimal(len(devices)),
        }

    def get_version(self) -> str:
        return __version__

    async def extra_info(self) -> Mapping[str, Any]:
        """Report driver/runtime versions; degrades to 'no support' on errors."""
        if self.enabled:
            try:
                return {
                    'cuda_support': True,
                    'nvidia_version': libnvml.get_driver_version(),
                    'cuda_version': '{0[0]}.{0[1]}'.format(libcudart.get_version()),
                }
            except ImportError:
                log.warning('extra_info(): NVML/CUDA runtime library is not found')
            except LibraryError as e:
                log.warning('extra_info(): {!r}', e)
        return {
            'cuda_support': False,
        }

    async def gather_node_measures(
        self,
        ctx: StatContext,
    ) -> Sequence[NodeMeasurement]:
        """Collect node-level and per-device GPU memory/utilization metrics."""
        dev_count = 0
        mem_avail_total = 0
        mem_used_total = 0
        mem_stats = {}
        util_total = 0
        util_stats = {}
        if self.enabled:
            try:
                dev_count = libnvml.get_device_count()
                for dev_id in map(lambda idx: DeviceId(str(idx)),
                                  range(dev_count)):
                    if dev_id in self.device_mask:
                        continue
                    dev_stat = libnvml.get_device_stats(int(dev_id))
                    mem_avail_total += dev_stat.mem_total
                    mem_used_total += dev_stat.mem_used
                    mem_stats[dev_id] = Measurement(
                        Decimal(dev_stat.mem_used),
                        Decimal(dev_stat.mem_total))
                    util_total += dev_stat.gpu_util
                    util_stats[dev_id] = Measurement(
                        Decimal(dev_stat.gpu_util), Decimal(100))
            except ImportError:
                log.warning('gather_node_measure(): NVML library is not found')
            except LibraryError as e:
                log.warning('gather_node_measure(): {!r}', e)
        return [
            NodeMeasurement(
                MetricKey('cuda_mem'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'max'}),
                per_node=Measurement(Decimal(mem_used_total),
                                     Decimal(mem_avail_total)),
                per_device=mem_stats,
            ),
            NodeMeasurement(
                MetricKey('cuda_util'),
                MetricTypes.USAGE,
                unit_hint='percent',
                stats_filter=frozenset({'avg', 'max'}),
                per_node=Measurement(Decimal(util_total),
                                     Decimal(dev_count * 100)),
                per_device=util_stats,
            ),
        ]

    async def gather_container_measures(
        self,
        ctx: StatContext,
        container_ids: Sequence[str],
    ) -> Sequence[ContainerMeasurement]:
        # Per-container GPU stats are not collected by this plugin.
        return []

    async def create_alloc_map(self) -> AbstractAllocMap:
        """One COUNT-typed 'cuda.device' slot of capacity 1 per detected GPU."""
        devices = await self.list_devices()
        return DiscretePropertyAllocMap(
            device_slots={
                dev.device_id: (
                    DeviceSlotInfo(SlotTypes.COUNT, SlotName('cuda.device'),
                                   Decimal(1))
                )
                for dev in devices
            },
        )

    async def get_hooks(self, distro: str, arch: str) -> Sequence[Path]:
        return []

    async def generate_docker_args(
        self,
        docker: aiodocker.Docker,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Mapping[str, Any]:
        """Build container-creation arguments exposing the allocated GPUs.

        Supports three generations of the NVIDIA container stack:
        nvidia-docker v1 (REST plugin), nvidia-docker v2 with Docker
        19.03+ (native DeviceRequests), and v2 with older Docker
        (runtime + NVIDIA_VISIBLE_DEVICES).
        """
        if not self.enabled:
            return {}
        assigned_device_ids = []
        for slot_type, per_device_alloc in device_alloc.items():
            for device_id, alloc in per_device_alloc.items():
                if alloc > 0:
                    assigned_device_ids.append(device_id)
        if self.nvdocker_version[0] == 1:
            # nvidia-docker v1: ask its REST API which volumes/devices to mount.
            timeout = aiohttp.ClientTimeout(total=3)
            async with aiohttp.ClientSession(raise_for_status=True,
                                             timeout=timeout) as sess:
                try:
                    nvdocker_url = 'http://localhost:3476/docker/cli/json'
                    async with sess.get(nvdocker_url) as resp:
                        nvidia_params = await resp.json()
                except aiohttp.ClientError:
                    raise RuntimeError('NVIDIA Docker plugin is not available.')
            volumes = await docker.volumes.list()
            existing_volumes = set(vol['Name'] for vol in volumes['Volumes'])
            required_volumes = set(vol.split(':')[0]
                                   for vol in nvidia_params['Volumes'])
            missing_volumes = required_volumes - existing_volumes
            binds = []
            # Create any driver volumes that do not exist yet.
            for vol_name in missing_volumes:
                for vol_param in nvidia_params['Volumes']:
                    if vol_param.startswith(vol_name + ':'):
                        _, _, permission = vol_param.split(':')
                        driver = nvidia_params['VolumeDriver']
                        await docker.volumes.create({
                            'Name': vol_name,
                            'Driver': driver,
                        })
            for vol_name in required_volumes:
                for vol_param in nvidia_params['Volumes']:
                    if vol_param.startswith(vol_name + ':'):
                        _, mount_pt, permission = vol_param.split(':')
                        binds.append('{}:{}:{}'.format(
                            vol_name, mount_pt, permission))
            devices = []
            for dev in nvidia_params['Devices']:
                m = re.search(r'^/dev/nvidia(\d+)$', dev)
                if m is None:
                    # Always add non-GPU device files required by the driver.
                    # (e.g., nvidiactl, nvidia-uvm, ... etc.)
                    devices.append(dev)
                    continue
                device_id = m.group(1)
                if device_id not in assigned_device_ids:
                    continue
                devices.append(dev)
            devices = [{
                'PathOnHost': dev,
                'PathInContainer': dev,
                'CgroupPermissions': 'mrw',
            } for dev in devices]
            return {
                'HostConfig': {
                    'Binds': binds,
                    'Devices': devices,
                },
            }
        elif self.nvdocker_version[0] == 2:
            device_list_str = ','.join(sorted(assigned_device_ids))
            if self.docker_version >= (19, 3, 0):
                # Docker 19.03+ supports GPU requests natively.
                docker_config: Dict[str, Any] = {}
                if assigned_device_ids:
                    docker_config.update({
                        'HostConfig': {
                            'DeviceRequests': [
                                {
                                    "Driver": "nvidia",
                                    "DeviceIDs": assigned_device_ids,
                                    # "all" does not work here
                                    "Capabilities": [
                                        ["utility", "compute", "video",
                                         "graphics", "display"]
                                    ],
                                },
                            ],
                        },
                    })
                return docker_config
            else:
                return {
                    'HostConfig': {
                        'Runtime': 'nvidia',
                    },
                    'Env': [
                        f"NVIDIA_VISIBLE_DEVICES={device_list_str}",
                    ],
                }
        else:
            raise RuntimeError('BUG: should not be reached here!')

    async def get_attached_devices(
        self,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Sequence[DeviceModelInfo]:
        """Describe the devices referenced by *device_alloc*.

        FIX: the lookup key used to be 'cuda.devices' (plural), which no
        other code in this plugin ever produces, so this method always
        returned an empty list; it now matches the 'cuda.device' slot
        name used by slot_types/create_alloc_map/restore_from_container.
        """
        device_ids: List[DeviceId] = []
        if SlotName('cuda.device') in device_alloc:
            device_ids.extend(device_alloc[SlotName('cuda.device')].keys())
        available_devices = await self.list_devices()
        attached_devices: List[DeviceModelInfo] = []
        for device in available_devices:
            if device.device_id in device_ids:
                proc = device.processing_units
                mem = BinarySize(device.memory_size)
                attached_devices.append({
                    # TODO: update common.types.DeviceModelInfo
                    'device_id': device.device_id,
                    'model_name': device.model_name,
                    'smp': proc,
                    'mem': mem,
                })
        return attached_devices

    async def restore_from_container(
        self,
        container: Container,
        alloc_map: AbstractAllocMap,
    ) -> None:
        """Re-register a running container's CUDA allocation into *alloc_map*."""
        if not self.enabled:
            return
        resource_spec = await get_resource_spec_from_container(container.backend_obj)
        if resource_spec is None:
            return
        if hasattr(alloc_map, 'apply_allocation'):
            alloc_map.apply_allocation({
                SlotName('cuda.device'): resource_spec.allocations.get(
                    DeviceName('cuda'), {}
                ).get(
                    SlotName('cuda.device'), {}
                ),
            })
        else:
            # Fallback for alloc-map implementations without apply_allocation().
            alloc_map.allocations[SlotName('cuda.device')].update(
                resource_spec.allocations.get(
                    DeviceName('cuda'), {}
                ).get(
                    SlotName('cuda.device'), {}
                )
            )

    async def generate_resource_data(
        self,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Mapping[str, str]:
        """Map container-local device indexes to host-global device IDs."""
        data: MutableMapping[str, str] = {}
        if not self.enabled:
            return data
        active_device_id_set: Set[DeviceId] = set()
        for slot_type, per_device_alloc in device_alloc.items():
            for dev_id, alloc in per_device_alloc.items():
                if alloc > 0:
                    active_device_id_set.add(dev_id)
        # Sort numerically so local index 0 maps to the lowest global ID.
        active_device_ids = sorted(active_device_id_set, key=lambda v: int(v))
        data['CUDA_GLOBAL_DEVICE_IDS'] = ','.join(
            f'{local_idx}:{global_id}'
            for local_idx, global_id in enumerate(active_device_ids))
        return data
async def detect_resources(
    etcd: AsyncEtcd,
    local_config: Mapping[str, Any],
) -> Tuple[Mapping[DeviceName, AbstractComputePlugin],
           Mapping[SlotName, Decimal]]:
    """
    Detect available computing resource of the system.
    It also loads the accelerator plugins.

    limit_cpus, limit_gpus are deprecated.

    Raises:
        InitializationError: when a plugin declares slot names not prefixed
            with its key, when two plugins share the same key, or when the
            post-reservation cpu/mem capacity drops to zero or below.
    """
    reserved_slots = {
        'cpu': local_config['resource']['reserved-cpu'],
        'mem': local_config['resource']['reserved-mem'],
        'disk': local_config['resource']['reserved-disk'],
    }
    slots: MutableMapping[SlotName, Decimal] = {}
    compute_device_types: MutableMapping[DeviceName, AbstractComputePlugin] = {}
    # Initialize intrinsic plugins by ourselves.
    from .intrinsic import CPUPlugin, MemoryPlugin
    compute_plugin_ctx = ComputePluginContext(
        etcd, local_config,
    )
    await compute_plugin_ctx.init()
    if 'cpu' not in compute_plugin_ctx.plugins:
        cpu_config = await etcd.get_prefix('config/plugins/cpu')
        cpu_plugin = CPUPlugin(cpu_config, local_config)
        compute_plugin_ctx.attach_intrinsic_device(cpu_plugin)
    if 'mem' not in compute_plugin_ctx.plugins:
        memory_config = await etcd.get_prefix('config/plugins/memory')
        memory_plugin = MemoryPlugin(memory_config, local_config)
        compute_plugin_ctx.attach_intrinsic_device(memory_plugin)
    for plugin_name, plugin_instance in compute_plugin_ctx.plugins.items():
        # Every non-intrinsic slot name must be prefixed with the plugin key.
        # (Rewritten from a walrus-inside-all() one-liner that needed a
        # noqa: F821 suppression; behavior — raise on the first offending
        # slot name — is unchanged.)
        for sname, _ in plugin_instance.slot_types:
            if sname in {'cpu', 'mem'}:
                continue
            if not sname.startswith(f'{plugin_instance.key}.'):
                raise InitializationError(
                    "Slot types defined by an accelerator plugin must be prefixed "
                    "by the plugin's key.",
                    sname,
                    plugin_instance.key,
                )
        if plugin_instance.key in compute_device_types:
            raise InitializationError(
                f"A plugin defining the same key '{plugin_instance.key}' already exists. "
                "You may need to uninstall it first.")
        compute_device_types[plugin_instance.key] = plugin_instance
    for key, computer in compute_device_types.items():
        known_slot_types.update(
            computer.slot_types)  # type: ignore  # (only updated here!)
        resource_slots = await computer.available_slots()
        for sname, sval in resource_slots.items():
            # Clamp to zero after subtracting the operator-reserved amount.
            slots[sname] = Decimal(max(0, sval - reserved_slots.get(sname, 0)))
            if slots[sname] <= 0 and sname in (SlotName('cpu'), SlotName('mem')):
                raise InitializationError(
                    f"The resource slot '{sname}' is not sufficient (zero or below zero). "
                    "Try to adjust the reserved resources or use a larger machine."
                )
    log.info('Resource slots: {!r}', slots)
    log.info('Slot types: {!r}', known_slot_types)
    return compute_device_types, slots
def test_fraction_alloc_map_many_device():
    """FILL strategy across 8 devices: fill each in turn, tail stays partial."""
    x = SlotName('x')
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId(f'a{idx}'): DeviceSlotInfo(SlotTypes.COUNT, x, Decimal(1.0))
            for idx in range(8)
        },
        allocation_strategy=FractionAllocationStrategy.FILL,
    )

    def used(idx):
        # Current allocation of device a{idx} for slot x.
        return alloc_map.allocations[x][DeviceId(f'a{idx}')]

    for idx in range(8):
        assert used(idx) == Decimal('0')

    # 7.95 fills the first seven devices completely; a7 gets the 0.95 tail.
    result = alloc_map.allocate({x: Decimal('7.95')})
    for idx in range(7):
        assert result[x][DeviceId(f'a{idx}')] == Decimal('1.0')
    assert result[x][DeviceId('a7')] == Decimal('0.95')
    for idx in range(7):
        assert used(idx) == Decimal('1.0')
    assert used(7) == Decimal('0.95')

    # Only 0.05 remains overall; a 1.0 request must fail without mutating state.
    with pytest.raises(InsufficientResource):
        alloc_map.allocate({x: Decimal('1.0')})
    for idx in range(7):
        assert used(idx) == Decimal('1.0')
    assert used(7) == Decimal('0.95')

    alloc_map.free(result)
    for idx in range(8):
        assert used(idx) == Decimal('0')