예제 #1
0
 async def restore_from_container(
     self,
     container: Container,
     alloc_map: AbstractAllocMap,
 ) -> None:
     """Re-register a running container's CUDA device allocations."""
     if not self.enabled:
         return
     resource_spec = await get_resource_spec_from_container(container.backend_obj)
     if resource_spec is None:
         return
     # Pull out the per-device CUDA allocation recorded in the resource spec.
     cuda_alloc = (
         resource_spec.allocations
         .get(DeviceName('cuda'), {})
         .get(SlotName('cuda.device'), {})
     )
     if hasattr(alloc_map, 'apply_allocation'):
         alloc_map.apply_allocation({SlotName('cuda.device'): cuda_alloc})
     else:
         # Older alloc maps lack apply_allocation(); merge into the raw mapping.
         alloc_map.allocations[SlotName('cuda.device')].update(cuda_alloc)
예제 #2
0
def test_fraction_alloc_map_random_generated_allocations():
    """Random allocate/free cycles must always return the map to all-zero."""
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId(f'a{i}'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(1.0))
            for i in range(2)
        },
        allocation_strategy=FractionAllocationStrategy.FILL,
    )
    for dev in ('a0', 'a1'):
        assert alloc_map.allocations[SlotName('x')][DeviceId(dev)] == Decimal('0')

    quantum = Decimal('.01')
    for _ in range(5):
        # Perform 10 random small allocations, then free them all.
        results = [
            alloc_map.allocate({
                SlotName('x'):
                Decimal(random.uniform(0, 0.1)).quantize(quantum, ROUND_DOWN),
            })
            for _ in range(10)
        ]
        for dev in ('a0', 'a1'):
            assert alloc_map.allocations[SlotName('x')][DeviceId(dev)] >= Decimal('0')
        for r in results:
            alloc_map.free(r)
        for dev in ('a0', 'a1'):
            assert alloc_map.allocations[SlotName('x')][DeviceId(dev)] == Decimal('0')
예제 #3
0
 def check_clean():
     """Assert that every device's allocation has returned to zero."""
     expectations = [
         (SlotName('cuda.device:1g.5gb-mig'), DeviceId('a0')),
         (SlotName('cuda.device:1g.5gb-mig'), DeviceId('a1')),
         (SlotName('cuda.shares'), DeviceId('a2')),
         (SlotName('cuda.shares'), DeviceId('a3')),
         (SlotName('cuda.device:3g.20gb-mig'), DeviceId('a4')),
     ]
     for slot_name, dev_id in expectations:
         assert alloc_map.allocations[slot_name][dev_id] == Decimal('0')
예제 #4
0
 async def restore_from_container(
     self,
     container: Container,
     alloc_map: AbstractAllocMap,
 ) -> None:
     """Re-apply the CPU core allocation recorded for an existing container."""
     assert isinstance(alloc_map, DiscretePropertyAllocMap)
     # Docker does not return the original cpuset.... :(
     # We need to read our own records.
     resource_spec = await get_resource_spec_from_container(container.backend_obj)
     if resource_spec is None:
         return
     cpu_alloc = resource_spec.allocations[DeviceName('cpu')][SlotName('cpu')]
     alloc_map.apply_allocation({SlotName('cpu'): cpu_alloc})
예제 #5
0
 async def create_alloc_map(self) -> AbstractAllocMap:
     """Build a discrete allocation map with one 'cpu' slot unit per core."""
     devices = await self.list_devices()
     device_slots = {}
     for dev in devices:
         device_slots[dev.device_id] = DeviceSlotInfo(
             SlotTypes.COUNT, SlotName('cpu'), Decimal(dev.processing_units))
     return DiscretePropertyAllocMap(device_slots=device_slots)
예제 #6
0
 async def create_alloc_map(self) -> AbstractAllocMap:
     """Build a discrete allocation map of the 'mem' slot in bytes per device."""
     devices = await self.list_devices()
     device_slots = {}
     for dev in devices:
         device_slots[dev.device_id] = DeviceSlotInfo(
             SlotTypes.BYTES, SlotName('mem'), Decimal(dev.memory_size))
     return DiscretePropertyAllocMap(device_slots=device_slots)
예제 #7
0
 def read_from_string(cls, text: str) -> 'KernelResourceSpec':
     """Parse a serialized ``KernelResourceSpec`` from its key=value text form.

     The input has one ``KEY=VALUE`` entry per line; lines without ``=``
     are skipped.  ``*_SHARES`` keys carry per-device allocations encoded
     as comma-separated ``<device-id>:<amount>`` pairs.
     """
     kvpairs: MutableMapping[str, str] = {}
     for line in text.split('\n'):
         if '=' not in line:
             continue
         key, val = line.strip().split('=', maxsplit=1)
         kvpairs[key] = val
     # device name -> slot name -> (device id -> allocated amount)
     allocations = cast(
         MutableMapping[DeviceName, MutableMapping[SlotName,
                                                   Mapping[DeviceId,
                                                           Decimal]]],
         defaultdict(lambda: defaultdict(Decimal)),
     )
     for key, val in kvpairs.items():
         if key.endswith('_SHARES'):
             # e.g. "CUDA.DEVICE_SHARES" -> slot "cuda.device" of device "cuda"
             slot_name = SlotName(key[:-7].lower())
             device_name = DeviceName(slot_name.split('.')[0])
             per_device_alloc: MutableMapping[DeviceId, Decimal] = {}
             for entry in val.split(','):
                 # Each entry is "<device-id>:<amount>"; malformed/empty
                 # entries are silently skipped.
                 raw_dev_id, _, raw_alloc = entry.partition(':')
                 if not raw_dev_id or not raw_alloc:
                     continue
                 dev_id = DeviceId(raw_dev_id)
                 try:
                     # Byte-typed slots are serialized in human-readable
                     # binary-size notation; others as plain decimals.
                     if known_slot_types.get(slot_name, 'count') == 'bytes':
                         alloc = Decimal(BinarySize.from_str(raw_alloc))
                     else:
                         alloc = Decimal(raw_alloc)
                 except KeyError as e:
                     # NOTE(review): `known_slot_types.get()` cannot raise
                     # KeyError; presumably this guards a lookup inside
                     # BinarySize.from_str() — confirm what actually raises.
                     log.warning(
                         'A previously launched container has '
                         'unknown slot type: {}. Ignoring it.', e.args[0])
                     continue
                 per_device_alloc[dev_id] = alloc
             allocations[device_name][slot_name] = per_device_alloc
     # MOUNTS, SCRATCH_SIZE and SLOTS are required keys; CID may be absent.
     mounts = [Mount.from_str(m) for m in kvpairs['MOUNTS'].split(',') if m]
     return cls(
         container_id=kvpairs.get('CID', 'unknown'),
         scratch_disk_size=BinarySize.finite_from_str(
             kvpairs['SCRATCH_SIZE']),
         allocations=dict(allocations),
         slots=ResourceSlot(json.loads(kvpairs['SLOTS'])),
         mounts=mounts,
     )
예제 #8
0
 async def get_attached_devices(
     self,
     device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
 ) -> Sequence[DeviceModelInfo]:
     """Return model info for the CUDA devices referenced by *device_alloc*.

     Fixed: the allocation key is ``cuda.device`` (singular), matching the
     slot name used by the allocator elsewhere in this codebase; the prior
     ``cuda.devices`` key never matched, so this method always returned an
     empty list.
     """
     device_ids: List[DeviceId] = []
     if SlotName('cuda.device') in device_alloc:
         device_ids.extend(device_alloc[SlotName('cuda.device')].keys())
     available_devices = await self.list_devices()
     attached_devices: List[DeviceModelInfo] = []
     for device in available_devices:
         if device.device_id in device_ids:
             proc = device.processing_units
             mem = BinarySize(device.memory_size)
             attached_devices.append({  # TODO: update common.types.DeviceModelInfo
                 'device_id': device.device_id,
                 'model_name': device.model_name,
                 'smp': proc,
                 'mem': mem,
             })
     return attached_devices
예제 #9
0
 async def restore_from_container(
     self,
     container: Container,
     alloc_map: AbstractAllocMap,
 ) -> None:
     """Re-apply an existing container's memory limit to the alloc map."""
     assert isinstance(alloc_map, DiscretePropertyAllocMap)
     # The configured limit is recorded in the container's HostConfig.
     memory_limit = container.backend_obj['HostConfig']['Memory']
     allocation = {SlotName('mem'): {DeviceId('root'): memory_limit}}
     alloc_map.apply_allocation(allocation)
예제 #10
0
 async def get_attached_devices(
     self,
     device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
 ) -> Sequence[DeviceModelInfo]:
     """Describe the memory devices referenced by *device_alloc*."""
     requested_ids = set(device_alloc[SlotName('mem')].keys())
     attached_devices: List[DeviceModelInfo] = []
     for device in await self.list_devices():
         if device.device_id in requested_ids:
             attached_devices.append({
                 'device_id': device.device_id,
                 'model_name': '',
                 'data': {},
             })
     return attached_devices
예제 #11
0
 async def resolve_occupied_slots(
         self, info: graphene.ResolveInfo) -> Mapping[str, Any]:
     """
     Calculate the sum of occupied resource slots of all sub-kernels,
     and return the JSON-serializable object from the sum result.
     """
     manager = info.context['dlmgr']
     loader = manager.get_loader('ComputeContainer.by_session')
     containers = await loader.load(self.session_id)
     # Accumulate each container's occupied slots into a running total.
     total = ResourceSlot()
     for c in containers:
         total = total + ResourceSlot({
             SlotName(k): Decimal(v)
             for k, v in c.occupied_slots.items()
         })
     return total.to_json()
예제 #12
0
def test_exclusive_resource_slots():
    """Mutually-exclusive slot types must reject combined allocation requests."""
    mig_1g = SlotName('cuda.device:1g.5gb-mig')
    mig_3g = SlotName('cuda.device:3g.20gb-mig')
    plain = SlotName('cuda.device')
    alloc_map = DiscretePropertyAllocMap(
        device_slots={
            DeviceId('a0'): DeviceSlotInfo(SlotTypes.UNIQUE, mig_1g, Decimal(1)),
            DeviceId('a1'): DeviceSlotInfo(SlotTypes.UNIQUE, mig_1g, Decimal(1)),
            DeviceId('a2'): DeviceSlotInfo(SlotTypes.COUNT, plain, Decimal(1)),
            DeviceId('a3'): DeviceSlotInfo(SlotTypes.COUNT, plain, Decimal(1)),
            DeviceId('a4'): DeviceSlotInfo(SlotTypes.UNIQUE, mig_3g, Decimal(1)),
        },
        exclusive_slot_types={
            'cuda.device:*-mig', 'cuda.device', 'cuda.shares'
        },
    )

    def check_clean():
        # Every allocation must remain zero after a failed request.
        for slot_name, dev in [
            (mig_1g, 'a0'),
            (mig_1g, 'a1'),
            (plain, 'a2'),
            (plain, 'a3'),
            (mig_3g, 'a4'),
        ]:
            assert alloc_map.allocations[slot_name][DeviceId(dev)] == Decimal('0')

    # Requesting two mutually-exclusive slot types together must fail
    # without leaving any partial allocation behind.
    with pytest.raises(InvalidResourceCombination):
        alloc_map.allocate({
            plain: Decimal('2'),
            mig_1g: Decimal('1'),
        })
    check_clean()
예제 #13
0
def test_fraction_alloc_map_even_allocation_many_devices_2():
    """EVENLY strategy over 8 unit devices: 6 get filled, 2 stay untouched."""
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId(f'a{i}'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('1.0'))
            for i in range(8)
        },
        allocation_strategy=FractionAllocationStrategy.EVENLY,
    )
    result = alloc_map.allocate({SlotName('x'): Decimal('6')})
    # NOTE: the even allocator favors the tail of device list when it fills up.
    # So we rely on the counting of desire per-device allocations instead of matching
    # the device index and the allocations.
    count_0 = 0
    count_1 = 0
    for idx in range(8):
        amount = alloc_map.allocations[SlotName('x')][DeviceId(f'a{idx}')]
        if amount == Decimal('1.0'):
            count_1 += 1
        if amount == Decimal('0'):
            count_0 += 1
    assert count_0 == 2
    assert count_1 == 6
    alloc_map.free(result)
    for idx in range(8):
        assert alloc_map.allocations[SlotName('x')][DeviceId(f'a{idx}')] == Decimal('0')
예제 #14
0
def test_fraction_alloc_map_even_allocation_many_devices():
    """Verify the EVENLY strategy across heterogeneous device capacities.

    Fixed: the 11-unit allocation round asserted ``DeviceId('a5')`` twice
    and never checked ``DeviceId('a6')``; the duplicated assertion now
    covers ``a6`` so all four 2.75-unit allocations are verified.
    """
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId('a0'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(2)),
            DeviceId('a1'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(3)),
            DeviceId('a2'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(3)),
            DeviceId('a3'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(5)),
        },
        allocation_strategy=FractionAllocationStrategy.EVENLY,
    )
    result = alloc_map.allocate({SlotName('x'): Decimal('6')})
    assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal('3')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal('3')
    alloc_map.free(result)
    for idx in range(4):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')

    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId('a0'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(1)),
            DeviceId('a1'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(1.5)),
            DeviceId('a2'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(2)),
            DeviceId('a3'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(3)),
            DeviceId('a4'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(3)),
            DeviceId('a5'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(4)),
            DeviceId('a6'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(4.5)),
            DeviceId('a7'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(5)),
            DeviceId('a8'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(5)),
        },
        allocation_strategy=FractionAllocationStrategy.EVENLY,
    )

    result = alloc_map.allocate({SlotName('x'): Decimal('6')},
                                min_memory=Decimal('2.5'))
    assert alloc_map.allocations[SlotName('x')][DeviceId('a3')] == Decimal('3')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a4')] == Decimal('3')
    alloc_map.free(result)
    for idx in range(9):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')

    result = alloc_map.allocate({SlotName('x'): Decimal('11')},
                                min_memory=Decimal('0.84'))
    assert alloc_map.allocations[SlotName('x')][DeviceId('a3')] == Decimal(
        '2.75')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a4')] == Decimal(
        '2.75')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a5')] == Decimal(
        '2.75')
    # BUGFIX: was a duplicated check of 'a5'; 'a6' completes 4 x 2.75 == 11.
    assert alloc_map.allocations[SlotName('x')][DeviceId('a6')] == Decimal(
        '2.75')
    alloc_map.free(result)
    for idx in range(9):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')
예제 #15
0
def test_fraction_alloc_map_even_allocation_fractions():
    """EVENLY strategy with fractional capacities, incl. a zero-capacity device."""
    capacities = ['0.8', '0.75', '0.7', '0.3', '0.0']
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId(f'a{i}'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(cap))
            for i, cap in enumerate(capacities)
        },
        allocation_strategy=FractionAllocationStrategy.EVENLY,
    )
    result = alloc_map.allocate({SlotName('x'): Decimal('2.31')})
    for dev, amount in {
        'a0': '0.67',
        'a1': '0.67',
        'a2': '0.67',
        'a3': '0.3',
    }.items():
        assert alloc_map.allocations[SlotName('x')][DeviceId(dev)] == Decimal(amount)
    alloc_map.free(result)
    for idx in range(4):
        assert alloc_map.allocations[SlotName('x')][DeviceId(f'a{idx}')] == Decimal('0')

    result = alloc_map.allocate({SlotName('x'): Decimal('2')})
    for dev, amount in {
        'a0': '0.67',
        'a1': '0.67',
        'a2': '0.66',
    }.items():
        assert alloc_map.allocations[SlotName('x')][DeviceId(dev)] == Decimal(amount)
    alloc_map.free(result)
    for idx in range(3):
        assert alloc_map.allocations[SlotName('x')][DeviceId(f'a{idx}')] == Decimal('0')
예제 #16
0
def test_fraction_alloc_map_even_allocation():
    """Exercise the EVENLY strategy with small fractional capacities.

    Fixed: the zero-check loop after the first ``free()`` used
    ``for idx in range(0)``, whose body never ran, so the post-free state
    was not verified at all; it now checks all five devices.
    """
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId('a0'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(0.05)),
            DeviceId('a1'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(0.1)),
            DeviceId('a2'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(0.2)),
            DeviceId('a3'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(0.3)),
            DeviceId('a4'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(0.0)),
        },
        allocation_strategy=FractionAllocationStrategy.EVENLY,
    )
    for idx in range(5):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')

    # Total capacity is 0.65, so requesting 0.66 must fail.
    with pytest.raises(InsufficientResource):
        alloc_map.allocate({
            SlotName('x'): Decimal('0.66'),
        })

    # No device has 0.6 capacity, so the min_memory constraint cannot be
    # satisfied (presumably min_memory requires a single device of at
    # least that capacity).
    with pytest.raises(InsufficientResource):
        alloc_map.allocate({
            SlotName('x'): Decimal('0.06'),
        },
                           min_memory=Decimal(0.6))
    for _ in range(20):
        alloc_map.allocate({
            SlotName('x'): Decimal('0.01'),
        })

    assert alloc_map.allocations[SlotName('x')][DeviceId('a0')] == Decimal(
        '0.05')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal(
        '0.1')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal(
        '0.05')
    alloc_map.free({
        SlotName('x'): {
            DeviceId('a0'): Decimal('0.05'),
            DeviceId('a1'): Decimal('0.1'),
            DeviceId('a2'): Decimal('0.05')
        }
    })
    # BUGFIX: was `range(0)` (dead loop); verify all devices are back to zero.
    for idx in range(5):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')

    result = alloc_map.allocate({SlotName('x'): Decimal('0.2')})
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal(
        '0.2')

    alloc_map.free(result)
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal('0')

    result = alloc_map.allocate({SlotName('x'): Decimal('0.2')},
                                min_memory=Decimal('0.25'))
    assert alloc_map.allocations[SlotName('x')][DeviceId('a3')] == Decimal(
        '0.2')
    alloc_map.free(result)
    for idx in range(5):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')

    result = alloc_map.allocate({SlotName('x'): Decimal('0.5')})
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal(
        '0.2')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a3')] == Decimal(
        '0.3')
    alloc_map.free(result)
    for idx in range(5):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')

    result = alloc_map.allocate({SlotName('x'): Decimal('0.65')})
    assert alloc_map.allocations[SlotName('x')][DeviceId('a0')] == Decimal(
        '0.05')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal(
        '0.1')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal(
        '0.2')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a3')] == Decimal(
        '0.3')
    alloc_map.free(result)
    for idx in range(5):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')

    result = alloc_map.allocate({SlotName('x'): Decimal('0.6')},
                                min_memory=Decimal('0.1'))
    assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal(
        '0.1')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal(
        '0.2')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a3')] == Decimal(
        '0.3')
    alloc_map.free(result)
    for idx in range(5):
        assert alloc_map.allocations[SlotName('x')][DeviceId(
            f'a{idx}')] == Decimal('0')

    alloc_map = FractionAllocMap(device_slots={
        DeviceId('a0'):
        DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('0.3')),
        DeviceId('a1'):
        DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('0.3')),
        DeviceId('a2'):
        DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal('0.9')),
    }, )
    result = alloc_map.allocate({SlotName('x'): Decimal('1')})
    assert alloc_map.allocations[SlotName('x')][DeviceId('a0')] == Decimal(
        '0.3')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal(
        '0.3')
    assert alloc_map.allocations[SlotName('x')][DeviceId('a2')] == Decimal(
        '0.4')
예제 #17
0
class MemoryPlugin(AbstractComputePlugin):
    """
    Represents the main memory.

    When collecting statistics, it also measures network and I/O usage
    in addition to the memory usage.
    """

    # This plugin has no watchable configuration.
    config_watch_enabled = False

    key = DeviceName('mem')
    slot_types = [(SlotName('mem'), SlotTypes.BYTES)]

    async def init(self, context: Any = None) -> None:
        """No initialization is needed for the memory plugin."""
        pass

    async def cleanup(self) -> None:
        """No cleanup is needed for the memory plugin."""
        pass

    async def update_plugin_config(
            self, new_plugin_config: Mapping[str, Any]) -> None:
        """Ignore config updates; this plugin has nothing configurable."""
        pass

    async def list_devices(self) -> Collection[MemoryDevice]:
        """Report the host's main memory as a single 'root' device."""
        # TODO: support NUMA?
        memory_size = psutil.virtual_memory().total
        return [
            MemoryDevice(
                device_id=DeviceId('root'),
                hw_location='root',
                numa_node=0,
                memory_size=memory_size,
                processing_units=0,
            )
        ]

    async def available_slots(self) -> Mapping[SlotName, Decimal]:
        """Expose the total memory capacity (bytes) as the 'mem' slot."""
        devices = await self.list_devices()
        return {
            SlotName('mem'): Decimal(sum(dev.memory_size for dev in devices)),
        }

    def get_version(self) -> str:
        """Return the plugin package version string."""
        return __version__

    async def extra_info(self) -> Mapping[str, str]:
        """No extra info is reported by this plugin."""
        return {}

    async def gather_node_measures(
            self, ctx: StatContext) -> Sequence[NodeMeasurement]:
        """Collect node-level memory, disk, and network statistics."""
        _mstat = psutil.virtual_memory()
        # "used" here means total minus available (i.e. reclaimable-aware).
        total_mem_used_bytes = Decimal(_mstat.total - _mstat.available)
        total_mem_capacity_bytes = Decimal(_mstat.total)
        _nstat = psutil.net_io_counters()
        net_rx_bytes = _nstat.bytes_recv
        net_tx_bytes = _nstat.bytes_sent

        def get_disk_stat():
            # Skip pseudo/ephemeral filesystems that are not real disks.
            pruned_disk_types = frozenset(['squashfs', 'vfat', 'tmpfs'])
            total_disk_usage = Decimal(0)
            total_disk_capacity = Decimal(0)
            per_disk_stat = {}
            for disk_info in psutil.disk_partitions():
                if disk_info.fstype not in pruned_disk_types:
                    dstat = os.statvfs(disk_info.mountpoint)
                    disk_usage = Decimal(dstat.f_frsize *
                                         (dstat.f_blocks - dstat.f_bavail))
                    disk_capacity = Decimal(dstat.f_frsize * dstat.f_blocks)
                    per_disk_stat[disk_info.device] = Measurement(
                        disk_usage, disk_capacity)
                    total_disk_usage += disk_usage
                    total_disk_capacity += disk_capacity
            return total_disk_usage, total_disk_capacity, per_disk_stat

        # statvfs() does blocking filesystem I/O; run it in a thread executor.
        loop = current_loop()
        total_disk_usage, total_disk_capacity, per_disk_stat = \
            await loop.run_in_executor(None, get_disk_stat)
        return [
            NodeMeasurement(
                MetricKey('mem'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'max'}),
                per_node=Measurement(total_mem_used_bytes,
                                     total_mem_capacity_bytes),
                per_device={
                    DeviceId('root'):
                    Measurement(total_mem_used_bytes, total_mem_capacity_bytes)
                },
            ),
            NodeMeasurement(
                MetricKey('disk'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                per_node=Measurement(total_disk_usage, total_disk_capacity),
                per_device=per_disk_stat,
            ),
            NodeMeasurement(
                MetricKey('net_rx'),
                MetricTypes.RATE,
                unit_hint='bps',
                current_hook=lambda metric: metric.stats.rate,
                per_node=Measurement(Decimal(net_rx_bytes)),
                per_device={
                    DeviceId('node'): Measurement(Decimal(net_rx_bytes))
                },
            ),
            NodeMeasurement(
                MetricKey('net_tx'),
                MetricTypes.RATE,
                unit_hint='bps',
                current_hook=lambda metric: metric.stats.rate,
                per_node=Measurement(Decimal(net_tx_bytes)),
                per_device={
                    DeviceId('node'): Measurement(Decimal(net_tx_bytes))
                },
            ),
        ]

    async def gather_container_measures(self, ctx: StatContext, container_ids: Sequence[str]) \
            -> Sequence[ContainerMeasurement]:
        """Collect per-container memory, block I/O, and scratch-dir stats."""
        def get_scratch_size(container_id: str) -> int:
            # Look up the kernel that owns this container; 0 if not found.
            for kernel_id, info in ctx.agent.kernel_registry.items():
                if info['container_id'] == container_id:
                    break
            else:
                return 0
            work_dir = ctx.agent.local_config['container'][
                'scratch-root'] / str(kernel_id) / 'work'
            # Sum file sizes without following symlinks.
            total_size = 0
            for path in work_dir.rglob('*'):
                if path.is_symlink():
                    total_size += path.lstat().st_size
                elif path.is_file():
                    total_size += path.stat().st_size
            return total_size

        async def sysfs_impl(container_id):
            # Read stats directly from the cgroup v1 sysfs hierarchy.
            mem_prefix = f'/sys/fs/cgroup/memory/docker/{container_id}/'
            io_prefix = f'/sys/fs/cgroup/blkio/docker/{container_id}/'
            try:
                mem_cur_bytes = read_sysfs(
                    mem_prefix + 'memory.usage_in_bytes', int)
                io_stats = Path(io_prefix +
                                'blkio.throttle.io_service_bytes').read_text()
                # example data:
                #   8:0 Read 13918208
                #   8:0 Write 0
                #   8:0 Sync 0
                #   8:0 Async 13918208
                #   8:0 Total 13918208
                #   Total 13918208
                io_read_bytes = 0
                io_write_bytes = 0
                for line in io_stats.splitlines():
                    # Skip the aggregate summary line (it has no device prefix).
                    if line.startswith('Total '):
                        continue
                    dev, op, nbytes = line.strip().split()
                    if op == 'Read':
                        io_read_bytes += int(nbytes)
                    elif op == 'Write':
                        io_write_bytes += int(nbytes)
            except IOError as e:
                log.warning(
                    'cannot read stats: sysfs unreadable for container {0}\n{1!r}',
                    container_id[:7], e)
                return None
            # Scratch-dir scanning is blocking filesystem I/O; offload it.
            loop = current_loop()
            scratch_sz = await loop.run_in_executor(None, get_scratch_size,
                                                    container_id)
            return mem_cur_bytes, io_read_bytes, io_write_bytes, scratch_sz

        async def api_impl(container_id):
            # Fall back to the Docker stats API when cgroups are unavailable.
            container = DockerContainer(ctx.agent.docker, id=container_id)
            ret = await fetch_api_stats(container)
            if ret is None:
                return None
            mem_cur_bytes = nmget(ret, 'memory_stats.usage', 0)
            io_read_bytes = 0
            io_write_bytes = 0
            for item in nmget(ret, 'blkio_stats.io_service_bytes_recursive',
                              []):
                if item['op'] == 'Read':
                    io_read_bytes += item['value']
                elif item['op'] == 'Write':
                    io_write_bytes += item['value']
            loop = current_loop()
            scratch_sz = await loop.run_in_executor(None, get_scratch_size,
                                                    container_id)
            return mem_cur_bytes, io_read_bytes, io_write_bytes, scratch_sz

        # Choose the collection backend by the configured stat mode.
        if ctx.mode == StatModes.CGROUP:
            impl = sysfs_impl
        elif ctx.mode == StatModes.DOCKER:
            impl = api_impl
        else:
            raise RuntimeError("should not reach here")

        per_container_mem_used_bytes = {}
        per_container_io_read_bytes = {}
        per_container_io_write_bytes = {}
        per_container_io_scratch_size = {}
        # Query all containers concurrently; unreadable ones return None
        # and are skipped below.
        tasks = []
        for cid in container_ids:
            tasks.append(asyncio.ensure_future(impl(cid)))
        results = await asyncio.gather(*tasks)
        for cid, result in zip(container_ids, results):
            if result is None:
                continue
            per_container_mem_used_bytes[cid] = Measurement(Decimal(result[0]))
            per_container_io_read_bytes[cid] = Measurement(Decimal(result[1]))
            per_container_io_write_bytes[cid] = Measurement(Decimal(result[2]))
            per_container_io_scratch_size[cid] = Measurement(Decimal(
                result[3]))
        return [
            ContainerMeasurement(
                MetricKey('mem'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'max'}),
                per_container=per_container_mem_used_bytes,
            ),
            ContainerMeasurement(
                MetricKey('io_read'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'rate'}),
                per_container=per_container_io_read_bytes,
            ),
            ContainerMeasurement(
                MetricKey('io_write'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'rate'}),
                per_container=per_container_io_write_bytes,
            ),
            ContainerMeasurement(
                MetricKey('io_scratch_size'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'max'}),
                per_container=per_container_io_scratch_size,
            ),
        ]

    async def create_alloc_map(self) -> AbstractAllocMap:
        """Build a discrete allocation map of 'mem' bytes per device."""
        devices = await self.list_devices()
        return DiscretePropertyAllocMap(device_slots={
            dev.device_id: DeviceSlotInfo(SlotTypes.BYTES, SlotName('mem'),
                                          Decimal(dev.memory_size))
            for dev in devices
        }, )

    async def get_hooks(self, distro: str, arch: str) -> Sequence[Path]:
        """No hook libraries are injected by this plugin."""
        return []

    async def generate_docker_args(
        self,
        docker: Docker,
        device_alloc,
    ) -> Mapping[str, Any]:
        """Translate the memory allocation into Docker HostConfig limits."""
        memory = sum(device_alloc['mem'].values())
        return {
            'HostConfig': {
                'MemorySwap': int(memory),  # prevent using swap!
                'Memory': int(memory),
            }
        }

    async def restore_from_container(
        self,
        container: Container,
        alloc_map: AbstractAllocMap,
    ) -> None:
        """Re-apply an existing container's memory limit to the alloc map."""
        assert isinstance(alloc_map, DiscretePropertyAllocMap)
        memory_limit = container.backend_obj['HostConfig']['Memory']
        alloc_map.apply_allocation({
            SlotName('mem'): {
                DeviceId('root'): memory_limit
            },
        })

    async def get_attached_devices(
        self,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Sequence[DeviceModelInfo]:
        """Describe the memory devices referenced by *device_alloc*."""
        device_ids = [*device_alloc[SlotName('mem')].keys()]
        available_devices = await self.list_devices()
        attached_devices: List[DeviceModelInfo] = []
        for device in available_devices:
            if device.device_id in device_ids:
                attached_devices.append({
                    'device_id': device.device_id,
                    'model_name': '',
                    'data': {},
                })
        return attached_devices
예제 #18
0
"""
Common definitions/constants used throughout the manager.
"""

from typing import Final

from ai.backend.common.types import SlotName, SlotTypes


# Resource slots that every agent always reports regardless of loaded plugins:
# CPU cores as a countable slot and main memory as a byte-sized slot.
INTRINSIC_SLOTS: Final = {
    SlotName('cpu'): SlotTypes('count'),
    SlotName('mem'): SlotTypes('bytes'),
}
예제 #19
0
 async def available_slots(self) -> Mapping[SlotName, Decimal]:
     """Report total main-memory capacity summed over all known devices."""
     total_mem = sum(dev.memory_size for dev in await self.list_devices())
     return {SlotName('mem'): Decimal(total_mem)}
예제 #20
0
 async def available_slots(self) -> Mapping[SlotName, Decimal]:
     """Report the number of visible CUDA devices as the 'cuda.device' slot."""
     device_count = len(await self.list_devices())
     return {SlotName('cuda.device'): Decimal(device_count)}
예제 #21
0
 async def _get_resource_slots(self):
     """Load the slot-name to slot-type mapping from etcd configuration."""
     raw_data = await self.etcd.get_prefix_dict('config/resource_slots')
     slots = {}
     for name, type_value in raw_data.items():
         slots[SlotName(name)] = SlotTypes(type_value)
     return slots
예제 #22
0
def test_quantum_size(alloc_strategy):
    """Verify quantum-size enforcement of FractionAllocMap for both strategies.

    Requests must be multiples of the quantum, and the per-device amounts
    computed by the strategy must also respect the quantum.  The
    ``alloc_strategy`` parameter is a FractionAllocationStrategy value
    (presumably a parametrized fixture — confirm against the test module).
    """
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId('a0'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(1)),  # noqa
            DeviceId('a1'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'), Decimal(1)),  # noqa
        },
        quantum_size=Decimal("0.25"),
        allocation_strategy=alloc_strategy,
    )
    # 0.5 is a multiple of the quantum and fits a single device.
    result = alloc_map.allocate({
        SlotName('x'): Decimal("0.5"),
    })
    assert sum(alloc_map.allocations[SlotName('x')].values()) == Decimal("0.5")
    alloc_map.free(result)

    # 1.5 exceeds one device's capacity, so the split depends on strategy.
    result = alloc_map.allocate({
        SlotName('x'): Decimal("1.5"),
    })
    assert sum(alloc_map.allocations[SlotName('x')].values()) == Decimal("1.5")
    if alloc_strategy == FractionAllocationStrategy.EVENLY:
        # EVENLY splits the request half-and-half across both devices.
        assert alloc_map.allocations[SlotName('x')][DeviceId('a0')] == Decimal(
            "0.75")
        assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal(
            "0.75")
    else:
        # FILL saturates a0 first and puts the remainder on a1.
        assert alloc_map.allocations[SlotName('x')][DeviceId('a0')] == Decimal(
            "1.00")
        assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal(
            "0.50")
    alloc_map.free(result)

    # inputs are not multiple of 0.25
    with pytest.raises(NotMultipleOfQuantum):
        alloc_map.allocate({
            SlotName('x'): Decimal("0.52"),
        })
    with pytest.raises(NotMultipleOfQuantum):
        alloc_map.allocate({
            SlotName('x'): Decimal("0.42"),
        })
    with pytest.raises(NotMultipleOfQuantum):
        alloc_map.allocate({
            SlotName('x'): Decimal("3.99"),
        })

    if alloc_strategy == FractionAllocationStrategy.EVENLY:
        # input IS multiple of 0.25 but the CALCULATED allocations are not multiple of 0.25
        with pytest.raises(InsufficientResource, match="multiple-of-quantum"):
            alloc_map.allocate({
                SlotName('x'): Decimal("1.75"),  # divided to 0.88 and 0.87
            })
    else:
        # In this case, it satisfies the quantum condition, because the capacity of devices are
        # multiples of the quantum.
        alloc_map.allocate({
            SlotName('x'): Decimal("1.75"),
        })
        assert alloc_map.allocations[SlotName('x')][DeviceId('a0')] == Decimal(
            "1.00")
        assert alloc_map.allocations[SlotName('x')][DeviceId('a1')] == Decimal(
            "0.75")

        # So let's change the situation.
        alloc_map = FractionAllocMap(
            device_slots={
                DeviceId('a0'):
                DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'),
                               Decimal(1)),  # noqa
                DeviceId('a1'):
                DeviceSlotInfo(SlotTypes.COUNT, SlotName('x'),
                               Decimal(1)),  # noqa
            },
            quantum_size=Decimal("0.3"),
            allocation_strategy=alloc_strategy,
        )
        # Device capacity (1.0) is not a multiple of the quantum (0.3) here.
        with pytest.raises(NotMultipleOfQuantum):
            alloc_map.allocate({
                SlotName('x'): Decimal("0.5"),
            })
        with pytest.raises(InsufficientResource, match="multiple-of-quantum"):
            alloc_map.allocate({
                SlotName('x'): Decimal("1.2"),
            })
예제 #23
0
def test_discrete_alloc_map_large_number():
    """DiscretePropertyAllocMap spills across devices and frees back to zero."""
    slot = SlotName('x')
    dev_a, dev_b = DeviceId('a0'), DeviceId('a1')
    alloc_map = DiscretePropertyAllocMap(device_slots={
        dev_a: DeviceSlotInfo(SlotTypes.COUNT, slot, Decimal(100)),
        dev_b: DeviceSlotInfo(SlotTypes.COUNT, slot, Decimal(100)),
    })
    assert alloc_map.allocations[slot][dev_a] == 0
    assert alloc_map.allocations[slot][dev_b] == 0

    # 130 exceeds one device's capacity (100), so 30 spills over to a1.
    result = alloc_map.allocate({slot: Decimal('130')})
    assert result[slot][dev_a] == 100
    assert result[slot][dev_b] == 30
    assert alloc_map.allocations[slot][dev_a] == 100
    assert alloc_map.allocations[slot][dev_b] == 30

    # Only 70 remain in total; asking for 71 fails without side effects.
    with pytest.raises(InsufficientResource):
        alloc_map.allocate({slot: Decimal('71')})
    assert alloc_map.allocations[slot][dev_a] == 100
    assert alloc_map.allocations[slot][dev_b] == 30

    # Freeing the original allocation restores the pristine state.
    alloc_map.free(result)
    assert alloc_map.allocations[slot][dev_a] == 0
    assert alloc_map.allocations[slot][dev_b] == 0
예제 #24
0
 async def available_slots(self) -> Mapping[SlotName, Decimal]:
     """Report the total number of CPU processing units across all devices."""
     total_units = sum(dev.processing_units for dev in await self.list_devices())
     return {SlotName('cpu'): Decimal(total_units)}
예제 #25
0
def test_heterogeneous_resource_slots_with_fractional_alloc_map():
    """Mix UNIQUE (MIG partition) and COUNT (fractional share) slots in one map.

    UNIQUE slots may only be allocated in whole units of exactly 1, while
    COUNT slots accept fractional amounts; both must free back to zero.
    """
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId('a0'):
            DeviceSlotInfo(SlotTypes.UNIQUE,
                           SlotName('cuda.device:1g.5gb-mig'),
                           Decimal(1)),  # noqa
            DeviceId('a1'):
            DeviceSlotInfo(SlotTypes.UNIQUE,
                           SlotName('cuda.device:1g.5gb-mig'),
                           Decimal(1)),  # noqa
            DeviceId('a2'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('cuda.shares'),
                           Decimal('1.0')),
            DeviceId('a3'):
            DeviceSlotInfo(SlotTypes.COUNT, SlotName('cuda.shares'),
                           Decimal('1.0')),
            DeviceId('a4'):
            DeviceSlotInfo(SlotTypes.UNIQUE,
                           SlotName('cuda.device:3g.20gb-mig'),
                           Decimal(1)),  # noqa
        },
        exclusive_slot_types={
            'cuda.device:*-mig', 'cuda.device', 'cuda.shares'
        },
        allocation_strategy=FractionAllocationStrategy.FILL,
    )

    def check_clean():
        # All five devices must show zero allocation.
        assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][
            DeviceId('a0')] == Decimal('0')
        assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][
            DeviceId('a1')] == Decimal('0')
        assert alloc_map.allocations[SlotName('cuda.shares')][DeviceId(
            'a2')] == Decimal('0')
        assert alloc_map.allocations[SlotName('cuda.shares')][DeviceId(
            'a3')] == Decimal('0')
        assert alloc_map.allocations[SlotName('cuda.device:3g.20gb-mig')][
            DeviceId('a4')] == Decimal('0')

    check_clean()

    # check allocation of non-unique slots
    result = alloc_map.allocate({SlotName('cuda.shares'): Decimal('2.0')})
    assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][DeviceId(
        'a0')] == Decimal('0')
    assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][DeviceId(
        'a1')] == Decimal('0')
    assert alloc_map.allocations[SlotName('cuda.shares')][DeviceId(
        'a2')] == Decimal('1.0')
    assert alloc_map.allocations[SlotName('cuda.shares')][DeviceId(
        'a3')] == Decimal('1.0')
    assert alloc_map.allocations[SlotName('cuda.device:3g.20gb-mig')][DeviceId(
        'a4')] == Decimal('0')
    alloc_map.free(result)
    check_clean()

    # Total 'cuda.shares' capacity is 2.0, so 2.5 must be rejected.
    with pytest.raises(InsufficientResource):
        alloc_map.allocate({SlotName('cuda.shares'): Decimal('2.5')})
    check_clean()

    # allocating zero means no-op.
    alloc_map.allocate({SlotName('cuda.device:1g.5gb-mig'): Decimal('0')})
    check_clean()

    # any allocation request for unique slots should specify the amount 1.
    with pytest.raises(InvalidResourceArgument):
        alloc_map.allocate(
            {SlotName('cuda.device:1g.5gb-mig'): Decimal('0.3')})
    with pytest.raises(InvalidResourceArgument):
        alloc_map.allocate(
            {SlotName('cuda.device:1g.5gb-mig'): Decimal('1.5')})
    check_clean()

    # test allocation of unique slots
    result1 = alloc_map.allocate(
        {SlotName('cuda.device:1g.5gb-mig'): Decimal('1')})
    assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][DeviceId(
        'a0')] == Decimal('1')
    assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][DeviceId(
        'a1')] == Decimal('0')
    result2 = alloc_map.allocate(
        {SlotName('cuda.device:1g.5gb-mig'): Decimal('1')})
    assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][DeviceId(
        'a0')] == Decimal('1')
    assert alloc_map.allocations[SlotName('cuda.device:1g.5gb-mig')][DeviceId(
        'a1')] == Decimal('1')
    # Both 1g.5gb partitions are taken now, so a third request must fail.
    with pytest.raises(InsufficientResource):
        alloc_map.allocate({SlotName('cuda.device:1g.5gb-mig'): Decimal('1')})
    alloc_map.free(result1)
    alloc_map.free(result2)
    check_clean()
예제 #26
0
def test_fraction_alloc_map_iteration():
    """Many tiny quantum-sized allocations accumulate and free exactly."""
    slot = SlotName('x')
    dev_a, dev_b = DeviceId('a0'), DeviceId('a1')
    alloc_map = FractionAllocMap(
        device_slots={
            dev_a: DeviceSlotInfo(SlotTypes.COUNT, slot, Decimal(1.0)),
            dev_b: DeviceSlotInfo(SlotTypes.COUNT, slot, Decimal(1.0)),
        },
        allocation_strategy=FractionAllocationStrategy.FILL,
        quantum_size=Decimal("0.00001"))
    assert alloc_map.allocations[slot][dev_a] == Decimal('0')
    assert alloc_map.allocations[slot][dev_b] == Decimal('0')

    # 1000 allocations of one quantum each end up spread as 0.005 per device.
    for _ in range(1000):
        alloc_map.allocate({slot: Decimal('0.00001')})
    assert alloc_map.allocations[slot][dev_a] == Decimal('0.005')
    assert alloc_map.allocations[slot][dev_b] == Decimal('0.005')

    # Freeing a single quantum from a0 leaves a1 untouched.
    alloc_map.free({slot: {dev_a: Decimal('0.00001')}})
    assert alloc_map.allocations[slot][dev_a] == Decimal('0.00499')
    assert alloc_map.allocations[slot][dev_b] == Decimal('0.005')

    # Free the remaining 499 quanta from a0 one by one.
    for _ in range(499):
        alloc_map.free({slot: {dev_a: Decimal('0.00001')}})
    assert alloc_map.allocations[slot][dev_a] == Decimal('0')
    assert alloc_map.allocations[slot][dev_b] == Decimal('0.005')
예제 #27
0
def test_fraction_alloc_map():
    """Basic allocate / over-allocate / free cycle on a two-device fraction map."""
    slot = SlotName('x')
    dev_a, dev_b = DeviceId('a0'), DeviceId('a1')
    alloc_map = FractionAllocMap(
        device_slots={
            dev_a: DeviceSlotInfo(SlotTypes.COUNT, slot, Decimal(1.0)),
            dev_b: DeviceSlotInfo(SlotTypes.COUNT, slot, Decimal(1.0)),
        },
        allocation_strategy=FractionAllocationStrategy.FILL,
    )
    assert alloc_map.allocations[slot][dev_a] == Decimal('0')
    assert alloc_map.allocations[slot][dev_b] == Decimal('0')

    # FILL strategy saturates a0 first, then places the remainder on a1.
    result = alloc_map.allocate({slot: Decimal('1.5')})
    assert result[slot][dev_a] == Decimal('1.0')
    assert result[slot][dev_b] == Decimal('0.5')
    assert alloc_map.allocations[slot][dev_a] == Decimal('1.0')
    assert alloc_map.allocations[slot][dev_b] == Decimal('0.5')

    # Only 0.5 remains in total, so another 1.5 must fail and leave
    # the bookkeeping untouched.
    with pytest.raises(InsufficientResource):
        alloc_map.allocate({slot: Decimal('1.5')})
    assert alloc_map.allocations[slot][dev_a] == Decimal('1.0')
    assert alloc_map.allocations[slot][dev_b] == Decimal('0.5')

    alloc_map.free(result)
    assert alloc_map.allocations[slot][dev_a] == Decimal('0')
    assert alloc_map.allocations[slot][dev_b] == Decimal('0')
예제 #28
0
class CUDAPlugin(AbstractComputePlugin):
    """Accelerator plugin exposing NVIDIA CUDA GPUs as ``cuda.device`` slots.

    ``init()`` probes nvidia-docker and the Docker daemon and flips
    ``self.enabled`` to False when any prerequisite is missing, so all
    other methods must tolerate the disabled state.
    """

    config_watch_enabled = False

    key = DeviceName('cuda')
    slot_types: Sequence[Tuple[SlotName, SlotTypes]] = (
        (SlotName('cuda.device'), SlotTypes('count')),
    )

    # Versions detected in init(); (0, 0, 0) means "not detected yet".
    nvdocker_version: Tuple[int, ...] = (0, 0, 0)
    docker_version: Tuple[int, ...] = (0, 0, 0)

    # Device IDs hidden from allocation via the 'device_mask' plugin config.
    device_mask: Sequence[DeviceId] = []
    enabled: bool = True

    async def init(self, context: Any = None) -> None:
        """Detect nvidia-docker/docker versions and enumerate devices.

        Any failure (missing binary, unparsable version, CUDA library
        errors) disables the plugin instead of raising.
        """
        rx_triple_version = re.compile(r'(\d+\.\d+\.\d+)')
        # Check nvidia-docker and docker versions
        try:
            proc = await asyncio.create_subprocess_exec(
                'nvidia-docker', 'version', '-f', '{{json .}}',
                stdout=asyncio.subprocess.PIPE,
            )
            stdout, _ = await proc.communicate()
            lines = stdout.decode().splitlines()
        except FileNotFoundError:
            log.warning('nvidia-docker is not installed.')
            log.info('CUDA acceleration is disabled.')
            self.enabled = False
            return
        m = rx_triple_version.search(lines[0])
        if m:
            self.nvdocker_version = tuple(map(int, m.group(1).split('.')))
        else:
            log.error('could not detect nvidia-docker version!')
            log.info('CUDA acceleration is disabled.')
            self.enabled = False
            return
        # The second output line carries the Docker server version as JSON.
        docker_version_data = json.loads(lines[1])
        m = rx_triple_version.search(docker_version_data['Server']['Version'])
        if m:
            self.docker_version = tuple(map(int, m.group(1).split('.')))
        else:
            log.error('could not detect docker version!')
            log.info('CUDA acceleration is disabled.')
            self.enabled = False
            return

        raw_device_mask = self.plugin_config.get('device_mask')
        if raw_device_mask is not None:
            self.device_mask = [
                *map(lambda dev_id: DeviceId(dev_id), raw_device_mask.split(','))
            ]
        try:
            detected_devices = await self.list_devices()
            log.info('detected devices:\n' + pformat(detected_devices))
            log.info('nvidia-docker version: {}', self.nvdocker_version)
            log.info('CUDA acceleration is enabled.')
        except ImportError:
            log.warning('could not load the CUDA runtime library.')
            log.info('CUDA acceleration is disabled.')
            self.enabled = False
        except RuntimeError as e:
            log.warning('CUDA init error: {}', e)
            log.info('CUDA acceleration is disabled.')
            self.enabled = False

    async def cleanup(self) -> None:
        """No resources to release."""
        pass

    async def update_plugin_config(
        self,
        new_plugin_config: Mapping[str, Any],
    ) -> None:
        """Config updates at runtime are ignored by this plugin."""
        pass

    async def list_devices(self) -> Collection[CUDADevice]:
        """Enumerate CUDA devices via libcudart, excluding masked devices."""
        if not self.enabled:
            return []
        all_devices = []
        num_devices = libcudart.get_device_count()
        for dev_id in map(lambda idx: DeviceId(str(idx)), range(num_devices)):
            if dev_id in self.device_mask:
                continue
            raw_info = libcudart.get_device_props(int(dev_id))
            # NUMA affinity is read from sysfs; absent on some platforms.
            sysfs_node_path = "/sys/bus/pci/devices/" \
                              f"{raw_info['pciBusID_str'].lower()}/numa_node"
            node: Optional[int]
            try:
                node = int(Path(sysfs_node_path).read_text().strip())
            except OSError:
                node = None
            dev_uuid, raw_dev_uuid = None, raw_info.get('uuid', None)
            if raw_dev_uuid is not None:
                dev_uuid = str(uuid.UUID(bytes=raw_dev_uuid))
            else:
                dev_uuid = '00000000-0000-0000-0000-000000000000'
            dev_info = CUDADevice(
                device_id=dev_id,
                hw_location=raw_info['pciBusID_str'],
                numa_node=node,
                memory_size=raw_info['totalGlobalMem'],
                processing_units=raw_info['multiProcessorCount'],
                model_name=raw_info['name'],
                uuid=dev_uuid,
            )
            all_devices.append(dev_info)
        return all_devices

    async def available_slots(self) -> Mapping[SlotName, Decimal]:
        """Report the number of visible CUDA devices as the 'cuda.device' slot."""
        devices = await self.list_devices()
        return {
            SlotName('cuda.device'): Decimal(len(devices)),
        }

    def get_version(self) -> str:
        """Return this plugin's package version string."""
        return __version__

    async def extra_info(self) -> Mapping[str, Any]:
        """Report driver/runtime versions, degrading gracefully on errors."""
        if self.enabled:
            try:
                return {
                    'cuda_support': True,
                    'nvidia_version': libnvml.get_driver_version(),
                    'cuda_version': '{0[0]}.{0[1]}'.format(libcudart.get_version()),
                }
            except ImportError:
                log.warning('extra_info(): NVML/CUDA runtime library is not found')
            except LibraryError as e:
                log.warning('extra_info(): {!r}', e)
        return {
            'cuda_support': False,
        }

    async def gather_node_measures(
        self,
        ctx: StatContext,
    ) -> Sequence[NodeMeasurement]:
        """Collect node-wide GPU memory and utilization statistics via NVML."""
        dev_count = 0
        mem_avail_total = 0
        mem_used_total = 0
        mem_stats = {}
        util_total = 0
        util_stats = {}
        if self.enabled:
            try:
                dev_count = libnvml.get_device_count()
                for dev_id in map(lambda idx: DeviceId(str(idx)), range(dev_count)):
                    if dev_id in self.device_mask:
                        continue
                    dev_stat = libnvml.get_device_stats(int(dev_id))
                    mem_avail_total += dev_stat.mem_total
                    mem_used_total += dev_stat.mem_used
                    mem_stats[dev_id] = Measurement(Decimal(dev_stat.mem_used),
                                                    Decimal(dev_stat.mem_total))
                    util_total += dev_stat.gpu_util
                    util_stats[dev_id] = Measurement(Decimal(dev_stat.gpu_util), Decimal(100))
            except ImportError:
                log.warning('gather_node_measure(): NVML library is not found')
            except LibraryError as e:
                log.warning('gather_node_measure(): {!r}', e)
        return [
            NodeMeasurement(
                MetricKey('cuda_mem'),
                MetricTypes.USAGE,
                unit_hint='bytes',
                stats_filter=frozenset({'max'}),
                per_node=Measurement(Decimal(mem_used_total), Decimal(mem_avail_total)),
                per_device=mem_stats,
            ),
            NodeMeasurement(
                MetricKey('cuda_util'),
                MetricTypes.USAGE,
                unit_hint='percent',
                stats_filter=frozenset({'avg', 'max'}),
                per_node=Measurement(Decimal(util_total), Decimal(dev_count * 100)),
                per_device=util_stats,
            ),
        ]

    async def gather_container_measures(
            self, ctx: StatContext,
            container_ids: Sequence[str],
            ) -> Sequence[ContainerMeasurement]:
        """Per-container GPU measurements are not collected by this plugin."""
        return []

    async def create_alloc_map(self) -> AbstractAllocMap:
        """Build a discrete allocation map with one 'cuda.device' slot per GPU."""
        devices = await self.list_devices()
        return DiscretePropertyAllocMap(
            device_slots={
                dev.device_id: (
                    DeviceSlotInfo(SlotTypes.COUNT, SlotName('cuda.device'), Decimal(1))
                ) for dev in devices
            },
        )

    async def get_hooks(self, distro: str, arch: str) -> Sequence[Path]:
        """This plugin injects no hook libraries into containers."""
        return []

    async def generate_docker_args(
        self,
        docker: aiodocker.Docker,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Mapping[str, Any]:
        """Produce Docker configuration exposing the allocated GPUs.

        The shape of the returned config depends on the detected
        nvidia-docker major version (v1 volume/device plumbing vs. v2
        runtime/DeviceRequests).
        """
        if not self.enabled:
            return {}
        assigned_device_ids = []
        for slot_type, per_device_alloc in device_alloc.items():
            for device_id, alloc in per_device_alloc.items():
                if alloc > 0:
                    assigned_device_ids.append(device_id)
        if self.nvdocker_version[0] == 1:
            timeout = aiohttp.ClientTimeout(total=3)
            async with aiohttp.ClientSession(raise_for_status=True,
                                             timeout=timeout) as sess:
                try:
                    nvdocker_url = 'http://localhost:3476/docker/cli/json'
                    async with sess.get(nvdocker_url) as resp:
                        nvidia_params = await resp.json()
                except aiohttp.ClientError:
                    raise RuntimeError('NVIDIA Docker plugin is not available.')

            # Create any driver volumes the nvidia plugin requires but which
            # do not exist yet, then bind-mount all required volumes.
            volumes = await docker.volumes.list()
            existing_volumes = set(vol['Name'] for vol in volumes['Volumes'])
            required_volumes = set(vol.split(':')[0]
                                   for vol in nvidia_params['Volumes'])
            missing_volumes = required_volumes - existing_volumes
            binds = []
            for vol_name in missing_volumes:
                for vol_param in nvidia_params['Volumes']:
                    if vol_param.startswith(vol_name + ':'):
                        _, _, permission = vol_param.split(':')
                        driver = nvidia_params['VolumeDriver']
                        await docker.volumes.create({
                            'Name': vol_name,
                            'Driver': driver,
                        })
            for vol_name in required_volumes:
                for vol_param in nvidia_params['Volumes']:
                    if vol_param.startswith(vol_name + ':'):
                        _, mount_pt, permission = vol_param.split(':')
                        binds.append('{}:{}:{}'.format(
                            vol_name, mount_pt, permission))
            devices = []
            for dev in nvidia_params['Devices']:
                m = re.search(r'^/dev/nvidia(\d+)$', dev)
                if m is None:
                    # Always add non-GPU device files required by the driver.
                    # (e.g., nvidiactl, nvidia-uvm, ... etc.)
                    devices.append(dev)
                    continue
                device_id = m.group(1)
                if device_id not in assigned_device_ids:
                    continue
                devices.append(dev)
            devices = [{
                'PathOnHost': dev,
                'PathInContainer': dev,
                'CgroupPermissions': 'mrw',
            } for dev in devices]
            return {
                'HostConfig': {
                    'Binds': binds,
                    'Devices': devices,
                },
            }
        elif self.nvdocker_version[0] == 2:
            device_list_str = ','.join(sorted(assigned_device_ids))
            if self.docker_version >= (19, 3, 0):
                # Docker >= 19.03 natively supports GPU assignment via
                # HostConfig.DeviceRequests.
                docker_config: Dict[str, Any] = {}
                if assigned_device_ids:
                    docker_config.update({
                        'HostConfig': {
                            'DeviceRequests': [
                                {
                                    "Driver": "nvidia",
                                    "DeviceIDs": assigned_device_ids,
                                    # "all" does not work here
                                    "Capabilities": [
                                        ["utility", "compute", "video", "graphics", "display"]
                                    ],
                                },
                            ],
                        },
                    })
                return docker_config
            else:
                # Older Docker relies on the nvidia runtime plus the
                # NVIDIA_VISIBLE_DEVICES environment variable.
                return {
                    'HostConfig': {
                        'Runtime': 'nvidia',
                    },
                    'Env': [
                        f"NVIDIA_VISIBLE_DEVICES={device_list_str}",
                    ],
                }
        else:
            raise RuntimeError('BUG: should not be reached here!')

    async def get_attached_devices(
        self,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Sequence[DeviceModelInfo]:
        """Describe the GPU devices referenced by the given allocation."""
        device_ids: List[DeviceId] = []
        # BUGFIX: the slot is declared as 'cuda.device' (see ``slot_types``
        # and ``create_alloc_map``); the previous lookup used 'cuda.devices'
        # and therefore never matched any allocation.
        if SlotName('cuda.device') in device_alloc:
            device_ids.extend(device_alloc[SlotName('cuda.device')].keys())
        available_devices = await self.list_devices()
        attached_devices: List[DeviceModelInfo] = []
        for device in available_devices:
            if device.device_id in device_ids:
                proc = device.processing_units
                mem = BinarySize(device.memory_size)
                attached_devices.append({  # TODO: update common.types.DeviceModelInfo
                    'device_id': device.device_id,
                    'model_name': device.model_name,
                    'smp': proc,
                    'mem': mem,
                })
        return attached_devices

    async def restore_from_container(
        self,
        container: Container,
        alloc_map: AbstractAllocMap,
    ) -> None:
        """Re-register a running container's GPU allocation into the alloc map.

        Falls back to a raw ``allocations`` update for alloc-map
        implementations that predate ``apply_allocation``.
        """
        if not self.enabled:
            return
        resource_spec = await get_resource_spec_from_container(container.backend_obj)
        if resource_spec is None:
            return
        if hasattr(alloc_map, 'apply_allocation'):
            alloc_map.apply_allocation({
                SlotName('cuda.device'): resource_spec.allocations.get(
                    DeviceName('cuda'), {}
                ).get(
                    SlotName('cuda.device'), {}
                ),
            })
        else:
            alloc_map.allocations[SlotName('cuda.device')].update(
                resource_spec.allocations.get(
                    DeviceName('cuda'), {}
                ).get(
                    SlotName('cuda.device'), {}
                )
            )

    async def generate_resource_data(
        self,
        device_alloc: Mapping[SlotName, Mapping[DeviceId, Decimal]],
    ) -> Mapping[str, str]:
        """Expose the local-index to global-device-ID mapping to the container."""
        data: MutableMapping[str, str] = {}
        if not self.enabled:
            return data

        active_device_id_set: Set[DeviceId] = set()
        for slot_type, per_device_alloc in device_alloc.items():
            for dev_id, alloc in per_device_alloc.items():
                if alloc > 0:
                    active_device_id_set.add(dev_id)
        active_device_ids = sorted(active_device_id_set, key=lambda v: int(v))
        data['CUDA_GLOBAL_DEVICE_IDS'] = ','.join(
            f'{local_idx}:{global_id}'
            for local_idx, global_id in enumerate(active_device_ids))
        return data
예제 #29
0
async def detect_resources(
    etcd: AsyncEtcd,
    local_config: Mapping[str, Any],
) -> Tuple[Mapping[DeviceName, AbstractComputePlugin], Mapping[SlotName,
                                                               Decimal]]:
    """
    Detect available computing resource of the system.
    It also loads the accelerator plugins.

    limit_cpus, limit_gpus are deprecated.

    Returns a pair of (plugin-key -> plugin instance) and
    (slot-name -> capacity after subtracting the reserved amounts).
    Raises InitializationError on misnamed plugin slots, duplicate plugin
    keys, or insufficient intrinsic (cpu/mem) capacity.
    """
    # Amounts held back from the reported capacity, per local config.
    reserved_slots = {
        'cpu': local_config['resource']['reserved-cpu'],
        'mem': local_config['resource']['reserved-mem'],
        'disk': local_config['resource']['reserved-disk'],
    }
    slots: MutableMapping[SlotName, Decimal] = {}

    compute_device_types: MutableMapping[DeviceName,
                                         AbstractComputePlugin] = {}

    # Initialize intrinsic plugins by ourselves.
    from .intrinsic import CPUPlugin, MemoryPlugin
    compute_plugin_ctx = ComputePluginContext(
        etcd,
        local_config,
    )
    await compute_plugin_ctx.init()
    if 'cpu' not in compute_plugin_ctx.plugins:
        cpu_config = await etcd.get_prefix('config/plugins/cpu')
        cpu_plugin = CPUPlugin(cpu_config, local_config)
        compute_plugin_ctx.attach_intrinsic_device(cpu_plugin)
    if 'mem' not in compute_plugin_ctx.plugins:
        memory_config = await etcd.get_prefix('config/plugins/memory')
        memory_plugin = MemoryPlugin(memory_config, local_config)
        compute_plugin_ctx.attach_intrinsic_device(memory_plugin)
    for plugin_name, plugin_instance in compute_plugin_ctx.plugins.items():
        # Every non-intrinsic slot name must be prefixed with the plugin's
        # key.  The walrus assignment captures the slot name under test so
        # that, when all() short-circuits on a violation, the offending name
        # is still available for the error below.
        if not all((invalid_name := sname,
                    sname.startswith(f'{plugin_instance.key}.'))[1]
                   for sname, _ in plugin_instance.slot_types
                   if sname not in {'cpu', 'mem'}):
            raise InitializationError(
                "Slot types defined by an accelerator plugin must be prefixed "
                "by the plugin's key.",
                invalid_name,  # noqa: F821
                plugin_instance.key,
            )
        if plugin_instance.key in compute_device_types:
            raise InitializationError(
                f"A plugin defining the same key '{plugin_instance.key}' already exists. "
                "You may need to uninstall it first.")
        compute_device_types[plugin_instance.key] = plugin_instance

    for key, computer in compute_device_types.items():
        known_slot_types.update(
            computer.slot_types)  # type: ignore  # (only updated here!)
        resource_slots = await computer.available_slots()
        for sname, sval in resource_slots.items():
            # Subtract the reserved amount (if any), clamping at zero.
            slots[sname] = Decimal(max(0, sval - reserved_slots.get(sname, 0)))
            if slots[sname] <= 0 and sname in (SlotName('cpu'),
                                               SlotName('mem')):
                raise InitializationError(
                    f"The resource slot '{sname}' is not sufficient (zero or below zero). "
                    "Try to adjust the reserved resources or use a larger machine."
                )

    log.info('Resource slots: {!r}', slots)
    log.info('Slot types: {!r}', known_slot_types)
    return compute_device_types, slots
예제 #30
0
def test_fraction_alloc_map_many_device():
    """Allocating across eight single-unit devices with the FILL strategy
    must saturate devices in order, reject over-allocation without side
    effects, and return every device to zero after freeing."""
    device_count = 8
    slot = SlotName('x')
    alloc_map = FractionAllocMap(
        device_slots={
            DeviceId(f'a{n}'): DeviceSlotInfo(
                SlotTypes.COUNT, SlotName('x'), Decimal(1.0))
            for n in range(device_count)
        },
        allocation_strategy=FractionAllocationStrategy.FILL,
    )
    # Every device starts with nothing allocated.
    assert all(
        alloc_map.allocations[slot][DeviceId(f'a{n}')] == Decimal('0')
        for n in range(device_count))

    # 7.95 units fill the first seven devices completely; the remaining
    # 0.95 lands on the last device under the FILL strategy.
    result = alloc_map.allocate({
        SlotName('x'): Decimal('7.95'),
    })
    expected = [Decimal('1.0')] * (device_count - 1) + [Decimal('0.95')]
    for n, amount in enumerate(expected):
        assert result[slot][DeviceId(f'a{n}')] == amount
        assert alloc_map.allocations[slot][DeviceId(f'a{n}')] == amount

    # Only 0.05 remains in total, so a full extra unit must be refused,
    # and the failed attempt must not disturb existing allocations.
    with pytest.raises(InsufficientResource):
        alloc_map.allocate({
            SlotName('x'): Decimal('1.0'),
        })
    for n, amount in enumerate(expected):
        assert alloc_map.allocations[slot][DeviceId(f'a{n}')] == amount

    # Freeing the successful allocation restores all devices to zero.
    alloc_map.free(result)
    assert all(
        alloc_map.allocations[slot][DeviceId(f'a{n}')] == Decimal('0')
        for n in range(device_count))