def destroy(self):
    """
    Tear down the DCGMMonitor and release its resources.

    This must be called so the embedded DCGM engine is shut down
    cleanly before the base monitor is destroyed.
    """
    # Shut DCGM down first, then let the parent class release its own
    # resources.
    dcgm_agent.dcgmShutdown()
    super().destroy()
def destroy(self):
    """
    Destroy the DCGMMonitor. This function must be called in order to
    appropriately deallocate the resources.
    """
    dcgm_agent.dcgmShutdown()
    # Stop the pool's workers immediately, then join so they are
    # actually reaped. NOTE(review): the original called close() after
    # terminate(), which is a no-op — close() only acts on a running
    # pool — and never joined, leaving worker cleanup to GC.
    self._thread_pool.terminate()
    self._thread_pool.join()
def __init__(self, gpus, frequency, metrics, dcgmPath=None):
    """
    Initialize DCGM and set up a watcher group over the given GPUs.

    Parameters
    ----------
    gpus : list of GPUDevice
        The gpus to be monitored
    frequency : int
        Sampling frequency for the metric
    metrics : list
        List of Record types to monitor
    dcgmPath : str (optional)
        DCGM installation path

    Raises
    ------
    TritonModelAnalyzerException
        If one of the requested metrics has no DCGM field mapping.
    """
    super().__init__(frequency, metrics)
    # Load the DCGM shared library (optionally from dcgmPath) and
    # initialize the DCGM agent before any other DCGM call.
    structs._dcgmInit(dcgmPath)
    dcgm_agent.dcgmInit()
    self._gpus = gpus

    # Start DCGM in the embedded mode to use the shared library
    self.dcgm_handle = dcgm_handle = dcgm_agent.dcgmStartEmbedded(
        structs.DCGM_OPERATION_MODE_MANUAL)

    # Create DCGM monitor group (empty at creation; devices added below)
    self.group_id = dcgm_agent.dcgmGroupCreate(dcgm_handle,
                                               structs.DCGM_GROUP_EMPTY,
                                               "triton-monitor")
    # Add the GPUs to the group
    for gpu in self._gpus:
        dcgm_agent.dcgmGroupAddDevice(dcgm_handle, self.group_id,
                                      gpu.device_id())
    # Rebinds the parameter name: scales self._frequency by 1000 for the
    # watcher's update interval. Presumably converting to DCGM's expected
    # time unit — TODO confirm the units of self._frequency.
    frequency = int(self._frequency * 1000)
    fields = []
    try:
        # Map each Model Analyzer metric type to its DCGM field id.
        for metric in metrics:
            fields.append(self.model_analyzer_to_dcgm_field[metric])
    except KeyError:
        # `metric` still holds the unmapped entry here (Python keeps the
        # loop variable after the loop body raises). Shut DCGM down
        # before surfacing the error.
        dcgm_agent.dcgmShutdown()
        raise TritonModelAnalyzerException(
            f'{metric} is not supported by Model Analyzer DCGM Monitor')
    self.dcgm_field_group_id = dcgm_agent.dcgmFieldGroupCreate(
        dcgm_handle, fields, 'triton-monitor')
    # Watch the field group in manual mode; trailing args are the update
    # interval computed above, max keep age (3600), and two zeros whose
    # meaning follows DcgmFieldGroupWatcher's signature.
    self.group_watcher = dcgm_field_helpers.DcgmFieldGroupWatcher(
        dcgm_handle, self.group_id, self.dcgm_field_group_id.value,
        structs.DCGM_OPERATION_MODE_MANUAL, frequency, 3600, 0, 0)
def create_device_by_uuid(uuid, dcgmPath=None):
    """
    Create a GPU device using the GPU uuid.

    Parameters
    ----------
    uuid : str
        UUID string of the GPU device to look up.
    dcgmPath : str (optional)
        DCGM installation path.

    Returns
    -------
    Device
        The device associated with the uuid.

    Raises
    ------
    TritonModelAnalyzerException
        If the uuid does not exist this exception will be raised.
    """
    structs._dcgmInit(dcgmPath)
    dcgm_agent.dcgmInit()

    # Start DCGM in the embedded mode to use the shared library
    dcgm_handle = dcgm_agent.dcgmStartEmbedded(
        structs.DCGM_OPERATION_MODE_MANUAL)

    # The target uuid is loop-invariant; encode it once up front.
    uuid_bytes = bytes(uuid, encoding='ascii')

    gpu_devices = dcgm_agent.dcgmGetAllSupportedDevices(dcgm_handle)
    for gpu_device in gpu_devices:
        device_attributes = dcgm_agent.dcgmGetDeviceAttributes(
            dcgm_handle, gpu_device).identifiers
        # Normalize the bus id to upper-case ASCII bytes, matching the
        # convention used elsewhere in this module.
        pci_bus_id = bytes(
            device_attributes.pciBusId.decode('ascii').upper(),
            encoding='ascii')
        device_uuid = device_attributes.uuid
        if uuid_bytes == device_uuid:
            device = GPUDevice(gpu_device, pci_bus_id, device_uuid)
            # Shut the embedded engine down before returning.
            dcgm_agent.dcgmShutdown()
            return device

    # No match found: still shut DCGM down before raising.
    dcgm_agent.dcgmShutdown()
    raise TritonModelAnalyzerException(f'GPU UUID {uuid} was not found.')
def create_device_by_bus_id(bus_id, dcgmPath=None):
    """
    Look up the DCGM-supported GPU sitting on the given PCI bus.

    Parameters
    ----------
    bus_id : bytes
        Bus id corresponding to the GPU, as ASCII-encoded bytes of the
        upper-case colon separated hex notation (e.g. the bytes form of
        "00:65:00").
    dcgmPath : str (optional)
        DCGM installation path.

    Returns
    -------
    Device
        The device associated with this bus id.

    Raises
    ------
    TritonModelAnalyzerException
        If no DCGM-supported GPU matches the bus id.
    """
    structs._dcgmInit(dcgmPath)
    dcgm_agent.dcgmInit()

    # Start DCGM in the embedded mode to use the shared library
    handle = dcgm_agent.dcgmStartEmbedded(structs.DCGM_OPERATION_MODE_MANUAL)

    # Scan every supported device until one matches the requested bus id.
    for candidate in dcgm_agent.dcgmGetAllSupportedDevices(handle):
        identifiers = dcgm_agent.dcgmGetDeviceAttributes(
            handle, candidate).identifiers
        candidate_bus_id = bytes(
            identifiers.pciBusId.decode('ascii').upper(), encoding='ascii')
        device_uuid = identifiers.uuid
        if candidate_bus_id == bus_id:
            device = GPUDevice(candidate, bus_id, device_uuid)
            dcgm_agent.dcgmShutdown()
            return device

    # Nothing matched; shut DCGM down before surfacing the error.
    dcgm_agent.dcgmShutdown()
    raise TritonModelAnalyzerException(
        f'GPU with {bus_id} bus id is not supported by DCGM.')
def init_all_devices(self, dcgmPath=None):
    """
    Create GPUDevice objects for all DCGM visible devices.

    Populates self._devices and the bus-id / uuid lookup maps.

    Parameters
    ----------
    dcgmPath : str
        Absolute path to dcgm shared library
    """
    # Only enumerate when CUDA is available; otherwise the device lists
    # are left untouched.
    if numba.cuda.is_available():
        logger.info("Initializing GPUDevice handles...")
        structs._dcgmInit(dcgmPath)
        dcgm_agent.dcgmInit()

        # Start DCGM in the embedded mode to use the shared library
        dcgm_handle = dcgm_agent.dcgmStartEmbedded(
            structs.DCGM_OPERATION_MODE_MANUAL)

        # Create a GPU device for every supported DCGM device
        dcgm_device_ids = dcgm_agent.dcgmGetAllSupportedDevices(dcgm_handle)

        for device_id in dcgm_device_ids:
            device_attributes = dcgm_agent.dcgmGetDeviceAttributes(
                dcgm_handle, device_id).identifiers
            pci_bus_id = device_attributes.pciBusId.decode('utf-8').upper()
            device_uuid = str(device_attributes.uuid, encoding='utf-8')
            device_name = str(device_attributes.deviceName, encoding='utf-8')
            gpu_device = GPUDevice(device_name, device_id, pci_bus_id,
                                   device_uuid)

            # Index the device by position, bus id, and uuid.
            self._devices.append(gpu_device)
            self._devices_by_bus_id[pci_bus_id] = gpu_device
            self._devices_by_uuid[device_uuid] = gpu_device

        dcgm_agent.dcgmShutdown()