def test_nvidia_device(idx: int):
    from py3nvml import py3nvml as nvml

    handle = nvml.nvmlDeviceGetHandleByIndex(idx)
    pciInfo = nvml.nvmlDeviceGetPciInfo(handle)

    brands = {
        nvml.NVML_BRAND_UNKNOWN: "Unknown",
        nvml.NVML_BRAND_QUADRO: "Quadro",
        nvml.NVML_BRAND_TESLA: "Tesla",
        nvml.NVML_BRAND_NVS: "NVS",
        nvml.NVML_BRAND_GRID: "Grid",
        nvml.NVML_BRAND_GEFORCE: "GeForce",
    }

    inspect(
        idx=idx,
        # id=pciInfo.busId,
        # uuid=nvml.nvmlDeviceGetUUID(handle),
        name=nvml.nvmlDeviceGetName(handle),
        # brand=brands[nvml.nvmlDeviceGetBrand(handle)],
        # multi_gpu=nvml.nvmlDeviceGetMultiGpuBoard(handle),
        # pcie_link=nvml.nvmlDeviceGetCurrPcieLinkWidth(handle),
        fan=nvml.nvmlDeviceGetFanSpeed(handle),
        # power=nvml.nvmlDeviceGetPowerState(handle),
        mem_total=nvml.nvmlDeviceGetMemoryInfo(handle).total,
        mem_used=nvml.nvmlDeviceGetMemoryInfo(handle).used,
        util_gpu=nvml.nvmlDeviceGetUtilizationRates(handle).gpu,
        # util_mem=nvml.nvmlDeviceGetUtilizationRates(handle).memory,
        temp=nvml.nvmlDeviceGetTemperature(handle, nvml.NVML_TEMPERATURE_GPU),
        power=nvml.nvmlDeviceGetPowerUsage(handle),
        power_limit=nvml.nvmlDeviceGetPowerManagementLimit(handle),
        # display=nvml.nvmlDeviceGetDisplayMode(handle),
        display_active=nvml.nvmlDeviceGetDisplayActive(handle),
    )
    logger.log()

    procs = nvml.nvmlDeviceGetGraphicsRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid), pid=p.pid, mem=p.usedGpuMemory)

    procs = nvml.nvmlDeviceGetComputeRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid), pid=p.pid, mem=p.usedGpuMemory)

    logger.log()
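# A minimal, hypothetical driver for test_nvidia_device (not part of the original
# snippet): the function assumes NVML is already initialized, so the caller has to
# pair nvmlInit()/nvmlShutdown() around it. probe_all_nvidia_devices is an
# illustrative name introduced here.
def probe_all_nvidia_devices():
    from py3nvml import py3nvml as nvml

    nvml.nvmlInit()
    try:
        for i in range(nvml.nvmlDeviceGetCount()):
            test_nvidia_device(i)
    finally:
        nvml.nvmlShutdown()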
def get_device_procs(self, device_id: int) -> Optional[List[ProcInfo]]:
    """
    List processes running on the GPU.

    Parameters
    ----------
    device_id : int
        Device identifier

    Returns
    -------
    Optional[List[ProcInfo]]
        List of ProcInfo named tuples (name, pid, mem fields)

    Raises
    ------
    RuntimeError
        In case of py3nvml failure.
    """
    py3nvml.nvmlInit()
    result = []
    try:
        dev_count = py3nvml.nvmlDeviceGetCount()  # type: int
        if not (0 <= device_id < dev_count):
            raise RuntimeError('Failed to query GPU with nvml')
        handle = py3nvml.nvmlDeviceGetHandleByIndex(device_id)
        for proc in py3nvml.nvmlDeviceGetComputeRunningProcesses(handle):
            try:
                name = str(py3nvml.nvmlSystemGetProcessName(proc.pid))
            except py3nvml.NVMLError as err:
                if err.value == py3nvml.NVML_ERROR_NOT_FOUND:
                    # The process exited between the two NVML calls; skip it.
                    continue
                raise
            mem = proc.usedGpuMemory / 1024 / 1024  # bytes -> MiB
            result.append(ProcInfo(name, proc.pid, mem))
    finally:
        py3nvml.nvmlShutdown()
    return result
def update(self):
    self.processes = [
        Process(p)
        for p in py3nvml.nvmlDeviceGetComputeRunningProcesses(self.handle)
    ]
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 / 1024)
        process['pid'] = nv_process.pid
        return process

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # for python3, to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None  # Not supported

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None  # Not supported

    processes = []
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None  # Not supported (in both cases)
    else:
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in (nv_comp_processes + nv_graphics_processes):
            # TODO: could be more information such as system memory usage,
            # CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': int(power / 1000) if power is not None else None,
        'enforced.power.limit': int(power_limit / 1000) if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': int(memory.used / 1024 / 1024) if memory else None,
        'memory.total': int(memory.total / 1024 / 1024) if memory else None,
        'processes': processes,
    }
    return gpu_info
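# A hypothetical driver for get_gpu_info (not part of the original code). It assumes
# `N` is the same module-level NVML alias the function already relies on, e.g. from
# `import pynvml as N` (py3nvml exposes the same API). NVML must be initialized
# before handles are created and shut down afterwards.
def print_all_gpu_info():
    N.nvmlInit()
    try:
        for i in range(N.nvmlDeviceGetCount()):
            handle = N.nvmlDeviceGetHandleByIndex(i)
            print(get_gpu_info(handle))
    finally:
        N.nvmlShutdown()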
def getGpuInfo(self):
    if (self._impulse % 2) != 0:
        return self._gpuInfoObj

    try:
        N.nvmlInit()
        gpuInfoObj = {}
        driverVersion = N.nvmlSystemGetDriverVersion()
        deviceCnt = N.nvmlDeviceGetCount()

        gpuInfoObj['DRIVER_VERSION'] = driverVersion
        gpuInfoObj['DEVICE_COUNT'] = deviceCnt

        for dCnt in range(deviceCnt):
            deviceInfoObj = {}
            handle = N.nvmlDeviceGetHandleByIndex(dCnt)
            name = N.nvmlDeviceGetName(handle)
            try:
                fan = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan = 'N/A'
            try:
                temp = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temp = 'N/A'
            try:
                powerUsage = round(N.nvmlDeviceGetPowerUsage(handle) / 1000)
            except N.NVMLError:
                powerUsage = 'N/A'
            try:
                powerLimit = round(N.nvmlDeviceGetPowerManagementLimit(handle) / 1000)
            except N.NVMLError:
                powerLimit = 'N/A'
            try:
                memInfo = N.nvmlDeviceGetMemoryInfo(handle)
                memUsage = round(memInfo.used / 1024 / 1024)
                memTotal = round(memInfo.total / 1024 / 1024)
            except N.NVMLError:
                memUsage = 'N/A'
                memTotal = 'N/A'
            try:
                util = N.nvmlDeviceGetUtilizationRates(handle).gpu
            except N.NVMLError:
                util = 'N/A'

            deviceInfoObj['NAME'] = name
            deviceInfoObj['FAN'] = fan
            deviceInfoObj['TEMP'] = temp
            deviceInfoObj['POWER_USAGE'] = powerUsage
            deviceInfoObj['POWER_LIMIT'] = powerLimit
            deviceInfoObj['MEM_USAGE'] = memUsage
            deviceInfoObj['MEM_TOTAL'] = memTotal
            deviceInfoObj['UTIL'] = util

            gpuProcessObj = {}
            try:
                processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                processes = []
            for pCnt, process in enumerate(processes):
                gpuMem = round(process.usedGpuMemory / 1024 / 1024)
                pid = process.pid
                try:
                    p = psutil.Process(pid)
                    attrs = p.as_dict(attrs=['name', 'username', 'status'])
                except psutil.ZombieProcess:
                    attrs = {'name': 'unknown', 'username': '******', 'status': 'zombie'}
                except psutil.Error:
                    # The process exited or cannot be inspected; skip it rather than
                    # reporting attrs left over from a previous iteration.
                    continue
                gpuProcessObj[str(pCnt)] = {
                    'PID': pid,
                    'MEM': gpuMem,
                    'NAME': attrs['name'],
                    'USERNAME': self._getSubuidName(attrs['username']),
                    'STATUS': attrs['status']
                }
            deviceInfoObj['PROCESS'] = gpuProcessObj
            gpuInfoObj[str(dCnt)] = deviceInfoObj
        N.nvmlShutdown()
    except N.NVMLError as err:
        N.nvmlShutdown()
        print(err)
        gpuInfoObj = {}

    self._gpuInfoObj = gpuInfoObj
    return gpuInfoObj