def test_nvidia_device(idx: int):
    from py3nvml import py3nvml as nvml

    handle = nvml.nvmlDeviceGetHandleByIndex(idx)
    pciInfo = nvml.nvmlDeviceGetPciInfo(handle)
    brands = {
        nvml.NVML_BRAND_UNKNOWN: "Unknown",
        nvml.NVML_BRAND_QUADRO: "Quadro",
        nvml.NVML_BRAND_TESLA: "Tesla",
        nvml.NVML_BRAND_NVS: "NVS",
        nvml.NVML_BRAND_GRID: "Grid",
        nvml.NVML_BRAND_GEFORCE: "GeForce"
    }

    inspect(
        idx=idx,
        # id=pciInfo.busId,
        # uuid=nvml.nvmlDeviceGetUUID(handle),
        name=nvml.nvmlDeviceGetName(handle),
        # brand=brands[nvml.nvmlDeviceGetBrand(handle)],
        # multi_gpu=nvml.nvmlDeviceGetMultiGpuBoard(handle),
        # pcie_link=nvml.nvmlDeviceGetCurrPcieLinkWidth(handle),
        fan=nvml.nvmlDeviceGetFanSpeed(handle),
        # power=nvml.nvmlDeviceGetPowerState(handle),
        mem_total=nvml.nvmlDeviceGetMemoryInfo(handle).total,
        mem_used=nvml.nvmlDeviceGetMemoryInfo(handle).used,
        util_gpu=nvml.nvmlDeviceGetUtilizationRates(handle).gpu,
        # util_mem=nvml.nvmlDeviceGetUtilizationRates(handle).memory,
        temp=nvml.nvmlDeviceGetTemperature(handle, nvml.NVML_TEMPERATURE_GPU),
        power=nvml.nvmlDeviceGetPowerUsage(handle),
        power_limit=nvml.nvmlDeviceGetPowerManagementLimit(handle),
        # display=nvml.nvmlDeviceGetDisplayMode(handle),
        display_active=nvml.nvmlDeviceGetDisplayActive(handle),
    )
    logger.log()

    procs = nvml.nvmlDeviceGetGraphicsRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid), pid=p.pid, mem=p.usedGpuMemory)

    procs = nvml.nvmlDeviceGetComputeRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid), pid=p.pid, mem=p.usedGpuMemory)

    logger.log()
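# A minimal driver sketch (not part of the original snippet): it assumes
# `test_nvidia_device` above and that `inspect`/`logger` are provided by the
# surrounding module. NVML must be initialized before any device query.
def _test_all_nvidia_devices():
    from py3nvml import py3nvml as nvml
    nvml.nvmlInit()
    try:
        for idx in range(nvml.nvmlDeviceGetCount()):
            test_nvidia_device(idx)
    finally:
        nvml.nvmlShutdown()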
def get_stats(self):
    """
    Get system statistics and assign to `self`
    """
    memory_usage = psutil.virtual_memory()
    disk_usage = psutil.disk_usage('/')
    # net = psutil.net_io_counters()

    system = {
        # CPU utilization percent (can be over 100%)
        'cpu': round10e5(self._process.cpu_percent(0.0)),

        # Whole system memory usage
        # 'memory_used': round10e5(memory_usage.used / 1024 / 1024),
        'memory_percent': round10e5(memory_usage.used * 100 / memory_usage.total),

        # Get the portion of memory occupied by a process
        # 'p_memory_rss': round10e5(self._process.memory_info().rss
        #                           / 1024 / 1024),
        'p_memory_percent': round10e5(self._process.memory_percent()),

        # Disk usage
        # 'disk_used': round10e5(disk_usage.used / 1024 / 1024),
        'disk_percent': round10e5(disk_usage.percent),
    }

    # Collect GPU statistics
    gpus = []
    try:
        gpu_device_count = nvml.nvmlDeviceGetCount()
        for i in range(gpu_device_count):
            handle = nvml.nvmlDeviceGetHandleByIndex(i)
            nvml_tmp = nvml.NVML_TEMPERATURE_GPU

            # Get device memory and temperature
            util = nvml.nvmlDeviceGetUtilizationRates(handle)
            memory = nvml.nvmlDeviceGetMemoryInfo(handle)
            temp = nvml.nvmlDeviceGetTemperature(handle, nvml_tmp)

            # Compute power usage in watts and percent
            power_watts = nvml.nvmlDeviceGetPowerUsage(handle) / 1000
            power_cap = nvml.nvmlDeviceGetEnforcedPowerLimit(handle)
            power_cap_watts = power_cap / 1000
            power_usage = power_watts / power_cap_watts * 100

            gpus.append({
                # GPU utilization percent
                'gpu': round10e5(util.gpu),

                # Device memory usage
                # 'memory_used': round10e5(memory.used / 1024 / 1024),
                'gpu_memory_percent': round10e5(memory.used * 100 / memory.total),

                # Power usage in watts and percent
                'gpu_power_watts': round10e5(power_watts),
                # 'power_percent': round10e5(power_usage),

                # Device temperature
                'gpu_temp': round10e5(temp),
            })
    except Exception:
        pass

    return system, gpus
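# Hedged sketch of the pieces `get_stats` above depends on but which are not
# shown here: `round10e5` (assumed, from its name, to round using a 10e5
# factor), a `psutil.Process` stored on `self._process`, and an already
# initialized `nvml` module. The names below are illustrative assumptions.
import psutil
from py3nvml import py3nvml as nvml


def round10e5(value):
    # assumed helper: limit the value to the precision implied by its name
    return round(value * 10e5) / 10e5


class ResourceTracker:
    def __init__(self):
        self._process = psutil.Process()  # track the current process


ResourceTracker.get_stats = get_stats  # attach the method defined above

nvml.nvmlInit()
tracker = ResourceTracker()
system, gpus = tracker.get_stats()
nvml.nvmlShutdown()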
def power_usage(self, hnd):
    # NVML reports power draw in milliwatts; convert to watts
    return nv.nvmlDeviceGetPowerUsage(hnd) / 1000
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 / 1024)
        process['pid'] = nv_process.pid
        return process

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # for python3, to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    processes = []
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None  # Not supported (in both cases)
    else:
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in (nv_comp_processes + nv_graphics_processes):
            # TODO: could be more information such as system memory usage,
            # CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': int(power / 1000) if power is not None else None,
        'enforced.power.limit': int(power_limit / 1000) if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': int(memory.used / 1024 / 1024) if memory else None,
        'memory.total': int(memory.total / 1024 / 1024) if memory else None,
        'processes': processes,
    }
    return gpu_info
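# Hedged usage sketch for `get_gpu_info` above. It assumes `N` is the pynvml
# (or py3nvml) module imported by the surrounding code, along with `psutil`
# and `os`; the caller is responsible for NVML initialization and shutdown.
import pynvml as N

N.nvmlInit()
try:
    for index in range(N.nvmlDeviceGetCount()):
        handle = N.nvmlDeviceGetHandleByIndex(index)
        info = get_gpu_info(handle)
        print(info['index'], info['name'],
              info['utilization.gpu'], info['power.draw'], info['memory.used'])
finally:
    N.nvmlShutdown()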
def _get_power_usage_watts(gpu):
    return {'power_watts': pynvml.nvmlDeviceGetPowerUsage(gpu) / 1000.0}
def getGpuInfo(self):
    if (self._impulse % 2) != 0:
        return self._gpuInfoObj

    try:
        N.nvmlInit()
        gpuInfoObj = {}
        driverVersion = N.nvmlSystemGetDriverVersion()
        deviceCnt = N.nvmlDeviceGetCount()
        gpuInfoObj['DRIVER_VERSION'] = driverVersion
        gpuInfoObj['DEVICE_COUNT'] = deviceCnt

        for dCnt in range(deviceCnt):
            deviceInfoObj = {}
            handle = N.nvmlDeviceGetHandleByIndex(dCnt)
            name = N.nvmlDeviceGetName(handle)
            try:
                fan = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan = 'N/A'
            try:
                temp = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temp = 'N/A'
            try:
                powerUsage = round(N.nvmlDeviceGetPowerUsage(handle) / 1000)
            except N.NVMLError:
                powerUsage = 'N/A'
            try:
                powerLimit = round(N.nvmlDeviceGetPowerManagementLimit(handle) / 1000)
            except N.NVMLError:
                powerLimit = 'N/A'
            try:
                memInfo = N.nvmlDeviceGetMemoryInfo(handle)
                memUsage = round(memInfo.used / 1024 / 1024)
                memTotal = round(memInfo.total / 1024 / 1024)
            except N.NVMLError:
                memUsage = 'N/A'
                memTotal = 'N/A'
            try:
                util = N.nvmlDeviceGetUtilizationRates(handle).gpu
            except N.NVMLError:
                util = 'N/A'

            deviceInfoObj['NAME'] = name
            deviceInfoObj['FAN'] = fan
            deviceInfoObj['TEMP'] = temp
            deviceInfoObj['POWER_USAGE'] = powerUsage
            deviceInfoObj['POWER_LIMIT'] = powerLimit
            deviceInfoObj['MEM_USAGE'] = memUsage
            deviceInfoObj['MEM_TOTAL'] = memTotal
            deviceInfoObj['UTIL'] = util

            gpuProcessObj = {}
            try:
                processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                processes = []
            for pCnt, process in enumerate(processes):
                gpuMem = round(process.usedGpuMemory / 1024 / 1024)
                pid = process.pid
                try:
                    p = psutil.Process(pid)
                    attrs = p.as_dict(attrs=['name', 'username', 'status'])
                except psutil.ZombieProcess:
                    attrs = {'name': 'unknown', 'username': '******', 'status': 'zombie'}
                except psutil.Error:
                    # process disappeared between the NVML and psutil lookups; skip it
                    continue
                gpuProcessObj[str(pCnt)] = {
                    'PID': pid,
                    'MEM': gpuMem,
                    'NAME': attrs['name'],
                    'USERNAME': self._getSubuidName(attrs['username']),
                    'STATUS': attrs['status']
                }
            deviceInfoObj['PROCESS'] = gpuProcessObj
            gpuInfoObj[str(dCnt)] = deviceInfoObj
        N.nvmlShutdown()
    except N.NVMLError as err:
        N.nvmlShutdown()
        print(err)
        gpuInfoObj = {}

    self._gpuInfoObj = gpuInfoObj
    return gpuInfoObj
class NvidiaDeviceStatistics(Callback):
    reportable_values = dict(
        memory_total=lambda handle: _bytes_to_megabytes(
            nvml.nvmlDeviceGetMemoryInfo(handle).total),
        memory_used=lambda handle: _bytes_to_megabytes(
            nvml.nvmlDeviceGetMemoryInfo(handle).used),
        memory_free=lambda handle: _bytes_to_megabytes(
            nvml.nvmlDeviceGetMemoryInfo(handle).total
            - nvml.nvmlDeviceGetMemoryInfo(handle).used),
        temperature=lambda handle: nvml.nvmlDeviceGetTemperature(
            handle, nvml.NVML_TEMPERATURE_GPU),
        power_state=lambda handle: nvml.nvmlDeviceGetPowerState(handle),
        power_draw=lambda handle: nvml.nvmlDeviceGetPowerUsage(handle) / 1000.0,
        utilization_gpu=lambda handle: nvml.nvmlDeviceGetUtilizationRates(handle).gpu,
        utilization_memory=lambda handle: nvml.nvmlDeviceGetUtilizationRates(handle).memory,
    )

    def __init__(self, report=None, devices=None, quiet=False,
                 always_suffix=False, output=print, verbose_once=True):
        super(self.__class__, self).__init__()
        global nvml
        self.output = output
        if nvml is not None:
            try:
                nvml.nvmlInit()
            except (OSError, nvml.NVMLError_LibraryNotFound):
                # the python library might be installed, but not the drivers...
                nvml = None
        if nvml is None:
            if not quiet:
                self.output(
                    "Could not load py3nvml, cannot report any nvidia device statistics.")
            report = []
        else:
            device_count = nvml.nvmlDeviceGetCount()
            if devices is None:
                devices = list(range(device_count))
            else:
                devices = [
                    int(device) for device in devices
                    if 0 <= int(device) < device_count
                ]
            self.devices = devices
            self.deviceHandles = [
                nvml.nvmlDeviceGetHandleByIndex(device) for device in devices
            ]
            if not quiet:
                for n, handle in enumerate(self.deviceHandles):
                    self.output("Collecting statistics for device #% 2d: %s" %
                                (n, nvml.nvmlDeviceGetName(handle)))

        if report is None:
            report = ['temperature', 'utilization_gpu']
        elif report == 'all':
            report = list(self.reportable_values.keys())

        self.verbose_once = verbose_once
        self.report = report
        self.always_suffix = always_suffix

    def __del__(self):
        if nvml:
            try:
                nvml.nvmlShutdown()
            except Exception:
                pass

    def on_epoch_end(self, epoch, logs=None):
        for item in self.report:
            try:
                suffix = handle = None
                for n, handle in enumerate(self.deviceHandles):
                    if len(self.deviceHandles) == 1 and not self.always_suffix:
                        suffix = ''
                    else:
                        # TODO: this will not work nicely if more than 100 GPUs are in one system
                        suffix = '_%02d' % (n,)
                    logs[item + suffix] = np.float32(
                        self.reportable_values[item](handle))
            except nvml.NVMLError as err:
                self.output("Error trying to read out value from NVML: %r" % (err,))

        if self.report and self.verbose_once:
            self.output("Current status for device #% 2d (%s): %r" %
                        (n, nvml.nvmlDeviceGetName(handle), {
                            what: float(call(handle))
                            for what, call in self.reportable_values.items()
                        }))
            self.verbose_once = False  # only print once
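# Hedged usage sketch for `NvidiaDeviceStatistics` above: as a Keras-style
# `Callback` it can be passed to `fit`, after which the reported values appear
# in the per-epoch logs (and thus in other callbacks such as CSVLogger).
# `model`, `x_train`, and `y_train` are placeholders, not defined in this snippet.
gpu_stats = NvidiaDeviceStatistics(report='all', always_suffix=True)
model.fit(x_train, y_train, epochs=10, callbacks=[gpu_stats])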