def get_gpu_info_by_nvml(self) -> Dict:
    """Get GPU info using nvml.

    Returns:
        A dict with ``"driver_version"`` and ``"devices"`` (one dict per
        device holding ``"memory_total"``, ``"memory_available"`` and
        ``"name"``), plus ``"cuda_visible"`` when ``CUDA_VISIBLE_DEVICES``
        is present in the environment.

        ``None`` is returned when any NVML call raises ``NVMLError``; the
        error is logged through ``self.logger`` unless ``self.silent`` is
        truthy.
    """
    gpu_info_list = []
    driver_version = None
    try:
        nvmlInit()
        try:
            driver_version = nvmlSystemGetDriverVersion()
            for index in range(nvmlDeviceGetCount()):
                handle = nvmlDeviceGetHandleByIndex(index)
                info = nvmlDeviceGetMemoryInfo(handle)
                gpu_info_list.append({
                    "memory_total": info.total,
                    "memory_available": info.free,
                    "name": nvmlDeviceGetName(handle),
                })
        finally:
            # Pair every successful nvmlInit() with nvmlShutdown(), even when
            # one of the queries above fails part-way through.
            nvmlShutdown()
    except NVMLError as error:
        if not self.silent:
            self.logger.error(
                "Error fetching GPU information using nvml: %s", error)
        return None

    result = {"driver_version": driver_version, "devices": gpu_info_list}
    if 'CUDA_VISIBLE_DEVICES' in environ:
        result["cuda_visible"] = environ['CUDA_VISIBLE_DEVICES']
    return result
def get_gpu_info() -> Tuple[Optional[str], Optional[List[GpuInfo]]]:
    """
    Get driver version and list of ``GpuInfo``, if available.

    Returns ``(None, None)`` when ``nvmlInit()`` raises ``NVMLError``.
    Otherwise returns the driver version together with one ``GpuInfo`` per
    device index. Per-device queries go through ``try_get_info`` (defined
    elsewhere; presumably it returns ``default`` when the query fails --
    confirm against its definition). Missing memory or utilization readings
    are reported as ``0``.
    """
    try:
        nvml.nvmlInit()
    except nvml.NVMLError:
        # Not available.
        return None, None

    try:
        driver_version: str = nvml.nvmlSystemGetDriverVersion()
        gpus: List[GpuInfo] = []
        device_count: int = nvml.nvmlDeviceGetCount()
        for i in range(device_count):
            handle = nvml.nvmlDeviceGetHandleByIndex(i)
            name = try_get_info(nvml.nvmlDeviceGetName, handle)
            fan_speed = try_get_info(
                nvml.nvmlDeviceGetFanSpeed, handle, default=0)
            temp = try_get_info(
                lambda h: nvml.nvmlDeviceGetTemperature(
                    h, nvml.NVML_TEMPERATURE_GPU),
                handle,
                default=0,
            )

            mem_info = try_get_info(nvml.nvmlDeviceGetMemoryInfo, handle)
            # ``>> 20`` divides the reported values by 2**20.
            mem_used = mem_info.used >> 20 if mem_info else 0
            mem_total = mem_info.total >> 20 if mem_info else 0

            util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle)
            gpu_util = util.gpu if util else 0

            gpus.append(
                GpuInfo(
                    id=i,
                    name=name,
                    mem_usage=mem_used,
                    mem_capacity=mem_total,
                    utilization=gpu_util,
                    temp=temp,
                    fan=fan_speed,
                ))
    finally:
        # The unguarded calls above (driver version, device count, handle
        # lookup) can raise; shut NVML down regardless.
        nvml.nvmlShutdown()
    return driver_version, gpus
def test_nvidia():
    """Exercise the py3nvml bindings: free GPUs, driver version, each device.

    ``inspect`` and ``test_nvidia_device`` are helpers defined elsewhere in
    the project.
    """
    # pip install py3nvml
    import py3nvml
    from py3nvml import py3nvml as nvml

    inspect(py3nvml.get_free_gpus())
    nvml.nvmlInit()
    try:
        inspect(version=nvml.nvmlSystemGetDriverVersion())
        # Query the device count once and reuse it for the loop.
        count = nvml.nvmlDeviceGetCount()
        inspect(count=count)
        for i in range(count):
            test_nvidia_device(i)
    finally:
        # Release NVML even when one of the per-device checks fails.
        nvml.nvmlShutdown()
def __init__(self):
    """Cache the driver version, device handles and per-GPU label values.

    ``self.args`` ends up holding one ``OrderedDict`` per device with the
    keys listed in ``self.labels`` (``gpu``, ``name``, ``driver``), in that
    order.
    """
    self.labels = ['gpu', 'name', 'driver']
    self.driver = nv.nvmlSystemGetDriverVersion()
    self.n_gpu = nv.nvmlDeviceGetCount()

    # Resolve every handle before any device name is queried.
    handles = []
    for index in range(self.n_gpu):
        handles.append(nv.nvmlDeviceGetHandleByIndex(index))
    self.hnds = handles

    self.args = []
    for index, handle in enumerate(self.hnds):
        label_values = OrderedDict((
            ('gpu', 'gpu%d' % index),
            ('name', nv.nvmlDeviceGetName(handle)),
            ('driver', self.driver),
        ))
        self.args.append(label_values)
def gpu_status():
    """Return driver version and per-device clock/memory data, or ``None``.

    The result is ``{'driver_version': ..., 'devices': [...]}`` where each
    device dict has the keys ``name``, ``clock``, ``clock_mem``,
    ``clock_max``, ``clock_mem_max`` and ``memory``. ``memory`` is the
    device's total memory divided by 2**30 and rounded to two decimals, or
    whatever falsy value ``_nmvl_call`` produced when the query yielded
    nothing.

    This is a best-effort probe: any exception results in ``None``. NVML is
    shut down again before returning, so each call leaves the NVML
    init/shutdown pairing balanced.
    """
    try:
        py3nvml.nvmlInit()
    except Exception:
        # NVML unavailable (no driver, no library, ...): nothing to report
        # and nothing to shut down.
        return None

    try:
        devices = []
        for i in range(py3nvml.nvmlDeviceGetCount()):
            handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
            memory = _nmvl_call(
                partial(py3nvml.nvmlDeviceGetMemoryInfo, handle))
            if memory:
                memory = round(memory.total * 1.0 / 2**30, 2)
            devices.append({
                'name': _nmvl_call(
                    partial(py3nvml.nvmlDeviceGetName, handle)),
                'clock': _nmvl_call(
                    partial(py3nvml.nvmlDeviceGetApplicationsClock, handle,
                            py3nvml.NVML_CLOCK_GRAPHICS)),
                'clock_mem': _nmvl_call(
                    partial(py3nvml.nvmlDeviceGetApplicationsClock, handle,
                            py3nvml.NVML_CLOCK_MEM)),
                'clock_max': _nmvl_call(
                    partial(py3nvml.nvmlDeviceGetMaxClockInfo, handle,
                            py3nvml.NVML_CLOCK_GRAPHICS)),
                'clock_mem_max': _nmvl_call(
                    partial(py3nvml.nvmlDeviceGetMaxClockInfo, handle,
                            py3nvml.NVML_CLOCK_MEM)),
                'memory': memory,
            })
        return {
            'driver_version': py3nvml.nvmlSystemGetDriverVersion(),
            'devices': devices,
        }
    except Exception:
        # Deliberate best-effort: callers treat None as "no GPU status".
        return None
    finally:
        try:
            py3nvml.nvmlShutdown()
        except Exception:
            # A failed shutdown must not discard data already collected.
            pass
def __init__(self):
    """Initialise NVML, store the driver version and an empty ``GpuList``.

    Finishes by calling ``self.update()``.
    """
    py3nvml.nvmlInit()
    version = py3nvml.nvmlSystemGetDriverVersion()
    self.driver_version = version
    gpu_list = GpuList()
    self.gpus = gpu_list
    self.update()
#!/usr/bin/env python3
# Print the NVIDIA driver version and the name of every visible GPU.
#
# Requires the py3nvml package.
# On Python 2, install nvidia-ml-py instead and switch the import below.
from __future__ import print_function

# import pynvml
import py3nvml.py3nvml as pynvml
import datetime

pynvml.nvmlInit()
try:
    print("Driver Version:", pynvml.nvmlSystemGetDriverVersion())
    deviceCount = pynvml.nvmlDeviceGetCount()
    for i in range(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        print("Device {}: {}".format(i, pynvml.nvmlDeviceGetName(handle)))
finally:
    # Release NVML even if one of the queries above raises.
    pynvml.nvmlShutdown()
def _get_driver_version():
    """Return the driver version reported by NVML as a one-entry dict."""
    version = pynvml.nvmlSystemGetDriverVersion()
    return dict(driver_version=version)
def getGpuInfo(self):
    """Return a snapshot of GPU state collected through NVML.

    When ``self._impulse`` is odd, the previously stored
    ``self._gpuInfoObj`` is returned without touching NVML. Otherwise NVML
    is queried, and the result is stored on ``self._gpuInfoObj`` and
    returned.

    Returns:
        dict: ``'DRIVER_VERSION'``, ``'DEVICE_COUNT'`` and one entry per
        device keyed by the device index as a string. Each device entry
        holds ``NAME``, ``FAN``, ``TEMP``, ``POWER_USAGE``, ``POWER_LIMIT``,
        ``MEM_USAGE``, ``MEM_TOTAL``, ``UTIL`` and ``PROCESS``. An
        individual reading that NVML cannot provide is ``'N/A'``.
        ``PROCESS`` maps a running index (as a string) to ``PID``, ``MEM``,
        ``NAME``, ``USERNAME`` and ``STATUS``. An empty dict is returned
        when an ``NVMLError`` interrupts the collection.
    """
    if (self._impulse % 2) != 0:
        return self._gpuInfoObj

    gpuInfoObj = {}
    nvmlReady = False
    try:
        N.nvmlInit()
        nvmlReady = True

        driverVersion = N.nvmlSystemGetDriverVersion()
        deviceCnt = N.nvmlDeviceGetCount()
        gpuInfoObj['DRIVER_VERSION'] = driverVersion
        gpuInfoObj['DEVICE_COUNT'] = deviceCnt

        for dCnt in range(deviceCnt):
            handle = N.nvmlDeviceGetHandleByIndex(dCnt)
            name = N.nvmlDeviceGetName(handle)

            try:
                fan = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan = 'N/A'

            try:
                temp = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temp = 'N/A'

            try:
                powerUsage = round(N.nvmlDeviceGetPowerUsage(handle) / 1000)
            except N.NVMLError:
                powerUsage = 'N/A'

            try:
                powerLimit = round(
                    N.nvmlDeviceGetPowerManagementLimit(handle) / 1000)
            except N.NVMLError:
                powerLimit = 'N/A'

            try:
                memInfo = N.nvmlDeviceGetMemoryInfo(handle)
                memUsage = round(memInfo.used / 1024 / 1024)
                memTotal = round(memInfo.total / 1024 / 1024)
            except N.NVMLError:
                memUsage = 'N/A'
                memTotal = 'N/A'

            try:
                util = N.nvmlDeviceGetUtilizationRates(handle).gpu
            except N.NVMLError:
                util = 'N/A'

            try:
                processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                processes = []

            gpuProcessObj = {}
            for pCnt, process in enumerate(processes):
                usedMem = process.usedGpuMemory
                # The bindings can report None when the per-process figure
                # is unavailable; avoid a TypeError on the division.
                if usedMem is None:
                    gpuMem = 'N/A'
                else:
                    gpuMem = round(usedMem / 1024 / 1024)
                pid = process.pid
                try:
                    p = psutil.Process(pid)
                    attrs = p.as_dict(attrs=['name', 'username', 'status'])
                except psutil.ZombieProcess:
                    attrs = {'name': 'unknown', 'username': '******',
                             'status': 'zombie'}
                except Exception:
                    # Best-effort: the process may have exited or be
                    # inaccessible. Use placeholders rather than leaving
                    # ``attrs`` unbound or stale from the previous process.
                    attrs = {'name': 'unknown', 'username': '******',
                             'status': 'unknown'}
                gpuProcessObj[str(pCnt)] = {
                    'PID': pid,
                    'MEM': gpuMem,
                    'NAME': attrs['name'],
                    'USERNAME': self._getSubuidName(attrs['username']),
                    'STATUS': attrs['status'],
                }

            gpuInfoObj[str(dCnt)] = {
                'NAME': name,
                'FAN': fan,
                'TEMP': temp,
                'POWER_USAGE': powerUsage,
                'POWER_LIMIT': powerLimit,
                'MEM_USAGE': memUsage,
                'MEM_TOTAL': memTotal,
                'UTIL': util,
                'PROCESS': gpuProcessObj,
            }
    except N.NVMLError as err:
        print(err)
        gpuInfoObj = {}
    finally:
        # Only shut down what was actually initialised: calling
        # nvmlShutdown() after a failed nvmlInit() raises again.
        if nvmlReady:
            try:
                N.nvmlShutdown()
            except N.NVMLError as err:
                print(err)

    self._gpuInfoObj = gpuInfoObj
    return gpuInfoObj