def gpu_info() -> dict:
    info = dict()
    try:
        nvmlInit()
    except NVMLError:
        info['no-gpu'] = 'No Nvidia GPU detected'
        return info
    device_count = nvmlDeviceGetCount()
    info['driver_version'] = nvmlSystemGetDriverVersion().decode()
    info['device_count'] = device_count
    info['device'] = dict()
    for i in range(device_count):
        handle = nvmlDeviceGetHandleByIndex(i)
        memory = nvmlDeviceGetMemoryInfo(handle)
        info['device'][i] = dict()
        info['device'][i]['name'] = str(nvmlDeviceGetName(handle))
        info['device'][i]['memory'] = dict()
        info['device'][i]['memory']['total'] = str(size_in_gb(memory.total))
    nvmlShutdown()
    return info
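The example above formats device memory through a size_in_gb helper that is not shown. A minimal sketch of what such a helper might look like, assuming a plain bytes-to-GiB conversion:

def size_in_gb(num_bytes):
    # Hypothetical helper assumed by the example above: convert a byte count
    # (as returned by nvmlDeviceGetMemoryInfo) to gibibytes, rounded to 2 places.
    return round(num_bytes / 1024**3, 2)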
def getGPUUsage():
    try:
        pynvml.nvmlInit()
        count = pynvml.nvmlDeviceGetCount()
        if count == 0:
            return None
        result = {
            "driver": pynvml.nvmlSystemGetDriverVersion(),
            "gpu_count": int(count)
        }
        i = 0
        gpuData = []
        while i < count:
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            gpuData.append({
                "device_num": i,
                "name": pynvml.nvmlDeviceGetName(handle),
                "total": round(float(mem.total) / 1000000000, 2),
                "used": round(float(mem.used) / 1000000000, 2)
            })
            i = i + 1
        result["devices"] = jsonpickle.encode(gpuData, unpicklable=False)
    except Exception as e:
        result = {"driver": "No GPU!", "gpu_count": 0, "devices": []}
    return result
def get_driver():
    """ Get the driver version """
    try:
        driver = pynvml.nvmlSystemGetDriverVersion().decode("utf-8")
    except pynvml.NVMLError:
        driver = "No Nvidia driver found"
    return driver
def _update_nvml_static_info():
    driver_version = pynvml.nvmlSystemGetDriverVersion().decode()
    nvml_version = pynvml.nvmlSystemGetNVMLVersion().decode()
    device_count = pynvml.nvmlDeviceGetCount()
    devices = []
    devices_handles = []
    for i in range(device_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle).decode()
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        devices.append({
            'index': i,
            'name': name,
            'memory': {
                'total': mem_info.total
            }
        })
        devices_handles.append(handle)
    _static_info['public'].update({
        'gpu': {
            'driver': driver_version,
            'nvml': nvml_version,
            'devices': devices
        }
    })
    _static_info['private'].update({'gpu': {'handles': devices_handles}})
def get_driver_version():
    """ Return current NVIDIA driver version """
    if not pynvml._nvmlLib_refcount:
        pynvml.nvmlInit()
    return pynvml.nvmlSystemGetDriverVersion()
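Several of these examples call .decode() on the return value of nvmlSystemGetDriverVersion(), while others use it as-is; whether the bindings return bytes or str depends on the pynvml package and version installed. A small normalization sketch (the name to_text is chosen here, not part of pynvml):

def to_text(value):
    # Sketch: older pynvml bindings return bytes, newer ones return str.
    return value.decode("utf-8", errors="replace") if isinstance(value, bytes) else value

# e.g. driver = to_text(pynvml.nvmlSystemGetDriverVersion())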
def gpu_info(self):
    # pip install nvidia-ml-py3
    # require at least one configured gpu id (the original check used `>= 0`,
    # which is always true for a list length)
    if len(self.gpu_ids) > 0 and torch.cuda.is_available():
        try:
            import pynvml
            pynvml.nvmlInit()
            self.config_dic['gpu_driver_version'] = pynvml.nvmlSystemGetDriverVersion()
            for gpu_id in self.gpu_ids:
                handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
                gpu_id_name = "gpu%s" % gpu_id
                mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                gpu_utilize = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.config_dic['%s_device_name' % gpu_id_name] = pynvml.nvmlDeviceGetName(handle)
                self.config_dic['%s_mem_total' % gpu_id_name] = gpu_mem_total = round(
                    mem_info.total / 1024**3, 2)
                self.config_dic['%s_mem_used' % gpu_id_name] = gpu_mem_used = round(
                    mem_info.used / 1024**3, 2)
                # self.config_dic['%s_mem_free' % gpu_id_name] = gpu_mem_free = mem_info.free // 1024 ** 2
                self.config_dic['%s_mem_percent' % gpu_id_name] = round(
                    (gpu_mem_used / gpu_mem_total) * 100, 1)
                self._set_dict_smooth('%s_utilize_gpu' % gpu_id_name, gpu_utilize.gpu, 0.8)
                # self.config_dic['%s_utilize_gpu' % gpu_id_name] = gpu_utilize.gpu
                # self.config_dic['%s_utilize_memory' % gpu_id_name] = gpu_utilize.memory
            pynvml.nvmlShutdown()
        except Exception as e:
            print(e)
def initialize(self, **kwargs) -> None:
    try:
        nvmlInit()
        driver_version = nvmlSystemGetDriverVersion().decode("UTF-8")
        nvml_version = nvmlSystemGetNVMLVersion().decode("UTF-8")
        self.logger.info(f"NVML initialized, driver version: {driver_version}, NVML version: {nvml_version}")
        self.detect_devices()
    except NVMLError as error:
        self.raise_nvml_error(error)
def run_logging_loop(async_task, async_loop):
    asyncio.set_event_loop(async_loop)
    pynvml.nvmlInit()
    logger = _logger()
    logger.info("Driver Version: {}".format(
        nativestr(pynvml.nvmlSystemGetDriverVersion())))
    async_loop.run_until_complete(async_task)
    logger.info("Shutting down driver")
    pynvml.nvmlShutdown()
def check_perf():
    "Suggest how to improve the setup to speed things up"
    from PIL import features, Image
    from packaging import version
    import pynvml

    print("Running performance checks.")

    # libjpeg_turbo check
    print("\n*** libjpeg-turbo status")
    if version.parse(Image.PILLOW_VERSION) >= version.parse("5.4.0"):
        if features.check_feature('libjpeg_turbo'):
            print("✔ libjpeg-turbo is on")
        else:
            print("✘ libjpeg-turbo is not on. It's recommended you install libjpeg-turbo to speed up JPEG decoding. See https://docs.fast.ai/performance.html#libjpeg-turbo")
    else:
        print(f"❓ libjpeg-turbo's status can't be derived - need Pillow(-SIMD)? >= 5.4.0 to tell, current version {Image.PILLOW_VERSION}")
        # XXX: remove this check/note once Pillow and Pillow-SIMD 5.4.0 is available
        pillow_ver_5_4_is_avail = pypi_module_version_is_available("Pillow", "5.4.0")
        if pillow_ver_5_4_is_avail == False:
            print("5.4.0 is not yet available, other than the dev version on github, which can be installed via pip from git+https://github.com/python-pillow/Pillow. See https://docs.fast.ai/performance.html#libjpeg-turbo")

    # Pillow-SIMD check
    print("\n*** Pillow-SIMD status")
    if re.search(r'\.post\d+', Image.PILLOW_VERSION):
        print(f"✔ Running Pillow-SIMD {Image.PILLOW_VERSION}")
    else:
        print(f"✘ Running Pillow {Image.PILLOW_VERSION}; It's recommended you install Pillow-SIMD to speed up image resizing and other operations. See https://docs.fast.ai/performance.html#pillow-simd")

    # CUDA version check
    # compatibility table: k: min nvidia ver is required for v: cuda ver
    # note: windows nvidia driver version is slightly higher, see:
    # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html
    # note: add new entries if pytorch starts supporting new cudaXX
    nvidia2cuda = {
        "410.00": "10.0",
        "384.81": "9.0",
        "367.48": "8.0",
    }
    print("\n*** CUDA status")
    if torch.cuda.is_available():
        pynvml.nvmlInit()
        nvidia_ver = pynvml.nvmlSystemGetDriverVersion().decode('utf-8')
        cuda_ver = torch.version.cuda
        max_cuda = "8.0"
        for k in sorted(nvidia2cuda.keys()):
            if version.parse(nvidia_ver) > version.parse(k):
                max_cuda = nvidia2cuda[k]
        if version.parse(str(max_cuda)) <= version.parse(cuda_ver):
            print(f"✔ Running the latest CUDA {cuda_ver} with NVIDIA driver {nvidia_ver}")
        else:
            print(f"✘ You are running pytorch built against cuda {cuda_ver}, your NVIDIA driver {nvidia_ver} supports cuda10. See https://pytorch.org/get-started/locally/ to install pytorch built against the faster CUDA version.")
    else:
        print(f"❓ Running cpu-only torch version, CUDA check is not relevant")

    print("\nRefer to https://docs.fast.ai/performance.html to make sense out of these checks and suggestions.")
def get_driver():
    """ Get the driver version """
    if is_macos:
        driver = pynvx.cudaSystemGetDriverVersion(ignore=True)
    else:
        try:
            driver = pynvml.nvmlSystemGetDriverVersion().decode("utf-8")
        except pynvml.NVMLError:
            driver = "No Nvidia driver found"
    return driver
def _get_driver_version(self):
    self._nvml_init()
    try:
        driver_version = self._decode(N.nvmlSystemGetDriverVersion())
    except N.NVMLError:
        driver_version = None
    self._nvml_shutdown()
    return driver_version
def check_nvidia_device():
    try:
        pynvml.nvmlInit()
        driver_version = float(pynvml.nvmlSystemGetDriverVersion())
        pynvml.nvmlShutdown()
        if driver_version < 367.48:
            raise OSError(
                'NVIDIA driver v.{} is not supported. The driver version must be 367.48 or newer'
                .format(driver_version))
    except pynvml.NVMLError:
        raise OSError('NVIDIA device not found')
def get_driver(self):
    """ Get the driver version """
    if IS_MACOS:
        driver = pynvx.cudaSystemGetDriverVersion(ignore=True)
    else:
        try:
            driver = pynvml.nvmlSystemGetDriverVersion().decode("utf-8")
        except pynvml.NVMLError:
            driver = "No Nvidia driver found"
    if self.logger:
        self.logger.debug("GPU Driver: %s", driver)
    return driver
def get_gpu_info(handle):
    # """
    # input: handle of GPU
    # output: driver version, gpu name, total memory, free memory, used memory, gpu_util_rate
    # """
    # https://docs.nvidia.com/deploy/nvml-api/
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    total_memory = info.total
    free_memory = info.free
    used_memory = info.used
    utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
    gpu_util_rate = utilization.gpu
    return pynvml.nvmlSystemGetDriverVersion(), pynvml.nvmlDeviceGetName(handle), total_memory, free_memory, used_memory, gpu_util_rate
def get_system_info():
    system_info = dict()
    # cpu info
    system_info['cpu_percent'] = psutil.cpu_percent(interval=None, percpu=False)
    system_info['cpu_count'] = psutil.cpu_count(logical=True)
    # memory info
    mem = psutil.virtual_memory()
    system_info['mem_total'] = int(mem.total / 1024 / 1024)
    system_info['mem_available'] = int(mem.available / 1024 / 1024)
    system_info['mem_percent'] = mem.percent
    # disk info
    disk = psutil.disk_usage('/')
    system_info['disk_total'] = int(disk.total / 1024 / 1024)
    system_info['disk_used'] = int(disk.used / 1024 / 1024)
    system_info['disk_percent'] = disk.percent
    # other info
    system_info['boot_time'] = psutil.boot_time()
    # gpu info
    if tf.test.is_gpu_available():
        pynvml.nvmlInit()
        gpu_driver_version = pynvml.nvmlSystemGetDriverVersion()
        system_info['gpu_driver_version'] = gpu_driver_version.decode("utf-8")
        gpu_device_count = pynvml.nvmlDeviceGetCount()
        system_info['gpu_device_list'] = []
        for i in range(gpu_device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            gpu_name = pynvml.nvmlDeviceGetName(handle)
            gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            gpu_mem_total = int(gpu_mem.total / 1024 / 1024)
            gpu_mem_used = int(gpu_mem.used / 1024 / 1024)
            # express used/total as a percentage (the original omitted the * 100)
            gpu_mem_percent = int(gpu_mem_used / gpu_mem_total * 100)
            system_info['gpu_device_list'].append(
                {'gpu_name': gpu_name.decode("utf-8"),
                 'gpu_mem_total': gpu_mem_total,
                 'gpu_mem_used': gpu_mem_used,
                 'gpu_mem_percent': gpu_mem_percent
                 }
            )
        pynvml.nvmlShutdown()
    return system_info
def _get_driver(self) -> str:
    """ Obtain the Nvidia driver version currently in use.

    Returns
    -------
    str
        The current GPU driver version
    """
    try:
        driver = pynvml.nvmlSystemGetDriverVersion().decode("utf-8")
    except pynvml.NVMLError as err:
        self._log("debug", f"Unable to obtain driver. Original error: {str(err)}")
        driver = "No Nvidia driver found"
    self._log("debug", f"GPU Driver: {driver}")
    return driver
def get_nvml_driver_version():
    try:
        from pynvml import nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion
        try:
            nvmlInit()
            v = nvmlSystemGetDriverVersion()
            log("nvmlSystemGetDriverVersion=%s", v)
            return v.split(".")
        except Exception as e:
            log.warn("Warning: failed to query the NVidia kernel module version via NVML:")
            log.warn(" %s", e)
        finally:
            nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return ""
def GetGPUstate(self):
    pynvml.nvmlInit()
    gpudriver_info = pynvml.nvmlSystemGetDriverVersion()  # driver version info
    gpu_count = pynvml.nvmlDeviceGetCount()  # number of GPUs
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # 0 here is the GPU id
    gpu_name = pynvml.nvmlDeviceGetName(handle)  # GPU model name
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    line = {
        'gpudriver_info': gpudriver_info,
        'gpu_count': gpu_count,  # number of GPUs
        'gpu_name': gpu_name,  # GPU model name
        'gpumem_all': int(meminfo.total / 1024 / 1024),  # total GPU memory (MiB)
        'gpumem_use': int(meminfo.used / 1024 / 1024),  # used GPU memory (MiB)
        'gpumem_free': int(meminfo.free / 1024 / 1024),  # free GPU memory (MiB)
        'cur_gpu': float(meminfo.used / meminfo.total)  # GPU memory usage ratio
    }
    return line
def is_of_supported(device_id=0):
    global is_of_supported_var
    if is_of_supported_var is not None:
        return is_of_supported_var

    driver_version_major = 0
    try:
        import pynvml
        pynvml.nvmlInit()
        driver_version = pynvml.nvmlSystemGetDriverVersion().decode('utf-8')
        driver_version_major = int(driver_version.split('.')[0])
    except ModuleNotFoundError:
        print("NVML not found")

    # there is an issue with OpticalFlow driver in R495 and newer on aarch64 platform
    is_of_supported_var = get_arch(device_id) >= 7.5 and (
        platform.machine() == "x86_64" or driver_version_major < 495)
    return is_of_supported_var
def get_nvml_driver_version():
    try:
        from pynvml import nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion
        try:
            if wrap_nvml_init(nvmlInit):
                v = nvmlSystemGetDriverVersion()
                log("nvmlSystemGetDriverVersion=%s", bytestostr(v))
                return v.split(b".")
        except Exception as e:
            log("get_nvml_driver_version() pynvml error", exc_info=True)
            log.warn("Warning: failed to query the NVidia kernel module version using NVML:")
            log.warn(" %s", e)
        finally:
            nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return ()
def _get_driver(self):
    """ Obtain and return the installed driver version for the system's GPUs.

    Returns
    -------
    str
        The currently installed GPU driver version
    """
    if self._is_plaidml:
        driver = self._plaid.drivers
    elif IS_MACOS:
        driver = pynvx.cudaSystemGetDriverVersion(ignore=True)
    else:
        try:
            driver = pynvml.nvmlSystemGetDriverVersion().decode("utf-8")
        except pynvml.NVMLError:
            driver = "No Nvidia driver found"
    self._log("debug", "GPU Driver: {}".format(driver))
    return driver
def get_nvml_driver_version():
    try:
        from pynvml import nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion
        try:
            nvmlInit()
            v = nvmlSystemGetDriverVersion()
            log("nvmlSystemGetDriverVersion=%s", v)
            return v.split(".")
        except Exception as e:
            log.warn(
                "Warning: failed to query the NVidia kernel module version via NVML:"
            )
            log.warn(" %s", e)
        finally:
            nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return ""
def get_machine_config():
    """Get machine config for CPU and GPU(s)"""
    # CPU config
    physical_cores = psutil.cpu_count(logical=False)
    logical_cores = psutil.cpu_count(logical=True)
    cpufreq = psutil.cpu_freq()
    cpufreq_max = cpufreq.max  # MHz
    cpufreq_min = cpufreq.min
    cpufreq_cur = cpufreq.current
    svmem = psutil.virtual_memory()
    mem_total = svmem.total / (1024.0**3)  # GB
    mem_avail = svmem.available / (1024.0**3)

    # GPU config
    nv.nvmlInit()
    driver_version = nv.nvmlSystemGetDriverVersion()
    deviceCount = nv.nvmlDeviceGetCount()
    gpu_devices, gpu_mems = [], []
    for i in range(deviceCount):
        handle = nv.nvmlDeviceGetHandleByIndex(i)
        gpu_devices.append(nv.nvmlDeviceGetName(handle).decode("utf-8"))
        gpu_mem = nv.nvmlDeviceGetMemoryInfo(handle).total / (1024.0**3)
        gpu_mems.append(gpu_mem)

    return {
        'cpu': {
            'physical_cores': physical_cores,
            'logical_cores': logical_cores,
            'min_freq_MHz': cpufreq_min,
            'max_freq_MHz': cpufreq_max,
            'cur_freq_MHz': cpufreq_cur,
            'total_mem_GB': mem_total,
            'avail_mem_GB': mem_avail
        },
        'gpu': {
            'devices': gpu_devices,
            'mem_GB': gpu_mems
        }
    }
def gputask():
    def get(index):
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        except pynvml.NVMLError_GpuIsLost:
            return None
        memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        return dict(
            nvmlDeviceGetName=pynvml.nvmlDeviceGetName(handle).decode('utf-8'),
            nvmlDeviceGetMemoryInfo=dict(
                total=memory_info.total,
                free=memory_info.free,
                used=memory_info.used,
            ),
            nvmlDeviceGetUtilizationRates=get_utilization_rates(handle),
            nvmlDeviceGetFanSpeed=get_fan_speed(handle),
            nvmlDeviceGetTemperature=pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU),
            nvmlDeviceGetTemperatureThreshold=dict(
                slowdown=pynvml.nvmlDeviceGetTemperatureThreshold(
                    handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SLOWDOWN),
                shutdown=pynvml.nvmlDeviceGetTemperatureThreshold(
                    handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SHUTDOWN),
            ),
            nvmlDeviceGetPowerManagementLimit=pynvml.nvmlDeviceGetPowerManagementLimit(handle),
            nvmlDeviceGetPowerUsage=pynvml.nvmlDeviceGetPowerUsage(handle),
        )

    try:
        pynvml.nvmlInit()
        res = dict(
            nvml_version=pynvml.nvmlSystemGetDriverVersion().decode(),
            nvmlDeviceGetCount=pynvml.nvmlDeviceGetCount(),
            nvmlDevices=[get(i) for i in range(pynvml.nvmlDeviceGetCount())],
        )
        pynvml.nvmlShutdown()
        return res
    except Exception:
        return dict(nvml_version=None)
def get_nv_info():
    nv_info = dict()
    try:
        nvmlInit()
        nv_info["_Driver_Version"] = str(nvmlSystemGetDriverVersion(), errors="ignore")
        nv_info["_NVML_Version"] = str(nvmlSystemGetNVMLVersion(), errors="ignore")
        device_count = nvmlDeviceGetCount()
        nv_info["Device_Count"] = device_count
        devices = []
        for i in range(device_count):
            dev_info = dict()
            handle = nvmlDeviceGetHandleByIndex(i)
            dev_info["_Name"] = str(nvmlDeviceGetName(handle), errors="ignore")
            memory_info = nvmlDeviceGetMemoryInfo(handle)
            dev_info["Total_Memory"] = memory_info.total
            dev_info["Free_Memory"] = memory_info.free
            dev_info["Used_Memory"] = memory_info.used
            util_rates = nvmlDeviceGetUtilizationRates(handle)
            dev_info["GPU_Utilization_Rate"] = util_rates.gpu
            dev_info["Memory_Utilization_Rate"] = util_rates.memory
            devices.append(dev_info)
        nv_info["Devices"] = devices
        nvmlShutdown()
    except Exception as e:
        nv_info["Exception"] = str(e)
    return nv_info
def is_of_supported(device_id=0):
    global is_of_supported_var
    if is_of_supported_var is not None:
        return is_of_supported_var

    compute_cap = 0
    driver_version_major = 0
    try:
        import pynvml
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        compute_cap = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
        compute_cap = compute_cap[0] + compute_cap[1] / 10.
        driver_version = pynvml.nvmlSystemGetDriverVersion().decode('utf-8')
        driver_version_major = int(driver_version.split('.')[0])
    except ModuleNotFoundError:
        print("NVML not found")

    # there is an issue with OpticalFlow driver in R495 and newer on aarch64 platform
    is_of_supported_var = compute_cap >= 7.5 and (
        platform.machine() == "x86_64" or driver_version_major < 495)
    return is_of_supported_var
def gpu_info():
    try:
        pynvml.nvmlInit()
        # print("Driver Version:", pynvml.nvmlSystemGetDriverVersion())  # driver version
        deviceCount = pynvml.nvmlDeviceGetCount()  # gpu count
        gpu_info = {}
        for i in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            # print("Device", i, ":", pynvml.nvmlDeviceGetName(handle))  # gpu name
            gpu_info[str(i)] = {
                'version': str(pynvml.nvmlSystemGetDriverVersion()),
                'name': str(pynvml.nvmlDeviceGetName(handle)),
                'used': meminfo.used / 1024 / 1024 / 1024,
                'free': meminfo.free / 1024 / 1024 / 1024,
                'total': meminfo.total / 1024 / 1024 / 1024,
                'percent': (meminfo.used) / (meminfo.total) * 100
            }
        pynvml.nvmlShutdown()
        return gpu_info
    except Exception as e:
        logging.info(f'GPU Error: [{e}]')
def new_query():
    """Query the information of all the GPUs on local machine"""
    N.nvmlInit()

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # for python3, to unicode
        return b

    def get_gpu_info(handle):
        """Get one GPU information specified by nvml handle"""

        def get_process_info(nv_process):
            """Get the process information of specific pid"""
            process = {}
            ps_process = psutil.Process(pid=nv_process.pid)
            process['username'] = ps_process.username()
            # cmdline returns full path;
            # as in `ps -o comm`, get short cmdnames.
            _cmdline = ps_process.cmdline()
            if not _cmdline:
                # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                process['command'] = '?'
            else:
                process['command'] = os.path.basename(_cmdline[0])
            # Bytes to MBytes
            process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
            process['pid'] = nv_process.pid
            return process

        name = _decode(N.nvmlDeviceGetName(handle))
        uuid = _decode(N.nvmlDeviceGetUUID(handle))

        try:
            temperature = N.nvmlDeviceGetTemperature(
                handle, N.NVML_TEMPERATURE_GPU
            )
        except N.NVMLError:
            temperature = None  # Not supported

        try:
            memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
        except N.NVMLError:
            memory = None  # Not supported

        try:
            utilization = N.nvmlDeviceGetUtilizationRates(handle)
        except N.NVMLError:
            utilization = None  # Not supported

        try:
            power = N.nvmlDeviceGetPowerUsage(handle)
        except N.NVMLError:
            power = None

        try:
            power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
        except N.NVMLError:
            power_limit = None

        try:
            nv_comp_processes = \
                N.nvmlDeviceGetComputeRunningProcesses(handle)
        except N.NVMLError:
            nv_comp_processes = None  # Not supported

        try:
            nv_graphics_processes = \
                N.nvmlDeviceGetGraphicsRunningProcesses(handle)
        except N.NVMLError:
            nv_graphics_processes = None  # Not supported

        if nv_comp_processes is None and nv_graphics_processes is None:
            processes = None
        else:
            processes = []
            nv_comp_processes = nv_comp_processes or []
            nv_graphics_processes = nv_graphics_processes or []
            for nv_process in nv_comp_processes + nv_graphics_processes:
                # TODO: could be more information such as system memory
                # usage, CPU percentage, create time etc.
                try:
                    process = get_process_info(nv_process)
                    processes.append(process)
                except psutil.NoSuchProcess:
                    # TODO: add some reminder for NVML broken context
                    # e.g. nvidia-smi reset or reboot the system
                    pass

        index = N.nvmlDeviceGetIndex(handle)
        gpu_info = {
            'index': index,
            'uuid': uuid,
            'name': name,
            'temperature.gpu': temperature,
            'utilization.gpu': utilization.gpu if utilization else None,
            'power.draw': power // 1000 if power is not None else None,
            'enforced.power.limit': power_limit // 1000
            if power_limit is not None else None,
            # Convert bytes into MBytes
            'memory.used': memory.used // MB if memory else None,
            'memory.total': memory.total // MB if memory else None,
            'processes': processes,
        }
        return gpu_info

    # 1. get the list of gpu and status
    gpu_list = []
    device_count = N.nvmlDeviceGetCount()
    for index in range(device_count):
        handle = N.nvmlDeviceGetHandleByIndex(index)
        gpu_info = get_gpu_info(handle)
        gpu_stat = GPUStat(gpu_info)
        gpu_list.append(gpu_stat)

    # 2. additional info (driver version, etc).
    try:
        driver_version = _decode(N.nvmlSystemGetDriverVersion())
    except N.NVMLError:
        driver_version = None  # N/A

    N.nvmlShutdown()
    return GPUStatCollection(gpu_list, driver_version=driver_version)
def check(self):
    ## Check legacy mode
    try:
        self.legacy = self.configuration['legacy']
        if self.legacy == '':
            raise KeyError
        if self.legacy is True:
            self.info('Legacy mode set to True')
    except KeyError:
        self.legacy = False
        self.info("No legacy mode specified. Setting to 'False'")

    ## Real memory clock is double (DDR double data rate ram). Set nvMemFactor = 2 in conf for 'real' memory clock
    try:
        self.nvMemFactor = int(self.configuration['nvMemFactor'])
        if self.nvMemFactor == '':
            raise KeyError
        self.info("'nvMemFactor' set to:", str(self.nvMemFactor))
    except Exception as e:
        if isinstance(e, KeyError):
            self.info("No 'nvMemFactor' configured. Setting to 1")
        else:
            self.error("nvMemFactor in config file is not an int. Setting 'nvMemFactor' to 1", str(e))
        self.nvMemFactor = 1

    ## Initialize NVML
    try:
        pynvml.nvmlInit()
        self.info("Nvidia Driver Version:", str(pynvml.nvmlSystemGetDriverVersion()))
    except Exception as e:
        self.error("pynvml could not be initialized", str(e))
        pynvml.nvmlShutdown()
        return False

    ## Get number of graphic cards
    try:
        self.unitCount = pynvml.nvmlUnitGetCount()
        self.deviceCount = pynvml.nvmlDeviceGetCount()
        self.debug("Unit count:", str(self.unitCount))
        self.debug("Device count", str(self.deviceCount))
    except Exception as e:
        self.error('Error getting number of Nvidia GPUs', str(e))
        pynvml.nvmlShutdown()
        return False

    ## Get graphic card names
    data = self._get_data()
    name = ''
    for i in range(self.deviceCount):
        if i == 0:
            name = name + str(data["device_name_" + str(i)]) + " [{0}]".format(i)
        else:
            name = name + ' | ' + str(data["device_name_" + str(i)]) + " [{0}]".format(i)
    self.info('Graphics Card(s) found:', name)
    for chart in self.definitions:
        self.definitions[chart]['options'][1] = self.definitions[chart]['options'][1] + ' for ' + name

    ## Dynamically add lines
    for i in range(self.deviceCount):
        gpuIdx = str(i)

        ## Memory
        if data['device_mem_used_' + str(i)] is not None:
            self.definitions['memory']['lines'].append([
                'device_mem_free_' + gpuIdx, 'free [{0}]'.format(i), 'absolute', 1, 1024**2
            ])
            self.definitions['memory']['lines'].append([
                'device_mem_used_' + gpuIdx, 'used [{0}]'.format(i), 'absolute', 1, 1024**2
            ])
            # self.definitions['memory']['lines'].append(['device_mem_total_' + gpuIdx, 'GPU:{0} total'.format(i), 'absolute', -1, 1024**2])

        ## Load/usage
        if data['device_load_gpu_' + gpuIdx] is not None:
            self.definitions['load']['lines'].append([
                'device_load_gpu_' + gpuIdx, 'gpu [{0}]'.format(i), 'absolute'
            ])
            self.definitions['load']['lines'].append([
                'device_load_mem_' + gpuIdx, 'memory [{0}]'.format(i), 'absolute'
            ])

        ## Encoder Utilization
        if data['device_load_enc_' + gpuIdx] is not None:
            self.definitions['load']['lines'].append([
                'device_load_enc_' + gpuIdx, 'enc [{0}]'.format(i), 'absolute'
            ])

        ## Decoder Utilization
        if data['device_load_dec_' + gpuIdx] is not None:
            self.definitions['load']['lines'].append([
                'device_load_dec_' + gpuIdx, 'dec [{0}]'.format(i), 'absolute'
            ])

        ## ECC errors
        if data['device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_' + gpuIdx] is not None:
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_' + gpuIdx,
                'L1 Cache Volatile Corrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_L1_CACHE_VOLATILE_UNCORRECTED_' + gpuIdx,
                'L1 Cache Volatile Uncorrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_L1_CACHE_AGGREGATE_CORRECTED_' + gpuIdx,
                'L1 Cache Aggregate Corrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_L1_CACHE_AGGREGATE_UNCORRECTED_' + gpuIdx,
                'L1 Cache Aggregate Uncorrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_L2_CACHE_VOLATILE_CORRECTED_' + gpuIdx,
                'L2 Cache Volatile Corrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_L2_CACHE_VOLATILE_UNCORRECTED_' + gpuIdx,
                'L2 Cache Volatile Uncorrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_L2_CACHE_AGGREGATE_CORRECTED_' + gpuIdx,
                'L2 Cache Aggregate Corrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_L2_CACHE_AGGREGATE_UNCORRECTED_' + gpuIdx,
                'L2 Cache Aggregate Uncorrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_DEVICE_MEMORY_VOLATILE_CORRECTED_' + gpuIdx,
                'Device Memory Volatile Corrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_DEVICE_MEMORY_VOLATILE_UNCORRECTED_' + gpuIdx,
                'Device Memory Volatile Uncorrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_DEVICE_MEMORY_AGGREGATE_CORRECTED_' + gpuIdx,
                'Device Memory Aggregate Corrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_DEVICE_MEMORY_AGGREGATE_UNCORRECTED_' + gpuIdx,
                'Device Memory Aggregate Uncorrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_REGISTER_FILE_VOLATILE_CORRECTED_' + gpuIdx,
                'Register File Volatile Corrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_REGISTER_FILE_VOLATILE_UNCORRECTED_' + gpuIdx,
                'Register File Volatile Uncorrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_REGISTER_FILE_AGGREGATE_CORRECTED_' + gpuIdx,
                'Register File Aggregate Corrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_REGISTER_FILE_AGGREGATE_UNCORRECTED_' + gpuIdx,
                'Register File Aggregate Uncorrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_TEXTURE_MEMORY_VOLATILE_CORRECTED_' + gpuIdx,
                'Texture Memory Volatile Corrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_TEXTURE_MEMORY_VOLATILE_UNCORRECTED_' + gpuIdx,
                'Texture Memory Volatile Uncorrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_CORRECTED_' + gpuIdx,
                'Texture Memory Aggregate Corrected [{0}]'.format(i), 'absolute'
            ])
            self.definitions['ecc_errors']['lines'].append([
                'device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_UNCORRECTED_' + gpuIdx,
                'Texture Memory Aggregate Uncorrected [{0}]'.format(i), 'absolute'
            ])

        ## Temperature
        if data['device_temp_' + gpuIdx] is not None:
            self.definitions['temperature']['lines'].append(
                ['device_temp_' + gpuIdx, 'GPU:{0}'.format(i), 'absolute'])

        ## Fan
        if data['device_fanspeed_' + gpuIdx] is not None:
            self.definitions['fan']['lines'].append([
                'device_fanspeed_' + gpuIdx, 'GPU:{0}'.format(i), 'absolute'
            ])

        ## GPU and Memory frequency
        if data['device_core_clock_' + gpuIdx] is not None:
            self.definitions['frequency']['lines'].append([
                'device_core_clock_' + gpuIdx, 'core [{0}]'.format(i), 'absolute'
            ])
            self.definitions['frequency']['lines'].append([
                'device_mem_clock_' + gpuIdx, 'memory [{0}]'.format(i), 'absolute'
            ])

        ## SM frequency, usually same as GPU - handled extra here because of legacy mode
        if data['device_sm_clock_' + gpuIdx] is not None:
            self.definitions['frequency']['lines'].append([
                'device_sm_clock_' + gpuIdx, 'sm [{0}]'.format(i), 'absolute'
            ])

    ## Check if GPU Units are installed and add charts
    if self.unitCount:
        self.order.append('unit_fan')
        self.order.append('unit_psu')
        for i in range(self.unitCount):
            gpuIdx = str(i)
            if data['unit_temp_intake_' + gpuIdx] is not None:
                self.definitions['temperature']['lines'].append([
                    'unit_temp_intake_' + gpuIdx, 'intake (unit {0})'.format(i), 'absolute'
                ])
                self.definitions['temperature']['lines'].append([
                    'unit_temp_exhaust_' + gpuIdx, 'exhaust (unit {0})'.format(i), 'absolute'
                ])
                self.definitions['temperature']['lines'].append([
                    'unit_temp_board_' + gpuIdx, 'board (unit {0})'.format(i), 'absolute'
                ])
            if data['unit_fan_speed_' + gpuIdx] is not None:
                self.definitions['unit_fan'] = {
                    'options': [None, 'Unit fan', 'rpm', 'Unit Fans', 'nv.unit', 'line'],
                    'lines': [['unit_fan_speed_' + gpuIdx, 'Unit{0}'.format(i), 'absolute']]
                }
            if data['unit_psu_current_' + gpuIdx] is not None:
                self.definitions['unit_psu'] = {
                    'options': [None, 'Unit PSU', 'mixed', 'Unit PSU', 'nv.unit', 'line'],
                    'lines': [
                        ['unit_psu_current_' + gpuIdx, 'current (A) (unit {0})'.format(i), 'absolute'],
                        ['unit_psu_power_' + gpuIdx, 'power (W) (unit {0})'.format(i), 'absolute'],
                        ['unit_psu_voltage_' + gpuIdx, 'voltage (V) (unit {0})'.format(i), 'absolute']
                    ]
                }
    return True
def new_query():
    """Query the information of all the GPUs on local machine"""
    N.nvmlInit()

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # for python3, to unicode
        return b

    def get_gpu_info(handle):
        """Get one GPU information specified by nvml handle"""

        def get_process_info(nv_process):
            """Get the process information of specific pid"""
            process = {}
            ps_process = psutil.Process(pid=nv_process.pid)
            process['username'] = ps_process.username()
            # cmdline returns full path;
            # as in `ps -o comm`, get short cmdnames.
            _cmdline = ps_process.cmdline()
            if not _cmdline:
                # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                process['command'] = '?'
            else:
                process['command'] = os.path.basename(_cmdline[0])
            # Bytes to MBytes
            process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
            process['pid'] = nv_process.pid
            return process

        name = _decode(N.nvmlDeviceGetName(handle))
        uuid = _decode(N.nvmlDeviceGetUUID(handle))

        try:
            temperature = N.nvmlDeviceGetTemperature(
                handle, N.NVML_TEMPERATURE_GPU)
        except N.NVMLError:
            temperature = None  # Not supported

        try:
            fan_speed = N.nvmlDeviceGetFanSpeed(handle)
        except N.NVMLError:
            fan_speed = None  # Not supported

        try:
            memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
        except N.NVMLError:
            memory = None  # Not supported

        try:
            utilization = N.nvmlDeviceGetUtilizationRates(handle)
        except N.NVMLError:
            utilization = None  # Not supported

        try:
            power = N.nvmlDeviceGetPowerUsage(handle)
        except N.NVMLError:
            power = None

        try:
            power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
        except N.NVMLError:
            power_limit = None

        try:
            nv_comp_processes = \
                N.nvmlDeviceGetComputeRunningProcesses(handle)
        except N.NVMLError:
            nv_comp_processes = None  # Not supported

        try:
            nv_graphics_processes = \
                N.nvmlDeviceGetGraphicsRunningProcesses(handle)
        except N.NVMLError:
            nv_graphics_processes = None  # Not supported

        if nv_comp_processes is None and nv_graphics_processes is None:
            processes = None
        else:
            processes = []
            nv_comp_processes = nv_comp_processes or []
            nv_graphics_processes = nv_graphics_processes or []
            for nv_process in nv_comp_processes + nv_graphics_processes:
                # TODO: could be more information such as system memory
                # usage, CPU percentage, create time etc.
                try:
                    process = get_process_info(nv_process)
                    processes.append(process)
                except psutil.NoSuchProcess:
                    # TODO: add some reminder for NVML broken context
                    # e.g. nvidia-smi reset or reboot the system
                    pass

        index = N.nvmlDeviceGetIndex(handle)
        gpu_info = {
            'index': index,
            'uuid': uuid,
            'name': name,
            'temperature.gpu': temperature,
            'fan.speed': fan_speed,
            'utilization.gpu': utilization.gpu if utilization else None,
            'power.draw': power // 1000 if power is not None else None,
            'enforced.power.limit': power_limit // 1000
            if power_limit is not None else None,
            # Convert bytes into MBytes
            'memory.used': memory.used // MB if memory else None,
            'memory.total': memory.total // MB if memory else None,
            'processes': processes,
        }
        return gpu_info

    # 1. get the list of gpu and status
    gpu_list = []
    device_count = N.nvmlDeviceGetCount()
    for index in range(device_count):
        handle = N.nvmlDeviceGetHandleByIndex(index)
        gpu_info = get_gpu_info(handle)
        gpu_stat = GPUStat(gpu_info)
        gpu_list.append(gpu_stat)

    # 2. additional info (driver version, etc).
    try:
        driver_version = _decode(N.nvmlSystemGetDriverVersion())
    except N.NVMLError:
        driver_version = None  # N/A

    N.nvmlShutdown()
    return GPUStatCollection(gpu_list, driver_version=driver_version)
def systemGetDriverVersion():
    return pynvml.nvmlSystemGetDriverVersion()
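The thin wrapper above assumes NVML has already been initialized; calling it beforehand raises an NVMLError. A minimal usage sketch around that wrapper:

import pynvml

pynvml.nvmlInit()
try:
    # The wrapper simply forwards to pynvml.nvmlSystemGetDriverVersion().
    print(systemGetDriverVersion())
finally:
    pynvml.nvmlShutdown()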
sys.path.append(pypath)
from pynvml import (
    nvmlInit,
    nvmlShutdown,
    nvmlSystemGetDriverVersion,
    nvmlDeviceGetCount,
    nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetName,
    nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetUtilizationRates,
)

nvmlInit()
print("Driver Version: %s" % nvmlSystemGetDriverVersion())
deviceCount = nvmlDeviceGetCount()
for i in range(deviceCount):
    handle = nvmlDeviceGetHandleByIndex(i)
    print("Device %s: %s" % (i, nvmlDeviceGetName(handle)))
    memory_info = nvmlDeviceGetMemoryInfo(handle)
    print("Device %s: Total memory: %s" % (i, memory_info.total / 1024 / 1024))
    print("Device %s: Free memory: %s" % (i, memory_info.free / 1024 / 1024))
    print("Device %s: Used memory: %s" % (i, memory_info.used / 1024 / 1024))
    util = nvmlDeviceGetUtilizationRates(handle)
    print("Device %s: GPU Utilization: %s%%" % (i, util.gpu))
    print("Device %s: Memory Utilization: %s%%" % (i, util.memory))