def measure_gpu_usage(self):
    # Assumes `from time import sleep` at module scope (used by the loop below).
    from py3nvml.py3nvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, \
        nvmlDeviceGetMemoryInfo, nvmlDeviceGetName, nvmlShutdown, NVMLError

    max_gpu_usage = []
    gpu_name = []
    try:
        nvmlInit()
        deviceCount = nvmlDeviceGetCount()
        max_gpu_usage = [0 for i in range(deviceCount)]
        gpu_name = [
            nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i))
            for i in range(deviceCount)
        ]
        while True:
            for i in range(deviceCount):
                info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
                max_gpu_usage[i] = max(max_gpu_usage[i], info.used / 1024**2)
            sleep(0.005)  # 5ms
            if not self.keep_measuring:
                break
        nvmlShutdown()
        return [{
            "device_id": i,
            "name": gpu_name[i],
            "max_used_MB": max_gpu_usage[i]
        } for i in range(deviceCount)]
    except NVMLError as error:
        if not self.silent:
            self.logger.error("Error fetching GPU information using nvml: %s", error)
        return None
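
# The sampler above is meant to run on a worker thread while the main thread
# executes the workload being profiled. A minimal driver sketch, assuming
# measure_gpu_usage is attached to a monitor class that owns the
# keep_measuring / silent / logger attributes it reads (class and attribute
# names here are hypothetical):
import logging
from concurrent.futures import ThreadPoolExecutor
from time import sleep


class GpuMemoryMonitor:
    def __init__(self, silent=False):
        self.keep_measuring = True
        self.silent = silent
        self.logger = logging.getLogger(__name__)

    measure_gpu_usage = measure_gpu_usage  # reuse the function above as a method


if __name__ == "__main__":
    monitor = GpuMemoryMonitor()
    with ThreadPoolExecutor() as executor:
        future = executor.submit(monitor.measure_gpu_usage)  # sample in background
        sleep(1.0)                      # stand-in for the real GPU workload
        monitor.keep_measuring = False  # let the sampling loop exit
        print(future.result())          # per-device peak usage in MB, or None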
def __init__(self, handle, cpu_to_node):
    node = None
    # TODO: use number of CPU cores to determine cpuset size
    # This is very hacky at the moment
    affinity = pynvml.nvmlDeviceGetCpuAffinity(handle, 1)
    n_cpus = max(cpu_to_node.keys()) + 1
    for j in range(n_cpus):
        if affinity[0] & (1 << j):
            cur_node = cpu_to_node[j]
            if node is not None and node != cur_node:
                node = -1  # Sentinel to indicate unknown affinity
            else:
                node = cur_node
    if node == -1:
        node = None
    self.node = node
    self.mem = pynvml.nvmlDeviceGetMemoryInfo(handle).total
    self.name = pynvml.nvmlDeviceGetName(handle)
    # NVML doesn't report compute capability, so we need CUDA
    pci_bus_id = pynvml.nvmlDeviceGetPciInfo(handle).busId
    # In Python 3 pci_bus_id is bytes but pycuda wants str
    if not isinstance(pci_bus_id, str):
        pci_bus_id = pci_bus_id.decode('ascii')
    cuda_device = pycuda.driver.Device(pci_bus_id)
    self.compute_capability = cuda_device.compute_capability()
    self.device_attributes = {}
    self.uuid = pynvml.nvmlDeviceGetUUID(handle)
    for key, value in cuda_device.get_attributes().items():
        if isinstance(value, (int, float, str)):
            # Some of the attributes use Boost.Python's enum, which is
            # derived from int but which leads to invalid JSON when passed
            # to json.dumps.
            if isinstance(value, int) and type(value) != int:
                value = str(value)
            self.device_attributes[str(key)] = value
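
# The constructor above expects a cpu_to_node mapping from logical CPU index
# to NUMA node. A minimal sketch of building one on Linux from sysfs (an
# assumption; the original project may construct this mapping differently):
import glob
import os
import re


def build_cpu_to_node():
    """Map logical CPU index -> NUMA node by scanning /sys."""
    cpu_to_node = {}
    for node_dir in glob.glob('/sys/devices/system/node/node[0-9]*'):
        node = int(re.search(r'node(\d+)$', node_dir).group(1))
        for cpu_dir in glob.glob(os.path.join(node_dir, 'cpu[0-9]*')):
            cpu_to_node[int(re.search(r'cpu(\d+)$', cpu_dir).group(1))] = node
    return cpu_to_node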
def get_gpu_info_by_nvml(self) -> Dict:
    """Get GPU info using nvml"""
    gpu_info_list = []
    driver_version = None
    try:
        nvmlInit()
        driver_version = nvmlSystemGetDriverVersion()
        deviceCount = nvmlDeviceGetCount()
        for i in range(deviceCount):
            handle = nvmlDeviceGetHandleByIndex(i)
            info = nvmlDeviceGetMemoryInfo(handle)
            gpu_info = {}
            gpu_info["memory_total"] = info.total
            gpu_info["memory_available"] = info.free
            gpu_info["name"] = nvmlDeviceGetName(handle)
            gpu_info_list.append(gpu_info)
        nvmlShutdown()
    except NVMLError as error:
        if not self.silent:
            self.logger.error("Error fetching GPU information using nvml: %s", error)
        return None
    result = {"driver_version": driver_version, "devices": gpu_info_list}
    if 'CUDA_VISIBLE_DEVICES' in environ:
        result["cuda_visible"] = environ['CUDA_VISIBLE_DEVICES']
    return result
def environment_info(self): if self._environment_info is None: info = {} info["transformers_version"] = version info["framework"] = self.framework if self.framework == "PyTorch": info["use_torchscript"] = self.args.torchscript if self.framework == "TensorFlow": info["eager_mode"] = self.args.eager_mode info["use_xla"] = self.args.use_xla info["framework_version"] = self.framework_version info["python_version"] = platform.python_version() info["system"] = platform.system() info["cpu"] = platform.processor() info["architecture"] = platform.architecture()[0] info["date"] = datetime.date(datetime.now()) info["time"] = datetime.time(datetime.now()) info["fp16"] = self.args.fp16 info["use_multiprocessing"] = self.args.do_multi_processing info["only_pretrain_model"] = self.args.only_pretrain_model if is_psutil_available(): info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total) else: logger.warning( "Psutil not installed, we won't log available CPU memory. " "Install psutil (pip install psutil) to log available CPU memory." ) info["cpu_ram_mb"] = "N/A" info["use_gpu"] = self.args.is_gpu if self.args.is_gpu: info["num_gpus"] = 1 # TODO(PVP) Currently only single GPU is supported if is_py3nvml_available(): nvml.nvmlInit() handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx) info["gpu"] = nvml.nvmlDeviceGetName(handle) info["gpu_ram_mb"] = bytes_to_mega_bytes(nvml.nvmlDeviceGetMemoryInfo(handle).total) info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000 info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle) nvml.nvmlShutdown() else: logger.warning( "py3nvml not installed, we won't log GPU memory usage. " "Install py3nvml (pip install py3nvml) to log information about GPU." ) info["gpu"] = "N/A" info["gpu_ram_mb"] = "N/A" info["gpu_power_watts"] = "N/A" info["gpu_performance_state"] = "N/A" info["use_tpu"] = self.args.is_tpu # TODO(PVP): See if we can add more information about TPU # see: https://github.com/pytorch/xla/issues/2180 self._environment_info = info return self._environment_info
def __init__(self, index: int):
    self.index = index
    self.handle = py3nvml.nvmlDeviceGetHandleByIndex(index)
    self.name = py3nvml.nvmlDeviceGetName(self.handle)
    self.memory = Memory(self.handle)
    self.utilization = Utilization(self.handle)
    self.processes = Processes(self.handle)
    self.update()
def __init__(self, report=None, devices=None, quiet=False, always_suffix=False,
             output=print, verbose_once=True):
    super(self.__class__, self).__init__()
    global nvml
    self.output = output
    if nvml is not None:
        try:
            nvml.nvmlInit()
        except (OSError, nvml.NVMLError_LibraryNotFound):
            # the python library might be installed, but not the drivers...
            nvml = None
    if nvml is None:
        if not quiet:
            self.output(
                "Could not load py3nvml, cannot report any nvidia device statistics."
            )
        report = []
    else:
        device_count = nvml.nvmlDeviceGetCount()
        if devices is None:
            devices = list(range(device_count))
        else:
            devices = [
                int(device) for device in devices
                if 0 <= int(device) < device_count
            ]
        self.devices = devices
        self.deviceHandles = [
            nvml.nvmlDeviceGetHandleByIndex(device) for device in devices
        ]
        if not quiet:
            for n, handle in enumerate(self.deviceHandles):
                self.output("Collecting statistics for device #% 2d: %s" %
                            (n, nvml.nvmlDeviceGetName(handle)))
    if report is None:
        report = ['temperature', 'utilization_gpu']
    elif report == 'all':
        report = list(self.reportable_values.keys())
    self.verbose_once = verbose_once
    self.report = report
    self.always_suffix = always_suffix
def gpu_info():
    "Returns a list of (GPU ID, GPU Description, GPU % Utilization) tuples"
    nvmlInit()
    deviceCount = nvmlDeviceGetCount()
    info = []
    for i in range(0, deviceCount):
        handle = nvmlDeviceGetHandleByIndex(i)
        util = nvmlDeviceGetUtilizationRates(handle)
        desc = nvmlDeviceGetName(handle)
        info.append((i, desc, util.gpu))
    return info
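
# Minimal usage sketch for gpu_info above (assumes the py3nvml names it calls
# are imported at module scope, as in the function body). Note it calls
# nvmlInit on every invocation without a matching nvmlShutdown; repeated
# nvmlInit calls are safe since NVML reference-counts them, but a long-lived
# process may prefer to initialize once.
for gpu_id, desc, util in gpu_info():
    print("GPU %d - %s: %d%% utilized" % (gpu_id, desc, util))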
def test_nvidia_device(idx: int):
    from py3nvml import py3nvml as nvml
    handle = nvml.nvmlDeviceGetHandleByIndex(idx)
    pciInfo = nvml.nvmlDeviceGetPciInfo(handle)
    brands = {
        nvml.NVML_BRAND_UNKNOWN: "Unknown",
        nvml.NVML_BRAND_QUADRO: "Quadro",
        nvml.NVML_BRAND_TESLA: "Tesla",
        nvml.NVML_BRAND_NVS: "NVS",
        nvml.NVML_BRAND_GRID: "Grid",
        nvml.NVML_BRAND_GEFORCE: "GeForce"
    }
    inspect(
        idx=idx,
        # id=pciInfo.busId,
        # uuid=nvml.nvmlDeviceGetUUID(handle),
        name=nvml.nvmlDeviceGetName(handle),
        # brand=brands[nvml.nvmlDeviceGetBrand(handle)],
        # multi_gpu=nvml.nvmlDeviceGetMultiGpuBoard(handle),
        # pcie_link=nvml.nvmlDeviceGetCurrPcieLinkWidth(handle),
        fan=nvml.nvmlDeviceGetFanSpeed(handle),
        # power=nvml.nvmlDeviceGetPowerState(handle),
        mem_total=nvml.nvmlDeviceGetMemoryInfo(handle).total,
        mem_used=nvml.nvmlDeviceGetMemoryInfo(handle).used,
        util_gpu=nvml.nvmlDeviceGetUtilizationRates(handle).gpu,
        # util_mem=nvml.nvmlDeviceGetUtilizationRates(handle).memory,
        temp=nvml.nvmlDeviceGetTemperature(handle, nvml.NVML_TEMPERATURE_GPU),
        power=nvml.nvmlDeviceGetPowerUsage(handle),
        power_limit=nvml.nvmlDeviceGetPowerManagementLimit(handle),
        # display=nvml.nvmlDeviceGetDisplayMode(handle),
        display_active=nvml.nvmlDeviceGetDisplayActive(handle),
    )
    logger.log()
    procs = nvml.nvmlDeviceGetGraphicsRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid),
                pid=p.pid,
                mem=p.usedGpuMemory)
    procs = nvml.nvmlDeviceGetComputeRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid),
                pid=p.pid,
                mem=p.usedGpuMemory)
    logger.log()
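
# test_nvidia_device relies on `inspect` and `logger` helpers that are not
# shown, and expects the caller to have already called nvml.nvmlInit().
# Minimal stand-ins so the snippet can run on its own -- purely an assumption
# about what the real helpers do:
def inspect(**fields):
    print(", ".join("%s=%r" % (k, v) for k, v in fields.items()))


class _Logger:
    def log(self, *args):
        print(*args)


logger = _Logger()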
def measure_gpu_usage(self) -> Optional[List[Dict[str, Any]]]:
    from py3nvml.py3nvml import (
        NVMLError,
        nvmlDeviceGetCount,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlDeviceGetName,
        nvmlInit,
        nvmlShutdown,
    )

    max_gpu_usage = []
    gpu_name = []
    try:
        nvmlInit()
        device_count = nvmlDeviceGetCount()
        if not isinstance(device_count, int):
            logger.error(f"nvmlDeviceGetCount result is not integer: {device_count}")
            return None

        max_gpu_usage = [0 for i in range(device_count)]
        gpu_name = [
            nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i))
            for i in range(device_count)
        ]
        while True:
            for i in range(device_count):
                info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
                if isinstance(info, str):
                    logger.error(f"nvmlDeviceGetMemoryInfo returns str: {info}")
                    return None
                max_gpu_usage[i] = max(max_gpu_usage[i], info.used / 1024**2)
            sleep(0.005)  # 5ms
            if not self.keep_measuring:
                break
        nvmlShutdown()
        return [{
            "device_id": i,
            "name": gpu_name[i],
            "max_used_MB": max_gpu_usage[i],
        } for i in range(device_count)]
    except NVMLError as error:
        logger.error("Error fetching GPU information using nvml: %s", error)
        return None
def environment_info(self): if self._environment_info is None: info = {} info["gluonnlp_version"] = gluonnlp.__version__ info["framework_version"] = mxnet.__version__ info["python_version"] = platform.python_version() info["system"] = platform.system() info["cpu"] = platform.processor() info["architecture"] = platform.architecture()[0] info["date"] = datetime.date(datetime.now()) info["time"] = datetime.time(datetime.now()) info["fp16"] = self._use_fp16 if is_psutil_available(): info["cpu_ram_mb"] = bytes_to_mega_bytes( psutil.virtual_memory().total) else: logger.warning( "Psutil not installed, we won't log available CPU memory." "Install psutil (pip install psutil) to log available CPU memory." ) info["cpu_ram_mb"] = "N/A" info["use_gpu"] = self._use_gpu if self._use_gpu: info["num_gpus"] = 1 if is_py3nvml_available(): nvml.nvmlInit() handle = nvml.nvmlDeviceGetHandleByIndex(self._device_idx) info["gpu"] = nvml.nvmlDeviceGetName(handle) info["gpu_ram_mb"] = bytes_to_mega_bytes( nvml.nvmlDeviceGetMemoryInfo(handle).total) info[ "gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit( handle) / 1000 info[ "gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState( handle) nvml.nvmlShutdown() else: logger.warning( "py3nvml not installed, we won't log GPU memory usage. " "Install py3nvml (pip install py3nvml) to log information about GPU." ) info["gpu"] = "N/A" info["gpu_ram_mb"] = "N/A" info["gpu_power_watts"] = "N/A" info["gpu_performance_state"] = "N/A" self._environment_info = info return self._environment_info
def __init__(self):
    self.labels = ['gpu', 'name', 'driver']
    self.driver = nv.nvmlSystemGetDriverVersion()
    self.n_gpu = nv.nvmlDeviceGetCount()
    self.hnds = [
        nv.nvmlDeviceGetHandleByIndex(i) for i in range(self.n_gpu)
    ]
    self.args = []
    for i, hnd in enumerate(self.hnds):
        args = OrderedDict()
        args['gpu'] = 'gpu%d' % i
        args['name'] = nv.nvmlDeviceGetName(hnd)
        args['driver'] = self.driver
        self.args.append(args)
def gpu_info(gpu_index: int) -> Tuple[str, int]:
    """Return the description and total memory (in MiB) of a GPU.

    Memory size falls back to 0 when NVML cannot report it.
    """
    nvml.nvmlInit()
    handle = nvml.nvmlDeviceGetHandleByIndex(gpu_index)
    gpu_desc = nvml.nvmlDeviceGetName(handle)

    # Get memory info.
    mem_info = try_get_info(nvml.nvmlDeviceGetMemoryInfo, handle)
    if mem_info != 'N/A':
        mem_total = mem_info.total >> 20  # bytes -> MiB
    else:
        mem_total = 0
    return gpu_desc, mem_total
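
# gpu_info above depends on a try_get_info helper that is not shown. A
# plausible reconstruction (an assumption, inferred from the 'N/A' check):
# run an NVML query and fall back to 'N/A' when the device does not support it.
def try_get_info(query, handle, default='N/A'):
    try:
        return query(handle)
    except nvml.NVMLError:
        return default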
def get_gpu_info() -> Optional[List[Dict[str, Any]]]:
    from py3nvml.py3nvml import (
        NVMLError,
        nvmlDeviceGetCount,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlDeviceGetName,
        nvmlInit,
        nvmlShutdown,
    )

    try:
        nvmlInit()
        result = []
        device_count = nvmlDeviceGetCount()
        if not isinstance(device_count, int):
            return None

        for i in range(device_count):
            info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
            if isinstance(info, str):
                return None
            result.append({
                "id": i,
                "name": nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i)),
                "total": info.total,
                "free": info.free,
                "used": info.used,
            })
        nvmlShutdown()
        return result
    except NVMLError as error:
        print("Error fetching GPU information using nvml: %s" % error)
        return None
def on_epoch_end(self, epoch, logs=None):
    for item in self.report:
        try:
            suffix = handle = None
            for n, handle in enumerate(self.deviceHandles):
                if len(self.deviceHandles) == 1 and not self.always_suffix:
                    suffix = ''
                else:
                    # TODO: this will not work nicely if more than 100 GPUs
                    # are in one system
                    suffix = '_%02d' % (n,)
                logs[item + suffix] = np.float32(
                    self.reportable_values[item](handle))
        except nvml.NVMLError as err:
            self.output("Error trying to read out value from NVML: %r" % (err,))
    if self.report and self.verbose_once:
        self.output("Current status for device #% 2d (%s): %r" %
                    (n, nvml.nvmlDeviceGetName(handle), {
                        what: float(call(handle))
                        for what, call in self.reportable_values.items()
                    }))
        self.verbose_once = False  # only print once
def live(self, callback):
    try:
        with Live(
                self.layout,
                refresh_per_second=2,
                screen=False,
                redirect_stderr=False,
                redirect_stdout=False,
        ) as live:
            while True:
                if self.stop_flag:
                    break
                if callback:
                    callback(self)
                if not self.resources_by_endpoint:
                    self.layout["endpoints"].update(
                        Panel(
                            Align.center(
                                Text("Waiting for endpoints to come alive"),
                                vertical="middle",
                            )))
                else:
                    self.endpoints_layout["data"].update(
                        EndpointMonitor(self.resources_by_endpoint))
                    self.endpoints_values = []
                    updated_keys = set()
                    max_value = 0
                    for (
                            endpoint_name,
                            endpoint_data,
                    ) in self.resources_by_endpoint.items():
                        updated_keys.add(endpoint_name)
                        # todo: finish
                        total_used = 0
                        total_max = 0
                        for entry in endpoint_data.values():
                            total_used += entry.used
                            total_max += entry.available
                        max_value = max(max_value, total_max)
                        # self.endpoints_values.append(data)
                        past_entries = self.endpoints_past_values.setdefault(
                            endpoint_name, [])
                        past_entries.append(total_used)
                        self.endpoints_values.append(past_entries)
                    self.endpoints_graph = AsciiGraph(
                        self.endpoints_values, max_value, BACKEND_COLORS)
                    self.endpoints_layout["graph"].update(self.endpoints_graph)
                    self.layout["endpoints"].update(
                        Panel(self.endpoints_layout, title="Endpoints"))
                uptime = datetime.datetime.now() - self.start_time
                self.layout["header"]["info"].update(
                    Align.right(
                        Text(f"""Node ID: {self.node_id}
Uptime: {humanize.naturaldelta(uptime)}
https://discord.gg/94KqBcE"""),
                        vertical="middle",
                    ))
                titles = []
                table = Table.grid()
                table.add_column(style="green")
                table.add_column(no_wrap=True)
                self.cpu_usage[0].append(psutil.cpu_percent(interval=None))
                self.ram_usage[0].append(
                    int(round(psutil.virtual_memory().used / 1024**2)))
                total_gpus_actual = py3nvml.nvmlDeviceGetCount()
                for i in range(total_gpus_actual):
                    handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
                    meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
                    utilization_info = py3nvml.nvmlDeviceGetUtilizationRates(handle)
                    table.add_row(
                        py3nvml.nvmlDeviceGetName(handle),
                        str(round(meminfo.used / 1024**2)),
                    )
                    self.gpu_mem_usage[i].append(round(meminfo.used / 1024**2))
                    self.gpu_usage[i].append(utilization_info.gpu)
                    color = RICH_COLORS[i]
                    titles.append(
                        f"[{color}]" + py3nvml.nvmlDeviceGetName(handle) +
                        f" {utilization_info.gpu}%, {humanize.naturalsize(meminfo.used)}/{humanize.naturalsize(meminfo.total)}"
                        + "[/]")
                self.gpu_layout["utilization"].update(self.gpu_usage_graph)
                self.gpu_layout["memory"].update(self.gpu_mem_usage_graph)
                self.layout["gpu"].update(
                    Panel(self.gpu_layout, title=" ".join(titles)))
                self.cpu_layout["utilization"].update(self.cpu_usage_graph)
                self.cpu_layout["memory"].update(self.ram_usage_graph)
                self.layout["cpu"].update(Panel(self.cpu_layout, title=CPU_NAME))
                self.layout["console"].update(self.tail)
                sleep(1.0)
    except KeyboardInterrupt as e:
        py3nvml.nvmlShutdown()
        raise e
def environment_info(self): if self._environment_info is None: info = {} info["transformers_version"] = version info["framework"] = self.framework info["framework_version"] = self.framework_version info["python_version"] = platform.python_version() info["system"] = platform.system() info["cpu"] = platform.processor() info["architecture"] = platform.architecture()[0] info["date"] = datetime.date(datetime.now()) info["time"] = datetime.time(datetime.now()) try: import psutil except (ImportError): logger.warning( "Psutil not installed, we won't log available CPU memory." "Install psutil (pip install psutil) to log available CPU memory." ) info["cpu_ram_mb"] = "N/A" else: info["cpu_ram_mb"] = bytes_to_mega_bytes( psutil.virtual_memory().total) info["use_gpu"] = self.is_gpu if self.is_gpu: info["num_gpus"] = self.args.n_gpu try: from py3nvml import py3nvml py3nvml.nvmlInit() handle = py3nvml.nvmlDeviceGetHandleByIndex( self.args.device_idx) except ImportError: logger.warning( "py3nvml not installed, we won't log GPU memory usage. " "Install py3nvml (pip install py3nvml) to log information about GPU." ) info["gpu"] = "N/A" info["gpu_ram_mb"] = "N/A" info["gpu_power_watts"] = "N/A" info["gpu_performance_state"] = "N/A" except (OSError, py3nvml.NVMLError): logger.warning( "Error while initializing comunication with GPU. " "We won't log information about GPU.") info["gpu"] = "N/A" info["gpu_ram_mb"] = "N/A" info["gpu_power_watts"] = "N/A" info["gpu_performance_state"] = "N/A" py3nvml.nvmlShutdown() else: info["gpu"] = py3nvml.nvmlDeviceGetName(handle) info["gpu_ram_mb"] = bytes_to_mega_bytes( py3nvml.nvmlDeviceGetMemoryInfo(handle).total) info[ "gpu_power_watts"] = py3nvml.nvmlDeviceGetPowerManagementLimit( handle) / 1000 info[ "gpu_performance_state"] = py3nvml.nvmlDeviceGetPerformanceState( handle) py3nvml.nvmlShutdown() self._environment_info = info return self._environment_info
def getGpuInfo(self):
    if (self._impulse % 2) != 0:
        return self._gpuInfoObj
    try:
        N.nvmlInit()
        gpuInfoObj = {}
        driverVersion = N.nvmlSystemGetDriverVersion()
        deviceCnt = N.nvmlDeviceGetCount()
        gpuInfoObj['DRIVER_VERSION'] = driverVersion
        gpuInfoObj['DEVICE_COUNT'] = deviceCnt
        for dCnt in range(deviceCnt):
            deviceInfoObj = {}
            handle = N.nvmlDeviceGetHandleByIndex(dCnt)
            name = N.nvmlDeviceGetName(handle)
            try:
                fan = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan = 'N/A'
            try:
                temp = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temp = 'N/A'
            try:
                powerUsage = round(N.nvmlDeviceGetPowerUsage(handle) / 1000)
            except N.NVMLError:
                powerUsage = 'N/A'
            try:
                powerLimit = round(N.nvmlDeviceGetPowerManagementLimit(handle) / 1000)
            except N.NVMLError:
                powerLimit = 'N/A'
            try:
                memInfo = N.nvmlDeviceGetMemoryInfo(handle)
                memUsage = round(memInfo.used / 1024 / 1024)
                memTotal = round(memInfo.total / 1024 / 1024)
            except N.NVMLError:
                memUsage = 'N/A'
                memTotal = 'N/A'
            try:
                util = N.nvmlDeviceGetUtilizationRates(handle).gpu
            except N.NVMLError:
                util = 'N/A'
            deviceInfoObj['NAME'] = name
            deviceInfoObj['FAN'] = fan
            deviceInfoObj['TEMP'] = temp
            deviceInfoObj['POWER_USAGE'] = powerUsage
            deviceInfoObj['POWER_LIMIT'] = powerLimit
            deviceInfoObj['MEM_USAGE'] = memUsage
            deviceInfoObj['MEM_TOTAL'] = memTotal
            deviceInfoObj['UTIL'] = util
            gpuProcessObj = {}
            try:
                processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                processes = []
            for pCnt, process in enumerate(processes):
                gpuMem = round(process.usedGpuMemory / 1024 / 1024)
                pid = process.pid
                try:
                    p = psutil.Process(pid)
                    attrs = p.as_dict(attrs=['name', 'username', 'status'])
                except psutil.ZombieProcess:
                    attrs = {'name': 'unknown', 'username': '******', 'status': 'zombie'}
                except psutil.Error:
                    # Process vanished or is inaccessible; use placeholders so
                    # `attrs` is always defined below (the original bare
                    # `except: pass` left it unbound here).
                    attrs = {'name': 'unknown', 'username': 'unknown', 'status': 'unknown'}
                gpuProcessObj[str(pCnt)] = {
                    'PID': pid,
                    'MEM': gpuMem,
                    'NAME': attrs['name'],
                    'USERNAME': self._getSubuidName(attrs['username']),
                    'STATUS': attrs['status']
                }
            deviceInfoObj['PROCESS'] = gpuProcessObj
            gpuInfoObj[str(dCnt)] = deviceInfoObj
        N.nvmlShutdown()
    except N.NVMLError as err:
        N.nvmlShutdown()
        print(err)
        gpuInfoObj = {}
    self._gpuInfoObj = gpuInfoObj
    return self._gpuInfoObj
def get_device_name(device_handle):
    """Get GPU device name."""
    try:
        return nativestr(pynvml.nvmlDeviceGetName(device_handle))
    except pynvml.NVMLError:
        return "NVIDIA"
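
# get_device_name uses a nativestr helper that is not shown. A plausible
# sketch (an assumption): normalize the bytes that some NVML binding versions
# return into str, mirroring the decode logic seen in other snippets here.
def nativestr(value):
    return value.decode('utf-8', errors='replace') if isinstance(value, bytes) else value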
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 / 1024)
        process['pid'] = nv_process.pid
        return process

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # for python3, to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    processes = []
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None  # Not supported (in both cases)
    else:
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in (nv_comp_processes + nv_graphics_processes):
            # TODO: could be more information such as system memory usage,
            # CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': int(power / 1000) if power is not None else None,
        'enforced.power.limit': int(power_limit / 1000) if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': int(memory.used / 1024 / 1024) if memory else None,
        'memory.total': int(memory.total / 1024 / 1024) if memory else None,
        'processes': processes,
    }
    return gpu_info
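
# A minimal driver sketch for get_gpu_info above (assumes `N` is the py3nvml
# module alias used in the function body; the init/shutdown framing is an
# assumption about how the caller is meant to use it):
N.nvmlInit()
try:
    gpus = [get_gpu_info(N.nvmlDeviceGetHandleByIndex(i))
            for i in range(N.nvmlDeviceGetCount())]
finally:
    N.nvmlShutdown()
for gpu in gpus:
    print("[%d] %s: %s/%s MB" % (gpu['index'], gpu['name'],
                                 gpu['memory.used'], gpu['memory.total']))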
#!/usr/bin/env python3
# need package: py3nvml
# if you use python 2, you need nvidia-ml-py and change the import
from __future__ import print_function

# import pynvml
import py3nvml.py3nvml as pynvml
import datetime

pynvml.nvmlInit()
print("Driver Version:", pynvml.nvmlSystemGetDriverVersion())
deviceCount = pynvml.nvmlDeviceGetCount()
for i in range(deviceCount):
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    print("Device {}: {}".format(i, pynvml.nvmlDeviceGetName(handle)))
pynvml.nvmlShutdown()
def _get_device_name(gpu):
    return {'name': pynvml.nvmlDeviceGetName(gpu)}