def collect_metrics(self):
    """
    Collect NVIDIA GPU metrics (e.g. temperature, power consumption,
    fan speed, etc.)
    """
    data_list = []
    for gpu_num in range(nvmlDeviceGetCount()):
        handle = nvmlDeviceGetHandleByIndex(gpu_num)
        device_name = DEVICE_NAME_FORMAT % gpu_num
        power_usage = float(nvmlDeviceGetPowerUsage(handle)) / 1000.0  # mW -> W
        fan_speed = nvmlDeviceGetFanSpeed(handle)
        temperature = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
        data_list.append({
            'measurement': device_name,
            'tags': {'host': 'minar', 'gpu': device_name},
            'fields': {
                'power_usage': power_usage,
                'fan_speed': fan_speed,
                'temperature': temperature
            }
        })
    time.sleep(PERIOD_SECS)
    return data_list
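# Hedged usage sketch (not from the original source): collect_metrics()
# assumes NVML is already initialized and that DEVICE_NAME_FORMAT and
# PERIOD_SECS are defined elsewhere; 'collector' below is a hypothetical
# instance of the surrounding class, and the values are illustrative.
from pynvml import nvmlInit, nvmlShutdown

DEVICE_NAME_FORMAT = 'gpu-%d'   # assumed format string
PERIOD_SECS = 10                # assumed polling period

nvmlInit()
try:
    points = collector.collect_metrics()  # list of measurement dicts
finally:
    nvmlShutdown()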
def power_usage(self):
    """
    Power usage for the device in milliwatts

    From the NVIDIA documentation:
    - On Fermi and Kepler GPUs the reading is accurate to within
      +/- 5% of current power draw.
    """
    return nv.nvmlDeviceGetPowerUsage(self._handle)
def get(index):
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
    except pynvml.NVMLError_GpuIsLost:
        return None
    memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return dict(
        nvmlDeviceGetName=pynvml.nvmlDeviceGetName(handle).decode('utf-8'),
        nvmlDeviceGetMemoryInfo=dict(
            total=memory_info.total,
            free=memory_info.free,
            used=memory_info.used,
        ),
        nvmlDeviceGetUtilizationRates=get_utilization_rates(handle),
        nvmlDeviceGetFanSpeed=get_fan_speed(handle),
        nvmlDeviceGetTemperature=pynvml.nvmlDeviceGetTemperature(
            handle, pynvml.NVML_TEMPERATURE_GPU),
        nvmlDeviceGetTemperatureThreshold=dict(
            slowdown=pynvml.nvmlDeviceGetTemperatureThreshold(
                handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SLOWDOWN),
            shutdown=pynvml.nvmlDeviceGetTemperatureThreshold(
                handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SHUTDOWN),
        ),
        nvmlDeviceGetPowerManagementLimit=(
            pynvml.nvmlDeviceGetPowerManagementLimit(handle)),
        nvmlDeviceGetPowerUsage=pynvml.nvmlDeviceGetPowerUsage(handle),
    )
def _get_gpu_usage(gpu_count):
    import pynvml
    gpus = []
    for i in range(gpu_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        try:
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
            temp = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
            try:
                # Power draw as a percentage of the enforced power limit.
                power_usage = (
                    (pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0)
                    / (pynvml.nvmlDeviceGetEnforcedPowerLimit(handle) / 1000.0)
                    * 100)
            except pynvml.NVMLError as e:
                logger.error(
                    "Couldn't extract power usage due to NVML exception: {}"
                    .format(str(e)))
                power_usage = -9999
            gpus.append(
                (handle, util.gpu, util.memory,
                 (memory.used / float(memory.total)) * 100, temp, power_usage))
        except pynvml.NVMLError as e:
            logger.error(
                "Couldn't extract gpu usage information due to NVML exception: {}"
                .format(str(e)))
            return None
    return gpus
def get_mean_power(cls, engine_id=None):
    # Returns mean power used by all the GPUs at the moment the
    # function is called. Units are Watts.
    power = 0
    pynvml.nvmlInit()
    if engine_id:  # note: engine_id=0 is falsy and falls into the all-GPU branch
        handle = pynvml.nvmlDeviceGetHandleByIndex(engine_id)
        power = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000
    else:
        for i in range(0, pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            power = power + (pynvml.nvmlDeviceGetPowerUsage(handle) / 1000)
        power = round(power / pynvml.nvmlDeviceGetCount())
    return power
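# Hedged usage sketch (illustrative, not from the original source):
# 'PowerMonitor' stands in for the class that owns get_mean_power().
# Because engine_id=0 is falsy, targeting GPU 0 would need a sentinel
# check such as `engine_id is not None` in the method above.
mean_watts = PowerMonitor.get_mean_power()             # average over all GPUs
gpu1_watts = PowerMonitor.get_mean_power(engine_id=1)  # single GPU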
def getPowerDraw(handle):
    try:
        powDraw = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
        powDrawStr = '%.2f' % powDraw
    except pynvml.NVMLError as err:
        powDrawStr = handleError(err)
        PUSH_TO_CW = False
    return powDrawStr
def _crawl_in_system(self):
    '''
    nvidia-smi returns the following sections: MEMORY, UTILIZATION, ECC,
    TEMPERATURE, POWER, CLOCK, COMPUTE, PIDS, PERFORMANCE,
    SUPPORTED_CLOCKS, PAGE_RETIREMENT, ACCOUNTING

    Currently, the following are requested based on dlaas requirements:
    utilization.gpu, utilization.memory, memory.total, memory.free,
    memory.used

    nvidia-smi --query-gpu=utilization.gpu,utilization.memory,\
    memory.total,memory.free,memory.used --format=csv,noheader,nounits
    '''
    if self._init_nvml() == -1:
        return
    self.inspect_arr = exec_dockerps()
    num_gpus = pynvml.nvmlDeviceGetCount()
    for gpuid in range(0, num_gpus):
        gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(gpuid)
        temperature = pynvml.nvmlDeviceGetTemperature(
            gpuhandle, pynvml.NVML_TEMPERATURE_GPU)
        memory = pynvml.nvmlDeviceGetMemoryInfo(gpuhandle)
        mem_total = memory.total / 1024 / 1024
        mem_used = memory.used / 1024 / 1024
        mem_free = memory.free / 1024 / 1024
        power_draw = pynvml.nvmlDeviceGetPowerUsage(gpuhandle) / 1000
        power_limit = pynvml.nvmlDeviceGetEnforcedPowerLimit(gpuhandle) / 1000
        util = pynvml.nvmlDeviceGetUtilizationRates(gpuhandle)
        util_gpu = util.gpu
        util_mem = util.memory
        entry = {
            'utilization': {'gpu': util_gpu, 'memory': util_mem},
            'memory': {'total': mem_total, 'free': mem_free,
                       'used': mem_used},
            'temperature': temperature,
            'power': {'draw': power_draw, 'limit': power_limit}
        }
        key = self._get_feature_key(gpuhandle, gpuid)
        if gpuid == num_gpus - 1:
            self._shutdown_nvml()
        yield (key, entry, 'gpu')
    return
def stats(self):
    stats = {}
    for i in range(0, self.gpu_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        try:
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
            temp = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
            in_use_by_us = gpu_in_use_by_this_process(handle)
            stats["gpu.{}.{}".format(i, "gpu")] = util.gpu
            stats["gpu.{}.{}".format(i, "memory")] = util.memory
            stats["gpu.{}.{}".format(i, "memoryAllocated")] = (
                memory.used / float(memory.total)) * 100
            stats["gpu.{}.{}".format(i, "temp")] = temp
            if in_use_by_us:
                stats["gpu.process.{}.{}".format(i, "gpu")] = util.gpu
                stats["gpu.process.{}.{}".format(i, "memory")] = util.memory
                stats["gpu.process.{}.{}".format(i, "memoryAllocated")] = (
                    memory.used / float(memory.total)) * 100
                stats["gpu.process.{}.{}".format(i, "temp")] = temp
            # Some GPUs don't provide information about power usage
            try:
                power_watts = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
                power_capacity_watts = (
                    pynvml.nvmlDeviceGetEnforcedPowerLimit(handle) / 1000.0)
                power_usage = (power_watts / power_capacity_watts) * 100
                stats["gpu.{}.{}".format(i, "powerWatts")] = power_watts
                stats["gpu.{}.{}".format(i, "powerPercent")] = power_usage
                if in_use_by_us:
                    stats["gpu.process.{}.{}".format(i, "powerWatts")] = \
                        power_watts
                    stats["gpu.process.{}.{}".format(i, "powerPercent")] = \
                        power_usage
            except pynvml.NVMLError:
                pass
        except pynvml.NVMLError:
            pass
    if psutil:
        # net = psutil.net_io_counters()
        sysmem = psutil.virtual_memory()
        stats["cpu"] = psutil.cpu_percent()
        stats["memory"] = sysmem.percent
    return stats
def get_gpu_status(gpu_index=0):
    # Initialize NVML before querying the device.
    N.nvmlInit()
    handle = N.nvmlDeviceGetHandleByIndex(gpu_index)

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))
    try:
        temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)
    except N.NVMLError:
        memory = None
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None
    # real gpu index
    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature': temperature,
        'utilization': utilization.gpu if utilization else None,
        'power': int(power / 1000) if power is not None else None,
        'enforced.power': (int(power_limit / 1000)
                           if power_limit is not None else None),
        # Convert bytes into MBytes
        'memory.used': int(memory.used / 1024 / 1024) if memory else None,
        'memory.total': int(memory.total / 1024 / 1024) if memory else None,
    }
    # release resource
    N.nvmlShutdown()
    return GPUStat(gpu_info)
def gpu_info(gpu_handle, i: int = 0) -> List[Dict[str, Any]]:
    power = pynvml.nvmlDeviceGetPowerUsage(gpu_handle) / 1000
    temperature = pynvml.nvmlDeviceGetTemperature(
        gpu_handle, pynvml.NVML_TEMPERATURE_GPU)
    free_memory = best_prefix(pynvml.nvmlDeviceGetMemoryInfo(gpu_handle).free)
    return [
        dict(full_text=f'GPU Power {power:.1f} W', name=f'gpu{i}_power'),
        dict(full_text=free_memory.format('GPU RAM {value:.1f} {unit}'),
             name=f'gpu{i}_free_memory'),
        dict(full_text=f'GPU Temp {temperature} ℃',
             name=f'gpu{i}_temperature'),
    ]
def get_pwr(self):
    if self.nvh is None:
        return None
    pwr = None
    try:
        pwr = nv.nvmlDeviceGetPowerUsage(self.nvh) / 1000
        self.pwr = pwr
    except nv.NVMLError:
        logger.error(
            f"{self.pci_dev.slot_name}/{self.name}] get pwr failed !!")
    return pwr
def get_power(handle):
    power_usage = -1
    power_max = -1
    power_percent = -1
    try:
        # defaults to milliwatts
        power_usage = pynvml.nvmlDeviceGetPowerUsage(handle)
        power_usage = power_usage / 1000.
        # defaults to milliwatts
        power_max = pynvml.nvmlDeviceGetEnforcedPowerLimit(handle)
        power_max = power_max / 1000
        power_percent = (float(power_usage) / power_max) * 100.
    except Exception:
        pass
    return power_usage, power_max, power_percent
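# Hedged usage sketch (not from the original source): get_power() expects
# an already-initialized NVML session and a device handle.
import pynvml

pynvml.nvmlInit()
try:
    watts, limit_watts, percent = get_power(
        pynvml.nvmlDeviceGetHandleByIndex(0))
    print(f'{watts:.1f} W / {limit_watts:.0f} W ({percent:.0f}%)')
finally:
    pynvml.nvmlShutdown()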
def power_usage(self):
    """Retrieves instantaneous power usage (W) of all GPUs in a list.

    Note: Requires NVML to be initialized.
    """
    gpu_power_usages = []
    for handle in self._handles:
        try:
            # Retrieves power usage in mW; divide by 1000 to get W.
            power_usage = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000
            gpu_power_usages.append(power_usage)
        except pynvml.NVMLError:
            pass
    return gpu_power_usages
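# Hedged usage sketch: assumes the surrounding class stores NVML handles
# in self._handles; 'monitor' below is a hypothetical instance, not part
# of the original source.
import pynvml

pynvml.nvmlInit()
try:
    print(monitor.power_usage())  # e.g. [42.3, 38.9] watts per GPU
finally:
    pynvml.nvmlShutdown()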
def get_perf(proc=None, recursive=True, children_pool=None, metrics=None):
    """ Get process performance metrics """
    _initialize_pynvml()
    if metrics is None:
        metrics = OrderedDict()
    metrics['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S')
    metrics['cpu_total'] = psutil.cpu_count()
    metrics['cpu_used'] = psutil.cpu_percent()
    if proc:
        if recursive:
            percents = []
            _recursive_proc(percents, proc, children_pool,
                            lambda p: p.cpu_percent())
            metrics['proc_count'] = len(percents)
            metrics['cpu_used_proc'] = sum(percents)
        else:
            metrics['cpu_used_proc'] = proc.cpu_percent()
    mem = psutil.virtual_memory()
    metrics['mem_total'] = mem.total
    metrics['mem_used'] = mem.used
    if proc:
        if recursive:
            rss = []
            _recursive_proc(rss, proc, children_pool,
                            lambda p: p.memory_info().rss)
            metrics['mem_used_proc'] = sum(rss)
        else:
            metrics['mem_used_proc'] = proc.memory_info().rss
    for i, h in enumerate(_gpu_devices):
        used = pynvml.nvmlDeviceGetUtilizationRates(h)
        mem = pynvml.nvmlDeviceGetMemoryInfo(h)
        metrics[f'gpu_{i}_used'] = used.gpu
        metrics[f'gpu_{i}_mem_used'] = mem.used  # used.memory
        metrics[f'gpu_{i}_mem_total'] = mem.total
        metrics[f'gpu_{i}_power_used'] = pynvml.nvmlDeviceGetPowerUsage(h)
        metrics[f'gpu_{i}_power_total'] = \
            pynvml.nvmlDeviceGetPowerManagementLimit(h)
    return metrics
def measureEnergy(runningThread, delay):
    powers = []
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(1)
    # while(~runningThread.is_alive()):
    i = 0
    while i < 10:
        powers.append(pynvml.nvmlDeviceGetPowerUsage(handle))  # milliwatts
        time.sleep(delay)
        i += 1
    pynvml.nvmlShutdown()
    # Riemann-sum approximation: samples are in mW, so the result is mW*s.
    energy = np.sum(powers) * delay
    print(powers)
    print(energy)
    return energy
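# Hedged follow-up (assumption, not in the source): since the samples are
# milliwatts and 'delay' is in seconds, dividing the result by 1000 gives
# joules. 'worker_thread' is a hypothetical thread object.
energy_mws = measureEnergy(worker_thread, delay=0.1)
energy_joules = energy_mws / 1000.0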
def get_gpu_stat(handle):
    ret = {}
    # get temperature
    try:
        ret['temp'] = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        ret['temp'] = None
    # get power usage
    try:
        ret['watt'] = N.nvmlDeviceGetPowerUsage(handle) / 1000
    except N.NVMLError:
        ret['watt'] = None
    ret['fan'] = 0
    # return information gathered
    # print("temp: {0}, watt: {1}".format(ret['temp'], ret['watt']))
    return ret
def monitoring_task(inference_thread):
    """
    This function defines the background action, i.e. monitoring the GPU
    memory and power usage for a given number of batches. For now it only
    listens to the GPU at the hard-coded index 3; this will have to be
    made more flexible.
    """
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(3)
    while True:
        try:
            power = float(nvmlDeviceGetPowerUsage(handle)) / 1000  # convert to W
            power_list.append(power)
        except NVMLError:
            pass
        time.sleep(0.02)
        if not inference_thread.is_alive():
            nvmlShutdown()
            break
def _watch_power(logfile: Path = None, sender: Connection = None,
                 display: bool = False):
    """
    Poll GPU and log/display current power consumption.
    Update frequency: every 1 second.

    :param logfile: logfile path (the file will be created/overwritten).
    :param sender: sender-end connection.
    :param display: display consumption in terminal.
    :return: None
    """
    total = 0
    killer = _GracefulKiller()
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    if logfile is not None:
        f = open(logfile, 'w')
    while not killer.kill_now:  # exit gracefully
        power = int(nvmlDeviceGetPowerUsage(handle)) / 1000  # NVML reports milliwatts
        total += power / 3600 / 1000  # convert 1-second samples to kWh
        if display:
            print(f'\r{datetime.now().strftime("%H:%M:%S")} '
                  f'{total:.5f} kWh so far', end='')
        if logfile is not None:
            f.write(f'{datetime.now()} {power}\n')
        time.sleep(1)
    print(total)
    if display:
        print('', end='\n')
    if sender is not None:
        sender.send(total)
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))
    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported
    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            # TODO: could be more information such as system memory
            # usage, CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': (power_limit // 1000
                                 if power_limit is not None else None),
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    return gpu_info
# Monitoring loop: wait for an idle GPU (low memory use, performance
# state P8) and launch training on it. The training helpers
# (get_data_list, k_fold) are defined elsewhere in the project.
import pynvml
import os
import time

train_list = get_data_list(
    "/home/ubuntu/liuyiyao/3D_breast_Seg/Dataset/miccai_data_64*256*256_patch",
    ratio=0.8)
pynvml.nvmlInit()
device_num = pynvml.nvmlDeviceGetCount()
while True:
    for i in range(device_num):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        power_state = pynvml.nvmlDeviceGetPowerUsage(handle)
        power_perf = pynvml.nvmlDeviceGetPerformanceState(handle)
        meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
        pids = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        print("gpu:", i,
              ' mem_used:', meminfo.used / 1024 / 1024,
              ' power:', power_state / 1000,
              ' perf:', power_perf)
        if meminfo.used / 1024 / 1024 < 5000 and power_perf == 8:
            print("start1")
            os.environ["CUDA_VISIBLE_DEVICES"] = "%d" % i
            print("start2")
            device_ids = [0]
            print("start ", device_ids[0])
            k_fold(5, train_list, device_ids)
def _get_full_status_nvml():
    devices_status = []
    devices_full_status = []
    for handle in _static_info['private']['gpu']['handles']:
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        process_info = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        devices_status.append({
            'utilization': {'gpu': util.gpu, 'memory': util.memory},
            'memory': {
                'percent': int(1000.0 * mem_info.used / mem_info.total) / 10.0
            },
            'processes': len(process_info)
        })
        with _process_info_lock:
            process_list = []
            for p in process_info:
                info = _process_info[p.pid]
                info['gpu_memory'] = p.usedGpuMemory
                process_list.append(info)
        process_list.sort(key=lambda i: i['gpu_memory'] or 0, reverse=True)
        full_status = {
            'memory': {'free': mem_info.free, 'used': mem_info.used},
            'process_list': process_list
        }
        try:
            full_status['fan_speed'] = pynvml.nvmlDeviceGetFanSpeed(handle)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['temperature'] = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['performance'] = pynvml.nvmlDeviceGetPerformanceState(
                handle)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['power'] = {
                'usage': pynvml.nvmlDeviceGetPowerUsage(handle),
                'limit': pynvml.nvmlDeviceGetPowerManagementLimit(handle)
            }
        except pynvml.NVMLError_NotSupported:
            pass
        devices_full_status.append(full_status)
    status = {
        'basic': {'devices': devices_status},
        'full': {'devices': devices_full_status}
    }
    return status
def get_power_usage(handle):
    """
    Returns power usage in milliwatts

    https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7ef7dff0ff14238d08a19ad7fb23fc87
    """
    return pynvml.nvmlDeviceGetPowerUsage(handle)
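# Hedged sketch of a watts-returning wrapper around get_power_usage();
# the helper name is illustrative and not part of the original source.
def get_power_usage_watts(handle):
    """Same reading as get_power_usage(), converted from mW to W."""
    return get_power_usage(handle) / 1000.0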
def test_nvmlDeviceGetPowerUsage(ngpus, handles):
    for i in range(ngpus):
        power_mWatts = pynvml.nvmlDeviceGetPowerUsage(handles[i])
        assert power_mWatts >= 0.0
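# Hedged sketch of pytest fixtures that could back the test above; the
# fixture bodies are assumptions, not part of the original test suite.
import pytest
import pynvml

@pytest.fixture(scope='module')
def ngpus():
    pynvml.nvmlInit()
    yield pynvml.nvmlDeviceGetCount()
    pynvml.nvmlShutdown()

@pytest.fixture(scope='module')
def handles(ngpus):
    return [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(ngpus)]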
def stats(self):
    stats = {}
    for i in range(0, self.gpu_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        try:
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
            temp = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
            in_use_by_us = gpu_in_use_by_this_process(handle)
            stats["gpu.{}.{}".format(i, "gpu")] = util.gpu
            stats["gpu.{}.{}".format(i, "memory")] = util.memory
            stats["gpu.{}.{}".format(i, "memoryAllocated")] = (
                memory.used / float(memory.total)) * 100
            stats["gpu.{}.{}".format(i, "temp")] = temp
            if in_use_by_us:
                stats["gpu.process.{}.{}".format(i, "gpu")] = util.gpu
                stats["gpu.process.{}.{}".format(i, "memory")] = util.memory
                stats["gpu.process.{}.{}".format(i, "memoryAllocated")] = (
                    memory.used / float(memory.total)) * 100
                stats["gpu.process.{}.{}".format(i, "temp")] = temp
            # Some GPUs don't provide information about power usage
            try:
                power_watts = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
                power_capacity_watts = (
                    pynvml.nvmlDeviceGetEnforcedPowerLimit(handle) / 1000.0)
                power_usage = (power_watts / power_capacity_watts) * 100
                stats["gpu.{}.{}".format(i, "powerWatts")] = power_watts
                stats["gpu.{}.{}".format(i, "powerPercent")] = power_usage
                if in_use_by_us:
                    stats["gpu.process.{}.{}".format(i, "powerWatts")] = \
                        power_watts
                    stats["gpu.process.{}.{}".format(i, "powerPercent")] = \
                        power_usage
            except pynvml.NVMLError:
                pass
        except pynvml.NVMLError:
            pass
    if psutil:
        net = psutil.net_io_counters()
        sysmem = psutil.virtual_memory()
        stats["cpu"] = psutil.cpu_percent()
        stats["memory"] = sysmem.percent
        stats["network"] = {
            "sent": net.bytes_sent - self.network_init["sent"],
            "recv": net.bytes_recv - self.network_init["recv"]
        }
        # TODO: maybe show other partitions, will likely need user to configure
        stats["disk"] = psutil.disk_usage('/').percent
        stats["proc.memory.availableMB"] = sysmem.available / 1048576.0
        try:
            stats["proc.memory.rssMB"] = self.proc.memory_info().rss / \
                1048576.0
            stats["proc.memory.percent"] = self.proc.memory_percent()
            stats["proc.cpu.threads"] = self.proc.num_threads()
        except psutil.NoSuchProcess:
            pass
    return stats
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]
        # TODO: ps_process is being cached, but the dict below is not.
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        # if drivers are not TTC this will be None.
        usedmem = (nv_process.usedGpuMemory // MB
                   if nv_process.usedGpuMemory else None)
        process['gpu_memory_usage'] = usedmem
        process['cpu_percent'] = ps_process.cpu_percent()
        process['cpu_memory_usage'] = (
            round((ps_process.memory_percent() / 100.0) *
                  psutil.virtual_memory().total) // MB)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))
    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle)
    except N.NVMLError:
        utilization_enc = None  # Not supported
    try:
        utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
    except N.NVMLError:
        utilization_dec = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported
    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        # A single process might run in both graphics and compute mode;
        # however, we will display the process only once.
        seen_pids = set()
        for nv_process in nv_comp_processes + nv_graphics_processes:
            if nv_process.pid in seen_pids:
                continue
            seen_pids.add(nv_process.pid)
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass
            except FileNotFoundError:
                # Ignore the exception which probably has occurred
                # from psutil, due to a non-existent PID (see #95).
                # The exception should have been translated, but
                # there appears to be a bug of psutil. It is unlikely
                # FileNotFoundError is thrown in different situations.
                pass
        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            try:
                process['cpu_percent'] = cache_process.cpu_percent()
            except psutil.NoSuchProcess:
                process['cpu_percent'] = 0.0
            except FileNotFoundError:
                # Ignore the exception which probably has occurred
                # from psutil, due to a non-existent PID (see #95).
                process['cpu_percent'] = 0.0

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else 0,
        'utilization.enc': utilization_enc[0] if utilization_enc else None,
        'utilization.dec': utilization_dec[0] if utilization_dec else None,
        'power.draw': power // 1000 if power is not None else 0,
        'enforced.power.limit': (power_limit // 1000
                                 if power_limit is not None else 0),
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else 0,
        'memory.total': memory.total // MB if memory else 0,
        'processes': processes,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
def get_infos():
    """Get all information about all your graphics cards.

    Returns:
        dict: The returned result is a dict with 3 keys: count,
            driver_version and devices:
            count: Number of gpus found
            driver_version: The version of the system's graphics driver
            devices: A list in which every item is a namedtuple Device
                with 10 fields, for example id, name and fan_speed etc.
                Note that the process field is also a namedtuple with
                11 fields.
    """
    infos = {}
    Device = namedtuple(
        "Device",
        [
            "id", "name", "free", "used", "total", "temperature",
            "fan_speed", "power_usage", "power_state", "process",
        ],
    )
    Process = namedtuple(
        "Process",
        [
            "pid", "memory_percent", "status", "username", "num_threads",
            "cpu_num", "cpu_percent", "name", "cmdline", "used_gpu_mem",
            "create_time",
        ],
    )
    driver_version = pynvml.nvmlSystemGetDriverVersion().decode()
    device_count = pynvml.nvmlDeviceGetCount()
    devices = []
    for i in range(device_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle).decode()
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        power_usage = pynvml.nvmlDeviceGetPowerUsage(
            handle)  # Power usage in milliwatts (mW)
        processes = pynvml.nvmlDeviceGetComputeRunningProcesses(
            handle)  # Which processes are using the GPU
        process_info = []
        for p in processes:
            # append a Process namedtuple to process_info
            pid = p.pid
            used_gpu_mem = p.usedGpuMemory
            p = psutil.Process(pid=pid)
            _ = p.cpu_percent()
            time.sleep(0.05)
            process_info.append(
                Process(
                    pid=pid,
                    memory_percent=p.memory_percent(),
                    status=p.status(),
                    username=p.username(),
                    num_threads=p.num_threads(),
                    cpu_num=p.cpu_num(),
                    cpu_percent=p.cpu_percent(),
                    name=p.name(),
                    cmdline=" ".join(p.cmdline()),
                    used_gpu_mem=used_gpu_mem,
                    create_time=p.create_time(),
                ))
        try:
            fan_speed = pynvml.nvmlDeviceGetFanSpeed(handle)
        except pynvml.NVMLError_NotSupported:
            fan_speed = None
        power_state = pynvml.nvmlDeviceGetPowerState(handle)
        temperature = pynvml.nvmlDeviceGetTemperature(
            handle, pynvml.NVML_TEMPERATURE_GPU)
        devices.append(
            Device(
                id=i,
                name=name,
                free=mem_info.free,
                used=mem_info.used,
                total=mem_info.total,
                temperature=temperature,
                fan_speed=fan_speed,
                power_usage=power_usage,
                power_state=power_state,
                process=process_info,
            ))
    infos["count"] = device_count
    infos["driver_version"] = driver_version
    infos["devices"] = devices
    return infos
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_last_used(index):
        last_useds = []
        if not os.path.exists('gpu_history.pkl'):
            pickle.dump({}, open('gpu_history.pkl', 'wb'))
        with open('gpu_history.pkl', 'rb') as f:
            history = pickle.load(f)
            if platform.node() in history:
                for user, last_used in history[
                        platform.node()][index].items():
                    # 1 day = 24 hours, 1 hour = 3600 seconds
                    used_before = \
                        (datetime.now() - last_used['last_used']).days * 24 + \
                        (datetime.now() - last_used['last_used']).seconds / 3600
                    last_useds.append((user, used_before))
                return last_useds
            else:
                return []

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['cpu_percent'] = ps_process.cpu_percent()
        process['cpu_memory_usage'] = \
            round((ps_process.memory_percent() / 100.0) *
                  psutil.virtual_memory().total)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))
    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported
    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass
        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            process['cpu_percent'] = cache_process.cpu_percent()

    index = N.nvmlDeviceGetIndex(handle)
    last_used = get_last_used(index)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': (power_limit // 1000
                                 if power_limit is not None else None),
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
        'last_used': last_used,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))
    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported
    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            # TODO: could be more information such as system memory
            # usage, CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': (power_limit // 1000
                                 if power_limit is not None else None),
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    return gpu_info
def get_gpu_pid_info():
    """Retrieves the process IDs of processes running on the GPU."""
    gpus = []
    device_count = -1
    try:
        nvmlInit()
        device_count = nvmlDeviceGetCount()
        gpus = [{}] * device_count
        for i in range(device_count):
            gpus[i] = {'id': i}
            handle = nvmlDeviceGetHandleByIndex(i)
            device_name = nvmlDeviceGetName(handle)
            gpus[i]['name'] = device_name
            try:
                util = nvmlDeviceGetUtilizationRates(handle)
                gpus[i]['utilization'] = util.gpu
            except NVMLError as err:
                print(f'Error while reading GPU utilization for GPU {i}: {err}',
                      file=sys.stderr)
            try:
                mem_info = nvmlDeviceGetMemoryInfo(handle)
                gpus[i]['mem_total'] = mem_info.total
                gpus[i]['mem_used'] = mem_info.used
            except NVMLError as err:
                print(f'Error while reading memory utilization for GPU {i}: {err}',
                      file=sys.stderr)
            try:
                fan_speed = nvmlDeviceGetFanSpeed(handle)
                gpus[i]['fan_speed'] = fan_speed
            except NVMLError as err:
                print(f'Error while reading fan speed for GPU {i}: {err}',
                      file=sys.stderr)
            try:
                temp = nvmlDeviceGetTemperature(handle, 0)  # 0 == NVML_TEMPERATURE_GPU
                gpus[i]['temp'] = temp
            except NVMLError as err:
                print(f'Error while reading temperature for GPU {i}: {err}',
                      file=sys.stderr)
            try:
                power_usage = nvmlDeviceGetPowerUsage(handle)
                gpus[i]['power_usage'] = round(power_usage / 1000.)
            except NVMLError as err:
                print(f'Error while reading power usage for GPU {i}: {err}',
                      file=sys.stderr)
            try:
                power_limit = nvmlDeviceGetEnforcedPowerLimit(handle)
                gpus[i]['power_limit'] = round(power_limit / 1000.)
            except NVMLError as err:
                print(f'Error while reading power limit for GPU {i}: {err}',
                      file=sys.stderr)
            gpus[i]['processes'] = []
            try:
                processes = nvmlDeviceGetComputeRunningProcesses(handle)
                for process in processes:
                    process_name = nvmlSystemGetProcessName(process.pid).decode()
                    gpus[i]['processes'].append(
                        {'pid': process.pid, 'name': process_name})
            except NVMLError as err:
                print(f'Error while reading processes for GPU {i}: {err}',
                      file=sys.stderr)
    except NVMLError as err:
        print(f'Error while reading GPU information: {err}', file=sys.stderr)
    nvmlShutdown()
    return gpus, device_count
def power_for(device_handle):
    try:
        return pynvml.nvmlDeviceGetPowerUsage(device_handle)
    except pynvml.NVMLError:
        return None
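# Hedged usage sketch (illustrative): power_for() returns milliwatts or
# None when the reading is unsupported, so guard before converting.
mw = power_for(handle)
watts = mw / 1000.0 if mw is not None else None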
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        # if drivers are not TTC this will be None.
        usedmem = nv_process.usedGpuMemory // MB if \
            nv_process.usedGpuMemory else None
        process['gpu_memory_usage'] = usedmem
        process['cpu_percent'] = ps_process.cpu_percent()
        process['cpu_memory_usage'] = \
            round((ps_process.memory_percent() / 100.0) *
                  psutil.virtual_memory().total)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))
    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported
    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass
        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            process['cpu_percent'] = cache_process.cpu_percent()

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': (power_limit // 1000
                                 if power_limit is not None else None),
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    GPUStatCollection.clean_processes()
    return gpu_info