def show_process(header, processes):
    print(header)
    # Sort processes by GPU memory usage, largest first.
    processes = sorted(processes, key=lambda x: toMiB(x.usedGpuMemory), reverse=True)
    buf = {}  # a plain dict is enough; the original defaultdict() had no default factory
    for p in processes:
        p_pid = p.pid
        p_name = nvmlSystemGetProcessName(p_pid).decode()
        # Keep only the executable name (strip arguments and leading path).
        p_name = p_name.split(' ')[0].split('/')[-1]
        p_mem = toMiB(p.usedGpuMemory)
        buf[p_name] = p_mem
    message = [f"{k}: [{v}MiB]" for k, v in buf.items()]
    print('\n'.join(message))
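# Minimal usage sketch for show_process(), assuming the pynvml symbols used above
# (nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetComputeRunningProcesses,
# nvmlShutdown) are star-imported and that the module defines toMiB(); the toMiB
# definition here is only a guess at that helper, not the original one.
def toMiB(num_bytes):
    # usedGpuMemory can be None when NVML cannot attribute memory to the process
    return (num_bytes or 0) // (1024 * 1024)

if __name__ == "__main__":
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)  # first GPU
    show_process("Compute processes on GPU 0:",
                 nvmlDeviceGetComputeRunningProcesses(handle))
    nvmlShutdown()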
def get_gpu_pid_info():
    """Retrieves per-GPU information and the processes running on each GPU."""
    gpus = []
    device_count = -1
    try:
        nvmlInit()
        device_count = nvmlDeviceGetCount()
        gpus = [{} for _ in range(device_count)]  # avoid aliasing one dict N times
        for i in range(device_count):
            gpus[i] = {'id': i}
            handle = nvmlDeviceGetHandleByIndex(i)
            device_name = nvmlDeviceGetName(handle)
            gpus[i]['name'] = device_name
            try:
                util = nvmlDeviceGetUtilizationRates(handle)
                gpus[i]['utilization'] = util.gpu
            except NVMLError as err:
                print(f'Error while reading GPU utilization for GPU {i}: {err}', file=sys.stderr)
            try:
                mem_info = nvmlDeviceGetMemoryInfo(handle)
                gpus[i]['mem_total'] = mem_info.total
                gpus[i]['mem_used'] = mem_info.used
            except NVMLError as err:
                print(f'Error while reading memory utilization for GPU {i}: {err}', file=sys.stderr)
            try:
                fan_speed = nvmlDeviceGetFanSpeed(handle)
                gpus[i]['fan_speed'] = fan_speed
            except NVMLError as err:
                print(f'Error while reading fan speed for GPU {i}: {err}', file=sys.stderr)
            try:
                temp = nvmlDeviceGetTemperature(handle, 0)  # 0 == NVML_TEMPERATURE_GPU
                gpus[i]['temp'] = temp
            except NVMLError as err:
                print(f'Error while reading temperature for GPU {i}: {err}', file=sys.stderr)
            try:
                # NVML reports power in milliwatts; convert to watts.
                power_usage = nvmlDeviceGetPowerUsage(handle)
                gpus[i]['power_usage'] = round(power_usage / 1000.)
            except NVMLError as err:
                print(f'Error while reading power usage for GPU {i}: {err}', file=sys.stderr)
            try:
                power_limit = nvmlDeviceGetEnforcedPowerLimit(handle)
                gpus[i]['power_limit'] = round(power_limit / 1000.)
            except NVMLError as err:
                print(f'Error while reading power limit for GPU {i}: {err}', file=sys.stderr)
            gpus[i]['processes'] = []
            try:
                processes = nvmlDeviceGetComputeRunningProcesses(handle)
                for process in processes:
                    process_name = nvmlSystemGetProcessName(process.pid).decode()
                    gpus[i]['processes'].append({'pid': process.pid, 'name': process_name})
            except NVMLError as err:
                print(f'Error while reading processes for GPU {i}: {err}', file=sys.stderr)
    except NVMLError as err:
        print(f'Error while reading GPU information: {err}', file=sys.stderr)
    finally:
        # Only shut down if NVML was actually initialized; otherwise this would raise.
        try:
            nvmlShutdown()
        except NVMLError:
            pass
    return gpus, device_count
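# Hedged usage sketch: iterate over the result of get_gpu_pid_info(). The key
# names mirror the dict keys built above; a GPU whose reading failed simply
# lacks that key, hence the .get() calls.
if __name__ == "__main__":
    gpus, count = get_gpu_pid_info()
    for gpu in gpus:
        print(f"GPU {gpu['id']} ({gpu.get('name')}): "
              f"util={gpu.get('utilization')}%, "
              f"mem={gpu.get('mem_used')}/{gpu.get('mem_total')} bytes, "
              f"pids={[p['pid'] for p in gpu.get('processes', [])]}")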
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    gpus_in_use = 0
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except pynvml.NVMLError:
        deviceCount = 0
    for device_id in range(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization GPU/Memory info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            # Count a GPU as "in use" when its memory utilization exceeds 50%.
            gpus_in_use += 1 if util_rate.memory > 50.0 else 0
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
        # utilization Encoder info
        try:
            util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
            self.log.debug('nvml.util.encoder %s' % int(util_encoder[0]))
            self.gauge('nvml.util.encoder', int(util_encoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
        # utilization Decoder info
        try:
            util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
            self.log.debug('nvml.util.decoder %s' % int(util_decoder[0]))
            self.gauge('nvml.util.decoder', int(util_decoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
        # Compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            self.gauge('nvml.process.count', len(cps), d_tags)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['pname'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                p_tags['puser'] = self.get_process_owner(ps.pid)
                docker_name, docker_image = self.get_container_name(ps.pid)
                p_tags['docker_image'] = docker_image
                p_tags['docker_name'] = docker_name
                p_tags = self._dict2list(p_tags)
                print(p_tags)
                self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
    self.gauge('nvml.gpus_in_use_count', gpus_in_use)
    if msg_list:
        status = AgentCheck.CRITICAL
        msg = u','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = u'Ok'
    pynvml.nvmlShutdown()
    self.service_check('nvml.check', status, message=msg)
def test_nvmlSystemGetProcessName(nvml):
    procname = pynvml.nvmlSystemGetProcessName(os.getpid())
    print("[Process: " + str(procname.decode()) + "]", end=" ")
    assert procname is not None
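# The test above takes an `nvml` pytest fixture; a minimal conftest.py sketch is
# shown here, assuming the fixture only needs to initialize and tear down NVML
# (the real fixture in the source project may do more).
import pytest
import pynvml

@pytest.fixture
def nvml():
    pynvml.nvmlInit()
    yield
    pynvml.nvmlShutdown()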
def info_refresh(self):
    try:
        stat = open("/proc/stat")
        self.statlines = stat.read().splitlines()[1:-1]
        stat.close()
    except IOError:
        print("Problem opening /proc/stat, exiting..")
        pynvml.nvmlShutdown()
        quit()

    # Accumulate total and idle jiffies per CPU core.
    for i in range(self.corecount):
        for j in self.statlines[i].split()[1:]:  # remove cpu#
            self.total[i] += int(j)
        self.idle[i] = int(self.statlines[i].split()[4])

    for i in range(self.corecount):
        if (self.total[i] - self.prev_total[i]) == 0:
            self.prev_idle[i] = self.idle[i]
            self.prev_total[i] = self.total[i]
            break
        self.cpu_prog_bars[i].set_fraction(
            1 - ((self.idle[i] - self.prev_idle[i]) / (self.total[i] - self.prev_total[i])))
        self.prev_idle[i] = self.idle[i]
        self.prev_total[i] = self.total[i]
        self.idle[i] = 0
        self.total[i] = 0

    for i in range(self.deviceCount):
        util = pynvml.nvmlDeviceGetUtilizationRates(self.gpu_handles[i])
        temp = pynvml.nvmlDeviceGetTemperature(self.gpu_handles[i], pynvml.NVML_TEMPERATURE_GPU)
        memInfo = pynvml.nvmlDeviceGetMemoryInfo(self.gpu_handles[i])
        (encoder_util, sPeriod) = pynvml.nvmlDeviceGetEncoderUtilization(self.gpu_handles[i])
        (decoder_util, sPeriod) = pynvml.nvmlDeviceGetDecoderUtilization(self.gpu_handles[i])

        mem_total = memInfo.total / 1024 / 1024
        mem_used = memInfo.used / 1024 / 1024

        self.gpu_prog_bars[i*6].set_text("GPU: %d%%" % util.gpu)
        self.gpu_prog_bars[i*6].set_fraction(util.gpu / 100)
        ########
        self.util_history.append(util.gpu)
        self.util_graph.queue_draw()

        self.temp_history.append(temp)
        self.temp_graph.queue_draw()
        ########
        self.gpu_prog_bars[i*6 + 1].set_text("Memory Utilization: %d%%" % util.memory)
        self.gpu_prog_bars[i*6 + 1].set_fraction(util.memory / 100)

        self.gpu_prog_bars[i*6 + 4].set_text("Encoder: %d%%" % encoder_util)
        self.gpu_prog_bars[i*6 + 5].set_text("Decoder: %d%%" % decoder_util)
        self.gpu_prog_bars[i*6 + 4].set_fraction(encoder_util / 100)
        self.gpu_prog_bars[i*6 + 5].set_fraction(decoder_util / 100)

        self.gpu_prog_bars[i*6 + 2].set_text("Memory Usage: %d MiB/%d MiB" % (mem_used, mem_total))
        self.gpu_prog_bars[i*6 + 2].set_fraction(mem_used / mem_total)

        self.gpu_prog_bars[i*6 + 3].set_text("Temperature: %d °C" % temp)
        # Clamp temperature to [0, 100] before using it as a progress fraction.
        if temp > 100:
            temp = 100
        elif temp < 0:
            temp = 0
        self.gpu_prog_bars[i*6 + 3].set_fraction(temp / 100)

    # --proc--
    procs = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.gpu_handles[0])
    proc_liststore = Gtk.ListStore(int, str, int)

    for p in procs:
        pid = p.pid
        try:
            path = pynvml.nvmlSystemGetProcessName(p.pid).decode('utf-8')
        except pynvml.NVMLError:
            self.exit()
        if p.usedGpuMemory is None:
            mem = 0
        else:
            mem = p.usedGpuMemory // 1024 // 1024  # integer MiB for the int ListStore column
        proc_liststore.append([pid, path, mem])
    self.tree.set_model(proc_liststore)
    return True
card_temp = ' ' + str(
    pml.nvmlDeviceGetTemperature(handle, pml.NVML_TEMPERATURE_GPU)) + 'C'
show_str_lst.append(card_temp)
# utilization
card_util_ratio = ' {:>3}'.format(
    pml.nvmlDeviceGetUtilizationRates(handle).gpu) + '%'
show_str_lst.append(card_util_ratio)
# per-process usage
p_str = ''
procs = pml.nvmlDeviceGetComputeRunningProcesses(handle)
for j, p in enumerate(procs):
    # pid = ' ' + str(p.pid) + ' '
    pid = '{:<7}'.format(p.pid)
    p_name = bytes.decode(pml.nvmlSystemGetProcessName(p.pid))
    p_name = ' {:<10} '.format(p_name)
    p_mem_used = ' ' + str(p.usedGpuMemory // mega) + 'M'
    pc = psutil.Process(p.pid)  # owner of this GPU process
    p_user = '******'.format(pc.username())
    p_str = ' ' + pid + p_name + p_user + p_mem_used
    if j == 0:
        show_str_lst.append(p_str)
    else:
        show_str_lst.append('\n' + ' ' * 31 + p_str)
t_t = ' '.join(show_str_lst)
if i == 0:
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except pynvml.NVMLError:
        deviceCount = 0
    # Number of active GPUs
    self.gauge('nvml.gpus.number', deviceCount)
    for device_id in range(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp.', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetTemperature:{}'.format(err))
        # power info
        try:
            pwr = pynvml.nvmlDeviceGetPowerUsage(handle) // 1000
            self.gauge('nvml.power.', pwr, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetPowerUsage:{}'.format(err))
        # fan info
        try:
            fan = pynvml.nvmlDeviceGetFanSpeed(handle)
            self.gauge('nvml.fan.', fan, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetFanSpeed:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization GPU/Memory info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetUtilizationRates:{}'.format(err))
        # utilization Encoder info
        try:
            util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
            self.log.debug('nvml.util.encoder %s' % int(util_encoder[0]))
            self.gauge('nvml.util.encoder', int(util_encoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetEncoderUtilization:{}'.format(err))
        # utilization Decoder info
        try:
            util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
            self.log.debug('nvml.util.decoder %s' % int(util_decoder[0]))
            self.gauge('nvml.util.decoder', int(util_decoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetDecoderUtilization:{}'.format(err))
        # Compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                p_tags = self._dict2list(p_tags)
                self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory,
                           tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        # Clocks throttling info
        # Divide by the mask so that the value is either 0 or 1 per GPU
        try:
            throttle_reasons = (
                pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(handle))
            self.gauge('nvml.throttle.appsettings',
                       (throttle_reasons &
                        pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting) /
                       pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting,
                       tags=d_tags)
            self.gauge('nvml.throttle.display',
                       (throttle_reasons & GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS) /
                       GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS,
                       tags=d_tags)
            self.gauge('nvml.throttle.hardware',
                       (throttle_reasons & pynvml.nvmlClocksThrottleReasonHwSlowdown) /
                       pynvml.nvmlClocksThrottleReasonHwSlowdown,
                       tags=d_tags)
            self.gauge('nvml.throttle.idle',
                       (throttle_reasons & pynvml.nvmlClocksThrottleReasonGpuIdle) /
                       pynvml.nvmlClocksThrottleReasonGpuIdle,
                       tags=d_tags)
            self.gauge('nvml.throttle.power.hardware',
                       (throttle_reasons & GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE) /
                       GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE,
                       tags=d_tags)
            self.gauge('nvml.throttle.power.software',
                       (throttle_reasons & pynvml.nvmlClocksThrottleReasonSwPowerCap) /
                       pynvml.nvmlClocksThrottleReasonSwPowerCap,
                       tags=d_tags)
            self.gauge('nvml.throttle.syncboost',
                       (throttle_reasons & GPU_THROTTLE_SYNCBOOST) /
                       GPU_THROTTLE_SYNCBOOST,
                       tags=d_tags)
            self.gauge('nvml.throttle.temp.hardware',
                       (throttle_reasons & GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE) /
                       GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE,
                       tags=d_tags)
            self.gauge('nvml.throttle.temp.software',
                       (throttle_reasons & GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE) /
                       GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE,
                       tags=d_tags)
            self.gauge('nvml.throttle.unknown',
                       (throttle_reasons & pynvml.nvmlClocksThrottleReasonUnknown) /
                       pynvml.nvmlClocksThrottleReasonUnknown,
                       tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(
                'nvmlDeviceGetCurrentClocksThrottleReasons:{}'.format(err))
    if msg_list:
        status = AgentCheck.CRITICAL
        msg = ','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = 'Ok'
    pynvml.nvmlShutdown()
    self.service_check('nvml.check', status, message=msg)
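# The throttle gauges above turn NVML's combined throttle-reason bitmask into a
# per-reason 0/1 value by masking and then dividing by the mask. A standalone
# illustration of the trick; the mask and reason values below are made up for
# the example, not real NVML constants.
reasons = 0b101          # pretend NVML reported two active throttle reasons
HW_SLOWDOWN = 0b100      # hypothetical mask for one reason
print((reasons & HW_SLOWDOWN) / HW_SLOWDOWN)  # -> 1.0 (reason active)
print((reasons & 0b010) / 0b010)              # -> 0.0 (reason not active)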
        pid, p.memory_full_info().pss / 1024. / 1024. / 1024.))
################# inspect process resource usage #################################

################# inspect GPU resource usage #####################################
pynvml.nvmlInit()
deviceCount = pynvml.nvmlDeviceGetCount()
for i in range(deviceCount):  # i is the GPU index (id)
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    print("GPU %d name %s" % (i, pynvml.nvmlDeviceGetName(handle)))
    print("GPU %d Driver %s" % (i, pynvml.nvmlSystemGetDriverVersion()))  # driver info
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print("GPU %d mem info total : %.3f GByte" % (i, meminfo.total / 1024. / 1024. / 1024.))
    print("GPU %d mem info used : %.3f MByte" % (i, meminfo.used / 1024. / 1024.))
    print("GPU %d mem info free : %.3f MByte" % (i, meminfo.free / 1024. / 1024.))
    print("Temperature is %d℃" % pynvml.nvmlDeviceGetTemperature(handle, 0))  # 0 == NVML_TEMPERATURE_GPU
    print("Fan speed is %d%%" % pynvml.nvmlDeviceGetFanSpeed(handle))
    print("Power status P%d" % pynvml.nvmlDeviceGetPowerState(handle))
    print("Power status %.1fW" % (pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0))
    print("nvmlSystemGetProcessName is %s" % pynvml.nvmlSystemGetProcessName(pid))  # e.g. /usr/bin/python
    # print("nvmlSystemGetProcessName is %s" % pynvml.nvmlDeviceGetAccountingStats(handle, pid))
# finally, shut down the management library
pynvml.nvmlShutdown()
################# inspect GPU resource usage #####################################
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except pynvml.NVMLError:
        deviceCount = 0
    for device_id in range(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp.', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
        # power info
        try:
            pwr = pynvml.nvmlDeviceGetPowerUsage(handle) // 1000
            self.gauge('nvml.power.', pwr, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetPowerUsage:{}'.format(err))
        # fan info
        try:
            fan = pynvml.nvmlDeviceGetFanSpeed(handle)
            self.gauge('nvml.fan.', fan, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetFanSpeed:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization GPU/Memory info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
        # utilization Encoder info
        try:
            util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
            self.log.debug('nvml.util.encoder %s' % int(util_encoder[0]))
            self.gauge('nvml.util.encoder', int(util_encoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
        # utilization Decoder info
        try:
            util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
            self.log.debug('nvml.util.decoder %s' % int(util_decoder[0]))
            self.gauge('nvml.util.decoder', int(util_decoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
        # Compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                p_tags = self._dict2list(p_tags)
                self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory,
                           tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
    if msg_list:
        status = AgentCheck.CRITICAL
        msg = u','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = u'Ok'
    pynvml.nvmlShutdown()
    self.service_check('nvml.check', status, message=msg)
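# All three check() snippets above call self._dict2list(tags) before passing the
# tags to self.gauge(), but the helper itself is not shown anywhere in this
# section. A plausible sketch of that method, assuming it simply flattens a dict
# into Datadog-style "key:value" tag strings:
def _dict2list(self, tags):
    return ["{}:{}".format(k, v) for k, v in tags.items()]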