def select_gpu():
    """Find the index of the GPU with the least used memory.

    Returns:
        Index of the GPU with the least used memory, as a string.
    """
    import pynvml
    pynvml.nvmlInit()
    gpu_count = pynvml.nvmlDeviceGetCount()   # number of GPUs
    gpu_devices = list(range(gpu_count))      # GPU indices
    # Select the GPU with the least used memory; start from 24 GiB as an upper bound.
    min_used = 24 * 1024 * 1024 * 1024
    gpu_selected = gpu_devices[0]
    for i in range(len(gpu_devices)):
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_devices[i])
        meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
        if meminfo.used <= min_used:
            min_used = meminfo.used
            gpu_selected = gpu_devices[i]
    return str(gpu_selected)
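# Minimal usage sketch (not part of the original snippet): pin the process to the
# GPU chosen by select_gpu() by setting CUDA_VISIBLE_DEVICES before any CUDA
# context is created. Frameworks imported afterwards (e.g. torch, tensorflow)
# will then see the selected card as device 0. Assumes pynvml (nvidia-ml-py) is
# installed and at least one NVIDIA GPU is visible to the driver.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = select_gpu()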
def get_statistics():
    """Get statistics for each GPU installed in the system."""
    nvmlInit()
    statistics = []
    try:
        count = nvmlDeviceGetCount()
        for i in range(count):
            handle = nvmlDeviceGetHandleByIndex(i)
            # Note: nvmlDeviceGetMemoryInfo reports sizes in bytes.
            memory = nvmlDeviceGetMemoryInfo(handle)
            statistics.append({
                "gpu": i,
                "name": nvmlDeviceGetName(handle).decode("utf-8"),
                "memory": {
                    "total": _convert_kb_to_gb(int(memory.total)),
                    "used": _convert_kb_to_gb(int(memory.used)),
                    "utilisation": int(memory.used / memory.total * 100)
                },
            })
    except NVMLError as error:
        print(error)
    return statistics
def delay4gpus(delay, gpu_list):
    if isinstance(delay, bool):
        if delay:
            import pynvml
            import time
            pynvml.nvmlInit()
            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_list[0])
            # Wait until the first GPU in the list has less than 20% of its memory in use.
            while True:
                memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
                usage = memory.used / memory.total
                if usage < 0.2:
                    break
                else:
                    print('GPU-%d is in use %.2f, still waiting' % (gpu_list[0], usage))
                    time.sleep(60)
    elif isinstance(delay, (int, float)):
        import time
        from tqdm import tqdm
        delay = int(delay)
        # Wait a fixed number of minutes, showing a progress bar.
        for minute in tqdm(range(delay), desc='Wait:', leave=False, smoothing=0.1):
            time.sleep(60)
    else:
        raise NotImplementedError('Unsupported delay type')
def _get_gpu(self, update: Update, context: CallbackContext): print(update.message.from_user.username, "requested gpu usage") pynvml.nvmlInit() handle = pynvml.nvmlDeviceGetHandleByIndex(0) info = pynvml.nvmlDeviceGetMemoryInfo(handle) update.message.reply_text(get_usage_msg(info), parse_mode=telegram.ParseMode.MARKDOWN)
def real_time(): return { "utilization": [pynvml.nvmlDeviceGetUtilizationRates(h).gpu for h in handles], "memory-used": [pynvml.nvmlDeviceGetMemoryInfo(h).used for h in handles], }
def gpu_info(self): # pip install nvidia-ml-py3 if len(self.gpu_ids) >= 0 and torch.cuda.is_available(): try: import pynvml pynvml.nvmlInit() self.config_dic[ 'gpu_driver_version'] = pynvml.nvmlSystemGetDriverVersion( ) for gpu_id in self.gpu_ids: handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id) gpu_id_name = "gpu%s" % gpu_id mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) gpu_utilize = pynvml.nvmlDeviceGetUtilizationRates(handle) self.config_dic['%s_device_name' % gpu_id_name] = pynvml.nvmlDeviceGetName( handle) self.config_dic['%s_mem_total' % gpu_id_name] = gpu_mem_total = round( mem_info.total / 1024**3, 2) self.config_dic['%s_mem_used' % gpu_id_name] = gpu_mem_used = round( mem_info.used / 1024**3, 2) # self.config_dic['%s_mem_free' % gpu_id_name] = gpu_mem_free = mem_info.free // 1024 ** 2 self.config_dic['%s_mem_percent' % gpu_id_name] = round( (gpu_mem_used / gpu_mem_total) * 100, 1) self._set_dict_smooth('%s_utilize_gpu' % gpu_id_name, gpu_utilize.gpu, 0.8) # self.config_dic['%s_utilize_gpu' % gpu_id_name] = gpu_utilize.gpu # self.config_dic['%s_utilize_memory' % gpu_id_name] = gpu_utilize.memory pynvml.nvmlShutdown() except Exception as e: print(e)
def print_ram_info(self):
    gpu_total = gpu_free = cpu_free = gc_free = 0
    self._ph()
    try:
        gc_free = gc.collect()
        torch.cuda.empty_cache()  # @UndefinedVariable
        val = psutil.virtual_memory()._asdict()
        cpu_free = round((val["available"] / (1024**3)), 2)
        self._pp("Free CPU RAM", str(cpu_free) + " GB")
        #
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        gpu_free = round(info.free / (1024**3), 2)
        self._pp("Free GPU RAM", str(gpu_free) + " GB")
        #
        gpu_total = round(info.total / (1024**3), 2)
        self._pp("Total GPU RAM", str(gpu_total) + " GB")
        self._pp("Garbage Collection", gc_free)
    except:
        self._pp("**Error", "NO GPU accelerator")
        self._pp(
            "Suggest recovery",
            "Menu > Runtime > Change Runtime Type > {select} GPU accelerator"
        )
    self._ph()
    return
def autoselect(gpu_target: List[int], min_memory: float) -> int: logging.info(f'GPU search space: {gpu_target}') nvmlInit() deviceCount = nvmlDeviceGetCount() memories = np.zeros((deviceCount, COUNT), dtype=np.float32) rates = np.zeros((deviceCount, COUNT), dtype=np.float32) for c in range(COUNT): for i in range(deviceCount): if i not in gpu_target: memories[i, c] = 0 rates[i, c] = 100 else: handle = nvmlDeviceGetHandleByIndex(i) memories[ i, c] = nvmlDeviceGetMemoryInfo(handle).free / 1024**3 rates[i, c] = int(nvmlDeviceGetUtilizationRates(handle).gpu) time.sleep(INTERVAL) nvmlShutdown() memories = memories.mean(1) rates = rates.mean(1) # enough memory GPU ids memory_enough_ids = np.where(memories > min_memory)[0] if len(memory_enough_ids) > 0: # min util GPU gpuid = memory_enough_ids[np.argmin(rates[memory_enough_ids])] # if multi GPUs' util are the same, choose one that has the most memory gpu_min_ids = np.where(rates[memory_enough_ids] <= rates[gpuid])[0] gpu_min_ids = memory_enough_ids[gpu_min_ids] gpuid = gpu_min_ids[np.argmax(memories[gpu_min_ids])] logging.info(f'Auto select GPU {gpuid}') else: raise MemoryError(str(memories)) return int(gpuid)
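# Minimal usage sketch (not from the original module): pick a GPU with at least
# 4 GiB of free memory from devices 0-3 and make it the current torch device.
# Assumes the module-level COUNT and INTERVAL sampling constants that
# autoselect() relies on are defined, and that torch with CUDA is available.
import torch

gpu_id = autoselect(gpu_target=[0, 1, 2, 3], min_memory=4.0)
torch.cuda.set_device(gpu_id)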
def prepare_net(net, use_gpu=True): handle = None device = 'cpu' if not use_gpu: print('Running on CPUs') return net, device, handle if torch.cuda.is_available(): device = 'cuda' if device != 'cpu': import pynvml import torch.backends.cudnn as cudnn print('Running on GPU') net = net.to(device) # net = torch.nn.DataParallel(net) cudnn.benchmark = True pynvml.nvmlInit() handle = pynvml.nvmlDeviceGetHandleByIndex(0) mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) device_name = pynvml.nvmlDeviceGetName(handle).decode("utf-8") print("CUDA Device: {} | RAM: {:.4g}G".format( device_name, mem_info.total / (2**30))) else: print('No CUDA devices available, run on CPUs') return net, device, handle
def gpu_mem_used_get():
    """Query nvidia for the used memory of the currently selected torch device, in MBs
    (rounded down). Clears the pytorch cache before taking the measurement."""
    torch.cuda.empty_cache()  # clear cache to report the correct data
    id = torch.cuda.current_device()
    handle = pynvml.nvmlDeviceGetHandleByIndex(id)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return int(info.used / 2**20)
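# Minimal usage sketch (assumption, not from the original source): measure how much
# device memory a tensor allocation adds. Assumes pynvml has been initialised
# elsewhere (pynvml.nvmlInit()) and a CUDA device is available. The delta is
# approximate because the caching allocator reserves memory in blocks.
import torch

before = gpu_mem_used_get()
x = torch.empty(1024, 1024, device="cuda")  # ~4 MB of float32
after = gpu_mem_used_get()
print("allocation grew device memory by ~%d MB" % (after - before))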
def get_mem(device_handle): """Get GPU device memory consumption in percent.""" try: memory_info = pynvml.nvmlDeviceGetMemoryInfo(device_handle) return memory_info.used * 100.0 / memory_info.total except pynvml.NVMLError: return None
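# Minimal usage sketch (assumption): report memory consumption in percent for every
# visible GPU via get_mem(). Assumes pynvml is importable and NVML can be initialised.
import pynvml

pynvml.nvmlInit()
try:
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        used = get_mem(handle)
        if used is not None:
            print("GPU %d: %.1f%% memory used" % (i, used))
finally:
    pynvml.nvmlShutdown()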
def get_gpu_memory(gpu_idx): try: handle = nv.nvmlDeviceGetHandleByIndex(gpu_idx) mem = nv.nvmlDeviceGetMemoryInfo(handle) except nv.NVMLError as err: mem = err return mem
def real_time(): init_once() h = _pynvml_handles() return { "utilization": pynvml.nvmlDeviceGetUtilizationRates(h).gpu, "memory-used": pynvml.nvmlDeviceGetMemoryInfo(h).used, }
def get(index): try: handle = pynvml.nvmlDeviceGetHandleByIndex(index) except pynvml.NVMLError_GpuIsLost: return None memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle) return dict( nvmlDeviceGetName=pynvml.nvmlDeviceGetName(handle).decode('utf-8'), nvmlDeviceGetMemoryInfo=dict( total=memory_info.total, free=memory_info.free, used=memory_info.used, ), nvmlDeviceGetUtilizationRates=get_utilization_rates(handle), nvmlDeviceGetFanSpeed=get_fan_speed(handle), nvmlDeviceGetTemperature=pynvml.nvmlDeviceGetTemperature( handle, pynvml.NVML_TEMPERATURE_GPU), nvmlDeviceGetTemperatureThreshold=dict( slowdown=pynvml.nvmlDeviceGetTemperatureThreshold( handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SLOWDOWN), shutdown=pynvml.nvmlDeviceGetTemperatureThreshold( handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SHUTDOWN), ), nvmlDeviceGetPowerManagementLimit=pynvml. nvmlDeviceGetPowerManagementLimit(handle), nvmlDeviceGetPowerUsage=pynvml.nvmlDeviceGetPowerUsage(handle), )
def getGPUstate():
    """
    pip install nvidia-ml-py3
    :return: a report string with one line per installed GPU
    """
    meminfo = {}
    infoStr = ""
    try:
        pynvml.nvmlInit()
        devicecount = pynvml.nvmlDeviceGetCount()
        for num in range(devicecount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(num)
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            meminfo[num] = "Device: {} , {} / {} {:.2f}%, free memory:{}".format(
                num, info.used, info.total, info.used / info.total * 100, info.free)
        for i in range(len(meminfo)):
            infoStr += meminfo[i] + "\n"
        # mainlog(infoStr, 'info')
        return infoStr
    except Exception as e:
        # mainlog(e, 'error')
        # print("error happen in getGPUstate:" + str(e))
        return "Error:" + str(e)
def get_device_total_memory(index=0): """ Return total memory of CUDA device with index """ pynvml.nvmlInit() return pynvml.nvmlDeviceGetMemoryInfo( pynvml.nvmlDeviceGetHandleByIndex(index)).total
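# Minimal usage sketch (assumption): nvmlDeviceGetMemoryInfo reports bytes, so the
# value returned by get_device_total_memory() can be converted to GiB for display.
total_bytes = get_device_total_memory(index=0)
print("GPU 0 total memory: %.2f GiB" % (total_bytes / 1024**3))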
def avg_gpu_info(measure_duration, print_info=False):
    """
    Input:
        measure_duration: int, number of 1-second samples to average over
    Output:
        avg_free_memory: numpy.array[int], len=gpu_count, in MB
        avg_gpu_util: numpy.array[int], len=gpu_count, in percent
    """
    # Get average gpu status
    pynvml.nvmlInit()  # initialize NVML
    gpu_count = pynvml.nvmlDeviceGetCount()
    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(gpu_count)]
    avg_free_memory = [0.0] * gpu_count
    avg_gpu_util = [0.0] * gpu_count
    for _ in range(int(measure_duration)):
        for id, handle in enumerate(handles):
            avg_free_memory[id] = avg_free_memory[id] + \
                pynvml.nvmlDeviceGetMemoryInfo(handle).free / 1e6  # bytes -> MB
            avg_gpu_util[id] = avg_gpu_util[id] + \
                pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
        time.sleep(1)
    avg_free_memory = np.array(
        [int(memory / measure_duration) for memory in avg_free_memory])
    avg_gpu_util = np.array(
        [int(util / measure_duration) for util in avg_gpu_util])
    if print_info:
        present_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        logging.info(present_time)
        for gpu_id in range(gpu_count):
            gpu_info = 'GPU%d: gpu util:%d%% | free memory:%dMB' % (
                gpu_id, avg_gpu_util[gpu_id], avg_free_memory[gpu_id])
            logging.info(gpu_info)
    return avg_free_memory, avg_gpu_util
def watch_gpu(k):
    import os
    import pynvml
    import time
    import torch
    pynvml.nvmlInit()
    gpu_num = [0, 1, 2, 3, 4, 5, 6, 7]
    while True:
        for i in gpu_num:
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            free = meminfo.free / 1024**2  # bytes -> MiB
            if free >= 2000:
                print("GPU %s has free memory (MiB): " % i, free)
                os.environ['CUDA_VISIBLE_DEVICES'] = str(i)
                # a = torch.rand([1,3,500,500])
                from models.common import GPUModel
                model = GPUModel(120 * k)
                model.cuda()
                print_here = True
                while True:
                    if print_here:
                        print("done")
                        print_here = False
                    time.sleep(1)
def gpus_available() -> dict: try: nvmlInit() gpus = {} visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', None) if visible_devices is None: visible_devices = list(range(nvmlDeviceGetCount())) else: visible_devices = {int(x.strip()) for x in visible_devices.split(',')} for i, real_id in enumerate(visible_devices): h = nvmlDeviceGetHandleByIndex(real_id) info = nvmlDeviceGetMemoryInfo(h) total = info.total free = info.free ratio = free / total gpus[i] = ratio # print(f'total : {info.total}') # print(f'free : {info.free}') # print(f'used : {info.used}') # t = torch.cuda.get_device_properties(0).total_memory # c = torch.cuda.memory_cached(0) # a = torch.cuda.memory_allocated(0) # print(t, c, a) nvmlShutdown() return dict(sorted(gpus.items(), key=lambda x: x[1], reverse=True)) except Exception as e: logger.debug(f'Failed to get gpu info due to {e}') return {}
def get_available_device(args=[], init=True): """Convenience function that gets available GPU units and returns a string on the pattern f"/GPU:{i}" telling the index of the one currently using the lowest memory. Also sets the environment variable 'CUDA_VISIBLE_DEVICES' to f'{i}'. If there is an NVMLError in the attempt to get this information, `i` defaults to a pre-selected unit. If `args` has len > 1, the second argument is the integer index of the GPU. """ #os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID' if init: nv.nvmlInit() if len(args) <= 1: try: devices = map(nv.nvmlDeviceGetHandleByIndex, range(nv.nvmlDeviceGetCount())) #ind,device devices_enum = sorted( enumerate(devices), key=lambda d: nv.nvmlDeviceGetMemoryInfo(d[1]).free, reverse=True) ind = devices_enum[0][0] except nv.NVMLError as e: print(e) print("> > > error occurred: defaulting to gpu3") ind = 3 else: ind = args[1] print('*\t*\t*\t*\t*\t*\t*\tget_available_device(): using device', ind) os.environ['CUDA_VISIBLE_DEVICES'] = f'{ind}' return '/GPU:%i' % ind
def auto_select_gpu(): """Select gpu which has largest free memory""" if HAS_NVML: pynvml.nvmlInit() deviceCount = pynvml.nvmlDeviceGetCount() largest_free_mem = 0 largest_free_idx = 0 for i in range(deviceCount): handle = pynvml.nvmlDeviceGetHandleByIndex(i) info = pynvml.nvmlDeviceGetMemoryInfo(handle) if info.free > largest_free_mem: largest_free_mem = info.free largest_free_idx = i pynvml.nvmlShutdown() largest_free_mem = largest_free_mem / 1024. / 1024. # Convert to MB idx_to_gpu_id = {} for i in range(deviceCount): idx_to_gpu_id[i] = '{}'.format(i) gpu_id = idx_to_gpu_id[largest_free_idx] logging.info( 'Using largest free memory GPU {} with free memory {}MB'.format( gpu_id, largest_free_mem)) return gpu_id else: logging.info( 'nvidia-ml-py is not installed, automatically select gpu is disabled!' ) return '0'
def _get_vram(self): """ Obtain the total VRAM in Megabytes for each connected GPU. Returns ------- list List of floats containing the total amount of VRAM in Megabytes for each connected GPU as corresponding to the values in :attr:`_handles """ self._initialize() if self._device_count == 0: vram = list() elif self._is_plaidml: vram = self._plaid.vram elif IS_MACOS: vram = [ pynvx.cudaGetMemTotal(handle, ignore=True) / (1024 * 1024) for handle in self._handles ] else: vram = [ pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024 * 1024) for handle in self._handles ] self._log("debug", "GPU VRAM: {}".format(vram)) return vram
def auto_select_gpu(): """Select gpu which has largest free memory""" if HAS_NVML: pynvml.nvmlInit() deviceCount = pynvml.nvmlDeviceGetCount() largest_free_mem = 0 largest_free_idx = 0 for i in range(deviceCount): handle = pynvml.nvmlDeviceGetHandleByIndex(i) info = pynvml.nvmlDeviceGetMemoryInfo(handle) if info.free > largest_free_mem: largest_free_mem = info.free largest_free_idx = i pynvml.nvmlShutdown() largest_free_mem = largest_free_mem / 1024. / 1024. # Convert to MB idx_to_gpu_id = {} for i in range(deviceCount): idx_to_gpu_id[i] = '{}'.format(i) gpu_id = idx_to_gpu_id[largest_free_idx] logging.info('Using largest free memory GPU {} with free memory {}MB'.format(gpu_id, largest_free_mem)) return gpu_id else: logging.info('nvidia-ml-py is not installed, automatically select gpu is disabled!') return '0'
def one_time(): init_once() h = _pynvml_handles() return { "memory-total": pynvml.nvmlDeviceGetMemoryInfo(h).total, "name": pynvml.nvmlDeviceGetName(h).decode(), }
def setVisibleGpu(self):
    '''Set the visible GPU indices.'''
    num_gpu = self.opt.BASE.NUM_GPUS
    gpu_list = [str(i) for i in self.opt.BASE.GPU_ID]
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(gpu_list[:num_gpu])
    '''Check how much of the first selected GPU is already in use.'''
    import pynvml
    pynvml.nvmlInit()
    # Handle for the first GPU in the selected list
    handle = pynvml.nvmlDeviceGetHandleByIndex(int(gpu_list[0]))
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    total = meminfo.total  # total memory of the selected GPU, in bytes
    used = meminfo.used    # also bytes; divide by 1024**2 to get MiB
    ratio = used / total
    if ratio > 0.5:
        flag = True
        while flag == True:
            ans = input(
                "More than 50% resource has been occupied on GPU{0}, are you sure to continue?(y/n)"
                .format(str(gpu_list[0])))
            if ans == 'n':
                exit(0)
            elif ans == 'y':
                flag = False
def log_gpu_memory():
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    # Avoid mutating the NVML struct; keep the rounded MiB values in locals instead.
    free_mb = round(info.free / 1024**2)
    used_mb = round(info.used / 1024**2)
    print('GPU memory free: {}, memory used: {}'.format(free_mb, used_mb))
    return used_mb
def seeGmemorys(gpu_ids, tag=None): global count global old_Mb Mb = [] pynvml.nvmlInit() for gpu_id in gpu_ids: handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id) meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle) m = meminfo.used / 1024 / 1024 Mb.append(m) ## B --> MB if (np.array(Mb) < 10000).any(): SendMail(_subject='{} stop'.format(tag), _content='GPU free') return True # elif (np.array(Mb) > np.array(old_Mb)).any(): # SendMail(_subject = '{} increase'.format(tag), _content = 'GPU increase') # return False else: count += 1 old_Mb = Mb str = ', '.join([ 'id = {}, memory = {} Mb'.format(item[0], item[1]) for _, item in enumerate(zip(gpu_ids, Mb)) ]) print('spy {} times. {}'.format(count, str)) return False
def _select_device(gpu): import os from numpy import argmax logger = getLogger("clinicadl") if not gpu: return "cpu" else: # TODO: Add option gpu_device (user chooses the gpu) # How to perform multi-GPU ? try: # In this case, the GPU seen by cuda are restricted and we let cuda choose _ = os.environ["CUDA_VISIBLE_DEVICES"] return "cuda" except KeyError: # Else we choose ourselves the GPU with the greatest amount of memory from pynvml import ( nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlInit, ) nvmlInit() memory_list = [ nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i)).free for i in range(torch.cuda.device_count()) ] free_gpu = argmax(memory_list) return f"cuda:{free_gpu}"
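# Minimal usage sketch (assumption, not from the original module): _select_device()
# returns a string such as "cpu" or "cuda:1" that can be passed straight to
# torch.device. Assumes torch is available in the calling scope.
import torch

device = torch.device(_select_device(gpu=True))
model = torch.nn.Linear(8, 2).to(device)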
def track(self): """ Track the GPU memory usage """ pynvml.nvmlInit() handle = pynvml.nvmlDeviceGetHandleByIndex(self.device) meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle) self.curr_line = self.frame.f_lineno where_str = self.module_name + ' ' + self.func_name + ':' + ' line ' + str(self.curr_line) with open(self.gpu_profile_fn, 'a+') as f: if self.begin: f.write(f"GPU Memory Track | {datetime.datetime.now():%d-%b-%y-%H:%M:%S} |" f" Total Used Memory:{meminfo.used/1000**2:<7.1f}Mb\n\n") self.begin = False if self.print_detail is True: ts_list = [tensor.size() for tensor in self.get_tensors()] new_tensor_sizes = {(type(x), tuple(x.size()), ts_list.count(x.size()), np.prod(np.array(x.size()))*4/1000**2) for x in self.get_tensors()} for t, s, n, m in new_tensor_sizes - self.last_tensor_sizes: f.write(f'+ | {str(n)} * Size:{str(s):<20} | Memory: {str(m*n)[:6]} M | {str(t):<20}\n') for t, s, n, m in self.last_tensor_sizes - new_tensor_sizes: f.write(f'- | {str(n)} * Size:{str(s):<20} | Memory: {str(m*n)[:6]} M | {str(t):<20} \n') self.last_tensor_sizes = new_tensor_sizes f.write(f"\nAt {where_str:<50}" f"Total Used Memory:{meminfo.used/1000**2:<7.1f}Mb\n\n") pynvml.nvmlShutdown()
def get_free(self): """ Return the vram available """ self.initialize() vram = [pynvml.nvmlDeviceGetMemoryInfo(handle).free / (1024 * 1024) for handle in self.handles] self.shutdown() return vram
def _get_free_vram(self): """ Obtain the amount of VRAM that is available, in Megabytes, for each connected GPU. Returns ------- list List of floats containing the amount of VRAM available, in Megabytes, for each connected GPU as corresponding to the values in :attr:`_handles Notes ----- There is no useful way to get free VRAM on PlaidML. OpenCL loads and unloads VRAM as required, so this returns the total memory available per card for AMD cards, which us not particularly useful. """ self._initialize() if self._is_plaidml: vram = self._plaid.vram elif IS_MACOS: vram = [ pynvx.cudaGetMemFree(handle, ignore=True) / (1024 * 1024) for handle in self._handles ] else: vram = [ pynvml.nvmlDeviceGetMemoryInfo(handle).free / (1024 * 1024) for handle in self._handles ] self._shutdown() self._log("debug", "GPU VRAM free: {}".format(vram)) return vram
def cb(): nonlocal last_time now = time.time() src_dict = {"time": [now * 1000]} gpu_tot = 0 mem_tot = 0 tx_tot = 0 rx_tot = 0 for i in range(ngpus): gpu = pynvml.nvmlDeviceGetUtilizationRates(gpu_handles[i]).gpu mem = pynvml.nvmlDeviceGetMemoryInfo(gpu_handles[i]).used gpu_tot += gpu mem_tot += mem / (1024 * 1024) if pci_gen is not None: tx = (pynvml.nvmlDeviceGetPcieThroughput( gpu_handles[i], pynvml.NVML_PCIE_UTIL_TX_BYTES) * 1024) rx = (pynvml.nvmlDeviceGetPcieThroughput( gpu_handles[i], pynvml.NVML_PCIE_UTIL_RX_BYTES) * 1024) rx_tot += rx tx_tot += tx src_dict["gpu-" + str(i)] = [gpu] src_dict["memory-" + str(i)] = [mem] src_dict["gpu-total"] = [gpu_tot / ngpus] src_dict["memory-total"] = [(mem_tot / gpu_mem_sum) * 100] src_dict["tx-total"] = [tx_tot] src_dict["rx-total"] = [rx_tot] source.stream(src_dict, 1000) last_time = now
def get_gpu_mem_used(): try: from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo nvmlInit() handle = nvmlDeviceGetHandleByIndex(0) totalMemory = nvmlDeviceGetMemoryInfo(handle) return totalMemory.used except Exception: return -1
def get_used(self): """ Return the vram in use """ self.initialize() vram = [pynvml.nvmlDeviceGetMemoryInfo(handle).used / (1024 * 1024) for handle in self.handles] self.shutdown() if self.verbose: print("GPU VRAM used: {}".format(vram)) return vram
def get_free(self): """ Return the vram available """ self.initialize() if IS_MACOS: vram = [pynvx.cudaGetMemFree(handle, ignore=True) / (1024 * 1024) for handle in self.handles] else: vram = [pynvml.nvmlDeviceGetMemoryInfo(handle).free / (1024 * 1024) for handle in self.handles] self.shutdown() if self.logger: self.logger.debug("GPU VRAM free: %s", vram) return vram
def get_memory_information(handle): mem_total = -1 mem_used = -1 mem_percent = -1 try: memInfo = pynvml.nvmlDeviceGetMemoryInfo(handle) mem_total = memInfo.total / 1024 / 1024 mem_used = memInfo.used / 1024 / 1024 mem_percent = (float(memInfo.used) / memInfo.total) * 100. except Exception: pass return mem_used, mem_total, mem_percent
def get_used(self): """ Return the vram in use """ self.initialize() if IS_MACOS: vram = [pynvx.cudaGetMemUsed(handle, ignore=True) / (1024 * 1024) for handle in self.handles] else: vram = [pynvml.nvmlDeviceGetMemoryInfo(handle).used / (1024 * 1024) for handle in self.handles] self.shutdown() if self.logger: self.logger.verbose("GPU VRAM used: %s", vram) return vram
def get_vram(self): """ Return total vram in megabytes per device """ self.initialize() if self.device_count == 0: vram = list() elif IS_MACOS: vram = [pynvx.cudaGetMemTotal(handle, ignore=True) / (1024 * 1024) for handle in self.handles] else: vram = [pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024 * 1024) for handle in self.handles] if self.logger: self.logger.debug("GPU VRAM: %s", vram) return vram
def _crawl_in_system(self): ''' nvidia-smi returns following: MEMORY, UTILIZATION, ECC, TEMPERATURE, POWER, CLOCK, COMPUTE, PIDS, PERFORMANCE, SUPPORTED_CLOCKS, PAGE_RETIREMENT, ACCOUNTING currently, following are requested based on dlaas requirements: utilization.gpu, utilization.memory, memory.total, memory.free, memory.used nvidia-smi --query-gpu=utilization.gpu,utilization.memory,\ memory.total,memory.free,memory.used --format=csv,noheader,nounits ''' if self._init_nvml() == -1: return self.inspect_arr = exec_dockerps() num_gpus = pynvml.nvmlDeviceGetCount() for gpuid in range(0, num_gpus): gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(gpuid) temperature = pynvml.nvmlDeviceGetTemperature( gpuhandle, pynvml.NVML_TEMPERATURE_GPU) memory = pynvml.nvmlDeviceGetMemoryInfo(gpuhandle) mem_total = memory.total / 1024 / 1024 mem_used = memory.used / 1024 / 1024 mem_free = memory.free / 1024 / 1024 power_draw = pynvml.nvmlDeviceGetPowerUsage(gpuhandle) / 1000 power_limit = pynvml.nvmlDeviceGetEnforcedPowerLimit( gpuhandle) / 1000 util = pynvml.nvmlDeviceGetUtilizationRates(gpuhandle) util_gpu = util.gpu util_mem = util.memory entry = { 'utilization': {'gpu': util_gpu, 'memory': util_mem}, 'memory': {'total': mem_total, 'free': mem_free, 'used': mem_used}, 'temperature': temperature, 'power': {'draw': power_draw, 'limit': power_limit} } key = self._get_feature_key(gpuhandle, gpuid) if gpuid == num_gpus - 1: self._shutdown_nvml() yield (key, entry, 'gpu') return
def request_mem(mem_mb, i_am_nice=True):
    # titanx' mem: 12,881,559,552 bytes
    # 12*1024*1024*1024 = 12,884,901,888
    mem = mem_mb * 1024 * 1024
    nvml.nvmlInit()
    # n = nvml.nvmlDeviceGetCount()
    try:
        handle = nvml.nvmlDeviceGetHandleByIndex(0)
        info = nvml.nvmlDeviceGetMemoryInfo(handle)
        cap = info.total * nice_ratio
        # req = cap if mem > cap and i_am_nice else mem
        req = mem
        if req > cap and i_am_nice:
            raise MemoryError('You are supposed to be polite..')
        if req > info.free:
            raise MemoryError('Cannot fulfil the gpumem request')
        return req / info.free
    finally:
        nvml.nvmlShutdown()
def collect_via_pynvml(self, stats_config):
    """
    Use pynvml python binding to collect metrics
    :param stats_config:
    :return:
    """
    try:
        NVML_TEMPERATURE_GPU = 0
        pynvml.nvmlInit()
        device_count = pynvml.nvmlDeviceGetCount()
        for device_index in xrange(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
            memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            utilizationRates = pynvml.nvmlDeviceGetUtilizationRates(handle)
            metrics = {
                'memory.total': memoryInfo.total / 1024 / 1024,
                'memory.used': memoryInfo.used / 1024 / 1024,
                'memory.free': memoryInfo.free / 1024 / 1024,
                'utilization.gpu': utilizationRates.gpu,
                'utilization.memory': utilizationRates.memory,
                'temperature.gpu':
                    pynvml.nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
            }
            for stat_name in stats_config[1:]:
                metric = metrics.get(stat_name)
                if metric:
                    metric_name = 'gpu_{index}.{stat_name}'.format(
                        index=str(device_index),
                        stat_name=stat_name
                    )
                    self.publish(metric_name, metric)
    finally:
        pynvml.nvmlShutdown()
def do_GET(self):
    # checks if the server is alive
    if self.path == '/test':
        send_header(self)
        self.wfile.write(bytes('passed<br>', 'utf-8'))
        self.wfile.write(bytes('server is responding', 'utf-8'))
    # returns the running processes
    if self.path == '/runningProcesses':
        send_header(self)
        # send response:
        if modules['psutil']:
            for proc in psutil.process_iter():
                try:
                    pinfo = proc.as_dict(attrs=['pid', 'name'])
                except psutil.NoSuchProcess:
                    pass
                print(pinfo)
                self.wfile.write(bytes(str(pinfo), 'utf-8'))
        else:
            self.wfile.write(bytes('I am sorry but the Python module psutil is not installed. Therefore the running processes cannot be shown.', 'utf-8'))
    # returns the CPU utilization and number of cores
    elif self.path == '/cpuInfo':
        send_header(self)
        # get CPU info
        cpuInfo = {}
        if modules['psutil']:
            cpuInfo['CPU Utilization'] = int(psutil.cpu_percent())
            cpuInfo['CPU Cores'] = int(psutil.cpu_count())
        else:
            cpuInfo['Missing Python module'] = 'I am sorry but the Python module psutil is not installed. Therefore the number of CPU cores cannot be shown.'
        json_dump = json.dumps(cpuInfo)
        self.wfile.write(bytes(json_dump, 'utf-8'))
        # get GPU info
        if modules['pynvml']:
            try:
                pynvml.nvmlInit()
                gpus = pynvml.nvmlDeviceGetCount()
            except:
                gpus = 0
                self.wfile.write(bytes('No NVIDIA GPU detected', 'utf-8'))
        else:
            gpus = 0
            self.wfile.write(bytes('I am sorry but the Python module pynvml is not installed. Therefore info about NVIDIA GPUs cannot be shown.', 'utf-8'))
        for i in range(gpus):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            self.wfile.write(bytes("<br>GPU " + str(i + 1) + ": " + pynvml.nvmlDeviceGetName(handle).decode('utf-8'), 'utf-8'))
            try:
                self.wfile.write(bytes('<br>Temperature: ' + str(pynvml.nvmlDeviceGetTemperature(handle, 0)) + '°C', 'utf-8'))
            except:
                self.wfile.write(bytes('<br>Could not retrieve temperature', 'utf-8'))
            try:
                gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.wfile.write(bytes('<br>Total memory: %i Megabytes' % (gpu_mem.total / 10**6), 'utf-8'))
                self.wfile.write(bytes(str('<br>Free memory: %i' % (gpu_mem.free / gpu_mem.total * 100)) + '%', 'utf-8'))
            except:
                self.wfile.write(bytes('<br>Could not retrieve memory information', 'utf-8'))
        if gpus > 0:
            try:
                pynvml.nvmlShutdown()
            except:
                pass
    elif self.path == '/availableComputers':
        send_header(self)
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(('google.com', 0))
        global myownsocket
        myownsocket = s.getsockname()[0]
        port = 8003
        available_computers = []
        for i in range(1, 256):
            host = '192.168.178.' + str(i)
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.settimeout(0.2)
            try:
                alive = sock.connect_ex((host, port))
            except:
                alive = -1
            if alive == 0:
                print('available')
                available_computers.append(host)
            else:
                print('not available')
            print(host)
        self.wfile.write(bytes('<form action="submit_job">\n', 'utf-8'))
        cmd_txt = """@echo off
call "C:\Program Files\Autodesk\Softimage 2015\Application\bin\setenv.bat"
echo ##### start_rendering
xsibatch -render "Z:\TAZ_RoterFaden\PROCESS\XSI\Scenes\SC_060\088_160523_SC_060_V007.scn" -frames #1#-#2# -pass "BEAUTY" -skip on -verbose on
echo ##### rendering_done
"""
        self.wfile.write(bytes('Command: <textarea name="command">' + cmd_txt + '</textarea><br>\n', 'utf-8'))
        self.wfile.write(bytes('<table border="1">\n', 'utf-8'))
        self.wfile.write(bytes('<tr>\n', 'utf-8'))
        self.wfile.write(bytes('<th>Computer</th>\n', 'utf-8'))
        self.wfile.write(bytes('<th>CPU cores</th>\n', 'utf-8'))
        self.wfile.write(bytes('<th>Start Frame [%]</th>\n', 'utf-8'))
        self.wfile.write(bytes('<th>End Frame [%]</th>\n</tr>\n', 'utf-8'))
        available_cpus = {}
        for host in available_computers:
            available_cpus[host] = abs(get_cpu_cores(host))
        total_cpus = sum(available_cpus.values())
        frame_list = {}
        start_frame = 0
        for host in available_computers:
            start_frame += 1
            frame_list[host] = [start_frame]
            start_frame = start_frame + int(100 * (available_cpus[host] / total_cpus))
            if start_frame > 100:
                start_frame = 100
            frame_list[host].append(start_frame)
        index = 0
        for host in available_computers:
            index += 1
            self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
            self.wfile.write(bytes(host, 'utf-8'))
            self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
            self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
            self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
            self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
            self.wfile.write(bytes('</tr>', 'utf-8'))
        index = 2
        self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
        self.wfile.write(bytes(host, 'utf-8'))
        self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
        self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
        self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
        self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
        self.wfile.write(bytes('</tr>', 'utf-8'))
        self.wfile.write(bytes('</table>\n', 'utf-8'))
        self.wfile.write(bytes('<input type="submit" value="Submit Job">\n', 'utf-8'))
        self.wfile.write(bytes('</form>\n', 'utf-8'))
        self.wfile.write(bytes('</body>\n', 'utf-8'))
        self.wfile.write(bytes('</html>\n', 'utf-8'))
    elif self.path == '/execute_job':
        send_header(self)
        parsed = urlparse(self.path)
        parameters = parse_qs(parsed.query)
    elif '/submit_job' in self.path:
        send_header(self)
        self.wfile.write(bytes(str(self.client_address), 'utf-8'))
        parsed = urlparse(self.path)
        parameters = parse_qs(parsed.query)
        # print(parsed)
        print(parameters)
        self.wfile.write(bytes('<body>', 'utf-8'))
        for index in range(1, 100):
            if not parameters.get('host' + str(index)).strip():
                pass
            elif not parameters.get('start' + str(index)).strip():
                pass
            elif not parameters.get('end' + str(index)).strip():
                pass
            elif parameters.get('command'):
                cmd_txt = parameters['command'][0].replace('#1#', parameters['start' + str(index)][0].strip())
                cmd_txt = cmd_txt.replace('#2#', parameters['end' + str(index)][0].strip())
                self.wfile.write(bytes(escape(cmd_txt), 'utf-8'))
                self.wfile.write(bytes('<br>', 'utf-8'))
                print(cmd_txt)
        self.wfile.write(bytes('</body></html>', 'utf-8'))
    elif '/shutdown' in self.path:
        send_header(self)
        self.wfile.write(bytes(str(self.client_address), 'utf-8'))
        self.wfile.write(bytes("Server will be shut down now......", 'utf-8'))
        server.shutdown()
        sys.exit()
    else:
        send_header(self)
        self.wfile.write(bytes(str(self.client_address), 'utf-8'))
        self.wfile.write(bytes("<br>", 'utf-8'))
        self.wfile.write(bytes(self.path, 'utf-8'))
        print(self.path)
def get_gpu_info(handle): """Get one GPU information specified by nvml handle""" def get_process_info(nv_process): """Get the process information of specific pid""" process = {} ps_process = psutil.Process(pid=nv_process.pid) process['username'] = ps_process.username() # cmdline returns full path; # as in `ps -o comm`, get short cmdnames. _cmdline = ps_process.cmdline() if not _cmdline: # sometimes, zombie or unknown (e.g. [kworker/8:2H]) process['command'] = '?' else: process['command'] = os.path.basename(_cmdline[0]) # Bytes to MBytes process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB process['pid'] = nv_process.pid return process name = _decode(N.nvmlDeviceGetName(handle)) uuid = _decode(N.nvmlDeviceGetUUID(handle)) try: temperature = N.nvmlDeviceGetTemperature( handle, N.NVML_TEMPERATURE_GPU ) except N.NVMLError: temperature = None # Not supported try: memory = N.nvmlDeviceGetMemoryInfo(handle) # in Bytes except N.NVMLError: memory = None # Not supported try: utilization = N.nvmlDeviceGetUtilizationRates(handle) except N.NVMLError: utilization = None # Not supported try: power = N.nvmlDeviceGetPowerUsage(handle) except N.NVMLError: power = None try: power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle) except N.NVMLError: power_limit = None try: nv_comp_processes = \ N.nvmlDeviceGetComputeRunningProcesses(handle) except N.NVMLError: nv_comp_processes = None # Not supported try: nv_graphics_processes = \ N.nvmlDeviceGetGraphicsRunningProcesses(handle) except N.NVMLError: nv_graphics_processes = None # Not supported if nv_comp_processes is None and nv_graphics_processes is None: processes = None else: processes = [] nv_comp_processes = nv_comp_processes or [] nv_graphics_processes = nv_graphics_processes or [] for nv_process in nv_comp_processes + nv_graphics_processes: # TODO: could be more information such as system memory # usage, CPU percentage, create time etc. try: process = get_process_info(nv_process) processes.append(process) except psutil.NoSuchProcess: # TODO: add some reminder for NVML broken context # e.g. nvidia-smi reset or reboot the system pass index = N.nvmlDeviceGetIndex(handle) gpu_info = { 'index': index, 'uuid': uuid, 'name': name, 'temperature.gpu': temperature, 'utilization.gpu': utilization.gpu if utilization else None, 'power.draw': power // 1000 if power is not None else None, 'enforced.power.limit': power_limit // 1000 if power_limit is not None else None, # Convert bytes into MBytes 'memory.used': memory.used // MB if memory else None, 'memory.total': memory.total // MB if memory else None, 'processes': processes, } return gpu_info
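# Minimal usage sketch (assumption, not part of the original module): gather a report
# for every installed GPU with get_gpu_info(). Assumes the module's N alias for pynvml
# and that NVML initialises on this machine.
N.nvmlInit()
try:
    report = [get_gpu_info(N.nvmlDeviceGetHandleByIndex(i))
              for i in range(N.nvmlDeviceGetCount())]
finally:
    N.nvmlShutdown()
for gpu in report:
    print(gpu['index'], gpu['name'], gpu['memory.used'], '/', gpu['memory.total'], 'MB')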
def step(self): valuesDict = {} valuesDict['table'] = self._tableName cpu = valuesDict['cpu'] = psutil.cpu_percent(interval=0) mem = valuesDict['mem'] = psutil.virtual_memory().percent swap = valuesDict['swap'] = psutil.swap_memory().percent # some code examples: # https://github.com/ngi644/datadog_nvml/blob/master/nvml.py if self.doGpu: for i in self.gpusToUse: try: handle = nvmlDeviceGetHandleByIndex(i) memInfo = nvmlDeviceGetMemoryInfo(handle) valuesDict["gpuMem_%d" % i] = \ float(memInfo.used)*100./float(memInfo.total) util = nvmlDeviceGetUtilizationRates(handle) valuesDict["gpuUse_%d" % i] = util.gpu temp = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU) valuesDict["gpuTem_%d" % i] = temp except NVMLError as err: handle = nvmlDeviceGetHandleByIndex(i) msg = "Device %d -> %s not suported\n" \ "Remove device %d from FORM" % \ (i, nvmlDeviceGetName(handle), i) errorWindow(None, msg) if self.doNetwork: try: # measure a sort interval pnic_before = psutil.net_io_counters(pernic=True)[self.nif] time.sleep(self.samplingTime) # sec pnic_after = psutil.net_io_counters(pernic=True)[self.nif] bytes_sent = pnic_after.bytes_sent - pnic_before.bytes_sent bytes_recv = pnic_after.bytes_recv - pnic_before.bytes_recv valuesDict["%s_send" % self.nif] = \ bytes_sent * self.samplingTime / 1048576 valuesDict["%s_recv" % self.nif] = \ bytes_recv * self.samplingTime / 1048576 except: msg = "cannot get information of network interface %s" % \ self.nif if self.doDiskIO: try: # measure a sort interval disk_before = psutil.disk_io_counters(perdisk=False) time.sleep(self.samplingTime) # sec disk_after = psutil.disk_io_counters(perdisk=False) bytes_read = disk_after.read_bytes - disk_before.read_bytes bytes_write = disk_after.write_bytes - disk_before.write_bytes valuesDict["disk_read"] = \ self.samplingTime * bytes_read / self.mega valuesDict["disk_write"] = \ self.samplingTime * bytes_write / self.mega except: msg = "cannot get information of disk usage " if self.cpuAlert < 100 and cpu > self.cpuAlert: self.warning("CPU allocation =%f." % cpu) self.cpuAlert = cpu if self.memAlert < 100 and mem.percent > self.memAlert: self.warning("Memory allocation =%f." % mem) self.memAlert = mem if self.swapAlert < 100 and swap.percent > self.swapAlert: self.warning("SWAP allocation =%f." % swap) self.swapAlert = swap sqlCommand = "INSERT INTO %(table)s (" for label in self.labelList: sqlCommand += "%s, " % label # remove last comma sqlCommand = sqlCommand[:-2] sqlCommand += ") VALUES(" for label in self.labelList: sqlCommand += "%"+"(%s)f, " % label # remove last comma sqlCommand = sqlCommand[:-2] sqlCommand += ");" sql = sqlCommand % valuesDict try: self.cur.execute(sql) except Exception as e: print("ERROR: saving one data point (monitor). I continue") # Return finished = True if all protocols have finished finished = [] for prot in self.protocols: updatedProt = getUpdatedProtocol(prot) finished.append(updatedProt.getStatus() != STATUS_RUNNING) return all(finished)
def info_refresh(self): try: stat = open("/proc/stat") self.statlines = stat.read().splitlines()[1:-1] stat.close() except IOError: print("Problem opening /proc/stat, exiting..") pynvml.nvmlShutdown() quit() for i in range(self.corecount): for j in self.statlines[i].split()[1:]: #remove cpu# self.total[i]+= int(j) self.idle[i] = int(self.statlines[i].split()[4]) for i in range(self.corecount): if (self.total[i] - self.prev_total[i]) == 0: self.prev_idle[i] = self.idle[i] self.prev_total[i] = self.total[i] break self.cpu_prog_bars[i].set_fraction(1 - ((self.idle[i] - self.prev_idle[i]) / (self.total[i] - self.prev_total[i])) ) self.prev_idle[i] = self.idle[i] self.prev_total[i] = self.total[i] self.idle[i] = 0 self.total[i] = 0 for i in range(self.deviceCount): util = pynvml.nvmlDeviceGetUtilizationRates(self.gpu_handles[i]) temp = pynvml.nvmlDeviceGetTemperature(self.gpu_handles[i], pynvml.NVML_TEMPERATURE_GPU) memInfo = pynvml.nvmlDeviceGetMemoryInfo(self.gpu_handles[i]) (encoder_util, sPeriod) = pynvml.nvmlDeviceGetEncoderUtilization(self.gpu_handles[i]) (decoder_util, sPeriod) = pynvml.nvmlDeviceGetDecoderUtilization(self.gpu_handles[i]) mem_total = memInfo.total / 1024 / 1024 mem_used = memInfo.used / 1024 / 1024 self.gpu_prog_bars[i*6].set_text("GPU: %d%%" % util.gpu) self.gpu_prog_bars[i*6].set_fraction(util.gpu / 100) ######## self.util_history.append(util.gpu) self.util_graph.queue_draw() self.temp_history.append(temp) self.temp_graph.queue_draw() ######## self.gpu_prog_bars[i*6 +1].set_text("Memory Utilization: %d%%" % util.memory) self.gpu_prog_bars[i*6 +1].set_fraction(util.memory / 100) self.gpu_prog_bars[i*6 +4].set_text("Encoder: %d%%" % encoder_util) self.gpu_prog_bars[i*6 +5].set_text("Decoder: %d%%" % decoder_util) self.gpu_prog_bars[i*6 +4].set_fraction(encoder_util / 100) self.gpu_prog_bars[i*6 +5].set_fraction(decoder_util / 100) self.gpu_prog_bars[i*6 +2].set_text("Memory Usage: %d MiB/%d MiB" % (mem_used, mem_total)) self.gpu_prog_bars[i*6 +2].set_fraction(mem_used / mem_total) self.gpu_prog_bars[i*6 +3].set_text("Temperature: %d °C" % temp) if temp > 100: temp = 100 elif temp < 0: temp = 0 self.gpu_prog_bars[i*6 +3].set_fraction(temp / 100) #--proc-- procs = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.gpu_handles[0]) proc_liststore = Gtk.ListStore(int, str, int) for p in procs: pid = p.pid try: path = pynvml.nvmlSystemGetProcessName(p.pid).decode('utf-8') except: self.exit() if (p.usedGpuMemory == None): mem = 0 else: mem = (p.usedGpuMemory / 1024 / 1024) proc_liststore.append([pid, path, mem]) self.tree.set_model(proc_liststore) return True
def check(self, instance): pynvml.nvmlInit() msg_list = [] try: deviceCount = pynvml.nvmlDeviceGetCount() except: deviceCount = 0 for device_id in xrange(deviceCount): handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) name = pynvml.nvmlDeviceGetName(handle) tags = dict(name="{}-{}".format(name, device_id)) d_tags = self._dict2list(tags) # temperature info try: temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU) self.gauge('nvml.temp.', temp, tags=d_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err)) # memory info try: mem = pynvml.nvmlDeviceGetMemoryInfo(handle) self.gauge('nvml.mem.total', mem.total, tags=d_tags) self.gauge('nvml.mem.used', mem.used, tags=d_tags) self.gauge('nvml.mem.free', mem.free, tags=d_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err)) # utilization GPU/Memory info try: util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle) self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags) self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err)) # utilization Encoder info try: util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle) self.log.info('nvml.util.encoder %s' % long(util_encoder[0])) self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err)) # utilization Decoder info try: util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle) self.log.info('nvml.util.decoder %s' % long(util_decoder[0])) self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err)) # Compute running processes try: cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) for ps in cps: p_tags = tags.copy() p_tags['pid'] = ps.pid p_tags['name'] = psutil.Process(ps.pid).name() p_tags = self._dict2list(p_tags) self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err)) if msg_list: status = AgentCheck.CRITICAL msg = u','.join(msg_list) else: status = AgentCheck.OK msg = u'Ok' pynvml.nvmlShutdown() self.service_check('nvml.check', status, message=msg)
def get_vram(self): """ Return total vram in megabytes per device """ vram = [pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024 * 1024) for handle in self.handles] return vram
def printGPUINFO():
    gpu_id = config.GPU_ID
    gpu_obj = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
    print("gpu mem used:",
          pynvml.nvmlDeviceGetMemoryInfo(gpu_obj).used / 1024 / 1024, "MB")