def count_devices() -> int:
    """
    Returns the number of available GPU devices installed on the host.

    Returns:
        int: The number of available devices.
    """
    query = smi.getInstance().DeviceQuery('count')
    if query:
        return int(query['count'])
    return 0
def devices_index() -> List[int]:
    """
    Returns an index list, containing the device index for each available GPU.

    Returns:
        list: A list with the index of each available device.
    """
    query = smi.getInstance().DeviceQuery('index')
    if query:
        return list(range(len(query['gpu'])))
    return list()
def nvidia_driver_version() -> Tuple[Optional[int], Optional[int]]:
    """
    Returns the NVIDIA driver version.

    Returns:
        tuple: A tuple with the major and minor driver version.
    """
    query = smi.getInstance().DeviceQuery('driver_version')
    if query:
        _version = query['driver_version'].split('.')
        return int(_version[0]), int(_version[1])
    return None, None
def __init__(self) -> None:
    try:
        from pynvml.smi import nvidia_smi
    except ImportError:
        raise RuntimeError(
            "This contrib module requires pynvml to be installed. "
            "Please install it with command: \n pip install pynvml")
    # Let's check available devices
    if not torch.cuda.is_available():
        raise RuntimeError("This contrib module requires available GPU")
    # Let it fail if no libnvidia drivers or NVML library found
    self.nvsmi = nvidia_smi.getInstance()
    super(GpuInfo, self).__init__()
from pynvml import (nvmlInit, nvmlSystemGetDriverVersion, nvmlDeviceGetCount,
                    nvmlDeviceGetHandleByIndex, nvmlDeviceGetName)
from pynvml.smi import nvidia_smi


def getDeviceInfo(require):
    if require == 'overview':
        # Print the driver version and the name of every device.
        nvmlInit()
        print("Driver Version:", nvmlSystemGetDriverVersion())
        deviceCount = nvmlDeviceGetCount()
        for i in range(deviceCount):
            handle = nvmlDeviceGetHandleByIndex(i)
            print("Device", i, ":", nvmlDeviceGetName(handle))
    elif require == 'nvsmi':
        # Return free/total memory as reported by nvidia-smi.
        nvsmi = nvidia_smi.getInstance()
        nvsmi = nvsmi.DeviceQuery('memory.free, memory.total')
        return nvsmi
    else:
        raise ValueError(require)
def main():
    app = connexion.App(__name__, specification_dir='./swagger/')
    app.app.json_encoder = encoder.JSONEncoder
    app.add_api('swagger.yaml',
                arguments={'title': 'midgard API'},
                pythonic_params=True)
    try:
        import pynvml
        from pynvml.smi import nvidia_smi
        app.app.nvsmi = nvidia_smi.getInstance()
    except Exception as e:
        flask_logger.error(
            "Failed to load NVML. This node cannot produce GPU information",
            exc_info=True)
        app.app.nvsmi = None
    app.run(port=args.port)
import time

from pynvml.smi import nvidia_smi


def daemon_process(time_interval, json_path, gpu_index=0):
    gpu_memory_max = 0
    while True:
        nvsmi = nvidia_smi.getInstance()
        dictm = nvsmi.DeviceQuery('memory.free, memory.total')
        # Used memory = total framebuffer memory minus free memory.
        gpu_memory = (dictm['gpu'][gpu_index]['fb_memory_usage']['total']
                      - dictm['gpu'][gpu_index]['fb_memory_usage']['free'])
        print("gpu_memory", gpu_memory)
        # if os.path.exists(json_path):
        #     with open(json_path) as f:
        #         js = json.load(f)
        # else:
        #     js = {
        #         'gpu_memory': []
        #     }
        # with open(json_path, 'w') as f:
        #     # js['gpu_memory'] = gpu_memory_max
        #     js['gpu_memory'].append(gpu_memory)
        #     json.dump(js, f, indent=4)
        time.sleep(time_interval)
def __init__(self, *args, **kwargs):
    super(GPUKernels, self).__init__(*args, **kwargs)
    self.nvsmi = nvidia_smi.getInstance()
def get_query_dict(filters: List[str]) -> Dict:
    """Run a DeviceQuery for the given filter fields and return the result dict."""
    return smi.getInstance().DeviceQuery(', '.join(filters))
def getMemoryUsage():
    nvsmi = nvidia_smi.getInstance()
    usage = nvsmi.DeviceQuery("memory.used")["gpu"][0]["fb_memory_usage"]
    return "%d %s" % (usage["used"], usage["unit"])
import psutil
import toml

from pynvml.smi import nvidia_smi

config = toml.load('config.toml')
# requested RAM size
memory = config["MEMORY"]
# requested GPU memory (VRAM) size
v_memory = config["V_MEMORY"]

while True:
    used_memory = psutil.virtual_memory().used
    if used_memory > memory * 1024 * 1024 * 1024:
        print("RAM usage exceeds the requested amount")
    nvsmi = nvidia_smi.getInstance()
    results = nvsmi.DeviceQuery('memory.used')
    # e.g. {'gpu': [{'fb_memory_usage': {'used': 0.0625, 'unit': 'MiB'}},
    #               {'fb_memory_usage': {'used': 0.0625, 'unit': 'MiB'}}]}
    used_v_memory = 0
    for item in results['gpu']:
        used = item['fb_memory_usage']['used']
        used_v_memory += used
    if used_v_memory > v_memory * 1024:
        print("GPU memory usage exceeds the requested amount")
def gpu_mem():
    from pynvml.smi import nvidia_smi
    nvsmi = nvidia_smi.getInstance()
    return nvsmi.DeviceQuery("memory.free, memory.total")
def __borrarMemoria(self):
    # Query nvidia-smi (the result is discarded), then release cached
    # CUDA memory and run Python garbage collection.
    nvsmi = nvidia_smi.getInstance()
    nvsmi.DeviceQuery('memory.free, memory.total')
    torch.cuda.empty_cache()
    gc.collect()
def smi(request):
    return nvidia_smi.getInstance()
def get_gpu_infos(self):
    nvsmi = nvidia_smi.getInstance()
    gpu_infos = nvsmi.DeviceQuery("index, uuid, name")
    self.logger.debug(f"Got device info from nvidia-smi: {gpu_infos}")
    return gpu_infos
def _main_func():
    # `self`, `mb`, `show` and `current_pid` are captured from the enclosing scope.
    try:
        # first get name
        import torch as th
        import os
    except:
        self.P("ERROR: PyTorch not installed! Please install PyTorch.")
        return None

    nvsmires = None
    try:
        from pynvml.smi import nvidia_smi
        import pynvml
        nvsmi = nvidia_smi.getInstance()
        nvsmires = nvsmi.DeviceQuery(
            'memory.free, memory.total, memory.used, utilization.gpu, temperature.gpu')
        pynvml_avail = True
    except:
        pynvml_avail = False

    lst_inf = []
    # now we iterate all devices
    n_gpus = th.cuda.device_count()
    if n_gpus > 0:
        th.cuda.empty_cache()
    current_pid_has_usage = False
    current_pid_gpus = []
    try:
        for device_id in range(n_gpus):
            dct_device = {}
            device_props = th.cuda.get_device_properties(device_id)
            dct_device['NAME'] = device_props.name
            dct_device['TOTAL_MEM'] = round(
                device_props.total_memory / 1024 ** (2 if mb else 3), 2
            )
            mem_total = None
            mem_allocated = None
            gpu_used = None
            gpu_temp = None
            gpu_temp_max = None
            if pynvml_avail and nvsmires is not None and 'gpu' in nvsmires:
                dct_gpu = nvsmires['gpu'][device_id]
                mem_total = round(
                    dct_gpu['fb_memory_usage']['total'] / (1 if mb else 1024), 2
                )  # already from th
                mem_allocated = round(
                    dct_gpu['fb_memory_usage']['used'] / (1 if mb else 1024), 2
                )
                gpu_used = dct_gpu['utilization']['gpu_util']
                if isinstance(gpu_used, str):
                    gpu_used = -1
                gpu_temp = dct_gpu['temperature']['gpu_temp']
                gpu_temp_max = dct_gpu['temperature']['gpu_temp_max_threshold']
                handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
                processes = []
                for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
                    dct_proc_info = {k.upper(): v for k, v in proc.__dict__.items()}
                    used_mem = dct_proc_info.pop('USEDGPUMEMORY', None)
                    dct_proc_info['ALLOCATED_MEM'] = round(
                        used_mem / 1024 ** (2 if mb else 3) if used_mem is not None else 0.0, 2
                    )
                    processes.append(dct_proc_info)
                    if dct_proc_info['PID'] == os.getpid():
                        current_pid_has_usage = True
                        current_pid_gpus.append(device_id)
                # endfor
                dct_device['PROCESSES'] = processes
                dct_device['USED_BY_PROCESS'] = device_id in current_pid_gpus
            else:
                str_os = platform.platform()
                ## check if platform is Tegra and record
                if 'tegra' in str_os.lower():
                    # we just record the overall free memory
                    mem_total = self.get_machine_memory()
                    mem_allocated = mem_total - self.get_avail_memory()
                    gpu_used = 1
                    gpu_temp = 1
                    gpu_temp_max = 100
                    if not self._done_first_smi_error and nvsmires is not None:
                        self.P("Running `gpu_info` on Tegra platform: {}".format(nvsmires), color='r')
                        self._done_first_smi_error = True
                elif not self._done_first_smi_error:
                    str_log = "ERROR: Please make sure you have both pytorch and pynvml in order to monitor the GPU"
                    str_log += "\nError info: pynvml_avail={}, nvsmires={}".format(pynvml_avail, nvsmires)
                    self.P(str_log)
                    self._done_first_smi_error = True
            # endif
            dct_device['ALLOCATED_MEM'] = mem_allocated
            dct_device['FREE_MEM'] = -1
            if all(x is not None for x in [mem_total, mem_allocated]):
                dct_device['FREE_MEM'] = round(mem_total - mem_allocated, 2)
            dct_device['MEM_UNIT'] = 'MB' if mb else 'GB'
            dct_device['GPU_USED'] = gpu_used
            dct_device['GPU_TEMP'] = gpu_temp
            dct_device['GPU_TEMP_MAX'] = gpu_temp_max
            lst_inf.append(dct_device)
        # end for all devices
    except Exception as e:
        self.P("gpu_info exception for device_id {}:\n{}".format(device_id, e), color='r')

    if show:
        self.P("GPU information for {} device(s):".format(len(lst_inf)), color='y')
        for dct_gpu in lst_inf:
            for k, v in dct_gpu.items():
                self.P(" {:<14} {}".format(k + ':', v), color='y')
    if current_pid and current_pid_has_usage:
        return [lst_inf[x] for x in current_pid_gpus]
    else:
        return lst_inf
def query_gpu(*fields) -> List[Dict]:
    nvsmi = nvidia_smi.getInstance()
    gpu_infos = nvsmi.DeviceQuery(','.join(fields))
    return gpu_infos["gpu"]
def gpus_snap_info():
    nvsmi = nvidia_smi.getInstance()
    return nvsmi.DeviceQuery(
        "memory.free,memory.total,memory.used,compute-apps,temperature.gpu,driver_version,timestamp,name"
    )