def get():
    """Return per-GPU utilization and memory-bandwidth utilization percentages."""
    handles = []
    output = []
    # nvmlDeviceGetCount() returns an int, so iterate over range(count)
    for device_id in range(nvidia_smi.nvmlDeviceGetCount()):
        handles.append(nvidia_smi.nvmlDeviceGetHandleByIndex(device_id))
    for handle in handles:
        res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
        output.append({'usage': res.gpu, 'memory': res.memory})
    return output
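# A minimal usage sketch for get(); it assumes `nvidia_smi` is a pynvml-style NVML
# binding (e.g. `import pynvml as nvidia_smi`) and that nothing has called nvmlInit()
# yet. The surrounding names are illustrative assumptions, not part of the original.
import pynvml as nvidia_smi

nvidia_smi.nvmlInit()          # NVML must be initialized before get() can query devices
for idx, stats in enumerate(get()):
    print("GPU {}: {}% core util, {}% memory-bandwidth util".format(
        idx, stats['usage'], stats['memory']))
nvidia_smi.nvmlShutdown()      # release NVML when finished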
def log_gpu_memory_to_tensorboard():
    '''Log each GPU's current free memory (in GiB) to TensorBoard.'''
    for i in range(nvidia_smi.nvmlDeviceGetCount()):
        info = nvidia_smi.nvmlDeviceGetMemoryInfo(gpus[i])
        with loggers[i].as_default():
            tl.summary({'free': np.array(info.free) / (1024 ** 3)},
                       step=int(time.time()), name='GPUs')
def check_cuda_memory():
    nvidia_smi.nvmlInit()
    deviceCount = nvidia_smi.nvmlDeviceGetCount()
    for i in range(deviceCount):
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        print("Device {}: {}, Memory : ({:.2f}% free): {}(total), {} (free), {} (used)"
              .format(i, nvidia_smi.nvmlDeviceGetName(handle),
                      100 * info.free / info.total,
                      info.total, info.free, info.used))
    nvidia_smi.nvmlShutdown()
    return
def __init__(self, exclude_gpu_ids: list = []):
    """
    Usage:
        g = GPUAllocator()
        gpu_id = g.get_gpu()
        ## do something with gpu_id
        g.set_as_free(gpu_id)
    """
    nvidia_smi.nvmlInit()
    self.num_gpus = nvidia_smi.nvmlDeviceGetCount()
    self.gpu_names = []
    for i in range(self.num_gpus):
        if i not in exclude_gpu_ids:
            self.gpu_names.append('cuda:' + str(i))
    self.usage = {}
    for i in range(self.num_gpus):
        if i not in exclude_gpu_ids:
            self.usage[i] = False
    # On a good day, this is how the variables look:
    #   self.num_gpus = 2
    #   self.gpu_names = ['cuda:0', 'cuda:1']
    #   self.usage = {0: False, 1: False}
    print("[" + Colors.CYAN + "EDEN" + Colors.END + "] " +
          'Initialized GPUAllocator with devices: ', self.gpu_names)
def init_gpu_writers(logdir):
    '''Set up one TensorBoard file writer per GPU.'''
    global gpus, loggers
    for i in range(nvidia_smi.nvmlDeviceGetCount()):
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
        gpus.append(handle)
        name = nvidia_smi.nvmlDeviceGetName(handle).decode().replace(' ', '-') + ':' + str(int(i))
        loggers.append(
            tf.summary.create_file_writer(
                os.path.join(logdir, os.uname().nodename, name)))
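# A hedged wiring sketch for the two TensorBoard helpers above. It assumes `gpus` and
# `loggers` are module-level lists, that `nvidia_smi` is a pynvml-style binding, and it
# uses tf.summary.scalar in place of the project-specific `tl.summary` wrapper.
import os
import time
import tensorflow as tf
import pynvml as nvidia_smi

gpus, loggers = [], []                 # module-level state shared with the helpers above

nvidia_smi.nvmlInit()
init_gpu_writers('/tmp/gpu-logs')      # hypothetical log directory

# Emit one free-memory scalar (in GiB) per GPU.
for i, handle in enumerate(gpus):
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    with loggers[i].as_default():
        tf.summary.scalar('GPUs/free', info.free / (1024 ** 3), step=int(time.time()))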
def get_SystemStats(process, NVIDIA_GPU):
    if NVIDIA_GPU:
        deviceCount = nvidia_smi.nvmlDeviceGetCount()
        gpu_memory = []
        gpu_utilization = []
        for i in range(0, deviceCount):
            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
            gpu_stat = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
            gpu_memory.append(gpu_stat.memory)
            gpu_utilization.append(gpu_stat.gpu)
    else:
        gpu_memory = []
        gpu_utilization = []
    sys_memory = process.memory_info()[0] / 2. ** 30  # resident set size in GiB
    return gpu_memory, gpu_utilization, sys_memory
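# A short usage sketch for get_SystemStats(); the psutil process handle and the prior
# nvmlInit() call are assumptions about how the original module sets things up.
import os
import psutil
import pynvml as nvidia_smi

nvidia_smi.nvmlInit()
proc = psutil.Process(os.getpid())
gpu_mem, gpu_util, sys_mem = get_SystemStats(proc, NVIDIA_GPU=True)
print('GPU mem util %: {}, GPU util %: {}, RSS: {:.2f} GiB'.format(gpu_mem, gpu_util, sys_mem))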
def set_cluster_client(n_gpus=-1, device_spill_frac=0.8):
    # TODO: Find a workaround; if this function is called a second time, recreating the
    # cluster fails. A new cluster can only be created after a kernel restart.
    '''
    device_spill_frac: Spill GPU-worker memory to host at this limit.
        Reduce if spilling fails to prevent device memory errors.
    '''
    if os.path.isdir("dask-worker-space"):
        shutil.rmtree('dask-worker-space', ignore_errors=True)

    # Deploy a single-machine, multi-GPU cluster
    if n_gpus == -1:
        nvidia_smi.nvmlInit()
        n_gpus_avail = nvidia_smi.nvmlDeviceGetCount()
        print('\n n_gpus_avail: {}'.format(n_gpus_avail))
        n_gpus = n_gpus_avail

    # Select devices to place workers on
    visible_devices = [i for i in list(range(n_gpus))]
    visible_devices = str(visible_devices)[1:-1]
    # print('visible_devices: {}'.format(visible_devices))

    # TODO: how to reinitialize the cluster
    cluster = LocalCUDACluster(
        protocol="tcp",  # "tcp" or "ucx"
        CUDA_VISIBLE_DEVICES=visible_devices,
        device_memory_limit=device_spill_frac * device_mem_size(kind="total"),
    )

    try:
        # Create the distributed client
        client = Client(cluster)
        display(client)
        print('\n Dashboard avail: http://localhost:8888/proxy/8787/status')

        # Initialize RMM pool on ALL workers
        def _rmm_pool():
            rmm.reinitialize(
                pool_allocator=True,
                initial_pool_size=None,  # Use default size
            )

        client.run(_rmm_pool)
        return client
    except MemoryError:
        print('\n The client is already initialized')
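# Hedged sketch of the imports set_cluster_client() relies on plus a call site; the exact
# import paths in the original module are assumptions (device_mem_size is taken from
# dask_cuda.utils here, and display from IPython).
import os
import shutil
import rmm
import pynvml as nvidia_smi
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from dask_cuda.utils import device_mem_size
from IPython.display import display

client = set_cluster_client(n_gpus=-1, device_spill_frac=0.8)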
def nvapi():
    nvmlInit()
    ret = {}
    n_gpus = int(nvmlDeviceGetCount())
    ret['n_gpus'] = n_gpus
    for i in range(n_gpus):
        gpu_str = '{}.'.format(i)
        gpu_obj = nvmlDeviceGetHandleByIndex(i)
        ret[gpu_str + 'temp'] = nvmlDeviceGetTemperature(gpu_obj, NVML_TEMPERATURE_GPU)
        this_ram = nvmlDeviceGetMemoryInfo(gpu_obj)
        ret[gpu_str + 'ram.used'] = this_ram.used / MB
        ret[gpu_str + 'ram.total'] = this_ram.total / MB
        ret[gpu_str + 'power.current'] = nvmlDeviceGetPowerUsage(gpu_obj) / 1000.0
        ret[gpu_str + 'power.limit'] = nvmlDeviceGetEnforcedPowerLimit(gpu_obj) / 1.0
        ret[gpu_str + 'util'] = nvmlDeviceGetUtilizationRates(gpu_obj).gpu / 1.0
    nvmlShutdown()
    return ret
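# A hedged usage sketch for nvapi(); the star import and the MB constant are assumptions
# about what the original module defines at top level.
from pynvml import *   # provides nvmlInit, nvmlDeviceGetCount, NVML_TEMPERATURE_GPU, ...

MB = 1024 ** 2         # assumed unit used for the ram.* fields

stats = nvapi()
for key in sorted(stats):
    print('{:>20}: {}'.format(key, stats[key]))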
def __get_gpu_info(self):
    def parse_unit(val, scale=1000):
        unit_ls = ['B', 'KB', 'MB', 'GB']
        unit_lv = 0
        while val >= scale:
            val /= scale
            unit_lv += 1
            if unit_lv == len(unit_ls) - 1:
                break
        return '{:.2f} {}'.format(val, unit_ls[unit_lv])

    sum_info = []
    process_ls = []
    nv.nvmlInit()
    gpu_num = nv.nvmlDeviceGetCount()
    # Iterate over every GPU
    for gpu_idx in range(gpu_num):
        h = nv.nvmlDeviceGetHandleByIndex(gpu_idx)
        dev_name = nv.nvmlDeviceGetName(h).decode()
        raw_total_mem = nv.nvmlDeviceGetMemoryInfo(h).total
        total_mem = parse_unit(raw_total_mem, 1024)
        raw_used_mem = nv.nvmlDeviceGetMemoryInfo(h).used
        used_mem = parse_unit(raw_used_mem, 1024)
        gpu_util = '{:.2f}'.format(nv.nvmlDeviceGetUtilizationRates(h).gpu)
        gpu_mem_util = '{:.2f}'.format(raw_used_mem * 100 / raw_total_mem)
        tmp = {}
        tmp['gpu_idx'] = str(gpu_idx)
        tmp['dev_name'] = dev_name
        tmp['total_mem'] = total_mem
        tmp['used_mem'] = used_mem
        tmp['gpu_util'] = gpu_util
        tmp['gpu_mem_util'] = gpu_mem_util
        sum_info.append(tmp)

        # Compute ('C') processes running on this GPU
        running_process_obj_ls = nv.nvmlDeviceGetComputeRunningProcesses(h)
        for obj in running_process_obj_ls:
            process_pid = obj.pid
            process_type = 'C'
            process_raw_gpu_mem = obj.usedGpuMemory
            process_name = nv.nvmlSystemGetProcessName(process_pid).decode()
            ctan_name = self.get_ctan_name_by_pid(process_pid)
            tmp = {}
            tmp['gpu_idx'] = str(gpu_idx)
            tmp['dev_name'] = dev_name
            tmp['process_pid'] = str(process_pid)
            tmp['process_type'] = process_type
            tmp['process_name'] = process_name
            tmp['process_gpu_mem'] = parse_unit(process_raw_gpu_mem, 1024)
            tmp['ctan_name'] = ctan_name
            process_ls.append(tmp)

        # Graphics ('G') processes running on this GPU
        running_process_obj_ls = nv.nvmlDeviceGetGraphicsRunningProcesses(h)
        for obj in running_process_obj_ls:
            process_pid = obj.pid
            process_type = 'G'
            process_raw_gpu_mem = obj.usedGpuMemory
            process_name = nv.nvmlSystemGetProcessName(process_pid).decode()
            ctan_name = self.get_ctan_name_by_pid(process_pid)
            tmp = {}
            tmp['gpu_idx'] = str(gpu_idx)
            tmp['dev_name'] = dev_name
            tmp['process_pid'] = str(process_pid)
            tmp['process_type'] = process_type
            tmp['process_name'] = process_name
            tmp['process_gpu_mem'] = parse_unit(process_raw_gpu_mem, 1024)
            tmp['ctan_name'] = ctan_name
            process_ls.append(tmp)

    return sum_info, process_ls
def __get_ctan_verbose_stats(self, name):
    # Safely walk a chain of nested keys, returning `default` on any failure.
    def graceful_chain_get(d, *args, default=None):
        t = d
        for a in args:
            try:
                t = t[a]
            except (KeyError, ValueError, TypeError, AttributeError):
                return default
        return t

    # Compute the CPU usage percentage from two consecutive samples.
    def calculate_cpu_percent2(d, previous_cpu_total=None, previous_cpu_system=None):
        cpu_percent = 0.0
        cpu_total = float(d["cpu_stats"]["cpu_usage"]["total_usage"])
        if previous_cpu_total is None:
            previous_cpu_total = cpu_total
        cpu_delta = cpu_total - previous_cpu_total
        cpu_system = float(d["cpu_stats"]["system_cpu_usage"])
        if previous_cpu_system is None:
            previous_cpu_system = cpu_system
        system_delta = cpu_system - previous_cpu_system
        online_cpus = d["cpu_stats"].get(
            "online_cpus", len(d["cpu_stats"]["cpu_usage"]["percpu_usage"]))
        if system_delta > 0.0:
            cpu_percent = (cpu_delta / system_delta) * online_cpus * 100.0
        return cpu_percent, cpu_total, cpu_system

    # Compute block I/O in bytes.
    def calculate_blkio_bytes(d):
        """
        :param d:
        :return: (read_bytes, wrote_bytes), ints
        """
        bytes_stats = graceful_chain_get(d, "blkio_stats", "io_service_bytes_recursive")
        if not bytes_stats:
            return 0, 0
        r = 0
        w = 0
        for s in bytes_stats:
            if s["op"] == "Read":
                r += s["value"]
            elif s["op"] == "Write":
                w += s["value"]
        return r, w

    # Compute network traffic in bytes.
    def calculate_network_bytes(d):
        """
        :param d:
        :return: (received_bytes, transceived_bytes), ints
        """
        networks = graceful_chain_get(d, "networks")
        if not networks:
            return 0, 0
        r = 0
        t = 0
        for if_name, data in networks.items():
            r += data["rx_bytes"]
            t += data["tx_bytes"]
        return r, t

    def calculate_mem_bytes(d):
        mem_limit = d['memory_stats']['limit']
        mem_usage = d['memory_stats']['usage']
        return mem_usage, mem_limit

    def parse_unit(val, scale=1000):
        unit_ls = ['B', 'KB', 'MB', 'GB']
        unit_lv = 0
        while val >= scale:
            val /= scale
            unit_lv += 1
            if unit_lv == len(unit_ls) - 1:
                break
        return '{:.2f} {}'.format(val, unit_ls[unit_lv])

    if name not in self.user_stats_stream:
        # print('add {} into user_stats_stream'.format(name))
        ctan = self.containers.get(name)
        self.user_stats_stream[name] = ctan.stats(decode=True)

    # Pull the next sample from the container's stats stream.
    if self.containers.get(name).status == 'running':
        raw_stats = self.user_stats_stream[name].__next__()
        pre_cpu_stats = self.pre_cpu_stats[name]
    else:
        return None

    # CPU
    cpu_percent, cpu_total, cpu_system = calculate_cpu_percent2(
        raw_stats, pre_cpu_stats[0], pre_cpu_stats[1])
    self.pre_cpu_stats[name] = [cpu_total, cpu_system]  # update previous CPU usage

    # Block I/O
    read_blk, write_blk = calculate_blkio_bytes(raw_stats)
    # Network
    read_net, write_net = calculate_network_bytes(raw_stats)
    # Memory
    mem_usage, mem_limit = calculate_mem_bytes(raw_stats)

    # Per-container GPU memory: sum the usage of this container's processes across all GPUs.
    gpu_all_mem, gpu_used_mem, gpu_used_pcnt = 0, 0, 0
    gpu_num = nv.nvmlDeviceGetCount()
    for gpu_idx in range(gpu_num):
        h = nv.nvmlDeviceGetHandleByIndex(gpu_idx)
        running_process_obj_ls = nv.nvmlDeviceGetComputeRunningProcesses(h)
        for obj in running_process_obj_ls:
            process_pid = obj.pid
            process_raw_gpu_mem = obj.usedGpuMemory
            ctan_name = self.get_ctan_name_by_pid(process_pid)
            if ctan_name == name:
                gpu_used_mem += process_raw_gpu_mem
        gpu_all_mem += nv.nvmlDeviceGetMemoryInfo(h).total

    ret_dt = {
        'id': raw_stats['id'],
        'pid': str(raw_stats['pids_stats']['current']),
        'cpu_percent': '{:.2f}'.format(cpu_percent),
        'read_blk': parse_unit(read_blk),
        'write_blk': parse_unit(write_blk),
        'read_net': parse_unit(read_net),
        'write_net': parse_unit(write_net),
        'mem_usage': parse_unit(mem_usage, scale=1024),
        'mem_limit': parse_unit(mem_limit, scale=1024),
        'mem_usage_pcnt': '{:.2f}'.format(mem_usage / mem_limit * 100),
        'gpu_mem_usage': parse_unit(gpu_used_mem, 1024),
        'gpu_mem_limit': parse_unit(gpu_all_mem, 1024),
        'gpu_mem_usage_pcnt': '{:.2f}'.format(gpu_used_mem / gpu_all_mem * 100)
    }
    return ret_dt
def get_gpu_info(gpu_id=None):
    """
    :return mem_used: used memory in MiB
    :return mem_total: total memory in MiB
    """
    if gpu_id is None:
        gpu_id = int(os.environ["CUDA_VISIBLE_DEVICES"])
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(int(gpu_id))
    mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    mem_used = mem_res.used / (1024 ** 2)
    mem_total = mem_res.total / (1024 ** 2)
    return mem_used, mem_total, gpu_id


def print_gpu_info(gpu_id=None):
    """
    Print GPU info regarding gpu_id on the console.
    :param gpu_id: gpu bus id
    """
    if gpu_id is None:
        gpu_id = int(os.environ["CUDA_VISIBLE_DEVICES"])
    mem_used, mem_total, gpu_id = get_gpu_info(gpu_id=int(gpu_id))
    print("GPU({}): {:.2f}MiB / {:.2f}MiB".format(gpu_id, mem_used, mem_total))


if __name__ == '__main__':
    eager_setup()
    x = tf.random.normal(shape=(100, 1000))
    for id in range(nvidia_smi.nvmlDeviceGetCount()):
        mem_used, mem_total, gpu_id = get_gpu_info(gpu_id=id)
        print("GPU({}): {:.2f}MiB / {:.2f}MiB".format(id, mem_used, mem_total))
def check_gpu():
    try:
        nvidia_smi.nvmlInit()
        return nvidia_smi.nvmlDeviceGetCount() > 0
    except Exception:
        # NVML is unavailable or no NVIDIA driver is installed.
        return False
def __enter__(self):
    if not NvidiaSmi.init:
        nvidia_smi.nvmlInit()
        NvidiaSmi.total_devices = nvidia_smi.nvmlDeviceGetCount()
        NvidiaSmi.init = True
    return self
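# Only __enter__ is shown above; the class below is a hedged, self-contained sketch of
# how such a context manager might look. The __exit__ behavior and the `with` usage are
# assumptions, not the original class.
import pynvml as nvidia_smi

class NvidiaSmi:
    # Class-level state so NVML is initialized at most once per process.
    init = False
    total_devices = 0

    def __enter__(self):
        if not NvidiaSmi.init:
            nvidia_smi.nvmlInit()
            NvidiaSmi.total_devices = nvidia_smi.nvmlDeviceGetCount()
            NvidiaSmi.init = True
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Assumption: keep NVML initialized for reuse; call nvidia_smi.nvmlShutdown()
        # here instead if the process should release NVML on exit.
        return False

with NvidiaSmi():
    print('Visible GPUs:', NvidiaSmi.total_devices)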