def get_gpu_info_by_nvml(self) -> Dict:
    """Get GPU info using nvml"""
    gpu_info_list = []
    driver_version = None
    try:
        nvmlInit()
        driver_version = nvmlSystemGetDriverVersion()
        deviceCount = nvmlDeviceGetCount()
        for i in range(deviceCount):
            handle = nvmlDeviceGetHandleByIndex(i)
            info = nvmlDeviceGetMemoryInfo(handle)
            gpu_info = {}
            gpu_info["memory_total"] = info.total
            gpu_info["memory_available"] = info.free
            gpu_info["name"] = nvmlDeviceGetName(handle)
            gpu_info_list.append(gpu_info)
        nvmlShutdown()
    except NVMLError as error:
        if not self.silent:
            self.logger.error("Error fetching GPU information using nvml: %s", error)
        return None

    result = {"driver_version": driver_version, "devices": gpu_info_list}
    if 'CUDA_VISIBLE_DEVICES' in environ:
        result["cuda_visible"] = environ['CUDA_VISIBLE_DEVICES']
    return result
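# A minimal sketch of how the returned structure might be consumed. `monitor`
# is an illustrative instance of the class defining get_gpu_info_by_nvml, not
# a name from the original source:
info = monitor.get_gpu_info_by_nvml()
if info is not None:
    print("driver:", info["driver_version"])
    for idx, dev in enumerate(info["devices"]):
        print(idx, dev["name"], dev["memory_available"], "/", dev["memory_total"], "bytes free")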
def inference_speed_memory(self, batch_size, seq_length):
    # input_ids = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
    key = jax.random.PRNGKey(0)
    input_ids = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)

    @jax.jit
    def ref_step():
        out = self.model(input_ids=input_ids)
        return out[0]

    if jax.local_devices()[0].platform == 'gpu':
        nvml.nvmlInit()
        ref_step().block_until_ready()
        handle = nvml.nvmlDeviceGetHandleByIndex(0)
        meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
        max_bytes_in_use = meminfo.used
        memory = Memory(max_bytes_in_use)
        # shutdown nvml
        nvml.nvmlShutdown()
    else:
        memory = None

    # warm-up run (triggers compilation) before the timed repeats
    timeit.repeat("ref_step().block_until_ready()", repeat=1, number=2, globals=locals())
    if self.jit:
        runtimes = timeit.repeat("ref_step().block_until_ready()",
                                 repeat=self.repeat, number=3, globals=locals())
    else:
        with jax.disable_jit():
            runtimes = timeit.repeat("ref_step().block_until_ready()",
                                     repeat=self.repeat, number=3, globals=locals())

    return float(np.min(runtimes) / 3.0), memory
def getCUDAEnvironment():
    """ Get the CUDA runtime environment parameters (number of cards etc.). """
    rdict = dict()
    rdict['first_available_device_index'] = None
    rdict['device_count'] = 0

    try:
        nvml.nvmlInit()
        rdict['device_count'] = nvml.nvmlDeviceGetCount()
    except Exception:
        print('WARNING: At least one of (py3nvml.nvml, CUDA) is not available. Will continue without GPU.')
        return rdict

    for i in range(rdict['device_count']):
        memory_info = nvml.nvmlDeviceGetMemoryInfo(nvml.nvmlDeviceGetHandleByIndex(i))
        memory_usage_percentage = memory_info.used / memory_info.total
        if memory_usage_percentage <= 0.1:
            rdict['first_available_device_index'] = i
            break

    nvml.nvmlShutdown()
    return rdict
def gpu_profile(frame, event, arg):
    # it is _about to_ execute (!)
    global last_tensor_sizes
    global lineno, func_name, filename, module_name

    if event == 'line':
        try:
            # about _previous_ line (!)
            if lineno is not None:
                py3nvml.nvmlInit()
                handle = py3nvml.nvmlDeviceGetHandleByIndex(int(os.environ['GPU_DEBUG']))
                meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
                line = linecache.getline(filename, lineno)
                where_str = module_name + ' ' + func_name + ':' + str(lineno)

                with open(gpu_profile_fn, 'a+') as f:
                    f.write(f"{where_str:<50}"
                            f":{meminfo.used/1024**2:<7.1f}Mb "
                            f"{line.rstrip()}\n")

                    if print_tensor_sizes is True:
                        for tensor in get_tensors():
                            if not hasattr(tensor, 'dbg_alloc_where'):
                                tensor.dbg_alloc_where = where_str
                        new_tensor_sizes = {(type(x), tuple(x.size()), x.dbg_alloc_where)
                                            for x in get_tensors()}
                        for t, s, loc in new_tensor_sizes - last_tensor_sizes:
                            f.write(f'+ {loc:<50} {str(s):<20} {str(t):<10}\n')
                        for t, s, loc in last_tensor_sizes - new_tensor_sizes:
                            f.write(f'- {loc:<50} {str(s):<20} {str(t):<10}\n')
                        last_tensor_sizes = new_tensor_sizes
                py3nvml.nvmlShutdown()

            # save details about line _to be_ executed
            lineno = None
            func_name = frame.f_code.co_name
            filename = frame.f_globals["__file__"]
            if (filename.endswith(".pyc") or filename.endswith(".pyo")):
                filename = filename[:-1]
            module_name = frame.f_globals["__name__"]
            lineno = frame.f_lineno

            if 'gmwda-pytorch' not in os.path.dirname(os.path.abspath(filename)):
                lineno = None  # skip current line evaluation
            if ('car_datasets' in filename
                    or '_exec_config' in func_name
                    or 'gpu_profile' in module_name
                    or 'tee_stdout' in module_name):
                lineno = None  # skip current

            return gpu_profile

        except (KeyError, AttributeError) as e:
            print(e)

    return gpu_profile
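# A minimal sketch of how a tracer like this is typically installed with
# sys.settrace (assumes the module-level globals used above -- gpu_profile_fn,
# print_tensor_sizes, get_tensors -- are defined alongside it; GPU_DEBUG picks
# the device to watch):
import sys
os.environ['GPU_DEBUG'] = '0'
sys.settrace(gpu_profile)   # trace every line executed from here on
# ... run the code being profiled ...
sys.settrace(None)          # stop tracing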
def measure_gpu_usage(self):
    from py3nvml.py3nvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, \
        nvmlDeviceGetMemoryInfo, nvmlDeviceGetName, nvmlShutdown, NVMLError

    max_gpu_usage = []
    gpu_name = []
    try:
        nvmlInit()
        deviceCount = nvmlDeviceGetCount()
        max_gpu_usage = [0 for i in range(deviceCount)]
        gpu_name = [
            nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i))
            for i in range(deviceCount)
        ]
        while True:
            for i in range(deviceCount):
                info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
                max_gpu_usage[i] = max(max_gpu_usage[i], info.used / 1024**2)
            sleep(0.005)  # 5ms
            if not self.keep_measuring:
                break
        nvmlShutdown()
        return [{
            "device_id": i,
            "name": gpu_name[i],
            "max_used_MB": max_gpu_usage[i]
        } for i in range(deviceCount)]
    except NVMLError as error:
        if not self.silent:
            self.logger.error("Error fetching GPU information using nvml: %s", error)
        return None
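# A minimal sketch of the intended polling pattern: the loop above runs until
# keep_measuring is cleared, so it is meant to run on a background thread while
# the measured workload runs on the main thread. `monitor` and run_workload()
# are illustrative names, not from the original source:
from concurrent.futures import ThreadPoolExecutor

monitor.keep_measuring = True
with ThreadPoolExecutor() as executor:
    future = executor.submit(monitor.measure_gpu_usage)  # polls in background
    run_workload()                                       # code being measured
    monitor.keep_measuring = False                       # ends the polling loop
    peak_per_device = future.result()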
def environment_info(self):
    if self._environment_info is None:
        info = {}
        info["transformers_version"] = version
        info["framework"] = self.framework
        if self.framework == "PyTorch":
            info["use_torchscript"] = self.args.torchscript
        if self.framework == "TensorFlow":
            info["eager_mode"] = self.args.eager_mode
            info["use_xla"] = self.args.use_xla
        info["framework_version"] = self.framework_version
        info["python_version"] = platform.python_version()
        info["system"] = platform.system()
        info["cpu"] = platform.processor()
        info["architecture"] = platform.architecture()[0]
        info["date"] = datetime.date(datetime.now())
        info["time"] = datetime.time(datetime.now())
        info["fp16"] = self.args.fp16
        info["use_multiprocessing"] = self.args.do_multi_processing
        info["only_pretrain_model"] = self.args.only_pretrain_model

        if is_psutil_available():
            info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total)
        else:
            logger.warning(
                "Psutil not installed, we won't log available CPU memory. "
                "Install psutil (pip install psutil) to log available CPU memory."
            )
            info["cpu_ram_mb"] = "N/A"

        info["use_gpu"] = self.args.is_gpu
        if self.args.is_gpu:
            info["num_gpus"] = 1  # TODO(PVP) Currently only single GPU is supported
            if is_py3nvml_available():
                nvml.nvmlInit()
                handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
                info["gpu"] = nvml.nvmlDeviceGetName(handle)
                info["gpu_ram_mb"] = bytes_to_mega_bytes(nvml.nvmlDeviceGetMemoryInfo(handle).total)
                info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
                info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
                nvml.nvmlShutdown()
            else:
                logger.warning(
                    "py3nvml not installed, we won't log GPU memory usage. "
                    "Install py3nvml (pip install py3nvml) to log information about GPU."
                )
                info["gpu"] = "N/A"
                info["gpu_ram_mb"] = "N/A"
                info["gpu_power_watts"] = "N/A"
                info["gpu_performance_state"] = "N/A"

        info["use_tpu"] = self.args.is_tpu
        # TODO(PVP): See if we can add more information about TPU
        # see: https://github.com/pytorch/xla/issues/2180
        self._environment_info = info
    return self._environment_info
def exit(self):
    """Overwrite the exit method to close the GPU API."""
    if self.nvml_ready:
        try:
            pynvml.nvmlShutdown()
        except Exception as e:
            logger.debug("pynvml failed to shutdown correctly ({})".format(e))

    # Call the parent exit method
    super(Plugin, self).exit()
# Note: a sys.settrace callback receives (frame, event, arg); the third
# parameter was missing and is restored here.
def gpu_profile(frame, event, arg):
    global last_meminfo_used, last_tensor_sizes
    global lineno, func_name, filename, module_name

    if event == 'line':
        try:
            if lineno:
                py3nvml.nvmlInit()
                handle = py3nvml.nvmlDeviceGetHandleByIndex(int(os.environ["GPU_DEBUG"]))
                meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
                line = linecache.getline(filename, lineno)
                where_str = module_name + ' ' + func_name + ' ' + str(lineno)
                new_meminfo_used = meminfo.used
                mem_display = new_meminfo_used - last_meminfo_used if use_incremental else new_meminfo_used
                with open(gpu_profile_fn, "a+") as f:
                    f.write(f"{where_str:<50}"
                            f":{(mem_display) / 1024 ** 2:<7.1f}Mb "
                            f"{line.rstrip()}\n")
                    last_meminfo_used = new_meminfo_used
                    if print_tensor_sizes:
                        for tensor in get_tensors():
                            if not hasattr(tensor, 'dbg_alloc_where'):
                                tensor.dbg_alloc_where = where_str
                        new_tensor_sizes = {(type(x), tuple(x.size()), x.dbg_alloc_where)
                                            for x in get_tensors()}
                        for t, s, loc in new_tensor_sizes - last_tensor_sizes:
                            f.write(f'+ {loc:<50} {str(s):<20} {str(t):<10}\n')
                        for t, s, loc in last_tensor_sizes - new_tensor_sizes:
                            f.write(f'- {loc:<50} {str(s):<20} {str(t):<10}\n')
                        last_tensor_sizes = new_tensor_sizes
                py3nvml.nvmlShutdown()

            lineno = None
            func_name = frame.f_code.co_name
            filename = frame.f_globals["__file__"]
            module_name = frame.f_globals["__name__"]
            lineno = frame.f_lineno

            if 'Beta' not in os.path.dirname(os.path.abspath(filename)):
                lineno = None

            return gpu_profile
        except (KeyError, AttributeError):
            pass

    return gpu_profile
def stop(self):
    if self.gpu:
        nvmlShutdown()
    self.get_total_power()
    print("Total Consumed: %0.2f Wh" % self.sum)
    print("Your Badge is Ready! See "
          "https://img.shields.io/badge/Power%20Consumption-{:.2f}%20Wh-green".format(self.sum))
    return self.sum
@contextmanager  # from contextlib; restored here, since the docstring and bare yield imply a generator-based context manager
def nvml_manager():
    """Context manager to initialise and shut down NVML."""
    global pynvml
    if pynvml:
        try:
            pynvml.nvmlInit()
        except pynvml.NVMLError as error:
            print('Warning:', error, file=sys.stderr)
            pynvml = None
    yield pynvml
    if pynvml:
        pynvml.nvmlShutdown()
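# A minimal usage sketch, assuming `pynvml` was imported (or set to None) at
# module scope as the global above expects:
with nvml_manager() as nv:
    if nv:
        print(nv.nvmlDeviceGetCount(), "GPU(s) visible")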
def get_gpu_info() -> Tuple[Optional[str], Optional[List[GpuInfo]]]:
    """
    Get driver version and list of ``GpuInfo``, if available.
    """
    try:
        nvml.nvmlInit()
    except nvml.NVMLError:
        # Not available.
        return None, None

    driver_version: str = nvml.nvmlSystemGetDriverVersion()
    gpus: List[GpuInfo] = []

    device_count: int = nvml.nvmlDeviceGetCount()
    for i in range(device_count):
        handle = nvml.nvmlDeviceGetHandleByIndex(i)
        name = try_get_info(nvml.nvmlDeviceGetName, handle)
        fan_speed = try_get_info(nvml.nvmlDeviceGetFanSpeed, handle, default=0)
        temp = try_get_info(
            lambda h: nvml.nvmlDeviceGetTemperature(h, nvml.NVML_TEMPERATURE_GPU),
            handle,
            default=0,
        )
        mem_info = try_get_info(nvml.nvmlDeviceGetMemoryInfo, handle)
        if mem_info:
            mem_used = mem_info.used >> 20   # bytes -> MiB
            mem_total = mem_info.total >> 20
        else:
            mem_used = 0
            mem_total = 0
        util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle)
        gpu_util = util.gpu if util else 0
        gpus.append(
            GpuInfo(
                id=i,
                name=name,
                mem_usage=mem_used,
                mem_capacity=mem_total,
                utilization=gpu_util,
                temp=temp,
                fan=fan_speed,
            ))

    nvml.nvmlShutdown()
    return driver_version, gpus
def environment_info(self):
    if self._environment_info is None:
        info = {}
        info["gluonnlp_version"] = gluonnlp.__version__
        info["framework_version"] = mxnet.__version__
        info["python_version"] = platform.python_version()
        info["system"] = platform.system()
        info["cpu"] = platform.processor()
        info["architecture"] = platform.architecture()[0]
        info["date"] = datetime.date(datetime.now())
        info["time"] = datetime.time(datetime.now())
        info["fp16"] = self._use_fp16

        if is_psutil_available():
            info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total)
        else:
            logger.warning(
                "Psutil not installed, we won't log available CPU memory. "
                "Install psutil (pip install psutil) to log available CPU memory."
            )
            info["cpu_ram_mb"] = "N/A"

        info["use_gpu"] = self._use_gpu
        if self._use_gpu:
            info["num_gpus"] = 1
            if is_py3nvml_available():
                nvml.nvmlInit()
                handle = nvml.nvmlDeviceGetHandleByIndex(self._device_idx)
                info["gpu"] = nvml.nvmlDeviceGetName(handle)
                info["gpu_ram_mb"] = bytes_to_mega_bytes(nvml.nvmlDeviceGetMemoryInfo(handle).total)
                info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
                info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
                nvml.nvmlShutdown()
            else:
                logger.warning(
                    "py3nvml not installed, we won't log GPU memory usage. "
                    "Install py3nvml (pip install py3nvml) to log information about GPU."
                )
                info["gpu"] = "N/A"
                info["gpu_ram_mb"] = "N/A"
                info["gpu_power_watts"] = "N/A"
                info["gpu_performance_state"] = "N/A"

        self._environment_info = info
    return self._environment_info
def measure_gpu_usage(self) -> Optional[List[Dict[str, Any]]]:
    from py3nvml.py3nvml import (
        NVMLError,
        nvmlDeviceGetCount,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlDeviceGetName,
        nvmlInit,
        nvmlShutdown,
    )

    max_gpu_usage = []
    gpu_name = []
    try:
        nvmlInit()
        device_count = nvmlDeviceGetCount()
        if not isinstance(device_count, int):
            logger.error(f"nvmlDeviceGetCount result is not integer: {device_count}")
            return None

        max_gpu_usage = [0 for i in range(device_count)]
        gpu_name = [
            nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i))
            for i in range(device_count)
        ]
        while True:
            for i in range(device_count):
                info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
                if isinstance(info, str):
                    logger.error(f"nvmlDeviceGetMemoryInfo returns str: {info}")
                    return None
                max_gpu_usage[i] = max(max_gpu_usage[i], info.used / 1024**2)
            sleep(0.005)  # 5ms
            if not self.keep_measuring:
                break

        nvmlShutdown()
        return [{
            "device_id": i,
            "name": gpu_name[i],
            "max_used_MB": max_gpu_usage[i],
        } for i in range(device_count)]
    except NVMLError as error:
        logger.error("Error fetching GPU information using nvml: %s", error)
        return None
def train_speed_memory(self, batch_size, seq_length):
    key = jax.random.PRNGKey(0)
    input_ids = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
    targets = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
    labels = jax.random.randint(key, (batch_size, seq_length), 0, 2)
    # input_ids = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
    # targets = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
    # labels = np.random.randint(0,2, (batch_size, seq_length))

    @jax.jit
    def train_step():
        def loss_fn(params):
            token_mask = jnp.where(labels > 0, 1.0, 0.0).astype(self.dtype)
            logits = self.model(input_ids=input_ids, train=True, params=params,
                                dropout_rng=jax.random.PRNGKey(0))[0]
            loss, normalizing_factor = cross_entropy(logits, targets, token_mask)
            jax.profiler.save_device_memory_profile(
                f"memory/{workload[0]}_{workload[1]}_memory.prof", "gpu")
            return loss / normalizing_factor

        if self.fp16 and jax.local_devices()[0].platform == 'gpu':
            grad_fn = self.dynamic_scale.value_and_grad(loss_fn)
            dyn_scale, is_fin, loss, grad = grad_fn(self.model.params)
        else:
            grad_fn = jax.value_and_grad(loss_fn)
            loss, grad = grad_fn(self.model.params)
        return tree_flatten(grad)[0]

    if jax.local_devices()[0].platform == 'gpu':
        nvml.nvmlInit()
        train_step()
        handle = nvml.nvmlDeviceGetHandleByIndex(0)
        meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
        max_bytes_in_use = meminfo.used
        memory = Memory(max_bytes_in_use)
        # shutdown nvml
        nvml.nvmlShutdown()
    else:
        memory = None

    # timeit.repeat(train_step,repeat=1,number=2)
    timeit.repeat("for i in train_step():i.block_until_ready()",
                  repeat=1, number=2, globals=locals())
    if self.jit:
        # runtimes = timeit.repeat(train_step,repeat=self.repeat,number=3)
        runtimes = timeit.repeat("for i in train_step():i.block_until_ready()",
                                 repeat=self.repeat, number=3, globals=locals())
    else:
        with jax.disable_jit():
            # runtimes = timeit.repeat(train_step, repeat=self.repeat, number=3)
            runtimes = timeit.repeat("for i in train_step():i.block_until_ready()",
                                     repeat=self.repeat, number=3, globals=locals())

    return float(np.min(runtimes) / 3.0), memory
def test_nvidia():
    # pip install py3nvml
    import py3nvml
    from py3nvml import py3nvml as nvml

    inspect(py3nvml.get_free_gpus())
    nvml.nvmlInit()
    inspect(version=nvml.nvmlSystemGetDriverVersion())
    inspect(count=nvml.nvmlDeviceGetCount())
    for i in range(nvml.nvmlDeviceGetCount()):
        test_nvidia_device(i)
    nvml.nvmlShutdown()
def run_gpu_mem_counter(do_shutdown=False):
    # Sum used memory for all GPUs
    if not torch.cuda.is_available():
        return 0
    if do_shutdown:
        py3nvml.nvmlInit()
    devices = list(range(py3nvml.nvmlDeviceGetCount()))  # if gpus_to_trace is None else gpus_to_trace
    gpu_mem = 0
    for i in devices:
        handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
        meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
        gpu_mem += meminfo.used
    if do_shutdown:
        py3nvml.nvmlShutdown()
    return gpu_mem
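# A minimal sketch of measuring the memory delta of one operation with this
# counter. With do_shutdown=True each call inits and shuts down NVML itself;
# with do_shutdown=False it assumes nvmlInit() was called elsewhere. The
# workload line is illustrative:
before = run_gpu_mem_counter(do_shutdown=True)
x = torch.randn(1024, 1024, device="cuda")  # illustrative workload
after = run_gpu_mem_counter(do_shutdown=True)
print(f"delta: {(after - before) / 1024**2:.1f} MB")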
def get_free_gpus(max_procs=0):
    """
    Checks the number of processes running on your GPUs.

    Parameters
    ----------
    max_procs : int
        Maximum number of procs allowed to run on a gpu for it to be considered
        'available'

    Returns
    -------
    availabilities : list(bool)
        List of length N for an N-gpu system. The nth value will be true, if
        the nth gpu had at most max_procs processes running on it. Set to 0 to
        look for gpus with no procs on it.

    Note
    ----
    If function can't query the driver will return an empty list rather than
    raise an Exception.
    """
    # Try connect with NVIDIA drivers
    logger = logging.getLogger(__name__)
    try:
        py3nvml.nvmlInit()
    except:
        str_ = """Couldn't connect to nvml drivers. Check they are installed correctly."""
        warnings.warn(str_, RuntimeWarning)
        logger.warning(str_)
        return []

    num_gpus = py3nvml.nvmlDeviceGetCount()
    gpu_free = [False] * num_gpus
    for i in range(num_gpus):
        try:
            h = py3nvml.nvmlDeviceGetHandleByIndex(i)
        except:
            continue
        procs = try_get_info(py3nvml.nvmlDeviceGetComputeRunningProcesses, h,
                             ['something'])
        if len(procs) <= max_procs:
            gpu_free[i] = True

    py3nvml.nvmlShutdown()
    return gpu_free
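# A minimal usage sketch: pin the process to the first idle GPU, if any
# (illustrative only; CUDA_VISIBLE_DEVICES is read by CUDA at context creation):
import os

free = get_free_gpus(max_procs=0)
if any(free):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(free.index(True))
else:
    print("no idle GPU found")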
def _measure_memory(self, func: Callable[[], None]) -> [Memory, MemorySummary]:
    try:
        if self.args.trace_memory_line_by_line:
            trace = start_memory_tracing("transformers")

        if self.args.is_tpu:
            # tpu
            raise NotImplementedError(
                "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking "
                "with `--no-memory` or `args.memory=False`"
            )
        elif self.args.is_gpu:
            if not is_py3nvml_available():
                logger.warning(
                    "py3nvml not installed, we won't log GPU memory usage. "
                    "Install py3nvml (pip install py3nvml) to log information about GPU."
                )
                memory = "N/A"
            else:
                logger.info(
                    "Measuring total GPU usage on GPU device. Make sure to not have additional processes running "
                    "on the same GPU."
                )
                # init nvml
                nvml.nvmlInit()
                func()
                handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
                meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
                max_bytes_in_use = meminfo.used
                memory = Memory(max_bytes_in_use)
                # shutdown nvml
                nvml.nvmlShutdown()
        else:
            # cpu
            memory_bytes = measure_peak_memory_cpu(func)
            memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes

        if self.args.trace_memory_line_by_line:
            summary = stop_memory_tracing(trace)
        else:
            summary = None

        return memory, summary
    except RuntimeError as e:
        self.print_fn(f"Doesn't fit on GPU. {e}")
        return "N/A", None
def set_affinity(rank, log_values=False):
    """
    Sets the CPU affinity for the current process to be local to its respective GPU.

    NOTE: `rank` is considered to be equivalent to the GPU index.

    Certain systems have complex hardware topologies (such as our systems in
    Cosmos and AVDC), and as such, there are certain CPU cores that have faster
    interconnects with certain GPU cards. This function will force linux to
    only schedule the current process to run on CPU cores that are fast for the
    associated GPU.

    Args:
        rank (int): The rank of the current process (and GPU index).
        log_values (bool): Optionally log the before and after values.
    """
    try:
        num_cpus = os.cpu_count()
        nvmlInit()
        rank_cpus = []
        # nvmlSystemGetTopologyGpuSet prints the number of GPUs each time it's
        # called, so this will suppress those prints
        with IgnorePrint():
            for i in range(num_cpus):
                for d in nvmlSystemGetTopologyGpuSet(i):
                    d_index = nvmlDeviceGetIndex(d)
                    if d_index == rank:
                        rank_cpus.append(i)
                        break
        process = psutil.Process()
        old_affinity = _dense_list_to_spans(process.cpu_affinity())
        process.cpu_affinity(rank_cpus)
        if log_values:
            new_affinity = _dense_list_to_spans(process.cpu_affinity())
            logger.info('Old CPU affinity: {}'.format(old_affinity))
            logger.info('New CPU affinity: {}'.format(new_affinity))
        nvmlShutdown()
    except Exception as e:
        logger.warning("Failed to set the process affinity due to error: {}".format(e))
def memory_status(msg="", reset_max=True, sync=True):
    rank = smp.rank()
    tp_rank = smp.tp_rank()
    pp_rank = smp.pp_rank()
    rdp_rank = smp.rdp_rank()
    local_rank = smp.local_rank()

    if sync:
        torch.cuda.synchronize()
    if rdp_rank != 0:
        return

    if py3nvml is not None:
        py3nvml.nvmlInit()
        handle = py3nvml.nvmlDeviceGetHandleByIndex(local_rank)
        info = py3nvml.nvmlDeviceGetMemoryInfo(handle)
        total_used = info.used / 1024**3
        total_used_str = f"Totally used GPU memory: {total_used}"
    else:
        total_used_str = ""

    alloced = torch.cuda.memory_allocated(device=local_rank)
    max_alloced = torch.cuda.max_memory_allocated(device=local_rank)
    cached = torch.cuda.memory_reserved(device=local_rank)
    max_cached = torch.cuda.max_memory_reserved(device=local_rank)

    # convert to GB for printing
    alloced /= 1024**3
    cached /= 1024**3
    max_alloced /= 1024**3
    max_cached /= 1024**3

    print(
        f'[{msg}] rank {rank} tp_rank {tp_rank} pp_rank {pp_rank} TORCH {torch.__version__}',
        f'device={local_rank} '
        f'alloc {alloced:0.4f} max_alloced {max_alloced:0.4f} '
        f'cache {cached:0.4f} max_cached {max_cached:0.4f} '
        f'{total_used_str}')

    if reset_max:
        torch.cuda.reset_max_memory_cached()
        torch.cuda.reset_max_memory_allocated()
    if py3nvml is not None:
        py3nvml.nvmlShutdown()
def _trace_lines(frame, event, arg):
    if event != 'line':
        return
    if EMPTY_CACHE:
        torch.cuda.empty_cache()
    co = frame.f_code
    func_name = co.co_name
    line_no = frame.f_lineno
    filename = co.co_filename
    py3nvml.nvmlInit()
    mem_used = _get_gpu_mem_used()
    where_str = f"{func_name} in {filename}:{line_no}"
    with open(gpu_profile_fn, 'a+') as f:
        f.write(f"{where_str} --> {mem_used:<7.1f}Mb\n")
        if PRINT_TENSOR_SIZES:
            _print_tensors(f, where_str)
    py3nvml.nvmlShutdown()
def __get_gpu_temps():
    if utils.which('nvidia-smi') is not None:
        try:
            nvml.nvmlInit()
        except nvml.NVMLError:
            pass
        else:
            device_count = nvml.nvmlDeviceGetCount()
            print('\nGPU:')
            if device_count > 0:
                for i in range(device_count):
                    handle = nvml.nvmlDeviceGetHandleByIndex(i)
                    # sensor 0 is NVML_TEMPERATURE_GPU
                    gpu_temp = nvml.nvmlDeviceGetTemperature(handle, 0)
                    print(' GPU %(i)s: ${alignr}${color %(color)s}%(temp)s${color}°C' % {
                        'i': i,
                        'color': get_gpu_temps_color(gpu_temp),
                        'temp': gpu_temp
                    })
            nvml.nvmlShutdown()
def list_processes_on_nvidia():
    nvml.nvmlInit()
    gpu_handle = nvml.nvmlDeviceGetHandleByIndex(0)
    proc_list = nvml.nvmlDeviceGetGraphicsRunningProcesses(gpu_handle)
    result = []
    for p_nvml in proc_list:
        p_psutil = psutil.Process(p_nvml.pid)
        cmdline = p_psutil.cmdline()
        result.append({
            "pid": p_nvml.pid,
            "cmdline": cmdline[0] if len(cmdline) > 0 else ""
        })
    nvml.nvmlShutdown()
    return result
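# A minimal usage sketch (GPU 0 only, as hard-coded above; lists graphics,
# not compute, processes):
for proc in list_processes_on_nvidia():
    print(proc["pid"], proc["cmdline"])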
def get_num_procs():
    """
    Gets the number of processes running on each gpu

    Returns
    -------
    num_procs : list(int)
        Number of processes running on each gpu

    Note
    ----
    If function can't query the driver will return an empty list rather than
    raise an Exception.

    Note
    ----
    If function can't get the info from the gpu will return -1 in that gpu's
    place
    """
    # Try connect with NVIDIA drivers
    logger = logging.getLogger(__name__)
    try:
        py3nvml.nvmlInit()
    except:
        str_ = """Couldn't connect to nvml drivers. Check they are installed correctly."""
        warnings.warn(str_, RuntimeWarning)
        logger.warning(str_)
        return []

    num_gpus = py3nvml.nvmlDeviceGetCount()
    gpu_procs = [-1] * num_gpus
    for i in range(num_gpus):
        try:
            h = py3nvml.nvmlDeviceGetHandleByIndex(i)
        except:
            continue
        procs = try_get_info(py3nvml.nvmlDeviceGetComputeRunningProcesses, h,
                             ['something'])
        gpu_procs[i] = len(procs)

    py3nvml.nvmlShutdown()
    return gpu_procs
def get_device_procs(self, device_id: int) -> Optional[List[ProcInfo]]:
    """
    List processes running on the GPU.

    Parameters
    ----------
    device_id : int
        Device identifier

    Returns
    -------
    Optional[List[ProcInfo]]
        List of ProcInfo named tuples (name, pid, mem fields)

    Raises
    ------
    RuntimeError
        In case of py3nvml failure.
    """
    py3nvml.nvmlInit()
    dev_count = py3nvml.nvmlDeviceGetCount()  # type: int
    if not (0 <= device_id < dev_count):
        raise RuntimeError('Failed to query GPU with nvml')
    handle = py3nvml.nvmlDeviceGetHandleByIndex(device_id)
    result = []
    try:
        for proc in py3nvml.nvmlDeviceGetComputeRunningProcesses(handle):
            try:
                name = str(py3nvml.nvmlSystemGetProcessName(proc.pid))
            except py3nvml.NVMLError as err:
                if (err.value == py3nvml.NVML_ERROR_NOT_FOUND):
                    # exited?
                    continue
                raise
            mem = proc.usedGpuMemory / 1024 / 1024
            result.append(ProcInfo(name, proc.pid, mem))
    finally:
        py3nvml.nvmlShutdown()
    return result
def get_gpu_info() -> Optional[List[Dict[str, Any]]]:
    from py3nvml.py3nvml import (
        NVMLError,
        nvmlDeviceGetCount,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlDeviceGetName,
        nvmlInit,
        nvmlShutdown,
    )

    try:
        nvmlInit()
        result = []
        device_count = nvmlDeviceGetCount()
        if not isinstance(device_count, int):
            return None
        for i in range(device_count):
            info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
            if isinstance(info, str):
                return None
            result.append({
                "id": i,
                "name": nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i)),
                "total": info.total,
                "free": info.free,
                "used": info.used,
            })
        nvmlShutdown()
        return result
    except NVMLError as error:
        # print does not do %-substitution; format the message explicitly
        print("Error fetching GPU information using nvml: %s" % error)
        return None
def _get_gpu_info():
    pynvml.nvmlInit()
    deviceCount = pynvml.nvmlDeviceGetCount()
    all_info = []
    for i in range(0, deviceCount):
        gpu = pynvml.nvmlDeviceGetHandleByIndex(i)

        dimensions = {}
        dimensions.update(Nvidia._get_driver_version())
        dimensions.update(Nvidia._get_device_uuid(gpu))
        dimensions.update(Nvidia._get_info_rom_image_version(gpu))
        dimensions.update(Nvidia._get_device_power_state(gpu))
        dimensions.update(Nvidia._get_device_vbios_version(gpu))

        measurements = {}
        measurements.update(Nvidia._get_fan_speed_percent(gpu))
        measurements.update(Nvidia._get_framebuffer_memory_stats(gpu))
        measurements.update(Nvidia._get_bar1_memory_stats(gpu))
        measurements.update(Nvidia._get_utilisation_stats(gpu))
        measurements.update(Nvidia._get_device_temperature(gpu))
        measurements.update(Nvidia._get_device_shutdown_temp(gpu))
        measurements.update(Nvidia._get_device_slowdown_temp(gpu))
        measurements.update(Nvidia._get_power_usage_watts(gpu))
        measurements.update(Nvidia._get_power_limit_watts(gpu))
        measurements.update(Nvidia._get_clock_info(gpu))
        measurements.update(Nvidia._get_clock_max_info(gpu))

        gpu_name = "{}_{}".format(
            Nvidia._get_device_name(gpu).get('name'),
            Nvidia._get_device_serial(gpu).get('serial'))
        gpu_info = {
            'name': gpu_name,
            'dimensions': dimensions,
            'measurements': measurements
        }
        all_info.append(gpu_info)
    pynvml.nvmlShutdown()
    return all_info
def _measure_memory(self, func: Callable[[], None]) -> [Memory, MemorySummary]:
    logger.info(
        "Note that TensorFlow allocates more memory than "
        "it might need to speed up computation. "
        "The memory reported here corresponds to the memory "
        "reported by `nvidia-smi`, which can vary depending "
        "on total available memory on the GPU that is used."
    )
    with self.args.strategy.scope():
        try:
            if self.args.trace_memory_line_by_line:
                assert self.args.eager_mode, (
                    "`args.eager_mode` is set to `False`. Make sure to run model in eager mode to measure memory "
                    "consumption line by line."
                )
                trace = start_memory_tracing("transformers")

            if self.args.is_tpu:
                # tpu
                raise NotImplementedError(
                    "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking "
                    "with `args.no_memory=True`"
                )
            elif self.args.is_gpu:
                # gpu
                if not is_py3nvml_available():
                    logger.warning(
                        "py3nvml not installed, we won't log GPU memory usage. "
                        "Install py3nvml (pip install py3nvml) to log information about GPU."
                    )
                    memory = "N/A"
                else:
                    logger.info(
                        "Measuring total GPU usage on GPU device. Make sure to not have additional processes "
                        "running on the same GPU."
                    )
                    # init nvml
                    nvml.nvmlInit()
                    func()
                    handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
                    meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
                    max_bytes_in_use = meminfo.used
                    memory = Memory(max_bytes_in_use)
                    # shutdown nvml
                    nvml.nvmlShutdown()
            else:
                # cpu
                if self.args.trace_memory_line_by_line:
                    logger.info(
                        "When enabling line by line tracing, the max peak memory for CPU is inaccurate in TensorFlow."
                    )
                    memory = None
                else:
                    memory_bytes = measure_peak_memory_cpu(func)
                    memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes

            if self.args.trace_memory_line_by_line:
                summary = stop_memory_tracing(trace)
                if memory is None:
                    memory = summary.total
            else:
                summary = None

            return memory, summary
        except ResourceExhaustedError as e:
            self.print_fn("Doesn't fit on GPU. {}".format(e))
            return "N/A", None
def grab_gpus(num_gpus=1, gpu_select=None, gpu_fraction=1.0):
    """
    Checks for gpu availability and sets CUDA_VISIBLE_DEVICES as such.

    Note that this function does not do anything to 'reserve' gpus, it only
    limits what GPUS your program can see by altering the CUDA_VISIBLE_DEVICES
    variable. Other programs can still come along and snatch your gpu. This
    function is more about preventing **you** from stealing someone else's GPU.

    If more than 1 GPU is requested but the full amount are available, then it
    will set the CUDA_VISIBLE_DEVICES variable to see all the available GPUs.
    A warning is generated in this case.

    If one or more GPUs were requested and none were available, a Warning will
    be raised. Before raising it, the CUDA_VISIBLE_DEVICES will be set to a
    blank string. This means the calling function can ignore this warning and
    proceed if it chooses to only use the CPU, and it should still be protected
    against putting processes on a busy GPU.

    You can call this function with num_gpus=0 to blank out the
    CUDA_VISIBLE_DEVICES environment variable.

    Parameters
    ----------
    num_gpus : int
        How many gpus your job needs (optional)
    gpu_select : iterable
        A single int or an iterable of ints indicating gpu numbers to
        search through. If left blank, will search through all gpus.
    gpu_fraction : float
        The fractional of a gpu memory that must be free for the script to see
        the gpu as free. Defaults to 1. Useful if someone has grabbed a tiny
        amount of memory on a gpu but isn't using it.

    Returns
    -------
    success : int
        Number of gpus 'grabbed'

    Raises
    ------
    RuntimeWarning
        If couldn't connect with NVIDIA drivers.
        If 1 or more gpus were requested and none were available.
    ValueError
        If the gpu_select option was not understood (can fix by leaving this
        field blank, providing an int or an iterable of ints).
    """
    # Set the visible devices to blank.
    os.environ['CUDA_VISIBLE_DEVICES'] = ""

    if num_gpus == 0:
        return 0

    # Try connect with NVIDIA drivers
    logger = logging.getLogger(__name__)
    try:
        py3nvml.nvmlInit()
    except:
        str_ = """Couldn't connect to nvml drivers. Check they are installed correctly.
                  Proceeding on cpu only..."""
        warnings.warn(str_, RuntimeWarning)
        logger.warning(str_)
        return 0

    numDevices = py3nvml.nvmlDeviceGetCount()
    gpu_free = [False] * numDevices

    # Flag which gpus we can check
    if gpu_select is None:
        gpu_check = [True] * 8
    else:
        gpu_check = [False] * 8
        try:
            gpu_check[gpu_select] = True
        except TypeError:
            try:
                for i in gpu_select:
                    gpu_check[i] = True
            except:
                raise ValueError(
                    '''Please provide an int or an iterable of ints for gpu_select''')

    # Print out GPU device info. Useful for debugging.
    for i in range(numDevices):
        # If the gpu was specified, examine it
        if not gpu_check[i]:
            continue
        handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
        info = py3nvml.nvmlDeviceGetMemoryInfo(handle)
        str_ = "GPU {}:\t".format(i) + \
               "Used Mem: {:>6}MB\t".format(info.used / (1024 * 1024)) + \
               "Total Mem: {:>6}MB".format(info.total / (1024 * 1024))
        logger.debug(str_)

    # Now check if any devices are suitable
    for i in range(numDevices):
        # If the gpu was specified, examine it
        if not gpu_check[i]:
            continue
        handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
        info = py3nvml.nvmlDeviceGetMemoryInfo(handle)
        # Sometimes GPU has a few MB used when it is actually free
        if (info.free + 10) / info.total >= gpu_fraction:
            gpu_free[i] = True
        else:
            logger.info('GPU {} has processes on it. Skipping.'.format(i))

    py3nvml.nvmlShutdown()

    # Now check whether we can create the session
    if sum(gpu_free) == 0:
        s = "Could not find enough GPUs for your job"
        warnings.warn(s, RuntimeWarning)
        logger.warning(s)
        return 0
    else:
        if sum(gpu_free) >= num_gpus:
            # only use the first num_gpus gpus. Hide the rest from greedy
            # tensorflow
            available_gpus = [i for i, x in enumerate(gpu_free) if x]
            use_gpus = ','.join(list(str(s) for s in available_gpus[:num_gpus]))
            logger.debug('{} Gpus found free'.format(sum(gpu_free)))
            logger.info('Using {}'.format(use_gpus))
            os.environ['CUDA_VISIBLE_DEVICES'] = use_gpus
            return num_gpus
        else:
            # use everything we can.
            s = "Only {} GPUs found but {} ".format(sum(gpu_free), num_gpus) + \
                "requested. Allocating these and continuing."
            warnings.warn(s, RuntimeWarning)
            logger.warning(s)
            available_gpus = [i for i, x in enumerate(gpu_free) if x]
            use_gpus = ','.join(list(str(s) for s in available_gpus))
            logger.debug('{} Gpus found free'.format(sum(gpu_free)))
            logger.info('Using {}'.format(use_gpus))
            os.environ['CUDA_VISIBLE_DEVICES'] = use_gpus
            return sum(gpu_free)
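# A minimal usage sketch of the pattern this function is designed for: call it
# *before* importing a framework that eagerly claims GPU memory, so the
# framework only ever sees the grabbed device(s):
import py3nvml
py3nvml.grab_gpus(num_gpus=1, gpu_fraction=0.95)
import tensorflow as tf  # now only sees the grabbed GPU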
def traceit(frame, event, args):
    """
    Tracing method executed before running each line in a module or sub-module
    Record memory allocated in a list with debugging information
    """
    global _is_memory_tracing_enabled
    if not _is_memory_tracing_enabled:
        return traceit

    # Filter events
    if events_to_trace is not None:
        if isinstance(events_to_trace, str) and event != events_to_trace:
            return traceit
        elif isinstance(events_to_trace, (list, tuple)) and event not in events_to_trace:
            return traceit

    if "__name__" not in frame.f_globals:
        return traceit

    # Filter modules
    name = frame.f_globals["__name__"]
    if not isinstance(name, str):
        return traceit
    else:
        # Filter whitelist of modules to trace
        if modules_to_trace is not None:
            if isinstance(modules_to_trace, str) and modules_to_trace not in name:
                return traceit
            elif isinstance(modules_to_trace, (list, tuple)) and all(
                    m not in name for m in modules_to_trace):
                return traceit

        # Filter blacklist of modules not to trace
        if modules_not_to_trace is not None:
            if isinstance(modules_not_to_trace, str) and modules_not_to_trace in name:
                return traceit
            elif isinstance(modules_not_to_trace, (list, tuple)) and any(
                    m in name for m in modules_not_to_trace):
                return traceit

    # Record current tracing state (file, location in file...)
    lineno = frame.f_lineno
    filename = frame.f_globals["__file__"]
    if filename.endswith(".pyc") or filename.endswith(".pyo"):
        filename = filename[:-1]
    line = linecache.getline(filename, lineno).rstrip()
    traced_state = Frame(filename, name, lineno, event, line)

    # Record current memory state (rss memory) and compute difference with previous memory state
    cpu_mem = 0
    if process is not None:
        mem = process.memory_info()
        cpu_mem = mem.rss

    gpu_mem = 0
    if log_gpu:
        # Clear GPU caches
        if is_torch_available():
            torch_empty_cache()
        if is_tf_available():
            tf_context.context()._clear_caches()  # See https://github.com/tensorflow/tensorflow/issues/20218#issuecomment-416771802

        # Sum used memory for all GPUs
        nvml.nvmlInit()
        for i in devices:
            handle = nvml.nvmlDeviceGetHandleByIndex(i)
            meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
            gpu_mem += meminfo.used
        nvml.nvmlShutdown()

    mem_state = UsedMemoryState(traced_state, cpu_mem, gpu_mem)
    memory_trace.append(mem_state)

    return traceit
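# A minimal sketch of how a tracer like this is activated. sys.settrace calls
# it with (frame, event, arg) and re-uses the returned callable for per-scope
# tracing; the closure variables referenced above (devices, memory_trace,
# events_to_trace, ...) are assumed to be bound by an enclosing helper such as
# start_memory_tracing. run_model() is an illustrative workload:
import sys
sys.settrace(traceit)
run_model()          # code whose memory profile is being recorded
sys.settrace(None)   # stop tracing; memory_trace now holds UsedMemoryState rows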