def require_nvml(*args, **kwargs):
    try:
        from py3nvml import py3nvml
        py3nvml.nvmlInit()
        return True
    except Exception:
        return False
def get_free_gpus():
    """For an N-gpu system, returns a list of N boolean values. The nth value
    will be True if no process was running on the nth gpu."""
    # Try to connect with NVIDIA drivers
    logger = logging.getLogger(__name__)
    try:
        py3nvml.nvmlInit()
    except Exception:
        str_ = """Couldn't connect to nvml drivers. Check they are installed correctly."""
        warnings.warn(str_, RuntimeWarning)
        logger.warning(str_)
        return []

    num_gpus = py3nvml.nvmlDeviceGetCount()
    gpu_free = [False] * num_gpus
    for i in range(num_gpus):
        try:
            h = py3nvml.nvmlDeviceGetHandleByIndex(i)
        except Exception:
            continue
        procs = try_get_info(py3nvml.nvmlDeviceGetComputeRunningProcesses, h,
                             ['something'])
        if len(procs) == 0:
            gpu_free[i] = True

    return gpu_free
def gpu_profile(frame, event, arg):
    # it is _about to_ execute (!)
    global last_tensor_sizes
    global lineno, func_name, filename, module_name

    if event == 'line':
        try:
            # about _previous_ line (!)
            if lineno is not None:
                py3nvml.nvmlInit()
                handle = py3nvml.nvmlDeviceGetHandleByIndex(
                    int(os.environ['GPU_DEBUG']))
                meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
                line = linecache.getline(filename, lineno)
                where_str = module_name + ' ' + func_name + ':' + str(lineno)

                with open(gpu_profile_fn, 'a+') as f:
                    f.write(f"{where_str:<50}"
                            f":{meminfo.used/1024**2:<7.1f}Mb "
                            f"{line.rstrip()}\n")

                    if print_tensor_sizes is True:
                        for tensor in get_tensors():
                            if not hasattr(tensor, 'dbg_alloc_where'):
                                tensor.dbg_alloc_where = where_str
                        new_tensor_sizes = {(type(x), tuple(x.size()), x.dbg_alloc_where)
                                            for x in get_tensors()}
                        for t, s, loc in new_tensor_sizes - last_tensor_sizes:
                            f.write(f'+ {loc:<50} {str(s):<20} {str(t):<10}\n')
                        for t, s, loc in last_tensor_sizes - new_tensor_sizes:
                            f.write(f'- {loc:<50} {str(s):<20} {str(t):<10}\n')
                        last_tensor_sizes = new_tensor_sizes

                py3nvml.nvmlShutdown()

            # save details about line _to be_ executed
            lineno = None

            func_name = frame.f_code.co_name
            filename = frame.f_globals["__file__"]
            if (filename.endswith(".pyc") or
                    filename.endswith(".pyo")):
                filename = filename[:-1]
            module_name = frame.f_globals["__name__"]
            lineno = frame.f_lineno

            if 'gmwda-pytorch' not in os.path.dirname(os.path.abspath(filename)):
                lineno = None  # skip current line evaluation

            if ('car_datasets' in filename
                    or '_exec_config' in func_name
                    or 'gpu_profile' in module_name
                    or 'tee_stdout' in module_name):
                lineno = None  # skip current

            return gpu_profile

        except (KeyError, AttributeError) as e:
            print(e)

    return gpu_profile
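# Usage sketch for the line profiler above (an assumption, not part of the original
# snippet): gpu_profile is a sys.settrace callback, so it only works once the
# module-level globals it reads (gpu_profile_fn, print_tensor_sizes, last_tensor_sizes,
# lineno, func_name, filename, module_name, get_tensors) are initialised, and GPU_DEBUG
# names the device index to watch.
import os
import sys

os.environ['GPU_DEBUG'] = '0'   # profile the first visible GPU
sys.settrace(gpu_profile)       # log per-line GPU memory use from here on
# ... run the code you want to profile ...
sys.settrace(None)              # stop tracing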
def measure_gpu_usage(self):
    from py3nvml.py3nvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, \
        nvmlDeviceGetMemoryInfo, nvmlDeviceGetName, nvmlShutdown, NVMLError

    max_gpu_usage = []
    gpu_name = []
    try:
        nvmlInit()
        deviceCount = nvmlDeviceGetCount()
        max_gpu_usage = [0 for i in range(deviceCount)]
        gpu_name = [
            nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i))
            for i in range(deviceCount)
        ]
        while True:
            for i in range(deviceCount):
                info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
                max_gpu_usage[i] = max(max_gpu_usage[i], info.used / 1024**2)
            sleep(0.005)  # 5ms
            if not self.keep_measuring:
                break
        nvmlShutdown()
        return [{
            "device_id": i,
            "name": gpu_name[i],
            "max_used_MB": max_gpu_usage[i]
        } for i in range(deviceCount)]
    except NVMLError as error:
        if not self.silent:
            self.logger.error(
                "Error fetching GPU information using nvml: %s", error)
        return None
def inference_speed_memory(self, batch_size, seq_length):
    # input_ids = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
    key = jax.random.PRNGKey(0)
    input_ids = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)

    @jax.jit
    def ref_step():
        out = self.model(input_ids=input_ids)
        return out[0]

    if jax.local_devices()[0].platform == 'gpu':
        nvml.nvmlInit()
        ref_step().block_until_ready()
        handle = nvml.nvmlDeviceGetHandleByIndex(0)
        meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
        max_bytes_in_use = meminfo.used
        memory = Memory(max_bytes_in_use)
        # shutdown nvml
        nvml.nvmlShutdown()
    else:
        memory = None

    timeit.repeat("ref_step().block_until_ready()", repeat=1, number=2, globals=locals())

    if self.jit:
        runtimes = timeit.repeat("ref_step().block_until_ready()", repeat=self.repeat,
                                 number=3, globals=locals())
    else:
        with jax.disable_jit():
            runtimes = timeit.repeat("ref_step().block_until_ready()",
                                     repeat=self.repeat, number=3, globals=locals())

    return float(np.min(runtimes) / 3.0), memory
def __init__(self):
    """Constructor."""
    nvml.nvmlInit()
    self._device_count = self.get_device_count()
    self._device_handlers = list()
    for i in range(self._device_count):
        self._device_handlers.append(nvml.nvmlDeviceGetHandleByIndex(i))
def get_gpu_info_by_nvml(self) -> Dict:
    """Get GPU info using nvml"""
    gpu_info_list = []
    driver_version = None
    try:
        nvmlInit()
        driver_version = nvmlSystemGetDriverVersion()
        deviceCount = nvmlDeviceGetCount()
        for i in range(deviceCount):
            handle = nvmlDeviceGetHandleByIndex(i)
            info = nvmlDeviceGetMemoryInfo(handle)
            gpu_info = {}
            gpu_info["memory_total"] = info.total
            gpu_info["memory_available"] = info.free
            gpu_info["name"] = nvmlDeviceGetName(handle)
            gpu_info_list.append(gpu_info)
        nvmlShutdown()
    except NVMLError as error:
        if not self.silent:
            self.logger.error(
                "Error fetching GPU information using nvml: %s", error)
        return None

    result = {"driver_version": driver_version, "devices": gpu_info_list}
    if 'CUDA_VISIBLE_DEVICES' in environ:
        result["cuda_visible"] = environ['CUDA_VISIBLE_DEVICES']
    return result
def main():
    args = parse_args()
    pool_collector = None
    urllib3.disable_warnings()
    nvml.nvmlInit()
    atexit.register(nvml.nvmlShutdown)
    REGISTRY.register(NvidiaCollector())
    if args['pool'] is not None:
        pool_collector = pool_collectors()[args['pool'].lower()](
            args['pool_api_host'], args['pool_api_miner'])
        pool_collector.query_pool()
        REGISTRY.register(pool_collector)
    if args['miner'] is not None:
        REGISTRY.register(miner_collectors()[args['miner'].lower()](
            args['miner_api_host'], args['miner_api_port']))
    print('Starting exporter...')
    try:
        start_http_server(args['port'])
        while True:
            time.sleep(60)  # 1 query per minute so we don't reach API request limits
            if pool_collector is not None:
                pool_collector.query_pool()
    except KeyboardInterrupt:
        print('Exiting...')
        exit(0)
def getCUDAEnvironment():
    """ Get the CUDA runtime environment parameters (number of cards etc.). """
    rdict = dict()
    rdict['first_available_device_index'] = None
    rdict['device_count'] = 0

    try:
        nvml.nvmlInit()
        rdict['device_count'] = nvml.nvmlDeviceGetCount()
    except Exception:
        print(
            'WARNING: At least one of (py3nvml.nvml, CUDA) is not available. Will continue without GPU.'
        )
        return rdict

    for i in range(rdict['device_count']):
        memory_info = nvml.nvmlDeviceGetMemoryInfo(
            nvml.nvmlDeviceGetHandleByIndex(i))
        memory_usage_percentage = memory_info.used / memory_info.total
        if memory_usage_percentage <= 0.1:
            rdict['first_available_device_index'] = i
            break

    nvml.nvmlShutdown()
    return rdict
def environment_info(self):
    if self._environment_info is None:
        info = {}
        info["transformers_version"] = version
        info["framework"] = self.framework
        if self.framework == "PyTorch":
            info["use_torchscript"] = self.args.torchscript
        if self.framework == "TensorFlow":
            info["eager_mode"] = self.args.eager_mode
            info["use_xla"] = self.args.use_xla
        info["framework_version"] = self.framework_version
        info["python_version"] = platform.python_version()
        info["system"] = platform.system()
        info["cpu"] = platform.processor()
        info["architecture"] = platform.architecture()[0]
        info["date"] = datetime.date(datetime.now())
        info["time"] = datetime.time(datetime.now())
        info["fp16"] = self.args.fp16
        info["use_multiprocessing"] = self.args.do_multi_processing
        info["only_pretrain_model"] = self.args.only_pretrain_model

        if is_psutil_available():
            info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total)
        else:
            logger.warning(
                "Psutil not installed, we won't log available CPU memory. "
                "Install psutil (pip install psutil) to log available CPU memory."
            )
            info["cpu_ram_mb"] = "N/A"

        info["use_gpu"] = self.args.is_gpu
        if self.args.is_gpu:
            info["num_gpus"] = 1  # TODO(PVP) Currently only single GPU is supported
            if is_py3nvml_available():
                nvml.nvmlInit()
                handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
                info["gpu"] = nvml.nvmlDeviceGetName(handle)
                info["gpu_ram_mb"] = bytes_to_mega_bytes(nvml.nvmlDeviceGetMemoryInfo(handle).total)
                info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
                info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
                nvml.nvmlShutdown()
            else:
                logger.warning(
                    "py3nvml not installed, we won't log GPU memory usage. "
                    "Install py3nvml (pip install py3nvml) to log information about GPU."
                )
                info["gpu"] = "N/A"
                info["gpu_ram_mb"] = "N/A"
                info["gpu_power_watts"] = "N/A"
                info["gpu_performance_state"] = "N/A"

        info["use_tpu"] = self.args.is_tpu
        # TODO(PVP): See if we can add more information about TPU
        # see: https://github.com/pytorch/xla/issues/2180

        self._environment_info = info
    return self._environment_info
def gpu_profile(frame, event, arg):  # sys.settrace callbacks receive (frame, event, arg)
    global last_meminfo_used, last_tensor_sizes
    global lineno, func_name, filename, module_name

    if event == 'line':
        try:
            if lineno:
                py3nvml.nvmlInit()
                handle = py3nvml.nvmlDeviceGetHandleByIndex(
                    int(os.environ["GPU_DEBUG"]))
                meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
                line = linecache.getline(filename, lineno)
                where_str = module_name + ' ' + func_name + ' ' + str(lineno)

                new_meminfo_used = meminfo.used
                mem_display = new_meminfo_used - last_meminfo_used if use_incremental else new_meminfo_used
                with open(gpu_profile_fn, "a+") as f:
                    f.write(f"{where_str:<50}"
                            f":{(mem_display) / 1024 ** 2:<7.1f}Mb "
                            f"{line.rstrip()}\n")

                    last_meminfo_used = new_meminfo_used
                    if print_tensor_sizes:
                        for tensor in get_tensors():
                            if not hasattr(tensor, 'dbg_alloc_where'):
                                tensor.dbg_alloc_where = where_str
                        new_tensor_sizes = {(type(x), tuple(x.size()), x.dbg_alloc_where)
                                            for x in get_tensors()}
                        for t, s, loc in new_tensor_sizes - last_tensor_sizes:
                            f.write(f'+ {loc:<50} {str(s):<20} {str(t):<10}\n')
                        for t, s, loc in last_tensor_sizes - new_tensor_sizes:
                            f.write(f'- {loc:<50} {str(s):<20} {str(t):<10}\n')
                        last_tensor_sizes = new_tensor_sizes

                py3nvml.nvmlShutdown()

            lineno = None
            func_name = frame.f_code.co_name
            filename = frame.f_globals["__file__"]
            module_name = frame.f_globals["__name__"]
            lineno = frame.f_lineno

            if 'Beta' not in os.path.dirname(os.path.abspath(filename)):
                lineno = None

            return gpu_profile

        except (KeyError, AttributeError):
            pass

    return gpu_profile
def __init__(self, report=None, devices=None, quiet=False, always_suffix=False,
             output=print, verbose_once=True):
    super(self.__class__, self).__init__()
    global nvml
    self.output = output

    if nvml is not None:
        try:
            nvml.nvmlInit()
        except (OSError, nvml.NVMLError_LibraryNotFound):
            # the python library might be installed, but not the drivers...
            nvml = None

    if nvml is None:
        if not quiet:
            self.output(
                "Could not load py3nvml, cannot report any nvidia device statistics."
            )
        report = []
    else:
        device_count = nvml.nvmlDeviceGetCount()
        if devices is None:
            devices = list(range(device_count))
        else:
            devices = [
                int(device) for device in devices
                if 0 <= int(device) < device_count
            ]
        self.devices = devices
        self.deviceHandles = [
            nvml.nvmlDeviceGetHandleByIndex(device) for device in devices
        ]
        if not quiet:
            for n, handle in enumerate(self.deviceHandles):
                self.output("Collecting statistics for device #% 2d: %s" %
                            (n, nvml.nvmlDeviceGetName(handle)))

    if report is None:
        report = ['temperature', 'utilization_gpu']
    elif report == 'all':
        report = list(self.reportable_values.keys())

    self.verbose_once = verbose_once
    self.report = report
    self.always_suffix = always_suffix
def __init__(self, index=None):
    self.no_gpu = False
    if not torch.cuda.is_available():
        print("GPU not found. GPU stats not available")
        self.no_gpu = True
        return
    nvmlInit()
    self._finalizer = weakref.finalize(self, nvmlShutdown)
    self.handle = _torch_gpu_index_to_nvml_handle(index)
@contextmanager  # from contextlib; needed so the generator below works as a context manager
def nvml_manager():
    """Context manager to initialise and shut down NVML."""
    global pynvml
    if pynvml:
        try:
            pynvml.nvmlInit()
        except pynvml.NVMLError as error:
            print('Warning:', error, file=sys.stderr)
            pynvml = None
    yield pynvml
    if pynvml:
        pynvml.nvmlShutdown()
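# Usage sketch (not from the original source), assuming the contextmanager decorator
# restored above: inside the with-block the yielded value is either the initialised
# pynvml module or None when NVML could not be loaded, so callers can degrade gracefully.
with nvml_manager() as nv:
    if nv is not None:
        print('GPUs visible to NVML:', nv.nvmlDeviceGetCount())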
def check_has_nvml() -> bool:
    """
    Determines if libnvml is available on the system. True means CUDA is available

    :rtype: bool
    """
    try:
        nvmlInit()
        return True
    except NVMLError:
        return False
def __nvml_get_or_else(self, getter, default=None):
    try:
        nvmlInit()
        return getter()
    except NVMLError as e:
        timestamp = time.time()
        if timestamp - GPUMonitor.nvml_error_time > GPUMonitor.nvml_error_period:
            _logger.warning(
                "NVMLError: %s - GPU usage metrics may not be reported.", e)
            GPUMonitor.nvml_error_time = timestamp
        return default
def gpu_info():
    "Returns a list of (GPU ID, GPU Description, GPU % Utilization) tuples"
    nvmlInit()
    deviceCount = nvmlDeviceGetCount()
    info = []
    for i in range(0, deviceCount):
        handle = nvmlDeviceGetHandleByIndex(i)
        util = nvmlDeviceGetUtilizationRates(handle)
        desc = nvmlDeviceGetName(handle)
        info.append((i, desc, util.gpu))  # ['GPU %i - %s' % (i, desc)] = util.gpu
    return info
def measure_gpu_usage(self) -> Optional[List[Dict[str, Any]]]:
    from py3nvml.py3nvml import (
        NVMLError,
        nvmlDeviceGetCount,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlDeviceGetName,
        nvmlInit,
        nvmlShutdown,
    )

    max_gpu_usage = []
    gpu_name = []
    try:
        nvmlInit()
        device_count = nvmlDeviceGetCount()
        if not isinstance(device_count, int):
            logger.error(f"nvmlDeviceGetCount result is not integer: {device_count}")
            return None

        max_gpu_usage = [0 for i in range(device_count)]
        gpu_name = [
            nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i))
            for i in range(device_count)
        ]
        while True:
            for i in range(device_count):
                info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
                if isinstance(info, str):
                    logger.error(f"nvmlDeviceGetMemoryInfo returns str: {info}")
                    return None
                max_gpu_usage[i] = max(max_gpu_usage[i], info.used / 1024**2)
            sleep(0.005)  # 5ms
            if not self.keep_measuring:
                break

        nvmlShutdown()
        return [{
            "device_id": i,
            "name": gpu_name[i],
            "max_used_MB": max_gpu_usage[i],
        } for i in range(device_count)]
    except NVMLError as error:
        logger.error("Error fetching GPU information using nvml: %s", error)
        return None
def environment_info(self):
    if self._environment_info is None:
        info = {}
        info["gluonnlp_version"] = gluonnlp.__version__
        info["framework_version"] = mxnet.__version__
        info["python_version"] = platform.python_version()
        info["system"] = platform.system()
        info["cpu"] = platform.processor()
        info["architecture"] = platform.architecture()[0]
        info["date"] = datetime.date(datetime.now())
        info["time"] = datetime.time(datetime.now())
        info["fp16"] = self._use_fp16

        if is_psutil_available():
            info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total)
        else:
            logger.warning(
                "Psutil not installed, we won't log available CPU memory. "
                "Install psutil (pip install psutil) to log available CPU memory."
            )
            info["cpu_ram_mb"] = "N/A"

        info["use_gpu"] = self._use_gpu
        if self._use_gpu:
            info["num_gpus"] = 1
            if is_py3nvml_available():
                nvml.nvmlInit()
                handle = nvml.nvmlDeviceGetHandleByIndex(self._device_idx)
                info["gpu"] = nvml.nvmlDeviceGetName(handle)
                info["gpu_ram_mb"] = bytes_to_mega_bytes(
                    nvml.nvmlDeviceGetMemoryInfo(handle).total)
                info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
                info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
                nvml.nvmlShutdown()
            else:
                logger.warning(
                    "py3nvml not installed, we won't log GPU memory usage. "
                    "Install py3nvml (pip install py3nvml) to log information about GPU."
                )
                info["gpu"] = "N/A"
                info["gpu_ram_mb"] = "N/A"
                info["gpu_power_watts"] = "N/A"
                info["gpu_performance_state"] = "N/A"

        self._environment_info = info
    return self._environment_info
def init_nvidia(self):
    """Init the NVIDIA API."""
    if import_error_tag:
        self.nvml_ready = False

    try:
        pynvml.nvmlInit()
        self.device_handles = get_device_handles()
        self.nvml_ready = True
    except Exception:
        logger.debug("pynvml could not be initialized.")
        self.nvml_ready = False

    return self.nvml_ready
def __init__(self, interval=20):
    # NVML is only needed for GPU power readings; fall back gracefully if it fails.
    try:
        nvmlInit()
        self.gpu = True
    except Exception:
        self.gpu = False
    self.interval = interval
    self.schedule = sched.scheduler(time.time, time.sleep)
    self.powers = []
    self.thread = Thread(target=self._get_power_period, name="powermeter_thread")
    self.thread.setDaemon(True)
    self.thread.start()
    self.sum = 0
def get_gpu_info() -> Tuple[Optional[str], Optional[List[GpuInfo]]]:
    """
    Get driver version and list of ``GpuInfo``, if available.
    """
    try:
        nvml.nvmlInit()
    except nvml.NVMLError:
        # Not available.
        return None, None

    driver_version: str = nvml.nvmlSystemGetDriverVersion()
    gpus: List[GpuInfo] = []

    device_count: int = nvml.nvmlDeviceGetCount()
    for i in range(device_count):
        handle = nvml.nvmlDeviceGetHandleByIndex(i)
        name = try_get_info(nvml.nvmlDeviceGetName, handle)
        fan_speed = try_get_info(nvml.nvmlDeviceGetFanSpeed, handle, default=0)
        temp = try_get_info(
            lambda h: nvml.nvmlDeviceGetTemperature(h, nvml.NVML_TEMPERATURE_GPU),
            handle,
            default=0,
        )
        mem_info = try_get_info(nvml.nvmlDeviceGetMemoryInfo, handle)
        if mem_info:
            mem_used = mem_info.used >> 20
            mem_total = mem_info.total >> 20
        else:
            mem_used = 0
            mem_total = 0
        util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle)
        if util:
            gpu_util = util.gpu
        else:
            gpu_util = 0
        gpus.append(
            GpuInfo(
                id=i,
                name=name,
                mem_usage=mem_used,
                mem_capacity=mem_total,
                utilization=gpu_util,
                temp=temp,
                fan=fan_speed,
            ))

    nvml.nvmlShutdown()

    return driver_version, gpus
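# Short usage sketch (not part of the original snippet): get_gpu_info() returns
# (None, None) when NVML is unavailable, so guard before iterating over the GpuInfo list.
driver, gpus = get_gpu_info()
if gpus is not None:
    for g in gpus:
        print(f"driver {driver} | gpu{g.id} {g.name}: "
              f"{g.mem_usage}/{g.mem_capacity} MiB, {g.utilization}% util")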
def train_speed_memory(self, batch_size, seq_length):
    key = jax.random.PRNGKey(0)
    input_ids = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
    targets = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
    labels = jax.random.randint(key, (batch_size, seq_length), 0, 2)
    # input_ids = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
    # targets = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
    # labels = np.random.randint(0, 2, (batch_size, seq_length))

    @jax.jit
    def train_step():

        def loss_fn(params):
            token_mask = jnp.where(labels > 0, 1.0, 0.0).astype(self.dtype)
            logits = self.model(input_ids=input_ids, train=True, params=params,
                                dropout_rng=jax.random.PRNGKey(0))[0]
            loss, normalizing_factor = cross_entropy(logits, targets, token_mask)
            jax.profiler.save_device_memory_profile(
                f"memory/{workload[0]}_{workload[1]}_memory.prof", "gpu")
            return loss / normalizing_factor

        if self.fp16 and jax.local_devices()[0].platform == 'gpu':
            grad_fn = self.dynamic_scale.value_and_grad(loss_fn)
            dyn_scale, is_fin, loss, grad = grad_fn(self.model.params)
        else:
            grad_fn = jax.value_and_grad(loss_fn)
            loss, grad = grad_fn(self.model.params)
        return tree_flatten(grad)[0]

    if jax.local_devices()[0].platform == 'gpu':
        nvml.nvmlInit()
        train_step()
        handle = nvml.nvmlDeviceGetHandleByIndex(0)
        meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
        max_bytes_in_use = meminfo.used
        memory = Memory(max_bytes_in_use)
        # shutdown nvml
        nvml.nvmlShutdown()
    else:
        memory = None

    # timeit.repeat(train_step, repeat=1, number=2)
    timeit.repeat("for i in train_step():i.block_until_ready()", repeat=1, number=2,
                  globals=locals())
    if self.jit:
        # runtimes = timeit.repeat(train_step, repeat=self.repeat, number=3)
        runtimes = timeit.repeat("for i in train_step():i.block_until_ready()",
                                 repeat=self.repeat, number=3, globals=locals())
    else:
        with jax.disable_jit():
            # runtimes = timeit.repeat(train_step, repeat=self.repeat, number=3)
            runtimes = timeit.repeat("for i in train_step():i.block_until_ready()",
                                     repeat=self.repeat, number=3, globals=locals())

    return float(np.min(runtimes) / 3.0), memory
def run_gpu_mem_counter(do_shutdown=False):
    # Sum used memory for all GPUs
    if not torch.cuda.is_available():
        return 0
    if do_shutdown:
        py3nvml.nvmlInit()
    devices = list(range(py3nvml.nvmlDeviceGetCount()))  # if gpus_to_trace is None else gpus_to_trace
    gpu_mem = 0
    for i in devices:
        handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
        meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
        gpu_mem += meminfo.used
    if do_shutdown:
        py3nvml.nvmlShutdown()
    return gpu_mem
def test_nvidia():
    # pip install py3nvml
    import py3nvml
    from py3nvml import py3nvml as nvml

    inspect(py3nvml.get_free_gpus())

    nvml.nvmlInit()
    inspect(version=nvml.nvmlSystemGetDriverVersion())
    inspect(count=nvml.nvmlDeviceGetCount())
    for i in range(nvml.nvmlDeviceGetCount()):
        test_nvidia_device(i)
    nvml.nvmlShutdown()
def run(self):
    #
    # Initialize nvml on the thread.
    #
    nvml.nvmlInit()

    t0 = time.time()
    while not self.shutdown:
        dt = int(time.time() - t0)
        mem_used, mem_total, gpu_util = query_gpu(self.gpu_index)
        self._monitor_callback(dt, mem_used, mem_total, gpu_util)
        time.sleep(self.sampling_period)
def _measure_memory(self, func: Callable[[], None]) -> [Memory, MemorySummary]:
    try:
        if self.args.trace_memory_line_by_line:
            trace = start_memory_tracing("transformers")

        if self.args.is_tpu:
            # tpu
            raise NotImplementedError(
                "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `--no-memory` or `args.memory=False`"
            )
        elif self.args.is_gpu:
            if not is_py3nvml_available():
                logger.warning(
                    "py3nvml not installed, we won't log GPU memory usage. "
                    "Install py3nvml (pip install py3nvml) to log information about GPU."
                )
                memory = "N/A"
            else:
                logger.info(
                    "Measuring total GPU usage on GPU device. Make sure to not have additional processes running on the same GPU."
                )
                # init nvml
                nvml.nvmlInit()
                func()
                handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
                meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
                max_bytes_in_use = meminfo.used
                memory = Memory(max_bytes_in_use)
                # shutdown nvml
                nvml.nvmlShutdown()
        else:
            # cpu
            memory_bytes = measure_peak_memory_cpu(func)
            memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes

        if self.args.trace_memory_line_by_line:
            summary = stop_memory_tracing(trace)
        else:
            summary = None

        return memory, summary
    except RuntimeError as e:
        self.print_fn(f"Doesn't fit on GPU. {e}")
        return "N/A", None
def get_free_gpus(max_procs=0):
    """
    Checks the number of processes running on your GPUs.

    Parameters
    ----------
    max_procs : int
        Maximum number of procs allowed to run on a gpu for it to be considered
        'available'. Set to 0 to look for gpus with no procs on them.

    Returns
    -------
    availabilities : list(bool)
        List of length N for an N-gpu system. The nth value will be true if the
        nth gpu had at most max_procs processes running on it.

    Note
    ----
    If the function can't query the driver, it will return an empty list rather
    than raise an Exception.
    """
    # Try to connect with NVIDIA drivers
    logger = logging.getLogger(__name__)
    try:
        py3nvml.nvmlInit()
    except Exception:
        str_ = """Couldn't connect to nvml drivers. Check they are installed correctly."""
        warnings.warn(str_, RuntimeWarning)
        logger.warning(str_)
        return []

    num_gpus = py3nvml.nvmlDeviceGetCount()
    gpu_free = [False] * num_gpus
    for i in range(num_gpus):
        try:
            h = py3nvml.nvmlDeviceGetHandleByIndex(i)
        except Exception:
            continue
        procs = try_get_info(py3nvml.nvmlDeviceGetComputeRunningProcesses, h,
                             ['something'])
        if len(procs) <= max_procs:
            gpu_free[i] = True

    py3nvml.nvmlShutdown()
    return gpu_free
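# Sketch of how the availability list might be used (an assumption, not from the
# original source): pin the process to the first GPU with no running compute procs by
# setting CUDA_VISIBLE_DEVICES before any CUDA context is created.
import os

free = get_free_gpus(max_procs=0)
if any(free):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(free.index(True))
else:
    os.environ['CUDA_VISIBLE_DEVICES'] = ''   # hide all GPUs and fall back to CPU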
def set_affinity(rank, log_values=False):
    """ Sets the CPU affinity for the current process to be local to its respective GPU.

    NOTE: `rank` is considered to be equivalent to the GPU index.

    Certain systems have complex hardware topologies (such as our systems in Cosmos and
    AVDC), and as such, there are certain CPU cores that have faster interconnects with
    certain GPU cards. This function will force linux to only schedule the current
    process to run on CPU cores that are fast for the associated GPU.

    Args:
        rank (int): The rank of the current process (and GPU index).
        log_values (bool): Optionally log the before and after values.
    """
    try:
        num_cpus = os.cpu_count()
        nvmlInit()
        rank_cpus = []
        # nvmlSystemGetTopologyGpuSet prints the number of GPUs each time it's
        # called, so this will suppress those prints
        with IgnorePrint():
            for i in range(num_cpus):
                for d in nvmlSystemGetTopologyGpuSet(i):
                    d_index = nvmlDeviceGetIndex(d)
                    if d_index == rank:
                        rank_cpus.append(i)
                        break

        process = psutil.Process()
        old_affinity = _dense_list_to_spans(process.cpu_affinity())
        process.cpu_affinity(rank_cpus)
        if log_values:
            new_affinity = _dense_list_to_spans(process.cpu_affinity())
            logger.info('Old CPU affinity: {}'.format(old_affinity))
            logger.info('New CPU affinity: {}'.format(new_affinity))
        nvmlShutdown()
    except Exception as e:
        logger.warning(
            "Failed to set the process affinity due to error: {}".format(e))
def main():
    args = parse_args()
    urllib3.disable_warnings()
    nvml.nvmlInit()
    atexit.register(nvml.nvmlShutdown)
    REGISTRY.register(NvidiaCollector())
    print('Starting exporter...')
    try:
        start_http_server(args['port'])
        while True:
            time.sleep(60)  # 1 query per minute so we don't reach API request limits
    except KeyboardInterrupt:
        print('Exiting...')
        exit(0)