def environment_info(self):
    # Lazily build and cache a dict describing the benchmark environment:
    # library/framework versions, host platform, CPU RAM, and (optionally)
    # GPU/TPU details. Subsequent calls return the cached dict unchanged.
    if self._environment_info is None:
        info = {}
        # `version` is presumably the transformers package version — defined
        # elsewhere in this module.
        info["transformers_version"] = version
        info["framework"] = self.framework
        # Framework-specific flags; only the matching framework's keys are set.
        if self.framework == "PyTorch":
            info["use_torchscript"] = self.args.torchscript
        if self.framework == "TensorFlow":
            info["eager_mode"] = self.args.eager_mode
            info["use_xla"] = self.args.use_xla
        info["framework_version"] = self.framework_version
        # Host platform details.
        info["python_version"] = platform.python_version()
        info["system"] = platform.system()
        info["cpu"] = platform.processor()
        info["architecture"] = platform.architecture()[0]
        # Unbound-method style: equivalent to datetime.now().date() / .time().
        info["date"] = datetime.date(datetime.now())
        info["time"] = datetime.time(datetime.now())
        info["fp16"] = self.args.fp16
        info["use_multiprocessing"] = self.args.do_multi_processing
        info["only_pretrain_model"] = self.args.only_pretrain_model
        # CPU RAM is only reported when psutil is installed; otherwise "N/A".
        if is_psutil_available():
            info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total)
        else:
            logger.warning(
                "Psutil not installed, we won't log available CPU memory. "
                "Install psutil (pip install psutil) to log available CPU memory."
            )
            info["cpu_ram_mb"] = "N/A"
        info["use_gpu"] = self.args.is_gpu
        if self.args.is_gpu:
            info["num_gpus"] = 1  # TODO(PVP) Currently only single GPU is supported
            # GPU details come from NVML (py3nvml); init/shutdown brackets the
            # queries. If py3nvml is missing, the GPU fields are "N/A".
            if is_py3nvml_available():
                nvml.nvmlInit()
                handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
                info["gpu"] = nvml.nvmlDeviceGetName(handle)
                info["gpu_ram_mb"] = bytes_to_mega_bytes(nvml.nvmlDeviceGetMemoryInfo(handle).total)
                # NVML reports milliwatts; convert to watts.
                info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
                info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
                nvml.nvmlShutdown()
            else:
                logger.warning(
                    "py3nvml not installed, we won't log GPU memory usage. "
                    "Install py3nvml (pip install py3nvml) to log information about GPU."
                )
                info["gpu"] = "N/A"
                info["gpu_ram_mb"] = "N/A"
                info["gpu_power_watts"] = "N/A"
                info["gpu_performance_state"] = "N/A"
        info["use_tpu"] = self.args.is_tpu
        # TODO(PVP): See if we can add more information about TPU
        # see: https://github.com/pytorch/xla/issues/2180
        self._environment_info = info
    return self._environment_info
def test_nvidia_device(idx: int):
    """Log a snapshot of the NVIDIA device at index *idx* via NVML.

    Prints (through the module's `inspect`/`logger` helpers) the device name,
    fan speed, memory usage, GPU utilization, temperature, power draw and
    power limit, plus the graphics and compute processes currently running
    on the device.

    Args:
        idx: zero-based NVML device index.
    """
    from py3nvml import py3nvml as nvml

    handle = nvml.nvmlDeviceGetHandleByIndex(idx)
    # Query memory and utilization once instead of issuing the same NVML
    # call twice per field (the original queried GetMemoryInfo and
    # GetUtilizationRates repeatedly).
    mem = nvml.nvmlDeviceGetMemoryInfo(handle)
    util = nvml.nvmlDeviceGetUtilizationRates(handle)
    inspect(
        idx=idx,
        name=nvml.nvmlDeviceGetName(handle),
        fan=nvml.nvmlDeviceGetFanSpeed(handle),
        mem_total=mem.total,
        mem_used=mem.used,
        util_gpu=util.gpu,
        temp=nvml.nvmlDeviceGetTemperature(handle, nvml.NVML_TEMPERATURE_GPU),
        power=nvml.nvmlDeviceGetPowerUsage(handle),
        power_limit=nvml.nvmlDeviceGetPowerManagementLimit(handle),
        display_active=nvml.nvmlDeviceGetDisplayActive(handle),
    )
    logger.log()
    # Graphics and compute process lists are reported identically; one loop
    # replaces the two duplicated blocks of the original.
    for procs in (
        nvml.nvmlDeviceGetGraphicsRunningProcesses(handle),
        nvml.nvmlDeviceGetComputeRunningProcesses(handle),
    ):
        for p in procs:
            inspect(name=nvml.nvmlSystemGetProcessName(p.pid), pid=p.pid, mem=p.usedGpuMemory)
    logger.log()
def environment_info(self):
    """Lazily build and cache a dict describing the benchmark environment.

    Includes gluonnlp/mxnet versions, host platform details, fp16 flag,
    CPU RAM (when psutil is installed) and GPU details via NVML (when
    py3nvml is installed). Subsequent calls return the cached dict.

    Returns:
        dict: environment description; unavailable fields are "N/A".
    """
    if self._environment_info is None:
        info = {}
        info["gluonnlp_version"] = gluonnlp.__version__
        info["framework_version"] = mxnet.__version__
        info["python_version"] = platform.python_version()
        info["system"] = platform.system()
        info["cpu"] = platform.processor()
        info["architecture"] = platform.architecture()[0]
        # Unbound-method style: equivalent to datetime.now().date() / .time().
        info["date"] = datetime.date(datetime.now())
        info["time"] = datetime.time(datetime.now())
        info["fp16"] = self._use_fp16
        if is_psutil_available():
            info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total)
        else:
            # BUG FIX: the two adjacent string literals were previously
            # concatenated without a separating space, producing
            # "...CPU memory.Install psutil...".
            logger.warning(
                "Psutil not installed, we won't log available CPU memory. "
                "Install psutil (pip install psutil) to log available CPU memory."
            )
            info["cpu_ram_mb"] = "N/A"
        info["use_gpu"] = self._use_gpu
        if self._use_gpu:
            info["num_gpus"] = 1
            if is_py3nvml_available():
                nvml.nvmlInit()
                handle = nvml.nvmlDeviceGetHandleByIndex(self._device_idx)
                info["gpu"] = nvml.nvmlDeviceGetName(handle)
                info["gpu_ram_mb"] = bytes_to_mega_bytes(nvml.nvmlDeviceGetMemoryInfo(handle).total)
                # NVML reports milliwatts; convert to watts.
                info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
                info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
                nvml.nvmlShutdown()
            else:
                logger.warning(
                    "py3nvml not installed, we won't log GPU memory usage. "
                    "Install py3nvml (pip install py3nvml) to log information about GPU."
                )
                info["gpu"] = "N/A"
                info["gpu_ram_mb"] = "N/A"
                info["gpu_power_watts"] = "N/A"
                info["gpu_performance_state"] = "N/A"
        self._environment_info = info
    return self._environment_info
def get_device_power_limit(self, idx):
    """Get the power management limit of device, unit: watt.

    Args:
        idx (int): device index.

    Return:
        powerlimit (int): the power management limit of the device in watts,
            None means failed to get the data.
    """
    # NOTE: the docstring previously described the return as `temp (float)`,
    # a copy-paste error from a temperature getter; it returns an int or None.
    try:
        powerlimit = nvml.nvmlDeviceGetPowerManagementLimit(
            self._device_handlers[idx])
    except Exception as err:
        logger.error('Get device power limitation failed: {}'.format(
            str(err)))
        return None
    # NVML reports milliwatts; integer-divide to watts (the original's
    # redundant int(int(x) / 1000) is equivalent for the non-negative
    # values NVML returns).
    return int(powerlimit) // 1000
def environment_info(self):
    """Lazily build and cache a dict describing the benchmark environment.

    Includes transformers/framework versions, host platform details,
    CPU RAM (via psutil, imported on demand) and GPU details via py3nvml
    (imported on demand). Subsequent calls return the cached dict.

    Returns:
        dict: environment description; unavailable fields are "N/A".
    """
    if self._environment_info is None:
        info = {}
        info["transformers_version"] = version
        info["framework"] = self.framework
        info["framework_version"] = self.framework_version
        info["python_version"] = platform.python_version()
        info["system"] = platform.system()
        info["cpu"] = platform.processor()
        info["architecture"] = platform.architecture()[0]
        # Unbound-method style: equivalent to datetime.now().date() / .time().
        info["date"] = datetime.date(datetime.now())
        info["time"] = datetime.time(datetime.now())
        try:
            import psutil
        except ImportError:
            logger.warning(
                "Psutil not installed, we won't log available CPU memory."
                "Install psutil (pip install psutil) to log available CPU memory."
            )
            info["cpu_ram_mb"] = "N/A"
        else:
            info["cpu_ram_mb"] = bytes_to_mega_bytes(
                psutil.virtual_memory().total)
        info["use_gpu"] = self.is_gpu
        if self.is_gpu:
            info["num_gpus"] = self.args.n_gpu
            try:
                from py3nvml import py3nvml
                py3nvml.nvmlInit()
                handle = py3nvml.nvmlDeviceGetHandleByIndex(
                    self.args.device_idx)
            except ImportError:
                logger.warning(
                    "py3nvml not installed, we won't log GPU memory usage. "
                    "Install py3nvml (pip install py3nvml) to log information about GPU."
                )
                info["gpu"] = "N/A"
                info["gpu_ram_mb"] = "N/A"
                info["gpu_power_watts"] = "N/A"
                info["gpu_performance_state"] = "N/A"
            except (OSError, py3nvml.NVMLError):
                logger.warning(
                    "Error while initializing communication with GPU. "
                    "We won't log information about GPU.")
                info["gpu"] = "N/A"
                info["gpu_ram_mb"] = "N/A"
                info["gpu_power_watts"] = "N/A"
                info["gpu_performance_state"] = "N/A"
                # BUG FIX: if nvmlInit() itself failed, nvmlShutdown() raises
                # NVML_ERROR_UNINITIALIZED; the original called it unguarded
                # and the method crashed instead of reporting "N/A".
                try:
                    py3nvml.nvmlShutdown()
                except py3nvml.NVMLError:
                    pass
            else:
                info["gpu"] = py3nvml.nvmlDeviceGetName(handle)
                info["gpu_ram_mb"] = bytes_to_mega_bytes(
                    py3nvml.nvmlDeviceGetMemoryInfo(handle).total)
                # NVML reports milliwatts; convert to watts.
                info["gpu_power_watts"] = py3nvml.nvmlDeviceGetPowerManagementLimit(
                    handle) / 1000
                info["gpu_performance_state"] = py3nvml.nvmlDeviceGetPerformanceState(
                    handle)
                py3nvml.nvmlShutdown()
        self._environment_info = info
    return self._environment_info
def _get_power_limit_watts(gpu):
    """Return a one-entry dict with the device's power-management limit in watts."""
    # NVML reports the limit in milliwatts.
    limit_milliwatts = pynvml.nvmlDeviceGetPowerManagementLimit(gpu)
    return {'power_limit_watts': limit_milliwatts / 1000.0}
def getGpuInfo(self):
    """Collect per-device GPU info (name, fan, temp, power, memory, util,
    running compute processes) via NVML.

    Only refreshes on even `self._impulse` ticks; odd ticks return the
    previously cached object. On NVML failure, shuts down NVML, prints the
    error and returns an empty dict.

    Returns:
        dict: {'DRIVER_VERSION': ..., 'DEVICE_COUNT': n, '0': {...}, ...}
            or {} on failure.
    """
    # Throttle: only poll NVML every other impulse.
    if (self._impulse % 2) != 0:
        return self._gpuInfoObj
    try:
        N.nvmlInit()
        gpuInfoObj = {}
        gpuInfoObj['DRIVER_VERSION'] = N.nvmlSystemGetDriverVersion()
        deviceCnt = N.nvmlDeviceGetCount()
        gpuInfoObj['DEVICE_COUNT'] = deviceCnt
        for dCnt in range(deviceCnt):
            deviceInfoObj = {}
            handle = N.nvmlDeviceGetHandleByIndex(dCnt)
            name = N.nvmlDeviceGetName(handle)
            # Each metric is best-effort: devices may not expose every
            # sensor, so individual failures degrade to 'N/A'.
            try:
                fan = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan = 'N/A'
            try:
                temp = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temp = 'N/A'
            try:
                # NVML reports milliwatts; convert to watts.
                powerUsage = round(N.nvmlDeviceGetPowerUsage(handle) / 1000)
            except N.NVMLError:
                powerUsage = 'N/A'
            try:
                powerLimit = round(N.nvmlDeviceGetPowerManagementLimit(handle) / 1000)
            except N.NVMLError:
                powerLimit = 'N/A'
            try:
                memInfo = N.nvmlDeviceGetMemoryInfo(handle)
                memUsage = round(memInfo.used / 1024 / 1024)
                memTotal = round(memInfo.total / 1024 / 1024)
            except N.NVMLError:
                memUsage = 'N/A'
                memTotal = 'N/A'
            try:
                util = N.nvmlDeviceGetUtilizationRates(handle).gpu
            except N.NVMLError:
                util = 'N/A'
            deviceInfoObj['NAME'] = name
            deviceInfoObj['FAN'] = fan
            deviceInfoObj['TEMP'] = temp
            deviceInfoObj['POWER_USAGE'] = powerUsage
            deviceInfoObj['POWER_LIMIT'] = powerLimit
            deviceInfoObj['MEM_USAGE'] = memUsage
            deviceInfoObj['MEM_TOTAL'] = memTotal
            deviceInfoObj['UTIL'] = util
            gpuProcessObj = {}
            try:
                processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                processes = []
            for pCnt, process in enumerate(processes):
                gpuMem = round(process.usedGpuMemory / 1024 / 1024)
                pid = process.pid
                try:
                    p = psutil.Process(pid)
                    attrs = p.as_dict(attrs=['name', 'username', 'status'])
                except psutil.ZombieProcess:
                    attrs = {'name': 'unknown', 'username': '******', 'status': 'zombie'}
                except Exception:
                    # BUG FIX: the original bare `except: pass` left `attrs`
                    # unbound on the first process (NameError) or carried the
                    # previous process's attrs into this entry. Provide a
                    # placeholder instead.
                    attrs = {'name': 'unknown', 'username': 'unknown', 'status': 'unknown'}
                gpuProcessObj[str(pCnt)] = {
                    'PID': pid,
                    'MEM': gpuMem,
                    'NAME': attrs['name'],
                    'USERNAME': self._getSubuidName(attrs['username']),
                    'STATUS': attrs['status']
                }
            deviceInfoObj['PROCESS'] = gpuProcessObj
            gpuInfoObj[str(dCnt)] = deviceInfoObj
        N.nvmlShutdown()
    except N.NVMLError as err:
        N.nvmlShutdown()
        print(err)
        gpuInfoObj = {}
    self._gpuInfoObj = gpuInfoObj
    return gpuInfoObj