def environment_info(self):
        if self._environment_info is None:
            info = {}
            info["transformers_version"] = version
            info["framework"] = self.framework
            if self.framework == "PyTorch":
                info["use_torchscript"] = self.args.torchscript
            if self.framework == "TensorFlow":
                info["eager_mode"] = self.args.eager_mode
                info["use_xla"] = self.args.use_xla
            info["framework_version"] = self.framework_version
            info["python_version"] = platform.python_version()
            info["system"] = platform.system()
            info["cpu"] = platform.processor()
            info["architecture"] = platform.architecture()[0]
            info["date"] = datetime.date(datetime.now())
            info["time"] = datetime.time(datetime.now())
            info["fp16"] = self.args.fp16
            info["use_multiprocessing"] = self.args.do_multi_processing
            info["only_pretrain_model"] = self.args.only_pretrain_model

            if is_psutil_available():
                info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total)
            else:
                logger.warning(
                    "Psutil not installed, we won't log available CPU memory. "
                    "Install psutil (pip install psutil) to log available CPU memory."
                )
                info["cpu_ram_mb"] = "N/A"

            info["use_gpu"] = self.args.is_gpu
            if self.args.is_gpu:
                info["num_gpus"] = 1  # TODO(PVP) Currently only single GPU is supported
                if is_py3nvml_available():
                    nvml.nvmlInit()
                    handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
                    info["gpu"] = nvml.nvmlDeviceGetName(handle)
                    info["gpu_ram_mb"] = bytes_to_mega_bytes(nvml.nvmlDeviceGetMemoryInfo(handle).total)
                    info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
                    info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
                    nvml.nvmlShutdown()
                else:
                    logger.warning(
                        "py3nvml not installed, we won't log GPU memory usage. "
                        "Install py3nvml (pip install py3nvml) to log information about GPU."
                    )
                    info["gpu"] = "N/A"
                    info["gpu_ram_mb"] = "N/A"
                    info["gpu_power_watts"] = "N/A"
                    info["gpu_performance_state"] = "N/A"

            info["use_tpu"] = self.args.is_tpu
            # TODO(PVP): See if we can add more information about TPU
            # see: https://github.com/pytorch/xla/issues/2180

            self._environment_info = info
        return self._environment_info
Beispiel #2
0
def test_nvidia_device(idx: int):
    from py3nvml import py3nvml as nvml

    handle = nvml.nvmlDeviceGetHandleByIndex(idx)

    pciInfo = nvml.nvmlDeviceGetPciInfo(handle)

    brands = {
        nvml.NVML_BRAND_UNKNOWN: "Unknown",
        nvml.NVML_BRAND_QUADRO: "Quadro",
        nvml.NVML_BRAND_TESLA: "Tesla",
        nvml.NVML_BRAND_NVS: "NVS",
        nvml.NVML_BRAND_GRID: "Grid",
        nvml.NVML_BRAND_GEFORCE: "GeForce"
    }

    inspect(
        idx=idx,
        # id=pciInfo.busId,
        # uuid=nvml.nvmlDeviceGetUUID(handle),
        name=nvml.nvmlDeviceGetName(handle),
        # brand=brands[nvml.nvmlDeviceGetBrand(handle)],
        # multi_gpu=nvml.nvmlDeviceGetMultiGpuBoard(handle),
        # pcie_link=nvml.nvmlDeviceGetCurrPcieLinkWidth(handle),
        fan=nvml.nvmlDeviceGetFanSpeed(handle),
        # power=nvml.nvmlDeviceGetPowerState(handle),
        mem_total=nvml.nvmlDeviceGetMemoryInfo(handle).total,
        mem_used=nvml.nvmlDeviceGetMemoryInfo(handle).used,
        util_gpu=nvml.nvmlDeviceGetUtilizationRates(handle).gpu,
        # util_mem=nvml.nvmlDeviceGetUtilizationRates(handle).memory,
        temp=nvml.nvmlDeviceGetTemperature(handle, nvml.NVML_TEMPERATURE_GPU),
        power=nvml.nvmlDeviceGetPowerUsage(handle),
        power_limit=nvml.nvmlDeviceGetPowerManagementLimit(handle),

        # display=nvml.nvmlDeviceGetDisplayMode(handle),
        display_active=nvml.nvmlDeviceGetDisplayActive(handle),
    )

    logger.log()

    procs = nvml.nvmlDeviceGetGraphicsRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid),
                pid=p.pid,
                mem=p.usedGpuMemory)

    procs = nvml.nvmlDeviceGetComputeRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid),
                pid=p.pid,
                mem=p.usedGpuMemory)

    logger.log()
    def environment_info(self):
        if self._environment_info is None:
            info = {}
            info["gluonnlp_version"] = gluonnlp.__version__
            info["framework_version"] = mxnet.__version__
            info["python_version"] = platform.python_version()
            info["system"] = platform.system()
            info["cpu"] = platform.processor()
            info["architecture"] = platform.architecture()[0]
            info["date"] = datetime.date(datetime.now())
            info["time"] = datetime.time(datetime.now())
            info["fp16"] = self._use_fp16

            if is_psutil_available():
                info["cpu_ram_mb"] = bytes_to_mega_bytes(
                    psutil.virtual_memory().total)
            else:
                logger.warning(
                    "Psutil not installed, we won't log available CPU memory."
                    "Install psutil (pip install psutil) to log available CPU memory."
                )
                info["cpu_ram_mb"] = "N/A"

            info["use_gpu"] = self._use_gpu
            if self._use_gpu:
                info["num_gpus"] = 1
                if is_py3nvml_available():
                    nvml.nvmlInit()
                    handle = nvml.nvmlDeviceGetHandleByIndex(self._device_idx)
                    info["gpu"] = nvml.nvmlDeviceGetName(handle)
                    info["gpu_ram_mb"] = bytes_to_mega_bytes(
                        nvml.nvmlDeviceGetMemoryInfo(handle).total)
                    info[
                        "gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(
                            handle) / 1000
                    info[
                        "gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(
                            handle)
                    nvml.nvmlShutdown()
                else:
                    logger.warning(
                        "py3nvml not installed, we won't log GPU memory usage. "
                        "Install py3nvml (pip install py3nvml) to log information about GPU."
                    )
                    info["gpu"] = "N/A"
                    info["gpu_ram_mb"] = "N/A"
                    info["gpu_power_watts"] = "N/A"
                    info["gpu_performance_state"] = "N/A"
            self._environment_info = info
        return self._environment_info
Beispiel #4
0
    def get_device_power_limit(self, idx):
        """Get the power management limit of device, unit: watt.

        Args:
            idx (int): device index.

        Return:
            temp (float): the power management limit of device, None means failed to get the data.
        """
        try:
            powerlimit = nvml.nvmlDeviceGetPowerManagementLimit(
                self._device_handlers[idx])
        except Exception as err:
            logger.error('Get device power limitation failed: {}'.format(
                str(err)))
            return None
        return int(int(powerlimit) / 1000)
    def environment_info(self):
        if self._environment_info is None:
            info = {}
            info["transformers_version"] = version
            info["framework"] = self.framework
            info["framework_version"] = self.framework_version
            info["python_version"] = platform.python_version()
            info["system"] = platform.system()
            info["cpu"] = platform.processor()
            info["architecture"] = platform.architecture()[0]
            info["date"] = datetime.date(datetime.now())
            info["time"] = datetime.time(datetime.now())

            try:
                import psutil
            except (ImportError):
                logger.warning(
                    "Psutil not installed, we won't log available CPU memory."
                    "Install psutil (pip install psutil) to log available CPU memory."
                )
                info["cpu_ram_mb"] = "N/A"
            else:
                info["cpu_ram_mb"] = bytes_to_mega_bytes(
                    psutil.virtual_memory().total)

            info["use_gpu"] = self.is_gpu
            if self.is_gpu:
                info["num_gpus"] = self.args.n_gpu
                try:
                    from py3nvml import py3nvml

                    py3nvml.nvmlInit()
                    handle = py3nvml.nvmlDeviceGetHandleByIndex(
                        self.args.device_idx)
                except ImportError:
                    logger.warning(
                        "py3nvml not installed, we won't log GPU memory usage. "
                        "Install py3nvml (pip install py3nvml) to log information about GPU."
                    )
                    info["gpu"] = "N/A"
                    info["gpu_ram_mb"] = "N/A"
                    info["gpu_power_watts"] = "N/A"
                    info["gpu_performance_state"] = "N/A"
                except (OSError, py3nvml.NVMLError):
                    logger.warning(
                        "Error while initializing comunication with GPU. "
                        "We won't log information about GPU.")
                    info["gpu"] = "N/A"
                    info["gpu_ram_mb"] = "N/A"
                    info["gpu_power_watts"] = "N/A"
                    info["gpu_performance_state"] = "N/A"
                    py3nvml.nvmlShutdown()
                else:
                    info["gpu"] = py3nvml.nvmlDeviceGetName(handle)
                    info["gpu_ram_mb"] = bytes_to_mega_bytes(
                        py3nvml.nvmlDeviceGetMemoryInfo(handle).total)
                    info[
                        "gpu_power_watts"] = py3nvml.nvmlDeviceGetPowerManagementLimit(
                            handle) / 1000
                    info[
                        "gpu_performance_state"] = py3nvml.nvmlDeviceGetPerformanceState(
                            handle)
                    py3nvml.nvmlShutdown()

            self._environment_info = info
        return self._environment_info
 def _get_power_limit_watts(gpu):
     return {
         'power_limit_watts':
         (pynvml.nvmlDeviceGetPowerManagementLimit(gpu) / 1000.0)
     }
Beispiel #7
0
    def getGpuInfo(self):
        if (self._impulse % 2) != 0:
            return self._gpuInfoObj

        try:
            N.nvmlInit()
            gpuInfoObj = {}

            driverVersion = N.nvmlSystemGetDriverVersion()
            deviceCnt = N.nvmlDeviceGetCount()

            gpuInfoObj['DRIVER_VERSION'] = driverVersion
            gpuInfoObj['DEVICE_COUNT'] = deviceCnt

            for dCnt in range(deviceCnt):
                deviceInfoObj = {}
                handle = N.nvmlDeviceGetHandleByIndex(dCnt)
                name = N.nvmlDeviceGetName(handle)

                try:
                    fan = N.nvmlDeviceGetFanSpeed(handle)
                except N.NVMLError as err:
                    fan = 'N/A'

                try:
                    temp = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
                except N.NVMLError as err:
                    temp = 'N/A'

                try:
                    powerUsage = round(N.nvmlDeviceGetPowerUsage(handle) / 1000)
                except N.NVMLError as err:
                    powerUsage = 'N/A'

                try:
                    powerLimit = round(N.nvmlDeviceGetPowerManagementLimit(handle) / 1000)
                except N.NVMLError as err:
                    powerLimit = 'N/A'

                try:
                    memInfo = N.nvmlDeviceGetMemoryInfo(handle)
                    memUsage = round(memInfo.used/1024/1024)
                    memTotal = round(memInfo.total/1024/1024)
                except N.NVMLError as err:
                    memUsage = 'N/A'
                    memTotal = 'N/A'

                try:
                    util = N.nvmlDeviceGetUtilizationRates(handle).gpu
                except N.NVMLError as err:
                    util = 'N/A'

                deviceInfoObj['NAME'] = name
                deviceInfoObj['FAN'] = fan
                deviceInfoObj['TEMP'] = temp
                deviceInfoObj['POWER_USAGE'] = powerUsage
                deviceInfoObj['POWER_LIMIT'] = powerLimit
                deviceInfoObj['MEM_USAGE'] = memUsage
                deviceInfoObj['MEM_TOTAL'] = memTotal
                deviceInfoObj['UTIL'] = util

                gpuProcessObj = {}
                try:
                    processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
                except N.NVMLError as err:
                    processes = []
                for pCnt, process in enumerate(processes):
                    gpuMem = round(process.usedGpuMemory / 1024 / 1024)
                    pid = process.pid

                    try:
                        p = psutil.Process(pid)
                        attrs = p.as_dict(attrs = ['name', 'username', 'status'])
                    except psutil.ZombieProcess:
                        attrs = {'name': 'unknown', 'username': '******', 'status': 'zombie'}
                    except:
                        pass
                    
                    gpuProcessObj[str(pCnt)] = {
                        'PID': pid,
                        'MEM': gpuMem,
                        'NAME': attrs['name'],
                        'USERNAME': self._getSubuidName(attrs['username']),
                        'STATUS': attrs['status']
                    }

                deviceInfoObj['PROCESS'] = gpuProcessObj
                gpuInfoObj[str(dCnt)] = deviceInfoObj

            N.nvmlShutdown()

        except N.NVMLError as err:
            N.nvmlShutdown()
            print(err)
            gpuInfoObj = {}

        self._gpuInfoObj = gpuInfoObj
        return gpuInfoObj