Example #1
def gpu_info() -> dict:
    info = dict()

    try:
        nvmlInit()
    except NVMLError:
        info['no-gpu'] = 'No Nvidia GPU detected'
        return info

    device_count = nvmlDeviceGetCount()

    info['driver_version'] = nvmlSystemGetDriverVersion().decode()
    info['device_count'] = device_count
    info['device'] = dict()
    for i in range(device_count):
        handle = nvmlDeviceGetHandleByIndex(i)
        memory = nvmlDeviceGetMemoryInfo(handle)

        info['device'][i] = dict()
        info['device'][i]['name'] = nvmlDeviceGetName(handle).decode()

        info['device'][i]['memory'] = dict()

        info['device'][i]['memory']['total'] = str(size_in_gb(memory.total))

    nvmlShutdown()

    return info
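Example #1 calls a size_in_gb helper that is not part of the snippet; a minimal sketch consistent with how it is used there (a byte count in, a rounded gigabyte figure out) might be:

def size_in_gb(num_bytes):
    # Hypothetical helper assumed by Example #1: convert a byte count
    # from nvmlDeviceGetMemoryInfo into gibibytes for display.
    return round(num_bytes / 1024 ** 3, 2)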
Example #2
def getGPUUsage():
    try:
        pynvml.nvmlInit()
        count = pynvml.nvmlDeviceGetCount()
        if count == 0:
            return None

        result = {
            "driver": pynvml.nvmlSystemGetDriverVersion().decode("utf-8"),
            "gpu_count": int(count)
        }
        gpuData = []
        for i in range(count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            gpuData.append({
                "device_num": i,
                "name": pynvml.nvmlDeviceGetName(handle).decode("utf-8"),
                "total": round(float(mem.total) / 1000000000, 2),  # bytes -> GB
                "used": round(float(mem.used) / 1000000000, 2)
            })

        result["devices"] = jsonpickle.encode(gpuData, unpicklable=False)
        pynvml.nvmlShutdown()
    except Exception:
        result = {"driver": "No GPU!", "gpu_count": 0, "devices": []}

    return result
Example #3
 def get_driver():
     """ Get the driver version """
     try:
         driver = pynvml.nvmlSystemGetDriverVersion().decode("utf-8")
     except pynvml.NVMLError:
         driver = "No Nvidia driver found"
     return driver
Example #4
def _update_nvml_static_info():
    driver_version = pynvml.nvmlSystemGetDriverVersion().decode()
    nvml_version = pynvml.nvmlSystemGetNVMLVersion().decode()
    device_count = pynvml.nvmlDeviceGetCount()
    devices = []
    devices_handles = []
    for i in range(device_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle).decode()
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        devices.append({
            'index': i,
            'name': name,
            'memory': {
                'total': mem_info.total
            }
        })
        devices_handles.append(handle)
    _static_info['public'].update({
        'gpu': {
            'driver': driver_version,
            'nvml': nvml_version,
            'devices': devices
        }
    })
    _static_info['private'].update({'gpu': {'handles': devices_handles}})
Example #5
def get_driver_version():
    """
    Return current NVIDIA driver version
    """
    if not pynvml._nvmlLib_refcount:
        pynvml.nvmlInit()
    return pynvml.nvmlSystemGetDriverVersion()
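Most of the examples on this page bracket their queries with nvmlInit() and nvmlShutdown() by hand. A small context manager (a sketch, not taken from any example above) keeps that pairing exception-safe:

import contextlib
import pynvml

@contextlib.contextmanager
def nvml_session():
    # Initialize NVML on entry and guarantee shutdown on exit,
    # even if the body raises.
    pynvml.nvmlInit()
    try:
        yield
    finally:
        pynvml.nvmlShutdown()

# usage:
# with nvml_session():
#     print(pynvml.nvmlSystemGetDriverVersion())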
Example #6
    def gpu_info(self):
        # pip install nvidia-ml-py3
        if len(self.gpu_ids) > 0 and torch.cuda.is_available():
            try:
                import pynvml
                pynvml.nvmlInit()
                self.config_dic['gpu_driver_version'] = \
                    pynvml.nvmlSystemGetDriverVersion()
                for gpu_id in self.gpu_ids:
                    handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
                    gpu_id_name = "gpu%s" % gpu_id
                    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                    gpu_utilize = pynvml.nvmlDeviceGetUtilizationRates(handle)
                    self.config_dic['%s_device_name' % gpu_id_name] = \
                        pynvml.nvmlDeviceGetName(handle)
                    gpu_mem_total = round(mem_info.total / 1024**3, 2)  # GiB
                    gpu_mem_used = round(mem_info.used / 1024**3, 2)   # GiB
                    # also available: mem_info.free, gpu_utilize.memory
                    self.config_dic['%s_mem_total' % gpu_id_name] = gpu_mem_total
                    self.config_dic['%s_mem_used' % gpu_id_name] = gpu_mem_used
                    self.config_dic['%s_mem_percent' % gpu_id_name] = round(
                        (gpu_mem_used / gpu_mem_total) * 100, 1)
                    self._set_dict_smooth('%s_utilize_gpu' % gpu_id_name,
                                          gpu_utilize.gpu, 0.8)

                pynvml.nvmlShutdown()
            except Exception as e:
                print(e)
Example #7
 def initialize(self, **kwargs) -> None:
     try:
         nvmlInit()
         driver_version = nvmlSystemGetDriverVersion().decode("UTF-8")
         nvml_version = nvmlSystemGetNVMLVersion().decode("UTF-8")
         self.logger.info(f"NVML initialized, driver version: {driver_version}, NVML version: {nvml_version}")
         self.detect_devices()
     except NVMLError as error:
         self.raise_nvml_error(error)
Example #8
def run_logging_loop(async_task, async_loop):
    asyncio.set_event_loop(async_loop)
    pynvml.nvmlInit()
    logger = _logger()
    logger.info("Driver Version: {}".format(
        nativestr(pynvml.nvmlSystemGetDriverVersion())))
    async_loop.run_until_complete(async_task)
    logger.info("Shutting down driver")
    pynvml.nvmlShutdown()
Example #9
def check_perf():
    "Suggest how to improve the setup to speed things up"

    from PIL import features, Image
    from packaging import version
    import pynvml

    print("Running performance checks.")

    # libjpeg_turbo check
    print("\n*** libjpeg-turbo status")
    if version.parse(Image.PILLOW_VERSION) >= version.parse("5.4.0"):
        if features.check_feature('libjpeg_turbo'):
            print("✔ libjpeg-turbo is on")
        else:
            print("✘ libjpeg-turbo is not on. It's recommended you install libjpeg-turbo to speed up JPEG decoding. See https://docs.fast.ai/performance.html#libjpeg-turbo")
    else:
        print(f"❓ libjpeg-turbo's status can't be derived - need Pillow(-SIMD)? >= 5.4.0 to tell, current version {Image.PILLOW_VERSION}")
        # XXX: remove this check/note once Pillow and Pillow-SIMD 5.4.0 is available
        pillow_ver_5_4_is_avail = pypi_module_version_is_available("Pillow", "5.4.0")
        if pillow_ver_5_4_is_avail == False:
            print("5.4.0 is not yet available, other than the dev version on github, which can be installed via pip from git+https://github.com/python-pillow/Pillow. See https://docs.fast.ai/performance.html#libjpeg-turbo")

    # Pillow-SIMD check
    print("\n*** Pillow-SIMD status")
    if re.search(r'\.post\d+', Image.PILLOW_VERSION):
        print(f"✔ Running Pillow-SIMD {Image.PILLOW_VERSION}")
    else:
        print(f"✘ Running Pillow {Image.PILLOW_VERSION}; It's recommended you install Pillow-SIMD to speed up image resizing and other operations. See https://docs.fast.ai/performance.html#pillow-simd")

    # CUDA version check
    # compatibility table: k: min nvidia ver is required for v: cuda ver
    # note: windows nvidia driver version is slightly higher, see:
    # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html
    # note: add new entries if pytorch starts supporting new cudaXX
    nvidia2cuda = {
        "410.00": "10.0",
        "384.81":  "9.0",
        "367.48":  "8.0",
    }
    print("\n*** CUDA status")
    if torch.cuda.is_available():
        pynvml.nvmlInit()
        nvidia_ver = pynvml.nvmlSystemGetDriverVersion().decode('utf-8')
        cuda_ver   = torch.version.cuda
        max_cuda = "8.0"
        for k in sorted(nvidia2cuda.keys()):
            if version.parse(nvidia_ver) > version.parse(k): max_cuda = nvidia2cuda[k]
        if version.parse(str(max_cuda)) <= version.parse(cuda_ver):
            print(f"✔ Running the latest CUDA {cuda_ver} with NVIDIA driver {nvidia_ver}")
        else:
            print(f"✘ You are running pytorch built against cuda {cuda_ver}, your NVIDIA driver {nvidia_ver} supports cuda10. See https://pytorch.org/get-started/locally/ to install pytorch built against the faster CUDA version.")
    else:
        print(f"❓ Running cpu-only torch version, CUDA check is not relevant")

    print("\nRefer to https://docs.fast.ai/performance.html to make sense out of these checks and suggestions.")
Example #10
 def get_driver():
     """ Get the driver version """
     if is_macos:
         driver = pynvx.cudaSystemGetDriverVersion(ignore=True)
     else:
         try:
             driver = pynvml.nvmlSystemGetDriverVersion().decode("utf-8")
         except pynvml.NVMLError:
             driver = "No Nvidia driver found"
     return driver
Example #11
    def _get_driver_version(self):
        self._nvml_init()

        try:
            driver_version = self._decode(N.nvmlSystemGetDriverVersion())
        except N.NVMLError:
            driver_version = None

        self._nvml_shutdown()

        return driver_version
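Example #11 relies on _decode, _nvml_init, and _nvml_shutdown methods defined elsewhere in its class (N being that module's pynvml alias). Plausible minimal versions, mirroring the _decode helper shown in Examples #29 and #31, would be:

    def _decode(self, b):
        # pynvml returned bytes on older releases and str on newer ones;
        # normalize to str (same trick as Examples #29/#31).
        if isinstance(b, bytes):
            return b.decode()
        return b

    def _nvml_init(self):
        N.nvmlInit()

    def _nvml_shutdown(self):
        N.nvmlShutdown()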
Example #12
def check_nvidia_device():
    try:
        pynvml.nvmlInit()
        version_str = pynvml.nvmlSystemGetDriverVersion().decode()
        # Driver strings may have three components (e.g. "470.57.02"),
        # so keep only "major.minor" before converting to float.
        driver_version = float(".".join(version_str.split(".")[:2]))
        pynvml.nvmlShutdown()
        if driver_version < 367.48:
            raise OSError(
                'NVIDIA driver v.{} is not supported. The driver version must be 367.48 or newer'
                .format(driver_version))
    except pynvml.NVMLError:
        raise OSError('NVIDIA device not found')
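Recent driver strings contain two dots, so a bare float() over the raw string raises ValueError; that is why the version above keeps only the first two components before converting:

version_str = "470.57.02"  # hypothetical driver string for illustration
# float(version_str)       # would raise ValueError: could not convert string to float
major_minor = float(".".join(version_str.split(".")[:2]))  # 470.57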
Example #13
 def get_driver(self):
     """ Get the driver version """
     if IS_MACOS:
         driver = pynvx.cudaSystemGetDriverVersion(ignore=True)
     else:
         try:
             driver = pynvml.nvmlSystemGetDriverVersion().decode("utf-8")
         except pynvml.NVMLError:
             driver = "No Nvidia driver found"
     if self.logger:
         self.logger.debug("GPU Driver: %s", driver)
     return driver
Example #14
 def get_driver(self):
     """ Get the driver version """
     if IS_MACOS:
         driver = pynvx.cudaSystemGetDriverVersion(ignore=True)
     else:
         try:
             driver = pynvml.nvmlSystemGetDriverVersion().decode("utf-8")
         except pynvml.NVMLError:
             driver = "No Nvidia driver found"
     if self.logger:
         self.logger.debug("GPU Driver: %s", driver)
     return driver
Example #15
def get_gpu_info(handle):
    """
    input: handle of a GPU
    output: driver version, gpu name, total memory, used memory, free memory, gpu utilization rate
    """
    # https://docs.nvidia.com/deploy/nvml-api/
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    total_memory = info.total
    free_memory = info.free
    used_memory = info.used
    utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
    gpu_util_rate = utilization.gpu
    return (pynvml.nvmlSystemGetDriverVersion(), pynvml.nvmlDeviceGetName(handle),
            total_memory, free_memory, used_memory, gpu_util_rate)
Example #16
def get_system_info():
    system_info = dict()

    # cpu info
    system_info['cpu_percent'] = psutil.cpu_percent(interval=None, percpu=False)
    system_info['cpu_count'] = psutil.cpu_count(logical=True)

    # memory info
    mem = psutil.virtual_memory()
    system_info['mem_total'] = int(mem.total / 1024 / 1024)
    system_info['mem_available'] = int(mem.available / 1024 / 1024)
    system_info['mem_percent'] = mem.percent

    # disk info
    disk = psutil.disk_usage('/')
    system_info['disk_total'] = int(disk.total / 1024 / 1024)
    system_info['disk_used'] = int(disk.used / 1024 / 1024)
    system_info['disk_percent'] = disk.percent

    # other info
    system_info['boot_time'] = psutil.boot_time()

    # gpu info
    if tf.test.is_gpu_available():
        pynvml.nvmlInit()
        gpu_driver_version = pynvml.nvmlSystemGetDriverVersion()
        system_info['gpu_driver_version'] = gpu_driver_version.decode("utf-8")

        gpu_device_count = pynvml.nvmlDeviceGetCount()

        system_info['gpu_device_list'] = []
        for i in range(gpu_device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            gpu_name = pynvml.nvmlDeviceGetName(handle)
            gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            gpu_mem_total = int(gpu_mem.total / 1024 / 1024)
            gpu_mem_used = int(gpu_mem.used / 1024 / 1024)
            gpu_mem_percent = int(gpu_mem_used / gpu_mem_total * 100)

            system_info['gpu_device_list'].append(
                {'gpu_name': gpu_name.decode("utf-8"),
                 'gpu_mem_total': gpu_mem_total,
                 'gpu_mem_used': gpu_mem_used,
                 'gpu_mem_percent': gpu_mem_percent
                 }
            )

        pynvml.nvmlShutdown()

    return system_info
Example #17
    def _get_driver(self) -> str:
        """ Obtain the Nvidia driver version currently in use.

        Returns
        -------
        str
            The current GPU driver version
        """
        try:
            driver = pynvml.nvmlSystemGetDriverVersion().decode("utf-8")
        except pynvml.NVMLError as err:
            self._log("debug",
                      f"Unable to obtain driver. Original error: {str(err)}")
            driver = "No Nvidia driver found"
        self._log("debug", f"GPU Driver: {driver}")
        return driver
Example #18
def get_nvml_driver_version():
    try:
        from pynvml import nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion
        try:
            nvmlInit()
            v = nvmlSystemGetDriverVersion()
            if isinstance(v, bytes):
                v = v.decode()  # older pynvml returns bytes here
            log("nvmlSystemGetDriverVersion=%s", v)
            return v.split(".")
        except Exception as e:
            log.warn("Warning: failed to query the NVidia kernel module version via NVML:")
            log.warn(" %s", e)
        finally:
            nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return ""
Example #19
 def GetGPUstate(self):
     pynvml.nvmlInit()
     gpudriver_info = pynvml.nvmlSystemGetDriverVersion()  # driver version
     gpu_count = pynvml.nvmlDeviceGetCount()  # number of GPUs
     handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # 0 here is the GPU id
     gpu_name = pynvml.nvmlDeviceGetName(handle)  # GPU model
     meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
     line = {
         'gpudriver_info': gpudriver_info,
         'gpu_count': gpu_count,  # number of GPUs
         'gpu_name': gpu_name,  # GPU model
         'gpumem_all': int(meminfo.total / 1024 / 1024),  # total VRAM (MiB)
         'gpumem_use': int(meminfo.used / 1024 / 1024),  # used VRAM (MiB)
         'gpumem_free': int(meminfo.free / 1024 / 1024),  # free VRAM (MiB)
         'cur_gpu': float(meminfo.used / meminfo.total)  # VRAM usage ratio
     }
     }
     return line
Example #20
def is_of_supported(device_id=0):
    global is_of_supported_var
    if is_of_supported_var is not None:
        return is_of_supported_var

    driver_version_major = 0
    try:
        import pynvml
        pynvml.nvmlInit()
        driver_version = pynvml.nvmlSystemGetDriverVersion().decode('utf-8')
        driver_version_major = int(driver_version.split('.')[0])
    except ModuleNotFoundError:
        print("NVML not found")

    # there is an issue with OpticalFlow driver in R495 and newer on aarch64 platform
    is_of_supported_var = get_arch(device_id) >= 7.5 and (
        platform.machine() == "x86_64" or driver_version_major < 495)
    return is_of_supported_var
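Example #20 calls a get_arch helper that is not included in the snippet. Judging from Example #27, which inlines the equivalent check, a compatible sketch would be:

def get_arch(device_id):
    # Sketch based on Example #27: return the CUDA compute capability
    # (e.g. 7.5) of the given device via NVML.
    import pynvml
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
    pynvml.nvmlShutdown()
    return major + minor / 10.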
Example #21
def get_nvml_driver_version():
    try:
        from pynvml import nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion
        try:
            if wrap_nvml_init(nvmlInit):
                v = nvmlSystemGetDriverVersion()
                log("nvmlSystemGetDriverVersion=%s", bytestostr(v))
                return v.split(b".")
        except Exception as e:
            log("get_nvml_driver_version() pynvml error", exc_info=True)
            log.warn("Warning: failed to query the NVidia kernel module version using NVML:")
            log.warn(" %s", e)
        finally:
            nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return ()
Example #22
    def _get_driver(self):
        """ Obtain and return the installed driver version for the system's GPUs.

        Returns
        -------
        str
            The currently installed GPU driver version
        """
        if self._is_plaidml:
            driver = self._plaid.drivers
        elif IS_MACOS:
            driver = pynvx.cudaSystemGetDriverVersion(ignore=True)
        else:
            try:
                driver = pynvml.nvmlSystemGetDriverVersion().decode("utf-8")
            except pynvml.NVMLError:
                driver = "No Nvidia driver found"
        self._log("debug", "GPU Driver: {}".format(driver))
        return driver
Example #23
def get_nvml_driver_version():
    try:
        from pynvml import nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion
        try:
            nvmlInit()
            v = nvmlSystemGetDriverVersion()
            if isinstance(v, bytes):
                v = v.decode()  # older pynvml returns bytes here
            log("nvmlSystemGetDriverVersion=%s", v)
            return v.split(".")
        except Exception as e:
            log.warn(
                "Warning: failed to query the NVidia kernel module version via NVML:"
            )
            log.warn(" %s", e)
        finally:
            nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return ""
Example #24
def get_machine_config():
    """Get machine config for CPU and GPU(s)"""

    # CPU config
    physical_cores = psutil.cpu_count(logical=False)
    logical_cores = psutil.cpu_count(logical=True)

    cpufreq = psutil.cpu_freq()
    cpufreq_max = cpufreq.max  # Mhz
    cpufreq_min = cpufreq.min
    cpufreq_cur = cpufreq.current

    svmem = psutil.virtual_memory()
    mem_total = svmem.total / (1024.0**3)  # GB
    mem_avail = svmem.available / (1024.0**3)

    # GPU config
    nv.nvmlInit()
    driver_version = nv.nvmlSystemGetDriverVersion().decode("utf-8")
    deviceCount = nv.nvmlDeviceGetCount()
    gpu_devices, gpu_mems = [], []
    for i in range(deviceCount):
        handle = nv.nvmlDeviceGetHandleByIndex(i)
        gpu_devices.append(nv.nvmlDeviceGetName(handle).decode("utf-8"))
        gpu_mem = nv.nvmlDeviceGetMemoryInfo(handle).total / (1024.0**3)
        gpu_mems.append(gpu_mem)
    nv.nvmlShutdown()

    return {
        'cpu': {
            'physical_cores': physical_cores,
            'logical_cores': logical_cores,
            'min_freq_MHz': cpufreq_min,
            'max_freq_MHz': cpufreq_max,
            'cur_freq_MHz': cpufreq_cur,
            'total_mem_GB': mem_total,
            'avail_mem_GB': mem_avail
        },
        'gpu': {
            'driver_version': driver_version,
            'devices': gpu_devices,
            'mem_GB': gpu_mems
        }
    }
Example #25
def gputask():
    def get(index):
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        except pynvml.NVMLError_GpuIsLost:
            return None
        memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        return dict(
            nvmlDeviceGetName=pynvml.nvmlDeviceGetName(handle).decode('utf-8'),
            nvmlDeviceGetMemoryInfo=dict(
                total=memory_info.total,
                free=memory_info.free,
                used=memory_info.used,
            ),
            nvmlDeviceGetUtilizationRates=get_utilization_rates(handle),
            nvmlDeviceGetFanSpeed=get_fan_speed(handle),
            nvmlDeviceGetTemperature=pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU),
            nvmlDeviceGetTemperatureThreshold=dict(
                slowdown=pynvml.nvmlDeviceGetTemperatureThreshold(
                    handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SLOWDOWN),
                shutdown=pynvml.nvmlDeviceGetTemperatureThreshold(
                    handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SHUTDOWN),
            ),
            nvmlDeviceGetPowerManagementLimit=pynvml.nvmlDeviceGetPowerManagementLimit(handle),
            nvmlDeviceGetPowerUsage=pynvml.nvmlDeviceGetPowerUsage(handle),
        )

    try:
        pynvml.nvmlInit()
        res = dict(
            nvml_version=pynvml.nvmlSystemGetDriverVersion().decode(),  # note: the driver version, despite the key name
            nvmlDeviceGetCount=pynvml.nvmlDeviceGetCount(),
            nvmlDevices=[get(i) for i in range(pynvml.nvmlDeviceGetCount())],
        )
        pynvml.nvmlShutdown()
        return res
    except Exception:
        return dict(nvml_version=None, )
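Example #25 references get_utilization_rates and get_fan_speed helpers defined outside the snippet. Minimal sketches that tolerate unsupported hardware, following the try/except pattern of Examples #29 and #31 (an assumption; the original helpers are not shown), could look like:

def get_utilization_rates(handle):
    # Return the utilization counters, or None where the query is unsupported.
    try:
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)
        return dict(gpu=util.gpu, memory=util.memory)
    except pynvml.NVMLError:
        return None

def get_fan_speed(handle):
    # Passively cooled boards raise NVMLError for fan queries.
    try:
        return pynvml.nvmlDeviceGetFanSpeed(handle)
    except pynvml.NVMLError:
        return None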
Example #26
def get_nv_info():
    nv_info = dict()
    try:
        nvmlInit()

        nv_info["_Driver_Version"] = str(nvmlSystemGetDriverVersion(),
                                         errors="ignore")
        nv_info["_NVML_Version"] = str(nvmlSystemGetNVMLVersion(),
                                       errors="ignore")

        device_count = nvmlDeviceGetCount()
        nv_info["Device_Count"] = device_count

        devices = []

        for i in range(device_count):
            dev_info = dict()

            handle = nvmlDeviceGetHandleByIndex(i)
            dev_info["_Name"] = str(nvmlDeviceGetName(handle), errors="ignore")

            memory_info = nvmlDeviceGetMemoryInfo(handle)
            dev_info["Total_Memory"] = memory_info.total
            dev_info["Free_Memory"] = memory_info.free
            dev_info["Used_Memory"] = memory_info.used

            util_rates = nvmlDeviceGetUtilizationRates(handle)
            dev_info["GPU_Utilization_Rate"] = util_rates.gpu
            dev_info["Memory_Utilization_Rate"] = util_rates.memory

            devices.append(dev_info)

        nv_info["Devices"] = devices

        nvmlShutdown()

    except Exception as e:
        nv_info["Exception"] = str(e)

    return nv_info
Example #27
def is_of_supported(device_id=0):
    global is_of_supported_var
    if is_of_supported_var is not None:
        return is_of_supported_var

    compute_cap = 0
    driver_version_major = 0
    try:
        import pynvml
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        compute_cap = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
        compute_cap = compute_cap[0] + compute_cap[1] / 10.
        driver_version = pynvml.nvmlSystemGetDriverVersion().decode('utf-8')
        driver_version_major = int(driver_version.split('.')[0])
    except ModuleNotFoundError:
        print("NVML not found")

    # there is an issue with OpticalFlow driver in R495 and newer on aarch64 platform
    is_of_supported_var = compute_cap >= 7.5 and (
        platform.machine() == "x86_64" or driver_version_major < 495)
    return is_of_supported_var
Example #28
def gpu_info():
    try:
        pynvml.nvmlInit()
        # print("Driver Version:", pynvml.nvmlSystemGetDriverVersion()) #gpu version
        deviceCount = pynvml.nvmlDeviceGetCount()  #gpu count
        gpu_info = {}
        for i in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            # print("Device", i, ":", pynvml.nvmlDeviceGetName(handle))  # gpu name
            gpu_info[str(i)] = {
                'version': pynvml.nvmlSystemGetDriverVersion().decode(),
                'name': pynvml.nvmlDeviceGetName(handle).decode(),
                'used': meminfo.used / 1024 / 1024 / 1024,
                'free': meminfo.free / 1024 / 1024 / 1024,
                'total': meminfo.total / 1024 / 1024 / 1024,
                'percent': meminfo.used / meminfo.total * 100
            }
        # shut down once, after all devices have been queried
        pynvml.nvmlShutdown()
        return gpu_info
    except Exception as e:
        logging.info(f'GPU Error: [{e}]')
Example #29
    def new_query():
        """Query the information of all the GPUs on local machine"""

        N.nvmlInit()

        def _decode(b):
            if isinstance(b, bytes):
                return b.decode()    # for python3, to unicode
            return b

        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU
                )
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    # TODO: could be more information such as system memory
                    # usage, CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            return gpu_info

        # 1. get the list of gpu and status
        gpu_list = []
        device_count = N.nvmlDeviceGetCount()

        for index in range(device_count):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            gpu_info = get_gpu_info(handle)
            gpu_stat = GPUStat(gpu_info)
            gpu_list.append(gpu_stat)

        # 2. additional info (driver version, etc).
        try:
            driver_version = _decode(N.nvmlSystemGetDriverVersion())
        except N.NVMLError:
            driver_version = None    # N/A

        N.nvmlShutdown()
        return GPUStatCollection(gpu_list, driver_version=driver_version)
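Examples #29 and #31 assume a module-level MB constant for their bytes-to-MiB conversions, presumably defined as:

MB = 1024 * 1024  # bytes per MiB, used for the memory.used / memory.total fields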
Example #30
    def check(self):
        ## Check legacy mode
        try:
            self.legacy = self.configuration['legacy']
            if self.legacy == '': raise KeyError
            if self.legacy is True: self.info('Legacy mode set to True')
        except KeyError:
            self.legacy = False
            self.info("No legacy mode specified. Setting to 'False'")

        ## Real memory clock is double (DDR double data rate ram). Set nvMemFactor = 2 in conf for 'real' memory clock
        try:
            self.nvMemFactor = int(self.configuration['nvMemFactor'])
            if self.nvMemFactor == '': raise KeyError
            self.info("'nvMemFactor' set to:", str(self.nvMemFactor))
        except Exception as e:
            if isinstance(e, KeyError):
                self.info("No 'nvMemFactor' configured. Setting to 1")
            else:
                self.error(
                    "nvMemFactor in config file is not an int. Setting 'nvMemFactor' to 1",
                    str(e))
            self.nvMemFactor = 1

        ## Initialize NVML
        try:
            pynvml.nvmlInit()
            self.info("Nvidia Driver Version:",
                      str(pynvml.nvmlSystemGetDriverVersion()))
        except Exception as e:
            self.error("pynvml could not be initialized", str(e))
            pynvml.nvmlShutdown()
            return False

        ## Get number of graphic cards
        try:
            self.unitCount = pynvml.nvmlUnitGetCount()
            self.deviceCount = pynvml.nvmlDeviceGetCount()
            self.debug("Unit count:", str(self.unitCount))
            self.debug("Device count", str(self.deviceCount))
        except Exception as e:
            self.error('Error getting number of Nvidia GPUs', str(e))
            pynvml.nvmlShutdown()
            return False

        ## Get graphic card names
        data = self._get_data()
        name = ''
        for i in range(self.deviceCount):
            if i == 0:
                name = name + str(
                    data["device_name_" + str(i)]) + " [{0}]".format(i)
            else:
                name = name + ' | ' + str(
                    data["device_name_" + str(i)]) + " [{0}]".format(i)
        self.info('Graphics Card(s) found:', name)
        for chart in self.definitions:
            self.definitions[chart]['options'][1] += ' for ' + name
        ## Dynamically add lines
        for i in range(self.deviceCount):
            gpuIdx = str(i)
            ## Memory
            if data['device_mem_used_' + str(i)] is not None:
                self.definitions['memory']['lines'].append([
                    'device_mem_free_' + gpuIdx, 'free [{0}]'.format(i),
                    'absolute', 1, 1024**2
                ])
                self.definitions['memory']['lines'].append([
                    'device_mem_used_' + gpuIdx, 'used [{0}]'.format(i),
                    'absolute', 1, 1024**2
                ])
            # self.definitions['memory']['lines'].append(['device_mem_total_' + gpuIdx, 'GPU:{0} total'.format(i), 'absolute', -1, 1024**2])

            ## Load/usage
            if data['device_load_gpu_' + gpuIdx] is not None:
                self.definitions['load']['lines'].append([
                    'device_load_gpu_' + gpuIdx, 'gpu [{0}]'.format(i),
                    'absolute'
                ])
                self.definitions['load']['lines'].append([
                    'device_load_mem_' + gpuIdx, 'memory [{0}]'.format(i),
                    'absolute'
                ])

            ## Encoder Utilization
            if data['device_load_enc_' + gpuIdx] is not None:
                self.definitions['load']['lines'].append([
                    'device_load_enc_' + gpuIdx, 'enc [{0}]'.format(i),
                    'absolute'
                ])

            ## Decoder Utilization
            if data['device_load_dec_' + gpuIdx] is not None:
                self.definitions['load']['lines'].append([
                    'device_load_dec_' + gpuIdx, 'dec [{0}]'.format(i),
                    'absolute'
                ])

            ## ECC errors
            if data['device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_' +
                    gpuIdx] is not None:
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_' + gpuIdx,
                    'L1 Cache Volatile Corrected [{0}]'.format(i), 'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_L1_CACHE_VOLATILE_UNCORRECTED_' +
                    gpuIdx, 'L1 Cache Volatile Uncorrected [{0}]'.format(i),
                    'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_L1_CACHE_AGGREGATE_CORRECTED_' + gpuIdx,
                    'L1 Cache Aggregate Corrected [{0}]'.format(i), 'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_L1_CACHE_AGGREGATE_UNCORRECTED_' +
                    gpuIdx, 'L1 Cache Aggregate Uncorrected [{0}]'.format(i),
                    'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_L2_CACHE_VOLATILE_CORRECTED_' + gpuIdx,
                    'L2 Cache Volatile Corrected [{0}]'.format(i), 'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_L2_CACHE_VOLATILE_UNCORRECTED_' +
                    gpuIdx, 'L2 Cache Volatile Uncorrected [{0}]'.format(i),
                    'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_L2_CACHE_AGGREGATE_CORRECTED_' + gpuIdx,
                    'L2 Cache Aggregate Corrected [{0}]'.format(i), 'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_L2_CACHE_AGGREGATE_UNCORRECTED_' +
                    gpuIdx, 'L2 Cache Aggregate Uncorrected [{0}]'.format(i),
                    'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_DEVICE_MEMORY_VOLATILE_CORRECTED_' +
                    gpuIdx, 'Device Memory Volatile Corrected [{0}]'.format(i),
                    'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_DEVICE_MEMORY_VOLATILE_UNCORRECTED_' +
                    gpuIdx,
                    'Device Memory Volatile Uncorrected [{0}]'.format(i),
                    'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_DEVICE_MEMORY_AGGREGATE_CORRECTED_' +
                    gpuIdx,
                    'Device Memory Aggregate Corrected [{0}]'.format(i),
                    'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_DEVICE_MEMORY_AGGREGATE_UNCORRECTED_' +
                    gpuIdx,
                    'Device Memory Aggregate Uncorrected [{0}]'.format(i),
                    'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_REGISTER_FILE_VOLATILE_CORRECTED_' +
                    gpuIdx, 'Register File Volatile Corrected [{0}]'.format(i),
                    'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_REGISTER_FILE_VOLATILE_UNCORRECTED_' +
                    gpuIdx,
                    'Register File Volatile Uncorrected [{0}]'.format(i),
                    'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_REGISTER_FILE_AGGREGATE_CORRECTED_' +
                    gpuIdx,
                    'Register File Aggregate Corrected [{0}]'.format(i),
                    'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_REGISTER_FILE_AGGREGATE_UNCORRECTED_' +
                    gpuIdx,
                    'Register File Aggregate Uncorrected [{0}]'.format(i),
                    'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_TEXTURE_MEMORY_VOLATILE_CORRECTED_' +
                    gpuIdx,
                    'Texture Memory Volatile Corrected [{0}]'.format(i),
                    'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_TEXTURE_MEMORY_VOLATILE_UNCORRECTED_' +
                    gpuIdx,
                    'Texture Memory Volatile Uncorrected [{0}]'.format(i),
                    'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_CORRECTED_' +
                    gpuIdx,
                    'Texture Memory Aggregate Corrected [{0}]'.format(i),
                    'absolute'
                ])
                self.definitions['ecc_errors']['lines'].append([
                    'device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_UNCORRECTED_' +
                    gpuIdx,
                    'Texture Memory Aggregate Uncorrected [{0}]'.format(i),
                    'absolute'
                ])

            ## Temperature
            if data['device_temp_' + gpuIdx] is not None:
                self.definitions['temperature']['lines'].append(
                    ['device_temp_' + gpuIdx, 'GPU:{0}'.format(i), 'absolute'])

            ## Fan
            if data['device_fanspeed_' + gpuIdx] is not None:
                self.definitions['fan']['lines'].append([
                    'device_fanspeed_' + gpuIdx, 'GPU:{0}'.format(i),
                    'absolute'
                ])

            ## GPU and Memory frequency
            if data['device_core_clock_' + gpuIdx] is not None:
                self.definitions['frequency']['lines'].append([
                    'device_core_clock_' + gpuIdx, 'core [{0}]'.format(i),
                    'absolute'
                ])
                self.definitions['frequency']['lines'].append([
                    'device_mem_clock_' + gpuIdx, 'memory [{0}]'.format(i),
                    'absolute'
                ])
            ## SM frequency, usually same as GPU - handled extra here because of legacy mode
            if data['device_sm_clock_' + gpuIdx] is not None:
                self.definitions['frequency']['lines'].append([
                    'device_sm_clock_' + gpuIdx, 'sm [{0}]'.format(i),
                    'absolute'
                ])

        ## Check if GPU Units are installed and add charts
        if self.unitCount:
            self.order.append('unit_fan')
            self.order.append('unit_psu')
            for i in range(self.unitCount):
                gpuIdx = str(i)
                if data['unit_temp_intake_' + gpuIdx] is not None:
                    self.definitions['temperature']['lines'].append([
                        'unit_temp_intake_' + gpuIdx,
                        'intake (unit {0})'.format(i), 'absolute'
                    ])
                    self.definitions['temperature']['lines'].append([
                        'unit_temp_exhaust_' + gpuIdx,
                        'exhaust (unit {0})'.format(i), 'absolute'
                    ])
                    self.definitions['temperature']['lines'].append([
                        'unit_temp_board_' + gpuIdx,
                        'board (unit {0})'.format(i), 'absolute'
                    ])
                if data['unit_fan_speed_' + gpuIdx] is not None:
                    self.definitions['unit_fan'] = {
                        'options': [
                            None, 'Unit fan', 'rpm', 'Unit Fans', 'nv.unit',
                            'line'
                        ],
                        'lines': [[
                            'unit_fan_speed_' + gpuIdx, 'Unit{0}'.format(i),
                            'absolute'
                        ]]
                    }
                if data['unit_psu_current_' + gpuIdx] is not None:
                    self.definitions['unit_psu'] = {
                        'options': [
                            None, 'Unit PSU', 'mixed', 'Unit PSU', 'nv.unit',
                            'line'
                        ],
                        'lines':
                        [[
                            'unit_psu_current_' + gpuIdx,
                            'current (A) (unit {0})'.format(i), 'absolute'
                        ],
                         [
                             'unit_psu_power_' + gpuIdx,
                             'power (W) (unit {0})'.format(i), 'absolute'
                         ],
                         [
                             'unit_psu_voltage_' + gpuIdx,
                             'voltage (V) (unit {0})'.format(i), 'absolute'
                         ]]
                    }
        return True
Example #31
    def new_query():
        """Query the information of all the GPUs on local machine"""

        N.nvmlInit()

        def _decode(b):
            if isinstance(b, bytes):
                return b.decode()  # for python3, to unicode
            return b

        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    # TODO: could be more information such as system memory
                    # usage, CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            return gpu_info

        # 1. get the list of gpu and status
        gpu_list = []
        device_count = N.nvmlDeviceGetCount()

        for index in range(device_count):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            gpu_info = get_gpu_info(handle)
            gpu_stat = GPUStat(gpu_info)
            gpu_list.append(gpu_stat)

        # 2. additional info (driver version, etc).
        try:
            driver_version = _decode(N.nvmlSystemGetDriverVersion())
        except N.NVMLError:
            driver_version = None  # N/A

        N.nvmlShutdown()
        return GPUStatCollection(gpu_list, driver_version=driver_version)
Example #32
def systemGetDriverVersion():
    return pynvml.nvmlSystemGetDriverVersion()
Example #33
from pynvml import (
    nvmlInit,
    nvmlShutdown,
    nvmlSystemGetDriverVersion,
    nvmlDeviceGetCount,
    nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetName,
    nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetUtilizationRates,
)

nvmlInit()
print("Driver Version: %s" % nvmlSystemGetDriverVersion())

deviceCount = nvmlDeviceGetCount()
for i in range(deviceCount):
    handle = nvmlDeviceGetHandleByIndex(i)
    print("Device %s: %s" % (i, nvmlDeviceGetName(handle)))

    memory_info = nvmlDeviceGetMemoryInfo(handle)
    print("Device %s: Total memory: %s" % (i, memory_info.total / 1024 / 1024))
    print("Device %s: Free memory: %s" % (i, memory_info.free / 1024 / 1024))
    print("Device %s: Used memory: %s" % (i, memory_info.used / 1024 / 1024))

    util = nvmlDeviceGetUtilizationRates(handle)
    print("Device %s: GPU Utilization: %s%%" % (i, util.gpu))
    print("Device %s: Memory Utilization: %s%%" % (i, util.memory))