Beispiel #1
0
    def get_gpu_info_by_nvml(self) -> Dict:
        """Collect driver and per-device memory information through NVML.

        Returns:
            A dict with the keys ``"driver_version"`` and ``"devices"``; each
            entry of ``"devices"`` is a dict holding ``"memory_total"``,
            ``"memory_available"`` and ``"name"`` for one GPU. The key
            ``"cuda_visible"`` is added when the ``CUDA_VISIBLE_DEVICES``
            environment variable is set.

            ``None`` is returned when any NVML call raises ``NVMLError``; the
            error is logged unless ``self.silent`` is truthy.
        """
        gpu_info_list = []
        driver_version = None
        try:
            nvmlInit()
            try:
                driver_version = nvmlSystemGetDriverVersion()
                for i in range(nvmlDeviceGetCount()):
                    handle = nvmlDeviceGetHandleByIndex(i)
                    info = nvmlDeviceGetMemoryInfo(handle)
                    gpu_info_list.append({
                        "memory_total": info.total,
                        "memory_available": info.free,
                        "name": nvmlDeviceGetName(handle),
                    })
            finally:
                # Always pair a successful nvmlInit() with nvmlShutdown(),
                # including when one of the queries above fails. A failed
                # nvmlInit() never reaches this inner block, so shutdown is
                # not attempted on an uninitialised library.
                nvmlShutdown()
        except NVMLError as error:
            if not self.silent:
                self.logger.error(
                    "Error fetching GPU information using nvml: %s", error)
            return None

        result = {"driver_version": driver_version, "devices": gpu_info_list}

        if 'CUDA_VISIBLE_DEVICES' in environ:
            result["cuda_visible"] = environ['CUDA_VISIBLE_DEVICES']
        return result
Beispiel #2
0
def get_gpu_info() -> Tuple[Optional[str], Optional[List[GpuInfo]]]:
    """
    Get driver version and list of ``GpuInfo``, if available.

    Returns ``(None, None)`` when NVML cannot be initialised. Otherwise
    returns the driver version together with one ``GpuInfo`` per device index
    reported by NVML.

    Per-device queries go through ``try_get_info``; this function treats a
    falsy result for memory or utilisation as "unavailable" and reports 0.
    NOTE(review): the exact failure behaviour of ``try_get_info`` is defined
    elsewhere - confirm it returns ``default`` (or ``None``) on NVML errors.
    """
    try:
        nvml.nvmlInit()
    except nvml.NVMLError:
        # Not available.
        return None, None

    try:
        driver_version: str = nvml.nvmlSystemGetDriverVersion()
        gpus: List[GpuInfo] = []

        device_count: int = nvml.nvmlDeviceGetCount()
        for i in range(device_count):
            handle = nvml.nvmlDeviceGetHandleByIndex(i)
            name = try_get_info(nvml.nvmlDeviceGetName, handle)
            fan_speed = try_get_info(
                nvml.nvmlDeviceGetFanSpeed, handle, default=0)
            temp = try_get_info(
                lambda h: nvml.nvmlDeviceGetTemperature(
                    h, nvml.NVML_TEMPERATURE_GPU),
                handle,
                default=0,
            )

            mem_info = try_get_info(nvml.nvmlDeviceGetMemoryInfo, handle)
            # ">> 20" divides by 2**20 (presumably bytes -> MiB).
            mem_used = mem_info.used >> 20 if mem_info else 0
            mem_total = mem_info.total >> 20 if mem_info else 0

            util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle)
            gpu_util = util.gpu if util else 0

            gpus.append(
                GpuInfo(
                    id=i,
                    name=name,
                    mem_usage=mem_used,
                    mem_capacity=mem_total,
                    utilization=gpu_util,
                    temp=temp,
                    fan=fan_speed,
                ))
    finally:
        # The unguarded calls above (driver version, device count, handle
        # lookup) can raise; make sure the successful nvmlInit() is still
        # balanced by a shutdown in that case.
        nvml.nvmlShutdown()

    return driver_version, gpus
Beispiel #3
0
def test_nvidia():
    """Smoke-test py3nvml by inspecting driver, device count and each device.

    Passes the free-GPU list, the driver version and the device count to
    ``inspect`` and then runs ``test_nvidia_device`` for every device index.
    """
    # pip install py3nvml
    import py3nvml
    from py3nvml import py3nvml as nvml

    inspect(py3nvml.get_free_gpus())

    nvml.nvmlInit()
    try:
        inspect(version=nvml.nvmlSystemGetDriverVersion())

        # Query the count once and reuse it for both the report and the loop.
        device_count = nvml.nvmlDeviceGetCount()
        inspect(count=device_count)

        for i in range(device_count):
            test_nvidia_device(i)
    finally:
        # Release NVML even if a query or a per-device check fails.
        nvml.nvmlShutdown()
    def __init__(self):
        """Cache the driver version, device handles and per-GPU label values.

        ``self.args`` ends up holding one ``OrderedDict`` per device with the
        keys listed in ``self.labels`` (``gpu``, ``name``, ``driver``).

        NOTE(review): NVML is not initialised here - presumably the caller or
        module does that before constructing this object; confirm.
        """
        self.labels = ['gpu', 'name', 'driver']
        self.driver = nv.nvmlSystemGetDriverVersion()

        self.n_gpu = nv.nvmlDeviceGetCount()
        # Resolve every handle up front, before any device name is queried.
        self.hnds = list(map(nv.nvmlDeviceGetHandleByIndex, range(self.n_gpu)))
        self.args = [
            OrderedDict([
                ('gpu', 'gpu%d' % index),
                ('name', nv.nvmlDeviceGetName(handle)),
                ('driver', self.driver),
            ])
            for index, handle in enumerate(self.hnds)
        ]
Beispiel #5
0
def gpu_status():
    """Return NVIDIA driver and per-device clock/memory details, or ``None``.

    Returns:
        A dict with ``'driver_version'`` and ``'devices'``. Each device entry
        holds ``'name'``, ``'clock'``, ``'clock_mem'``, ``'clock_max'``,
        ``'clock_mem_max'`` and ``'memory'``. ``'memory'`` is the total device
        memory divided by 2**30 and rounded to two decimals, or the falsy
        value returned by ``_nmvl_call`` when the memory query yields nothing.

        ``None`` is returned when NVML cannot be initialised or any unguarded
        call raises; this function is best-effort and never propagates errors.
    """
    try:
        py3nvml.nvmlInit()
    except Exception:
        # NVML unavailable (no driver / no library): nothing to report and
        # nothing to shut down.
        return None

    try:
        devices = []
        for i in range(py3nvml.nvmlDeviceGetCount()):
            handle = py3nvml.nvmlDeviceGetHandleByIndex(i)

            memory = _nmvl_call(
                partial(py3nvml.nvmlDeviceGetMemoryInfo, handle))
            if memory:
                memory = round(memory.total * 1.0 / 2**30, 2)

            devices.append({
                'name': _nmvl_call(
                    partial(py3nvml.nvmlDeviceGetName, handle)),
                'clock': _nmvl_call(
                    partial(py3nvml.nvmlDeviceGetApplicationsClock, handle,
                            py3nvml.NVML_CLOCK_GRAPHICS)),
                'clock_mem': _nmvl_call(
                    partial(py3nvml.nvmlDeviceGetApplicationsClock, handle,
                            py3nvml.NVML_CLOCK_MEM)),
                'clock_max': _nmvl_call(
                    partial(py3nvml.nvmlDeviceGetMaxClockInfo, handle,
                            py3nvml.NVML_CLOCK_GRAPHICS)),
                'clock_mem_max': _nmvl_call(
                    partial(py3nvml.nvmlDeviceGetMaxClockInfo, handle,
                            py3nvml.NVML_CLOCK_MEM)),
                'memory': memory,
            })

        return {
            'driver_version': py3nvml.nvmlSystemGetDriverVersion(),
            'devices': devices,
        }
    except Exception:
        # Deliberate best-effort: callers receive None instead of an error.
        return None
    finally:
        # Balance the successful nvmlInit() above; previously NVML was never
        # shut down. A failing shutdown must not replace the result.
        try:
            py3nvml.nvmlShutdown()
        except Exception:
            pass
Beispiel #6
0
 def __init__(self):
     """Initialise NVML, store the driver version and refresh the GPU list.

     NVML is initialised first because the driver-version query needs it;
     afterwards a new ``GpuList`` is stored and ``self.update()`` is called.
     """
     py3nvml.nvmlInit()
     version = py3nvml.nvmlSystemGetDriverVersion()
     self.driver_version = version
     self.gpus = GpuList()
     self.update()
Beispiel #7
0
#!/usr/bin/env python3

# Requires the py3nvml package.
# On Python 2, install nvidia-ml-py instead and use the commented-out
# "import pynvml" line below in place of the py3nvml import.

from __future__ import print_function

# import pynvml
import py3nvml.py3nvml as pynvml
import datetime

# Print the driver version and the name of every GPU visible to NVML.
pynvml.nvmlInit()
try:
    print("Driver Version:", pynvml.nvmlSystemGetDriverVersion())

    deviceCount = pynvml.nvmlDeviceGetCount()

    for i in range(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        print("Device {}: {}".format(i, pynvml.nvmlDeviceGetName(handle)))
finally:
    # Release NVML even when one of the queries above raises.
    pynvml.nvmlShutdown()
 def _get_driver_version():
     """Return the NVML driver version under the key ``'driver_version'``."""
     version = pynvml.nvmlSystemGetDriverVersion()
     return {'driver_version': version}
Beispiel #9
0
    def getGpuInfo(self):
        """Collect driver, per-device and per-process GPU statistics via NVML.

        When ``self._impulse`` is odd the previously stored
        ``self._gpuInfoObj`` is returned without touching NVML. Otherwise a
        fresh snapshot is built, stored in ``self._gpuInfoObj`` and returned.

        Returns:
            dict: ``'DRIVER_VERSION'``, ``'DEVICE_COUNT'`` and one entry per
            device keyed by its index as a string. Each device entry holds
            ``NAME``, ``FAN``, ``TEMP``, ``POWER_USAGE``, ``POWER_LIMIT``,
            ``MEM_USAGE``, ``MEM_TOTAL``, ``UTIL`` and ``PROCESS``. Individual
            metrics that NVML cannot provide are reported as ``'N/A'``.
            ``PROCESS`` maps the process position (as a string) to a dict
            with ``PID``, ``MEM``, ``NAME``, ``USERNAME`` and ``STATUS``.

            An empty dict is returned (after printing the error) when NVML
            initialisation or an unguarded NVML call fails.
        """
        if (self._impulse % 2) != 0:
            return self._gpuInfoObj

        def _queryOr(default, func, *args):
            """Call ``func(*args)``; return ``default`` on ``NVMLError``."""
            try:
                return func(*args)
            except N.NVMLError:
                return default

        gpuInfoObj = {}
        try:
            N.nvmlInit()
            try:
                driverVersion = N.nvmlSystemGetDriverVersion()
                deviceCnt = N.nvmlDeviceGetCount()

                gpuInfoObj['DRIVER_VERSION'] = driverVersion
                gpuInfoObj['DEVICE_COUNT'] = deviceCnt

                for dCnt in range(deviceCnt):
                    handle = N.nvmlDeviceGetHandleByIndex(dCnt)

                    deviceInfoObj = {}
                    deviceInfoObj['NAME'] = N.nvmlDeviceGetName(handle)
                    deviceInfoObj['FAN'] = _queryOr(
                        'N/A', N.nvmlDeviceGetFanSpeed, handle)
                    deviceInfoObj['TEMP'] = _queryOr(
                        'N/A', N.nvmlDeviceGetTemperature, handle,
                        N.NVML_TEMPERATURE_GPU)
                    # Power readings are divided by 1000 (presumably mW -> W).
                    deviceInfoObj['POWER_USAGE'] = _queryOr(
                        'N/A',
                        lambda h: round(N.nvmlDeviceGetPowerUsage(h) / 1000),
                        handle)
                    deviceInfoObj['POWER_LIMIT'] = _queryOr(
                        'N/A',
                        lambda h: round(
                            N.nvmlDeviceGetPowerManagementLimit(h) / 1000),
                        handle)

                    # Memory values are divided by 1024 twice before rounding.
                    try:
                        memInfo = N.nvmlDeviceGetMemoryInfo(handle)
                        memUsage = round(memInfo.used / 1024 / 1024)
                        memTotal = round(memInfo.total / 1024 / 1024)
                    except N.NVMLError:
                        memUsage = 'N/A'
                        memTotal = 'N/A'
                    deviceInfoObj['MEM_USAGE'] = memUsage
                    deviceInfoObj['MEM_TOTAL'] = memTotal

                    deviceInfoObj['UTIL'] = _queryOr(
                        'N/A',
                        lambda h: N.nvmlDeviceGetUtilizationRates(h).gpu,
                        handle)

                    processes = _queryOr(
                        [], N.nvmlDeviceGetComputeRunningProcesses, handle)

                    gpuProcessObj = {}
                    for pCnt, process in enumerate(processes):
                        # Some NVML bindings report an unavailable value as
                        # None; keep the 'N/A' convention instead of failing
                        # on the division.
                        usedMem = process.usedGpuMemory
                        if usedMem is None:
                            gpuMem = 'N/A'
                        else:
                            gpuMem = round(usedMem / 1024 / 1024)
                        pid = process.pid

                        try:
                            p = psutil.Process(pid)
                            attrs = p.as_dict(
                                attrs=['name', 'username', 'status'])
                        except psutil.ZombieProcess:
                            attrs = {'name': 'unknown', 'username': '******', 'status': 'zombie'}
                        except psutil.Error:
                            # The process vanished or is inaccessible. The old
                            # bare "except: pass" left ``attrs`` unbound or
                            # stale from the previous process; use an explicit
                            # placeholder instead.
                            attrs = {'name': 'unknown', 'username': '******', 'status': 'unknown'}

                        gpuProcessObj[str(pCnt)] = {
                            'PID': pid,
                            'MEM': gpuMem,
                            'NAME': attrs['name'],
                            'USERNAME': self._getSubuidName(attrs['username']),
                            'STATUS': attrs['status']
                        }

                    deviceInfoObj['PROCESS'] = gpuProcessObj
                    gpuInfoObj[str(dCnt)] = deviceInfoObj
            finally:
                # Runs only after a successful nvmlInit(), so a failed init no
                # longer triggers a second NVMLError from nvmlShutdown(), and
                # non-NVML exceptions no longer skip the shutdown.
                N.nvmlShutdown()

        except N.NVMLError as err:
            print(err)
            gpuInfoObj = {}

        self._gpuInfoObj = gpuInfoObj
        return gpuInfoObj