Example #1
0
def test_nvidia_device(idx: int):
    """Dump a snapshot of one NVIDIA device and the processes using it.

    The device is looked up by its NVML index, a selection of its readings
    is handed to ``inspect`` as keyword arguments, and then every graphics
    and compute process NVML lists for the device is handed to ``inspect``
    as well. ``logger.log()`` is called after the device readings and again
    after the process listing.

    Args:
        idx: Device index passed to ``nvmlDeviceGetHandleByIndex``.

    Note:
        NVML must already be initialised by the caller; this function does
        not call ``nvmlInit``. Errors raised by NVML are not handled here.
    """
    from py3nvml import py3nvml as nvml

    handle = nvml.nvmlDeviceGetHandleByIndex(idx)

    # Query the memory counters once so that ``total`` and ``used`` are taken
    # from the same sample instead of two separate NVML calls.
    memory = nvml.nvmlDeviceGetMemoryInfo(handle)

    inspect(
        idx=idx,
        name=nvml.nvmlDeviceGetName(handle),
        fan=nvml.nvmlDeviceGetFanSpeed(handle),
        mem_total=memory.total,
        mem_used=memory.used,
        util_gpu=nvml.nvmlDeviceGetUtilizationRates(handle).gpu,
        temp=nvml.nvmlDeviceGetTemperature(handle, nvml.NVML_TEMPERATURE_GPU),
        power=nvml.nvmlDeviceGetPowerUsage(handle),
        power_limit=nvml.nvmlDeviceGetPowerManagementLimit(handle),
        display_active=nvml.nvmlDeviceGetDisplayActive(handle),
    )

    logger.log()

    # Graphics processes are listed first, then compute processes; each list
    # is only queried once the previous one has been fully reported.
    process_queries = (
        nvml.nvmlDeviceGetGraphicsRunningProcesses,
        nvml.nvmlDeviceGetComputeRunningProcesses,
    )
    for query in process_queries:
        for proc in query(handle):
            inspect(name=nvml.nvmlSystemGetProcessName(proc.pid),
                    pid=proc.pid,
                    mem=proc.usedGpuMemory)

    logger.log()
Example #2
0
    def get_stats(self):
        """
        Collect system and per-GPU statistics.

        Returns:
            A ``(system, gpus)`` tuple.

            ``system`` is a dict with the keys ``cpu``, ``memory_percent``,
            ``p_memory_percent`` and ``disk_percent``; every value has been
            passed through ``round10e5``.

            ``gpus`` is a list with one dict per GPU that could be read, with
            the keys ``gpu``, ``gpu_memory_percent``, ``gpu_power_watts`` and
            ``gpu_temp``. The list is empty (or partial) when querying NVML
            fails.
        """
        memory_usage = psutil.virtual_memory()
        disk_usage = psutil.disk_usage('/')
        system = {
            # CPU utilization percent of this process (can be over 100%)
            'cpu':
            round10e5(self._process.cpu_percent(0.0)),

            # Whole system memory usage
            'memory_percent':
            round10e5(memory_usage.used * 100 / memory_usage.total),

            # Portion of memory occupied by this process
            'p_memory_percent':
            round10e5(self._process.memory_percent()),

            # Disk usage of the root filesystem
            'disk_percent':
            round10e5(disk_usage.percent),
        }

        # Collect GPU statistics
        gpus = []
        try:
            gpu_device_count = nvml.nvmlDeviceGetCount()
            for i in range(gpu_device_count):
                handle = nvml.nvmlDeviceGetHandleByIndex(i)

                # Device utilization, memory and temperature
                util = nvml.nvmlDeviceGetUtilizationRates(handle)
                memory = nvml.nvmlDeviceGetMemoryInfo(handle)
                temp = nvml.nvmlDeviceGetTemperature(
                    handle, nvml.NVML_TEMPERATURE_GPU)

                # Power draw; the raw reading is divided by 1000 to obtain
                # the value published as ``gpu_power_watts``. The enforced
                # power limit is deliberately not queried: its result was
                # never reported, yet a failure there discarded this GPU.
                power_watts = nvml.nvmlDeviceGetPowerUsage(handle) / 1000

                gpus.append({
                    # GPU utilization percent
                    'gpu':
                    round10e5(util.gpu),

                    # Device memory usage
                    'gpu_memory_percent':
                    round10e5(memory.used * 100 / memory.total),

                    # Power usage in watts
                    'gpu_power_watts':
                    round10e5(power_watts),

                    # Device temperature
                    'gpu_temp':
                    round10e5(temp),
                })
        except Exception:
            # Best effort by design: GPU statistics are optional, so any
            # failure (no NVML, no driver, unsupported query) simply leaves
            # ``gpus`` with whatever was collected so far.
            pass

        return system, gpus
 def power_usage(self, hnd):
     """Return the NVML power reading for ``hnd`` divided by 1000.

     NOTE(review): NVML presumably reports milliwatts, which would make the
     result watts -- confirm against the NVML reference.
     """
     raw_reading = nv.nvmlDeviceGetPowerUsage(hnd)
     return raw_reading / 1000
    def get_gpu_info(handle):
        """Get one GPU information specified by nvml handle.

        Every optional reading is queried independently; a query that raises
        ``N.NVMLError`` is reported as ``None`` instead of failing the call.

        Args:
            handle: NVML device handle to query.

        Returns:
            dict with the keys ``index``, ``uuid``, ``name``,
            ``temperature.gpu``, ``utilization.gpu``, ``power.draw``,
            ``enforced.power.limit``, ``memory.used``, ``memory.total`` and
            ``processes``. Power readings are divided by 1000 and memory
            readings are converted from bytes to MBytes. ``processes`` is a
            list of per-process dicts, or ``None`` when neither the compute
            nor the graphics process list could be queried.
        """

        def get_process_info(nv_process):
            """Get the process information of specific pid.

            Raises ``psutil.NoSuchProcess`` when the pid no longer exists.
            """
            process = {}
            ps_process = psutil.Process(pid=nv_process.pid)
            process['username'] = ps_process.username()
            # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
            _cmdline = ps_process.cmdline()
            if not _cmdline:  # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                process['command'] = '?'
            else:
                process['command'] = os.path.basename(_cmdline[0])
            # Bytes to MBytes. The NVML bindings may hand back None when the
            # per-process usage is not available; report that as None rather
            # than failing on the arithmetic.
            used_bytes = nv_process.usedGpuMemory
            if used_bytes is None:
                process['gpu_memory_usage'] = None
            else:
                process['gpu_memory_usage'] = int(used_bytes / 1024 / 1024)
            process['pid'] = nv_process.pid
            return process

        def _decode(b):
            """Return ``b`` as text, decoding it first when it is bytes."""
            if isinstance(b, bytes):
                return b.decode()  # for python3, to unicode
            return b

        name = _decode(N.nvmlDeviceGetName(handle))
        uuid = _decode(N.nvmlDeviceGetUUID(handle))

        try:
            temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
        except N.NVMLError:
            temperature = None  # Not supported

        try:
            memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
        except N.NVMLError:
            memory = None  # Not supported

        try:
            utilization = N.nvmlDeviceGetUtilizationRates(handle)
        except N.NVMLError:
            utilization = None  # Not supported

        # Only NVML failures mean "not supported"; anything else (including
        # KeyboardInterrupt) must propagate, as for the queries above.
        try:
            power = N.nvmlDeviceGetPowerUsage(handle)
        except N.NVMLError:
            power = None  # Not supported

        try:
            power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
        except N.NVMLError:
            power_limit = None  # Not supported

        processes = []
        try:
            nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
        except N.NVMLError:
            nv_comp_processes = None  # Not supported
        try:
            nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
        except N.NVMLError:
            nv_graphics_processes = None  # Not supported

        if nv_comp_processes is None and nv_graphics_processes is None:
            processes = None  # Not supported (in both cases)
        else:
            nv_comp_processes = nv_comp_processes or []
            nv_graphics_processes = nv_graphics_processes or []
            for nv_process in (nv_comp_processes + nv_graphics_processes):
                # TODO: could be more information such as system memory usage,
                # CPU percentage, create time etc.
                try:
                    process = get_process_info(nv_process)
                    processes.append(process)
                except psutil.NoSuchProcess:
                    # The process exited between the NVML listing and the
                    # psutil lookup; leave it out of the result.
                    # TODO: add some reminder for NVML broken context
                    # e.g. nvidia-smi reset  or  reboot the system
                    pass

        index = N.nvmlDeviceGetIndex(handle)
        gpu_info = {
            'index': index,
            'uuid': uuid,
            'name': name,
            'temperature.gpu': temperature,
            'utilization.gpu': utilization.gpu if utilization else None,
            'power.draw': int(power / 1000) if power is not None else None,
            'enforced.power.limit': int(power_limit / 1000) if power_limit is not None else None,
            # Convert bytes into MBytes
            'memory.used': int(memory.used / 1024 / 1024) if memory else None,
            'memory.total': int(memory.total / 1024 / 1024) if memory else None,
            'processes': processes,
        }
        return gpu_info
 def _get_power_usage_watts(gpu):
     """Return ``{'power_watts': reading / 1000.0}`` for the given device.

     ``gpu`` is passed straight to ``pynvml.nvmlDeviceGetPowerUsage``.
     """
     reading = pynvml.nvmlDeviceGetPowerUsage(gpu)
     watts = reading / 1000.0
     return {'power_watts': watts}
Example #6
0
    def getGpuInfo(self):
        """Query NVML for driver, device and per-process GPU information.

        When ``self._impulse`` is odd the previously stored
        ``self._gpuInfoObj`` is returned without touching NVML. Otherwise
        NVML is initialised, queried and shut down again, and the fresh
        result is stored in ``self._gpuInfoObj`` before being returned.

        Returns:
            dict with ``DRIVER_VERSION``, ``DEVICE_COUNT`` and one entry per
            device keyed by its index as a string. Each device entry holds
            ``NAME``, ``FAN``, ``TEMP``, ``POWER_USAGE``, ``POWER_LIMIT``,
            ``MEM_USAGE``, ``MEM_TOTAL``, ``UTIL`` (``'N/A'`` for readings
            NVML could not provide) and ``PROCESS``, a dict of compute
            processes keyed by their position as a string. An empty dict is
            returned when NVML fails outside the per-reading handlers.
        """
        if (self._impulse % 2) != 0:
            return self._gpuInfoObj

        try:
            N.nvmlInit()
        except N.NVMLError as err:
            # Initialisation failed, so there is nothing to shut down.
            print(err)
            self._gpuInfoObj = {}
            return self._gpuInfoObj

        try:
            gpuInfoObj = {}

            driverVersion = N.nvmlSystemGetDriverVersion()
            deviceCnt = N.nvmlDeviceGetCount()

            gpuInfoObj['DRIVER_VERSION'] = driverVersion
            gpuInfoObj['DEVICE_COUNT'] = deviceCnt

            for dCnt in range(deviceCnt):
                deviceInfoObj = {}
                handle = N.nvmlDeviceGetHandleByIndex(dCnt)
                name = N.nvmlDeviceGetName(handle)

                # Each optional reading falls back to 'N/A' on its own so one
                # unsupported query does not hide the others.
                try:
                    fan = N.nvmlDeviceGetFanSpeed(handle)
                except N.NVMLError:
                    fan = 'N/A'

                try:
                    temp = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
                except N.NVMLError:
                    temp = 'N/A'

                try:
                    powerUsage = round(N.nvmlDeviceGetPowerUsage(handle) / 1000)
                except N.NVMLError:
                    powerUsage = 'N/A'

                try:
                    powerLimit = round(N.nvmlDeviceGetPowerManagementLimit(handle) / 1000)
                except N.NVMLError:
                    powerLimit = 'N/A'

                try:
                    memInfo = N.nvmlDeviceGetMemoryInfo(handle)
                    memUsage = round(memInfo.used/1024/1024)
                    memTotal = round(memInfo.total/1024/1024)
                except N.NVMLError:
                    memUsage = 'N/A'
                    memTotal = 'N/A'

                try:
                    util = N.nvmlDeviceGetUtilizationRates(handle).gpu
                except N.NVMLError:
                    util = 'N/A'

                deviceInfoObj['NAME'] = name
                deviceInfoObj['FAN'] = fan
                deviceInfoObj['TEMP'] = temp
                deviceInfoObj['POWER_USAGE'] = powerUsage
                deviceInfoObj['POWER_LIMIT'] = powerLimit
                deviceInfoObj['MEM_USAGE'] = memUsage
                deviceInfoObj['MEM_TOTAL'] = memTotal
                deviceInfoObj['UTIL'] = util

                gpuProcessObj = {}
                try:
                    processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
                except N.NVMLError:
                    processes = []
                for pCnt, process in enumerate(processes):
                    # The NVML bindings may report None when per-process
                    # memory is unavailable; keep the 'N/A' convention then.
                    if process.usedGpuMemory is None:
                        gpuMem = 'N/A'
                    else:
                        gpuMem = round(process.usedGpuMemory / 1024 / 1024)
                    pid = process.pid

                    try:
                        p = psutil.Process(pid)
                        attrs = p.as_dict(attrs = ['name', 'username', 'status'])
                    except psutil.ZombieProcess:
                        attrs = {'name': 'unknown', 'username': '******', 'status': 'zombie'}
                    except psutil.Error:
                        # Process vanished or cannot be inspected. Use explicit
                        # placeholders so the entry never picks up the attrs of
                        # the previous process (or fails on an unset name).
                        attrs = {'name': 'unknown', 'username': '******', 'status': 'unknown'}

                    gpuProcessObj[str(pCnt)] = {
                        'PID': pid,
                        'MEM': gpuMem,
                        'NAME': attrs['name'],
                        'USERNAME': self._getSubuidName(attrs['username']),
                        'STATUS': attrs['status']
                    }

                deviceInfoObj['PROCESS'] = gpuProcessObj
                gpuInfoObj[str(dCnt)] = deviceInfoObj

        except N.NVMLError as err:
            print(err)
            gpuInfoObj = {}
        finally:
            # Always release NVML once it was initialised, including when an
            # unexpected (non-NVML) exception is propagating.
            try:
                N.nvmlShutdown()
            except N.NVMLError as err:
                print(err)

        self._gpuInfoObj = gpuInfoObj
        return gpuInfoObj
class NvidiaDeviceStatistics(Callback):
    """Callback that adds NVIDIA device readings to the epoch ``logs``.

    At the end of every epoch each configured reading is taken for each
    selected device and stored in ``logs`` as ``np.float32`` under the key
    ``<reading>`` (single device, ``always_suffix`` false) or
    ``<reading>_NN`` where ``NN`` is the position of the device in the
    selection. When NVML cannot be loaded or initialised the callback
    reports nothing.
    """

    # Maps a reading name to a callable taking an NVML device handle.
    # Memory readings go through ``_bytes_to_megabytes``; the power draw is
    # the raw NVML reading divided by 1000.
    reportable_values = dict(
        memory_total=lambda handle: _bytes_to_megabytes(
            nvml.nvmlDeviceGetMemoryInfo(handle).total),
        memory_used=lambda handle: _bytes_to_megabytes(
            nvml.nvmlDeviceGetMemoryInfo(handle).used),
        memory_free=lambda handle: _bytes_to_megabytes(
            nvml.nvmlDeviceGetMemoryInfo(handle).total - nvml.
            nvmlDeviceGetMemoryInfo(handle).used),
        temperature=lambda handle: nvml.nvmlDeviceGetTemperature(
            handle, nvml.NVML_TEMPERATURE_GPU),
        power_state=lambda handle: nvml.nvmlDeviceGetPowerState(handle),
        power_draw=lambda handle: nvml.nvmlDeviceGetPowerUsage(handle) /
        1000.0,
        utilization_gpu=lambda handle: nvml.nvmlDeviceGetUtilizationRates(
            handle).gpu,
        utilization_memory=lambda handle: nvml.nvmlDeviceGetUtilizationRates(
            handle).memory,
    )

    def __init__(self,
                 report=None,
                 devices=None,
                 quiet=False,
                 always_suffix=False,
                 output=print,
                 verbose_once=True):
        """Initialise NVML and select the devices and readings to report.

        Args:
            report: Names from ``reportable_values`` to log, the string
                ``'all'`` for every reading, or ``None`` for the default
                (``temperature`` and ``utilization_gpu``).
            devices: Device indices to monitor; entries outside the range
                of available devices are dropped. ``None`` selects all.
            quiet: Suppress the informational messages printed here.
            always_suffix: Add the device suffix to log keys even when only
                one device is monitored.
            output: Callable used to emit messages.
            verbose_once: Emit one full status report at the end of the
                first epoch.
        """
        # Zero-argument super(): ``super(self.__class__, self)`` recurses
        # forever as soon as this class is subclassed.
        super().__init__()
        global nvml

        self.output = output

        # Defaults so every method works even when NVML is unavailable.
        self.devices = []
        self.deviceHandles = []

        if nvml is not None:
            try:
                nvml.nvmlInit()
            except (OSError, nvml.NVMLError_LibraryNotFound):
                # the python library might be installed, but not the drivers...
                nvml = None

        if nvml is None:
            if not quiet:
                self.output(
                    "Could not load py3nvml, cannot report any nvidia device statistics."
                )
            report = []
        else:
            device_count = nvml.nvmlDeviceGetCount()

            if devices is None:
                devices = list(range(device_count))
            else:
                devices = [
                    int(device) for device in devices
                    if 0 <= int(device) < device_count
                ]

            self.devices = devices
            self.deviceHandles = [
                nvml.nvmlDeviceGetHandleByIndex(device) for device in devices
            ]

            if not quiet:
                for n, handle in enumerate(self.deviceHandles):
                    self.output("Collecting statistics for device #% 2d: %s" %
                                (n, nvml.nvmlDeviceGetName(handle)))

        if report is None:
            report = ['temperature', 'utilization_gpu']
        elif report == 'all':
            report = list(self.reportable_values.keys())

        self.verbose_once = verbose_once
        self.report = report
        self.always_suffix = always_suffix

    def __del__(self):
        """Shut NVML down on a best-effort basis when the callback dies."""
        if nvml:
            try:
                nvml.nvmlShutdown()
            except Exception:
                # Finalizers must never raise; nothing useful can be done.
                pass

    def on_epoch_end(self, epoch, logs=None):
        """Store the configured readings of every device in ``logs``.

        Args:
            epoch: Epoch index (unused).
            logs: Dict updated in place with the readings. When ``None`` the
                readings are still taken but discarded.

        NVML errors are reported through ``self.output`` and never raised.
        If ``verbose_once`` is set, a full status report is emitted for
        every monitored device the first time this runs.
        """
        if logs is None:
            logs = {}

        # A lone device gets unsuffixed keys unless suffixes are forced.
        no_suffix = len(self.deviceHandles) == 1 and not self.always_suffix

        for item in self.report:
            try:
                for n, handle in enumerate(self.deviceHandles):
                    if no_suffix:
                        suffix = ''
                    else:
                        # TODO: this will not work nicely if more than 100
                        # GPUs are in one sys
                        suffix = '_%02d' % (n, )

                    logs[item + suffix] = np.float32(
                        self.reportable_values[item](handle))
            except nvml.NVMLError as err:
                self.output("Error trying to read out value from NVML: %r" %
                            (err, ))

        if self.report and self.verbose_once:
            # Report each device explicitly instead of relying on whatever
            # the loop above left in ``n`` / ``handle`` (unset when there are
            # no devices, and otherwise only the last device).
            for n, handle in enumerate(self.deviceHandles):
                try:
                    self.output(
                        "Current status for device #% 2d (%s): %r" %
                        (n, nvml.nvmlDeviceGetName(handle), {
                            what: float(call(handle))
                            for what, call in self.reportable_values.items()
                        }))
                except nvml.NVMLError as err:
                    self.output(
                        "Error trying to read out value from NVML: %r" %
                        (err, ))
            self.verbose_once = False  # only print once