def collect_metrics(self):
        """
    Collect NVIDIA GPU metrics (eg: Temperature, Power-Consumption, fan-speed, etc.)
    """
        data_list = []
        for gpu_num in range(nvmlDeviceGetCount()):
            handle = nvmlDeviceGetHandleByIndex(gpu_num)
            device_name = DEVICE_NAME_FORMAT % gpu_num
            power_usage = float(nvmlDeviceGetPowerUsage(handle)) / 1000.0
            fan_speed = nvmlDeviceGetFanSpeed(handle)
            temperature = nvmlDeviceGetTemperature(handle,
                                                   NVML_TEMPERATURE_GPU)
            data_list.append({
                'measurement': device_name,
                'tags': {
                    'host': 'minar',
                    'gpu': device_name
                },
                'fields': {
                    'power_usage': power_usage,
                    'fan_speed': fan_speed,
                    'temperature': temperature
                }
            })
            time.sleep(PERIOD_SECS)

        return data_list
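A minimal usage sketch for the collector above, assuming the enclosing class is called GPUMetricsCollector and that the module-level constants DEVICE_NAME_FORMAT and PERIOD_SECS exist (none of these names appear in the snippet); NVML must be initialized before collect_metrics() is called.

from pynvml import nvmlInit, nvmlShutdown

nvmlInit()                                # NVML must be up before any query
try:
    collector = GPUMetricsCollector()     # hypothetical class owning collect_metrics()
    points = collector.collect_metrics()  # one InfluxDB-style point per GPU
    for point in points:
        print(point['measurement'], point['fields'])
finally:
    nvmlShutdown()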
Example 2
def get_fan_speed(handle):
    fan = -1
    try:
        fan = pynvml.nvmlDeviceGetFanSpeed(handle)
    except Exception:
        pass

    return fan
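A short usage sketch for get_fan_speed(): initialize NVML, query every device, then shut NVML down (assumes pynvml is installed and an NVIDIA driver is present).

import pynvml

pynvml.nvmlInit()
try:
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        # -1 means the fan speed could not be read (e.g. passively cooled GPUs)
        print("GPU %d fan speed: %d%%" % (i, get_fan_speed(handle)))
finally:
    pynvml.nvmlShutdown()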
Example 4
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
                ps_process = GPUStatCollection.global_processes[nv_process.pid]
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                    process['full_command'] = ['?']
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                    process['full_command'] = _cmdline
                # Bytes to MBytes
                # if the driver is not in TCC mode this will be None.
                usedmem = nv_process.usedGpuMemory // MB if \
                          nv_process.usedGpuMemory else None
                process['gpu_memory_usage'] = usedmem
                process['cpu_percent'] = ps_process.cpu_percent()
                process['cpu_memory_usage'] = \
                    round((ps_process.memory_percent() / 100.0) *
                          psutil.virtual_memory().total)
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                for process in processes:
                    pid = process['pid']
                    cache_process = GPUStatCollection.global_processes[pid]
                    process['cpu_percent'] = cache_process.cpu_percent()

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit':
                    power_limit // 1000 if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            GPUStatCollection.clean_processes()
            return gpu_info
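The gpustat-style examples above and below rely on several names that the snippets do not define. A rough sketch of what they could look like (these definitions are assumptions, not the library's actual code):

import pynvml as N          # NVML bindings, aliased as N in the snippets
import psutil

MB = 1024 * 1024            # divisor used to convert bytes to MiB


def _decode(b):
    # pynvml may return bytes for names/UUIDs; normalize to str
    return b.decode('utf-8') if isinstance(b, bytes) else b


class GPUStatCollection:
    # psutil.Process objects cached by pid and reused between calls
    global_processes = {}

    @classmethod
    def clean_processes(cls):
        # drop cached entries whose processes have exited
        for pid in list(cls.global_processes):
            if not psutil.pid_exists(pid):
                del cls.global_processes[pid]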
Example 5
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
                ps_process = GPUStatCollection.global_processes[nv_process.pid]

                # TODO: ps_process is being cached, but the dict below is not.
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                    process['full_command'] = ['?']
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                    process['full_command'] = _cmdline
                # Bytes to MBytes
                # if the driver is not in TCC mode this will be None.
                usedmem = (nv_process.usedGpuMemory // MB if
                           nv_process.usedGpuMemory else None)
                process['gpu_memory_usage'] = usedmem
                # process['gpu_memory_usage'] = ("%d MiB" % usedmem if usedmem is not None else usedmem)
                process['cpu_percent'] = ps_process.cpu_percent()
                # process['cpu_memory_usage'] = "%d MiB" % (
                #     round((ps_process.memory_percent() / 100.0) * psutil.virtual_memory().total) // MB)
                process['cpu_memory_usage'] = (
                    round((ps_process.memory_percent() / 100.0) * psutil.virtual_memory().total) // MB)
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU
                )
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle)
            except N.NVMLError:
                utilization_enc = None  # Not supported

            try:
                utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
            except N.NVMLError:
                utilization_dec = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                # A single process might run in both graphics and compute mode;
                # however, we display each process only once.
                seen_pids = set()
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    if nv_process.pid in seen_pids:
                        continue
                    seen_pids.add(nv_process.pid)
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass
                    except FileNotFoundError:
                        # Ignore the exception, which has probably occurred
                        # in psutil due to a non-existent PID (see #95).
                        # The exception should have been translated, but
                        # there appears to be a bug in psutil. It is unlikely
                        # that FileNotFoundError is thrown in other situations.
                        pass

                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                for process in processes:
                    pid = process['pid']
                    cache_process = GPUStatCollection.global_processes[pid]
                    try:
                        process['cpu_percent'] = cache_process.cpu_percent()
                    except psutil.NoSuchProcess:
                        process['cpu_percent'] = 0.0
                    except FileNotFoundError:
                        # Ignore the exception, which has probably occurred
                        # in psutil due to a non-existent PID (see #95).
                        # The exception should have been translated, but
                        # there appears to be a bug in psutil. It is unlikely
                        # that FileNotFoundError is thrown in other situations.
                        process['cpu_percent'] = 0.0

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else 0,
                'utilization.enc':
                    utilization_enc[0] if utilization_enc else None,
                'utilization.dec':
                    utilization_dec[0] if utilization_dec else None,
                'power.draw': power // 1000 if power is not None else 0,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else 0,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else 0,
                'memory.total': memory.total // MB if memory else 0,
                'processes': processes,
            }
            GPUStatCollection.clean_processes()
            return gpu_info
Example 6
def get_infos():
    """Get all information about all your graphics cards.

    Returns:
        dict: The returned result is a dict with 3 keys: count, driver_version and devices:
            count: Number of gpus found
            driver_version: The version of the system’s graphics driver
            devices: It's a list and every item is a namedtuple Device which has 10 fields, for exzample id, name and fan_speed etc. 
                     It should be noted that the Process field is also a namedtuple which has 11 fields.
    """

    infos = {}
    Device = namedtuple(
        "Device",
        [
            "id",
            "name",
            "free",
            "used",
            "total",
            "temperature",
            "fan_speed",
            "power_usage",
            "power_state",
            "process",
        ],
    )
    Process = namedtuple(
        "Process",
        [
            "pid",
            "memory_percent",
            "status",
            "username",
            "num_threads",
            "cpu_num",
            "cpu_percent",
            "name",
            "cmdline",
            "used_gpu_mem",
            "create_time",
        ],
    )
    driver_version = pynvml.nvmlSystemGetDriverVersion().decode()
    device_count = pynvml.nvmlDeviceGetCount()
    devices = []
    for i in range(device_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle).decode()
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        power_usage = pynvml.nvmlDeviceGetPowerUsage(
            handle)  # Power usage in milliwatts mW
        processes = pynvml.nvmlDeviceGetComputeRunningProcesses(
            handle)  # Which processes are using the GPU
        # process_info = [(item.pid, item.usedGpuMemory) for item in process_info]
        process_info = []
        for p in processes:
            # append Process object to process_info
            pid = p.pid
            used_gpu_mem = p.usedGpuMemory
            p = psutil.Process(pid=pid)
            _ = p.cpu_percent()
            time.sleep(0.05)
            process_info.append(
                Process(
                    pid=pid,
                    memory_percent=p.memory_percent(),
                    status=p.status(),
                    username=p.username(),
                    num_threads=p.num_threads(),
                    cpu_num=p.cpu_num(),
                    cpu_percent=p.cpu_percent(),
                    name=p.name(),
                    cmdline=" ".join(p.cmdline()),
                    used_gpu_mem=used_gpu_mem,
                    create_time=p.create_time(),
                ))
        try:
            fan_speed = pynvml.nvmlDeviceGetFanSpeed(handle)
        except pynvml.NVMLError_NotSupported:
            fan_speed = None
        power_state = pynvml.nvmlDeviceGetPowerState(handle)
        temperature = pynvml.nvmlDeviceGetTemperature(
            handle, pynvml.NVML_TEMPERATURE_GPU)
        devices.append(
            Device(
                id=i,
                name=name,
                free=mem_info.free,
                used=mem_info.used,
                total=mem_info.total,
                temperature=temperature,
                fan_speed=fan_speed,
                power_usage=power_usage,
                power_state=power_state,
                process=process_info,
            ))

    infos["count"] = device_count
    infos["driver_version"] = driver_version
    infos["devices"] = devices
    return infos
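A usage sketch for get_infos(): it does not initialize NVML itself, so nvmlInit()/nvmlShutdown() have to wrap the call (this calling convention is inferred from the function body, not documented).

import pynvml

pynvml.nvmlInit()
try:
    infos = get_infos()
    print("driver:", infos["driver_version"])
    for dev in infos["devices"]:
        print("GPU %d %s: %d/%d MiB used, %d C, fan %s%%"
              % (dev.id, dev.name, dev.used // 2**20, dev.total // 2**20,
                 dev.temperature, dev.fan_speed))
finally:
    pynvml.nvmlShutdown()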
Example 7
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_last_used(index):
                last_useds = []
                if not os.path.exists('gpu_history.pkl'):
                    pickle.dump({}, open('gpu_history.pkl', 'wb'))
                with open('gpu_history.pkl', 'rb') as f:
                    history = pickle.load(f)
                    if platform.node() in history:
                        for user, last_used in history[
                                platform.node()][index].items():
                            # 1 day = 24 hours, 1 hour = 3600 seconds
                            used_before = (datetime.now() - last_used['last_used']).days * 24 + \
                                          (datetime.now() - last_used['last_used']).seconds / 3600
                            last_useds.append((user, used_before))
                        return last_useds
                    else:
                        return []

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
                ps_process = GPUStatCollection.global_processes[nv_process.pid]
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                    process['full_command'] = ['?']
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                    process['full_command'] = _cmdline
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['cpu_percent'] = ps_process.cpu_percent()
                process['cpu_memory_usage'] = \
                    round((ps_process.memory_percent() / 100.0) *
                          psutil.virtual_memory().total)
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                for process in processes:
                    pid = process['pid']
                    cache_process = GPUStatCollection.global_processes[pid]
                    process['cpu_percent'] = cache_process.cpu_percent()

            index = N.nvmlDeviceGetIndex(handle)
            last_used = get_last_used(index)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit':
                    power_limit // 1000 if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
                'last_used': last_used,
            }
            GPUStatCollection.clean_processes()
            return gpu_info
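get_last_used() above reads a pickled history file; judging from the lookups, its structure is roughly {hostname: {gpu_index: {username: {'last_used': datetime}}}}. A sketch of seeding such a file (the structure is inferred from the code, not documented):

import pickle
import platform
from datetime import datetime

history = {
    platform.node(): {
        0: {'alice': {'last_used': datetime.now()}},  # GPU 0, hypothetical user
    }
}
with open('gpu_history.pkl', 'wb') as f:
    pickle.dump(history, f)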
Example 8
def log_system(log_file, process_pids=None):
    """
    Logs system utilization metrics to log file
    """
    # log cpu util
    cpu_util = psutil.cpu_percent()
    cpu_util_ind = psutil.cpu_percent(percpu=True)
    ts = time.time()
    key = "INFO"
    message = "CPU util: {}% -- Individual utils 1-24: {}".format(
        cpu_util, cpu_util_ind[:24])
    write_to_log(log_file, (ts, key, message))
    message = "CPU util: {}% -- Individual utils 25-48: {}".format(
        cpu_util, cpu_util_ind[24:])
    write_to_log(log_file, (ts, key, message))

    # log GPU util and memory
    try:
        max_gpu_util = 0
        deviceCount = pynvml.nvmlDeviceGetCount()
        for idx in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
            board_num = pynvml.nvmlDeviceGetBoardId(handle)
            name = "GPU {}: {}  (ID {})".format(
                idx,
                pynvml.nvmlDeviceGetName(handle).decode("utf-8"), board_num)
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            fan_util = pynvml.nvmlDeviceGetFanSpeed(handle)
            pcie_counter = pynvml.nvmlDeviceGetPcieReplayCounter(handle)
            pcie_util = pynvml.nvmlDeviceGetPcieThroughput(
                handle, pcie_counter)
            gpu_util = util.gpu
            mem_util = util.memory

            message = "{}: Kernel:{}%  Mem:{}% Fan:{}% PCIe: {}MB/s".format(
                name, gpu_util, mem_util, fan_util, round(pcie_util / 1000, 1))
            ts = time.time()
            key = "INFO"
            write_to_log(log_file, (ts, key, message))

            if gpu_util > max_gpu_util:
                max_gpu_util = gpu_util

    except pynvml.NVMLError as error:
        print(error)

    # log memory util
    mem_util = psutil.virtual_memory()
    used = round(mem_util.used / 1e+9, 2)
    total = round(mem_util.total / 1e+9, 2)
    ts = time.time()
    key = "INFO"
    message = "Memory util: {}%  ({}/{}GB)".format(
        round(used / total * 100, 2), used, total)
    write_to_log(log_file, (ts, key, message))

    pid_statuses = []
    warning = False
    if process_pids is not None:
        for key in process_pids:
            pid = process_pids[key]

            try:
                os.kill(pid, 0)
                RUNNING = "running"
            except OSError:
                RUNNING = "stopped"
                warning = True

            pid_statuses.append("{} ({}): {}\n".format(key, pid, RUNNING))

        ts = time.time()
        key = "INFO"
        if warning:
            key = "WARNING"
        write_to_log(log_file, (ts, key, pid_statuses))

    last_log_time = time.time()
    return last_log_time, max_gpu_util
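log_system() depends on a write_to_log() helper that is not shown. A minimal stand-in, assuming each entry is a (timestamp, level, message) tuple appended to a plain text file:

import time


def write_to_log(log_file, entry):
    ts, key, message = entry
    with open(log_file, 'a') as f:
        f.write("%s [%s] %s\n" % (time.ctime(ts), key, message))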
Example 9
File: wgpu.py Project: Spiess/wgpu
def get_gpu_pid_info():
    """Retrieves the process IDs of processes running on the GPU."""

    gpus = []
    device_count = -1

    try:
        nvmlInit()

        device_count = nvmlDeviceGetCount()

        gpus = [{}] * device_count

        for i in range(device_count):
            gpus[i] = {'id': i}
            handle = nvmlDeviceGetHandleByIndex(i)
            device_name = nvmlDeviceGetName(handle)

            gpus[i]['name'] = device_name

            try:
                util = nvmlDeviceGetUtilizationRates(handle)
                gpus[i]['utilization'] = util.gpu
            except NVMLError as err:
                print(f'Error while reading GPU utilization for GPU {i}: {err}', file=sys.stderr)

            try:
                mem_info = nvmlDeviceGetMemoryInfo(handle)
                gpus[i]['mem_total'] = mem_info.total
                gpus[i]['mem_used'] = mem_info.used
            except NVMLError as err:
                print(f'Error while reading memory utilization for GPU {i}: {err}', file=sys.stderr)

            try:
                fan_speed = nvmlDeviceGetFanSpeed(handle)
                gpus[i]['fan_speed'] = fan_speed
            except NVMLError as err:
                print(f'Error while reading fan speed for GPU {i}: {err}', file=sys.stderr)

            try:
                temp = nvmlDeviceGetTemperature(handle, 0)
                gpus[i]['temp'] = temp
            except NVMLError as err:
                print(f'Error while reading temperature for GPU {i}: {err}', file=sys.stderr)

            try:
                power_usage = nvmlDeviceGetPowerUsage(handle)
                gpus[i]['power_usage'] = round(power_usage / 1000.)
            except NVMLError as err:
                print(f'Error while reading power usage for GPU {i}: {err}', file=sys.stderr)

            try:
                power_limit = nvmlDeviceGetEnforcedPowerLimit(handle)
                gpus[i]['power_limit'] = round(power_limit / 1000.)
            except NVMLError as err:
                print(f'Error while reading power limit for GPU {i}: {err}', file=sys.stderr)

            gpus[i]['processes'] = []

            try:
                processes = nvmlDeviceGetComputeRunningProcesses(handle)

                for process in processes:
                    process_name = nvmlSystemGetProcessName(process.pid).decode()
                    gpus[i]['processes'].append({'pid': process.pid, 'name': process_name})

            except NVMLError as err:
                print(f'Error while reading processes for GPU {i}: {err}', file=sys.stderr)

    except NVMLError as err:
        print(f'Error while reading GPU information: {err}', file=sys.stderr)

    try:
        nvmlShutdown()
    except NVMLError:
        # NVML may not have been initialized successfully
        pass

    return gpus, device_count
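Usage sketch: get_gpu_pid_info() manages the NVML lifecycle itself, so it can be called directly; device_count stays -1 if initialization failed.

if __name__ == '__main__':
    gpus, device_count = get_gpu_pid_info()
    print("found %d GPU(s)" % device_count)
    for gpu in gpus:
        pids = [p['pid'] for p in gpu.get('processes', [])]
        print(gpu['id'], gpu['name'], "pids:", pids)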
Example 10
import os
import shutil
import pynvml

pynvml.nvmlInit()
deviceCount = pynvml.nvmlDeviceGetCount()
print('  Found %s GPU(s):' % deviceCount)
for i in range(deviceCount):
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    print("    GPU", i, ":", pynvml.nvmlDeviceGetName(handle))
print('--------------')
for i in range(deviceCount):
    print('GPU %s memory, temperature, fan and power state:' % i)
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print("Memory Total: %0.2f G"%(info.total/1024/1024/1024)) # 总的显存大小
    print("Memory Free: %0.2f G "%(info.free/1024/1024/1024)) # 剩余显存大小
    print("Memory Used: %0.2f G "%(info.used/1024/1024/1024)) 
    print("Memory Used percent: %0.2f %% "%(info.used/info.total*100))
    print("Temperature is %d C"%(pynvml.nvmlDeviceGetTemperature(handle,0)))
    print("Fan speed is ",pynvml.nvmlDeviceGetFanSpeed(handle))
    print("Power ststus",pynvml.nvmlDeviceGetPowerState(handle))
    print('--------------')

# finally, shut down NVML
pynvml.nvmlShutdown()

#%% -----------------     os  ----------------------
os.listdir(r'c:\windows')
os.getcwd() # current working directory
os.chdir(r'C:\Users\Python_Folder') # change the working directory (use a raw string for Windows paths)
os.curdir # the constant '.' referring to the current directory
os.__file__  # D:\envs\py27\lib\os.pyc
os.rename("python26","python21") 
shutil.move("python21","python20") 
Example 11
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            deviceCount = 0
        # Number of active GPUs
        self.gauge('nvml.gpus.number', deviceCount)
        for device_id in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append('nvmlDeviceGetTemperature:{}'.format(err))
            # power info
            try:
                pwr = pynvml.nvmlDeviceGetPowerUsage(handle) // 1000
                self.gauge('nvml.power.', pwr, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append('nvmlDeviceGetPowerUsage:{}'.format(err))
            # fan info
            try:
                fan = pynvml.nvmlDeviceGetFanSpeed(handle)
                self.gauge('nvml.fan.', fan, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append('nvmlDeviceGetFanSpeed:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append('nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.debug('nvml.util.encoder %s' % int(util_encoder[0]))
                self.gauge('nvml.util.encoder', int(
                    util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.debug('nvml.util.decoder %s' % int(util_decoder[0]))
                self.gauge('nvml.util.decoder', int(
                    util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory',
                               ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
            # Clocks throttling info
            # Divide by the mask so that the value is either 0 or 1 per GPU
            try:
                throttle_reasons = (
                    pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(handle))
                self.gauge('nvml.throttle.appsettings', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting) /
                    pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting,
                    tags=d_tags)
                self.gauge('nvml.throttle.display', (throttle_reasons &
                    GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS) /
                    GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS,
                    tags=d_tags)
                self.gauge('nvml.throttle.hardware', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonHwSlowdown) /
                    pynvml.nvmlClocksThrottleReasonHwSlowdown,
                    tags=d_tags)
                self.gauge('nvml.throttle.idle', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonGpuIdle) /
                    pynvml.nvmlClocksThrottleReasonGpuIdle,
                    tags=d_tags)
                self.gauge('nvml.throttle.power.hardware', (throttle_reasons &
                    GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE) /
                    GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE,
                    tags=d_tags)
                self.gauge('nvml.throttle.power.software', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonSwPowerCap) /
                    pynvml.nvmlClocksThrottleReasonSwPowerCap,
                    tags=d_tags)
                self.gauge('nvml.throttle.syncboost', (throttle_reasons &
                    GPU_THROTTLE_SYNCBOOST) / GPU_THROTTLE_SYNCBOOST,
                    tags=d_tags)
                self.gauge('nvml.throttle.temp.hardware', (throttle_reasons &
                    GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE) /
                    GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE,
                    tags=d_tags)
                self.gauge('nvml.throttle.temp.software', (throttle_reasons &
                    GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE) /
                    GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE,
                    tags=d_tags)
                self.gauge('nvml.throttle.unknown', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonUnknown) /
                    pynvml.nvmlClocksThrottleReasonUnknown,
                    tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetCurrentClocksThrottleReasons:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = ','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = 'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
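The check above also calls a _dict2list() helper defined elsewhere on the AgentCheck subclass. A plausible stand-in that turns a tag dict into the "key:value" list form Datadog gauges expect (an assumption, not the check's actual code):

def _dict2list(tags):
    # {'name': 'Tesla-0', 'pid': 1234} -> ['name:Tesla-0', 'pid:1234']
    return ["{}:{}".format(key, value) for key, value in tags.items()]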
Example 12
     pid,
     p.memory_full_info().pss / 1024. / 1024. / 1024.))
 ################# process resource usage ################################################
 ################# GPU resource usage #################################################
 pynvml.nvmlInit()
 deviceCount = pynvml.nvmlDeviceGetCount()
 for i in range(deviceCount):
     # here i is the GPU index
     handle = pynvml.nvmlDeviceGetHandleByIndex(i)
     print("GPU %d name %s" % (i, pynvml.nvmlDeviceGetName(handle)))
     print("GPU %d Driver %s" %
           (i, pynvml.nvmlSystemGetDriverVersion()))  # show driver version
     meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
     print("GPU %d mem info total : %.3f GByte" %
           (i, meminfo.total / 1024. / 1024. / 1024.))
     print("GPU %d mem info used : %.3f MByte" %
           (i, meminfo.used / 1024. / 1024.))
     print("GPU %d mem info free : %.3f MByte" %
           (i, meminfo.free / 1024. / 1024.))
     print("Temperature is %d℃" %
           pynvml.nvmlDeviceGetTemperature(handle, 0))
     print("Fan speed is %d%%" % pynvml.nvmlDeviceGetFanSpeed(handle))
     print("Power ststus P%d" % pynvml.nvmlDeviceGetPowerState(handle))
     print("Power ststus %.1fW" %
           (pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0))
     print("nvmlSystemGetProcessName is %s" %
           pynvml.nvmlSystemGetProcessName(pid))  # /usr/bin/python
     # print("nvmlSystemGetProcessName is %s" % pynvml.nvmlDeviceGetAccountingStats(handle, pid))
 # finally, shut down NVML
 pynvml.nvmlShutdown()
 ################# GPU resource usage #################################################
Example 13
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            deviceCount = 0
        for device_id in xrange(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # power info
            try:
                pwr = pynvml.nvmlDeviceGetPowerUsage(handle) // 1000
                self.gauge('nvml.power.', pwr, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetPowerUsage:{}'.format(err))
            # fan info
            try:
                fan = pynvml.nvmlDeviceGetFanSpeed(handle)
                self.gauge('nvml.fan.', fan, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetFanSpeed:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.debug('nvml.util.encoder %s' % long(util_encoder[0]))
                self.gauge('nvml.util.encoder', long(
                    util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.debug('nvml.util.decoder %s' % long(util_decoder[0]))
                self.gauge('nvml.util.decoder', long(
                    util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory',
                               ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
Example 14
        # decoder usage
        utilization, samplingPeriodUs = nvmlDeviceGetDecoderUtilization(handle)
        info['DEC Util'] = getBar(utilization)

        # power state
        power_used = nvmlDeviceGetPowerUsage(handle) / 1000
        power_limit = nvmlDeviceGetPowerManagementDefaultLimit(handle) / 1000
        power_used = int(power_used)
        power_limit = int(power_limit)
        power_rate = int(power_used / power_limit * 100)
        msg = pack_msg([power_used, power_limit], 'W')
        info['Power Util'] = getBar(power_rate, msg)

        # fan speed, temperature
        fan_speed = nvmlDeviceGetFanSpeed(handle)
        temp = nvmlDeviceGetTemperature(handle, 0)
        msg = f"{temp}C"
        info['Fan Speed'] = getBar(fan_speed, msg)

        message = [f"{k} \t{v}" for k, v in info.items()]
        print('\n'.join(message))

        # graphic processes
        graphic_processes = nvmlDeviceGetGraphicsRunningProcesses(handle)
        header = "\n=== Graphic Processes ==="
        show_process(header, graphic_processes)

        # compute processes
        compute_processes = nvmlDeviceGetComputeRunningProcesses(handle)
        header = "\n=== Compute Processes ==="
Example 15
def _get_full_status_nvml():
    devices_status = []
    devices_full_status = []
    for handle in _static_info['private']['gpu']['handles']:
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        process_info = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        devices_status.append({
            'utilization': {
                'gpu': util.gpu,
                'memory': util.memory
            },
            'memory': {
                'percent': int(1000.0 * mem_info.used / mem_info.total) / 10.0
            },
            'processes': len(process_info)
        })
        with _process_info_lock:
            process_list = []
            for p in process_info:
                info = _process_info[p.pid]
                info['gpu_memory'] = p.usedGpuMemory
                process_list.append(info)
        process_list.sort(key=lambda i: i['gpu_memory'] or 0, reverse=True)
        full_status = {
            'memory': {
                'free': mem_info.free,
                'used': mem_info.used
            },
            'process_list': process_list
        }
        try:
            full_status['fan_speed'] = pynvml.nvmlDeviceGetFanSpeed(handle)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['temperature'] = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['performance'] = pynvml.nvmlDeviceGetPerformanceState(
                handle)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['power'] = {
                'usage': pynvml.nvmlDeviceGetPowerUsage(handle),
                'limit': pynvml.nvmlDeviceGetPowerManagementLimit(handle)
            }
        except pynvml.NVMLError_NotSupported:
            pass
        devices_full_status.append(full_status)
    status = {
        'basic': {
            'devices': devices_status
        },
        'full': {
            'devices': devices_full_status
        }
    }
    return status
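A setup sketch for the module-level state _get_full_status_nvml() relies on; _static_info, _process_info and _process_info_lock are assumptions about globals populated elsewhere in the module:

import threading
import pynvml

pynvml.nvmlInit()
_static_info = {'private': {'gpu': {'handles': [
    pynvml.nvmlDeviceGetHandleByIndex(i)
    for i in range(pynvml.nvmlDeviceGetCount())
]}}}
_process_info = {}                     # pid -> info dict, filled by a monitor thread
_process_info_lock = threading.Lock()  # guards _process_info while status is built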
Example 16
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    # TODO: could be more information such as system memory
                    # usage, CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit':
                    power_limit // 1000 if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            return gpu_info
Example 17
def get_fan_speed(handle):
    try:
        return pynvml.nvmlDeviceGetFanSpeed(handle)
    except pynvml.NVMLError_NotSupported:
        return None
Example 18
    def _get_data(self):
        data = {}

        if self.deviceCount:
            for i in range(self.deviceCount):
                gpuIdx = str(i)
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                name = pynvml.nvmlDeviceGetName(handle)
                brand = pynvml.nvmlDeviceGetBrand(handle)

                ### Get data ###
                ## Memory usage
                try:
                    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                except Exception as e:
                    self.debug(str(e))
                    mem = None

                ## ECC errors
                try:
                    _memError = {}
                    _eccCounter = {}
                    eccErrors = {}
                    eccCounterType = ['VOLATILE_ECC', 'AGGREGATE_ECC']
                    memErrorType = [
                        'ERROR_TYPE_CORRECTED', 'ERROR_TYPE_UNCORRECTED'
                    ]
                    memoryLocationType = [
                        'L1_CACHE', 'L2_CACHE', 'DEVICE_MEMORY',
                        'REGISTER_FILE', 'TEXTURE_MEMORY'
                    ]
                    for memoryLocation in range(5):
                        for eccCounter in range(2):
                            for memError in range(2):
                                _memError[memErrorType[
                                    memError]] = pynvml.nvmlDeviceGetMemoryErrorCounter(
                                        handle, memError, eccCounter,
                                        memoryLocation)
                            _eccCounter[eccCounterType[eccCounter]] = _memError
                        eccErrors[
                            memoryLocationType[memoryLocation]] = _eccCounter
                except Exception as e:
                    self.debug(str(e))
                    eccErrors = None

                ## Temperature
                try:
                    temp = pynvml.nvmlDeviceGetTemperature(
                        handle, pynvml.NVML_TEMPERATURE_GPU)
                except Exception as e:
                    self.debug(str(e))
                    temp = None

                ## Fan
                try:
                    fanspeed = pynvml.nvmlDeviceGetFanSpeed(handle)
                except Exception as e:
                    self.debug(str(e))
                    fanspeed = None

                ## GPU and Memory Utilization
                try:
                    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                    gpu_util = util.gpu
                    mem_util = util.memory
                except Exception as e:
                    self.debug(str(e))
                    gpu_util = None
                    mem_util = None

                ## Encoder Utilization
                try:
                    encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                    enc_util = encoder[0]
                except Exception as e:
                    self.debug(str(e))
                    enc_util = None

                ## Decoder Utilization
                try:
                    decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                    dec_util = decoder[0]
                except Exception as e:
                    self.debug(str(e))
                    dec_util = None

                ## Clock frequencies
                try:
                    clock_core = pynvml.nvmlDeviceGetClockInfo(
                        handle, pynvml.NVML_CLOCK_GRAPHICS)
                    clock_sm = pynvml.nvmlDeviceGetClockInfo(
                        handle, pynvml.NVML_CLOCK_SM)
                    clock_mem = pynvml.nvmlDeviceGetClockInfo(
                        handle, pynvml.NVML_CLOCK_MEM) * self.nvMemFactor
                except Exception as e:
                    self.debug(str(e))
                    clock_core = None
                    clock_sm = None
                    clock_mem = None

                ### Packing data ###
                self.debug("Device", gpuIdx, ":", str(name))
                data["device_name_" + gpuIdx] = name

                self.debug("Brand:", str(brand))

                self.debug(str(name), "Temp      :", str(temp))
                data["device_temp_" + gpuIdx] = temp

                self.debug(str(name), "Mem total :", str(mem.total), 'bytes')
                data["device_mem_total_" + gpuIdx] = mem.total

                self.debug(str(name), "Mem used  :", str(mem.used), 'bytes')
                data["device_mem_used_" + gpuIdx] = mem.used

                self.debug(str(name), "Mem free  :", str(mem.free), 'bytes')
                data["device_mem_free_" + gpuIdx] = mem.free

                self.debug(str(name), "Load GPU  :", str(gpu_util), '%')
                data["device_load_gpu_" + gpuIdx] = gpu_util

                self.debug(str(name), "Load MEM  :", str(mem_util), '%')
                data["device_load_mem_" + gpuIdx] = mem_util

                self.debug(str(name), "Load ENC  :", str(enc_util), '%')
                data["device_load_enc_" + gpuIdx] = enc_util

                self.debug(str(name), "Load DEC  :", str(dec_util), '%')
                data["device_load_dec_" + gpuIdx] = dec_util

                self.debug(str(name), "Core clock:", str(clock_core), 'MHz')
                data["device_core_clock_" + gpuIdx] = clock_core

                self.debug(str(name), "SM clock  :", str(clock_sm), 'MHz')
                data["device_sm_clock_" + gpuIdx] = clock_sm

                self.debug(str(name), "Mem clock :", str(clock_mem), 'MHz')
                data["device_mem_clock_" + gpuIdx] = clock_mem

                self.debug(str(name), "Fan speed :", str(fanspeed), '%')
                data["device_fanspeed_" + gpuIdx] = fanspeed

                self.debug(str(name), "ECC errors:", str(eccErrors))
                if eccErrors is not None:
                    # Flatten the nested ECC dict into one metric per
                    # location / counter type / error type, keeping the
                    # original key names, e.g.
                    # device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_<idx>
                    locations = ("L1_CACHE", "L2_CACHE", "DEVICE_MEMORY",
                                 "REGISTER_FILE", "TEXTURE_MEMORY")
                    counters = (("VOLATILE", "VOLATILE_ECC"),
                                ("AGGREGATE", "AGGREGATE_ECC"))
                    errors = (("CORRECTED", "ERROR_TYPE_CORRECTED"),
                              ("UNCORRECTED", "ERROR_TYPE_UNCORRECTED"))
                    for location in locations:
                        for counter_key, counter in counters:
                            for error_key, error in errors:
                                key = "device_ecc_errors_{0}_{1}_{2}_{3}".format(
                                    location, counter_key, error_key, gpuIdx)
                                data[key] = eccErrors[location][counter][error]
                else:
                    data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" +
                         gpuIdx] = None

        ## Get unit (S-class Nvidia cards) data
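        # Unit queries only return data on S-class systems; self.unitCount is
        # presumably populated earlier from nvmlUnitGetCount()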
        if self.unitCount:
            for i in range(self.unitCount):
                gpuIdx = str(i)
                handle = pynvml.nvmlUnitGetHandleByIndex(i)

                try:
                    fans = pynvml.nvmlUnitGetFanSpeedInfo(handle)
                    # nvmlUnitGetFanSpeedInfo returns a struct with a .fans
                    # array and a .count, not flat .speed/.state attributes;
                    # report the unit's first fan here.
                    fan_speed = fans.fans[0].speed  # Fan speed (RPM)
                    fan_state = fans.fans[0].state  # Flag that indicates whether the fan is working properly
                except Exception as e:
                    self.debug(str(e))
                    fan_speed = None
                    fan_state = None

                try:
                    psu = pynvml.nvmlUnitGetPsuInfo(handle)
                    psu_current = psu.current  # PSU current (A)
                    psu_power = psu.power  # PSU power draw (W)
                    psu_state = psu.state  # The power supply state
                    psu_voltage = psu.voltage  # PSU voltage (V)
                except Exception as e:
                    self.debug(str(e))
                    psu_current = None
                    psu_power = None
                    psu_state = None
                    psu_voltage = None

                try:
                    temp_intake = pynvml.nvmlUnitGetTemperature(
                        handle, 0)  # Temperature at intake in C
                    temp_exhaust = pynvml.nvmlUnitGetTemperature(
                        handle, 1)  # Temperature at exhaust in C
                    temp_board = pynvml.nvmlUnitGetTemperature(
                        handle, 2)  # Temperature on board in C
                except Exception as e:
                    self.debug(str(e))
                    temp_intake = None
                    temp_exhaust = None
                    temp_board = None

                self.debug('Unit fan speed:', str(fan_speed))
                data["unit_fan_speed_" + gpuIdx] = fan_speed

                self.debug('Unit fan state:', str(fan_state))
                data["unit_fan_state_" + gpuIdx] = fan_state

                self.debug('Unit PSU current:', str(psu_current))
                data["unit_psu_current_" + gpuIdx] = psu_current

                self.debug('Unit PSU power:', str(psu_power))
                data["unit_psu_power_" + gpuIdx] = psu_power

                self.debug('Unit PSU state:', str(psu_state))
                data["unit_psu_state_" + gpuIdx] = psu_state

                self.debug('Unit PSU voltage:', str(psu_voltage))
                data["unit_psu_voltage_" + gpuIdx] = psu_voltage

                self.debug('Unit temp intake:', str(temp_intake))
                data["unit_temp_intake_" + gpuIdx] = temp_intake

                self.debug('Unit temp exhaust:', str(temp_exhaust))
                data["unit_temp_exhaust_" + gpuIdx] = temp_exhaust

                self.debug('Unit temp board:', str(temp_board))
                data["unit_temp_board_" + gpuIdx] = temp_board

        ## Get data via legacy mode
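        # Legacy fallback: shell out to nvidia-settings on display :0 and
        # regex-parse its output to backfill values NVML could not provide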
        if self.legacy:
            try:
                output, error = Popen([
                    "nvidia-settings", "-c", ":0", "-q", "GPUUtilization",
                    "-q", "GPUCurrentClockFreqs", "-q", "GPUCoreTemp", "-q",
                    "TotalDedicatedGPUMemory", "-q", "UsedDedicatedGPUMemory"
                ],
                                      shell=False,
                                      stdout=PIPE,
                                      stderr=PIPE).communicate()
                output = repr(str(output))
                if len(output) < 800:
                    raise Exception(
                        'Error in fetching data from nvidia-settings ' +
                        output)
                self.debug(str(error), output)
            except Exception as e:
                self.error(str(e))
                self.error('Setting legacy mode to False')
                self.legacy = False
                return data
            for i in range(self.deviceCount):
                gpuIdx = str(i)
                if data["device_temp_" + gpuIdx] is None:
                    coreTemp = findall(r'GPUCoreTemp.*?(gpu:\d*).*?\s(\d*)',
                                       output)[i][1]
                    try:
                        data["device_temp_" + gpuIdx] = int(coreTemp)
                        self.debug('Using legacy temp for GPU {0}: {1}'.format(
                            gpuIdx, coreTemp))
                    except Exception as e:
                        self.debug(str(e), "skipping device_temp_" + gpuIdx)
                if data["device_mem_used_" + gpuIdx] is None:
                    memUsed = findall(
                        r'UsedDedicatedGPUMemory.*?(gpu:\d*).*?\s(\d*)',
                        output)[i][1]
                    try:
                        data["device_mem_used_" + gpuIdx] = int(memUsed)
                        self.debug(
                            'Using legacy mem_used for GPU {0}: {1}'.format(
                                gpuIdx, memUsed))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_mem_used_" + gpuIdx)
                if data["device_load_gpu_" + gpuIdx] is None:
                    gpu_util = findall(
                        r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)',
                        output)[i][1]
                    try:
                        data["device_load_gpu_" + gpuIdx] = int(gpu_util)
                        self.debug(
                            'Using legacy load_gpu for GPU {0}: {1}'.format(
                                gpuIdx, gpu_util))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_load_gpu_" + gpuIdx)
                if data["device_load_mem_" + gpuIdx] is None:
                    mem_util = findall(
                        r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)',
                        output)[i][2]
                    try:
                        data["device_load_mem_" + gpuIdx] = int(mem_util)
                        self.debug(
                            'Using legacy load_mem for GPU {0}: {1}'.format(
                                gpuIdx, mem_util))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_load_mem_" + gpuIdx)
                if data["device_core_clock_" + gpuIdx] is None:
                    clock_core = findall(
                        r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)',
                        output)[i][1]
                    try:
                        data["device_core_clock_" + gpuIdx] = int(clock_core)
                        self.debug(
                            'Using legacy core_clock for GPU {0}: {1}'.format(
                                gpuIdx, clock_core))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_core_clock_" + gpuIdx)
                if data["device_mem_clock_" + gpuIdx] is None:
                    clock_mem = findall(
                        r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)',
                        output)[i][2]
                    try:
                        data["device_mem_clock_" + gpuIdx] = int(clock_mem)
                        self.debug(
                            'Using legacy mem_clock for GPU {0}: {1}'.format(
                                gpuIdx, clock_mem))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_mem_clock_" + gpuIdx)

        return data
Example n. 19
0
def _get_fan_speed_percent(gpu):
    return {'fan_speed_percent': pynvml.nvmlDeviceGetFanSpeed(gpu)}
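

# A minimal usage sketch for the helper above (not part of the original
# example): it assumes pynvml is importable, an NVIDIA driver is loaded,
# and that a GPU exists at index 0.
if __name__ == '__main__':
    import pynvml

    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        print(_get_fan_speed_percent(handle))  # e.g. {'fan_speed_percent': 35}
    finally:
        pynvml.nvmlShutdown()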