Example #1
    def _get_data(self):
        data = {}

        if self.deviceCount:
            for i in range(self.deviceCount):
                gpuIdx = str(i)
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                name = pynvml.nvmlDeviceGetName(handle)
                brand = pynvml.nvmlDeviceGetBrand(handle)

                ### Get data ###
                ## Memory usage
                try:
                    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                except Exception as e:
                    self.debug(str(e))
                    mem = None

                ## ECC errors
                try:
                    eccErrors = {}
                    eccCounterType = ['VOLATILE_ECC', 'AGGREGATE_ECC']
                    memErrorType = [
                        'ERROR_TYPE_CORRECTED', 'ERROR_TYPE_UNCORRECTED'
                    ]
                    memoryLocationType = [
                        'L1_CACHE', 'L2_CACHE', 'DEVICE_MEMORY',
                        'REGISTER_FILE', 'TEXTURE_MEMORY'
                    ]
                    for memoryLocation in range(len(memoryLocationType)):
                        # Create fresh dicts per iteration; reusing a single
                        # dict would leave every location/counter pointing at
                        # the same (last-written) values.
                        _eccCounter = {}
                        for eccCounter in range(len(eccCounterType)):
                            _memError = {}
                            for memError in range(len(memErrorType)):
                                _memError[memErrorType[
                                    memError]] = pynvml.nvmlDeviceGetMemoryErrorCounter(
                                        handle, memError, eccCounter,
                                        memoryLocation)
                            _eccCounter[eccCounterType[eccCounter]] = _memError
                        eccErrors[
                            memoryLocationType[memoryLocation]] = _eccCounter
                except Exception as e:
                    self.debug(str(e))
                    eccErrors = None

                ## Temperature
                try:
                    temp = pynvml.nvmlDeviceGetTemperature(
                        handle, pynvml.NVML_TEMPERATURE_GPU)
                except Exception as e:
                    self.debug(str(e))
                    temp = None

                ## Fan
                try:
                    fanspeed = pynvml.nvmlDeviceGetFanSpeed(handle)
                except Exception as e:
                    self.debug(str(e))
                    fanspeed = None

                ## GPU and Memory Utilization
                try:
                    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                    gpu_util = util.gpu
                    mem_util = util.memory
                except Exception as e:
                    self.debug(str(e))
                    gpu_util = None
                    mem_util = None

                ## Encoder Utilization
                try:
                    encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                    enc_util = encoder[0]
                except Exception as e:
                    self.debug(str(e))
                    enc_util = None

                ## Decoder Utilization
                try:
                    decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                    dec_util = decoder[0]
                except Exception as e:
                    self.debug(str(e))
                    dec_util = None

                ## Clock frequencies
                try:
                    clock_core = pynvml.nvmlDeviceGetClockInfo(
                        handle, pynvml.NVML_CLOCK_GRAPHICS)
                    clock_sm = pynvml.nvmlDeviceGetClockInfo(
                        handle, pynvml.NVML_CLOCK_SM)
                    clock_mem = pynvml.nvmlDeviceGetClockInfo(
                        handle, pynvml.NVML_CLOCK_MEM) * self.nvMemFactor
                except Exception as e:
                    self.debug(str(e))
                    clock_core = None
                    clock_sm = None
                    clock_mem = None

                ### Packing data ###
                self.debug("Device", gpuIdx, ":", str(name))
                data["device_name_" + gpuIdx] = name

                self.debug("Brand:", str(brand))

                self.debug(str(name), "Temp      :", str(temp))
                data["device_temp_" + gpuIdx] = temp

                self.debug(str(name), "Mem total :", str(mem.total), 'bytes')
                data["device_mem_total_" + gpuIdx] = mem.total

                self.debug(str(name), "Mem used  :", str(mem.used), 'bytes')
                data["device_mem_used_" + gpuIdx] = mem.used

                self.debug(str(name), "Mem free  :", str(mem.free), 'bytes')
                data["device_mem_free_" + gpuIdx] = mem.free

                self.debug(str(name), "Load GPU  :", str(gpu_util), '%')
                data["device_load_gpu_" + gpuIdx] = gpu_util

                self.debug(str(name), "Load MEM  :", str(mem_util), '%')
                data["device_load_mem_" + gpuIdx] = mem_util

                self.debug(str(name), "Load ENC  :", str(enc_util), '%')
                data["device_load_enc_" + gpuIdx] = enc_util

                self.debug(str(name), "Load DEC  :", str(dec_util), '%')
                data["device_load_dec_" + gpuIdx] = dec_util

                self.debug(str(name), "Core clock:", str(clock_core), 'MHz')
                data["device_core_clock_" + gpuIdx] = clock_core

                self.debug(str(name), "SM clock  :", str(clock_sm), 'MHz')
                data["device_sm_clock_" + gpuIdx] = clock_sm

                self.debug(str(name), "Mem clock :", str(clock_mem), 'MHz')
                data["device_mem_clock_" + gpuIdx] = clock_mem

                self.debug(str(name), "Fan speed :", str(fanspeed), '%')
                data["device_fanspeed_" + gpuIdx] = fanspeed

                self.debug(str(name), "ECC errors:", str(eccErrors))
                if eccErrors is not None:
                    data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" +
                         gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"][
                             "ERROR_TYPE_CORRECTED"]
                    data["device_ecc_errors_L1_CACHE_VOLATILE_UNCORRECTED_" +
                         gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"][
                             "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_L1_CACHE_AGGREGATE_CORRECTED_" +
                         gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"][
                             "ERROR_TYPE_CORRECTED"]
                    data["device_ecc_errors_L1_CACHE_AGGREGATE_UNCORRECTED_" +
                         gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"][
                             "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_L2_CACHE_VOLATILE_CORRECTED_" +
                         gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"][
                             "ERROR_TYPE_CORRECTED"]
                    data["device_ecc_errors_L2_CACHE_VOLATILE_UNCORRECTED_" +
                         gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"][
                             "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_L2_CACHE_AGGREGATE_CORRECTED_" +
                         gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"][
                             "ERROR_TYPE_CORRECTED"]
                    data["device_ecc_errors_L2_CACHE_AGGREGATE_UNCORRECTED_" +
                         gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"][
                             "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_DEVICE_MEMORY_VOLATILE_CORRECTED_"
                         + gpuIdx] = eccErrors["DEVICE_MEMORY"][
                             "VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_DEVICE_MEMORY_VOLATILE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["DEVICE_MEMORY"]["VOLATILE_ECC"][
                            "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_DEVICE_MEMORY_AGGREGATE_CORRECTED_"
                         + gpuIdx] = eccErrors["DEVICE_MEMORY"][
                             "AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_DEVICE_MEMORY_AGGREGATE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["DEVICE_MEMORY"][
                            "AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_REGISTER_FILE_VOLATILE_CORRECTED_"
                         + gpuIdx] = eccErrors["REGISTER_FILE"][
                             "VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_REGISTER_FILE_VOLATILE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["REGISTER_FILE"]["VOLATILE_ECC"][
                            "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_REGISTER_FILE_AGGREGATE_CORRECTED_"
                         + gpuIdx] = eccErrors["REGISTER_FILE"][
                             "AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_REGISTER_FILE_AGGREGATE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["REGISTER_FILE"][
                            "AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_TEXTURE_MEMORY_VOLATILE_CORRECTED_"
                         + gpuIdx] = eccErrors["TEXTURE_MEMORY"][
                             "VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_TEXTURE_MEMORY_VOLATILE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["TEXTURE_MEMORY"][
                            "VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                    data[
                        "device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_CORRECTED_"
                        + gpuIdx] = eccErrors["TEXTURE_MEMORY"][
                            "AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["TEXTURE_MEMORY"][
                            "AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                else:
                    data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" +
                         gpuIdx] = None

        ## Get unit (S-class Nvidia cards) data
        if self.unitCount:
            for i in range(self.unitCount):
                gpuIdx = str(i)
                handle = pynvml.nvmlUnitGetHandleByIndex(i)

                try:
                    fan = pynvml.nvmlUnitGetFanSpeedInfo(handle)
                    fan_speed = fan.speed  # Fan speed (RPM)
                    fan_state = fan.state  # Flag that indicates whether fan is working properly
                except Exception as e:
                    self.debug(str(e))
                    fan_speed = None
                    fan_state = None

                try:
                    psu = pynvml.nvmlUnitGetPsuInfo(handle)
                    psu_current = psu.current  # PSU current (A)
                    psu_power = psu.power  # PSU power draw (W)
                    psu_state = psu.state  # The power supply state
                    psu_voltage = psu.voltage  # PSU voltage (V)
                except Exception as e:
                    self.debug(str(e))
                    psu_current = None
                    psu_power = None
                    psu_state = None
                    psu_voltage = None

                try:
                    temp_intake = pynvml.nvmlUnitGetTemperature(
                        handle, 0)  # Temperature at intake in C
                    temp_exhaust = pynvml.nvmlUnitGetTemperature(
                        handle, 1)  # Temperature at exhaust in C
                    temp_board = pynvml.nvmlUnitGetTemperature(
                        handle, 2)  # Temperature on board in C
                except Exception as e:
                    self.debug(str(e))
                    temp_intake = None
                    temp_exhaust = None
                    temp_board = None

                self.debug('Unit fan speed:', str(fan_speed))
                data["unit_fan_speed_" + gpuIdx] = fan_speed

                self.debug('Unit fan state:', str(fan_state))
                data["unit_fan_state_" + gpuIdx] = fan_state

                self.debug('Unit PSU current:', str(psu_current))
                data["unit_psu_current_" + gpuIdx] = psu_current

                self.debug('Unit PSU power:', str(psu_power))
                data["unit_psu_power_" + gpuIdx] = psu_power

                self.debug('Unit PSU state:', str(psu_state))
                data["unit_psu_state_" + gpuIdx] = psu_state

                self.debug('Unit PSU voltage:', str(psu_voltage))
                data["unit_psu_voltage_" + gpuIdx] = psu_voltage

                self.debug('Unit temp intake:', str(temp_intake))
                data["unit_temp_intake_" + gpuIdx] = temp_intake

                self.debug('Unit temp exhaust:', str(temp_exhaust))
                data["unit_temp_exhaust_" + gpuIdx] = temp_exhaust

                self.debug('Unit temp board:', str(temp_board))
                data["unit_temp_board_" + gpuIdx] = temp_board

        ## Get data via legacy mode
        if self.legacy:
            try:
                output, error = Popen(
                    ["nvidia-settings", "-c", ":0", "-q", "GPUUtilization",
                     "-q", "GPUCurrentClockFreqs", "-q", "GPUCoreTemp", "-q",
                     "TotalDedicatedGPUMemory", "-q", "UsedDedicatedGPUMemory"],
                    shell=False, stdout=PIPE, stderr=PIPE).communicate()
                output = repr(str(output))
                if len(output) < 800:
                    raise Exception(
                        'Error in fetching data from nvidia-settings ' +
                        output)
                self.debug(str(error), output)
            except Exception as e:
                self.error(str(e))
                self.error('Setting legacy mode to False')
                self.legacy = False
                return data
            for i in range(self.deviceCount):
                gpuIdx = str(i)
                if data["device_temp_" + gpuIdx] is None:
                    coreTemp = findall(r'GPUCoreTemp.*?(gpu:\d*).*?\s(\d*)',
                                       output)[i][1]
                    try:
                        data["device_temp_" + gpuIdx] = int(coreTemp)
                        self.debug('Using legacy temp for GPU {0}: {1}'.format(
                            gpuIdx, coreTemp))
                    except Exception as e:
                        self.debug(str(e), "skipping device_temp_" + gpuIdx)
                if data["device_mem_used_" + gpuIdx] is None:
                    memUsed = findall(
                        r'UsedDedicatedGPUMemory.*?(gpu:\d*).*?\s(\d*)',
                        output)[i][1]
                    try:
                        data["device_mem_used_" + gpuIdx] = int(memUsed)
                        self.debug(
                            'Using legacy mem_used for GPU {0}: {1}'.format(
                                gpuIdx, memUsed))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_mem_used_" + gpuIdx)
                if data["device_load_gpu_" + gpuIdx] is None:
                    gpu_util = findall(
                        r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)',
                        output)[i][1]
                    try:
                        data["device_load_gpu_" + gpuIdx] = int(gpu_util)
                        self.debug(
                            'Using legacy load_gpu for GPU {0}: {1}'.format(
                                gpuIdx, gpu_util))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_load_gpu_" + gpuIdx)
                if data["device_load_mem_" + gpuIdx] is None:
                    mem_util = findall(
                        r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)',
                        output)[i][2]
                    try:
                        data["device_load_mem_" + gpuIdx] = int(mem_util)
                        self.debug(
                            'Using legacy load_mem for GPU {0}: {1}'.format(
                                gpuIdx, mem_util))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_load_mem_" + gpuIdx)
                if data["device_core_clock_" + gpuIdx] is None:
                    clock_core = findall(
                        r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)',
                        output)[i][1]
                    try:
                        data["device_core_clock_" + gpuIdx] = int(clock_core)
                        self.debug(
                            'Using legacy core_clock for GPU {0}: {1}'.format(
                                gpuIdx, clock_core))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_core_clock_" + gpuIdx)
                if data["device_mem_clock_" + gpuIdx] is None:
                    clock_mem = findall(
                        r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)',
                        output)[i][2]
                    try:
                        data["device_mem_clock_" + gpuIdx] = int(clock_mem)
                        self.debug(
                            'Using legacy mem_clock for GPU {0}: {1}'.format(
                                gpuIdx, clock_mem))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_mem_clock_" + gpuIdx)

        return data
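
A note on context: _get_data() leans on attributes and helpers defined elsewhere in its class (self.deviceCount, self.unitCount, self.legacy, self.nvMemFactor, self.debug, self.error), plus Popen/PIPE and findall for the legacy branch. Below is a minimal sketch of that surrounding context; the class name, constructor, and logging fallbacks are assumptions for illustration, not the original module.

from re import findall                # legacy branch parses nvidia-settings output
from subprocess import Popen, PIPE    # legacy branch shells out to nvidia-settings

import pynvml


class GPUService(object):
    """Hypothetical host class for _get_data(); only the attribute
    names are taken from the snippet above."""

    def __init__(self, legacy=False, nvMemFactor=1):
        self.legacy = legacy
        self.nvMemFactor = nvMemFactor  # multiplier applied to NVML_CLOCK_MEM above
        pynvml.nvmlInit()
        try:
            self.deviceCount = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            self.deviceCount = 0
        try:
            self.unitCount = pynvml.nvmlUnitGetCount()  # S-class units only
        except pynvml.NVMLError:
            self.unitCount = 0

    def debug(self, *args):
        print(' '.join(str(a) for a in args))

    error = debug  # stand-in; the real class presumably logs errors separately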
Example #2
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
                ps_process = GPUStatCollection.global_processes[nv_process.pid]

                # TODO: ps_process is being cached, but the dict below is not.
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                    process['full_command'] = ['?']
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                    process['full_command'] = _cmdline
                # Bytes to MBytes
                # if drivers are not in TCC mode this will be None.
                usedmem = (nv_process.usedGpuMemory // MB if
                           nv_process.usedGpuMemory else None)
                process['gpu_memory_usage'] = usedmem
                # process['gpu_memory_usage'] = ("%d MiB" % usedmem if usedmem is not None else usedmem)
                process['cpu_percent'] = ps_process.cpu_percent()
                # process['cpu_memory_usage'] = "%d MiB" % (
                #     round((ps_process.memory_percent() / 100.0) * psutil.virtual_memory().total) // MB)
                process['cpu_memory_usage'] = (
                    round((ps_process.memory_percent() / 100.0) * psutil.virtual_memory().total) // MB)
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU
                )
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle)
            except N.NVMLError:
                utilization_enc = None  # Not supported

            try:
                utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
            except N.NVMLError:
                utilization_dec = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                # A single process might run in both graphics and compute
                # mode; however, we display each process only once.
                seen_pids = set()
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    if nv_process.pid in seen_pids:
                        continue
                    seen_pids.add(nv_process.pid)
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass
                    except FileNotFoundError:
                        # Ignore the exception which probably has occurred
                        # from psutil, due to a non-existent PID (see #95).
                        # The exception should have been translated, but
                        # there appears to be a bug in psutil. It is unlikely
                        # FileNotFoundError is thrown in different situations.
                        pass

                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                for process in processes:
                    pid = process['pid']
                    cache_process = GPUStatCollection.global_processes[pid]
                    try:
                        process['cpu_percent'] = cache_process.cpu_percent()
                    except psutil.NoSuchProcess:
                        process['cpu_percent'] = 0.0
                    except FileNotFoundError:
                        # Ignore the exception which probably has occurred
                        # from psutil, due to a non-existent PID (see #95).
                        # The exception should have been translated, but
                        # there appears to be a bug in psutil. It is unlikely
                        # FileNotFoundError is thrown in different situations.
                        process['cpu_percent'] = 0.0

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else 0,
                'utilization.enc':
                    utilization_enc[0] if utilization_enc else None,
                'utilization.dec':
                    utilization_dec[0] if utilization_dec else None,
                'power.draw': power // 1000 if power is not None else 0,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else 0,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else 0,
                'memory.total': memory.total // MB if memory else 0,
                'processes': processes,
            }
            GPUStatCollection.clean_processes()
            return gpu_info
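
get_gpu_info() is a closure from a gpustat-style collector: it assumes N is the pynvml module, MB is a bytes-to-MiB divisor, _decode normalizes the bytes older NVML bindings return, and a GPUStatCollection with global_processes / clean_processes exists. A minimal driver sketch under those assumptions (the query function name is illustrative, not gpustat's public API):

import os
import time

import psutil
import pynvml as N

MB = 1024 * 1024


def _decode(b):
    # Older NVML bindings return bytes; newer ones return str.
    return b.decode('utf-8') if isinstance(b, bytes) else b


def query_all_gpus():
    # Query every visible device once, assuming get_gpu_info is in scope.
    N.nvmlInit()
    try:
        return [get_gpu_info(N.nvmlDeviceGetHandleByIndex(i))
                for i in range(N.nvmlDeviceGetCount())]
    finally:
        N.nvmlShutdown()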
Example #3
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        gpus_in_use = 0
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            deviceCount = 0
        for device_id in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
                gpus_in_use += 1 if util_rate.memory > 50.0 else 0
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.debug('nvml.util.encoder %s' % int(util_encoder[0]))
                self.gauge('nvml.util.encoder', int(util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.debug('nvml.util.decoder %s' % int(util_decoder[0]))
                self.gauge('nvml.util.decoder', int(util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                self.gauge('nvml.process.count', len(cps), d_tags)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['pname'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                    p_tags['puser'] = self.get_process_owner(ps.pid)
                    docker_name, docker_image = self.get_container_name(ps.pid)
                    p_tags['docker_image'] = docker_image
                    p_tags['docker_name'] = docker_name
                    p_tags = self._dict2list(p_tags)
                    self.log.debug(p_tags)
                    self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        self.gauge('nvml.gpus_in_use_count', gpus_in_use)
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
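
The check converts a tag dict into the list form the Datadog API expects via self._dict2list(), which is not shown in the snippet. A plausible one-liner, assuming Datadog's usual "key:value" tag format:

    def _dict2list(self, tags):
        # Datadog tags are "key:value" strings; d_tags above is consumed
        # in that form. This helper is an assumption, not the original.
        return ['{}:{}'.format(k, v) for k, v in tags.items()]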
Example #4
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
                ps_process = GPUStatCollection.global_processes[nv_process.pid]
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                    process['full_command'] = ['?']
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                    process['full_command'] = _cmdline
                # Bytes to MBytes
                # if drivers are not in TCC mode this will be None.
                usedmem = nv_process.usedGpuMemory // MB if \
                          nv_process.usedGpuMemory else None
                process['gpu_memory_usage'] = usedmem
                process['cpu_percent'] = ps_process.cpu_percent()
                process['cpu_memory_usage'] = \
                    round((ps_process.memory_percent() / 100.0) *
                          psutil.virtual_memory().total)
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle)
            except N.NVMLError:
                utilization_enc = None  # Not supported

            try:
                utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
            except N.NVMLError:
                utilization_dec = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                for process in processes:
                    pid = process['pid']
                    cache_process = GPUStatCollection.global_processes[pid]
                    process['cpu_percent'] = cache_process.cpu_percent()

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else None,
                'utilization.enc':
                    utilization_enc[0] if utilization_enc else None,
                'utilization.dec':
                    utilization_dec[0] if utilization_dec else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit':
                    power_limit // 1000 if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            GPUStatCollection.clean_processes()
            return gpu_info
Example #5
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            deviceCount = 0
        for device_id in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.info('nvml.util.encoder %s' % int(util_encoder[0]))
                self.gauge('nvml.util.encoder', int(util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.info('nvml.util.decoder %s' % int(util_decoder[0]))
                self.gauge('nvml.util.decoder', int(util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = psutil.Process(ps.pid).name()
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
Example #6
 def info_refresh(self):
     
     try:
         stat = open("/proc/stat")
         self.statlines = stat.read().splitlines()[1:-1]
         stat.close()
         
     except IOError:
         print("Problem opening /proc/stat, exiting..")
         pynvml.nvmlShutdown()
         quit()
     
     for i in range(self.corecount):
         for j in self.statlines[i].split()[1:]:  # skip the "cpuN" label
             self.total[i] += int(j)
         self.idle[i] = int(self.statlines[i].split()[4])
     
     for i in range(self.corecount):
         if (self.total[i] - self.prev_total[i]) == 0:
             self.prev_idle[i] = self.idle[i]
             self.prev_total[i] = self.total[i]
             self.idle[i] = 0
             self.total[i] = 0
             continue  # a break here would skip updating the remaining cores

         self.cpu_prog_bars[i].set_fraction(1 - ((self.idle[i] - self.prev_idle[i]) / (self.total[i] - self.prev_total[i])))
         self.prev_idle[i] = self.idle[i]
         self.prev_total[i] = self.total[i]
         self.idle[i] = 0
         self.total[i] = 0
     
     for i in range(self.deviceCount):
         
         util = pynvml.nvmlDeviceGetUtilizationRates(self.gpu_handles[i])
         temp = pynvml.nvmlDeviceGetTemperature(self.gpu_handles[i], pynvml.NVML_TEMPERATURE_GPU)
         memInfo = pynvml.nvmlDeviceGetMemoryInfo(self.gpu_handles[i])
         (encoder_util, sPeriod) = pynvml.nvmlDeviceGetEncoderUtilization(self.gpu_handles[i])
         (decoder_util, sPeriod) = pynvml.nvmlDeviceGetDecoderUtilization(self.gpu_handles[i])
         
         mem_total = memInfo.total / 1024 / 1024
         mem_used = memInfo.used / 1024 / 1024
         
         self.gpu_prog_bars[i*6].set_text("GPU: %d%%" % util.gpu)
         self.gpu_prog_bars[i*6].set_fraction(util.gpu / 100)
         ########
         self.util_history.append(util.gpu)
         self.util_graph.queue_draw()
         
         self.temp_history.append(temp)
         self.temp_graph.queue_draw()
         ########
         self.gpu_prog_bars[i*6 +1].set_text("Memory Utilization: %d%%" % util.memory)
         self.gpu_prog_bars[i*6 +1].set_fraction(util.memory / 100)
         
         self.gpu_prog_bars[i*6 +4].set_text("Encoder: %d%%" % encoder_util)
         self.gpu_prog_bars[i*6 +5].set_text("Decoder: %d%%" % decoder_util)
         self.gpu_prog_bars[i*6 +4].set_fraction(encoder_util / 100)
         self.gpu_prog_bars[i*6 +5].set_fraction(decoder_util / 100)
         
         self.gpu_prog_bars[i*6 +2].set_text("Memory Usage: %d MiB/%d MiB" % (mem_used, mem_total))
         self.gpu_prog_bars[i*6 +2].set_fraction(mem_used / mem_total)
         
         self.gpu_prog_bars[i*6 +3].set_text("Temperature: %d °C" % temp)
         if temp > 100:
            temp = 100
         elif temp < 0:
             temp = 0
         self.gpu_prog_bars[i*6 +3].set_fraction(temp / 100)
         
         
     #--proc--
     procs = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.gpu_handles[0])
     
     proc_liststore = Gtk.ListStore(int, str, int)
     
     for p in procs:
         pid = p.pid
         try:
             path = pynvml.nvmlSystemGetProcessName(p.pid).decode('utf-8')
         except pynvml.NVMLError:
             self.exit()
         if p.usedGpuMemory is None:
             mem = 0
         else:
             mem = p.usedGpuMemory // 1024 // 1024  # keep int for the ListStore int column
         proc_liststore.append([pid, path, mem])
     self.tree.set_model(proc_liststore)
     return True
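
The CPU section above implements the standard /proc/stat sampling formula: per-core load over an interval is 1 - Δidle/Δtotal, where field 4 of each "cpuN" line is the idle jiffies counter. A standalone restatement of that arithmetic:

def core_load(prev_total, prev_idle, total, idle):
    # Load between two /proc/stat samples: 1 - (delta idle / delta total).
    d_total = total - prev_total
    if d_total == 0:
        return 0.0  # no jiffies elapsed; avoid a division by zero
    return 1.0 - (idle - prev_idle) / d_total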
Example #7
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            deviceCount = 0
        # Number of active GPUs
        self.gauge('nvml.gpus.number', deviceCount)
        for device_id in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append('nvmlDeviceGetTemperature:{}'.format(err))
            # power info
            try:
                pwr = pynvml.nvmlDeviceGetPowerUsage(handle) // 1000
                self.gauge('nvml.power.', pwr, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append('nvmlDeviceGetPowerUsage:{}'.format(err))
            # fan info
            try:
                fan = pynvml.nvmlDeviceGetFanSpeed(handle)
                self.gauge('nvml.fan.', fan, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append('nvmlDeviceGetFanSpeed:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append('nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.debug('nvml.util.encoder %s' % int(util_encoder[0]))
                self.gauge('nvml.util.encoder', int(
                    util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.debug('nvml.util.decoder %s' % int(util_decoder[0]))
                self.gauge('nvml.util.decoder', int(
                    util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory',
                               ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
            # Clocks throttling info
            # Divide by the mask so that the value is either 0 or 1 per GPU
            try:
                throttle_reasons = (
                    pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(handle))
                self.gauge('nvml.throttle.appsettings', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting) /
                    pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting,
                    tags=d_tags)
                self.gauge('nvml.throttle.display', (throttle_reasons &
                    GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS) /
                    GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS,
                    tags=d_tags)
                self.gauge('nvml.throttle.hardware', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonHwSlowdown) /
                    pynvml.nvmlClocksThrottleReasonHwSlowdown,
                    tags=d_tags)
                self.gauge('nvml.throttle.idle', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonGpuIdle) /
                    pynvml.nvmlClocksThrottleReasonGpuIdle,
                    tags=d_tags)
                self.gauge('nvml.throttle.power.hardware', (throttle_reasons &
                    GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE) /
                    GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE,
                    tags=d_tags)
                self.gauge('nvml.throttle.power.software', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonSwPowerCap) /
                    pynvml.nvmlClocksThrottleReasonSwPowerCap,
                    tags=d_tags)
                self.gauge('nvml.throttle.syncboost', (throttle_reasons &
                    GPU_THROTTLE_SYNCBOOST) / GPU_THROTTLE_SYNCBOOST,
                    tags=d_tags)
                self.gauge('nvml.throttle.temp.hardware', (throttle_reasons &
                    GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE) /
                    GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE,
                    tags=d_tags)
                self.gauge('nvml.throttle.temp.software', (throttle_reasons &
                    GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE) /
                    GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE,
                    tags=d_tags)
                self.gauge('nvml.throttle.unknown', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonUnknown) /
                    pynvml.nvmlClocksThrottleReasonUnknown,
                    tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetCurrentClocksThrottleReasons:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = ','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = 'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
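
The GPU_THROTTLE_* names in the throttling block are module-level constants this check defines elsewhere, presumably as fallbacks for throttle reasons older pynvml releases did not expose. A sketch of plausible definitions, using the bitmask values from NVML's nvml.h (verify against your driver's headers before relying on them):

GPU_THROTTLE_SYNCBOOST = 0x0000000000000010                       # SyncBoost
GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE = 0x0000000000000020      # SwThermalSlowdown
GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE = 0x0000000000000040      # HwThermalSlowdown
GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE = 0x0000000000000080  # HwPowerBrakeSlowdown
GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS = 0x0000000000000100        # DisplayClockSetting

Because each gauge divides the masked value by the mask itself, every metric reports exactly 0 or 1 per GPU, as the comment above the block notes.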
Example #8
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            deviceCount = 0
        for device_id in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # power info
            try:
                pwr = pynvml.nvmlDeviceGetPowerUsage(handle) // 1000
                self.gauge('nvml.power.', pwr, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetPowerUsage:{}'.format(err))
            # fan info
            try:
                fan = pynvml.nvmlDeviceGetFanSpeed(handle)
                self.gauge('nvml.fan.', fan, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetFanSpeed:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.debug('nvml.util.encoder %s' % int(util_encoder[0]))
                self.gauge('nvml.util.encoder', int(
                    util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.debug('nvml.util.decoder %s' % int(util_decoder[0]))
                self.gauge('nvml.util.decoder', int(
                    util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory',
                               ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)