Exemple #1
0
 def _get_clock_info(gpu):
     """Read the current graphics, SM, memory and video clocks of a device.

     Args:
         gpu: NVML device handle passed straight to
             ``pynvml.nvmlDeviceGetClockInfo``.

     Returns:
         dict mapping ``clock_freq_*_mhz`` keys to the value NVML reports
         for the corresponding clock domain.
     """
     clock_domains = (
         ('clock_freq_gpu_mhz', pynvml.NVML_CLOCK_GRAPHICS),
         ('clock_freq_sm_mhz', pynvml.NVML_CLOCK_SM),
         ('clock_freq_memory_mhz', pynvml.NVML_CLOCK_MEM),
         ('clock_freq_video_mhz', pynvml.NVML_CLOCK_VIDEO),
     )
     return {
         label: pynvml.nvmlDeviceGetClockInfo(gpu, domain)
         for label, domain in clock_domains
     }
Exemple #2
0
def device_status(device_index):
    """Summarise one NVIDIA device and the compute processes running on it.

    Args:
        device_index: index handed to ``nvmlDeviceGetHandleByIndex``.

    Returns:
        dict with the keys ``type``, ``is_available``, ``pids``, ``users``,
        ``running_since``, ``utilization``, ``clock_mhz``, ``temperature``
        and ``cmd``. ``pids`` and ``users`` are comma-joined strings with one
        entry per compute process, in the same order; a process whose details
        cannot be read is listed with user ``'?'``. ``running_since`` and
        ``cmd`` are ``None`` when no process details could be read.
    """
    handle = nv.nvmlDeviceGetHandleByIndex(device_index)
    device_name = nv.nvmlDeviceGetName(handle)
    # Depending on the pynvml release the name arrives as bytes or as str;
    # only bytes need decoding.
    if isinstance(device_name, bytes):
        device_name = device_name.decode('UTF-8')
    nv_procs = nv.nvmlDeviceGetComputeRunningProcesses(handle)
    utilization = nv.nvmlDeviceGetUtilizationRates(handle).gpu
    clock_mhz = nv.nvmlDeviceGetClockInfo(handle, nv.NVML_CLOCK_SM)
    temperature = nv.nvmlDeviceGetTemperature(handle, nv.NVML_TEMPERATURE_GPU)
    pids = []
    users = []
    dates = []
    cmd = None
    for nv_proc in nv_procs:
        pid = nv_proc.pid
        pids.append(pid)
        try:
            proc = psutil.Process(pid)
            # Read everything before touching the result lists so a process
            # that vanishes half-way contributes exactly one entry to `users`.
            username = proc.username()
            started = proc.create_time()
            cmdline = proc.cmdline() if cmd is None else None
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            users.append('?')
            continue
        users.append(username)
        dates.append(started)
        if cmd is None:
            cmd = parse_cmd_roughly(cmdline)
    return {
        'type': device_name,
        'is_available': not pids,
        'pids': ','.join(str(pid) for pid in pids),
        'users': ','.join(users),
        'running_since':
        arrow.get(min(dates)).humanize() if dates else None,
        'utilization': utilization,
        'clock_mhz': clock_mhz,
        'temperature': temperature,
        'cmd': cmd,
    }
Exemple #3
0
    def _get_data(self):
        """Collect one sample of device, unit and legacy-mode metrics.

        Queries pynvml for every device in ``range(self.deviceCount)`` and
        every unit in ``range(self.unitCount)``. Each individual query that
        raises is logged through ``self.debug`` and its value reported as
        ``None``. When ``self.legacy`` is set, device values that are still
        ``None`` are filled in from the output of ``nvidia-settings``; if
        that command fails, ``self.legacy`` is switched off and the data
        gathered so far is returned.

        Returns:
            dict: flat mapping of ``"<metric>_<index>"`` keys to values
            (``None`` where a value could not be read).
        """
        data = {}

        # (label used in the data key, key used in the nested ECC dict).
        # The position in each tuple is the integer passed to NVML.
        ecc_counter_types = (('VOLATILE', 'VOLATILE_ECC'),
                             ('AGGREGATE', 'AGGREGATE_ECC'))
        mem_error_types = (('CORRECTED', 'ERROR_TYPE_CORRECTED'),
                           ('UNCORRECTED', 'ERROR_TYPE_UNCORRECTED'))
        memory_locations = ('L1_CACHE', 'L2_CACHE', 'DEVICE_MEMORY',
                            'REGISTER_FILE', 'TEXTURE_MEMORY')

        if self.deviceCount:
            for i in range(self.deviceCount):
                gpuIdx = str(i)
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                name = pynvml.nvmlDeviceGetName(handle)
                brand = pynvml.nvmlDeviceGetBrand(handle)

                ### Get data ###
                ## Memory usage
                try:
                    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                    mem_total = mem.total
                    mem_used = mem.used
                    mem_free = mem.free
                except Exception as e:
                    self.debug(str(e))
                    # Report None instead of dereferencing a missing result.
                    mem_total = None
                    mem_used = None
                    mem_free = None

                ## ECC errors
                try:
                    eccErrors = {}
                    for loc_idx, loc_name in enumerate(memory_locations):
                        # Fresh dicts per location / counter type so that the
                        # readings do not alias and overwrite one another.
                        per_counter = {}
                        for cnt_idx, (_, cnt_key) in enumerate(
                                ecc_counter_types):
                            per_error = {}
                            for err_idx, (_, err_key) in enumerate(
                                    mem_error_types):
                                per_error[err_key] = \
                                    pynvml.nvmlDeviceGetMemoryErrorCounter(
                                        handle, err_idx, cnt_idx, loc_idx)
                            per_counter[cnt_key] = per_error
                        eccErrors[loc_name] = per_counter
                except Exception as e:
                    self.debug(str(e))
                    eccErrors = None

                ## Temperature
                try:
                    temp = pynvml.nvmlDeviceGetTemperature(
                        handle, pynvml.NVML_TEMPERATURE_GPU)
                except Exception as e:
                    self.debug(str(e))
                    temp = None

                ## Fan
                try:
                    fanspeed = pynvml.nvmlDeviceGetFanSpeed(handle)
                except Exception as e:
                    self.debug(str(e))
                    fanspeed = None

                ## GPU and Memory Utilization
                try:
                    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                    gpu_util = util.gpu
                    mem_util = util.memory
                except Exception as e:
                    self.debug(str(e))
                    gpu_util = None
                    mem_util = None

                ## Encoder Utilization
                try:
                    encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                    enc_util = encoder[0]
                except Exception as e:
                    self.debug(str(e))
                    enc_util = None

                ## Decoder Utilization
                try:
                    decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                    dec_util = decoder[0]
                except Exception as e:
                    self.debug(str(e))
                    dec_util = None

                ## Clock frequencies
                try:
                    clock_core = pynvml.nvmlDeviceGetClockInfo(
                        handle, pynvml.NVML_CLOCK_GRAPHICS)
                    clock_sm = pynvml.nvmlDeviceGetClockInfo(
                        handle, pynvml.NVML_CLOCK_SM)
                    clock_mem = pynvml.nvmlDeviceGetClockInfo(
                        handle, pynvml.NVML_CLOCK_MEM) * self.nvMemFactor
                except Exception as e:
                    self.debug(str(e))
                    clock_core = None
                    clock_sm = None
                    clock_mem = None

                ### Packing data ###
                self.debug("Device", gpuIdx, ":", str(name))
                data["device_name_" + gpuIdx] = name

                self.debug("Brand:", str(brand))

                self.debug(str(name), "Temp      :", str(temp))
                data["device_temp_" + gpuIdx] = temp

                self.debug(str(name), "Mem total :", str(mem_total), 'bytes')
                data["device_mem_total_" + gpuIdx] = mem_total

                self.debug(str(name), "Mem used  :", str(mem_used), 'bytes')
                data["device_mem_used_" + gpuIdx] = mem_used

                self.debug(str(name), "Mem free  :", str(mem_free), 'bytes')
                data["device_mem_free_" + gpuIdx] = mem_free

                self.debug(str(name), "Load GPU  :", str(gpu_util), '%')
                data["device_load_gpu_" + gpuIdx] = gpu_util

                self.debug(str(name), "Load MEM  :", str(mem_util), '%')
                data["device_load_mem_" + gpuIdx] = mem_util

                self.debug(str(name), "Load ENC  :", str(enc_util), '%')
                data["device_load_enc_" + gpuIdx] = enc_util

                self.debug(str(name), "Load DEC  :", str(dec_util), '%')
                data["device_load_dec_" + gpuIdx] = dec_util

                self.debug(str(name), "Core clock:", str(clock_core), 'MHz')
                data["device_core_clock_" + gpuIdx] = clock_core

                self.debug(str(name), "SM clock  :", str(clock_sm), 'MHz')
                data["device_sm_clock_" + gpuIdx] = clock_sm

                self.debug(str(name), "Mem clock :", str(clock_mem), 'MHz')
                data["device_mem_clock_" + gpuIdx] = clock_mem

                self.debug(str(name), "Fan speed :", str(fanspeed), '%')
                data["device_fanspeed_" + gpuIdx] = fanspeed

                self.debug(str(name), "ECC errors:", str(eccErrors))
                if eccErrors is not None:
                    # Flatten the nested readings into keys of the form
                    # device_ecc_errors_<LOCATION>_<COUNTER>_<ERROR>_<idx>.
                    for loc_name in memory_locations:
                        for cnt_label, cnt_key in ecc_counter_types:
                            for err_label, err_key in mem_error_types:
                                key = "device_ecc_errors_{0}_{1}_{2}_{3}".format(
                                    loc_name, cnt_label, err_label, gpuIdx)
                                data[key] = eccErrors[loc_name][cnt_key][
                                    err_key]
                else:
                    # Only this single key is published when ECC data is
                    # unavailable (kept as in the original behaviour).
                    data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" +
                         gpuIdx] = None

        ## Get unit (S-class Nvidia cards) data
        if self.unitCount:
            for i in range(self.unitCount):
                gpuIdx = str(i)
                handle = pynvml.nvmlUnitGetHandleByIndex(i)

                try:
                    fan = pynvml.nvmlUnitGetFanSpeedInfo(handle)
                    fan_speed = fan.speed  # Fan speed (RPM)
                    fan_state = fan.state  # Flag that indicates whether fan is working properly
                except Exception as e:
                    self.debug(str(e))
                    fan_speed = None
                    fan_state = None

                try:
                    psu = pynvml.nvmlUnitGetPsuInfo(handle)
                    psu_current = psu.current  # PSU current (A)
                    psu_power = psu.power  # PSU power draw (W)
                    psu_state = psu.state  # The power supply state
                    psu_voltage = psu.voltage  # PSU voltage (V)
                except Exception as e:
                    self.debug(str(e))
                    psu_current = None
                    psu_power = None
                    psu_state = None
                    psu_voltage = None

                try:
                    temp_intake = pynvml.nvmlUnitGetTemperature(
                        handle, 0)  # Temperature at intake in C
                    temp_exhaust = pynvml.nvmlUnitGetTemperature(
                        handle, 1)  # Temperature at exhaust in C
                    temp_board = pynvml.nvmlUnitGetTemperature(
                        handle, 2)  # Temperature on board in C
                except Exception as e:
                    self.debug(str(e))
                    temp_intake = None
                    temp_exhaust = None
                    temp_board = None

                self.debug('Unit fan speed:', str(fan_speed))
                data["unit_fan_speed_" + gpuIdx] = fan_speed

                self.debug('Unit fan state:', str(fan_state))
                data["unit_fan_state_" + gpuIdx] = fan_state

                self.debug('Unit PSU current:', str(psu_current))
                data["unit_psu_current_" + gpuIdx] = psu_current

                self.debug('Unit PSU power:', str(psu_power))
                data["unit_psu_power_" + gpuIdx] = psu_power

                self.debug('Unit PSU state:', str(psu_state))
                data["unit_psu_state_" + gpuIdx] = psu_state

                self.debug('Unit PSU voltage:', str(psu_voltage))
                data["unit_psu_voltage_" + gpuIdx] = psu_voltage

                self.debug('Unit temp intake:', str(temp_intake))
                data["unit_temp_intake_" + gpuIdx] = temp_intake

                self.debug('Unit temp exhaust:', str(temp_exhaust))
                data["unit_temp_exhaust_" + gpuIdx] = temp_exhaust

                self.debug('Unit temp board:', str(temp_board))
                data["unit_temp_board_" + gpuIdx] = temp_board

        ## Get data via legacy mode
        if self.legacy:
            try:
                output, error = Popen(
                    [
                        "nvidia-settings", "-c", ":0", "-q", "GPUUtilization",
                        "-q", "GPUCurrentClockFreqs", "-q", "GPUCoreTemp",
                        "-q", "TotalDedicatedGPUMemory", "-q",
                        "UsedDedicatedGPUMemory"
                    ],
                    shell=False,
                    stdout=PIPE,
                    stderr=PIPE).communicate()
                output = repr(str(output))
                # Output shorter than 800 characters is treated as a failed
                # query.
                if len(output) < 800:
                    raise Exception(
                        'Error in fetching data from nvidia-settings ' +
                        output)
                self.debug(str(error), output)
            except Exception as e:
                self.error(str(e))
                self.error('Setting legacy mode to False')
                self.legacy = False
                return data

            utilization_re = r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)'
            clocks_re = r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)'
            # (metric label, pattern, index of the capture group to use)
            legacy_fields = (
                ('temp', r'GPUCoreTemp.*?(gpu:\d*).*?\s(\d*)', 1),
                ('mem_used',
                 r'UsedDedicatedGPUMemory.*?(gpu:\d*).*?\s(\d*)', 1),
                ('load_gpu', utilization_re, 1),
                ('load_mem', utilization_re, 2),
                ('core_clock', clocks_re, 1),
                ('mem_clock', clocks_re, 2),
            )
            for i in range(self.deviceCount or 0):
                gpuIdx = str(i)
                for label, pattern, group in legacy_fields:
                    key = "device_" + label + "_" + gpuIdx
                    if data[key] is not None:
                        continue
                    try:
                        # Inside the try so that a missing match for this GPU
                        # is logged and skipped instead of raising.
                        value = findall(pattern, output)[i][group]
                        data[key] = int(value)
                        self.debug('Using legacy {0} for GPU {1}: {2}'.format(
                            label, gpuIdx, value))
                    except Exception as e:
                        self.debug(str(e), "skipping " + key)

        return data
Exemple #4
0
 def mem_clock(self):
     """Return the current memory clock of ``self.dev`` as reported by NVML."""
     clock_domain = pynvml.NVML_CLOCK_MEM
     return pynvml.nvmlDeviceGetClockInfo(self.dev, clock_domain)
            util = nvmlDeviceGetUtilizationRates(handle)    #检索设备主要子系统的当前利用率
            gpu_util = str(util.gpu)    #gpu利用率
            mem_util = str(util.memory)    #显存利用率
        except pn.NVMLError, err:
	    gpu_util = 'NA'
	    mem_util = 'NA'            
        try:
            temp = pn.nvmlDeviceGetTemperature(handle, pn.NVML_TEMPERATURE_GPU)    #获取GPU当前温度
        except np.NVMLError, err:
	    temp = 'NA'
	try:
            powMan = pn.nvmlDeviceGetPowerManagementMode(handle)    #获取设备当前的电源管理模式
        except pn.NVMLError, err:
            powMan = 'NA'
        try:
            graphics_clock = pn.nvmlDeviceGetClockInfo(handle, pn.NVML_CLOCK_GRAPHICS)    #检索设备的当前时钟速度
        except pn.NVMLError, err:
            graphics_clock = 'NA'
	try:
            mem_clock = pn.nvmlDeviceGetClockInfo(handle, pn.NVML_CLOCK_MEM)
        except np.NVMLError, err:
	    mem_clock = 'NA'
	try:
	    perf_stat = pn.nvmlDeviceGetPowerState(handle)    #检索设备的当前性能状
	except np.NVMLError, err:
	    perf_stat = 'NA'
	tmp_dict['Gpu_Id'] = gpu_id
	tmp_dict['Product_Name'] = product_name
	tmp_dict['Mode'] = mode
	tmp_dict['Current_Driver_Model'] = Current_driver_model
	tmp_dict['Total_Memory'] = mem_total
Exemple #6
0
        def get_gpu_info(handle):
            """Build the status dictionary for the GPU behind an NVML handle.

            Every NVML query that raises ``N.NVMLError`` is reported as
            ``None``. Process details come from psutil; processes that psutil
            can no longer find are left out of the ``processes`` list.
            """

            def _query(nvml_call, *args):
                """Call an NVML getter; return None if it raises NVMLError."""
                try:
                    return nvml_call(*args)
                except N.NVMLError:
                    return None  # Not supported

            def get_process_info(nv_process):
                """Describe one NVML process entry using psutil."""
                pid = nv_process.pid
                known = GPUStatCollection.global_processes
                if pid not in known:
                    known[pid] = psutil.Process(pid=pid)
                ps_process = known[pid]

                # TODO: ps_process is being cached, but the dict below is not.
                username = ps_process.username()
                # cmdline returns the full path; keep only the short command
                # name, as `ps -o comm` does.
                cmdline = ps_process.cmdline()
                if cmdline:
                    command = os.path.basename(cmdline[0])
                    full_command = cmdline
                else:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    command = '?'
                    full_command = ['?']
                # Bytes to MBytes; left as None when NVML reports no usage
                # figure for the process.
                gpu_bytes = nv_process.usedGpuMemory
                usedmem = gpu_bytes // MB if gpu_bytes else None
                cpu_percent = ps_process.cpu_percent()
                cpu_memory = round(
                    (ps_process.memory_percent() / 100.0) *
                    psutil.virtual_memory().total)
                return {
                    'username': username,
                    'command': command,
                    'full_command': full_command,
                    'gpu_memory_usage': usedmem,
                    'cpu_percent': cpu_percent,
                    'cpu_memory_usage': cpu_memory,
                    'pid': pid,
                }

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            temperature = _query(
                N.nvmlDeviceGetTemperature, handle, N.NVML_TEMPERATURE_GPU)
            fan_speed = _query(N.nvmlDeviceGetFanSpeed, handle)
            memory = _query(N.nvmlDeviceGetMemoryInfo, handle)  # in Bytes
            utilization = _query(N.nvmlDeviceGetUtilizationRates, handle)
            utilization_enc = _query(
                N.nvmlDeviceGetEncoderUtilization, handle)
            utilization_dec = _query(
                N.nvmlDeviceGetDecoderUtilization, handle)
            power = _query(N.nvmlDeviceGetPowerUsage, handle)
            power_limit = _query(N.nvmlDeviceGetEnforcedPowerLimit, handle)
            # 2 and 1 are raw NVML clock-type ids -- presumably the memory and
            # SM clock domains; TODO confirm against the NVML enum.
            mem_clock = _query(N.nvmlDeviceGetClockInfo, handle, 2)
            core_clock = _query(N.nvmlDeviceGetClockInfo, handle, 1)
            nv_comp_processes = _query(
                N.nvmlDeviceGetComputeRunningProcesses, handle)
            nv_graphics_processes = _query(
                N.nvmlDeviceGetGraphicsRunningProcesses, handle)

            processes = None
            if (nv_comp_processes is not None
                    or nv_graphics_processes is not None):
                processes = []
                candidates = ((nv_comp_processes or []) +
                              (nv_graphics_processes or []))
                # A process may appear in both the compute and the graphics
                # list; report it only once.
                seen_pids = set()
                for nv_process in candidates:
                    if nv_process.pid in seen_pids:
                        continue
                    seen_pids.add(nv_process.pid)
                    try:
                        process = get_process_info(nv_process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        continue
                    except FileNotFoundError:
                        # Probably raised from psutil for a non-existent PID
                        # (see #95); it should have been translated, but
                        # psutil appears to have a bug here.
                        continue
                    processes.append(process)

                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                # Take a second cpu_percent reading after the pause.
                for process in processes:
                    cached = GPUStatCollection.global_processes[
                        process['pid']]
                    process['cpu_percent'] = cached.cpu_percent()

            index = N.nvmlDeviceGetIndex(handle)

            power_draw = power // 1000 if power is not None else None
            enforced_limit = (power_limit // 1000
                              if power_limit is not None else None)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'mem.clock': mem_clock,
                'core.clock': core_clock,
                'utilization.gpu': utilization.gpu if utilization else None,
                'utilization.enc':
                    utilization_enc[0] if utilization_enc else None,
                'utilization.dec':
                    utilization_dec[0] if utilization_dec else None,
                'power.draw': power_draw,
                'enforced.power.limit': enforced_limit,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            GPUStatCollection.clean_processes()
            return gpu_info
Exemple #7
0
    for i in range(nvmlDeviceGetCount()):
        info = defaultdict()

        driver_version = nvmlSystemGetDriverVersion().decode()
        info['Driver Version'] = driver_version

        info['GPU idx'] = i
        handle = nvmlDeviceGetHandleByIndex(i)

        # device name
        device_name = nvmlDeviceGetName(handle).decode()
        info['GPU Name'] = device_name

        # clock
        clk = nvmlDeviceGetClockInfo(handle, 0)
        max_clk = nvmlDeviceGetMaxClockInfo(handle, 0)
        clk_rate = int(clk / max_clk * 100)
        msg = pack_msg([clk, max_clk], 'MHz')
        info['GPU Clock'] = getBar(clk_rate, msg)

        # utilize
        util = nvmlDeviceGetUtilizationRates(handle)
        # memory
        mem_info = nvmlDeviceGetMemoryInfo(handle)
        mem_used = toMiB(mem_info.used)
        # mem_free = toMiB(info.free)
        mem_total = toMiB(mem_info.total)
        info['GPU Util'] = getBar(util.gpu)
        mem_rate = int(mem_used / mem_total * 100)
        msg = pack_msg([mem_used, mem_total], 'MiB')