Example #1
def pwr_limit(self, new_limit):
    """Validate and apply a new power management limit (in milliwatts) via NVML."""
    if not self.pwr_constraints[0] <= new_limit <= self.pwr_constraints[1]:
        raise ValueError(
            f"Power limit {new_limit} out of range [{self.pwr_constraints[0]}, {self.pwr_constraints[1]}]"
        )
    pynvml.nvmlDeviceSetPowerManagementLimit(self.dev, new_limit)
    # read the limit back so the cached value reflects what the driver actually applied
    self._pwr_limit = pynvml.nvmlDeviceGetPowerManagementLimit(self.dev)
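This setter assumes self.pwr_constraints holds the (min, max) pair returned by nvmlDeviceGetPowerManagementLimitConstraints, as in Example #2, and that limits are expressed in milliwatts. A minimal standalone sketch of the same pattern; the device index 0 and the 200 W target are illustrative only, and setting the limit usually requires administrator privileges:

import pynvml

pynvml.nvmlInit()
dev = pynvml.nvmlDeviceGetHandleByIndex(0)  # illustrative: first GPU

# NVML works in milliwatts throughout
min_mw, max_mw = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(dev)
target_mw = 200 * 1000  # hypothetical 200 W target

if not min_mw <= target_mw <= max_mw:
    raise ValueError(f"Power limit {target_mw} out of range [{min_mw}, {max_mw}]")
pynvml.nvmlDeviceSetPowerManagementLimit(dev, target_mw)  # typically needs root

pynvml.nvmlShutdown()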
Example #2
    def __init__(self, id=0):
        """Create object to control device using NVML"""

        pynvml.nvmlInit()
        self.dev = pynvml.nvmlDeviceGetHandleByIndex(id)

        try:
            self._pwr_limit = pynvml.nvmlDeviceGetPowerManagementLimit(
                self.dev)
            self.pwr_constraints = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(
                self.dev)
        except pynvml.NVMLError_NotSupported:
            self._pwr_limit = None
            self.pwr_constraints = [
                1, 0
            ]  # inverted range to make all range checks fail

        try:
            self._persistence_mode = pynvml.nvmlDeviceGetPersistenceMode(
                self.dev)
        except pynvml.NVMLError_NotSupported:
            self._persistence_mode = None

        try:
            self._auto_boost = pynvml.nvmlDeviceGetAutoBoostedClocksEnabled(
                self.dev)[0]  # returns [isEnabled, isDefaultEnabled]
        except pynvml.NVMLError_NotSupported:
            self._auto_boost = None

        try:
            self.gr_clock_default = pynvml.nvmlDeviceGetDefaultApplicationsClock(
                self.dev, pynvml.NVML_CLOCK_GRAPHICS)
            self.sm_clock_default = pynvml.nvmlDeviceGetDefaultApplicationsClock(
                self.dev, pynvml.NVML_CLOCK_SM)
            self.mem_clock_default = pynvml.nvmlDeviceGetDefaultApplicationsClock(
                self.dev, pynvml.NVML_CLOCK_MEM)

            self.supported_mem_clocks = pynvml.nvmlDeviceGetSupportedMemoryClocks(
                self.dev)

            # gather the supported gr clocks for each supported mem clock into a dict
            self.supported_gr_clocks = dict()
            for mem_clock in self.supported_mem_clocks:
                supported_gr_clocks = pynvml.nvmlDeviceGetSupportedGraphicsClocks(
                    self.dev, mem_clock)
                self.supported_gr_clocks[mem_clock] = supported_gr_clocks
        except pynvml.NVMLError_NotSupported:
            self.gr_clock_default = None
            self.sm_clock_default = None
            self.mem_clock_default = None
            self.supported_mem_clocks = []
            self.supported_gr_clocks = dict()
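Each NVML query above is wrapped in its own try/except so that a single unsupported feature does not disable the rest of the constructor. A minimal standalone sketch of the same pattern for the clock queries, assuming device index 0; clock values are reported in MHz:

import pynvml

pynvml.nvmlInit()
dev = pynvml.nvmlDeviceGetHandleByIndex(0)

try:
    mem_clocks = pynvml.nvmlDeviceGetSupportedMemoryClocks(dev)
except pynvml.NVMLError_NotSupported:
    mem_clocks = []  # feature not available on this GPU

# map each supported memory clock to the graphics clocks it can be paired with
gr_clocks = {mc: pynvml.nvmlDeviceGetSupportedGraphicsClocks(dev, mc)
             for mc in mem_clocks}

pynvml.nvmlShutdown()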
Example #3
def get_perf(proc=None, recursive=True, children_pool=None, metrics=None):
    """
    Get process performance metrics
    """
    _initialize_pynvml()

    if metrics is None:
        metrics = OrderedDict()

    metrics['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S')
    metrics['cpu_total'] = psutil.cpu_count()
    metrics['cpu_used'] = psutil.cpu_percent()
    if proc:
        if recursive:
            percents = []
            _recursive_proc(percents, proc, children_pool,
                            lambda p: p.cpu_percent())
            metrics['proc_count'] = len(percents)
            metrics['cpu_used_proc'] = sum(percents)
        else:
            metrics['cpu_used_proc'] = proc.cpu_percent()

    mem = psutil.virtual_memory()
    metrics['mem_total'] = mem.total
    metrics['mem_used'] = mem.used
    if proc:
        if recursive:
            rss = []
            _recursive_proc(rss, proc, children_pool,
                            lambda p: p.memory_info().rss)
            metrics['mem_used_proc'] = sum(rss)
        else:
            metrics['mem_used_proc'] = proc.memory_info().rss

    for i, h in enumerate(_gpu_devices):
        used = pynvml.nvmlDeviceGetUtilizationRates(h)
        mem = pynvml.nvmlDeviceGetMemoryInfo(h)
        metrics[f'gpu_{i}_used'] = used.gpu
        # mem.used is in bytes; used.memory would be a utilization percentage instead
        metrics[f'gpu_{i}_mem_used'] = mem.used
        metrics[f'gpu_{i}_mem_total'] = mem.total
        # NVML reports power values in milliwatts
        metrics[f'gpu_{i}_power_used'] = pynvml.nvmlDeviceGetPowerUsage(h)
        metrics[f'gpu_{i}_power_total'] = pynvml.nvmlDeviceGetPowerManagementLimit(h)

    return metrics
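get_perf depends on two module-level pieces the example does not show: _initialize_pynvml and the _gpu_devices list it iterates over. A plausible sketch of those helpers, assuming they simply initialise NVML once and cache one handle per visible GPU:

import pynvml

_gpu_devices = []
_pynvml_initialized = False

def _initialize_pynvml():
    """Initialise NVML once and cache a handle per GPU (sketch of an assumed helper)."""
    global _pynvml_initialized
    if _pynvml_initialized:
        return
    pynvml.nvmlInit()
    for i in range(pynvml.nvmlDeviceGetCount()):
        _gpu_devices.append(pynvml.nvmlDeviceGetHandleByIndex(i))
    _pynvml_initialized = True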
Example #4
def _get_full_status_nvml():
    """Collect basic and detailed status for every NVML GPU handle."""
    devices_status = []
    devices_full_status = []
    for handle in _static_info['private']['gpu']['handles']:
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        process_info = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        devices_status.append({
            'utilization': {
                'gpu': util.gpu,
                'memory': util.memory
            },
            'memory': {
                'percent': int(1000.0 * mem_info.used / mem_info.total) / 10.0  # percent used, one decimal
            },
            'processes': len(process_info)
        })
        with _process_info_lock:
            process_list = []
            for p in process_info:
                info = _process_info[p.pid]
                info['gpu_memory'] = p.usedGpuMemory
                process_list.append(info)
        process_list.sort(key=lambda i: i['gpu_memory'] or 0, reverse=True)
        full_status = {
            'memory': {
                'free': mem_info.free,
                'used': mem_info.used
            },
            'process_list': process_list
        }
        try:
            full_status['fan_speed'] = pynvml.nvmlDeviceGetFanSpeed(handle)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['temperature'] = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['performance'] = pynvml.nvmlDeviceGetPerformanceState(
                handle)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['power'] = {
                'usage': pynvml.nvmlDeviceGetPowerUsage(handle),
                'limit': pynvml.nvmlDeviceGetPowerManagementLimit(handle)
            }
        except pynvml.NVMLError_NotSupported:
            pass
        devices_full_status.append(full_status)
    status = {
        'basic': {
            'devices': devices_status
        },
        'full': {
            'devices': devices_full_status
        }
    }
    return status
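The function reads its GPU handles from a module-level _static_info dictionary and its per-PID data from _process_info, neither of which is shown. A plausible initialisation sketch; the helper name _init_gpu_static_info and everything beyond the dictionary keys used above are assumptions:

import threading
import pynvml

_static_info = {'private': {'gpu': {'handles': []}}}
_process_info = {}  # pid -> process info dict, filled elsewhere
_process_info_lock = threading.Lock()

def _init_gpu_static_info():
    """Collect the NVML handles that _get_full_status_nvml iterates over (sketch)."""
    pynvml.nvmlInit()
    _static_info['private']['gpu']['handles'] = [
        pynvml.nvmlDeviceGetHandleByIndex(i)
        for i in range(pynvml.nvmlDeviceGetCount())
    ]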
Example #5
def _get_power_limit_watts(gpu):
    # NVML returns the limit in milliwatts; convert to watts
    return {
        'power_limit_watts': pynvml.nvmlDeviceGetPowerManagementLimit(gpu) / 1000.0
    }
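A short usage sketch, assuming NVML has already been initialised elsewhere is not required here; device index 0 and the value in the comment are illustrative only:

import pynvml

pynvml.nvmlInit()
gpu = pynvml.nvmlDeviceGetHandleByIndex(0)  # illustrative: first GPU
print(_get_power_limit_watts(gpu))          # e.g. {'power_limit_watts': 250.0}
pynvml.nvmlShutdown()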