Esempio n. 1
0
    def get_stats(self):
        """
        Collect system, process and GPU usage statistics.

        Returns:
            A ``(system, gpus)`` tuple.

            ``system`` is a dict with the keys ``cpu``, ``memory_percent``,
            ``p_memory_percent`` and ``disk_percent``.

            ``gpus`` is a list with one dict per NVML device, each holding
            ``gpu``, ``gpu_memory_percent``, ``gpu_power_watts`` and
            ``gpu_temp``.

            Every value is passed through ``round10e5``.

        GPU collection is best-effort: if any NVML call fails, the devices
        gathered so far are returned and no exception is propagated.
        """
        memory_usage = psutil.virtual_memory()
        disk_usage = psutil.disk_usage('/')
        system = {
            # CPU utilization percent of the tracked process
            # (can be over 100%)
            'cpu':
            round10e5(self._process.cpu_percent(0.0)),

            # Whole system memory usage
            'memory_percent':
            round10e5(memory_usage.used * 100 / memory_usage.total),

            # Portion of memory occupied by the tracked process
            'p_memory_percent':
            round10e5(self._process.memory_percent()),

            # Usage of the disk mounted at '/'
            'disk_percent':
            round10e5(disk_usage.percent),
        }

        # Collect GPU statistics
        gpus = []
        try:
            for i in range(nvml.nvmlDeviceGetCount()):
                handle = nvml.nvmlDeviceGetHandleByIndex(i)

                # Get device utilization, memory and temperature
                util = nvml.nvmlDeviceGetUtilizationRates(handle)
                memory = nvml.nvmlDeviceGetMemoryInfo(handle)
                temp = nvml.nvmlDeviceGetTemperature(
                    handle, nvml.NVML_TEMPERATURE_GPU)

                # NVML reports power draw in milliwatts; convert to watts.
                # The enforced power limit is intentionally not queried:
                # no reported field uses it, and on devices that do not
                # support the query it would abort GPU collection.
                power_watts = nvml.nvmlDeviceGetPowerUsage(handle) / 1000

                gpus.append({
                    # GPU utilization percent
                    'gpu':
                    round10e5(util.gpu),

                    # Device memory usage
                    'gpu_memory_percent':
                    round10e5(memory.used * 100 / memory.total),

                    # Power usage in watts
                    'gpu_power_watts':
                    round10e5(power_watts),

                    # Device temperature
                    'gpu_temp':
                    round10e5(temp),
                })
        except Exception:
            # Deliberate best-effort: a failing NVML call must not break
            # the system statistics; keep whatever was collected.
            pass

        return system, gpus
    def get_gpu_info(handle):
        """Get one GPU information specified by nvml handle.

        Args:
            handle: NVML device handle that is passed to the ``N.nvml*``
                query functions.

        Returns:
            A dict with the keys ``index``, ``uuid``, ``name``,
            ``temperature.gpu``, ``utilization.gpu``, ``power.draw``,
            ``enforced.power.limit``, ``memory.used``, ``memory.total``
            and ``processes``. A value is ``None`` when the corresponding
            NVML query raised ``N.NVMLError``. ``processes`` is a list of
            per-process dicts, or ``None`` when neither the compute nor
            the graphics process query succeeded.

        Raises:
            N.NVMLError: if the name, UUID or index query fails; these
                are not guarded.
        """
        # NOTE(review): declared at method indentation without `self`;
        # presumably used as a static or nested helper -- confirm.

        def get_process_info(nv_process):
            """Get the process information of specific pid"""
            process = {}
            ps_process = psutil.Process(pid=nv_process.pid)
            process['username'] = ps_process.username()
            # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
            _cmdline = ps_process.cmdline()
            if not _cmdline:  # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                process['command'] = '?'
            else:
                process['command'] = os.path.basename(_cmdline[0])
            # Bytes to MBytes. usedGpuMemory can be None when NVML cannot
            # report it; keep None instead of failing on the division.
            used_bytes = nv_process.usedGpuMemory
            if used_bytes is None:
                process['gpu_memory_usage'] = None
            else:
                process['gpu_memory_usage'] = int(used_bytes / 1024 / 1024)
            process['pid'] = nv_process.pid
            return process

        def _decode(b):
            """Return `b` as text, decoding it first when it is bytes."""
            if isinstance(b, bytes):
                return b.decode()  # for python3, to unicode
            return b

        name = _decode(N.nvmlDeviceGetName(handle))
        uuid = _decode(N.nvmlDeviceGetUUID(handle))

        try:
            temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
        except N.NVMLError:
            temperature = None  # Not supported

        try:
            memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
        except N.NVMLError:
            memory = None  # Not supported

        try:
            utilization = N.nvmlDeviceGetUtilizationRates(handle)
        except N.NVMLError:
            utilization = None  # Not supported

        # Only NVML errors are treated as "not supported"; a bare except
        # here would also swallow KeyboardInterrupt and SystemExit.
        try:
            power = N.nvmlDeviceGetPowerUsage(handle)
        except N.NVMLError:
            power = None  # Not supported

        try:
            power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
        except N.NVMLError:
            power_limit = None  # Not supported

        processes = []
        try:
            nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
        except N.NVMLError:
            nv_comp_processes = None  # Not supported
        try:
            nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
        except N.NVMLError:
            nv_graphics_processes = None  # Not supported

        if nv_comp_processes is None and nv_graphics_processes is None:
            processes = None  # Not supported (in both cases)
        else:
            nv_comp_processes = nv_comp_processes or []
            nv_graphics_processes = nv_graphics_processes or []
            for nv_process in (nv_comp_processes + nv_graphics_processes):
                # TODO: could be more information such as system memory usage,
                # CPU percentage, create time etc.
                try:
                    process = get_process_info(nv_process)
                    processes.append(process)
                except psutil.NoSuchProcess:
                    # The process is gone by the time it is inspected;
                    # skip it.
                    # TODO: add some reminder for NVML broken context
                    # e.g. nvidia-smi reset  or  reboot the system
                    pass

        index = N.nvmlDeviceGetIndex(handle)
        gpu_info = {
            'index': index,
            'uuid': uuid,
            'name': name,
            'temperature.gpu': temperature,
            'utilization.gpu': utilization.gpu if utilization else None,
            # NVML power values are divided by 1000 (milliwatts to watts)
            'power.draw': int(power / 1000) if power is not None else None,
            'enforced.power.limit': int(power_limit / 1000) if power_limit is not None else None,
            # Convert bytes into MBytes
            'memory.used': int(memory.used / 1024 / 1024) if memory else None,
            'memory.total': int(memory.total / 1024 / 1024) if memory else None,
            'processes': processes,
        }
        return gpu_info