Example #1
 def measure_gpu_usage(self):
     from py3nvml.py3nvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, \
                          nvmlDeviceGetMemoryInfo, nvmlDeviceGetName, nvmlShutdown, NVMLError
     max_gpu_usage = []
     gpu_name = []
     try:
         nvmlInit()
         deviceCount = nvmlDeviceGetCount()
         max_gpu_usage = [0 for i in range(deviceCount)]
         gpu_name = [
             nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i))
             for i in range(deviceCount)
         ]
         while True:
             for i in range(deviceCount):
                 info = nvmlDeviceGetMemoryInfo(
                     nvmlDeviceGetHandleByIndex(i))
                 max_gpu_usage[i] = max(max_gpu_usage[i],
                                        info.used / 1024**2)
             sleep(0.005)  # 5ms
             if not self.keep_measuring:
                 break
         nvmlShutdown()
         return [{
             "device_id": i,
             "name": gpu_name[i],
             "max_used_MB": max_gpu_usage[i]
         } for i in range(deviceCount)]
     except NVMLError as error:
         if not self.silent:
             self.logger.error(
                 "Error fetching GPU information using nvml: %s", error)
         return None
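The method above assumes a surrounding class that owns keep_measuring, silent and logger, and that it runs concurrently with the workload being measured. A minimal driver sketch, assuming a hypothetical MemoryMonitor wrapper and run_with_gpu_monitor helper (neither name comes from the example):

from concurrent.futures import ThreadPoolExecutor
import logging

class MemoryMonitor:
    # Hypothetical host class; measure_gpu_usage from the example above is
    # assumed to be pasted in as a method of this class.
    def __init__(self, silent=False):
        self.keep_measuring = True
        self.silent = silent
        self.logger = logging.getLogger(__name__)

def run_with_gpu_monitor(monitor, workload):
    # Sample GPU memory in a worker thread while the workload runs, then
    # flip keep_measuring off so the sampling loop exits.
    with ThreadPoolExecutor() as executor:
        future = executor.submit(monitor.measure_gpu_usage)
        try:
            workload()
        finally:
            monitor.keep_measuring = False
        return future.result()  # list of per-device peak-usage dicts, or None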
Example #2
 def __init__(self, handle, cpu_to_node):
     node = None
     # TODO: use number of CPU cores to determine cpuset size
     # This is very hacky at the moment
     affinity = pynvml.nvmlDeviceGetCpuAffinity(handle, 1)
     n_cpus = max(cpu_to_node.keys()) + 1
     for j in range(n_cpus):
         if affinity[0] & (1 << j):
             cur_node = cpu_to_node[j]
             if node is not None and node != cur_node:
                 node = -1  # Sentinel to indicate unknown affinity
             else:
                 node = cur_node
     if node == -1:
         node = None
     self.node = node
     self.mem = pynvml.nvmlDeviceGetMemoryInfo(handle).total
     self.name = pynvml.nvmlDeviceGetName(handle)
     # NVML doesn't report compute capability, so we need CUDA
     pci_bus_id = pynvml.nvmlDeviceGetPciInfo(handle).busId
     # In Python 3 pci_bus_id is bytes but pycuda wants str
     if not isinstance(pci_bus_id, str):
         pci_bus_id = pci_bus_id.decode('ascii')
     cuda_device = pycuda.driver.Device(pci_bus_id)
     self.compute_capability = cuda_device.compute_capability()
     self.device_attributes = {}
     self.uuid = pynvml.nvmlDeviceGetUUID(handle)
     for key, value in cuda_device.get_attributes().items():
         if isinstance(value, (int, float, str)):
             # Some of the attributes use Boost.Python's enum, which is
             # derived from int but which leads to invalid JSON when passed
             # to json.dumps.
             if isinstance(value, int) and type(value) != int:
                 value = str(value)
             self.device_attributes[str(key)] = value
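Before this constructor can run, NVML and CUDA both have to be initialized and a cpu_to_node mapping supplied. A rough usage sketch, assuming the class above is named GPUInfo and that all CPUs sit on NUMA node 0 (both are assumptions, not taken from the example):

import os
import pynvml
import pycuda.driver

pynvml.nvmlInit()
pycuda.driver.init()  # required before pycuda.driver.Device(...) can be used

# Placeholder mapping: every CPU assigned to NUMA node 0.
cpu_to_node = {cpu: 0 for cpu in range(os.cpu_count() or 1)}

handle = pynvml.nvmlDeviceGetHandleByIndex(0)
gpu = GPUInfo(handle, cpu_to_node)  # GPUInfo: assumed name of the class above
print(gpu.name, gpu.mem, gpu.compute_capability)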
Example #3
    def get_gpu_info_by_nvml(self) -> Dict:
        """Get GPU info using nvml"""
        gpu_info_list = []
        driver_version = None
        try:
            nvmlInit()
            driver_version = nvmlSystemGetDriverVersion()
            deviceCount = nvmlDeviceGetCount()
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                info = nvmlDeviceGetMemoryInfo(handle)
                gpu_info = {}
                gpu_info["memory_total"] = info.total
                gpu_info["memory_available"] = info.free
                gpu_info["name"] = nvmlDeviceGetName(handle)
                gpu_info_list.append(gpu_info)
            nvmlShutdown()
        except NVMLError as error:
            if not self.silent:
                self.logger.error(
                    "Error fetching GPU information using nvml: %s", error)
            return None

        result = {"driver_version": driver_version, "devices": gpu_info_list}

        if 'CUDA_VISIBLE_DEVICES' in environ:
            result["cuda_visible"] = environ['CUDA_VISIBLE_DEVICES']
        return result
    def environment_info(self):
        if self._environment_info is None:
            info = {}
            info["transformers_version"] = version
            info["framework"] = self.framework
            if self.framework == "PyTorch":
                info["use_torchscript"] = self.args.torchscript
            if self.framework == "TensorFlow":
                info["eager_mode"] = self.args.eager_mode
                info["use_xla"] = self.args.use_xla
            info["framework_version"] = self.framework_version
            info["python_version"] = platform.python_version()
            info["system"] = platform.system()
            info["cpu"] = platform.processor()
            info["architecture"] = platform.architecture()[0]
            info["date"] = datetime.date(datetime.now())
            info["time"] = datetime.time(datetime.now())
            info["fp16"] = self.args.fp16
            info["use_multiprocessing"] = self.args.do_multi_processing
            info["only_pretrain_model"] = self.args.only_pretrain_model

            if is_psutil_available():
                info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total)
            else:
                logger.warning(
                    "Psutil not installed, we won't log available CPU memory. "
                    "Install psutil (pip install psutil) to log available CPU memory."
                )
                info["cpu_ram_mb"] = "N/A"

            info["use_gpu"] = self.args.is_gpu
            if self.args.is_gpu:
                info["num_gpus"] = 1  # TODO(PVP) Currently only single GPU is supported
                if is_py3nvml_available():
                    nvml.nvmlInit()
                    handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
                    info["gpu"] = nvml.nvmlDeviceGetName(handle)
                    info["gpu_ram_mb"] = bytes_to_mega_bytes(nvml.nvmlDeviceGetMemoryInfo(handle).total)
                    info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
                    info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
                    nvml.nvmlShutdown()
                else:
                    logger.warning(
                        "py3nvml not installed, we won't log GPU memory usage. "
                        "Install py3nvml (pip install py3nvml) to log information about GPU."
                    )
                    info["gpu"] = "N/A"
                    info["gpu_ram_mb"] = "N/A"
                    info["gpu_power_watts"] = "N/A"
                    info["gpu_performance_state"] = "N/A"

            info["use_tpu"] = self.args.is_tpu
            # TODO(PVP): See if we can add more information about TPU
            # see: https://github.com/pytorch/xla/issues/2180

            self._environment_info = info
        return self._environment_info
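bytes_to_mega_bytes is not shown in these excerpts; in the transformers benchmark utilities it is a plain unit conversion, roughly the sketch below (the exact implementation may differ):

def bytes_to_mega_bytes(memory_amount: int) -> int:
    """Convert a byte count to mebibytes (1 MB = 2**20 bytes here)."""
    return memory_amount >> 20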
Example #5
    def __init__(self, index: int):
        self.index = index
        self.handle = py3nvml.nvmlDeviceGetHandleByIndex(index)

        self.name = py3nvml.nvmlDeviceGetName(self.handle)

        self.memory = Memory(self.handle)
        self.utilization = Utilization(self.handle)
        self.processes = Processes(self.handle)

        self.update()
    def __init__(self,
                 report=None,
                 devices=None,
                 quiet=False,
                 always_suffix=False,
                 output=print,
                 verbose_once=True):
        super(self.__class__, self).__init__()
        global nvml

        self.output = output

        if nvml is not None:
            try:
                nvml.nvmlInit()
            except (OSError, nvml.NVMLError_LibraryNotFound):
                # the python library might be installed, but not the drivers...
                nvml = None

        if nvml is None:
            if not quiet:
                self.output(
                    "Could not load py3nvml, cannot report any nvidia device statistics."
                )
            report = []
        else:
            device_count = nvml.nvmlDeviceGetCount()

            if devices is None:
                devices = list(range(device_count))
            else:
                devices = [
                    int(device) for device in devices
                    if 0 <= int(device) < device_count
                ]

            self.devices = devices
            self.deviceHandles = [
                nvml.nvmlDeviceGetHandleByIndex(device) for device in devices
            ]

            if not quiet:
                for n, handle in enumerate(self.deviceHandles):
                    self.output("Collecting statistics for device #% 2d: %s" %
                                (n, nvml.nvmlDeviceGetName(handle)))

        if report is None:
            report = ['temperature', 'utilization_gpu']
        elif report == 'all':
            report = list(self.reportable_values.keys())

        self.verbose_once = verbose_once
        self.report = report
        self.always_suffix = always_suffix
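The report list refers to keys of self.reportable_values, which the excerpt does not show. A plausible sketch of such a table, mapping a metric name to a callable that takes an NVML device handle (an assumption, not the project's actual definition):

from py3nvml import py3nvml as nvml

reportable_values = {
    'temperature': lambda h: nvml.nvmlDeviceGetTemperature(h, nvml.NVML_TEMPERATURE_GPU),
    'utilization_gpu': lambda h: nvml.nvmlDeviceGetUtilizationRates(h).gpu,
    'utilization_memory': lambda h: nvml.nvmlDeviceGetUtilizationRates(h).memory,
    'memory_used_mb': lambda h: nvml.nvmlDeviceGetMemoryInfo(h).used / 1024**2,
}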
Example #7
def gpu_info():
    "Returns a tuple of (GPU ID, GPU Description, GPU % Utilization)"
    nvmlInit()
    deviceCount = nvmlDeviceGetCount()
    info = []
    for i in range(0, deviceCount):
        handle = nvmlDeviceGetHandleByIndex(i)
        util = nvmlDeviceGetUtilizationRates(handle)
        desc = nvmlDeviceGetName(handle)
        info.append((i, desc, util.gpu))
    return info
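Note that gpu_info() above calls nvmlInit() but never nvmlShutdown(). A self-contained variant of the same queries with an explicit shutdown, shown here as a sketch rather than a drop-in replacement:

from py3nvml.py3nvml import (
    nvmlInit, nvmlShutdown, nvmlDeviceGetCount,
    nvmlDeviceGetHandleByIndex, nvmlDeviceGetName,
    nvmlDeviceGetUtilizationRates,
)

def print_gpu_utilization():
    nvmlInit()
    try:
        for i in range(nvmlDeviceGetCount()):
            handle = nvmlDeviceGetHandleByIndex(i)
            name = nvmlDeviceGetName(handle)
            util = nvmlDeviceGetUtilizationRates(handle).gpu
            print("GPU %d: %s at %d%% utilization" % (i, name, util))
    finally:
        nvmlShutdown()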
Example #8
def test_nvidia_device(idx: int):
    from py3nvml import py3nvml as nvml

    handle = nvml.nvmlDeviceGetHandleByIndex(idx)

    pciInfo = nvml.nvmlDeviceGetPciInfo(handle)

    brands = {
        nvml.NVML_BRAND_UNKNOWN: "Unknown",
        nvml.NVML_BRAND_QUADRO: "Quadro",
        nvml.NVML_BRAND_TESLA: "Tesla",
        nvml.NVML_BRAND_NVS: "NVS",
        nvml.NVML_BRAND_GRID: "Grid",
        nvml.NVML_BRAND_GEFORCE: "GeForce"
    }

    inspect(
        idx=idx,
        # id=pciInfo.busId,
        # uuid=nvml.nvmlDeviceGetUUID(handle),
        name=nvml.nvmlDeviceGetName(handle),
        # brand=brands[nvml.nvmlDeviceGetBrand(handle)],
        # multi_gpu=nvml.nvmlDeviceGetMultiGpuBoard(handle),
        # pcie_link=nvml.nvmlDeviceGetCurrPcieLinkWidth(handle),
        fan=nvml.nvmlDeviceGetFanSpeed(handle),
        # power=nvml.nvmlDeviceGetPowerState(handle),
        mem_total=nvml.nvmlDeviceGetMemoryInfo(handle).total,
        mem_used=nvml.nvmlDeviceGetMemoryInfo(handle).used,
        util_gpu=nvml.nvmlDeviceGetUtilizationRates(handle).gpu,
        # util_mem=nvml.nvmlDeviceGetUtilizationRates(handle).memory,
        temp=nvml.nvmlDeviceGetTemperature(handle, nvml.NVML_TEMPERATURE_GPU),
        power=nvml.nvmlDeviceGetPowerUsage(handle),
        power_limit=nvml.nvmlDeviceGetPowerManagementLimit(handle),

        # display=nvml.nvmlDeviceGetDisplayMode(handle),
        display_active=nvml.nvmlDeviceGetDisplayActive(handle),
    )

    logger.log()

    procs = nvml.nvmlDeviceGetGraphicsRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid),
                pid=p.pid,
                mem=p.usedGpuMemory)

    procs = nvml.nvmlDeviceGetComputeRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid),
                pid=p.pid,
                mem=p.usedGpuMemory)

    logger.log()
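inspect() and logger come from the surrounding project, not from py3nvml. Plausible stand-ins so the function above can be exercised on its own (assumptions, not the project's real helpers):

class _Logger:
    # Minimal stand-in: logger.log() with no arguments prints a blank line.
    def log(self, *args):
        print(*args)

logger = _Logger()

def inspect(**fields):
    # Print the collected key/value pairs on a single line.
    print(", ".join("%s=%r" % (key, value) for key, value in fields.items()))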
Example #9
        def measure_gpu_usage(self) -> Optional[List[Dict[str, Any]]]:
            from py3nvml.py3nvml import (
                NVMLError,
                nvmlDeviceGetCount,
                nvmlDeviceGetHandleByIndex,
                nvmlDeviceGetMemoryInfo,
                nvmlDeviceGetName,
                nvmlInit,
                nvmlShutdown,
            )

            max_gpu_usage = []
            gpu_name = []
            try:
                nvmlInit()
                device_count = nvmlDeviceGetCount()
                if not isinstance(device_count, int):
                    logger.error(
                        f"nvmlDeviceGetCount result is not integer: {device_count}"
                    )
                    return None

                max_gpu_usage = [0 for i in range(device_count)]
                gpu_name = [
                    nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i))
                    for i in range(device_count)
                ]
                while True:
                    for i in range(device_count):
                        info = nvmlDeviceGetMemoryInfo(
                            nvmlDeviceGetHandleByIndex(i))
                        if isinstance(info, str):
                            logger.error(
                                f"nvmlDeviceGetMemoryInfo returns str: {info}")
                            return None
                        max_gpu_usage[i] = max(max_gpu_usage[i],
                                               info.used / 1024**2)
                    sleep(0.005)  # 5ms
                    if not self.keep_measuring:
                        break
                nvmlShutdown()
                return [{
                    "device_id": i,
                    "name": gpu_name[i],
                    "max_used_MB": max_gpu_usage[i],
                } for i in range(device_count)]
            except NVMLError as error:
                logger.error("Error fetching GPU information using nvml: %s",
                             error)
                return None
Example #10
    def environment_info(self):
        if self._environment_info is None:
            info = {}
            info["gluonnlp_version"] = gluonnlp.__version__
            info["framework_version"] = mxnet.__version__
            info["python_version"] = platform.python_version()
            info["system"] = platform.system()
            info["cpu"] = platform.processor()
            info["architecture"] = platform.architecture()[0]
            info["date"] = datetime.date(datetime.now())
            info["time"] = datetime.time(datetime.now())
            info["fp16"] = self._use_fp16

            if is_psutil_available():
                info["cpu_ram_mb"] = bytes_to_mega_bytes(
                    psutil.virtual_memory().total)
            else:
                logger.warning(
                    "Psutil not installed, we won't log available CPU memory."
                    "Install psutil (pip install psutil) to log available CPU memory."
                )
                info["cpu_ram_mb"] = "N/A"

            info["use_gpu"] = self._use_gpu
            if self._use_gpu:
                info["num_gpus"] = 1
                if is_py3nvml_available():
                    nvml.nvmlInit()
                    handle = nvml.nvmlDeviceGetHandleByIndex(self._device_idx)
                    info["gpu"] = nvml.nvmlDeviceGetName(handle)
                    info["gpu_ram_mb"] = bytes_to_mega_bytes(
                        nvml.nvmlDeviceGetMemoryInfo(handle).total)
                    info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
                    info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
                    nvml.nvmlShutdown()
                else:
                    logger.warning(
                        "py3nvml not installed, we won't log GPU memory usage. "
                        "Install py3nvml (pip install py3nvml) to log information about GPU."
                    )
                    info["gpu"] = "N/A"
                    info["gpu_ram_mb"] = "N/A"
                    info["gpu_power_watts"] = "N/A"
                    info["gpu_performance_state"] = "N/A"
            self._environment_info = info
        return self._environment_info
    def __init__(self):
        self.labels = ['gpu', 'name', 'driver']
        self.driver = nv.nvmlSystemGetDriverVersion()

        self.n_gpu = nv.nvmlDeviceGetCount()
        self.hnds = [
            nv.nvmlDeviceGetHandleByIndex(i) for i in range(self.n_gpu)
        ]
        self.args = []
        for i, hnd in enumerate(self.hnds):
            args = OrderedDict()
            args['gpu'] = 'gpu%d' % i
            args['name'] = nv.nvmlDeviceGetName(hnd)
            args['driver'] = self.driver
            self.args.append(args)
Example #12
def gpu_info(gpu_index: int) -> Tuple[str, int]:
    """Returns a description of a GPU

    Returns the description and memory size of GPU.
    """

    nvml.nvmlInit()

    handle = nvml.nvmlDeviceGetHandleByIndex(gpu_index)
    gpu_desc = nvml.nvmlDeviceGetName(handle)

    #
    # Get memory info.
    #
    mem_info = try_get_info(nvml.nvmlDeviceGetMemoryInfo, handle)
    if mem_info != 'N/A':
        mem_total = mem_info.total >> 20
    else:
        mem_total = 0

    return gpu_desc, mem_total
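try_get_info is not defined in this excerpt; a plausible sketch is a small wrapper that falls back to 'N/A' when the query is unsupported on the device (an assumption about the helper, not its actual implementation):

from py3nvml import py3nvml as nvml

def try_get_info(query, handle, default='N/A'):
    try:
        return query(handle)
    except nvml.NVMLError:
        return default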
Example #13
def get_gpu_info() -> Optional[List[Dict[str, Any]]]:
    from py3nvml.py3nvml import (
        NVMLError,
        nvmlDeviceGetCount,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlDeviceGetName,
        nvmlInit,
        nvmlShutdown,
    )

    try:
        nvmlInit()
        result = []
        device_count = nvmlDeviceGetCount()
        if not isinstance(device_count, int):
            return None

        for i in range(device_count):
            info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
            if isinstance(info, str):
                return None
            result.append({
                "id": i,
                "name": nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i)),
                "total": info.total,
                "free": info.free,
                "used": info.used,
            })
        nvmlShutdown()
        return result
    except NVMLError as error:
        print("Error fetching GPU information using nvml: %s", error)
        return None
    def on_epoch_end(self, epoch, logs=None):
        for item in self.report:
            try:
                suffix = handle = None
                for n, handle in enumerate(self.deviceHandles):
                    if len(self.deviceHandles) == 1 and not self.always_suffix:
                        suffix = ''
                    else:
                        suffix = '_%02d' % (
                            n,
                        )  # TODO: this will not work nicely if more than 100 GPUs are in one sys

                    logs[item + suffix] = np.float32(
                        self.reportable_values[item](handle))
            except nvml.NVMLError as err:
                self.output("Error trying to read out value from NVML: %r" %
                            (err, ))
        if self.report and self.verbose_once:
            self.output("Current status for device #% 2d (%s): %r" %
                        (n, nvml.nvmlDeviceGetName(handle), {
                            what: float(call(handle))
                            for what, call in self.reportable_values.items()
                        }))
            self.verbose_once = False  # only print once
    def live(self, callback):
        try:
            with Live(
                    self.layout,
                    refresh_per_second=2,
                    screen=False,
                    redirect_stderr=False,
                    redirect_stdout=False,
            ) as live:
                while True:
                    if self.stop_flag:
                        break

                    if callback:
                        callback(self)

                    if not self.resources_by_endpoint:
                        self.layout["endpoints"].update(
                            Panel(
                                Align.center(
                                    Text(
                                        "Waiting for endpoints to come alive"),
                                    vertical="middle",
                                )))
                    else:
                        self.endpoints_layout["data"].update(
                            EndpointMonitor(self.resources_by_endpoint))

                        self.endpoints_values = []
                        updated_keys = set()
                        max_value = 0
                        for (
                                endpoint_name,
                                endpoint_data,
                        ) in self.resources_by_endpoint.items():
                            updated_keys.add(endpoint_name)  # todo: finish

                            total_used = 0
                            total_max = 0
                            for entry in endpoint_data.values():
                                total_used += entry.used
                                total_max += entry.available

                            max_value = max(max_value, total_max)

                            # self.endpoints_values.append(data)
                            past_entries = self.endpoints_past_values.setdefault(
                                endpoint_name, [])
                            past_entries.append(total_used)

                            self.endpoints_values.append(past_entries)

                        self.endpoints_graph = AsciiGraph(
                            self.endpoints_values, max_value, BACKEND_COLORS)
                        self.endpoints_layout["graph"].update(
                            self.endpoints_graph)

                        self.layout["endpoints"].update(
                            Panel(self.endpoints_layout, title="Endpoints"))

                    uptime = datetime.datetime.now() - self.start_time
                    self.layout["header"]["info"].update(
                        Align.right(
                            Text(f"""Node ID: {self.node_id}
                                Uptime: {humanize.naturaldelta(uptime)}
                                https://discord.gg/94KqBcE"""),
                            vertical="middle",
                        ))

                    titles = []

                    table = Table.grid()
                    table.add_column(style="green")
                    table.add_column(no_wrap=True)

                    self.cpu_usage[0].append(psutil.cpu_percent(interval=None))
                    self.ram_usage[0].append(
                        int(round(psutil.virtual_memory().used / 1024**2)))

                    total_gpus_actual = py3nvml.nvmlDeviceGetCount()
                    for i in range(total_gpus_actual):
                        handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
                        meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
                        utilization_info = py3nvml.nvmlDeviceGetUtilizationRates(
                            handle)

                        table.add_row(
                            py3nvml.nvmlDeviceGetName(handle),
                            str(round(meminfo.used / 1024**2)),
                        )
                        self.gpu_mem_usage[i].append(
                            round(meminfo.used / 1024**2))
                        self.gpu_usage[i].append(utilization_info.gpu)

                        color = RICH_COLORS[i]
                        titles.append(
                            f"[{color}]" + py3nvml.nvmlDeviceGetName(handle) +
                            f" {utilization_info.gpu}%, {humanize.naturalsize(meminfo.used)}/{humanize.naturalsize(meminfo.total)}"
                            + "[/]")

                    self.gpu_layout["utilization"].update(
                        self.gpu_usage_graph, )
                    self.gpu_layout["memory"].update(self.gpu_mem_usage_graph)

                    self.layout["gpu"].update(
                        Panel(self.gpu_layout, title=" ".join(titles)))

                    self.cpu_layout["utilization"].update(
                        Panel(self.cpu_usage_graph))
                    self.cpu_layout["memory"].update(
                        Panel(self.ram_usage_graph))

                    self.cpu_layout["utilization"].update(
                        self.cpu_usage_graph, )
                    self.cpu_layout["memory"].update(self.ram_usage_graph)

                    self.layout["cpu"].update(
                        Panel(self.cpu_layout, title=CPU_NAME))

                    self.layout["console"].update(self.tail)

                    sleep(1.0)
        except KeyboardInterrupt as e:
            py3nvml.nvmlShutdown()
            raise e
    def environment_info(self):
        if self._environment_info is None:
            info = {}
            info["transformers_version"] = version
            info["framework"] = self.framework
            info["framework_version"] = self.framework_version
            info["python_version"] = platform.python_version()
            info["system"] = platform.system()
            info["cpu"] = platform.processor()
            info["architecture"] = platform.architecture()[0]
            info["date"] = datetime.date(datetime.now())
            info["time"] = datetime.time(datetime.now())

            try:
                import psutil
            except (ImportError):
                logger.warning(
                    "Psutil not installed, we won't log available CPU memory."
                    "Install psutil (pip install psutil) to log available CPU memory."
                )
                info["cpu_ram_mb"] = "N/A"
            else:
                info["cpu_ram_mb"] = bytes_to_mega_bytes(
                    psutil.virtual_memory().total)

            info["use_gpu"] = self.is_gpu
            if self.is_gpu:
                info["num_gpus"] = self.args.n_gpu
                try:
                    from py3nvml import py3nvml

                    py3nvml.nvmlInit()
                    handle = py3nvml.nvmlDeviceGetHandleByIndex(
                        self.args.device_idx)
                except ImportError:
                    logger.warning(
                        "py3nvml not installed, we won't log GPU memory usage. "
                        "Install py3nvml (pip install py3nvml) to log information about GPU."
                    )
                    info["gpu"] = "N/A"
                    info["gpu_ram_mb"] = "N/A"
                    info["gpu_power_watts"] = "N/A"
                    info["gpu_performance_state"] = "N/A"
                except (OSError, py3nvml.NVMLError):
                    logger.warning(
                        "Error while initializing comunication with GPU. "
                        "We won't log information about GPU.")
                    info["gpu"] = "N/A"
                    info["gpu_ram_mb"] = "N/A"
                    info["gpu_power_watts"] = "N/A"
                    info["gpu_performance_state"] = "N/A"
                    py3nvml.nvmlShutdown()
                else:
                    info["gpu"] = py3nvml.nvmlDeviceGetName(handle)
                    info["gpu_ram_mb"] = bytes_to_mega_bytes(
                        py3nvml.nvmlDeviceGetMemoryInfo(handle).total)
                    info["gpu_power_watts"] = py3nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
                    info["gpu_performance_state"] = py3nvml.nvmlDeviceGetPerformanceState(handle)
                    py3nvml.nvmlShutdown()

            self._environment_info = info
        return self._environment_info
Example #17
    def getGpuInfo(self):
        if (self._impulse % 2) != 0:
            return self._gpuInfoObj

        try:
            N.nvmlInit()
            gpuInfoObj = {}

            driverVersion = N.nvmlSystemGetDriverVersion()
            deviceCnt = N.nvmlDeviceGetCount()

            gpuInfoObj['DRIVER_VERSION'] = driverVersion
            gpuInfoObj['DEVICE_COUNT'] = deviceCnt

            for dCnt in range(deviceCnt):
                deviceInfoObj = {}
                handle = N.nvmlDeviceGetHandleByIndex(dCnt)
                name = N.nvmlDeviceGetName(handle)

                try:
                    fan = N.nvmlDeviceGetFanSpeed(handle)
                except N.NVMLError as err:
                    fan = 'N/A'

                try:
                    temp = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
                except N.NVMLError as err:
                    temp = 'N/A'

                try:
                    powerUsage = round(N.nvmlDeviceGetPowerUsage(handle) / 1000)
                except N.NVMLError as err:
                    powerUsage = 'N/A'

                try:
                    powerLimit = round(N.nvmlDeviceGetPowerManagementLimit(handle) / 1000)
                except N.NVMLError as err:
                    powerLimit = 'N/A'

                try:
                    memInfo = N.nvmlDeviceGetMemoryInfo(handle)
                    memUsage = round(memInfo.used/1024/1024)
                    memTotal = round(memInfo.total/1024/1024)
                except N.NVMLError as err:
                    memUsage = 'N/A'
                    memTotal = 'N/A'

                try:
                    util = N.nvmlDeviceGetUtilizationRates(handle).gpu
                except N.NVMLError as err:
                    util = 'N/A'

                deviceInfoObj['NAME'] = name
                deviceInfoObj['FAN'] = fan
                deviceInfoObj['TEMP'] = temp
                deviceInfoObj['POWER_USAGE'] = powerUsage
                deviceInfoObj['POWER_LIMIT'] = powerLimit
                deviceInfoObj['MEM_USAGE'] = memUsage
                deviceInfoObj['MEM_TOTAL'] = memTotal
                deviceInfoObj['UTIL'] = util

                gpuProcessObj = {}
                try:
                    processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
                except N.NVMLError as err:
                    processes = []
                for pCnt, process in enumerate(processes):
                    gpuMem = round(process.usedGpuMemory / 1024 / 1024)
                    pid = process.pid

                    try:
                        p = psutil.Process(pid)
                        attrs = p.as_dict(attrs = ['name', 'username', 'status'])
                    except psutil.ZombieProcess:
                        attrs = {'name': 'unknown', 'username': '******', 'status': 'zombie'}
                    except psutil.Error:
                        # Ensure attrs is defined even when the process has already exited.
                        attrs = {'name': 'unknown', 'username': 'unknown', 'status': 'unknown'}
                    
                    gpuProcessObj[str(pCnt)] = {
                        'PID': pid,
                        'MEM': gpuMem,
                        'NAME': attrs['name'],
                        'USERNAME': self._getSubuidName(attrs['username']),
                        'STATUS': attrs['status']
                    }

                deviceInfoObj['PROCESS'] = gpuProcessObj
                gpuInfoObj[str(dCnt)] = deviceInfoObj

            N.nvmlShutdown()

        except N.NVMLError as err:
            N.nvmlShutdown()
            print(err)
            gpuInfoObj = {}

        self._gpuInfoObj = gpuInfoObj
        return gpuInfoObj
Example #18
def get_device_name(device_handle):
    """Get GPU device name."""
    try:
        return nativestr(pynvml.nvmlDeviceGetName(device_handle))
    except pynvml.NVMLError:
        return "NVIDIA"
    def get_gpu_info(handle):
        """Get one GPU information specified by nvml handle"""

        def get_process_info(nv_process):
            """Get the process information of specific pid"""
            process = {}
            ps_process = psutil.Process(pid=nv_process.pid)
            process['username'] = ps_process.username()
            # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
            _cmdline = ps_process.cmdline()
            if not _cmdline:  # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                process['command'] = '?'
            else:
                process['command'] = os.path.basename(_cmdline[0])
            # Bytes to MBytes
            process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 / 1024)
            process['pid'] = nv_process.pid
            return process

        def _decode(b):
            if isinstance(b, bytes):
                return b.decode()  # for python3, to unicode
            return b

        name = _decode(N.nvmlDeviceGetName(handle))
        uuid = _decode(N.nvmlDeviceGetUUID(handle))

        try:
            temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
        except N.NVMLError:
            temperature = None  # Not supported

        try:
            memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
        except N.NVMLError:
            memory = None  # Not supported

        try:
            utilization = N.nvmlDeviceGetUtilizationRates(handle)
        except N.NVMLError:
            utilization = None  # Not supported

        try:
            power = N.nvmlDeviceGetPowerUsage(handle)
        except N.NVMLError:
            power = None

        try:
            power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
        except N.NVMLError:
            power_limit = None

        processes = []
        try:
            nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
        except N.NVMLError:
            nv_comp_processes = None  # Not supported
        try:
            nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
        except N.NVMLError:
            nv_graphics_processes = None  # Not supported

        if nv_comp_processes is None and nv_graphics_processes is None:
            processes = None  # Not supported (in both cases)
        else:
            nv_comp_processes = nv_comp_processes or []
            nv_graphics_processes = nv_graphics_processes or []
            for nv_process in (nv_comp_processes + nv_graphics_processes):
                # TODO: could be more information such as system memory usage,
                # CPU percentage, create time etc.
                try:
                    process = get_process_info(nv_process)
                    processes.append(process)
                except psutil.NoSuchProcess:
                    # TODO: add some reminder for NVML broken context
                    # e.g. nvidia-smi reset  or  reboot the system
                    pass

        index = N.nvmlDeviceGetIndex(handle)
        gpu_info = {
            'index': index,
            'uuid': uuid,
            'name': name,
            'temperature.gpu': temperature,
            'utilization.gpu': utilization.gpu if utilization else None,
            'power.draw': int(power / 1000) if power is not None else None,
            'enforced.power.limit': int(power_limit / 1000) if power_limit is not None else None,
            # Convert bytes into MBytes
            'memory.used': int(memory.used / 1024 / 1024) if memory else None,
            'memory.total': int(memory.total / 1024 / 1024) if memory else None,
            'processes': processes,
        }
        return gpu_info
Example #20
#!/usr/bin/env python3

# need package: py3nvml
# if you use python 2, you need nvidia-ml-py and have to change the import

from __future__ import print_function

# import pynvml
import py3nvml.py3nvml as pynvml
import datetime

pynvml.nvmlInit()
print("Driver Version:", pynvml.nvmlSystemGetDriverVersion())

deviceCount = pynvml.nvmlDeviceGetCount()

for i in range(deviceCount):
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    print("Device {}: {}".format(i, pynvml.nvmlDeviceGetName(handle)))

pynvml.nvmlShutdown()
 def _get_device_name(gpu):
     return {'name': pynvml.nvmlDeviceGetName(gpu)}