Beispiel #1
0
def test_nvml_nvlink_properties(ngpus, handles):
    """Query version, state and remote PCI info for every NVLink slot.

    For each of the first ``ngpus`` entries of ``handles`` and each link
    index below ``pynvml.NVML_NVLINK_MAX_LINKS``, the reported version must
    be at least 1, the state must be non-negative, and the remote PCI info
    must be a ``pynvml.c_nvmlPciInfo_t`` instance.
    """
    for gpu_index in range(ngpus):
        for link in range(pynvml.NVML_NVLINK_MAX_LINKS):
            device = handles[gpu_index]
            assert pynvml.nvmlDeviceGetNvLinkVersion(device, link) >= 1
            assert pynvml.nvmlDeviceGetNvLinkState(device, link) >= 0
            remote_pci = pynvml.nvmlDeviceGetNvLinkRemotePciInfo(device, link)
            assert isinstance(remote_pci, pynvml.c_nvmlPciInfo_t)
Beispiel #2
0
def test_nvml_nvlink_properties(ngpus, handles, driver):
    """Validate NVLink version, state and remote PCI info on every link.

    The test is marked as an expected failure (before any NVLink query is
    issued) whenever ``driver`` is greater than 450.0.  Otherwise every
    link index below ``pynvml.NVML_NVLINK_MAX_LINKS`` is probed on each of
    the first ``ngpus`` entries of ``handles``.
    """
    driver_is_past_450 = driver > 450.0
    if driver_is_past_450:
        pytest.xfail(XFAIL_LEGACY_NVLINK_MSG)

    for gpu_index in range(ngpus):
        for link in range(pynvml.NVML_NVLINK_MAX_LINKS):
            device = handles[gpu_index]

            link_version = pynvml.nvmlDeviceGetNvLinkVersion(device, link)
            assert link_version >= 1

            link_state = pynvml.nvmlDeviceGetNvLinkState(device, link)
            assert link_state >= 0

            remote_pci = pynvml.nvmlDeviceGetNvLinkRemotePciInfo(device, link)
            assert isinstance(remote_pci, pynvml.c_nvmlPciInfo_t)
Beispiel #3
0
def nvlink(doc):
    """Populate ``doc`` with live TX/RX NVLink throughput bar charts.

    One bar per GPU is drawn in each of two figures (transmit and receive).
    A bar's height is the change of the NVML utilization counter, summed
    over all links of that GPU, since the previous refresh, scaled to a
    per-second rate.  A periodic callback registered on ``doc`` refreshes
    both figures.

    Relies on the module-level ``ngpus`` and ``gpu_handles``.

    Args:
        doc: Bokeh document that receives the figures and the periodic
            refresh callback.
    """
    import subprocess as sp

    # Refresh period of the periodic callback, and the factor that turns a
    # per-refresh counter delta into a per-second rate (200 ms -> x5.0).
    update_interval_ms = 200
    refreshes_per_second = 1000.0 / update_interval_ms

    # Use device-0/link-0 to get "upper bound"
    counter = 1
    nlinks = pynvml.NVML_NVLINK_MAX_LINKS
    nvlink_ver = pynvml.nvmlDeviceGetNvLinkVersion(gpu_handles[0], 0)
    nvlink_link_bw = {
        # Keys = NVLink Version, Values = Max Link BW (per direction)
        # [Note: Using specs at https://en.wikichip.org/wiki/nvidia/nvlink]
        1: 20.0 * GB,  # GB/s
        2: 25.0 * GB,  # GB/s
    }
    # Max NVLink Throughput = BW-per-link * nlinks
    # (versions missing from the table fall back to 25 GB/s per link)
    max_bw = nlinks * nvlink_link_bw.get(nvlink_ver, 25.0 * GB)

    # nvmlDeviceSetNvLinkUtilizationControl seems limited, using smi.
    # The exit status is deliberately ignored (best effort).
    sp.call([
        "nvidia-smi",
        "nvlink",
        "--setcontrol",
        str(counter) + "bz",  # Get output in bytes
    ])

    def read_totals():
        """Return ``(tx, rx)``: per-GPU counters summed over all links.

        Each link is queried once; the returned mapping carries both
        directions, so TX and RX come from the same sample.
        """
        tx_totals = []
        rx_totals = []
        for i in range(ngpus):
            tx_sum = 0
            rx_sum = 0
            for j in range(nlinks):
                sample = pynvml.nvmlDeviceGetNvLinkUtilizationCounter(
                    gpu_handles[i], j, counter)
                tx_sum += sample["tx"]
                rx_sum += sample["rx"]
            tx_totals.append(tx_sum)
            rx_totals.append(rx_sum)
        return tx_totals, rx_totals

    def rates(current, previous):
        """Convert two successive counter snapshots into per-second rates."""
        # Negative deltas are clamped to zero so a counter that moved
        # backwards never produces a negative bar.
        return [
            max(now - before, 0.0) * refreshes_per_second
            for (now, before) in zip(current, previous)
        ]

    # Last counter snapshot per direction; the refresh callback diffs the
    # next reading against it.
    nvlink_state = {}
    nvlink_state["tx"], nvlink_state["rx"] = read_totals()

    # Each GPU owns the x-interval [index, index + 0.8].
    left = list(range(ngpus))
    right = [edge + 0.8 for edge in left]
    source = ColumnDataSource({
        "left": left,
        "right": right,
        "count-tx": [0.0 for _ in range(ngpus)],
        "count-rx": [0.0 for _ in range(ngpus)],
    })
    mapper = LinearColorMapper(palette=all_palettes["RdYlBu"][4],
                               low=0,
                               high=max_bw)

    def make_figure(title, field):
        """Create one bar chart whose bar heights come from ``field``."""
        fig = figure(title=title,
                     sizing_mode="stretch_both",
                     y_range=[0, max_bw])
        fig.yaxis.formatter = NumeralTickFormatter(format="0.0 b")
        fig.quad(
            source=source,
            left="left",
            right="right",
            bottom=0,
            top=field,
            color={
                "field": field,
                "transform": mapper
            },
        )
        fig.toolbar_location = None
        return fig

    tx_fig = make_figure("TX NVLink [B/s]", "count-tx")
    rx_fig = make_figure("RX NVLink [B/s]", "count-rx")

    doc.title = "NVLink Utilization Counters"
    doc.add_root(column(tx_fig, rx_fig, sizing_mode="stretch_both"))

    def cb():
        """Re-read the counters and push the new rates to the plots."""
        prev_tx = nvlink_state["tx"]
        prev_rx = nvlink_state["rx"]
        nvlink_state["tx"], nvlink_state["rx"] = read_totals()
        source.data.update({
            "count-tx": rates(nvlink_state["tx"], prev_tx),
            "count-rx": rates(nvlink_state["rx"], prev_rx),
        })

    doc.add_periodic_callback(cb, update_interval_ms)
Beispiel #4
0
from jupyterlab_nvdashboard.utils import format_bytes

# Decimal byte multiples (powers of 1000).
KB = 1e3
MB = KB * KB
GB = MB * KB

# Probe NVML once at import time.  Every name set here (ngpus, gpu_handles,
# nvlink_ver, pci_gen) is bound on every path, so later code can test for
# None / zero instead of risking a NameError.
try:
    pynvml.nvmlInit()
except pynvml.nvml.NVMLError_LibraryNotFound:
    # NVML shared library is missing: behave as a machine with no GPUs.
    ngpus = 0
    gpu_handles = []
    nvlink_ver = None
    pci_gen = None
else:
    ngpus = pynvml.nvmlDeviceGetCount()
    gpu_handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(ngpus)]
    # Device 0 / link 0 is used as the representative for NVLink version.
    # IndexError covers the zero-GPU case (gpu_handles is empty).
    try:
        nvlink_ver = pynvml.nvmlDeviceGetNvLinkVersion(gpu_handles[0], 0)
    except (IndexError, pynvml.nvml.NVMLError_NotSupported):
        nvlink_ver = None
    # Device 0 is likewise used for the maximum PCIe link generation.
    try:
        pci_gen = pynvml.nvmlDeviceGetMaxPcieLinkGeneration(gpu_handles[0])
    except (IndexError, pynvml.nvml.NVMLError_NotSupported):
        pci_gen = None


def gpu(doc):
    fig = figure(title="GPU Utilization",
                 sizing_mode="stretch_both",
                 x_range=[0, 100])

    def get_utilization():
        return [