Example #1
0
def test_1_visible_devices():
    """Check ``nvml.one_time`` against pynvml with ``CUDA_VISIBLE_DEVICES="0"``.

    The reported ``"memory-total"`` must equal the total memory pynvml
    returns for the handle given by ``nvml._pynvml_handles()``.

    ``CUDA_VISIBLE_DEVICES`` is restored to its previous state afterwards so
    the override does not leak into other tests run in the same process.
    """
    if nvml.device_get_count() < 1:
        pytest.skip("No GPUs available")

    previous = os.environ.get("CUDA_VISIBLE_DEVICES")
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    try:
        output = nvml.one_time()
        h = nvml._pynvml_handles()
        assert output["memory-total"] == pynvml.nvmlDeviceGetMemoryInfo(h).total
    finally:
        # Put the environment back exactly as it was found, including the
        # "variable was not set at all" case.
        if previous is None:
            os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = previous
Example #2
0
def test_one_time():
    """``nvml.one_time`` must expose ``"memory-total"`` and a non-empty ``"name"``."""
    if nvml.device_get_count() < 1:
        pytest.skip("No GPUs available")

    info = nvml.one_time()
    for key in ("memory-total", "name"):
        assert key in info

    assert len(info["name"]) > 0
    def __init__(self, n=10000):
        """Create the bounded sample histories and record a first sample.

        Parameters
        ----------
        n : int, optional
            Maximum number of samples kept per quantity; every history is a
            ``deque(maxlen=n)``.

        Notes
        -----
        Network, disk, file-descriptor and GPU histories are only created
        when the corresponding source is usable at construction time.
        ``self.quantities`` maps each quantity name to its history.
        """
        self.proc = psutil.Process()

        self.time = deque(maxlen=n)
        self.cpu = deque(maxlen=n)
        self.memory = deque(maxlen=n)
        self.count = 0

        self.quantities = {
            "cpu": self.cpu,
            "memory": self.memory,
            "time": self.time,
        }

        try:
            ioc = psutil.net_io_counters()
        except Exception:
            self._collect_net_io_counters = False
        else:
            if ioc is None:  # machine without network interfaces
                # Same treatment as the diskless case below: without a
                # baseline reading there is nothing to compute rates from.
                self._collect_net_io_counters = False
            else:
                self.last_time = time()
                self.read_bytes = deque(maxlen=n)
                self.write_bytes = deque(maxlen=n)
                self.quantities["read_bytes"] = self.read_bytes
                self.quantities["write_bytes"] = self.write_bytes
                self._last_io_counters = ioc
                self._collect_net_io_counters = True

        try:
            disk_ioc = psutil.disk_io_counters()
        except Exception:
            self._collect_disk_io_counters = False
        else:
            if disk_ioc is None:  # diskless machine
                self._collect_disk_io_counters = False
            else:
                self.last_time_disk = time()
                self.read_bytes_disk = deque(maxlen=n)
                self.write_bytes_disk = deque(maxlen=n)
                self.quantities["read_bytes_disk"] = self.read_bytes_disk
                self.quantities["write_bytes_disk"] = self.write_bytes_disk
                self._last_disk_io_counters = disk_ioc
                self._collect_disk_io_counters = True

        if not WINDOWS:
            self.num_fds = deque(maxlen=n)
            self.quantities["num_fds"] = self.num_fds

        if nvml.device_get_count() > 0:
            # Name and total memory are read once here; utilization and used
            # memory get their own histories.
            gpu_extra = nvml.one_time()
            self.gpu_name = gpu_extra["name"]
            self.gpu_memory_total = gpu_extra["memory-total"]
            self.gpu_utilization = deque(maxlen=n)
            self.gpu_memory_used = deque(maxlen=n)
            self.quantities["gpu_utilization"] = self.gpu_utilization
            self.quantities["gpu_memory_used"] = self.gpu_memory_used

        # Take an initial sample so the histories are not empty.
        self.update()
Example #4
0
async def test_gpu_monitoring_range_query(s, a, b):
    """Both workers' monitor info must carry GPU series, a count and a last time."""
    if nvml.device_get_count() < 1:
        pytest.skip("No GPUs available")

    info = await s.get_worker_monitor_info()
    for worker in (a, b):
        entry = info[worker.address]
        for metric in ("gpu_utilization", "gpu_memory_used"):
            assert entry["range_query"][metric] is not None
        assert entry["count"] is not None
        assert entry["last_time"] is not None
Example #5
0
async def test_gpu_metrics(s, a, b):
    """Worker ``a``'s GPU metrics and startup info must match pynvml's values."""
    if nvml.device_get_count() < 1:
        pytest.skip("No GPUs available")

    handle = nvml._pynvml_handles()

    # Used memory as seen by the scheduler vs. a direct pynvml query.
    assert "gpu" in a.metrics
    reported_used = s.workers[a.address].metrics["gpu"]["memory-used"]
    assert reported_used == pynvml.nvmlDeviceGetMemoryInfo(handle).used

    # Device name as seen by the scheduler vs. a direct pynvml query.
    assert "gpu" in a.startup_information
    reported_name = s.workers[a.address].extra["gpu"]["name"]
    assert reported_name == pynvml.nvmlDeviceGetName(handle).decode()
Example #6
0
async def test_gpu_monitoring_recent(s, a, b):
    """The ``recent=True`` monitor info for worker ``a`` must match pynvml."""
    if nvml.device_get_count() < 1:
        pytest.skip("No GPUs available")

    handle = nvml._pynvml_handles()
    info = await s.get_worker_monitor_info(recent=True)
    worker_info = info[a.address]
    recent = worker_info["range_query"]

    utilization = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
    assert recent["gpu_utilization"] == utilization

    memory_used = pynvml.nvmlDeviceGetMemoryInfo(handle).used
    assert recent["gpu_memory_used"] == memory_used

    device_name = pynvml.nvmlDeviceGetName(handle).decode()
    assert worker_info["gpu_name"] == device_name

    memory_total = pynvml.nvmlDeviceGetMemoryInfo(handle).total
    assert worker_info["gpu_memory_total"] == memory_total
Example #7
0
def test_2_visible_devices(CVD):
    """Check the handle from ``nvml._pynvml_handles`` follows ``CUDA_VISIBLE_DEVICES``.

    With ``CUDA_VISIBLE_DEVICES`` set to ``CVD``, the handle returned by
    ``nvml._pynvml_handles()`` must have the same serial as the pynvml
    handle for the first index listed in ``CVD``.

    Parameters
    ----------
    CVD : str
        Comma-separated device indices; the first entry is parsed with
        ``int``.

    ``CUDA_VISIBLE_DEVICES`` is restored to its previous state afterwards so
    the override does not leak into other tests run in the same process.
    """
    if nvml.device_get_count() < 2:
        pytest.skip("Less than two GPUs available")

    previous = os.environ.get("CUDA_VISIBLE_DEVICES")
    os.environ["CUDA_VISIBLE_DEVICES"] = CVD
    try:
        idx = int(CVD.split(",")[0])

        h = nvml._pynvml_handles()
        h2 = pynvml.nvmlDeviceGetHandleByIndex(idx)

        s = pynvml.nvmlDeviceGetSerial(h)
        s2 = pynvml.nvmlDeviceGetSerial(h2)

        assert s == s2
    finally:
        # Put the environment back exactly as it was found, including the
        # "variable was not set at all" case.
        if previous is None:
            os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = previous
Example #8
0
def test_has_cuda_context():
    """Run ``run_has_cuda_context`` in a spawned process and re-raise its error.

    The child reports back through a queue: ``None`` means success, anything
    else is raised in this process.
    """
    if nvml.device_get_count() < 1:
        pytest.skip("No GPUs available")

    # This test should be run in a new process so that it definitely doesn't have a CUDA context
    # and uses a queue to pass exceptions back
    ctx = mp.get_context("spawn")
    queue = ctx.Queue()
    p = ctx.Process(target=run_has_cuda_context, args=(queue,))
    p.start()
    # Read the result before joining: a child that has put data on a queue
    # may not terminate until that data has been consumed, so joining first
    # risks a deadlock.
    e = queue.get()
    p.join()  # this blocks until the process terminates
    if e is not None:
        raise e
Example #9
0
    NumeralTickFormatter,
    OpenURL,
    TapTool,
)
from bokeh.plotting import figure
from tornado import escape

from dask.utils import format_bytes

from distributed.dashboard.components import DashboardComponent, add_periodic_callback
from distributed.dashboard.components.scheduler import BOKEH_THEME, TICKS_1024, env
from distributed.dashboard.utils import update, without_property_validation
from distributed.diagnostics import nvml
from distributed.utils import log_errors

# True when ``nvml.device_get_count()`` reports at least one device;
# evaluated once, at import time.
NVML_ENABLED = nvml.device_get_count() > 0


class GPUCurrentLoad(DashboardComponent):
    """How many tasks are on each worker"""
    def __init__(self, scheduler, width=600, **kwargs):
        with log_errors():
            self.last = 0
            self.scheduler = scheduler
            self.source = ColumnDataSource({
                "memory": [1, 2],
                "memory-half": [0.5, 1],
                "memory_text": ["1B", "2B"],
                "utilization": [1, 2],
                "utilization-half": [0.5, 1],
                "worker": ["a", "b"],
    def update(self):
        """Take one sample of every monitored quantity and append it to its history.

        CPU, memory and the timestamp are always recorded. Network and disk
        throughput are recorded as bytes per second since the previous
        reading, when collection is enabled and the counters can be read.
        The file-descriptor count is recorded when ``WINDOWS`` is false, and
        GPU utilization / used memory when ``nvml.device_get_count()`` is
        positive.

        Returns
        -------
        dict
            The values sampled by this call, keyed by quantity name, plus
            ``"count"`` (number of samples taken so far). Quantities that
            were skipped are absent.
        """
        with self.proc.oneshot():
            cpu = self.proc.cpu_percent()
            memory = self.get_process_memory()
        now = time()

        self.cpu.append(cpu)
        self.memory.append(memory)
        self.time.append(now)
        self.count += 1

        result = {
            "cpu": cpu,
            "memory": memory,
            "time": now,
            "count": self.count,
        }

        if self._collect_net_io_counters:
            try:
                ioc = psutil.net_io_counters()
            except Exception:
                # Best effort: a failed read only skips this sample.
                ioc = None
            # ``None`` (no reading available) is treated like a failed read
            # instead of raising AttributeError below.
            if ioc is not None:
                last = self._last_io_counters
                if last is not None:
                    # ``or 0.5`` avoids dividing by zero when two samples
                    # carry the same timestamp.
                    duration = (now - self.last_time) or 0.5
                    read_bytes = (ioc.bytes_recv - last.bytes_recv) / duration
                    write_bytes = (ioc.bytes_sent - last.bytes_sent) / duration
                    self.read_bytes.append(read_bytes)
                    self.write_bytes.append(write_bytes)
                    result["read_bytes"] = read_bytes
                    result["write_bytes"] = write_bytes
                # The new reading becomes the baseline for the next sample.
                self.last_time = now
                self._last_io_counters = ioc

        if self._collect_disk_io_counters:
            try:
                disk_ioc = psutil.disk_io_counters()
            except Exception:
                # Best effort: a failed read only skips this sample.
                disk_ioc = None
            if disk_ioc is not None:
                last_disk = self._last_disk_io_counters
                if last_disk is not None:
                    duration_disk = (now - self.last_time_disk) or 0.5
                    read_bytes_disk = (
                        disk_ioc.read_bytes - last_disk.read_bytes
                    ) / duration_disk
                    write_bytes_disk = (
                        disk_ioc.write_bytes - last_disk.write_bytes
                    ) / duration_disk
                    self.read_bytes_disk.append(read_bytes_disk)
                    self.write_bytes_disk.append(write_bytes_disk)
                    result["read_bytes_disk"] = read_bytes_disk
                    result["write_bytes_disk"] = write_bytes_disk
                # The new reading becomes the baseline for the next sample.
                self.last_time_disk = now
                self._last_disk_io_counters = disk_ioc

        if not WINDOWS:
            num_fds = self.proc.num_fds()
            self.num_fds.append(num_fds)
            result["num_fds"] = num_fds

        if nvml.device_get_count() > 0:
            gpu_metrics = nvml.real_time()
            self.gpu_utilization.append(gpu_metrics["utilization"])
            self.gpu_memory_used.append(gpu_metrics["memory-used"])
            result["gpu_utilization"] = gpu_metrics["utilization"]
            result["gpu_memory_used"] = gpu_metrics["memory-used"]

        return result